File Coverage

blib/lib/Statistics/Descriptive/Discrete.pm

Criterion	Covered	Total	%
statement	220	226	97.3
branch	62	78	79.4
condition	11	15	73.3
subroutine	17	18	94.4
pod	8	9	88.8
total	318	346	91.9

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::Descriptive::Discrete;
2
3							### This module draws heavily from Statistics::Descriptive
4
5	3			3		211232	use strict;
	3					30
	3					92
6	3			3		16	use warnings;
	3					7
	3					111
7	3			3		16	use Carp;
	3					7
	3					183
8	3			3		1541	use AutoLoader;
	3					4301
	3					19
9	3			3		126	use vars qw($VERSION $AUTOLOAD $DEBUG $Tolerance %autosubs);
	3					7
	3					6397
10
11							$VERSION = '0.10';
12							$DEBUG = 0;
13
14							$Tolerance = 0.0;
15
16							#what subs can be autoloaded?
17							%autosubs = (
18							count => undef,
19							mean => undef,
20							geometric_mean=> undef,
21							harmonic_mean=>undef,
22							sum => undef,
23							mode => undef,
24							median => undef,
25							min => undef,
26							max => undef,
27							mindex => undef,
28							maxdex => undef,
29							standard_deviation => undef,
30							sample_range => undef,
31							variance => undef,
32							text => undef,
33							);
34
35
36							sub new
37							{
38	17			17	1	271	my $proto = shift;
39	17		33			78	my $class = ref($proto) \|\| $proto;
40	17					34	my $self = {};
41	17					52	$self->{_permitted} = \%autosubs;
42	17					32	$self->{data} = ();
43	17					31	$self->{_dataindex} = (); #index of where each value first seen when adding data
44	17					31	$self->{dirty} = 1; #is the data dirty?
45	17					30	$self->{_index} = 0; #current index of number of data items added
46
47	17					35	bless ($self,$class);
48	17	50				39	print __PACKAGE__,"->new(",join(',',@_),")\n" if $DEBUG;
49	17					50	return $self;
50							}
51
52							# Clear the stat object & erase all data
53							# Object will be ready to use as if new was called
54							# Not sure this is more efficient than just creating a new object but
55							# maintained for compatability with Statistics::Descriptive
56							sub clear
57							{
58	8			8	1	3842	my $self = shift;
59	8					16	my %keys = %{ $self };
	8					71
60
61							#remove _permitted from the deletion list
62	8					24	delete $keys{"_permitted"};
63
64	8					34	foreach my $key (keys %keys)
65							{ # Check each key in the object
66	132	50				224	print __PACKAGE__,"->clear, deleting $key\n" if $DEBUG;
67	132					210	delete $self->{$key}; # Delete any out of date cached key
68							}
69	8					19	$self->{data} = ();
70	8					16	$self->{_dataindex} = ();
71	8					12	$self->{dirty} = 1;
72	8					40	$self->{_index} = 0;
73							}
74
75							sub add_data
76							{
77							#add data but don't compute ANY statistics yet
78	21			21	1	671	my $self = shift;
79	21	50				47	print __PACKAGE__,"->add_data(",join(',',@_),")\n" if $DEBUG;
80
81							#get each element and add 0 to force it be a number
82							#that way, 0.000 and 0 are treated the same
83	21					36	my $val = shift;
84	21					62	while (defined $val)
85							{
86	136					199	$val += 0;
87	136					354	$self->{data}{$val}++;
88	136	100				353	if (not exists $self->{_dataindex}{$val}) {
89	74					158	$self->{_dataindex}{$val} = $self->{_index};
90							}
91	136					185	$self->{_index}++;
92							#set dirty flag so we know cached stats are invalid
93	136					199	$self->{dirty}++;
94	136					428	$val = shift; #get next element
95							}
96							}
97
98							sub add_data_tuple
99							{
100							#add data but don't compute ANY statistics yet
101							#the data are pairs of values and occurrences
102							#e.g. 4,2 means 2 occurrences of the value 4
103							#thanks to Bill Dueber for suggesting this
104
105	5			5	1	26	my $self = shift;
106	5	50				13	print __PACKAGE__,"->add_data_tuple(",join(',',@_),")\n" if $DEBUG;
107
108							#we want an even number of arguments (tuples in the form (value, count))
109	5	50				15	carp "argument list must have even number of elements" if @_ % 2;
110
111							#get each element and add 0 to force it be a number
112							#that way, 0.000 and 0 are treated the same
113							#if $count is 0, then this will set the dirty flag but have no effect on
114							#the statistics
115	5					7	my $val = shift;
116	5					9	my $count = shift;
117	5					13	while (defined $count)
118							{
119	18					26	$val += 0;
120	18					37	$self->{data}{$val} += $count;
121	18	50				37	if (not exists $self->{_dataindex}{$val}) {
122	18					32	$self->{_dataindex}{$val} = $self->{_index};
123							}
124	18					27	$self->{_index} += $count;
125							#set dirty flag so we know cached stats are invalid
126	18					24	$self->{dirty}++;
127	18					28	$val = shift; #get next element
128	18					40	$count = shift;
129							}
130							}
131
132							sub _test_for_too_small_val
133							{
134	112			112		176	my $self = shift;
135	112					159	my $val = shift;
136
137	112					285	return (abs($val) <= $Statistics::Descriptive::Discrete::Tolerance);
138							}
139
140							sub _calc_harmonic_mean
141							{
142	24			24		38	my $self = shift;
143	24					48	my $count = shift;
144	24					39	my $datakeys = shift; #array ref
145
146	24					36	my $hs = 0;
147
148	24					35	foreach my $val ( @{$datakeys} )
	24					50
149							{
150							##Guarantee that there are no divide by zeros
151	91	100				160	if ($self->_test_for_too_small_val($val))
152							{
153	3					10	return;
154							}
155
156	88					180	foreach (1..$self->{data}{$val})
157							{
158	201					355	$hs += 1/$val;
159							}
160							}
161
162	21	100				40	if ($self->_test_for_too_small_val($hs))
163							{
164	2					5	return;
165							}
166
167	19					42	return $count/$hs;
168							}
169
170							sub _all_stats
171							{
172							#compute all the stats in one sub to save overhead of sub calls
173							#a little wasteful to do this if all we want is count or sum for example but
174							#I want to keep add_data as lean as possible since it gets called a lot
175	28			28		49	my $self = shift;
176	28	50				59	print __PACKAGE__,"->_all_stats(",join(',',@_),")\n" if $DEBUG;
177
178							#if data is empty, set all stats to undef and return
179	28	100				70	if (!$self->{data})
180							{
181	4					6	foreach my $key (keys %{$self->{_permitted}})
	4					21
182							{
183	60					95	$self->{$key} = undef;
184							}
185	4					13	$self->{count} = 0;
186	4					7	return;
187							}
188
189							#count = total number of data values we have
190	24					38	my $count = 0;
191	24					38	$count += $_ foreach (values %{$self->{data}});
	24					110
192
193	24					41	my @datakeys = keys %{$self->{data}};
	24					68
194
195							#initialize min, max, mode to an arbitrary value that's in the hash
196	24					49	my $default = $datakeys[0];
197	24					40	my $max = $default;
198	24					34	my $min = $default;
199	24					39	my $mode = $default;
200	24					39	my $moden = 0;
201	24					37	my $sum = 0;
202
203							#find min, max, sum, and mode
204	24					44	foreach (@datakeys)
205							{
206	95					149	my $n = $self->{data}{$_};
207	95					225	$sum += $_ * $n;
208	95	100				198	$min = $_ if $_ < $min;
209	95	100				184	$max = $_ if $_ > $max;
210
211							#only finds one mode but there could be more than one
212							#also, there might not be any mode (all the same frequency)
213							#todo: need to make this more robust
214	95	100				192	if ($n > $moden)
215							{
216	28					43	$mode = $_;
217	28					52	$moden = $n;
218							}
219							}
220	24					45	my $mindex = $self->{_dataindex}{$min};
221	24					41	my $maxdex = $self->{_dataindex}{$max};
222
223	24					49	my $mean = $sum/$count;
224
225	24					35	my $stddev = 0;
226	24					39	my $variance = 0;
227
228	24	100				52	if ($count > 1)
229							{
230							# Thanks to Peter Dienes for finding and fixing a round-off error
231							# in the following variance calculation
232
233	23					47	foreach my $val (@datakeys)
234							{
235	94					213	$stddev += $self->{data}{$val} * (($val - $mean) ** 2);
236							}
237	23					41	$variance = $stddev / ($count - 1);
238	23					41	$stddev = sqrt($variance);
239							}
240	1					3	else {$stddev = undef}
241
242							#find median, and do it without creating a list of the all the data points
243							#if n=count is odd and n=2k+1 then median = data(k+1)
244							#if n=count is even and n=2k, then median = (data(k) + data(k+1))/2
245	24					46	my $odd = $count % 2; #odd or even number of points?
246	24					53	my $even = !$odd;
247	24	100				55	my $k = $odd ? ($count-1)/2 : $count/2;
248	24					38	my $median = undef;
249	24					39	my $temp = 0;
250	24					82	MEDIAN: foreach my $val (sort {$a <=> $b} (@datakeys))
	133					226
251							{
252	67					139	foreach (1..$self->{data}{$val})
253							{
254	124					173	$temp++;
255	124	100	100			375	if (($temp == $k) && $even)
		100
256							{
257	16					33	$median += $val;
258							}
259							elsif ($temp == $k+1)
260							{
261	24					39	$median += $val;
262	24	100				48	$median /= 2 if $even;
263	24					52	last MEDIAN;
264							}
265							}
266							}
267
268							#compute geometric mean
269	24					45	my $gm = 1;
270	24					41	my $exponent = 1/$count;
271	24					44	foreach my $val (@datakeys)
272							{
273	90	100				166	if ($val < 0)
274							{
275	3					8	$gm = undef;
276	3					5	last;
277							}
278	87					144	foreach (1..$self->{data}{$val})
279							{
280	200					393	$gm = $val*$exponent;
281							}
282							}
283
284							#compute harmonic mean
285	24					65	my $harmonic_mean = scalar $self->_calc_harmonic_mean($count, \@datakeys);
286
287	24	50				52	print __PACKAGE__,"count: $count, _index ",$self->{_index},"\n" if $DEBUG;
288
289	24					55	$self->{count} = $count;
290	24					51	$self->{sum} = $sum;
291	24					60	$self->{standard_deviation} = $stddev;
292	24					40	$self->{variance} = $variance;
293	24					44	$self->{min} = $min;
294	24					44	$self->{max} = $max;
295	24					40	$self->{mindex} = $mindex;
296	24					37	$self->{maxdex} = $maxdex;
297	24					49	$self->{sample_range} = $max - $min; #todo: does this require any bounds checking
298	24					40	$self->{mean} = $mean;
299	24					58	$self->{geometric_mean} = $gm;
300	24					39	$self->{harmonic_mean} = $harmonic_mean;
301	24					40	$self->{median} = $median;
302	24					39	$self->{mode} = $mode;
303
304							#clear dirty flag so we don't needlessly recompute the statistics
305	24					66	$self->{dirty} = 0;
306							}
307
308							sub set_text
309							{
310	0			0	0	0	my $self = shift;
311	0					0	$self->{text} = shift;
312							}
313
314							sub get_data
315							{
316							#returns a list of the data in sorted order
317							#the list could be very big an this defeat the purpose of using this module
318							#use this only if you really need it
319	1			1	1	2	my $self = shift;
320	1	50				7	print __PACKAGE__,"->get_data(",join(',',@_),")\n" if $DEBUG;
321
322	1					2	my @data;
323	1					2	foreach my $val (sort {$a <=> $b} (keys %{$self->{data}}))
	9					17
	1					6
324							{
325	5					18	push @data, $val foreach (1..$self->{data}{$val});
326							}
327	1					5	return @data;
328							}
329
330							# this is the previous frequency_distribution code
331							# redid this completely based on current implementation in
332							# Statistics::Descriptive
333							# sub frequency_distribution
334							# {
335							# #Compute frequency distribution (histogram), borrowed heavily from Statistics::Descriptive
336							# #Behavior is slightly different than Statistics::Descriptive
337							# #e.g. if partition is not specified, we use to set the number of partitions
338							# # if partition = 0, then we return the data hash WITHOUT binning it into equal bins
339							# # I often want to just see how many of each value I saw
340							# #Also, you can manually pass in the bin info (min bin, bin size, and number of partitions)
341							# #I don't cache the frequency data like Statistics::Descriptive does since it's not as expensive to compute
342							# #but I might add that later
343							# #todo: the minbin/binsize stuff is funky and not intuitive -- fix it
344							# my $self = shift;
345							# print __PACKAGE__,"->frequency_distribution(",join(',',@_),")\n" if $DEBUG;
346
347							# my $partitions = shift; #how many partitions (bins)?
348							# my $minbin = shift; #upper bound of first bin
349							# my $binsize = shift; #how wide is each bin?
350
351							# #if partition == 0, then return the data hash
352							# if (not defined $partitions \|\| ($partitions == 0))
353							# {
354							# $self->{frequency_partitions} = 0;
355							# %{$self->{frequency}} = %{$self->{data}};
356							# return %{$self->{frequency}};
357							# }
358
359							# #otherwise, partition better be >= 1
360							# return undef unless $partitions >= 1;
361
362							# $self->_all_stats() if $self->{dirty}; #recompute stats if dirty, (so we have count)
363							# return undef if $self->{count} < 2; #must have at least 2 values
364
365							# #set up the bins
366							# my ($interval, $iter, $max);
367							# if (defined $minbin && defined $binsize)
368							# {
369							# $iter = $minbin;
370							# $max = $minbin+$partitions*$binsize - $binsize;
371							# $interval = $binsize;
372							# $iter -= $interval; #so that loop that sets up bins works correctly
373							# }
374							# else
375							# {
376							# $iter = $self->{min};
377							# $max = $self->{max};
378							# $interval = $self->{sample_range}/$partitions;
379							# }
380							# my @k;
381							# my %bins;
382							# while (($iter += $interval) < $max)
383							# {
384							# $bins{$iter} = 0;
385							# push @k, $iter;
386							# }
387							# $bins{$max} = 0;
388							# push @k, $max;
389
390							# VALUE: foreach my $val (keys %{$self->{data}})
391							# {
392							# foreach my $k (@k)
393							# {
394							# if ($val <= $k)
395							# {
396							# $bins{$k} += $self->{data}{$val}; #how many of this value do we have?
397							# next VALUE;
398							# }
399							# }
400							# # }
401
402							# %{$self->{frequency}} = %bins; #save it for later in case I add caching
403							# $self->{frequency_partitions} = $partitions; #in case I add caching in the future
404							# return %{$self->{frequency}};
405							# }
406
407							sub frequency_distribution_ref
408							{
409	12			12	1	1756	my $self = shift;
410	12					23	my @k = ();
411
412							# If called with no parameters, return the cached hashref
413							# if we have one and data is not dirty
414							# This is implemented this way because that's how Statistics::Descriptive
415							# implements this. I don't like it.
416	12	100	100			68	if ((!@_) && (! $self->{dirty}) && (defined $self->{_frequency}))
			66
417							{
418	3					29	return $self->{_frequency};
419							}
420
421	9	100				35	$self->_all_stats() if $self->{dirty}; #recompute stats if dirty, (so we have count)
422
423							# Must have at least two elements
424	9	100				45	if ($self->count() < 2)
425							{
426	1					5	return undef;
427							}
428
429	8					27	my %bins;
430	8					14	my $partitions = shift;
431
432	8	100				24	if (ref($partitions) eq 'ARRAY')
433							{
434	4					8	@k = @{ $partitions };
	4					13
435	4	50				14	return undef unless @k; ##Empty array
436	4	50				12	if (@k > 1) {
437							##Check for monotonicity
438	4					10	my $element = $k[0];
439	4					24	for my $next_elem (@k[1..$#k]) {
440	18	50				37	if ($element > $next_elem) {
441	0					0	carp "Non monotonic array cannot be used as frequency bins!\n";
442	0					0	return undef;
443							}
444	18					34	$element = $next_elem;
445							}
446							}
447	4					10	%bins = map { $_ => 0 } @k;
	22					60
448							}
449							else
450							{
451	4	100	66			30	return undef unless (defined $partitions) && ($partitions >= 1);
452	3					17	my $interval = $self->sample_range() / $partitions;
453	3					11	foreach my $idx (1 .. ($partitions-1))
454							{
455	2					11	push @k, ($self->min() + $idx * $interval);
456							}
457
458	3					15	$bins{$self->max()} = 0;
459
460	3					14	push @k, $self->max();
461							}
462
463							ELEMENT:
464	7					16	foreach my $element (keys %{$self->{data}})
	7					26
465							{
466	45					70	foreach my $limit (@k)
467							{
468	109	100				222	if ($element <= $limit)
469							{
470	44					107	$bins{$limit} += $self->{data}{$element};
471	44					80	next ELEMENT;
472							}
473							}
474							}
475
476	7					39	$self->{_frequency} = \%bins;
477	7					30	return $self->{_frequency};
478							}
479
480							sub frequency_distribution {
481	3			3	1	403	my $self = shift;
482
483	3					12	my $ret = $self->frequency_distribution_ref(@_);
484
485	3	50				31	if (!defined($ret))
486							{
487	0					0	return undef;
488							}
489							else
490							{
491	3					19	return %$ret;
492							}
493							}
494
495							# return count of unique values in data if called in scalar context
496							# returns sorted array of unique data values if called in array context
497							# returns undef if no data
498							sub uniq
499							{
500	8			8	1	38	my $self = shift;
501
502	8	100				22	if (!$self->{data})
503							{
504	2					8	return undef;
505							}
506
507	6					11	my @datakeys = sort {$a <=> $b} keys %{$self->{data}};
	39					74
	6					30
508
509	6	100				20	if (wantarray)
510							{
511	2					9	return @datakeys;
512							}
513							else
514							{
515	4					8	my $uniq = @datakeys;
516	4					18	return $uniq;
517							}
518							}
519
520							sub AUTOLOAD {
521	82			82		8604	my $self = shift;
522	82	50				219	my $type = ref($self)
523							or croak "$self is not an object";
524	82					153	my $name = $AUTOLOAD;
525	82					434	$name =~ s/.*://; ##Strip fully qualified-package portion
526	82	100				545	return if $name eq "DESTROY";
527	65	50				162	unless (exists $self->{_permitted}{$name} ) {
528	0					0	croak "Can't access `$name' field in class $type";
529							}
530
531	65	50				130	print __PACKAGE__,"->AUTOLOAD $name\n" if $DEBUG;
532
533							#compute stats if necessary
534	65	100				195	$self->_all_stats() if $self->{dirty};
535	65					280	return $self->{$name};
536							}
537
538							1;
539
540							__END__