File Coverage

blib/lib/Text/Quantize.pm

Criterion	Covered	Total	%
statement	77	77	100.0
branch	36	40	90.0
condition	4	5	80.0
subroutine	7	7	100.0
pod	2	2	100.0
total	126	131	96.1

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Quantize;
2							# ABSTRACT: render a list of numbers as a textual chart
3	6			6		133479	use strict;
	6					13
	6					213
4	6			6		27	use warnings;
	6					11
	6					165
5	6			6		38	use List::Util 'sum';
	6					17
	6					871
6	6					66	use Sub::Exporter -setup => {
7							exports => ['quantize', 'bucketize'],
8							groups => {
9							default => ['quantize'],
10							},
11	6			6		17501	};
	6					90164
12
13							sub bucketize {
14	25			25	1	8154	my $elements = shift;
15	25	50				140	my $options = {
16							add_endpoints => 1,
17	25					34	%{ shift(@_) \|\| {} },
18							};
19
20	25					41	my %buckets;
21	25					45	for my $element (@$elements) {
22	359					311	my $bucket;
23
24	359	100				711	if ($element == 0) {
		100
25	8					9	$bucket = 0;
26							}
27							elsif ($element < 0) {
28							# log(negative) is an error, so take the log of the absolute value then negate it
29	123					224	$bucket = -1 * (2 ** int(log(-$element) / log(2)));
30							}
31							else {
32							# which power of 2 is this greater-than-or-equal to?
33	228					445	$bucket = 2 ** int(log($element) / log(2));
34							}
35
36	359					1460	$buckets{$bucket}++;
37							}
38
39							# allow user to specify only one of the two endpoints if desired, and figure the other out
40	25	100				84	if ($options->{add_endpoints}) {
41	6	100	66			37	unless (defined($options->{minimum}) && defined($options->{maximum})) {
42	5					24	my ($min, $max) = _endpoints_for(\%buckets);
43	5	50				33	$options->{minimum} = $min if !defined($options->{minimum});
44	5	100				51	$options->{maximum} = $max if !defined($options->{maximum});
45							}
46							}
47
48							# if add_endpoints, then use the expanded range that we calculated or the user specified
49							# otherwise, start at the first bucket with data and go until the last bucket with data
50	88					113	my ($start, $end) = $options->{add_endpoints}
51							? ($options->{minimum}, $options->{maximum})
52	25	100				109	: (sort { $a <=> $b } keys %buckets)[0,-1];
53
54							# force every bucket in the range to exist
55	25					56	my $i = $start;
56	25					68	while ($i <= $end) {
57	141		100			364	$buckets{$i} \|\|= 0;
58	141	100				350	if ($i == 0) {
		100
		100
59	9					17	$i = 1;
60							}
61							elsif ($i == -1) {
62	5					11	$i = 0;
63							}
64							elsif ($i < 0) {
65	29					57	$i /= 2; # since we're negative, increasing means smaller numbers
66							}
67							else {
68	98					204	$i *= 2;
69							}
70							}
71	25					89	return \%buckets;
72							}
73
74							sub quantize {
75	6			6	1	1968	my $elements = shift;
76	6	100				130	my %options = (
77							distribution_width => 40,
78							distribution_character => '@',
79							left_label => 'value',
80							middle_label => 'Distribution',
81							right_label => 'count',
82	6					19	%{ shift(@_) \|\| {} },
83							);
84
85	6					35	my $buckets = bucketize($elements, \%options);
86
87							# pull these out because we consult them a lot, and in loops
88	6					15	my $distribution_width = $options{distribution_width};
89	6					13	my $distribution_character = $options{distribution_character};
90
91							# the divisor deciding how wide each bucket's bar will be
92	6					71	my $sum = sum values %$buckets;
93
94							# how wide must the first column (with the left_label and values) be?
95	6					16	my $left_width = length($options{left_label});
96	6					24	for my $bucket (keys %$buckets) {
97	69	50				145	$left_width = length($bucket)
98							if length($bucket) > $left_width;
99							}
100							# add that extra space before every row
101	6					16	$left_width++;
102
103							# how many - characters do we need?
104	6					16	my $middle_spacer = $distribution_width - length($options{middle_label}) - 2;
105
106							# these will be different when $middle_spacer is odd, but we
107							# always want them to sum to $middle_spacer.
108	6					15	my $middle_left = int($middle_spacer / 2);
109	6					10	my $middle_right = $middle_spacer - $middle_left;
110
111	6					209	my @output = sprintf '%*s %s %s %s %s',
112							$left_width,
113							$options{left_label},
114							('-' x $middle_left),
115							$options{middle_label},
116							('-' x $middle_right),
117							$options{right_label};
118
119	6					34	for my $bucket (sort { $a <=> $b } keys %$buckets) {
	182					211
120	69					103	my $count = $buckets->{$bucket};
121	69					87	my $ratio = ($count / $sum);
122	69					86	my $width = $distribution_width * $ratio;
123
124	69					287	push @output, sprintf '%d \|%-s %d',
125							$left_width,
126							$bucket,
127							$distribution_width,
128							($distribution_character x $width),
129							$count;
130							}
131
132	6	50				112	return wantarray ? @output : (join "\n", @output)."\n";
133							}
134
135							# given a set of buckets, find the power of two smaller than the
136							# smallest element, and the power of two greater than or equal to the
137							# largest element. used for add_endpoints
138							sub _endpoints_for {
139	21			21		61	my $buckets = shift;
140	21					41	my ($min_endpoint, $max_endpoint);
141
142	21					80	my @sorted_buckets = (sort { $a <=> $b } keys %$buckets);
	144					216
143	21					56	my ($min, $max) = @sorted_buckets[0, -1];
144
145	21	100				69	if ($min == 0) {
		100
		100
146	4					4	$min_endpoint = -1;
147							}
148							elsif ($min == 1) {
149	1					2	$min_endpoint = 0;
150							}
151							elsif ($min < 0) {
152	6					24	$min_endpoint = -1 * (2 ** (int(log(-$min) / log(2)) + 1));
153							}
154							else {
155	10					30	$min_endpoint = 2 ** (int(log($min) / log(2)) - 1);
156							}
157
158	21	100				70	if ($max == 0) {
		100
		100
159	1					2	$max_endpoint = 1;
160							}
161							elsif ($max == -1) {
162	2					3	$max_endpoint = 0;
163							}
164							elsif ($max < 0) {
165	2					9	$max_endpoint = -1 * (2 ** (int(log(-$max) / log(2)) - 1));
166							}
167							else {
168	16					34	$max_endpoint = 2 ** (int(log($max) / log(2)) + 1);
169							}
170
171	21					84	return ($min_endpoint, $max_endpoint);
172							}
173
174							1;
175
176
177
178							=pod
179
180							=head1 NAME
181
182							Text::Quantize - render a list of numbers as a textual chart
183
184							=head1 VERSION
185
186							version 0.05
187
188							=head1 SYNOPSIS
189
190							use Text::Quantize;
191
192							print quantize([26, 24, 51, 77, 21]);
193
194							__END__
195
196							value ------------- Distribution ------------- count
197							8 \| 0
198							16 \|@@@@@@@@@@@@@@@@@@@@@@@@ 3
199							32 \|@@@@@@@@ 1
200							64 \|@@@@@@@@ 1
201							128 \| 0
202
203							=head1 FUSSY SYNOPSIS
204
205							use Text::Quantize ();
206
207							print Text::Quantize::quantize([map { chomp; $_ } ], {
208							left_label => 'microseconds',
209							middle_label => 'Calls per time bucket',
210							right_label => 'syscalls',
211							distribution_width => 80,
212							distribution_character => '=',
213							});
214
215							__END__
216
217							microseconds ---------------------------- Calls per time bucket ----------------------------- syscalls
218							256 \| 0
219							512 \|==== 5
220							1024 \|===== 7
221							2048 \|================== 23
222							4096 \|============================ 36
223							8192 \|======= 9
224							16384 \|= 2
225							32768 \| 1
226							262144 \| 1
227							524288 \| 1
228							1048576 \| 1
229							2097152 \|======= 9
230							4194304 \|=== 4
231							8388608 \| 1
232							16777216 \| 0
233
234							=head1 FUNCTIONS
235
236							=head2 C
237
238							C takes an array reference of integers and an optional
239							hash reference of options, and produces a textual histogram of the
240							integers bucketed into powers-of-2 sets.
241
242							Options include:
243
244							=over 4
245
246							=item C (default: C)
247
248							Controls the text of the left-most label which represents the
249							bucket's contents.
250
251							=item C (default: C)
252
253							Controls the text of the middle label which can be used to title
254							the histogram.
255
256							=item C (default: C)
257
258							Controls the text of the right-most label which represents how many
259							items are in that bucket.
260
261							=item C (default: C<40>)
262
263							Controls how many characters wide the textual histogram is. This
264							does not include the legends.
265
266							=item C (default: C<@>)
267
268							Controls the character used to represent the data in the histogram.
269
270							=item C (default: C<1>)
271
272							Controls whether the top and bottom lines (which are going to have
273							values of 0) are added. They're included by default because it hints
274							that the data set is complete.
275
276							=back
277
278							=head2 C
279
280							C takes an array reference of integers and an optional
281							hash reference of options, and produces a hash reference of those
282							integers bucketed into powers-of-2 sets.
283
284							Options include:
285
286							=over 4
287
288							=item add_endpoints (default: C<1>)
289
290							Controls whether extra buckets, smaller than the minimum value and
291							larger than the maximum value, (which are going to have values of
292							0) are added. They're included by default because it hints that the
293							data set is complete.
294
295							=back
296
297							=head1 SEE ALSO
298
299							C, which is where I first saw this kind of C
300							histogram.
301
302							L, which ported C to Perl first, and from which I
303							took a few insights.
304
305							=head1 AUTHOR
306
307							Shawn M Moore
308
309							=head1 COPYRIGHT AND LICENSE
310
311							This software is copyright (c) 2012 by Infinity Interactive.
312
313							This is free software; you can redistribute it and/or modify it under
314							the same terms as the Perl 5 programming language system itself.
315
316							=cut
317
318
319							__END__