File Coverage

blib/lib/String/REPartition.pm

Criterion	Covered	Total	%
statement	9	188	4.7
branch	0	76	0.0
condition	0	27	0.0
subroutine	3	10	30.0
pod	1	1	100.0
total	13	302	4.3

line	stmt	bran	cond	sub	pod	time	code
1							# String::REPartition, a module used to partition data using a regular
2							# expression.
3
4							package String::REPartition;
5
6							require 5;
7	2			2		29036	use strict;
	2					4
	2					68
8	2			2		12	use warnings;
	2					3
	2					57
9	2			2		10	use Exporter;
	2					7
	2					4936
10							our @ISA = qw(Exporter);
11							our @EXPORT = qw(make_partition_re);
12
13							our $VERSION = 1.6;
14
15							my $DEBUG = 0;
16
17							# This is the main (and only) accessor function for this module. Given a
18							# ratio and a reference to a list of strings, it will produce a regular
19							# expression that will match @{$ref} * $ratio of the words in the list,
20							# and (this is the important part) not the rest of them. For example, if
21							# $ratio is .4, the resulting regular expression will match 40% of the
22							# strings in the list, and will fail to match the remaining 60%.
23							sub make_partition_re {
24	0			0	1		my($ratio) = shift;
25	0						my($arryref) = shift;
26
27	0						my(%lenhash) = ();
28	0						my(@words) = ();
29
30							# Just checking inputs here.
31	0	0					warn("Checking inputs...\n") if $DEBUG;
32	0	0	0				unless ($ratio && _is_numeric($ratio) && ($ratio > 0) && ($ratio < 1)) {
			0
			0
33	0						return _whine ("Invalid ratio given. Must be a number between 0 and 1.");
34							}
35	0	0	0				unless ($arryref && ref($arryref) && ref($arryref) eq 'ARRAY') {
			0
36	0						return _whine ("Invalid reference given. Must be a reference to a list or an array.");
37							}
38	0						@words = @{ $arryref };
	0
39
40	0						chomp @words;
41
42							# First we build a hash recording the number of strings with each length.
43	0						foreach (@words) {
44	0						$lenhash{length($_)}++;
45							}
46	0	0					if ($DEBUG) {
47	0						print "My first length hash looks like this:\n";
48	0						foreach my $key (sort {$a <=> $b} (keys %lenhash)) {
	0
49	0						print " $key -> $lenhash{$key}\n";
50							}
51							}
52
53							# And then use the _make_list subroutine to examine that hash and determine
54							# a set of lengths which constitute the closest solution, given the ratio.
55	0						my(@soln) = _make_list(\%lenhash,$ratio);
56	0	0					if ($DEBUG) {
57	0						print "First solution is: ";
58	0						print join('--', @soln);
59	0						print "\n";
60							}
61
62							# The new ratio will be the last value of the returned array. It may or
63							# may not be defined. An undef ratio implies that an exact solution has
64							# been found.
65	0						$ratio = pop(@soln);
66	0	0					if ($DEBUG) {
67	0	0					if (defined $ratio) {
68	0						print "I found the ratio $ratio and want to split $soln[-1]\n";
69							}
70							else {
71	0						print "No ratio found -- must have found an exact solution on the first try\n";
72							}
73							}
74	0	0					my($split) = pop(@soln) if defined $ratio;
75	0						my($regex) = "^(";
76
77							# If any lengths were appropriate to go into the solution (there need not
78							# be), then we'll build the first part of the regex.
79	0	0					if (scalar @soln) {
80	0						$regex .= join('\|',map {'(' . '.{' . $_ . '})'} _shrink_list(@soln));
	0
81							}
82	0	0					print "Regex so far is $regex\n" if $DEBUG;
83	0						my($splitlen) = 0;
84	0						my(%alphahash) = ();
85	0						my(@solns) = ();
86
87							# Now, if ratio is defined, that means we have to further subdivide words
88							# of one of the lengths.
89	0	0					if (defined($ratio)) {
90
91							# This is just setting a bunch of stuff up.
92	0	0					print "Starting to re-split\n" if $DEBUG;
93	0						$splitlen = $split;
94	0						my($splitval) = $lenhash{$split};
95	0						my($letnum) = 0;
96	0						my($total) = 0;
97
98							# We only want to play with the words of the appropriate length.
99	0						@words = grep( (length($_) == $splitlen), @words );
100
101							# And now we will continue re-subdividing the words until we've been
102							# asked to split the sample too much.
103	0		0				until (
			0
104							(int($splitval * $ratio) <= 1) \|\|
105							(int($splitval * $ratio) >= ($splitval - 1)) \|\|
106							($letnum >= $splitlen)
107							) {
108	0						%alphahash = ();
109
110							# Here we build a hash similar to the lenhash before.
111	0						foreach my $word (@words) {
112	0						$alphahash{substr($word,$letnum,1)}++;
113							}
114
115							# And then build the solution with the new data.
116	0						@soln = _make_list(\%alphahash,$ratio);
117	0						$ratio = pop(@soln);
118	0	0					if ($DEBUG) {
119	0	0					if (defined $ratio) {
120	0						print "I found the ratio $ratio and want to split $soln[-1]\n";
121							}
122							else {
123	0						print "No ratio found -- must have found an exact solution\n";
124							}
125							}
126							# Store the solution...
127	0	0					if ($DEBUG) {
128	0						print "Adding: " . join('--',@soln) . " to the solutions.\n";
129							}
130	0						@{$solns[$letnum]} = @soln;
	0
131
132							# Maybe do some stuff if we have to further subdivide...
133	0	0					if (defined($ratio)) {
134	0						$split = pop(@soln);
135	0						@words = grep( (substr($_,$letnum,1) eq $split), @words );
136	0						$splitval = $alphahash{$split};
137	0						$letnum++;
138							}
139
140							# Otherwise, make the loop bomb out so we can get on with our lives.
141							else {
142	0						$ratio = -1;
143							}
144							}
145	0	0	0				if ($ratio >= 0 && (scalar @solns > 0)) {
146	0						pop(@{$solns[-1]});
	0
147							}
148							}
149
150							# Now, if we have some solutions from subdividing the remaining words,
151							# we want to incorporate that into our regex...
152	0						my($regex_annex) = "";
153	0	0					if (scalar @solns) {
154	0						my($prefix) = "";
155	0						my($templetter) = "";
156	0						foreach my $num (0..$#solns) {
157	0						$splitlen--;
158
159							# If there are more solutions in the solution array after the one
160							# we're looking at, then the last letter isn't part of the solution
161							# but rather the letter that'll be split for the next solution.
162							# Thus we have to save it and store it.
163	0	0					if ($num < $#solns) {
164	0						$templetter = pop(@{$solns[$num]});
	0
165							}
166							else {
167	0						$templetter = '';
168							}
169	0	0					if (scalar @{$solns[$num]} > 0) {
	0
170	0						$regex_annex .= "($prefix\[" . join('',@{$solns[$num]}) . ']';
	0
171	0	0					if ($splitlen > 0) {
172	0						$regex_annex .= '.{' . $splitlen . '}';
173							}
174	0						$regex_annex .= ')\|';
175							}
176	0						$prefix .= $templetter;
177							}
178	0						chop $regex_annex;
179							}
180	0	0	0				if (length($regex) > 2 && length($regex_annex)) {
181	0						$regex .= "\|";
182							}
183	0						$regex .= $regex_annex;
184	0						$regex .= ")\$";
185	0						$regex =~ s/\[\^/\[\\\^/g;
186	0						return $regex;
187							}
188
189							# This function takes a reference to a hashtable and a ratio as its arguments.
190							# The hashtable represents the names and sizes of the buckets available to
191							# make the solution, and the ratio represents the percentage of the total
192							# of all the bucket sizes that the solution must represent.
193							# This function returns a list, representing the solution to the proplem
194							# presented to it, in the following format:
195							# The last element is the ratio by which one of the buckets must be further
196							# subdivided. If an exact solution was found, then this ratio will be
197							# undefined.
198							# If the last element is defined, the next to last element will be the name
199							# of the bucket which needs to be subdivided by the ratio indicated therein.
200							# The rest of the list returned contains the names of the buckets which will
201							# go into the solution.
202							# This explanation is a little confusing, and since the action of this function
203							# is so central to the working of this module, I'll give an example to help
204							# clear things up. Let's say the hash you pass in looks like this:
205							# { 'a' => 4, 'b' => 2, 'c' => 4 }
206							# If the ratio given is .6, then a valid return from the function will be:
207							# ('a', 'b', undef), since the combination of the 'a' and 'b' buckets adds
208							# up exactly to 60% of the total of all the buckets. However, if the ratio
209							# asked for is .5, then the return would probably be:
210							# ('a', 'b', .5), since the only way to get 50% of the total is to take
211							# all of the 'a' bucket and half of the 'b' bucket.
212							# I hope that clears things up.
213
214							sub _make_list {
215
216	0			0			my($hashref, $ratio) = @_;
217
218	0						my(@values) = values(%{$hashref});
	0
219	0						my($target,$max) = (0,0);
220
221							# Here we figure out some attributes of the data we've been given -- what
222							# amount we're shooting for, and what the larget value is.
223	0						foreach (@values) {
224	0	0					die "Non-number found: $_\n" unless /^\d+$/;
225	0						$target += $_;
226	0	0					if ($_ > $max) { $max = $_ }
	0
227							}
228	0						$target *= $ratio;
229	0						$target = int($target);
230
231							# Once we have an understanding of the data we're working with, we can
232							# start trying to find a good solution. The first thing we do is
233							# try to solve the problem with no bounds -- having the third argument
234							# at $max+1 guarantees that all of the buckets will be considered
235							# for inclusion. The first returned value is a reference to a hash
236							# describing the solution and the second is a somewhat arbitrary "score"
237							# which describes how "good" that solution is. While the score is not
238							# really a good metric of anything realistic, it roughly decreases as
239							# the quality of the solution increases, and reaches 0 as the solution
240							# become perfect (requiring no further subdivison).
241	0						my($besthash, $bestscore) = _find_soln(\@values, $target, $max+1);
242
243							# So, if on our first try, we get a score of 0, we just return the
244							# solution with an undef ratio.
245	0	0					if ($bestscore == 0) { return (_get_words($besthash, $hashref), undef) }
	0
246	0						my($ref,$score) = ("",0);
247
248							# If the first solution wasn't perfect, then the theory is that maybe it
249							# didn't do very well because it included a bucket it shouldn't have. So
250							# what we do is try re-making the solution a number of times, excluding
251							# a number of different buckets each time, until we either come up with a
252							# perfect solution or run out of things to try. This is almost certainly
253							# not the best way to go about things, but it works, so I'm not going to
254							# worry about it until the next version. If then. :)
255	0						foreach $max (keys %{$besthash}) {
	0
256	0						($ref, $score) = _find_soln(\@values, $target, $max);
257	0	0					next unless defined $ref;
258	0	0					if ($score == 0) { return (_get_words($ref, $hashref), undef) }
	0
259	0	0					if ($score < $bestscore) { $bestscore = $score; $besthash = $ref }
	0
	0
260							}
261	0						return (_get_words($besthash, $hashref, $target));
262							}
263
264							# This function takes as inputs a list of values, a target total to aim for and
265							# a maximum bucket size to use. It then constructs a suitable combination of
266							# the values it was given to get as close to the target as possible without
267							# using any values that aren't smaller than the max. It doesn't do it very
268							# well.
269							# It returns a solution hash and a score for the "goodness" of that solution.
270							sub _find_soln {
271	0			0			my($ref, $target, $max) = @_;
272	0						my($val,$sum) = (0,0);
273	0						my(%soln) = ();
274	0						my(%left) = ();
275
276	0						foreach $val (sort {$b <=> $a} @{$ref}) {
	0
	0
277	0	0					next unless $val < $max;
278	0	0					if ($val + $sum <= $target) {
279	0						$sum += $val;
280	0						$soln{$val}++;
281							}
282							else {
283	0						$left{$val}++;
284							}
285							}
286	0						my($diff) = $target - $sum;
287	0						my($min) = (sort {$b <=> $a} keys %left)[-1];
	0
288	0	0					unless (defined $min) { return undef }
	0
289	0						my($score)= $diff * $min;
290	0						return (\%soln, $score);
291							}
292
293							# This function takes a solution hash as returned from _find_soln, the data
294							# hash as given to _make_list and an optional target, and returns a list
295							# appropriate to be returned by _make_list. Its action is uninteresting and
296							# straightforward, so I will not waste bytes in describing it further.
297							sub _get_words {
298	0			0			my($soln, $data, $target) = @_;
299	0						my(@retarray) = ();
300	0						my($total,%left) = (0,%{$data});
	0
301
302	0						foreach my $solkey (keys %{$soln}) {
	0
303	0						foreach my $num (1..$soln->{$solkey}) {
304	0						foreach my $datkey (keys %left) {
305	0	0					if ($data->{$datkey} == $solkey) {
306	0						push(@retarray, $datkey);
307	0						delete($left{$datkey});
308	0						$total += $solkey;
309	0						last;
310							}
311							}
312							}
313							}
314	0	0					if (defined $target) {
315	0						my($minkey) = "";
316	0						my($min) = (sort {$a <=> $b} values %{$data})[-1];
	0
	0
317	0						foreach my $leftkey (keys %left) {
318	0	0					if ($min >= $data->{$leftkey}) {
319	0						$minkey = $leftkey;
320	0						$min = $data->{$leftkey};
321							}
322							}
323	0						push(@retarray,($minkey, (($target - $total)/$min)));
324							}
325	0						return @retarray;
326							}
327
328							# Simple test for numericity, pulled straight from the FAQ.
329							sub _is_numeric {
330	0			0			my($test) = shift;
331
332	0	0					unless ($test =~ /^-?(?:\d+(?:\.\d*)?\|\.\d+)$/) {
333	0						return undef;
334							}
335	0						return $test;
336							}
337
338							# Used to return polite errors from the module.
339							sub _whine {
340	0			0			my($msg) = shift;
341
342	0	0					if ($^W) {
343	0						warn("String::REPartition says: $msg\n");
344							}
345	0						return undef;
346							}
347
348							# Given a list of numbers, sorts it and combines contiguous members into
349							# comma-separated pairs. That is, turns (1 2 3 4 7 8) into (1,4 7,8). This
350							# is useful in building a nice-looking regular expression.
351							sub _shrink_list {
352	0			0			my(@list) = sort {$a <=> $b} @_;
	0
353	0						my($num) = 0;
354
355	0						until ($num >= $#list) {
356	0	0					if ((split(',',$list[$num]))[-1] == ($list[$num+1]-1)) {
357	0	0					if ($list[$num] =~ /^\d+$/) {
358	0						$list[$num] .= "," . splice(@list,$num+1,1);
359							}
360							else {
361	0						substr($list[$num],index($list[$num],",")+1) = splice(@list,$num+1,1);
362							}
363							}
364							else {
365	0						$num++;
366							}
367							}
368	0						return @list;
369							}
370
371							1;
372
373							__END__