File Coverage

blib/lib/Acme/Wabby.pm
Criterion Covered Total %
statement 176 266 66.1
branch 55 116 47.4
condition 9 12 75.0
subroutine 16 19 84.2
pod 0 6 0.0
total 256 419 61.1


line stmt bran cond sub pod time code
1             # Copyright 2004 Nathan Poznick. All rights reserved.
2             # Create semi-random sentences based upon a body of text.
3             # Distributed under the terms of the GPL Version 2
4              
5             package Acme::Wabby;
6             $Acme::Wabby::VERSION='0.13';
7              
8 1     1   538 use strict;
  1         1  
  1         26  
9 1     1   5 use warnings;
  1         1  
  1         23  
10 1     1   2166 use Storable;
  1         3511  
  1         556  
11              
12 1     1   12 use vars qw($VERSION);
  1         2  
  1         67  
13              
14             # Default values for various configurable settings
15 1     1   6 use constant DEF_CASE => 1;
  1         2  
  1         83  
16 1     1   6 use constant DEF_MIN_LEN => 3;
  1         1  
  1         43  
17 1     1   5 use constant DEF_MAX_LEN => 30;
  1         9  
  1         51  
18 1     1   5 use constant DEF_MAX_ATTEMPTS => 1000;
  1         1  
  1         55  
19 1     1   11 use constant DEF_PUNCTUATION => [".","?","!","..."];
  1         2  
  1         40  
20 1     1   5 use constant DEF_HASH_FILE => "./wabbyhash.dat";
  1         1  
  1         44  
21 1     1   5 use constant DEF_LIST_FILE => "./wabbylist.dat";
  1         2  
  1         41  
22 1     1   6 use constant DEF_AUTOSAVE => 0;
  1         1  
  1         3882  
23              
24             # Constructor. Note that Acme::Wabby only supports an OO interface.
25             # Arguments: A reference to a hash containing configuration key/value pairs.
26             # Returns: A reference to the created object. Dies on error.
27             sub new {
28 3     3 0 54 my $self = shift;
29 3         7 my %conf;
30              
31 3 100       12 if (@_ == 1) {
    50          
32 1         2 my $ref = shift;
33 1 50       7 if (!ref($ref)) {
    50          
34 0         0 die "Sanity check: One parameter passed, and it is not a reference";
35             }
36             elsif (ref($ref) eq "HASH") {
37 1         2 %conf = %{$ref};
  1         7  
38             }
39             else {
40 0         0 die "Sanity check: One parameter passed, and it is not a hash reference";
41             }
42             }
43             elsif (@_ % 2 != 0) {
44 0         0 die "Sanity check: Odd number of parameters, expecting a hash";
45             }
46             else {
47 2         11 %conf = @_;
48             }
49              
50             # Set up the initial state of our data structures
51 3         7 my %hash = ();
52 3         13 my @list = ({ word => "(null)", num => []});
53 3         10 my %data = ('hash' => \%hash, 'list' => \@list);
54              
55             # Go through each of the configuration options, and if they were not
56             # explicitly set, assign to them the defaults.
57 3 100       11 if (!exists($conf{'case_sensitive'})) {
58 1         3 $conf{'case_sensitive'} = DEF_CASE;
59             }
60 3 100       7 if (!exists($conf{'min_len'})) {
61 1         2 $conf{'min_len'} = DEF_MIN_LEN;
62             }
63 3 100       8 if (!exists($conf{'max_len'})) {
64 1         2 $conf{'max_len'} = DEF_MAX_LEN;
65             }
66 3 100       7 if (!exists($conf{'max_attempts'})) {
67 1         2 $conf{'max_attempts'} = DEF_MAX_ATTEMPTS;
68             }
69 3 100       9 if (!exists($conf{'punctuation'})) {
70 1         2 $conf{'punctuation'} = DEF_PUNCTUATION;
71             }
72 3 100       7 if (!exists($conf{'hash_file'})) {
73 1         2 $conf{'hash_file'} = DEF_HASH_FILE;
74             }
75 3 100       7 if (!exists($conf{'list_file'})) {
76 1         2 $conf{'list_file'} = DEF_LIST_FILE;
77             }
78 3 100       9 if (!exists($conf{'autosave_on_destroy'})) {
79 1         4 $conf{'autosave_on_destroy'} = DEF_AUTOSAVE;
80             }
81              
82             # Do some simple sanity checks on the values that they sent us
83 3 50       8 if ($conf{'min_len'} > $conf{'max_len'}) {
84 0         0 die "min_len ($conf{'min_len'}) cannot be larger than"
85             ."max_len ($conf{'max_len'})";
86             }
87 3 50       14 if ($conf{'min_len'} < 1) {
88 0         0 die "Cannot be negative: min_len";
89             }
90 3 50       20 if ($conf{'max_len'} < 1) {
91 0         0 die "Cannot be negative: max_len";
92             }
93 3 50       13 if ($conf{'max_attempts'} < 1) {
94 0         0 die "Cannot be negative: max_attempts";
95             }
96 3 50       10 if (ref($conf{'punctuation'}) ne 'ARRAY') {
97 0         0 die "Not an array reference: punctuation";
98             }
99 3 50       4 if (scalar(@{$conf{'punctuation'}}) < 1) {
  3         8  
100 0         0 die "Array must contain at least one element: punctuation";
101             }
102 3 50       8 if ($conf{'list_file'} eq $conf{'hash_file'}) {
103 0         0 die "list_file and hash_file cannot point to the same file";
104             }
105              
106             # Return our blessed object
107 3         19 return bless({ conf=>\%conf, data=>\%data}, __PACKAGE__);
108             }
109              
110             # A destructor for the object, so that we get a chance to autosave the state
111             # if the caller so desired.
112             # Arguments: None.
113             # Returns: Nothing.
114             sub DESTROY {
115 3     3   14 my $self = shift;
116 3 50       9 die "Invalid object" unless (ref($self) eq __PACKAGE__);
117              
118 3 50       53 if ($self->{'conf'}{'autosave_on_destroy'}) {
119 0         0 $self->save;
120             }
121             }
122              
123             # A method for dumping the current state to files using Storable.
124             # Arguments: None.
125             # Returns: undef on failure, true on success.
126             sub save {
127 0     0 0 0 my $self = shift;
128 0 0       0 die "Invalid object" unless (ref($self) eq __PACKAGE__);
129              
130             # Since Storable can die on serious errors, or simply return an undef,
131             # we need to wrap these calls in evals
132 0         0 eval {
133 0 0       0 if (!store($self->{'data'}{'list'}, $self->{'conf'}{'list_file'})) {
134 0         0 $@ = 1;
135             }
136             };
137 0 0       0 if ($@) {
138 0         0 return undef;
139             }
140              
141 0         0 eval {
142 0 0       0 if (!store($self->{'data'}{'hash'}, $self->{'conf'}{'hash_file'})) {
143 0         0 $@ = 1;
144             }
145             };
146 0 0       0 if ($@) {
147 0         0 return undef;
148             }
149              
150 0         0 return 1;
151             }
152              
153             # A method for loading a previously saved state from files using Storable.
154             # Arguments: None.
155             # Returns: undef on failure, true on success
156             sub load {
157 0     0 0 0 my $self = shift;
158 0 0       0 die "Invalid object" unless (ref($self) eq __PACKAGE__);
159              
160             # Since Storable can die on serious errors, or simply return an undef,
161             # we need to wrap these calls in evals
162 0         0 my $ref;
163 0         0 eval {
164 0 0       0 if (!($ref = retrieve($self->{'conf'}{'list_file'}))) {
165 0         0 $@ = "Error retrieving list from " . $self->{'conf'}{'list_file'};
166             }
167             };
168 0 0       0 if ($@) {
169 0         0 return undef;
170             }
171 0         0 @{$self->{'data'}{'list'}} = @{$ref};
  0         0  
  0         0  
172              
173 0         0 eval {
174 0 0       0 if (!($ref = retrieve($self->{'conf'}{'hash_file'}))) {
175 0         0 $@ = "Error retrieving hash from " . $self->{'conf'}{'hash_file'};
176             }
177             };
178 0 0       0 if ($@) {
179 0         0 return undef;
180             }
181 0         0 %{$self->{'data'}{'hash'}} = %{$ref};
  0         0  
  0         0  
182              
183 0         0 return 1;
184             }
185              
186             # A method for adding a block of text to the current state.
187             # Arguments: Takes a scalar containing text to be added. Embedded newlines,
188             # random crap, et al are fine, they'll just be stripped out anyway.
189             # Returns: undef on failure, true on success. The only failure condition is
190             # currently if an invalid parameter is passed in.
191             sub add {
192 3     3 0 51 my $self = shift;
193 3 50       10 die "Invalid object" unless (ref($self) eq __PACKAGE__);
194              
195             # Make sure we actually got something to add
196 3         3 my $text = shift;
197 3 50       6 unless ($text) {
198 0         0 return undef;
199             }
200              
201             # If we don't care about case, lowercase the whole thing to start with
202 3 50       9 unless ($self->{'conf'}{'case_sensitive'}) {
203 0         0 $text = lc($text);
204             }
205              
206             # Split the text into component phrases, which we define as being delimited
207             # by the characters below. I left the comma out because it seems to lead
208             # to slightly more coherent results.
209 3         25 my @phrases = split /[.!?;]/, $text;
210 3         6 foreach my $phrase (@phrases) {
211              
212             # First, strip out any characters we don't want to deal with. We
213             # replace them with a space so that things like "the+dog" gets treated
214             # as "the dog".
215 6         102 $phrase =~ s/[^-a-zA-Z0-9 ']/ /g;
216              
217             # Trim leading and trailing whitespace, and see if we still have
218             # anything left.
219 6         15 $phrase =~ s/^\s+//;
220 6         65 $phrase =~ s/\s+$//;
221 6 100       18 next if $phrase eq "";
222              
223 3         4 my $last_word = 0;
224 3         26 my $idx = 0;
225             # Split the phrase into component words. We're splitting on simple
226             # whitespace here.
227 3         106 my @words = split /\s+/, $phrase;
228              
229             # First we're going to loop through the words and clean them up a bit.
230             # While we're at it, we're going to find the index of the last real
231             # word in this phrase.
232 3         16 foreach my $word (@words) {
233              
234             # Clean up the word a little bit. We allow hyphens and
235             # apostrophies to occur within words, but not at the beginning
236             # or ends of words.
237 318         498 $word =~ s/^\s+//;
238 318         361 $word =~ s/\s+$//;
239 318         299 $word =~ s/^-+//g;
240 318         309 $word =~ s/^'+//g;
241 318         336 $word =~ s/-+$//g;
242 318         505 $word =~ s/'+$//g;
243              
244             # Only allow the single-character words of 'a' and 'I'.
245             # FIXME - Need to be able to configure this so that persons with
246             # non-english texts can pick values that make sense.
247 318 50 66     656 if (length($word) == 1 && lc($word) ne "i" && lc($word) ne "a") {
      66        
248 0         0 $word = "";
249 0         0 $idx++;
250 0         0 next;
251             }
252              
253             # If this is a valid word, then mark this as a possible last word.
254 318 50       531 if ($word ne "") {
255 318         316 $last_word = $idx;
256             }
257 318         353 $idx++;
258             }
259              
260 3         14 $idx = 0;
261 3         3 my $new_index = 0;
262 3         4 my $old_index = 0;
263              
264             # Now we loop through the words, recording the transitions between them.
265 3         5 foreach my $word (@words) {
266              
267             # Shock shock, we're going to ignore non-existent words.
268 318 50       500 if ($word eq "") {
269 0         0 $idx++;
270 0         0 next;
271             }
272              
273             # If this is a new word that we've never seen before
274 318 100       654 if (!exists($self->{'data'}{'hash'}{$word})) {
275              
276             # Add this word to the end of the word list, and to the hash,
277             # taking care to record its index for the next loop iteration.
278 174         148 $new_index = scalar(@{$self->{'data'}{'list'}});
  174         270  
279 174         411 $self->{'data'}{'hash'}{$word} = $new_index;
280 174         214 push @{$self->{'data'}{'list'}}, {word => $word, num => []};
  174         630  
281              
282             # Add a transition from the previous word to this word.
283 174         269 push @{${$self->{'data'}{'list'}}[$old_index]{'num'}},
  174         150  
  174         391  
284             $new_index;
285              
286             # If this word happens to be the last in the phrase, add a -1
287             # to its possible transitions so that we have the possibility
288             # of ending sentences here.
289 174 50       328 if ($idx == $last_word) {
290 0         0 push @{${$self->{'data'}{'list'}}[$new_index]{'num'}}, -1;
  0         0  
  0         0  
291             }
292             }
293              
294             # If we've seen this word before
295             else {
296             # Record the index of this word for the next loop iteration,
297             # and add a transition from the previous word to this one.
298 144         220 $new_index = $self->{'data'}{'hash'}{$word};
299 144         140 push @{${$self->{'data'}{'list'}}[$old_index]{'num'}},
  144         137  
  144         311  
300             $new_index;
301              
302             # If this word happens to be the last in the phrase, add a -1
303             # to its possible transitions so that we have the possibility
304             # of ending sentences here.
305 144 50       324 if ($idx == $last_word) {
306 0         0 push @{${$self->{'data'}{'list'}}[$new_index]{'num'}}, -1;
  0         0  
  0         0  
307             }
308             }
309             # Move on to the next word.
310 318         405 $old_index = $new_index;
311             }
312             }
313              
314 3         14 return 1;
315             }
316              
317             # A function for generating a random line of text.
318             # Arguments: If no arguments, spew will try to generate a completely random
319             # sentence. If a string is passed in, spew will try to generate a
320             # random sentence beginning with the provided text.
321             # Returns: The generated string, or undef on any of several error conditions.
322             # Note that these error conditions are not fatal. They are:
323             # * At least (min_len * 10) words haven't been run through yet.
324             # (Must ->add() more text before trying again.)
325             # * A string was passed in containing nothing. (Don't do that.)
326             # * We don't know the last word in the sentence, and can therefore
327             # not generate a sentence with it. (Either teach us about it
328             # with ->add(), or try something else.)
329             # * A sentence of at least min_len words could not be generated,
330             # even after max_attempts tries at doing so. (Likely need to
331             # ->add() more text before trying again.)
332             #
333             sub spew {
334 6     6 0 63 my $self = shift;
335 6 50       14 die "Invalid object" unless (ref($self) eq __PACKAGE__);
336 6         9 my $text = shift;
337              
338             # If we don't have at least 10 * min_len words, we probably don't have a
339             # very good chance of making a sentence, so let's just return.
340 6 50       7 if (scalar(keys %{$self->{'data'}{'hash'}}) <
  6         30  
341             ($self->{'conf'}{'min_len'} * 10)) {
342 0         0 return undef;
343             }
344              
345 6         9 my $directed;
346             my $start;
347              
348             # If they passed in an argument, take a look at it.
349 6 100       11 if ($text) {
350 3         4 $directed = 1;
351              
352             # If we're case-insensitive, lowercase what they sent us.
353 3 50       8 unless ($self->{'conf'}{'case_sensitive'}) {
354 0         0 $text = lc($text);
355             }
356              
357             # Weed out unsavory characters.
358 3         7 $text =~ s/[^-a-zA-Z0-9 ']/ /gs;
359              
360             # Clean any long strings of whitespace to single spaces.
361 3         5 $text =~ s/\s+/ /g;
362              
363             # Remove leading and trailing whitespace.
364 3         7 $text =~ s/^\s+//;
365 3         6 $text =~ s/\s+$//;
366              
367             # If there's not a word left to talk about, return.
368 3 50       14 if ($text !~ /([-a-zA-Z0-9']+)$/) {
369 0         0 return undef;
370             }
371              
372             # If we don't know anything about this word, return.
373 3 50       3 if (!exists(${$self->{'data'}{'hash'}}{$1})) {
  3         14  
374 0         0 return undef;
375             }
376              
377             # Seems like a good starting place, so let's mark it.
378 3         4 $start = ${$self->{'data'}{'hash'}}{$1};
  3         8  
379             }
380             # They didn't pass an argument, so we're on our own.
381             else {
382 3         5 $directed = 0;
383              
384             # The 0th element in the list is 'special' in that no hash entry points
385             # to it, and it only contains pointers to words which are possible
386             # sentence starting points. Thus, let's grab a random entry out of the
387             # 0th element in the list and start there.
388 3         611 $start = ${${$self->{'data'}{'list'}}[0]{'num'}}[int rand scalar @{${$self->{'data'}{'list'}}[0]{'num'}}];
  3         4  
  3         64  
  3         4  
  3         9  
389 3         4 $text = ${$self->{'data'}{'list'}}[$start]{'word'};
  3         9  
390             }
391              
392             # Since we're dealing with randomness, we can't always be sure that we'll
393             # be able to make a sentence of min_len, so we just keep retrying up to
394             # max_attempts times, relying on sheer dumb luck to help us out. On a
395             # reasonably-sized body of text, this works perfectly fine.
396 6         9 my $attempts = 0;
397 6         7 my $count = 0;
398 6         7 my $final = "";
399 6         6 my $next = $start;
400              
401 6   66     35 while ($count < $self->{'conf'}{'min_len'} &&
402             $attempts < $self->{'conf'}{'max_attempts'}) {
403              
404             # We start out with one word, and uppercase the first character in our
405             # starting text.
406 6         6 $count = 1;
407 6         11 $final = "\u$text";
408 6         7 $next = $start;
409              
410             # Keep adding new words to this sentence until we hit an sentence end
411             # mark, or we hit max_len
412 6   100     27 while ($next != -1 && ($count < $self->{'conf'}{'max_len'})) {
413              
414             # If the word we're on has no transitions, count this as a stopping
415             # point, since we can't go any further.
416 154 100       146 if (scalar(@{${$self->{'data'}{'list'}}[$next]{'num'}}) < 1) {
  154         137  
  154         352  
417 2         3 $next = -1;
418             }
419             # Otherwise, randomly pick the word we'll visit next out of the
420             # list of possible transitions from our current word.
421             else {
422 152         134 $next = ${${$self->{'data'}{'list'}}[$next]{'num'}}[int rand scalar @{${$self->{'data'}{'list'}}[$next]{'num'}}];
  152         132  
  152         318  
  152         134  
  152         245  
423             }
424              
425             # If we're not at the end yet, add this word to our collected
426             # string, increment our word count, and do it all again.
427 154 100       296 if ($next != -1) {
428 152         129 $final .= " " . ${$self->{'data'}{'list'}}[$next]{'word'};
  152         312  
429 152         618 $count++;
430             }
431             }
432              
433             # If we failed to make a long enough sentence, we need to do something.
434 6 50       26 if ($count < $self->{'conf'}{'min_len'}) {
435              
436             # If we haven't yet passed our max number of attempts, try again.
437 0 0       0 if ($attempts < $self->{'conf'}{'max_attempts'}) {
438 0         0 $attempts++;
439 0         0 next;
440             }
441             # If we passed our max number of attempts, we can take one of two
442             # course of action.
443             else {
444             # If we're trying to talk about something in particular, we're
445             # always going to be stuck with the same starting point. Thus,
446             # there's not the best chance for continued success, so just
447             # give up and bail.
448 0 0       0 if ($directed) {
449 0         0 return undef;
450             }
451             # If we're talking about random things, we likely just got
452             # a bad starting point, so we'll pick a new random starting
453             # point, and do the whole thing over again.
454             else {
455 0         0 $attempts = 0;
456 0         0 $start = ${${$self->{'data'}{'list'}}[0]{'num'}}[int rand scalar @{${$self->{'data'}{'list'}}[0]{'num'}}];
  0         0  
  0         0  
  0         0  
  0         0  
457 0         0 $text = ${$self->{'data'}{'list'}}[$start]{'word'};
  0         0  
458 0         0 next;
459             }
460             }
461             }
462             }
463              
464             # If we're not case sensitive, make sure any I's by themselves are
465             # capitalized, for aesthetic purposes. If we are, they probably want
466             # things to come out the way they are.
467             # FIXME - Need to be able to configure this so that persons with
468             # non-english texts can pick values that make sense.
469 6 50       16 unless ($self->{'conf'}{'case_sensitive'}) {
470 0         0 $final =~ s/(^|[^\w-])i($|[^\w-])/$1I$2/g
471             }
472              
473             # Pick a random piece of punctuation to add to the end of the sentence.
474 6         7 $final .= ${$self->{'conf'}{'punctuation'}}[int rand scalar @{$self->{'conf'}{'punctuation'}}];
  6         15  
  6         10  
475              
476 6         20 return $final;
477             }
478              
479             # A method for getting some basic information about the current state.
480             # Arguments: None.
481             # Returns: In a scalar context, this function returns a string describing the
482             # current state. In a list context, this function returns a list
483             # containing two numbers -- the first one is the number of words
484             # that this object knows about, and the second one is the average
485             # number of transitions between words.
486             sub stats {
487 0     0 0   my $self = shift;
488 0 0         die "Invalid object" unless (ref($self) eq __PACKAGE__);
489              
490             # Get the number of words in our hash.
491 0           my $word_count = scalar keys %{$self->{'data'}{'hash'}};
  0            
492              
493             # If we've got no words, just quit now.
494 0 0         if ($word_count == 0) {
495 0 0         return wantarray ? (0,0) : "I don't know anything!";
496             }
497              
498             # Iterate over the list, adding up the number of transitions for each word.
499 0           my $average = 0;
500 0           foreach (@{$self->{'data'}{'list'}}) {
  0            
501 0 0         $average += scalar @{$_->{'num'}} if defined($_->{'num'});
  0            
502             }
503              
504             # Calculate an average, trim it to two decimal points, and return it.
505 0           $average /= $word_count;
506 0           $average = sprintf "%.2f", $average;
507 0 0         return wantarray ? ($word_count, $average) : "Wabby knows $word_count "
508             ."words, with an average of $average connections between each word.";
509             }
510              
511             1;
512              
513             __END__