File Coverage

blib/lib/Acme/Wabby.pm

Criterion	Covered	Total	%
statement	176	266	66.1
branch	55	116	47.4
condition	9	12	75.0
subroutine	16	19	84.2
pod	0	6	0.0
total	256	419	61.1

line	stmt	bran	cond	sub	pod	time	code
1							# Copyright 2004 Nathan Poznick. All rights reserved.
2							# Create semi-random sentences based upon a body of text.
3							# Distributed under the terms of the GPL Version 2
4
5							package Acme::Wabby;
6							$Acme::Wabby::VERSION='0.13';
7
8	1			1		538	use strict;
	1					1
	1					26
9	1			1		5	use warnings;
	1					1
	1					23
10	1			1		2166	use Storable;
	1					3511
	1					556
11
12	1			1		12	use vars qw($VERSION);
	1					2
	1					67
13
14							# Default values for various configurable settings
15	1			1		6	use constant DEF_CASE => 1;
	1					2
	1					83
16	1			1		6	use constant DEF_MIN_LEN => 3;
	1					1
	1					43
17	1			1		5	use constant DEF_MAX_LEN => 30;
	1					9
	1					51
18	1			1		5	use constant DEF_MAX_ATTEMPTS => 1000;
	1					1
	1					55
19	1			1		11	use constant DEF_PUNCTUATION => [".","?","!","..."];
	1					2
	1					40
20	1			1		5	use constant DEF_HASH_FILE => "./wabbyhash.dat";
	1					1
	1					44
21	1			1		5	use constant DEF_LIST_FILE => "./wabbylist.dat";
	1					2
	1					41
22	1			1		6	use constant DEF_AUTOSAVE => 0;
	1					1
	1					3882
23
24							# Constructor. Note that Acme::Wabby only supports an OO interface.
25							# Arguments: A reference to a hash containing configuration key/value pairs.
26							# Returns: A reference to the created object. Dies on error.
27							sub new {
28	3			3	0	54	my $self = shift;
29	3					7	my %conf;
30
31	3	100				12	if (@_ == 1) {
		50
32	1					2	my $ref = shift;
33	1	50				7	if (!ref($ref)) {
		50
34	0					0	die "Sanity check: One parameter passed, and it is not a reference";
35							}
36							elsif (ref($ref) eq "HASH") {
37	1					2	%conf = %{$ref};
	1					7
38							}
39							else {
40	0					0	die "Sanity check: One parameter passed, and it is not a hash reference";
41							}
42							}
43							elsif (@_ % 2 != 0) {
44	0					0	die "Sanity check: Odd number of parameters, expecting a hash";
45							}
46							else {
47	2					11	%conf = @_;
48							}
49
50							# Set up the initial state of our data structures
51	3					7	my %hash = ();
52	3					13	my @list = ({ word => "(null)", num => []});
53	3					10	my %data = ('hash' => \%hash, 'list' => \@list);
54
55							# Go through each of the configuration options, and if they were not
56							# explicitly set, assign to them the defaults.
57	3	100				11	if (!exists($conf{'case_sensitive'})) {
58	1					3	$conf{'case_sensitive'} = DEF_CASE;
59							}
60	3	100				7	if (!exists($conf{'min_len'})) {
61	1					2	$conf{'min_len'} = DEF_MIN_LEN;
62							}
63	3	100				8	if (!exists($conf{'max_len'})) {
64	1					2	$conf{'max_len'} = DEF_MAX_LEN;
65							}
66	3	100				7	if (!exists($conf{'max_attempts'})) {
67	1					2	$conf{'max_attempts'} = DEF_MAX_ATTEMPTS;
68							}
69	3	100				9	if (!exists($conf{'punctuation'})) {
70	1					2	$conf{'punctuation'} = DEF_PUNCTUATION;
71							}
72	3	100				7	if (!exists($conf{'hash_file'})) {
73	1					2	$conf{'hash_file'} = DEF_HASH_FILE;
74							}
75	3	100				7	if (!exists($conf{'list_file'})) {
76	1					2	$conf{'list_file'} = DEF_LIST_FILE;
77							}
78	3	100				9	if (!exists($conf{'autosave_on_destroy'})) {
79	1					4	$conf{'autosave_on_destroy'} = DEF_AUTOSAVE;
80							}
81
82							# Do some simple sanity checks on the values that they sent us
83	3	50				8	if ($conf{'min_len'} > $conf{'max_len'}) {
84	0					0	die "min_len ($conf{'min_len'}) cannot be larger than"
85							."max_len ($conf{'max_len'})";
86							}
87	3	50				14	if ($conf{'min_len'} < 1) {
88	0					0	die "Cannot be negative: min_len";
89							}
90	3	50				20	if ($conf{'max_len'} < 1) {
91	0					0	die "Cannot be negative: max_len";
92							}
93	3	50				13	if ($conf{'max_attempts'} < 1) {
94	0					0	die "Cannot be negative: max_attempts";
95							}
96	3	50				10	if (ref($conf{'punctuation'}) ne 'ARRAY') {
97	0					0	die "Not an array reference: punctuation";
98							}
99	3	50				4	if (scalar(@{$conf{'punctuation'}}) < 1) {
	3					8
100	0					0	die "Array must contain at least one element: punctuation";
101							}
102	3	50				8	if ($conf{'list_file'} eq $conf{'hash_file'}) {
103	0					0	die "list_file and hash_file cannot point to the same file";
104							}
105
106							# Return our blessed object
107	3					19	return bless({ conf=>\%conf, data=>\%data}, __PACKAGE__);
108							}
109
110							# A destructor for the object, so that we get a chance to autosave the state
111							# if the caller so desired.
112							# Arguments: None.
113							# Returns: Nothing.
114							sub DESTROY {
115	3			3		14	my $self = shift;
116	3	50				9	die "Invalid object" unless (ref($self) eq __PACKAGE__);
117
118	3	50				53	if ($self->{'conf'}{'autosave_on_destroy'}) {
119	0					0	$self->save;
120							}
121							}
122
123							# A method for dumping the current state to files using Storable.
124							# Arguments: None.
125							# Returns: undef on failure, true on success.
126							sub save {
127	0			0	0	0	my $self = shift;
128	0	0				0	die "Invalid object" unless (ref($self) eq __PACKAGE__);
129
130							# Since Storable can die on serious errors, or simply return an undef,
131							# we need to wrap these calls in evals
132	0					0	eval {
133	0	0				0	if (!store($self->{'data'}{'list'}, $self->{'conf'}{'list_file'})) {
134	0					0	$@ = 1;
135							}
136							};
137	0	0				0	if ($@) {
138	0					0	return undef;
139							}
140
141	0					0	eval {
142	0	0				0	if (!store($self->{'data'}{'hash'}, $self->{'conf'}{'hash_file'})) {
143	0					0	$@ = 1;
144							}
145							};
146	0	0				0	if ($@) {
147	0					0	return undef;
148							}
149
150	0					0	return 1;
151							}
152
153							# A method for loading a previously saved state from files using Storable.
154							# Arguments: None.
155							# Returns: undef on failure, true on success
156							sub load {
157	0			0	0	0	my $self = shift;
158	0	0				0	die "Invalid object" unless (ref($self) eq __PACKAGE__);
159
160							# Since Storable can die on serious errors, or simply return an undef,
161							# we need to wrap these calls in evals
162	0					0	my $ref;
163	0					0	eval {
164	0	0				0	if (!($ref = retrieve($self->{'conf'}{'list_file'}))) {
165	0					0	$@ = "Error retrieving list from " . $self->{'conf'}{'list_file'};
166							}
167							};
168	0	0				0	if ($@) {
169	0					0	return undef;
170							}
171	0					0	@{$self->{'data'}{'list'}} = @{$ref};
	0					0
	0					0
172
173	0					0	eval {
174	0	0				0	if (!($ref = retrieve($self->{'conf'}{'hash_file'}))) {
175	0					0	$@ = "Error retrieving hash from " . $self->{'conf'}{'hash_file'};
176							}
177							};
178	0	0				0	if ($@) {
179	0					0	return undef;
180							}
181	0					0	%{$self->{'data'}{'hash'}} = %{$ref};
	0					0
	0					0
182
183	0					0	return 1;
184							}
185
186							# A method for adding a block of text to the current state.
187							# Arguments: Takes a scalar containing text to be added. Embedded newlines,
188							# random crap, et al are fine, they'll just be stripped out anyway.
189							# Returns: undef on failure, true on success. The only failure condition is
190							# currently if an invalid parameter is passed in.
191							sub add {
192	3			3	0	51	my $self = shift;
193	3	50				10	die "Invalid object" unless (ref($self) eq __PACKAGE__);
194
195							# Make sure we actually got something to add
196	3					3	my $text = shift;
197	3	50				6	unless ($text) {
198	0					0	return undef;
199							}
200
201							# If we don't care about case, lowercase the whole thing to start with
202	3	50				9	unless ($self->{'conf'}{'case_sensitive'}) {
203	0					0	$text = lc($text);
204							}
205
206							# Split the text into component phrases, which we define as being delimited
207							# by the characters below. I left the comma out because it seems to lead
208							# to slightly more coherent results.
209	3					25	my @phrases = split /[.!?;]/, $text;
210	3					6	foreach my $phrase (@phrases) {
211
212							# First, strip out any characters we don't want to deal with. We
213							# replace them with a space so that things like "the+dog" gets treated
214							# as "the dog".
215	6					102	$phrase =~ s/[^-a-zA-Z0-9 ']/ /g;
216
217							# Trim leading and trailing whitespace, and see if we still have
218							# anything left.
219	6					15	$phrase =~ s/^\s+//;
220	6					65	$phrase =~ s/\s+$//;
221	6	100				18	next if $phrase eq "";
222
223	3					4	my $last_word = 0;
224	3					26	my $idx = 0;
225							# Split the phrase into component words. We're splitting on simple
226							# whitespace here.
227	3					106	my @words = split /\s+/, $phrase;
228
229							# First we're going to loop through the words and clean them up a bit.
230							# While we're at it, we're going to find the index of the last real
231							# word in this phrase.
232	3					16	foreach my $word (@words) {
233
234							# Clean up the word a little bit. We allow hyphens and
235							# apostrophies to occur within words, but not at the beginning
236							# or ends of words.
237	318					498	$word =~ s/^\s+//;
238	318					361	$word =~ s/\s+$//;
239	318					299	$word =~ s/^-+//g;
240	318					309	$word =~ s/^'+//g;
241	318					336	$word =~ s/-+$//g;
242	318					505	$word =~ s/'+$//g;
243
244							# Only allow the single-character words of 'a' and 'I'.
245							# FIXME - Need to be able to configure this so that persons with
246							# non-english texts can pick values that make sense.
247	318	50	66			656	if (length($word) == 1 && lc($word) ne "i" && lc($word) ne "a") {
			66
248	0					0	$word = "";
249	0					0	$idx++;
250	0					0	next;
251							}
252
253							# If this is a valid word, then mark this as a possible last word.
254	318	50				531	if ($word ne "") {
255	318					316	$last_word = $idx;
256							}
257	318					353	$idx++;
258							}
259
260	3					14	$idx = 0;
261	3					3	my $new_index = 0;
262	3					4	my $old_index = 0;
263
264							# Now we loop through the words, recording the transitions between them.
265	3					5	foreach my $word (@words) {
266
267							# Shock shock, we're going to ignore non-existent words.
268	318	50				500	if ($word eq "") {
269	0					0	$idx++;
270	0					0	next;
271							}
272
273							# If this is a new word that we've never seen before
274	318	100				654	if (!exists($self->{'data'}{'hash'}{$word})) {
275
276							# Add this word to the end of the word list, and to the hash,
277							# taking care to record its index for the next loop iteration.
278	174					148	$new_index = scalar(@{$self->{'data'}{'list'}});
	174					270
279	174					411	$self->{'data'}{'hash'}{$word} = $new_index;
280	174					214	push @{$self->{'data'}{'list'}}, {word => $word, num => []};
	174					630
281
282							# Add a transition from the previous word to this word.
283	174					269	push @{${$self->{'data'}{'list'}}[$old_index]{'num'}},
	174					150
	174					391
284							$new_index;
285
286							# If this word happens to be the last in the phrase, add a -1
287							# to its possible transitions so that we have the possibility
288							# of ending sentences here.
289	174	50				328	if ($idx == $last_word) {
290	0					0	push @{${$self->{'data'}{'list'}}[$new_index]{'num'}}, -1;
	0					0
	0					0
291							}
292							}
293
294							# If we've seen this word before
295							else {
296							# Record the index of this word for the next loop iteration,
297							# and add a transition from the previous word to this one.
298	144					220	$new_index = $self->{'data'}{'hash'}{$word};
299	144					140	push @{${$self->{'data'}{'list'}}[$old_index]{'num'}},
	144					137
	144					311
300							$new_index;
301
302							# If this word happens to be the last in the phrase, add a -1
303							# to its possible transitions so that we have the possibility
304							# of ending sentences here.
305	144	50				324	if ($idx == $last_word) {
306	0					0	push @{${$self->{'data'}{'list'}}[$new_index]{'num'}}, -1;
	0					0
	0					0
307							}
308							}
309							# Move on to the next word.
310	318					405	$old_index = $new_index;
311							}
312							}
313
314	3					14	return 1;
315							}
316
317							# A function for generating a random line of text.
318							# Arguments: If no arguments, spew will try to generate a completely random
319							# sentence. If a string is passed in, spew will try to generate a
320							# random sentence beginning with the provided text.
321							# Returns: The generated string, or undef on any of several error conditions.
322							# Note that these error conditions are not fatal. They are:
323							# * At least (min_len * 10) words haven't been run through yet.
324							# (Must ->add() more text before trying again.)
325							# * A string was passed in containing nothing. (Don't do that.)
326							# * We don't know the last word in the sentence, and can therefore
327							# not generate a sentence with it. (Either teach us about it
328							# with ->add(), or try something else.)
329							# * A sentence of at least min_len words could not be generated,
330							# even after max_attempts tries at doing so. (Likely need to
331							# ->add() more text before trying again.)
332							#
333							sub spew {
334	6			6	0	63	my $self = shift;
335	6	50				14	die "Invalid object" unless (ref($self) eq __PACKAGE__);
336	6					9	my $text = shift;
337
338							# If we don't have at least 10 * min_len words, we probably don't have a
339							# very good chance of making a sentence, so let's just return.
340	6	50				7	if (scalar(keys %{$self->{'data'}{'hash'}}) <
	6					30
341							($self->{'conf'}{'min_len'} * 10)) {
342	0					0	return undef;
343							}
344
345	6					9	my $directed;
346							my $start;
347
348							# If they passed in an argument, take a look at it.
349	6	100				11	if ($text) {
350	3					4	$directed = 1;
351
352							# If we're case-insensitive, lowercase what they sent us.
353	3	50				8	unless ($self->{'conf'}{'case_sensitive'}) {
354	0					0	$text = lc($text);
355							}
356
357							# Weed out unsavory characters.
358	3					7	$text =~ s/[^-a-zA-Z0-9 ']/ /gs;
359
360							# Clean any long strings of whitespace to single spaces.
361	3					5	$text =~ s/\s+/ /g;
362
363							# Remove leading and trailing whitespace.
364	3					7	$text =~ s/^\s+//;
365	3					6	$text =~ s/\s+$//;
366
367							# If there's not a word left to talk about, return.
368	3	50				14	if ($text !~ /([-a-zA-Z0-9']+)$/) {
369	0					0	return undef;
370							}
371
372							# If we don't know anything about this word, return.
373	3	50				3	if (!exists(${$self->{'data'}{'hash'}}{$1})) {
	3					14
374	0					0	return undef;
375							}
376
377							# Seems like a good starting place, so let's mark it.
378	3					4	$start = ${$self->{'data'}{'hash'}}{$1};
	3					8
379							}
380							# They didn't pass an argument, so we're on our own.
381							else {
382	3					5	$directed = 0;
383
384							# The 0th element in the list is 'special' in that no hash entry points
385							# to it, and it only contains pointers to words which are possible
386							# sentence starting points. Thus, let's grab a random entry out of the
387							# 0th element in the list and start there.
388	3					611	$start = ${${$self->{'data'}{'list'}}[0]{'num'}}[int rand scalar @{${$self->{'data'}{'list'}}[0]{'num'}}];
	3					4
	3					64
	3					4
	3					9
389	3					4	$text = ${$self->{'data'}{'list'}}[$start]{'word'};
	3					9
390							}
391
392							# Since we're dealing with randomness, we can't always be sure that we'll
393							# be able to make a sentence of min_len, so we just keep retrying up to
394							# max_attempts times, relying on sheer dumb luck to help us out. On a
395							# reasonably-sized body of text, this works perfectly fine.
396	6					9	my $attempts = 0;
397	6					7	my $count = 0;
398	6					7	my $final = "";
399	6					6	my $next = $start;
400
401	6		66			35	while ($count < $self->{'conf'}{'min_len'} &&
402							$attempts < $self->{'conf'}{'max_attempts'}) {
403
404							# We start out with one word, and uppercase the first character in our
405							# starting text.
406	6					6	$count = 1;
407	6					11	$final = "\u$text";
408	6					7	$next = $start;
409
410							# Keep adding new words to this sentence until we hit an sentence end
411							# mark, or we hit max_len
412	6		100			27	while ($next != -1 && ($count < $self->{'conf'}{'max_len'})) {
413
414							# If the word we're on has no transitions, count this as a stopping
415							# point, since we can't go any further.
416	154	100				146	if (scalar(@{${$self->{'data'}{'list'}}[$next]{'num'}}) < 1) {
	154					137
	154					352
417	2					3	$next = -1;
418							}
419							# Otherwise, randomly pick the word we'll visit next out of the
420							# list of possible transitions from our current word.
421							else {
422	152					134	$next = ${${$self->{'data'}{'list'}}[$next]{'num'}}[int rand scalar @{${$self->{'data'}{'list'}}[$next]{'num'}}];
	152					132
	152					318
	152					134
	152					245
423							}
424
425							# If we're not at the end yet, add this word to our collected
426							# string, increment our word count, and do it all again.
427	154	100				296	if ($next != -1) {
428	152					129	$final .= " " . ${$self->{'data'}{'list'}}[$next]{'word'};
	152					312
429	152					618	$count++;
430							}
431							}
432
433							# If we failed to make a long enough sentence, we need to do something.
434	6	50				26	if ($count < $self->{'conf'}{'min_len'}) {
435
436							# If we haven't yet passed our max number of attempts, try again.
437	0	0				0	if ($attempts < $self->{'conf'}{'max_attempts'}) {
438	0					0	$attempts++;
439	0					0	next;
440							}
441							# If we passed our max number of attempts, we can take one of two
442							# course of action.
443							else {
444							# If we're trying to talk about something in particular, we're
445							# always going to be stuck with the same starting point. Thus,
446							# there's not the best chance for continued success, so just
447							# give up and bail.
448	0	0				0	if ($directed) {
449	0					0	return undef;
450							}
451							# If we're talking about random things, we likely just got
452							# a bad starting point, so we'll pick a new random starting
453							# point, and do the whole thing over again.
454							else {
455	0					0	$attempts = 0;
456	0					0	$start = ${${$self->{'data'}{'list'}}[0]{'num'}}[int rand scalar @{${$self->{'data'}{'list'}}[0]{'num'}}];
	0					0
	0					0
	0					0
	0					0
457	0					0	$text = ${$self->{'data'}{'list'}}[$start]{'word'};
	0					0
458	0					0	next;
459							}
460							}
461							}
462							}
463
464							# If we're not case sensitive, make sure any I's by themselves are
465							# capitalized, for aesthetic purposes. If we are, they probably want
466							# things to come out the way they are.
467							# FIXME - Need to be able to configure this so that persons with
468							# non-english texts can pick values that make sense.
469	6	50				16	unless ($self->{'conf'}{'case_sensitive'}) {
470	0					0	$final =~ s/(^\|[^\w-])i($\|[^\w-])/$1I$2/g
471							}
472
473							# Pick a random piece of punctuation to add to the end of the sentence.
474	6					7	$final .= ${$self->{'conf'}{'punctuation'}}[int rand scalar @{$self->{'conf'}{'punctuation'}}];
	6					15
	6					10
475
476	6					20	return $final;
477							}
478
479							# A method for getting some basic information about the current state.
480							# Arguments: None.
481							# Returns: In a scalar context, this function returns a string describing the
482							# current state. In a list context, this function returns a list
483							# containing two numbers -- the first one is the number of words
484							# that this object knows about, and the second one is the average
485							# number of transitions between words.
486							sub stats {
487	0			0	0		my $self = shift;
488	0	0					die "Invalid object" unless (ref($self) eq __PACKAGE__);
489
490							# Get the number of words in our hash.
491	0						my $word_count = scalar keys %{$self->{'data'}{'hash'}};
	0
492
493							# If we've got no words, just quit now.
494	0	0					if ($word_count == 0) {
495	0	0					return wantarray ? (0,0) : "I don't know anything!";
496							}
497
498							# Iterate over the list, adding up the number of transitions for each word.
499	0						my $average = 0;
500	0						foreach (@{$self->{'data'}{'list'}}) {
	0
501	0	0					$average += scalar @{$_->{'num'}} if defined($_->{'num'});
	0
502							}
503
504							# Calculate an average, trim it to two decimal points, and return it.
505	0						$average /= $word_count;
506	0						$average = sprintf "%.2f", $average;
507	0	0					return wantarray ? ($word_count, $average) : "Wabby knows $word_count "
508							."words, with an average of $average connections between each word.";
509							}
510
511							1;
512
513							__END__