File Coverage

blib/lib/Text/Conversation.pm

Criterion	Covered	Total	%
statement	221	329	67.1
branch	38	98	38.7
condition	14	30	46.6
subroutine	37	46	80.4
pod	2	3	66.6
total	312	506	61.6

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Conversation;
2
3	1			1		7	use warnings;
	1					2
	1					26
4	1			1		5	use strict;
	1					2
	1					28
5
6	1			1		6	use vars qw($VERSION);
	1					1
	1					46
7							$VERSION = '0.050';
8
9	1			1		991	use Lingua::StopWords::EN qw(getStopWords);
	1					506
	1					78
10	1			1		861	use Lingua::Stem::Snowball qw(stem);
	1					2128
	1					66
11	1			1		936	use String::Approx qw(amatch adistr);
	1					5754
	1					87
12
13	1			1		11	use constant CT_STOPWORDS => 0;
	1					2
	1					82
14	1			1		6	use constant CT_CONTEXT => 1;
	1					2
	1					41
15	1			1		6	use constant CT_NICKS => 2;
	1					2
	1					37
16	1			1		6	use constant CT_IDS => 3;
	1					2
	1					38
17	1			1		6	use constant CT_CONTEXT_MAX => 4;
	1					2
	1					37
18	1			1		6	use constant CT_WORDS => 5;
	1					2
	1					45
19	1			1		5	use constant CT_WORDS_TOTAL => 6;
	1					2
	1					37
20	1			1		6	use constant CT_DEBUG => 7;
	1					7
	1					54
21
22	1			1		6	use constant CTX_ID => 0;
	1					2
	1					44
23	1			1		5	use constant CTX_NICK => 1;
	1					1
	1					45
24	1			1		5	use constant CTX_ADDRESSEE => 2;
	1					1
	1					37
25	1			1		5	use constant CTX_WORDS => 3;
	1					3
	1					44
26
27	1			1		5	use constant ID_REFERENT => 0;
	1					2
	1					39
28	1			1		5	use constant ID_REFERERS => 1;
	1					1
	1					41
29	1			1		5	use constant ID_TEXT => 2;
	1					2
	1					41
30	1			1		5	use constant ID_NICK => 3;
	1					1
	1					4710
31
32							### Manage scrollback.
33
34							# The next ID is package static so messages will be unique across all
35							# threaders.
36
37							my $next_id = "a";
38
39							sub new {
40	1			1	1	3	my ($class, %args) = @_;
41
42	1		50			10	$args{thread_buffer} \|\|= 30;
43
44	1		50			14	my $self = bless [
			50
45							undef, # CT_STOPWORDS
46							[ ], # CT_CONTEXT
47							{ }, # CT_NICKS
48							{ }, # CT_IDS
49							$args{thread_buffer} \|\| 30, # CT_CONTEXT_MAX
50							{ }, # CT_WORDS
51							0, # CT_WORDS_TOTAL
52							$args{debug} \|\| 0, # CT_DEBUG
53							], $class;
54
55							# Stem stopwords.
56
57	1					6	my $stopwords = getStopWords();
58	1					263	my %stopwords;
59
60	1					46	foreach my $stopword (keys %$stopwords) {
61							# $stopwords{$self->_word_stem($stopword)}++;
62							}
63
64	1					15	$self->[CT_STOPWORDS] = \%stopwords;
65
66	1					27	return $self;
67							}
68
69							#sub hear {
70							# my ($self, $nick, $ident, $host, $text) = @_;
71							#}
72							#
73							#sub see {
74							# my ($self, $nick, $ident, $host, $text) = @_;
75							#}
76							#
77							#sub rename {
78							# my ($self, $old_nick, $new_nick, $ident, $host) = @_;
79							#}
80							#
81							#sub arrival {
82							# my ($self, $nick, $ident, $host) = @_;
83							#}
84							#
85							#sub departure {
86							# my ($self, $nick, $ident, $host) = @_;
87							#}
88
89							sub observe {
90	3			3	1	6	my ($self, $nick, $text) = @_;
91
92							# IRC nicks are case-insensitive.
93	3					9	$nick = $self->_nick_fix($nick);
94
95	3	50				9	if ($self->[CT_DEBUG]) {
96	0					0	warn ">>>> <$nick> $text\n";
97							}
98
99							# Extract non-stopwords from spoken text.
100							#
101							# TODO - Determine stopwords dynamically from observed context. Or
102							# perhaps generate stopwords from some logs.
103							#
104							# Stem the words here, so they go into the system as stems.
105
106	3					6	my $words_text = lc($text);
107	3					13	$words_text =~ s/[^\w\s]+/ /g;
108
109	3					4	my %my_words;
110	3					12	foreach my $word (
	15					37
111	15					32	grep { ! exists $self->[CT_STOPWORDS]{$_} }
112	15					27	map { $self->_word_stem($_) }
113							grep { length() > 1 }
114							split /\s+/, $words_text
115							) {
116	15					30	$my_words{$word}++;
117							}
118
119	3					14	my @my_words = keys %my_words;
120
121							# Find explicit addressees.
122
123	3					8	my $addressee_text = lc($text);
124
125	3					5	my $best_addressee = "";
126	3					4	my $best_addressee_score = 0;
127
128							# Nickname starts the line.
129
130	3	50	33			36	if (
			33
131							($addressee_text =~ /^\s(\S+?)\s[:,]\s+/) or
132							($addressee_text =~ /^\s(\S+?)-\s+/) or
133							($addressee_text =~ /^\s*t\s+(\S+?)\s+/)
134							) {
135	3					6	my $test = $self->_nick_fix($1);
136	3	50				9	if ($self->[CT_DEBUG]) {
137	0					0	warn " pre($test)\n";
138							}
139	3					9	my ($best_nick, $best_nick_score) = $self->_nick_exists($test);
140
141							# Best addressee score is 3x because the nick is at the start.
142	3					6	$best_nick_score *= 3;
143
144	3	50				9	if ($best_nick_score > $best_addressee_score) {
145	0	0				0	if ($self->[CT_DEBUG]) {
146	0					0	warn(
147							" found $test ",
148							"($best_nick = $best_nick_score > $best_addressee_score)\n"
149							);
150							}
151	0					0	$best_addressee = $best_nick;
152	0					0	$best_addressee_score = $best_nick_score;
153							}
154							}
155
156							# Nickname ends the line.
157
158	3	50				62	if ($addressee_text =~ /[\s,](\S+?)[.?!'")\]\}\s]$/) {
159	3					8	my $test = $self->_nick_fix($1);
160	3	50				9	if ($self->[CT_DEBUG]) {
161	0					0	warn " post($test)\n";
162							}
163	3					9	my ($best_nick, $best_nick_score) = $self->_nick_exists($test);
164
165							# Best addressee score is 2x because the nick is at the end.
166	3					4	$best_nick_score *= 2;
167
168	3	50				8	if ($best_nick_score > $best_addressee_score) {
169	0	0				0	if ($self->[CT_DEBUG]) {
170	0					0	warn(
171							" found $test ",
172							"($best_nick = $best_nick_score > $best_addressee_score)\n"
173							);
174							}
175	0					0	$best_addressee = $best_nick;
176	0					0	$best_addressee_score = $best_nick_score;
177							}
178							}
179
180							# Nickname occurs somewhere in the middle.
181
182	3					17	while ($addressee_text =~ m/\s,\s(\S+?)\s[,!?.]\s/g) {
183	0					0	my $test = $self->_nick_fix($1);
184	0	0				0	if ($self->[CT_DEBUG]) {
185	0					0	warn " in($test)";
186							}
187	0					0	my ($best_nick, $best_nick_score) = $self->_nick_exists($test);
188
189	0	0				0	if ($best_nick_score > $best_addressee_score) {
190	0	0				0	if ($self->[CT_DEBUG]) {
191	0					0	warn(
192							" found $test ",
193							"($best_nick = $best_nick_score > $best_addressee_score)\n"
194							);
195							}
196	0					0	$best_addressee = $best_nick;
197	0					0	$best_addressee_score = $best_nick_score;
198							}
199							}
200
201	3	50				9	if ($self->[CT_DEBUG]) {
202	0					0	warn " best addressee score = $best_addressee_score\n";
203							}
204
205							# TODO - If an implied statement goes to nobody, then perhaps it's a
206							# continuation of the last statement they said?
207
208	3					3	my $seen_them_factor = 0;
209	3					4	my $seen_me_factor = 0;
210	3					3	my $seen_other_factor = 0;
211
212	3					4	my $best_score = 0;
213	3					3	my $best_index;
214
215	3					3	my $index = @{$self->[CT_CONTEXT]};
	3					5
216	3					9	while ($index--) {
217	3					6	my $context = $self->[CT_CONTEXT][$index];
218	3					4	my $them = $context->[CTX_NICK];
219
220							# Figure out speaker/them affinity.
221
222	3					10	my $affinity = $self->_nick_score($nick, $them);
223
224	3					8	my $match_factor = $self->_correlate_statements(
225							\@my_words, $context->[CTX_WORDS]
226							);
227
228	3					4	my $distance_factor = @{$self->[CT_CONTEXT]} - $index;
	3					7
229
230	3					4	my $addressee_score = 0;
231	3	50				7	$addressee_score = $best_addressee_score if $them eq $best_addressee;
232
233							# Weigh factors.
234
235	3					3	my $weighted_addressee = $addressee_score * 30;
236	3					5	my $weighted_affinity = $affinity * 45; # half addressee
237	3					4	my $weighted_match = $match_factor * 30;
238	3					4	my $weighted_seen_them = $seen_them_factor * -3;
239	3					3	my $weighted_seen_me = $seen_me_factor * -3;
240	3					4	my $weighted_distance = $distance_factor * -1;
241
242							# Calculate a weighted score.
243
244	3					5	my $score = (
245							$weighted_affinity +
246							$weighted_addressee +
247							$weighted_match +
248							$weighted_seen_them +
249							$weighted_seen_me +
250							$weighted_distance
251							);
252
253	3	100				8	if ($score > $best_score) {
254	1					1	$best_score = $score;
255	1					2	$best_index = $index;
256							}
257
258	3					59	my $out = sprintf(
259							( " aff(%9.3f) addr(%9.3f) match(%9.3f) " .
260							"sthem(%9.3f) sme(%9.3f) dst(%9.3f) " .
261							"score(%9.3f) best(%9.3f) "
262							),
263							$weighted_affinity, $weighted_addressee, $weighted_match,
264							$weighted_seen_them, $weighted_seen_me, $weighted_distance,
265							$score, $best_score,
266							);
267
268	3					11	$out .= substr(
269							$self->_id_get_text($context->[CTX_ID]), 0, 156 - length($out) - 2
270							);
271
272	3	50				14	if ($self->[CT_DEBUG]) {
273	0					0	warn $out, "\n";
274							}
275
276							# Serious penalties for passing people by.
277	3	50				7	if ($nick eq $them) {
278	0	0				0	if ($seen_other_factor) {
279	0					0	$seen_me_factor++;
280							}
281							}
282							else {
283	3					9	$seen_other_factor++;
284							}
285							}
286
287	3					9	return $self->process_match(
288							$best_index, $nick, $text, \%my_words, $best_score
289							);
290							}
291
292							sub _context_get_id {
293	1			1		2	my ($self, $index) = @_;
294	1					3	return $self->[CT_CONTEXT][$index][CTX_ID];
295							}
296
297							sub _context_get_nick {
298	1			1		2	my ($self, $index) = @_;
299	1					3	return $self->[CT_CONTEXT][$index][CTX_NICK];
300							}
301
302							### Manage seen nicks.
303
304							# Add a nickname to the database, or update the confidence between
305							# $nick and $addressee of an existing nickname.
306
307							sub _nick_add {
308	3			3		5	my ($self, $nick, $addressee, $confidence) = @_;
309
310							# Make sure the nick exists.
311	3	50				17	unless (exists $self->[CT_NICKS]{$nick}) {
312	3					7	$self->[CT_NICKS]{$nick} = { };
313							}
314
315	3	50	66			13	if (
316							defined($addressee) and
317							!exists($self->[CT_NICKS]{$addressee})
318							) {
319	0					0	$self->[CT_NICKS]{$addressee} = { };
320							}
321
322							# Decay everybody. This is a lousy O(N**2) problem.
323	3					3	foreach my $me (keys %{$self->[CT_NICKS]}) {
	3					10
324	6					6	foreach my $them (keys %{$self->[CT_NICKS]{$me}}) {
	6					23
325	0					0	$self->_nick_decay_link($me, $them);
326							}
327							}
328
329							# Average in the new confidence.
330	3	100				10	if (defined $addressee) {
331	1	50				5	if (exists $self->[CT_NICKS]{$addressee}{$nick}) {
332	0	0				0	if ($self->[CT_NICKS]{$addressee}{$nick} < $confidence) {
333	0					0	$self->[CT_NICKS]{$addressee}{$nick} = $confidence;
334							}
335							}
336							else {
337	1					3	$self->[CT_NICKS]{$addressee}{$nick} = $confidence;
338							}
339
340	1	50				3	if (exists $self->[CT_NICKS]{$nick}{$addressee}) {
341	0	0				0	if ($self->[CT_NICKS]{$nick}{$addressee} < $confidence) {
342	0					0	$self->[CT_NICKS]{$nick}{$addressee} = $confidence;
343							}
344							}
345							else {
346	1					4	$self->[CT_NICKS]{$nick}{$addressee} = $confidence;
347							}
348							}
349							}
350
351							sub _nick_decay_link {
352	0			0		0	my ($self, $me, $them) = @_;
353	0					0	$self->[CT_NICKS]{$me}{$them} /= 4;
354	0	0				0	if ($self->[CT_NICKS]{$me}{$them} < 1) {
355	0					0	delete $self->[CT_NICKS]{$me}{$them};
356							}
357							}
358
359							sub _nick_del {
360	0			0		0	my $nick = shift;
361							# Nothing?
362							}
363
364							# The score is the average of the speaker/other and other/speaker
365							# links. It must be a number from 0 through 1.
366
367							sub _nick_score {
368	3			3		4	my ($self, $speaker, $other) = @_;
369
370							# Speaker to other.
371
372	3					4	my $total_speaker_to_other = 0;
373	3					3	my $speaker_to_other = 0;
374
375	3	50	33			10	if (
376							exists($self->[CT_NICKS]{$speaker}) and
377							exists($self->[CT_NICKS]{$speaker}{$other})
378							) {
379	0					0	$speaker_to_other = $self->[CT_NICKS]{$speaker}{$other};
380	0					0	foreach my $audience (keys %{$self->[CT_NICKS]{$speaker}}) {
	0					0
381	0					0	$total_speaker_to_other += $self->[CT_NICKS]{$speaker}{$audience};
382							}
383							}
384
385							# Other from speaker.
386
387	3					12	my $total_other_to_speaker = 0;
388	3					3	my $other_to_speaker = 0;
389
390	3	50	33			17	if (
391							exists($self->[CT_NICKS]{$other}) and
392							exists($self->[CT_NICKS]{$other}{$speaker})
393							) {
394	0					0	$other_to_speaker = $self->[CT_NICKS]{$other}{$speaker};
395	0					0	foreach my $them (keys %{$self->[CT_NICKS]{$other}}) {
	0					0
396	0					0	$total_other_to_speaker += $self->[CT_NICKS]{$other}{$them};
397							}
398							}
399
400							# If the total of the totals is zero, then avoid the division by
401							# zero.
402
403	3					5	my $total_total = $total_speaker_to_other + $total_other_to_speaker;
404	3	50				9	return 0 unless $total_total;
405
406	0					0	return( ($speaker_to_other + $other_to_speaker) / $total_total );
407							}
408
409							sub _nick_fix {
410	9			9		18	my ($self, $nick) = @_;
411
412	9					15	my $fixed_nick = lc($nick);
413	9					15	$fixed_nick =~ s/^q\[(\S+)]$/$1/; # q[nick] remove the quotes
414	9					41	$fixed_nick =~ s/[^A-Za-z0-9]*$//; # remove trailing junk
415	9					22	$fixed_nick =~ s/^[^A-Za-z0-9]*//; # remove leading junk
416
417							# If it's all junk, return it lowercased.
418	9	100				25	$fixed_nick = lc($nick) unless length $fixed_nick;
419
420	9					19	return $fixed_nick;
421							}
422
423							# Does a nickname exist? Return a new new nickname and a number
424							# between 0 and 1 that tells how much the given nickname matches it.
425
426							sub _nick_exists {
427	6			6		8	my ($self, $nick) = @_;
428
429							# No match if nothing here. Keeps amatch() from bailing.
430	6					7	my @known_nicks = keys %{$self->[CT_NICKS]};
	6					15
431	6	100				18	return (undef, 0) unless @known_nicks;
432
433							# Often a nickname is a shortened version of some other. Sometimes
434							# it's an extended version of it. Other times it's a bastardization
435							# of a known nickname.
436							#
437							# Find all the nicknames that begin with the specified nickname.
438							#
439							# TODO - If there are none, try string distances? Is there a better
440							# way to hash string distances with lengths?
441
442	4					62	my @found = grep /^\Q$nick/, @known_nicks;
443
444							# Never did find nothin'.
445	4	50				17	return (undef, 0) unless @found;
446
447	0	0				0	if ($self->[CT_DEBUG]) {
448	0					0	warn " $nick matches (@found)\n";
449							}
450
451							# Find the best match out of the found matches. "Best match" is a
452							# combination of string distance and ratio of entered nick to match.
453
454	0					0	my @proximities = map { 1 - $_ } adistr($nick, @found);
	0					0
455
456	0	0				0	if ($self->[CT_DEBUG]) {
457	0					0	warn " $nick proximities (@proximities)\n";
458							}
459
460	0					0	my ($best_nick, $best_score) = ("", 0);
461	0					0	while (@found) {
462	0	0				0	die unless @found == @proximities;
463	0					0	my $match = shift @found;
464	0					0	my $prox = shift @proximities;
465
466							# Words closer to the input length score higher.
467							# Squared so it diminishes faster.
468	0					0	my $length_score = (length($nick) / length($match)) ** 2;
469
470	0					0	my $score = $prox * $length_score;
471	0	0				0	next if $score < $best_score;
472
473	0	0				0	if ($self->[CT_DEBUG]) {
474	0					0	warn " $prox * $length_score = $score\n";
475							}
476
477	0					0	$best_nick = $match;
478	0					0	$best_score = $score;
479							}
480
481	0	0				0	if ($best_nick) {
482	0	0				0	if ($self->[CT_DEBUG]) {
483	0					0	warn " $nick = $best_nick ($best_score)\n";
484							}
485	0					0	return ($best_nick, $best_score);
486							}
487
488	0	0				0	if ($self->[CT_DEBUG]) {
489	0					0	warn " $nick not found\n";
490							}
491	0					0	return (undef, 0);
492							}
493
494							### Manage known IDs.
495
496							sub _id_fully_qualified {
497	0			0		0	my ($self, $id) = @_;
498
499	0					0	my @key;
500	0					0	while ($id) {
501	0					0	unshift @key, $id;
502	0					0	$id = $self->[CT_IDS]{$id}[ID_REFERENT];
503							}
504
505	0					0	return join "/", @key;
506							}
507
508							sub _id_add {
509	3			3		7	my ($self, $id, $referent, $nick, $text) = @_;
510
511	3					12	$self->[CT_IDS]{$id} = [
512							$referent, # ID_REFERENT
513							[ ], # ID_REFERERS
514							$text, # ID_TEXT
515							$nick, # ID_NICK
516							];
517
518	3	100	66			13	if ($referent and exists $self->[CT_IDS]{$referent}) {
519	1					2	push @{$self->[CT_IDS]{$referent}[ID_REFERERS]}, $id;
	1					4
520							}
521							}
522
523							sub _id_del {
524	0			0		0	my ($self, $id) = @_;
525
526	0					0	my $old = delete $self->[CT_IDS]{$id};
527
528							# Fix the statement's kids to stop pointing at the parent.
529
530	0					0	foreach my $referer (@{$old->[ID_REFERERS]}) {
	0					0
531	0					0	$self->[CT_IDS]{$referer}[ID_REFERENT] = undef;
532							}
533							}
534
535							sub _id_get_referent {
536	0			0		0	my ($self, $id) = @_;
537	0					0	return $self->[CT_IDS]{$id}[ID_REFERENT];
538							}
539
540							sub _id_get_nick {
541	0			0		0	my ($self, $id) = @_;
542
543							# XXX - Happens when someone explicitly addresses a nickname that
544							# hasn't appeared yet.
545							#
546							# Use of uninitialized value in hash element at ChatThread.pm line 352.
547
548	0					0	return $self->[CT_IDS]{$id}[ID_NICK];
549							}
550
551							sub _id_exists {
552	0			0		0	my ($self, $id) = @_;
553	0					0	return exists $self->[CT_IDS]{$id};
554							}
555
556							sub _id_get_text {
557	3			3		5	my ($self, $id) = @_;
558	3					13	return $self->[CT_IDS]{$id}[ID_TEXT];
559							}
560
561							sub _id_list {
562	0			0		0	my $self = shift;
563
564							# We do this rather than keys of CT_IDS because it's always
565							# guaranteed to be in time order. That is, referents come before
566							# stuff that refers to them.
567	0					0	return map { $_->[CTX_ID] } @{$self->[CT_CONTEXT]};
	0					0
	0					0
568							}
569
570							# Fuzzy match text.
571							#
572							# The current idea is to return a number that represents how much of
573							# @$my_words matches %$their_words; Each matching word multiplied by
574							# a per-word score that reflects the uniqueness of that word.
575
576							sub _correlate_statements {
577	3			3		6	my ($self, $my_words, $their_words) = @_;
578
579	3					4	my $match_factor = 0;
580	3		50			8	my $total_words = @$my_words \|\| 1;
581
582	3					4	foreach my $my_word (@$my_words) {
583	11	100				26	next unless exists $their_words->{$my_word};
584	2					8	$match_factor += $self->_word_get_score($my_word);
585							}
586
587	3					8	return $match_factor / $total_words;
588							}
589
590							###
591
592							sub process_match {
593	3			3	0	8	my ($self, $index, $nick, $text, $my_words, $confidence) = @_;
594
595	3					4	my $id = $next_id++;
596
597	3					4	my ($referent, $addressee, $print_addressee);
598
599	3	100				8	if (defined $index) {
600	1					12	$referent = $self->_context_get_id($index);
601	1					6	$addressee = $self->_context_get_nick($index);
602
603							# If the person refers to themselves, refer them instead to
604							# whoever they were talking to previously.
605	1	50				12	if ($addressee eq $nick) {
606	0	0				0	if ($self->_id_exists($referent)) {
607	0					0	$referent = $self->_id_get_referent($referent);
608	0					0	$addressee = $self->_id_get_nick($referent);
609							}
610							else {
611	0					0	$referent = $addressee = undef;
612							}
613							}
614							}
615
616	3	100				8	if (defined $addressee) {
617	1					2	$print_addressee = $addressee;
618							}
619							else {
620	2					3	$print_addressee = "(nobody)";
621							}
622
623	3	50				9	if ($self->[CT_DEBUG]) {
624	0					0	warn "<<<< ($id) $nick -> $print_addressee : $text\n";
625							}
626
627							# XXX - _context_add() ?
628	3					4	push @{$self->[CT_CONTEXT]}, [
	3					9
629							$id, # CTX_ID
630							$nick, # CTX_NICK
631							$addressee, # CTX_ADDRESSEE
632							$my_words, # CTX_WORDS
633							$referent, # CTX_REFERENT
634							];
635
636	3					9	my $debug_text = "<$nick> $text";
637
638	3					10	$self->_nick_add($nick, $addressee, $confidence);
639	3					8	$self->_id_add($id, $referent, $nick, $debug_text);
640	3					40	$self->_words_add($my_words);
641
642							# XXX - _context_prune() ?
643							# XXX - Deleting the words here is a cheezy way to decay word
644							# importance over time.
645	3					7	while (@{$self->[CT_CONTEXT]} > $self->[CT_CONTEXT_MAX]) {
	3					11
646	0					0	my $old = shift @{$self->[CT_CONTEXT]};
	0					0
647	0					0	$self->_nick_del($old->[CTX_NICK]);
648	0					0	$self->_words_del($old->[CTX_WORDS]);
649	0					0	$self->_id_del($old->[CTX_ID]);
650							}
651
652							return (
653	3					16	$id, # new ID
654							$referent, # referent ID
655							$debug_text, # display text
656							);
657							}
658
659							### Manage words, for frequency and feature extraction.
660
661							sub _word_stem {
662	15			15		27	my ($self, $word) = @_;
663	15					34	my $stem = stem("en", $word);
664	15					770	return $stem;
665							}
666
667							sub _words_add {
668	3			3		6	my ($self, $words) = @_;
669
670	3					8	foreach my $word (keys %$words) {
671	15					16	$self->[CT_WORDS_TOTAL] += $words->{$word};
672	15					32	$self->[CT_WORDS]{$word} += $words->{$word};
673							}
674							}
675
676							sub _words_del {
677	0			0		0	my ($self, $words) = @_;
678
679							# XXX - Experimenting with building a huge corpus.
680	0					0	return;
681
682	0					0	foreach my $word (keys %$words) {
683	0					0	$self->[CT_WORDS_TOTAL] -= $words->{$word};
684	0					0	$self->[CT_WORDS]{$word} -= $words->{$word};
685	0	0				0	next if $self->[CT_WORDS]{$word} > 0;
686	0					0	delete $self->[CT_WORDS]{$word};
687							}
688							}
689
690							# The word's score increases as its frequency decreases.
691
692							sub _word_get_score {
693	2			2		3	my ($self, $stem) = @_;
694
695	2		50			6	my $word_count = $self->[CT_WORDS]{$stem} \|\| 0;
696	2		50			6	my $corpus_count = $self->[CT_WORDS_TOTAL] \|\| 1;
697
698	2					4	my $word_score = ($corpus_count - $word_count) / $corpus_count;
699	2	50				6	if (exists $self->[CT_STOPWORDS]{$stem}) {
700	0					0	$word_score /= 2;
701							}
702
703	2					6	return $word_score;
704							}
705
706							1;
707
708							__END__