line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Conversation; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
4
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
6
|
use vars qw($VERSION); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
46
|
|
7
|
|
|
|
|
|
|
$VERSION = '0.050'; |
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
991
|
use Lingua::StopWords::EN qw(getStopWords); |
|
1
|
|
|
|
|
506
|
|
|
1
|
|
|
|
|
78
|
|
10
|
1
|
|
|
1
|
|
861
|
use Lingua::Stem::Snowball qw(stem); |
|
1
|
|
|
|
|
2128
|
|
|
1
|
|
|
|
|
66
|
|
11
|
1
|
|
|
1
|
|
936
|
use String::Approx qw(amatch adistr); |
|
1
|
|
|
|
|
5754
|
|
|
1
|
|
|
|
|
87
|
|
12
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
11
|
use constant CT_STOPWORDS => 0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
82
|
|
14
|
1
|
|
|
1
|
|
6
|
use constant CT_CONTEXT => 1; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
41
|
|
15
|
1
|
|
|
1
|
|
6
|
use constant CT_NICKS => 2; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
16
|
1
|
|
|
1
|
|
6
|
use constant CT_IDS => 3; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
17
|
1
|
|
|
1
|
|
6
|
use constant CT_CONTEXT_MAX => 4; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
18
|
1
|
|
|
1
|
|
6
|
use constant CT_WORDS => 5; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
45
|
|
19
|
1
|
|
|
1
|
|
5
|
use constant CT_WORDS_TOTAL => 6; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
20
|
1
|
|
|
1
|
|
6
|
use constant CT_DEBUG => 7; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
54
|
|
21
|
|
|
|
|
|
|
|
22
|
1
|
|
|
1
|
|
6
|
use constant CTX_ID => 0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
23
|
1
|
|
|
1
|
|
5
|
use constant CTX_NICK => 1; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
45
|
|
24
|
1
|
|
|
1
|
|
5
|
use constant CTX_ADDRESSEE => 2; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
37
|
|
25
|
1
|
|
|
1
|
|
5
|
use constant CTX_WORDS => 3; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
44
|
|
26
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
5
|
use constant ID_REFERENT => 0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
28
|
1
|
|
|
1
|
|
5
|
use constant ID_REFERERS => 1; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
41
|
|
29
|
1
|
|
|
1
|
|
5
|
use constant ID_TEXT => 2; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
41
|
|
30
|
1
|
|
|
1
|
|
5
|
use constant ID_NICK => 3; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
4710
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
### Manage scrollback. |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# The next ID is package static so messages will be unique across all |
35
|
|
|
|
|
|
|
# threaders. |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $next_id = "a"; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub new { |
40
|
1
|
|
|
1
|
1
|
3
|
my ($class, %args) = @_; |
41
|
|
|
|
|
|
|
|
42
|
1
|
|
50
|
|
|
10
|
$args{thread_buffer} ||= 30; |
43
|
|
|
|
|
|
|
|
44
|
1
|
|
50
|
|
|
14
|
my $self = bless [ |
|
|
|
50
|
|
|
|
|
45
|
|
|
|
|
|
|
undef, # CT_STOPWORDS |
46
|
|
|
|
|
|
|
[ ], # CT_CONTEXT |
47
|
|
|
|
|
|
|
{ }, # CT_NICKS |
48
|
|
|
|
|
|
|
{ }, # CT_IDS |
49
|
|
|
|
|
|
|
$args{thread_buffer} || 30, # CT_CONTEXT_MAX |
50
|
|
|
|
|
|
|
{ }, # CT_WORDS |
51
|
|
|
|
|
|
|
0, # CT_WORDS_TOTAL |
52
|
|
|
|
|
|
|
$args{debug} || 0, # CT_DEBUG |
53
|
|
|
|
|
|
|
], $class; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Stem stopwords. |
56
|
|
|
|
|
|
|
|
57
|
1
|
|
|
|
|
6
|
my $stopwords = getStopWords(); |
58
|
1
|
|
|
|
|
263
|
my %stopwords; |
59
|
|
|
|
|
|
|
|
60
|
1
|
|
|
|
|
46
|
foreach my $stopword (keys %$stopwords) { |
61
|
|
|
|
|
|
|
# $stopwords{$self->_word_stem($stopword)}++; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
1
|
|
|
|
|
15
|
$self->[CT_STOPWORDS] = \%stopwords; |
65
|
|
|
|
|
|
|
|
66
|
1
|
|
|
|
|
27
|
return $self; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
#sub hear { |
70
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host, $text) = @_; |
71
|
|
|
|
|
|
|
#} |
72
|
|
|
|
|
|
|
# |
73
|
|
|
|
|
|
|
#sub see { |
74
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host, $text) = @_; |
75
|
|
|
|
|
|
|
#} |
76
|
|
|
|
|
|
|
# |
77
|
|
|
|
|
|
|
#sub rename { |
78
|
|
|
|
|
|
|
# my ($self, $old_nick, $new_nick, $ident, $host) = @_; |
79
|
|
|
|
|
|
|
#} |
80
|
|
|
|
|
|
|
# |
81
|
|
|
|
|
|
|
#sub arrival { |
82
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host) = @_; |
83
|
|
|
|
|
|
|
#} |
84
|
|
|
|
|
|
|
# |
85
|
|
|
|
|
|
|
#sub departure { |
86
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host) = @_; |
87
|
|
|
|
|
|
|
#} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub observe { |
90
|
3
|
|
|
3
|
1
|
6
|
my ($self, $nick, $text) = @_; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# IRC nicks are case-insensitive. |
93
|
3
|
|
|
|
|
9
|
$nick = $self->_nick_fix($nick); |
94
|
|
|
|
|
|
|
|
95
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
96
|
0
|
|
|
|
|
0
|
warn ">>>> <$nick> $text\n"; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Extract non-stopwords from spoken text. |
100
|
|
|
|
|
|
|
# |
101
|
|
|
|
|
|
|
# TODO - Determine stopwords dynamically from observed context. Or |
102
|
|
|
|
|
|
|
# perhaps generate stopwords from some logs. |
103
|
|
|
|
|
|
|
# |
104
|
|
|
|
|
|
|
# Stem the words here, so they go into the system as stems. |
105
|
|
|
|
|
|
|
|
106
|
3
|
|
|
|
|
6
|
my $words_text = lc($text); |
107
|
3
|
|
|
|
|
13
|
$words_text =~ s/[^\w\s]+/ /g; |
108
|
|
|
|
|
|
|
|
109
|
3
|
|
|
|
|
4
|
my %my_words; |
110
|
3
|
|
|
|
|
12
|
foreach my $word ( |
|
15
|
|
|
|
|
37
|
|
111
|
15
|
|
|
|
|
32
|
grep { ! exists $self->[CT_STOPWORDS]{$_} } |
112
|
15
|
|
|
|
|
27
|
map { $self->_word_stem($_) } |
113
|
|
|
|
|
|
|
grep { length() > 1 } |
114
|
|
|
|
|
|
|
split /\s+/, $words_text |
115
|
|
|
|
|
|
|
) { |
116
|
15
|
|
|
|
|
30
|
$my_words{$word}++; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
3
|
|
|
|
|
14
|
my @my_words = keys %my_words; |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Find explicit addressees. |
122
|
|
|
|
|
|
|
|
123
|
3
|
|
|
|
|
8
|
my $addressee_text = lc($text); |
124
|
|
|
|
|
|
|
|
125
|
3
|
|
|
|
|
5
|
my $best_addressee = ""; |
126
|
3
|
|
|
|
|
4
|
my $best_addressee_score = 0; |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# Nickname starts the line. |
129
|
|
|
|
|
|
|
|
130
|
3
|
50
|
33
|
|
|
36
|
if ( |
|
|
|
33
|
|
|
|
|
131
|
|
|
|
|
|
|
($addressee_text =~ /^\s*(\S+?)\s*[:,]\s+/) or |
132
|
|
|
|
|
|
|
($addressee_text =~ /^\s*(\S+?)-*\s+/) or |
133
|
|
|
|
|
|
|
($addressee_text =~ /^\s*t\s+(\S+?)\s+/) |
134
|
|
|
|
|
|
|
) { |
135
|
3
|
|
|
|
|
6
|
my $test = $self->_nick_fix($1); |
136
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
137
|
0
|
|
|
|
|
0
|
warn " pre($test)\n"; |
138
|
|
|
|
|
|
|
} |
139
|
3
|
|
|
|
|
9
|
my ($best_nick, $best_nick_score) = $self->_nick_exists($test); |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Best addressee score is 3x because the nick is at the start. |
142
|
3
|
|
|
|
|
6
|
$best_nick_score *= 3; |
143
|
|
|
|
|
|
|
|
144
|
3
|
50
|
|
|
|
9
|
if ($best_nick_score > $best_addressee_score) { |
145
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
146
|
0
|
|
|
|
|
0
|
warn( |
147
|
|
|
|
|
|
|
" found $test ", |
148
|
|
|
|
|
|
|
"($best_nick = $best_nick_score > $best_addressee_score)\n" |
149
|
|
|
|
|
|
|
); |
150
|
|
|
|
|
|
|
} |
151
|
0
|
|
|
|
|
0
|
$best_addressee = $best_nick; |
152
|
0
|
|
|
|
|
0
|
$best_addressee_score = $best_nick_score; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# Nickname ends the line. |
157
|
|
|
|
|
|
|
|
158
|
3
|
50
|
|
|
|
62
|
if ($addressee_text =~ /[\s,]*(\S+?)[.?!'")\]\}\s]*$/) { |
159
|
3
|
|
|
|
|
8
|
my $test = $self->_nick_fix($1); |
160
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
161
|
0
|
|
|
|
|
0
|
warn " post($test)\n"; |
162
|
|
|
|
|
|
|
} |
163
|
3
|
|
|
|
|
9
|
my ($best_nick, $best_nick_score) = $self->_nick_exists($test); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Best addressee score is 2x because the nick is at the end. |
166
|
3
|
|
|
|
|
4
|
$best_nick_score *= 2; |
167
|
|
|
|
|
|
|
|
168
|
3
|
50
|
|
|
|
8
|
if ($best_nick_score > $best_addressee_score) { |
169
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
170
|
0
|
|
|
|
|
0
|
warn( |
171
|
|
|
|
|
|
|
" found $test ", |
172
|
|
|
|
|
|
|
"($best_nick = $best_nick_score > $best_addressee_score)\n" |
173
|
|
|
|
|
|
|
); |
174
|
|
|
|
|
|
|
} |
175
|
0
|
|
|
|
|
0
|
$best_addressee = $best_nick; |
176
|
0
|
|
|
|
|
0
|
$best_addressee_score = $best_nick_score; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# Nickname occurs somewhere in the middle. |
181
|
|
|
|
|
|
|
|
182
|
3
|
|
|
|
|
17
|
while ($addressee_text =~ m/\s*,\s*(\S+?)\s*[,!?.]\s*/g) { |
183
|
0
|
|
|
|
|
0
|
my $test = $self->_nick_fix($1); |
184
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
185
|
0
|
|
|
|
|
0
|
warn " in($test)"; |
186
|
|
|
|
|
|
|
} |
187
|
0
|
|
|
|
|
0
|
my ($best_nick, $best_nick_score) = $self->_nick_exists($test); |
188
|
|
|
|
|
|
|
|
189
|
0
|
0
|
|
|
|
0
|
if ($best_nick_score > $best_addressee_score) { |
190
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
191
|
0
|
|
|
|
|
0
|
warn( |
192
|
|
|
|
|
|
|
" found $test ", |
193
|
|
|
|
|
|
|
"($best_nick = $best_nick_score > $best_addressee_score)\n" |
194
|
|
|
|
|
|
|
); |
195
|
|
|
|
|
|
|
} |
196
|
0
|
|
|
|
|
0
|
$best_addressee = $best_nick; |
197
|
0
|
|
|
|
|
0
|
$best_addressee_score = $best_nick_score; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
202
|
0
|
|
|
|
|
0
|
warn " best addressee score = $best_addressee_score\n"; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# TODO - If an implied statement goes to nobody, then perhaps it's a |
206
|
|
|
|
|
|
|
# continuation of the last statement they said? |
207
|
|
|
|
|
|
|
|
208
|
3
|
|
|
|
|
3
|
my $seen_them_factor = 0; |
209
|
3
|
|
|
|
|
4
|
my $seen_me_factor = 0; |
210
|
3
|
|
|
|
|
3
|
my $seen_other_factor = 0; |
211
|
|
|
|
|
|
|
|
212
|
3
|
|
|
|
|
4
|
my $best_score = 0; |
213
|
3
|
|
|
|
|
3
|
my $best_index; |
214
|
|
|
|
|
|
|
|
215
|
3
|
|
|
|
|
3
|
my $index = @{$self->[CT_CONTEXT]}; |
|
3
|
|
|
|
|
5
|
|
216
|
3
|
|
|
|
|
9
|
while ($index--) { |
217
|
3
|
|
|
|
|
6
|
my $context = $self->[CT_CONTEXT][$index]; |
218
|
3
|
|
|
|
|
4
|
my $them = $context->[CTX_NICK]; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Figure out speaker/them affinity. |
221
|
|
|
|
|
|
|
|
222
|
3
|
|
|
|
|
10
|
my $affinity = $self->_nick_score($nick, $them); |
223
|
|
|
|
|
|
|
|
224
|
3
|
|
|
|
|
8
|
my $match_factor = $self->_correlate_statements( |
225
|
|
|
|
|
|
|
\@my_words, $context->[CTX_WORDS] |
226
|
|
|
|
|
|
|
); |
227
|
|
|
|
|
|
|
|
228
|
3
|
|
|
|
|
4
|
my $distance_factor = @{$self->[CT_CONTEXT]} - $index; |
|
3
|
|
|
|
|
7
|
|
229
|
|
|
|
|
|
|
|
230
|
3
|
|
|
|
|
4
|
my $addressee_score = 0; |
231
|
3
|
50
|
|
|
|
7
|
$addressee_score = $best_addressee_score if $them eq $best_addressee; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Weigh factors. |
234
|
|
|
|
|
|
|
|
235
|
3
|
|
|
|
|
3
|
my $weighted_addressee = $addressee_score * 30; |
236
|
3
|
|
|
|
|
5
|
my $weighted_affinity = $affinity * 45; # half addressee |
237
|
3
|
|
|
|
|
4
|
my $weighted_match = $match_factor * 30; |
238
|
3
|
|
|
|
|
4
|
my $weighted_seen_them = $seen_them_factor * -3; |
239
|
3
|
|
|
|
|
3
|
my $weighted_seen_me = $seen_me_factor * -3; |
240
|
3
|
|
|
|
|
4
|
my $weighted_distance = $distance_factor * -1; |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Calculate a weighted score. |
243
|
|
|
|
|
|
|
|
244
|
3
|
|
|
|
|
5
|
my $score = ( |
245
|
|
|
|
|
|
|
$weighted_affinity + |
246
|
|
|
|
|
|
|
$weighted_addressee + |
247
|
|
|
|
|
|
|
$weighted_match + |
248
|
|
|
|
|
|
|
$weighted_seen_them + |
249
|
|
|
|
|
|
|
$weighted_seen_me + |
250
|
|
|
|
|
|
|
$weighted_distance |
251
|
|
|
|
|
|
|
); |
252
|
|
|
|
|
|
|
|
253
|
3
|
100
|
|
|
|
8
|
if ($score > $best_score) { |
254
|
1
|
|
|
|
|
1
|
$best_score = $score; |
255
|
1
|
|
|
|
|
2
|
$best_index = $index; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
3
|
|
|
|
|
59
|
my $out = sprintf( |
259
|
|
|
|
|
|
|
( " aff(%9.3f) addr(%9.3f) match(%9.3f) " . |
260
|
|
|
|
|
|
|
"sthem(%9.3f) sme(%9.3f) dst(%9.3f) " . |
261
|
|
|
|
|
|
|
"score(%9.3f) best(%9.3f) " |
262
|
|
|
|
|
|
|
), |
263
|
|
|
|
|
|
|
$weighted_affinity, $weighted_addressee, $weighted_match, |
264
|
|
|
|
|
|
|
$weighted_seen_them, $weighted_seen_me, $weighted_distance, |
265
|
|
|
|
|
|
|
$score, $best_score, |
266
|
|
|
|
|
|
|
); |
267
|
|
|
|
|
|
|
|
268
|
3
|
|
|
|
|
11
|
$out .= substr( |
269
|
|
|
|
|
|
|
$self->_id_get_text($context->[CTX_ID]), 0, 156 - length($out) - 2 |
270
|
|
|
|
|
|
|
); |
271
|
|
|
|
|
|
|
|
272
|
3
|
50
|
|
|
|
14
|
if ($self->[CT_DEBUG]) { |
273
|
0
|
|
|
|
|
0
|
warn $out, "\n"; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# Serious penalties for passing people by. |
277
|
3
|
50
|
|
|
|
7
|
if ($nick eq $them) { |
278
|
0
|
0
|
|
|
|
0
|
if ($seen_other_factor) { |
279
|
0
|
|
|
|
|
0
|
$seen_me_factor++; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
else { |
283
|
3
|
|
|
|
|
9
|
$seen_other_factor++; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
3
|
|
|
|
|
9
|
return $self->process_match( |
288
|
|
|
|
|
|
|
$best_index, $nick, $text, \%my_words, $best_score |
289
|
|
|
|
|
|
|
); |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _context_get_id { |
293
|
1
|
|
|
1
|
|
2
|
my ($self, $index) = @_; |
294
|
1
|
|
|
|
|
3
|
return $self->[CT_CONTEXT][$index][CTX_ID]; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
sub _context_get_nick { |
298
|
1
|
|
|
1
|
|
2
|
my ($self, $index) = @_; |
299
|
1
|
|
|
|
|
3
|
return $self->[CT_CONTEXT][$index][CTX_NICK]; |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
### Manage seen nicks. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
# Add a nickname to the database, or update the confidence between |
305
|
|
|
|
|
|
|
# $nick and $addressee of an existing nickname. |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub _nick_add { |
308
|
3
|
|
|
3
|
|
5
|
my ($self, $nick, $addressee, $confidence) = @_; |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# Make sure the nick exists. |
311
|
3
|
50
|
|
|
|
17
|
unless (exists $self->[CT_NICKS]{$nick}) { |
312
|
3
|
|
|
|
|
7
|
$self->[CT_NICKS]{$nick} = { }; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
3
|
50
|
66
|
|
|
13
|
if ( |
316
|
|
|
|
|
|
|
defined($addressee) and |
317
|
|
|
|
|
|
|
!exists($self->[CT_NICKS]{$addressee}) |
318
|
|
|
|
|
|
|
) { |
319
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$addressee} = { }; |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# Decay everybody. This is a lousy O(N**2) problem. |
323
|
3
|
|
|
|
|
3
|
foreach my $me (keys %{$self->[CT_NICKS]}) { |
|
3
|
|
|
|
|
10
|
|
324
|
6
|
|
|
|
|
6
|
foreach my $them (keys %{$self->[CT_NICKS]{$me}}) { |
|
6
|
|
|
|
|
23
|
|
325
|
0
|
|
|
|
|
0
|
$self->_nick_decay_link($me, $them); |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Average in the new confidence. |
330
|
3
|
100
|
|
|
|
10
|
if (defined $addressee) { |
331
|
1
|
50
|
|
|
|
5
|
if (exists $self->[CT_NICKS]{$addressee}{$nick}) { |
332
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_NICKS]{$addressee}{$nick} < $confidence) { |
333
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$addressee}{$nick} = $confidence; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
else { |
337
|
1
|
|
|
|
|
3
|
$self->[CT_NICKS]{$addressee}{$nick} = $confidence; |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
|
340
|
1
|
50
|
|
|
|
3
|
if (exists $self->[CT_NICKS]{$nick}{$addressee}) { |
341
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_NICKS]{$nick}{$addressee} < $confidence) { |
342
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$nick}{$addressee} = $confidence; |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
else { |
346
|
1
|
|
|
|
|
4
|
$self->[CT_NICKS]{$nick}{$addressee} = $confidence; |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
sub _nick_decay_link { |
352
|
0
|
|
|
0
|
|
0
|
my ($self, $me, $them) = @_; |
353
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$me}{$them} /= 4; |
354
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_NICKS]{$me}{$them} < 1) { |
355
|
0
|
|
|
|
|
0
|
delete $self->[CT_NICKS]{$me}{$them}; |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
sub _nick_del { |
360
|
0
|
|
|
0
|
|
0
|
my $nick = shift; |
361
|
|
|
|
|
|
|
# Nothing? |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# The score is the average of the speaker/other and other/speaker |
365
|
|
|
|
|
|
|
# links. It must be a number from 0 through 1. |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub _nick_score { |
368
|
3
|
|
|
3
|
|
4
|
my ($self, $speaker, $other) = @_; |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Speaker to other. |
371
|
|
|
|
|
|
|
|
372
|
3
|
|
|
|
|
4
|
my $total_speaker_to_other = 0; |
373
|
3
|
|
|
|
|
3
|
my $speaker_to_other = 0; |
374
|
|
|
|
|
|
|
|
375
|
3
|
50
|
33
|
|
|
10
|
if ( |
376
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$speaker}) and |
377
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$speaker}{$other}) |
378
|
|
|
|
|
|
|
) { |
379
|
0
|
|
|
|
|
0
|
$speaker_to_other = $self->[CT_NICKS]{$speaker}{$other}; |
380
|
0
|
|
|
|
|
0
|
foreach my $audience (keys %{$self->[CT_NICKS]{$speaker}}) { |
|
0
|
|
|
|
|
0
|
|
381
|
0
|
|
|
|
|
0
|
$total_speaker_to_other += $self->[CT_NICKS]{$speaker}{$audience}; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
# Other from speaker. |
386
|
|
|
|
|
|
|
|
387
|
3
|
|
|
|
|
12
|
my $total_other_to_speaker = 0; |
388
|
3
|
|
|
|
|
3
|
my $other_to_speaker = 0; |
389
|
|
|
|
|
|
|
|
390
|
3
|
50
|
33
|
|
|
17
|
if ( |
391
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$other}) and |
392
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$other}{$speaker}) |
393
|
|
|
|
|
|
|
) { |
394
|
0
|
|
|
|
|
0
|
$other_to_speaker = $self->[CT_NICKS]{$other}{$speaker}; |
395
|
0
|
|
|
|
|
0
|
foreach my $them (keys %{$self->[CT_NICKS]{$other}}) { |
|
0
|
|
|
|
|
0
|
|
396
|
0
|
|
|
|
|
0
|
$total_other_to_speaker += $self->[CT_NICKS]{$other}{$them}; |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# If the total of the totals is zero, then avoid the division by |
401
|
|
|
|
|
|
|
# zero. |
402
|
|
|
|
|
|
|
|
403
|
3
|
|
|
|
|
5
|
my $total_total = $total_speaker_to_other + $total_other_to_speaker; |
404
|
3
|
50
|
|
|
|
9
|
return 0 unless $total_total; |
405
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
0
|
return( ($speaker_to_other + $other_to_speaker) / $total_total ); |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
sub _nick_fix { |
410
|
9
|
|
|
9
|
|
18
|
my ($self, $nick) = @_; |
411
|
|
|
|
|
|
|
|
412
|
9
|
|
|
|
|
15
|
my $fixed_nick = lc($nick); |
413
|
9
|
|
|
|
|
15
|
$fixed_nick =~ s/^q\[(\S+)]$/$1/; # q[nick] remove the quotes |
414
|
9
|
|
|
|
|
41
|
$fixed_nick =~ s/[^A-Za-z0-9]*$//; # remove trailing junk |
415
|
9
|
|
|
|
|
22
|
$fixed_nick =~ s/^[^A-Za-z0-9]*//; # remove leading junk |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# If it's all junk, return it lowercased. |
418
|
9
|
100
|
|
|
|
25
|
$fixed_nick = lc($nick) unless length $fixed_nick; |
419
|
|
|
|
|
|
|
|
420
|
9
|
|
|
|
|
19
|
return $fixed_nick; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# Does a nickname exist? Return a new new nickname and a number |
424
|
|
|
|
|
|
|
# between 0 and 1 that tells how much the given nickname matches it. |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub _nick_exists { |
427
|
6
|
|
|
6
|
|
8
|
my ($self, $nick) = @_; |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
# No match if nothing here. Keeps amatch() from bailing. |
430
|
6
|
|
|
|
|
7
|
my @known_nicks = keys %{$self->[CT_NICKS]}; |
|
6
|
|
|
|
|
15
|
|
431
|
6
|
100
|
|
|
|
18
|
return (undef, 0) unless @known_nicks; |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Often a nickname is a shortened version of some other. Sometimes |
434
|
|
|
|
|
|
|
# it's an extended version of it. Other times it's a bastardization |
435
|
|
|
|
|
|
|
# of a known nickname. |
436
|
|
|
|
|
|
|
# |
437
|
|
|
|
|
|
|
# Find all the nicknames that begin with the specified nickname. |
438
|
|
|
|
|
|
|
# |
439
|
|
|
|
|
|
|
# TODO - If there are none, try string distances? Is there a better |
440
|
|
|
|
|
|
|
# way to hash string distances with lengths? |
441
|
|
|
|
|
|
|
|
442
|
4
|
|
|
|
|
62
|
my @found = grep /^\Q$nick/, @known_nicks; |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Never did find nothin'. |
445
|
4
|
50
|
|
|
|
17
|
return (undef, 0) unless @found; |
446
|
|
|
|
|
|
|
|
447
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
448
|
0
|
|
|
|
|
0
|
warn " $nick matches (@found)\n"; |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# Find the best match out of the found matches. "Best match" is a |
452
|
|
|
|
|
|
|
# combination of string distance and ratio of entered nick to match. |
453
|
|
|
|
|
|
|
|
454
|
0
|
|
|
|
|
0
|
my @proximities = map { 1 - $_ } adistr($nick, @found); |
|
0
|
|
|
|
|
0
|
|
455
|
|
|
|
|
|
|
|
456
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
457
|
0
|
|
|
|
|
0
|
warn " $nick proximities (@proximities)\n"; |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
0
|
|
|
|
|
0
|
my ($best_nick, $best_score) = ("", 0); |
461
|
0
|
|
|
|
|
0
|
while (@found) { |
462
|
0
|
0
|
|
|
|
0
|
die unless @found == @proximities; |
463
|
0
|
|
|
|
|
0
|
my $match = shift @found; |
464
|
0
|
|
|
|
|
0
|
my $prox = shift @proximities; |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# Words closer to the input length score higher. |
467
|
|
|
|
|
|
|
# Squared so it diminishes faster. |
468
|
0
|
|
|
|
|
0
|
my $length_score = (length($nick) / length($match)) ** 2; |
469
|
|
|
|
|
|
|
|
470
|
0
|
|
|
|
|
0
|
my $score = $prox * $length_score; |
471
|
0
|
0
|
|
|
|
0
|
next if $score < $best_score; |
472
|
|
|
|
|
|
|
|
473
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
474
|
0
|
|
|
|
|
0
|
warn " $prox * $length_score = $score\n"; |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
0
|
|
|
|
|
0
|
$best_nick = $match; |
478
|
0
|
|
|
|
|
0
|
$best_score = $score; |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
|
481
|
0
|
0
|
|
|
|
0
|
if ($best_nick) { |
482
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
483
|
0
|
|
|
|
|
0
|
warn " $nick = $best_nick ($best_score)\n"; |
484
|
|
|
|
|
|
|
} |
485
|
0
|
|
|
|
|
0
|
return ($best_nick, $best_score); |
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
|
488
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
489
|
0
|
|
|
|
|
0
|
warn " $nick not found\n"; |
490
|
|
|
|
|
|
|
} |
491
|
0
|
|
|
|
|
0
|
return (undef, 0); |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
### Manage known IDs. |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
sub _id_fully_qualified { |
497
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
498
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
0
|
my @key; |
500
|
0
|
|
|
|
|
0
|
while ($id) { |
501
|
0
|
|
|
|
|
0
|
unshift @key, $id; |
502
|
0
|
|
|
|
|
0
|
$id = $self->[CT_IDS]{$id}[ID_REFERENT]; |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
0
|
|
|
|
|
0
|
return join "/", @key; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
sub _id_add { |
509
|
3
|
|
|
3
|
|
7
|
my ($self, $id, $referent, $nick, $text) = @_; |
510
|
|
|
|
|
|
|
|
511
|
3
|
|
|
|
|
12
|
$self->[CT_IDS]{$id} = [ |
512
|
|
|
|
|
|
|
$referent, # ID_REFERENT |
513
|
|
|
|
|
|
|
[ ], # ID_REFERERS |
514
|
|
|
|
|
|
|
$text, # ID_TEXT |
515
|
|
|
|
|
|
|
$nick, # ID_NICK |
516
|
|
|
|
|
|
|
]; |
517
|
|
|
|
|
|
|
|
518
|
3
|
100
|
66
|
|
|
13
|
if ($referent and exists $self->[CT_IDS]{$referent}) { |
519
|
1
|
|
|
|
|
2
|
push @{$self->[CT_IDS]{$referent}[ID_REFERERS]}, $id; |
|
1
|
|
|
|
|
4
|
|
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
sub _id_del { |
524
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
525
|
|
|
|
|
|
|
|
526
|
0
|
|
|
|
|
0
|
my $old = delete $self->[CT_IDS]{$id}; |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# Fix the statement's kids to stop pointing at the parent. |
529
|
|
|
|
|
|
|
|
530
|
0
|
|
|
|
|
0
|
foreach my $referer (@{$old->[ID_REFERERS]}) { |
|
0
|
|
|
|
|
0
|
|
531
|
0
|
|
|
|
|
0
|
$self->[CT_IDS]{$referer}[ID_REFERENT] = undef; |
532
|
|
|
|
|
|
|
} |
533
|
|
|
|
|
|
|
} |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
sub _id_get_referent { |
536
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
537
|
0
|
|
|
|
|
0
|
return $self->[CT_IDS]{$id}[ID_REFERENT]; |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
sub _id_get_nick { |
541
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# XXX - Happens when someone explicitly addresses a nickname that |
544
|
|
|
|
|
|
|
# hasn't appeared yet. |
545
|
|
|
|
|
|
|
# |
546
|
|
|
|
|
|
|
# Use of uninitialized value in hash element at ChatThread.pm line 352. |
547
|
|
|
|
|
|
|
|
548
|
0
|
|
|
|
|
0
|
return $self->[CT_IDS]{$id}[ID_NICK]; |
549
|
|
|
|
|
|
|
} |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
sub _id_exists { |
552
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
553
|
0
|
|
|
|
|
0
|
return exists $self->[CT_IDS]{$id}; |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
sub _id_get_text { |
557
|
3
|
|
|
3
|
|
5
|
my ($self, $id) = @_; |
558
|
3
|
|
|
|
|
13
|
return $self->[CT_IDS]{$id}[ID_TEXT]; |
559
|
|
|
|
|
|
|
} |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
sub _id_list { |
562
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# We do this rather than keys of CT_IDS because it's always |
565
|
|
|
|
|
|
|
# guaranteed to be in time order. That is, referents come before |
566
|
|
|
|
|
|
|
# stuff that refers to them. |
567
|
0
|
|
|
|
|
0
|
return map { $_->[CTX_ID] } @{$self->[CT_CONTEXT]}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
# Fuzzy match text. |
571
|
|
|
|
|
|
|
# |
572
|
|
|
|
|
|
|
# The current idea is to return a number that represents how much of |
573
|
|
|
|
|
|
|
# @$my_words matches %$their_words; Each matching word multiplied by |
574
|
|
|
|
|
|
|
# a per-word score that reflects the uniqueness of that word. |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
sub _correlate_statements { |
577
|
3
|
|
|
3
|
|
6
|
my ($self, $my_words, $their_words) = @_; |
578
|
|
|
|
|
|
|
|
579
|
3
|
|
|
|
|
4
|
my $match_factor = 0; |
580
|
3
|
|
50
|
|
|
8
|
my $total_words = @$my_words || 1; |
581
|
|
|
|
|
|
|
|
582
|
3
|
|
|
|
|
4
|
foreach my $my_word (@$my_words) { |
583
|
11
|
100
|
|
|
|
26
|
next unless exists $their_words->{$my_word}; |
584
|
2
|
|
|
|
|
8
|
$match_factor += $self->_word_get_score($my_word); |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
|
587
|
3
|
|
|
|
|
8
|
return $match_factor / $total_words; |
588
|
|
|
|
|
|
|
} |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
### |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub process_match { |
593
|
3
|
|
|
3
|
0
|
8
|
my ($self, $index, $nick, $text, $my_words, $confidence) = @_; |
594
|
|
|
|
|
|
|
|
595
|
3
|
|
|
|
|
4
|
my $id = $next_id++; |
596
|
|
|
|
|
|
|
|
597
|
3
|
|
|
|
|
4
|
my ($referent, $addressee, $print_addressee); |
598
|
|
|
|
|
|
|
|
599
|
3
|
100
|
|
|
|
8
|
if (defined $index) { |
600
|
1
|
|
|
|
|
12
|
$referent = $self->_context_get_id($index); |
601
|
1
|
|
|
|
|
6
|
$addressee = $self->_context_get_nick($index); |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
# If the person refers to themselves, refer them instead to |
604
|
|
|
|
|
|
|
# whoever they were talking to previously. |
605
|
1
|
50
|
|
|
|
12
|
if ($addressee eq $nick) { |
606
|
0
|
0
|
|
|
|
0
|
if ($self->_id_exists($referent)) { |
607
|
0
|
|
|
|
|
0
|
$referent = $self->_id_get_referent($referent); |
608
|
0
|
|
|
|
|
0
|
$addressee = $self->_id_get_nick($referent); |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
else { |
611
|
0
|
|
|
|
|
0
|
$referent = $addressee = undef; |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
} |
615
|
|
|
|
|
|
|
|
616
|
3
|
100
|
|
|
|
8
|
if (defined $addressee) { |
617
|
1
|
|
|
|
|
2
|
$print_addressee = $addressee; |
618
|
|
|
|
|
|
|
} |
619
|
|
|
|
|
|
|
else { |
620
|
2
|
|
|
|
|
3
|
$print_addressee = "(nobody)"; |
621
|
|
|
|
|
|
|
} |
622
|
|
|
|
|
|
|
|
623
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
624
|
0
|
|
|
|
|
0
|
warn "<<<< ($id) $nick -> $print_addressee : $text\n"; |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# XXX - _context_add() ? |
628
|
3
|
|
|
|
|
4
|
push @{$self->[CT_CONTEXT]}, [ |
|
3
|
|
|
|
|
9
|
|
629
|
|
|
|
|
|
|
$id, # CTX_ID |
630
|
|
|
|
|
|
|
$nick, # CTX_NICK |
631
|
|
|
|
|
|
|
$addressee, # CTX_ADDRESSEE |
632
|
|
|
|
|
|
|
$my_words, # CTX_WORDS |
633
|
|
|
|
|
|
|
$referent, # CTX_REFERENT |
634
|
|
|
|
|
|
|
]; |
635
|
|
|
|
|
|
|
|
636
|
3
|
|
|
|
|
9
|
my $debug_text = "<$nick> $text"; |
637
|
|
|
|
|
|
|
|
638
|
3
|
|
|
|
|
10
|
$self->_nick_add($nick, $addressee, $confidence); |
639
|
3
|
|
|
|
|
8
|
$self->_id_add($id, $referent, $nick, $debug_text); |
640
|
3
|
|
|
|
|
40
|
$self->_words_add($my_words); |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# XXX - _context_prune() ? |
643
|
|
|
|
|
|
|
# XXX - Deleting the words here is a cheezy way to decay word |
644
|
|
|
|
|
|
|
# importance over time. |
645
|
3
|
|
|
|
|
7
|
while (@{$self->[CT_CONTEXT]} > $self->[CT_CONTEXT_MAX]) { |
|
3
|
|
|
|
|
11
|
|
646
|
0
|
|
|
|
|
0
|
my $old = shift @{$self->[CT_CONTEXT]}; |
|
0
|
|
|
|
|
0
|
|
647
|
0
|
|
|
|
|
0
|
$self->_nick_del($old->[CTX_NICK]); |
648
|
0
|
|
|
|
|
0
|
$self->_words_del($old->[CTX_WORDS]); |
649
|
0
|
|
|
|
|
0
|
$self->_id_del($old->[CTX_ID]); |
650
|
|
|
|
|
|
|
} |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
return ( |
653
|
3
|
|
|
|
|
16
|
$id, # new ID |
654
|
|
|
|
|
|
|
$referent, # referent ID |
655
|
|
|
|
|
|
|
$debug_text, # display text |
656
|
|
|
|
|
|
|
); |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
### Manage words, for frequency and feature extraction. |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
sub _word_stem { |
662
|
15
|
|
|
15
|
|
27
|
my ($self, $word) = @_; |
663
|
15
|
|
|
|
|
34
|
my $stem = stem("en", $word); |
664
|
15
|
|
|
|
|
770
|
return $stem; |
665
|
|
|
|
|
|
|
} |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
sub _words_add { |
668
|
3
|
|
|
3
|
|
6
|
my ($self, $words) = @_; |
669
|
|
|
|
|
|
|
|
670
|
3
|
|
|
|
|
8
|
foreach my $word (keys %$words) { |
671
|
15
|
|
|
|
|
16
|
$self->[CT_WORDS_TOTAL] += $words->{$word}; |
672
|
15
|
|
|
|
|
32
|
$self->[CT_WORDS]{$word} += $words->{$word}; |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
sub _words_del { |
677
|
0
|
|
|
0
|
|
0
|
my ($self, $words) = @_; |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
# XXX - Experimenting with building a huge corpus. |
680
|
0
|
|
|
|
|
0
|
return; |
681
|
|
|
|
|
|
|
|
682
|
0
|
|
|
|
|
0
|
foreach my $word (keys %$words) { |
683
|
0
|
|
|
|
|
0
|
$self->[CT_WORDS_TOTAL] -= $words->{$word}; |
684
|
0
|
|
|
|
|
0
|
$self->[CT_WORDS]{$word} -= $words->{$word}; |
685
|
0
|
0
|
|
|
|
0
|
next if $self->[CT_WORDS]{$word} > 0; |
686
|
0
|
|
|
|
|
0
|
delete $self->[CT_WORDS]{$word}; |
687
|
|
|
|
|
|
|
} |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
# The word's score increases as its frequency decreases. |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
sub _word_get_score { |
693
|
2
|
|
|
2
|
|
3
|
my ($self, $stem) = @_; |
694
|
|
|
|
|
|
|
|
695
|
2
|
|
50
|
|
|
6
|
my $word_count = $self->[CT_WORDS]{$stem} || 0; |
696
|
2
|
|
50
|
|
|
6
|
my $corpus_count = $self->[CT_WORDS_TOTAL] || 1; |
697
|
|
|
|
|
|
|
|
698
|
2
|
|
|
|
|
4
|
my $word_score = ($corpus_count - $word_count) / $corpus_count; |
699
|
2
|
50
|
|
|
|
6
|
if (exists $self->[CT_STOPWORDS]{$stem}) { |
700
|
0
|
|
|
|
|
0
|
$word_score /= 2; |
701
|
|
|
|
|
|
|
} |
702
|
|
|
|
|
|
|
|
703
|
2
|
|
|
|
|
6
|
return $word_score; |
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
1; |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
__END__ |