| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Conversation; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
26
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
28
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
6
|
use vars qw($VERSION); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
46
|
|
|
7
|
|
|
|
|
|
|
$VERSION = '0.050'; |
|
8
|
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
991
|
use Lingua::StopWords::EN qw(getStopWords); |
|
|
1
|
|
|
|
|
506
|
|
|
|
1
|
|
|
|
|
78
|
|
|
10
|
1
|
|
|
1
|
|
861
|
use Lingua::Stem::Snowball qw(stem); |
|
|
1
|
|
|
|
|
2128
|
|
|
|
1
|
|
|
|
|
66
|
|
|
11
|
1
|
|
|
1
|
|
936
|
use String::Approx qw(amatch adistr); |
|
|
1
|
|
|
|
|
5754
|
|
|
|
1
|
|
|
|
|
87
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
11
|
use constant CT_STOPWORDS => 0; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
82
|
|
|
14
|
1
|
|
|
1
|
|
6
|
use constant CT_CONTEXT => 1; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
41
|
|
|
15
|
1
|
|
|
1
|
|
6
|
use constant CT_NICKS => 2; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
16
|
1
|
|
|
1
|
|
6
|
use constant CT_IDS => 3; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
38
|
|
|
17
|
1
|
|
|
1
|
|
6
|
use constant CT_CONTEXT_MAX => 4; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
18
|
1
|
|
|
1
|
|
6
|
use constant CT_WORDS => 5; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
45
|
|
|
19
|
1
|
|
|
1
|
|
5
|
use constant CT_WORDS_TOTAL => 6; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
20
|
1
|
|
|
1
|
|
6
|
use constant CT_DEBUG => 7; |
|
|
1
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
54
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
1
|
|
|
1
|
|
6
|
use constant CTX_ID => 0; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
44
|
|
|
23
|
1
|
|
|
1
|
|
5
|
use constant CTX_NICK => 1; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
45
|
|
|
24
|
1
|
|
|
1
|
|
5
|
use constant CTX_ADDRESSEE => 2; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
37
|
|
|
25
|
1
|
|
|
1
|
|
5
|
use constant CTX_WORDS => 3; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
44
|
|
|
26
|
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
5
|
use constant ID_REFERENT => 0; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
39
|
|
|
28
|
1
|
|
|
1
|
|
5
|
use constant ID_REFERERS => 1; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
41
|
|
|
29
|
1
|
|
|
1
|
|
5
|
use constant ID_TEXT => 2; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
41
|
|
|
30
|
1
|
|
|
1
|
|
5
|
use constant ID_NICK => 3; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
4710
|
|
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
### Manage scrollback. |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# The next ID is package static so messages will be unique across all |
|
35
|
|
|
|
|
|
|
# threaders. |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $next_id = "a"; |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub new { |
|
40
|
1
|
|
|
1
|
1
|
3
|
my ($class, %args) = @_; |
|
41
|
|
|
|
|
|
|
|
|
42
|
1
|
|
50
|
|
|
10
|
$args{thread_buffer} ||= 30; |
|
43
|
|
|
|
|
|
|
|
|
44
|
1
|
|
50
|
|
|
14
|
my $self = bless [ |
|
|
|
|
50
|
|
|
|
|
|
45
|
|
|
|
|
|
|
undef, # CT_STOPWORDS |
|
46
|
|
|
|
|
|
|
[ ], # CT_CONTEXT |
|
47
|
|
|
|
|
|
|
{ }, # CT_NICKS |
|
48
|
|
|
|
|
|
|
{ }, # CT_IDS |
|
49
|
|
|
|
|
|
|
$args{thread_buffer} || 30, # CT_CONTEXT_MAX |
|
50
|
|
|
|
|
|
|
{ }, # CT_WORDS |
|
51
|
|
|
|
|
|
|
0, # CT_WORDS_TOTAL |
|
52
|
|
|
|
|
|
|
$args{debug} || 0, # CT_DEBUG |
|
53
|
|
|
|
|
|
|
], $class; |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Stem stopwords. |
|
56
|
|
|
|
|
|
|
|
|
57
|
1
|
|
|
|
|
6
|
my $stopwords = getStopWords(); |
|
58
|
1
|
|
|
|
|
263
|
my %stopwords; |
|
59
|
|
|
|
|
|
|
|
|
60
|
1
|
|
|
|
|
46
|
foreach my $stopword (keys %$stopwords) { |
|
61
|
|
|
|
|
|
|
# $stopwords{$self->_word_stem($stopword)}++; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
|
|
|
|
|
|
|
|
64
|
1
|
|
|
|
|
15
|
$self->[CT_STOPWORDS] = \%stopwords; |
|
65
|
|
|
|
|
|
|
|
|
66
|
1
|
|
|
|
|
27
|
return $self; |
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
#sub hear { |
|
70
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host, $text) = @_; |
|
71
|
|
|
|
|
|
|
#} |
|
72
|
|
|
|
|
|
|
# |
|
73
|
|
|
|
|
|
|
#sub see { |
|
74
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host, $text) = @_; |
|
75
|
|
|
|
|
|
|
#} |
|
76
|
|
|
|
|
|
|
# |
|
77
|
|
|
|
|
|
|
#sub rename { |
|
78
|
|
|
|
|
|
|
# my ($self, $old_nick, $new_nick, $ident, $host) = @_; |
|
79
|
|
|
|
|
|
|
#} |
|
80
|
|
|
|
|
|
|
# |
|
81
|
|
|
|
|
|
|
#sub arrival { |
|
82
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host) = @_; |
|
83
|
|
|
|
|
|
|
#} |
|
84
|
|
|
|
|
|
|
# |
|
85
|
|
|
|
|
|
|
#sub departure { |
|
86
|
|
|
|
|
|
|
# my ($self, $nick, $ident, $host) = @_; |
|
87
|
|
|
|
|
|
|
#} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub observe { |
|
90
|
3
|
|
|
3
|
1
|
6
|
my ($self, $nick, $text) = @_; |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# IRC nicks are case-insensitive. |
|
93
|
3
|
|
|
|
|
9
|
$nick = $self->_nick_fix($nick); |
|
94
|
|
|
|
|
|
|
|
|
95
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
|
96
|
0
|
|
|
|
|
0
|
warn ">>>> <$nick> $text\n"; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Extract non-stopwords from spoken text. |
|
100
|
|
|
|
|
|
|
# |
|
101
|
|
|
|
|
|
|
# TODO - Determine stopwords dynamically from observed context. Or |
|
102
|
|
|
|
|
|
|
# perhaps generate stopwords from some logs. |
|
103
|
|
|
|
|
|
|
# |
|
104
|
|
|
|
|
|
|
# Stem the words here, so they go into the system as stems. |
|
105
|
|
|
|
|
|
|
|
|
106
|
3
|
|
|
|
|
6
|
my $words_text = lc($text); |
|
107
|
3
|
|
|
|
|
13
|
$words_text =~ s/[^\w\s]+/ /g; |
|
108
|
|
|
|
|
|
|
|
|
109
|
3
|
|
|
|
|
4
|
my %my_words; |
|
110
|
3
|
|
|
|
|
12
|
foreach my $word ( |
|
|
15
|
|
|
|
|
37
|
|
|
111
|
15
|
|
|
|
|
32
|
grep { ! exists $self->[CT_STOPWORDS]{$_} } |
|
112
|
15
|
|
|
|
|
27
|
map { $self->_word_stem($_) } |
|
113
|
|
|
|
|
|
|
grep { length() > 1 } |
|
114
|
|
|
|
|
|
|
split /\s+/, $words_text |
|
115
|
|
|
|
|
|
|
) { |
|
116
|
15
|
|
|
|
|
30
|
$my_words{$word}++; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
|
|
119
|
3
|
|
|
|
|
14
|
my @my_words = keys %my_words; |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Find explicit addressees. |
|
122
|
|
|
|
|
|
|
|
|
123
|
3
|
|
|
|
|
8
|
my $addressee_text = lc($text); |
|
124
|
|
|
|
|
|
|
|
|
125
|
3
|
|
|
|
|
5
|
my $best_addressee = ""; |
|
126
|
3
|
|
|
|
|
4
|
my $best_addressee_score = 0; |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# Nickname starts the line. |
|
129
|
|
|
|
|
|
|
|
|
130
|
3
|
50
|
33
|
|
|
36
|
if ( |
|
|
|
|
33
|
|
|
|
|
|
131
|
|
|
|
|
|
|
($addressee_text =~ /^\s*(\S+?)\s*[:,]\s+/) or |
|
132
|
|
|
|
|
|
|
($addressee_text =~ /^\s*(\S+?)-*\s+/) or |
|
133
|
|
|
|
|
|
|
($addressee_text =~ /^\s*t\s+(\S+?)\s+/) |
|
134
|
|
|
|
|
|
|
) { |
|
135
|
3
|
|
|
|
|
6
|
my $test = $self->_nick_fix($1); |
|
136
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
|
137
|
0
|
|
|
|
|
0
|
warn " pre($test)\n"; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
3
|
|
|
|
|
9
|
my ($best_nick, $best_nick_score) = $self->_nick_exists($test); |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Best addressee score is 3x because the nick is at the start. |
|
142
|
3
|
|
|
|
|
6
|
$best_nick_score *= 3; |
|
143
|
|
|
|
|
|
|
|
|
144
|
3
|
50
|
|
|
|
9
|
if ($best_nick_score > $best_addressee_score) { |
|
145
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
146
|
0
|
|
|
|
|
0
|
warn( |
|
147
|
|
|
|
|
|
|
" found $test ", |
|
148
|
|
|
|
|
|
|
"($best_nick = $best_nick_score > $best_addressee_score)\n" |
|
149
|
|
|
|
|
|
|
); |
|
150
|
|
|
|
|
|
|
} |
|
151
|
0
|
|
|
|
|
0
|
$best_addressee = $best_nick; |
|
152
|
0
|
|
|
|
|
0
|
$best_addressee_score = $best_nick_score; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# Nickname ends the line. |
|
157
|
|
|
|
|
|
|
|
|
158
|
3
|
50
|
|
|
|
62
|
if ($addressee_text =~ /[\s,]*(\S+?)[.?!'")\]\}\s]*$/) { |
|
159
|
3
|
|
|
|
|
8
|
my $test = $self->_nick_fix($1); |
|
160
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
|
161
|
0
|
|
|
|
|
0
|
warn " post($test)\n"; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
3
|
|
|
|
|
9
|
my ($best_nick, $best_nick_score) = $self->_nick_exists($test); |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Best addressee score is 2x because the nick is at the end. |
|
166
|
3
|
|
|
|
|
4
|
$best_nick_score *= 2; |
|
167
|
|
|
|
|
|
|
|
|
168
|
3
|
50
|
|
|
|
8
|
if ($best_nick_score > $best_addressee_score) { |
|
169
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
170
|
0
|
|
|
|
|
0
|
warn( |
|
171
|
|
|
|
|
|
|
" found $test ", |
|
172
|
|
|
|
|
|
|
"($best_nick = $best_nick_score > $best_addressee_score)\n" |
|
173
|
|
|
|
|
|
|
); |
|
174
|
|
|
|
|
|
|
} |
|
175
|
0
|
|
|
|
|
0
|
$best_addressee = $best_nick; |
|
176
|
0
|
|
|
|
|
0
|
$best_addressee_score = $best_nick_score; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# Nickname occurs somewhere in the middle. |
|
181
|
|
|
|
|
|
|
|
|
182
|
3
|
|
|
|
|
17
|
while ($addressee_text =~ m/\s*,\s*(\S+?)\s*[,!?.]\s*/g) { |
|
183
|
0
|
|
|
|
|
0
|
my $test = $self->_nick_fix($1); |
|
184
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
185
|
0
|
|
|
|
|
0
|
warn " in($test)"; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
0
|
|
|
|
|
0
|
my ($best_nick, $best_nick_score) = $self->_nick_exists($test); |
|
188
|
|
|
|
|
|
|
|
|
189
|
0
|
0
|
|
|
|
0
|
if ($best_nick_score > $best_addressee_score) { |
|
190
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
191
|
0
|
|
|
|
|
0
|
warn( |
|
192
|
|
|
|
|
|
|
" found $test ", |
|
193
|
|
|
|
|
|
|
"($best_nick = $best_nick_score > $best_addressee_score)\n" |
|
194
|
|
|
|
|
|
|
); |
|
195
|
|
|
|
|
|
|
} |
|
196
|
0
|
|
|
|
|
0
|
$best_addressee = $best_nick; |
|
197
|
0
|
|
|
|
|
0
|
$best_addressee_score = $best_nick_score; |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
} |
|
200
|
|
|
|
|
|
|
|
|
201
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
|
202
|
0
|
|
|
|
|
0
|
warn " best addressee score = $best_addressee_score\n"; |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# TODO - If an implied statement goes to nobody, then perhaps it's a |
|
206
|
|
|
|
|
|
|
# continuation of the last statement they said? |
|
207
|
|
|
|
|
|
|
|
|
208
|
3
|
|
|
|
|
3
|
my $seen_them_factor = 0; |
|
209
|
3
|
|
|
|
|
4
|
my $seen_me_factor = 0; |
|
210
|
3
|
|
|
|
|
3
|
my $seen_other_factor = 0; |
|
211
|
|
|
|
|
|
|
|
|
212
|
3
|
|
|
|
|
4
|
my $best_score = 0; |
|
213
|
3
|
|
|
|
|
3
|
my $best_index; |
|
214
|
|
|
|
|
|
|
|
|
215
|
3
|
|
|
|
|
3
|
my $index = @{$self->[CT_CONTEXT]}; |
|
|
3
|
|
|
|
|
5
|
|
|
216
|
3
|
|
|
|
|
9
|
while ($index--) { |
|
217
|
3
|
|
|
|
|
6
|
my $context = $self->[CT_CONTEXT][$index]; |
|
218
|
3
|
|
|
|
|
4
|
my $them = $context->[CTX_NICK]; |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Figure out speaker/them affinity. |
|
221
|
|
|
|
|
|
|
|
|
222
|
3
|
|
|
|
|
10
|
my $affinity = $self->_nick_score($nick, $them); |
|
223
|
|
|
|
|
|
|
|
|
224
|
3
|
|
|
|
|
8
|
my $match_factor = $self->_correlate_statements( |
|
225
|
|
|
|
|
|
|
\@my_words, $context->[CTX_WORDS] |
|
226
|
|
|
|
|
|
|
); |
|
227
|
|
|
|
|
|
|
|
|
228
|
3
|
|
|
|
|
4
|
my $distance_factor = @{$self->[CT_CONTEXT]} - $index; |
|
|
3
|
|
|
|
|
7
|
|
|
229
|
|
|
|
|
|
|
|
|
230
|
3
|
|
|
|
|
4
|
my $addressee_score = 0; |
|
231
|
3
|
50
|
|
|
|
7
|
$addressee_score = $best_addressee_score if $them eq $best_addressee; |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Weigh factors. |
|
234
|
|
|
|
|
|
|
|
|
235
|
3
|
|
|
|
|
3
|
my $weighted_addressee = $addressee_score * 30; |
|
236
|
3
|
|
|
|
|
5
|
my $weighted_affinity = $affinity * 45; # half addressee |
|
237
|
3
|
|
|
|
|
4
|
my $weighted_match = $match_factor * 30; |
|
238
|
3
|
|
|
|
|
4
|
my $weighted_seen_them = $seen_them_factor * -3; |
|
239
|
3
|
|
|
|
|
3
|
my $weighted_seen_me = $seen_me_factor * -3; |
|
240
|
3
|
|
|
|
|
4
|
my $weighted_distance = $distance_factor * -1; |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Calculate a weighted score. |
|
243
|
|
|
|
|
|
|
|
|
244
|
3
|
|
|
|
|
5
|
my $score = ( |
|
245
|
|
|
|
|
|
|
$weighted_affinity + |
|
246
|
|
|
|
|
|
|
$weighted_addressee + |
|
247
|
|
|
|
|
|
|
$weighted_match + |
|
248
|
|
|
|
|
|
|
$weighted_seen_them + |
|
249
|
|
|
|
|
|
|
$weighted_seen_me + |
|
250
|
|
|
|
|
|
|
$weighted_distance |
|
251
|
|
|
|
|
|
|
); |
|
252
|
|
|
|
|
|
|
|
|
253
|
3
|
100
|
|
|
|
8
|
if ($score > $best_score) { |
|
254
|
1
|
|
|
|
|
1
|
$best_score = $score; |
|
255
|
1
|
|
|
|
|
2
|
$best_index = $index; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
|
|
|
|
|
|
|
|
258
|
3
|
|
|
|
|
59
|
my $out = sprintf( |
|
259
|
|
|
|
|
|
|
( " aff(%9.3f) addr(%9.3f) match(%9.3f) " . |
|
260
|
|
|
|
|
|
|
"sthem(%9.3f) sme(%9.3f) dst(%9.3f) " . |
|
261
|
|
|
|
|
|
|
"score(%9.3f) best(%9.3f) " |
|
262
|
|
|
|
|
|
|
), |
|
263
|
|
|
|
|
|
|
$weighted_affinity, $weighted_addressee, $weighted_match, |
|
264
|
|
|
|
|
|
|
$weighted_seen_them, $weighted_seen_me, $weighted_distance, |
|
265
|
|
|
|
|
|
|
$score, $best_score, |
|
266
|
|
|
|
|
|
|
); |
|
267
|
|
|
|
|
|
|
|
|
268
|
3
|
|
|
|
|
11
|
$out .= substr( |
|
269
|
|
|
|
|
|
|
$self->_id_get_text($context->[CTX_ID]), 0, 156 - length($out) - 2 |
|
270
|
|
|
|
|
|
|
); |
|
271
|
|
|
|
|
|
|
|
|
272
|
3
|
50
|
|
|
|
14
|
if ($self->[CT_DEBUG]) { |
|
273
|
0
|
|
|
|
|
0
|
warn $out, "\n"; |
|
274
|
|
|
|
|
|
|
} |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# Serious penalties for passing people by. |
|
277
|
3
|
50
|
|
|
|
7
|
if ($nick eq $them) { |
|
278
|
0
|
0
|
|
|
|
0
|
if ($seen_other_factor) { |
|
279
|
0
|
|
|
|
|
0
|
$seen_me_factor++; |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
else { |
|
283
|
3
|
|
|
|
|
9
|
$seen_other_factor++; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
3
|
|
|
|
|
9
|
return $self->process_match( |
|
288
|
|
|
|
|
|
|
$best_index, $nick, $text, \%my_words, $best_score |
|
289
|
|
|
|
|
|
|
); |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _context_get_id { |
|
293
|
1
|
|
|
1
|
|
2
|
my ($self, $index) = @_; |
|
294
|
1
|
|
|
|
|
3
|
return $self->[CT_CONTEXT][$index][CTX_ID]; |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
sub _context_get_nick { |
|
298
|
1
|
|
|
1
|
|
2
|
my ($self, $index) = @_; |
|
299
|
1
|
|
|
|
|
3
|
return $self->[CT_CONTEXT][$index][CTX_NICK]; |
|
300
|
|
|
|
|
|
|
} |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
### Manage seen nicks. |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
# Add a nickname to the database, or update the confidence between |
|
305
|
|
|
|
|
|
|
# $nick and $addressee of an existing nickname. |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub _nick_add { |
|
308
|
3
|
|
|
3
|
|
5
|
my ($self, $nick, $addressee, $confidence) = @_; |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# Make sure the nick exists. |
|
311
|
3
|
50
|
|
|
|
17
|
unless (exists $self->[CT_NICKS]{$nick}) { |
|
312
|
3
|
|
|
|
|
7
|
$self->[CT_NICKS]{$nick} = { }; |
|
313
|
|
|
|
|
|
|
} |
|
314
|
|
|
|
|
|
|
|
|
315
|
3
|
50
|
66
|
|
|
13
|
if ( |
|
316
|
|
|
|
|
|
|
defined($addressee) and |
|
317
|
|
|
|
|
|
|
!exists($self->[CT_NICKS]{$addressee}) |
|
318
|
|
|
|
|
|
|
) { |
|
319
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$addressee} = { }; |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# Decay everybody. This is a lousy O(N**2) problem. |
|
323
|
3
|
|
|
|
|
3
|
foreach my $me (keys %{$self->[CT_NICKS]}) { |
|
|
3
|
|
|
|
|
10
|
|
|
324
|
6
|
|
|
|
|
6
|
foreach my $them (keys %{$self->[CT_NICKS]{$me}}) { |
|
|
6
|
|
|
|
|
23
|
|
|
325
|
0
|
|
|
|
|
0
|
$self->_nick_decay_link($me, $them); |
|
326
|
|
|
|
|
|
|
} |
|
327
|
|
|
|
|
|
|
} |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Average in the new confidence. |
|
330
|
3
|
100
|
|
|
|
10
|
if (defined $addressee) { |
|
331
|
1
|
50
|
|
|
|
5
|
if (exists $self->[CT_NICKS]{$addressee}{$nick}) { |
|
332
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_NICKS]{$addressee}{$nick} < $confidence) { |
|
333
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$addressee}{$nick} = $confidence; |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
} |
|
336
|
|
|
|
|
|
|
else { |
|
337
|
1
|
|
|
|
|
3
|
$self->[CT_NICKS]{$addressee}{$nick} = $confidence; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
|
|
340
|
1
|
50
|
|
|
|
3
|
if (exists $self->[CT_NICKS]{$nick}{$addressee}) { |
|
341
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_NICKS]{$nick}{$addressee} < $confidence) { |
|
342
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$nick}{$addressee} = $confidence; |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
} |
|
345
|
|
|
|
|
|
|
else { |
|
346
|
1
|
|
|
|
|
4
|
$self->[CT_NICKS]{$nick}{$addressee} = $confidence; |
|
347
|
|
|
|
|
|
|
} |
|
348
|
|
|
|
|
|
|
} |
|
349
|
|
|
|
|
|
|
} |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
sub _nick_decay_link { |
|
352
|
0
|
|
|
0
|
|
0
|
my ($self, $me, $them) = @_; |
|
353
|
0
|
|
|
|
|
0
|
$self->[CT_NICKS]{$me}{$them} /= 4; |
|
354
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_NICKS]{$me}{$them} < 1) { |
|
355
|
0
|
|
|
|
|
0
|
delete $self->[CT_NICKS]{$me}{$them}; |
|
356
|
|
|
|
|
|
|
} |
|
357
|
|
|
|
|
|
|
} |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
sub _nick_del { |
|
360
|
0
|
|
|
0
|
|
0
|
my $nick = shift; |
|
361
|
|
|
|
|
|
|
# Nothing? |
|
362
|
|
|
|
|
|
|
} |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# The score is the average of the speaker/other and other/speaker |
|
365
|
|
|
|
|
|
|
# links. It must be a number from 0 through 1. |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub _nick_score { |
|
368
|
3
|
|
|
3
|
|
4
|
my ($self, $speaker, $other) = @_; |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Speaker to other. |
|
371
|
|
|
|
|
|
|
|
|
372
|
3
|
|
|
|
|
4
|
my $total_speaker_to_other = 0; |
|
373
|
3
|
|
|
|
|
3
|
my $speaker_to_other = 0; |
|
374
|
|
|
|
|
|
|
|
|
375
|
3
|
50
|
33
|
|
|
10
|
if ( |
|
376
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$speaker}) and |
|
377
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$speaker}{$other}) |
|
378
|
|
|
|
|
|
|
) { |
|
379
|
0
|
|
|
|
|
0
|
$speaker_to_other = $self->[CT_NICKS]{$speaker}{$other}; |
|
380
|
0
|
|
|
|
|
0
|
foreach my $audience (keys %{$self->[CT_NICKS]{$speaker}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
381
|
0
|
|
|
|
|
0
|
$total_speaker_to_other += $self->[CT_NICKS]{$speaker}{$audience}; |
|
382
|
|
|
|
|
|
|
} |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
# Other from speaker. |
|
386
|
|
|
|
|
|
|
|
|
387
|
3
|
|
|
|
|
12
|
my $total_other_to_speaker = 0; |
|
388
|
3
|
|
|
|
|
3
|
my $other_to_speaker = 0; |
|
389
|
|
|
|
|
|
|
|
|
390
|
3
|
50
|
33
|
|
|
17
|
if ( |
|
391
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$other}) and |
|
392
|
|
|
|
|
|
|
exists($self->[CT_NICKS]{$other}{$speaker}) |
|
393
|
|
|
|
|
|
|
) { |
|
394
|
0
|
|
|
|
|
0
|
$other_to_speaker = $self->[CT_NICKS]{$other}{$speaker}; |
|
395
|
0
|
|
|
|
|
0
|
foreach my $them (keys %{$self->[CT_NICKS]{$other}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
396
|
0
|
|
|
|
|
0
|
$total_other_to_speaker += $self->[CT_NICKS]{$other}{$them}; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# If the total of the totals is zero, then avoid the division by |
|
401
|
|
|
|
|
|
|
# zero. |
|
402
|
|
|
|
|
|
|
|
|
403
|
3
|
|
|
|
|
5
|
my $total_total = $total_speaker_to_other + $total_other_to_speaker; |
|
404
|
3
|
50
|
|
|
|
9
|
return 0 unless $total_total; |
|
405
|
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
0
|
return( ($speaker_to_other + $other_to_speaker) / $total_total ); |
|
407
|
|
|
|
|
|
|
} |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
sub _nick_fix { |
|
410
|
9
|
|
|
9
|
|
18
|
my ($self, $nick) = @_; |
|
411
|
|
|
|
|
|
|
|
|
412
|
9
|
|
|
|
|
15
|
my $fixed_nick = lc($nick); |
|
413
|
9
|
|
|
|
|
15
|
$fixed_nick =~ s/^q\[(\S+)]$/$1/; # q[nick] remove the quotes |
|
414
|
9
|
|
|
|
|
41
|
$fixed_nick =~ s/[^A-Za-z0-9]*$//; # remove trailing junk |
|
415
|
9
|
|
|
|
|
22
|
$fixed_nick =~ s/^[^A-Za-z0-9]*//; # remove leading junk |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# If it's all junk, return it lowercased. |
|
418
|
9
|
100
|
|
|
|
25
|
$fixed_nick = lc($nick) unless length $fixed_nick; |
|
419
|
|
|
|
|
|
|
|
|
420
|
9
|
|
|
|
|
19
|
return $fixed_nick; |
|
421
|
|
|
|
|
|
|
} |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# Does a nickname exist? Return a new new nickname and a number |
|
424
|
|
|
|
|
|
|
# between 0 and 1 that tells how much the given nickname matches it. |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub _nick_exists { |
|
427
|
6
|
|
|
6
|
|
8
|
my ($self, $nick) = @_; |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
# No match if nothing here. Keeps amatch() from bailing. |
|
430
|
6
|
|
|
|
|
7
|
my @known_nicks = keys %{$self->[CT_NICKS]}; |
|
|
6
|
|
|
|
|
15
|
|
|
431
|
6
|
100
|
|
|
|
18
|
return (undef, 0) unless @known_nicks; |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Often a nickname is a shortened version of some other. Sometimes |
|
434
|
|
|
|
|
|
|
# it's an extended version of it. Other times it's a bastardization |
|
435
|
|
|
|
|
|
|
# of a known nickname. |
|
436
|
|
|
|
|
|
|
# |
|
437
|
|
|
|
|
|
|
# Find all the nicknames that begin with the specified nickname. |
|
438
|
|
|
|
|
|
|
# |
|
439
|
|
|
|
|
|
|
# TODO - If there are none, try string distances? Is there a better |
|
440
|
|
|
|
|
|
|
# way to hash string distances with lengths? |
|
441
|
|
|
|
|
|
|
|
|
442
|
4
|
|
|
|
|
62
|
my @found = grep /^\Q$nick/, @known_nicks; |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Never did find nothin'. |
|
445
|
4
|
50
|
|
|
|
17
|
return (undef, 0) unless @found; |
|
446
|
|
|
|
|
|
|
|
|
447
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
448
|
0
|
|
|
|
|
0
|
warn " $nick matches (@found)\n"; |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# Find the best match out of the found matches. "Best match" is a |
|
452
|
|
|
|
|
|
|
# combination of string distance and ratio of entered nick to match. |
|
453
|
|
|
|
|
|
|
|
|
454
|
0
|
|
|
|
|
0
|
my @proximities = map { 1 - $_ } adistr($nick, @found); |
|
|
0
|
|
|
|
|
0
|
|
|
455
|
|
|
|
|
|
|
|
|
456
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
457
|
0
|
|
|
|
|
0
|
warn " $nick proximities (@proximities)\n"; |
|
458
|
|
|
|
|
|
|
} |
|
459
|
|
|
|
|
|
|
|
|
460
|
0
|
|
|
|
|
0
|
my ($best_nick, $best_score) = ("", 0); |
|
461
|
0
|
|
|
|
|
0
|
while (@found) { |
|
462
|
0
|
0
|
|
|
|
0
|
die unless @found == @proximities; |
|
463
|
0
|
|
|
|
|
0
|
my $match = shift @found; |
|
464
|
0
|
|
|
|
|
0
|
my $prox = shift @proximities; |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# Words closer to the input length score higher. |
|
467
|
|
|
|
|
|
|
# Squared so it diminishes faster. |
|
468
|
0
|
|
|
|
|
0
|
my $length_score = (length($nick) / length($match)) ** 2; |
|
469
|
|
|
|
|
|
|
|
|
470
|
0
|
|
|
|
|
0
|
my $score = $prox * $length_score; |
|
471
|
0
|
0
|
|
|
|
0
|
next if $score < $best_score; |
|
472
|
|
|
|
|
|
|
|
|
473
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
474
|
0
|
|
|
|
|
0
|
warn " $prox * $length_score = $score\n"; |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
|
|
477
|
0
|
|
|
|
|
0
|
$best_nick = $match; |
|
478
|
0
|
|
|
|
|
0
|
$best_score = $score; |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
|
|
481
|
0
|
0
|
|
|
|
0
|
if ($best_nick) { |
|
482
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
483
|
0
|
|
|
|
|
0
|
warn " $nick = $best_nick ($best_score)\n"; |
|
484
|
|
|
|
|
|
|
} |
|
485
|
0
|
|
|
|
|
0
|
return ($best_nick, $best_score); |
|
486
|
|
|
|
|
|
|
} |
|
487
|
|
|
|
|
|
|
|
|
488
|
0
|
0
|
|
|
|
0
|
if ($self->[CT_DEBUG]) { |
|
489
|
0
|
|
|
|
|
0
|
warn " $nick not found\n"; |
|
490
|
|
|
|
|
|
|
} |
|
491
|
0
|
|
|
|
|
0
|
return (undef, 0); |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
### Manage known IDs. |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
sub _id_fully_qualified { |
|
497
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
|
498
|
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
0
|
my @key; |
|
500
|
0
|
|
|
|
|
0
|
while ($id) { |
|
501
|
0
|
|
|
|
|
0
|
unshift @key, $id; |
|
502
|
0
|
|
|
|
|
0
|
$id = $self->[CT_IDS]{$id}[ID_REFERENT]; |
|
503
|
|
|
|
|
|
|
} |
|
504
|
|
|
|
|
|
|
|
|
505
|
0
|
|
|
|
|
0
|
return join "/", @key; |
|
506
|
|
|
|
|
|
|
} |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
sub _id_add { |
|
509
|
3
|
|
|
3
|
|
7
|
my ($self, $id, $referent, $nick, $text) = @_; |
|
510
|
|
|
|
|
|
|
|
|
511
|
3
|
|
|
|
|
12
|
$self->[CT_IDS]{$id} = [ |
|
512
|
|
|
|
|
|
|
$referent, # ID_REFERENT |
|
513
|
|
|
|
|
|
|
[ ], # ID_REFERERS |
|
514
|
|
|
|
|
|
|
$text, # ID_TEXT |
|
515
|
|
|
|
|
|
|
$nick, # ID_NICK |
|
516
|
|
|
|
|
|
|
]; |
|
517
|
|
|
|
|
|
|
|
|
518
|
3
|
100
|
66
|
|
|
13
|
if ($referent and exists $self->[CT_IDS]{$referent}) { |
|
519
|
1
|
|
|
|
|
2
|
push @{$self->[CT_IDS]{$referent}[ID_REFERERS]}, $id; |
|
|
1
|
|
|
|
|
4
|
|
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
} |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
sub _id_del { |
|
524
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
|
525
|
|
|
|
|
|
|
|
|
526
|
0
|
|
|
|
|
0
|
my $old = delete $self->[CT_IDS]{$id}; |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# Fix the statement's kids to stop pointing at the parent. |
|
529
|
|
|
|
|
|
|
|
|
530
|
0
|
|
|
|
|
0
|
foreach my $referer (@{$old->[ID_REFERERS]}) { |
|
|
0
|
|
|
|
|
0
|
|
|
531
|
0
|
|
|
|
|
0
|
$self->[CT_IDS]{$referer}[ID_REFERENT] = undef; |
|
532
|
|
|
|
|
|
|
} |
|
533
|
|
|
|
|
|
|
} |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
sub _id_get_referent { |
|
536
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
|
537
|
0
|
|
|
|
|
0
|
return $self->[CT_IDS]{$id}[ID_REFERENT]; |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
sub _id_get_nick { |
|
541
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# XXX - Happens when someone explicitly addresses a nickname that |
|
544
|
|
|
|
|
|
|
# hasn't appeared yet. |
|
545
|
|
|
|
|
|
|
# |
|
546
|
|
|
|
|
|
|
# Use of uninitialized value in hash element at ChatThread.pm line 352. |
|
547
|
|
|
|
|
|
|
|
|
548
|
0
|
|
|
|
|
0
|
return $self->[CT_IDS]{$id}[ID_NICK]; |
|
549
|
|
|
|
|
|
|
} |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
sub _id_exists { |
|
552
|
0
|
|
|
0
|
|
0
|
my ($self, $id) = @_; |
|
553
|
0
|
|
|
|
|
0
|
return exists $self->[CT_IDS]{$id}; |
|
554
|
|
|
|
|
|
|
} |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
sub _id_get_text { |
|
557
|
3
|
|
|
3
|
|
5
|
my ($self, $id) = @_; |
|
558
|
3
|
|
|
|
|
13
|
return $self->[CT_IDS]{$id}[ID_TEXT]; |
|
559
|
|
|
|
|
|
|
} |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
sub _id_list { |
|
562
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# We do this rather than keys of CT_IDS because it's always |
|
565
|
|
|
|
|
|
|
# guaranteed to be in time order. That is, referents come before |
|
566
|
|
|
|
|
|
|
# stuff that refers to them. |
|
567
|
0
|
|
|
|
|
0
|
return map { $_->[CTX_ID] } @{$self->[CT_CONTEXT]}; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
568
|
|
|
|
|
|
|
} |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
# Fuzzy match text. |
|
571
|
|
|
|
|
|
|
# |
|
572
|
|
|
|
|
|
|
# The current idea is to return a number that represents how much of |
|
573
|
|
|
|
|
|
|
# @$my_words matches %$their_words; Each matching word multiplied by |
|
574
|
|
|
|
|
|
|
# a per-word score that reflects the uniqueness of that word. |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
sub _correlate_statements { |
|
577
|
3
|
|
|
3
|
|
6
|
my ($self, $my_words, $their_words) = @_; |
|
578
|
|
|
|
|
|
|
|
|
579
|
3
|
|
|
|
|
4
|
my $match_factor = 0; |
|
580
|
3
|
|
50
|
|
|
8
|
my $total_words = @$my_words || 1; |
|
581
|
|
|
|
|
|
|
|
|
582
|
3
|
|
|
|
|
4
|
foreach my $my_word (@$my_words) { |
|
583
|
11
|
100
|
|
|
|
26
|
next unless exists $their_words->{$my_word}; |
|
584
|
2
|
|
|
|
|
8
|
$match_factor += $self->_word_get_score($my_word); |
|
585
|
|
|
|
|
|
|
} |
|
586
|
|
|
|
|
|
|
|
|
587
|
3
|
|
|
|
|
8
|
return $match_factor / $total_words; |
|
588
|
|
|
|
|
|
|
} |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
### |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub process_match { |
|
593
|
3
|
|
|
3
|
0
|
8
|
my ($self, $index, $nick, $text, $my_words, $confidence) = @_; |
|
594
|
|
|
|
|
|
|
|
|
595
|
3
|
|
|
|
|
4
|
my $id = $next_id++; |
|
596
|
|
|
|
|
|
|
|
|
597
|
3
|
|
|
|
|
4
|
my ($referent, $addressee, $print_addressee); |
|
598
|
|
|
|
|
|
|
|
|
599
|
3
|
100
|
|
|
|
8
|
if (defined $index) { |
|
600
|
1
|
|
|
|
|
12
|
$referent = $self->_context_get_id($index); |
|
601
|
1
|
|
|
|
|
6
|
$addressee = $self->_context_get_nick($index); |
|
602
|
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
# If the person refers to themselves, refer them instead to |
|
604
|
|
|
|
|
|
|
# whoever they were talking to previously. |
|
605
|
1
|
50
|
|
|
|
12
|
if ($addressee eq $nick) { |
|
606
|
0
|
0
|
|
|
|
0
|
if ($self->_id_exists($referent)) { |
|
607
|
0
|
|
|
|
|
0
|
$referent = $self->_id_get_referent($referent); |
|
608
|
0
|
|
|
|
|
0
|
$addressee = $self->_id_get_nick($referent); |
|
609
|
|
|
|
|
|
|
} |
|
610
|
|
|
|
|
|
|
else { |
|
611
|
0
|
|
|
|
|
0
|
$referent = $addressee = undef; |
|
612
|
|
|
|
|
|
|
} |
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
} |
|
615
|
|
|
|
|
|
|
|
|
616
|
3
|
100
|
|
|
|
8
|
if (defined $addressee) { |
|
617
|
1
|
|
|
|
|
2
|
$print_addressee = $addressee; |
|
618
|
|
|
|
|
|
|
} |
|
619
|
|
|
|
|
|
|
else { |
|
620
|
2
|
|
|
|
|
3
|
$print_addressee = "(nobody)"; |
|
621
|
|
|
|
|
|
|
} |
|
622
|
|
|
|
|
|
|
|
|
623
|
3
|
50
|
|
|
|
9
|
if ($self->[CT_DEBUG]) { |
|
624
|
0
|
|
|
|
|
0
|
warn "<<<< ($id) $nick -> $print_addressee : $text\n"; |
|
625
|
|
|
|
|
|
|
} |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# XXX - _context_add() ? |
|
628
|
3
|
|
|
|
|
4
|
push @{$self->[CT_CONTEXT]}, [ |
|
|
3
|
|
|
|
|
9
|
|
|
629
|
|
|
|
|
|
|
$id, # CTX_ID |
|
630
|
|
|
|
|
|
|
$nick, # CTX_NICK |
|
631
|
|
|
|
|
|
|
$addressee, # CTX_ADDRESSEE |
|
632
|
|
|
|
|
|
|
$my_words, # CTX_WORDS |
|
633
|
|
|
|
|
|
|
$referent, # CTX_REFERENT |
|
634
|
|
|
|
|
|
|
]; |
|
635
|
|
|
|
|
|
|
|
|
636
|
3
|
|
|
|
|
9
|
my $debug_text = "<$nick> $text"; |
|
637
|
|
|
|
|
|
|
|
|
638
|
3
|
|
|
|
|
10
|
$self->_nick_add($nick, $addressee, $confidence); |
|
639
|
3
|
|
|
|
|
8
|
$self->_id_add($id, $referent, $nick, $debug_text); |
|
640
|
3
|
|
|
|
|
40
|
$self->_words_add($my_words); |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# XXX - _context_prune() ? |
|
643
|
|
|
|
|
|
|
# XXX - Deleting the words here is a cheezy way to decay word |
|
644
|
|
|
|
|
|
|
# importance over time. |
|
645
|
3
|
|
|
|
|
7
|
while (@{$self->[CT_CONTEXT]} > $self->[CT_CONTEXT_MAX]) { |
|
|
3
|
|
|
|
|
11
|
|
|
646
|
0
|
|
|
|
|
0
|
my $old = shift @{$self->[CT_CONTEXT]}; |
|
|
0
|
|
|
|
|
0
|
|
|
647
|
0
|
|
|
|
|
0
|
$self->_nick_del($old->[CTX_NICK]); |
|
648
|
0
|
|
|
|
|
0
|
$self->_words_del($old->[CTX_WORDS]); |
|
649
|
0
|
|
|
|
|
0
|
$self->_id_del($old->[CTX_ID]); |
|
650
|
|
|
|
|
|
|
} |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
return ( |
|
653
|
3
|
|
|
|
|
16
|
$id, # new ID |
|
654
|
|
|
|
|
|
|
$referent, # referent ID |
|
655
|
|
|
|
|
|
|
$debug_text, # display text |
|
656
|
|
|
|
|
|
|
); |
|
657
|
|
|
|
|
|
|
} |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
### Manage words, for frequency and feature extraction. |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
sub _word_stem { |
|
662
|
15
|
|
|
15
|
|
27
|
my ($self, $word) = @_; |
|
663
|
15
|
|
|
|
|
34
|
my $stem = stem("en", $word); |
|
664
|
15
|
|
|
|
|
770
|
return $stem; |
|
665
|
|
|
|
|
|
|
} |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
sub _words_add { |
|
668
|
3
|
|
|
3
|
|
6
|
my ($self, $words) = @_; |
|
669
|
|
|
|
|
|
|
|
|
670
|
3
|
|
|
|
|
8
|
foreach my $word (keys %$words) { |
|
671
|
15
|
|
|
|
|
16
|
$self->[CT_WORDS_TOTAL] += $words->{$word}; |
|
672
|
15
|
|
|
|
|
32
|
$self->[CT_WORDS]{$word} += $words->{$word}; |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
} |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
sub _words_del { |
|
677
|
0
|
|
|
0
|
|
0
|
my ($self, $words) = @_; |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
# XXX - Experimenting with building a huge corpus. |
|
680
|
0
|
|
|
|
|
0
|
return; |
|
681
|
|
|
|
|
|
|
|
|
682
|
0
|
|
|
|
|
0
|
foreach my $word (keys %$words) { |
|
683
|
0
|
|
|
|
|
0
|
$self->[CT_WORDS_TOTAL] -= $words->{$word}; |
|
684
|
0
|
|
|
|
|
0
|
$self->[CT_WORDS]{$word} -= $words->{$word}; |
|
685
|
0
|
0
|
|
|
|
0
|
next if $self->[CT_WORDS]{$word} > 0; |
|
686
|
0
|
|
|
|
|
0
|
delete $self->[CT_WORDS]{$word}; |
|
687
|
|
|
|
|
|
|
} |
|
688
|
|
|
|
|
|
|
} |
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
# The word's score increases as its frequency decreases. |
|
691
|
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
sub _word_get_score { |
|
693
|
2
|
|
|
2
|
|
3
|
my ($self, $stem) = @_; |
|
694
|
|
|
|
|
|
|
|
|
695
|
2
|
|
50
|
|
|
6
|
my $word_count = $self->[CT_WORDS]{$stem} || 0; |
|
696
|
2
|
|
50
|
|
|
6
|
my $corpus_count = $self->[CT_WORDS_TOTAL] || 1; |
|
697
|
|
|
|
|
|
|
|
|
698
|
2
|
|
|
|
|
4
|
my $word_score = ($corpus_count - $word_count) / $corpus_count; |
|
699
|
2
|
50
|
|
|
|
6
|
if (exists $self->[CT_STOPWORDS]{$stem}) { |
|
700
|
0
|
|
|
|
|
0
|
$word_score /= 2; |
|
701
|
|
|
|
|
|
|
} |
|
702
|
|
|
|
|
|
|
|
|
703
|
2
|
|
|
|
|
6
|
return $word_score; |
|
704
|
|
|
|
|
|
|
} |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
1; |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
__END__ |