line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::ContextGraph; |
2
|
|
|
|
|
|
|
|
3
|
13
|
|
|
13
|
|
325052
|
use strict; |
|
13
|
|
|
|
|
32
|
|
|
13
|
|
|
|
|
445
|
|
4
|
13
|
|
|
13
|
|
71
|
use warnings; |
|
13
|
|
|
|
|
40
|
|
|
13
|
|
|
|
|
383
|
|
5
|
13
|
|
|
13
|
|
69
|
use Carp; |
|
13
|
|
|
|
|
27
|
|
|
13
|
|
|
|
|
1223
|
|
6
|
13
|
|
|
13
|
|
94
|
use base "Storable"; |
|
13
|
|
|
|
|
22
|
|
|
13
|
|
|
|
|
16539
|
|
7
|
13
|
|
|
13
|
|
62118
|
use File::Find; |
|
13
|
|
|
|
|
29
|
|
|
13
|
|
|
|
|
1383
|
|
8
|
13
|
|
|
13
|
|
27029
|
use IO::Socket; |
|
13
|
|
|
|
|
523789
|
|
|
13
|
|
|
|
|
73
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.15'; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my $count = 0; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Search::ContextGraph - spreading activation search engine |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Search::ContextGraph; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $cg = Search::ContextGraph->new(); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# first you add some documents, perhaps all at once... |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my %docs = ( |
29
|
|
|
|
|
|
|
'first' => [ 'elephant', 'snake' ], |
30
|
|
|
|
|
|
|
'second' => [ 'camel', 'pony' ], |
31
|
|
|
|
|
|
|
'third' => { 'snake' => 2, 'constrictor' => 1 }, |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
$cg->bulk_add( %docs ); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# or in a loop... |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
foreach my $title ( keys %docs ) { |
39
|
|
|
|
|
|
|
$cg->add( $title, $docs{$title} ); |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# or from a file... |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $cg = Search::ContextGraph->load_from_dir( "./myfiles" ); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# you can store a graph object for later use |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
$cg->store( "stored.cng" ); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# and retrieve it later... |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my $cg = ContextGraph->retrieve( "stored.cng" ); |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# SEARCHING |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# the easiest way |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
my @ranked_docs = $cg->simple_search( 'peanuts' ); |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# get back both related terms and docs for more power |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
my ( $docs, $words ) = $cg->search('snake'); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# you can use a document as your query |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my ( $docs, $words ) = $cg->find_similar('First Document'); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Or you can query on a combination of things |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my ( $docs, $words ) = |
75
|
|
|
|
|
|
|
$cg->mixed_search( { docs => [ 'First Document' ], |
76
|
|
|
|
|
|
|
terms => [ 'snake', 'pony' ] |
77
|
|
|
|
|
|
|
); |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Print out result set of returned documents |
81
|
|
|
|
|
|
|
foreach my $k ( sort { $docs->{$b} <=> $docs->{$a} } |
82
|
|
|
|
|
|
|
keys %{ $docs } ) { |
83
|
|
|
|
|
|
|
print "Document $k had relevance ", $docs->{$k}, "\n"; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Reload it |
89
|
|
|
|
|
|
|
my $new = Search::ContextGraph->retrieve( "filename" ); |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head1 DESCRIPTION |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Spreading activation is a neat technique for building search engines that |
96
|
|
|
|
|
|
|
return accurate results for a query even when there is no exact keyword match. |
97
|
|
|
|
|
|
|
The engine works by building a data structure called a B, which |
98
|
|
|
|
|
|
|
is a giant network of document and term nodes. All document nodes are connected |
99
|
|
|
|
|
|
|
to the terms that occur in that document; similarly, every term node is connected |
100
|
|
|
|
|
|
|
to all of the document nodes that term occurs in. We search the graph by |
101
|
|
|
|
|
|
|
starting at a query node and distributing a set amount of energy to its neighbor |
102
|
|
|
|
|
|
|
nodes. Then we recurse, diminishing the energy at each stage, until this |
103
|
|
|
|
|
|
|
spreading energy falls below a given threshold. Each node keeps track of |
104
|
|
|
|
|
|
|
accumulated energy, and this serves as our measure of relevance. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
This means that documents that have many words in common will appear similar to the |
107
|
|
|
|
|
|
|
search engine. Likewise, words that occur together in many documents will be |
108
|
|
|
|
|
|
|
perceived as semantically related. Especially with larger, coherent document |
109
|
|
|
|
|
|
|
collections, the search engine can be quite effective at recognizing synonyms |
110
|
|
|
|
|
|
|
and finding useful relationships between documents. You can read a full |
111
|
|
|
|
|
|
|
description of the algorithm at L. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
The search engine gives expanded recall (relevant results even when there is no |
114
|
|
|
|
|
|
|
keyword match) without incurring the kind of computational and patent issues |
115
|
|
|
|
|
|
|
posed by latent semantic indexing (LSI). The technique used here was originally |
116
|
|
|
|
|
|
|
described in a 1981 dissertation by Scott Preece. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=over |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item new %PARAMS |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Object constructor. Possible parameters: |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=over |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=item auto_reweight |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Rebalance the graph every time a change occurs. Default is true. |
131
|
|
|
|
|
|
|
Disable and do by hand using L for better performance in |
132
|
|
|
|
|
|
|
graphs with frequent updates/additions/deletions. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item debug LEVEL |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Set this to 1 or 2 to turn on verbose debugging output |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=item max_depth |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Set the maximum distance to spread energy out from the start |
142
|
|
|
|
|
|
|
node. Default is effectively unlimited. You can tweak it using L. |
143
|
|
|
|
|
|
|
Comes in handy if you find searches are too slow. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item xs |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
When true, tells the module to use compiled C internals. This reduces |
148
|
|
|
|
|
|
|
memory requirements by about 60%, but actually runs a little slower than the |
149
|
|
|
|
|
|
|
pure Perl version. Don't bother to turn it on unless you have a huge graph. |
150
|
|
|
|
|
|
|
Default is pure Perl. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=over |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item * using the compiled version makes it impossible to store the graph to disk. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item * xs is B in version 0.09. But it will return in triumph! |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=back |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item START_ENERGY |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Initial energy to assign to a query node. Default is 100. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item ACTIVATE_THRESHOLD |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Minimal energy needed to propagate search along the graph. Default is 1. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=item COLLECT_THRESHOLD |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Minimal energy needed for a node to enter the result set. Default is 1. |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=back |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=cut |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub new { |
178
|
15
|
|
|
15
|
1
|
212
|
my ( $class, %params) = @_; |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# backwards compatible... |
181
|
15
|
|
|
|
|
75
|
*add_document = \&add; |
182
|
15
|
|
|
|
|
44
|
*add_documents = \&bulk_add; |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# plucene friendly |
185
|
15
|
|
|
|
|
42
|
*optimize = \&reweight_graph; |
186
|
15
|
|
|
|
|
66
|
*is_indexed = \&has_doc; |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# fail on all unknown paramters (helps fight typos) |
189
|
15
|
|
|
|
|
82
|
my @allowed = qw/debug auto_reweight use_global_weights max_depth START_ENERGY ACTIVATE_THRESHOLD COLLECT_THRESHOLD use_file xs/; |
190
|
15
|
|
|
|
|
28
|
my %check; |
191
|
15
|
|
|
|
|
161
|
$check{$_}++ foreach @allowed; |
192
|
|
|
|
|
|
|
|
193
|
15
|
|
|
|
|
34
|
my @forbidden; |
194
|
15
|
|
|
|
|
55
|
foreach my $k ( keys %params ) { |
195
|
12
|
50
|
|
|
|
52
|
push @forbidden, $k unless exists $check{$k}; |
196
|
|
|
|
|
|
|
} |
197
|
15
|
50
|
|
|
|
57
|
if ( @forbidden ) { |
198
|
0
|
|
|
|
|
0
|
croak "The following unrecognized parameters were detected: ", |
199
|
|
|
|
|
|
|
join ", ", @forbidden; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
|
203
|
15
|
|
|
|
|
167
|
my $obj = bless |
204
|
|
|
|
|
|
|
{ debug => 0, |
205
|
|
|
|
|
|
|
auto_reweight => 1, |
206
|
|
|
|
|
|
|
use_global_weights => 1, |
207
|
|
|
|
|
|
|
max_depth => 100000000, |
208
|
|
|
|
|
|
|
START_ENERGY => 100, |
209
|
|
|
|
|
|
|
ACTIVATE_THRESHOLD => 1, |
210
|
|
|
|
|
|
|
COLLECT_THRESHOLD => .2, |
211
|
|
|
|
|
|
|
%params, |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
depth => 0, |
214
|
|
|
|
|
|
|
neighbors => {}, |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
}, |
217
|
|
|
|
|
|
|
$class; |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
15
|
100
|
|
|
|
180
|
if ( $obj->{use_file} ) { |
221
|
1
|
|
|
|
|
3
|
my %neighbors; |
222
|
13
|
|
|
13
|
|
24279
|
use MLDBM qw/DB_File Storable/; |
|
13
|
|
|
|
|
51331
|
|
|
13
|
|
|
|
|
100
|
|
223
|
13
|
|
|
13
|
|
518
|
use Fcntl; |
|
13
|
|
|
|
|
31
|
|
|
13
|
|
|
|
|
138808
|
|
224
|
1
|
|
|
|
|
165
|
warn "Using MLDBM: $obj->{use_file}"; |
225
|
1
|
50
|
|
|
|
12
|
$obj->{neighbors} = tie %neighbors, 'MLDBM', $obj->{use_file} or die $!; |
226
|
|
|
|
|
|
|
#$obj->{neighbors} = \%neighbors; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
14
|
|
|
|
|
79
|
return $obj; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=item load_from_dir DIR [, \&PARSE ] |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Load documents from a directory. Takes two arguments, a directory path |
239
|
|
|
|
|
|
|
and an optional parsing subroutine. If the parsing subroutine is passed |
240
|
|
|
|
|
|
|
an argument, it will use it to extract term tokens from the file. |
241
|
|
|
|
|
|
|
By default, the file is split on whitespace and stripped of numbers and |
242
|
|
|
|
|
|
|
punctuation. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=cut |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
{ |
247
|
|
|
|
|
|
|
my $parse_sub; |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub load_from_dir { |
250
|
0
|
|
|
0
|
1
|
0
|
my ( $class, $dir, $code ) = @_; |
251
|
|
|
|
|
|
|
|
252
|
0
|
0
|
|
|
|
0
|
croak "$dir is not a directory" unless -d $dir; |
253
|
|
|
|
|
|
|
|
254
|
0
|
|
|
|
|
0
|
require File::Find; |
255
|
0
|
0
|
0
|
|
|
0
|
unless ( defined $code |
|
|
|
0
|
|
|
|
|
256
|
|
|
|
|
|
|
and ref $code |
257
|
|
|
|
|
|
|
and ref $code eq 'CODE' ) { |
258
|
|
|
|
|
|
|
$code = sub { |
259
|
0
|
|
|
0
|
|
0
|
my $text = shift; |
260
|
0
|
|
|
|
|
0
|
$text =~ s/[^\w]/ /gs; |
261
|
0
|
|
|
|
|
0
|
my @toks = split /\s+/m, $text; |
262
|
0
|
|
|
|
|
0
|
return grep { length($_) > 1 } @toks; |
|
0
|
|
|
|
|
0
|
|
263
|
0
|
|
|
|
|
0
|
}; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
0
|
|
|
|
|
0
|
$parse_sub = $code; |
267
|
0
|
|
|
|
|
0
|
my %docs; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Recursively open every file and provide the contents |
270
|
|
|
|
|
|
|
# to whatever parsing subroutine we're using |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
my $reader = |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
sub { |
275
|
0
|
|
|
0
|
|
0
|
my ( $parse ) = @_; |
276
|
0
|
0
|
|
|
|
0
|
return if /^\./; |
277
|
0
|
0
|
|
|
|
0
|
return unless -f $_; |
278
|
0
|
0
|
|
|
|
0
|
open my $fh, $_ or |
279
|
|
|
|
|
|
|
croak "Could not open file $File::Find::name: $!"; |
280
|
0
|
|
|
|
|
0
|
local $/; |
281
|
0
|
|
|
|
|
0
|
my $contents = <$fh>; |
282
|
0
|
0
|
|
|
|
0
|
close $fh or croak "failed to close filehandle"; |
283
|
0
|
|
|
|
|
0
|
my @words = $parse_sub->($contents); |
284
|
0
|
|
|
|
|
0
|
$docs{ $File::Find::name } = \@words; |
285
|
0
|
|
|
|
|
0
|
}; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
0
|
find( $reader , $dir ); |
289
|
0
|
|
|
|
|
0
|
my $self = __PACKAGE__->new(); |
290
|
0
|
|
|
|
|
0
|
$self->bulk_add( %docs ); |
291
|
0
|
|
|
|
|
0
|
return $self; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=item load_from_tdm FILENAME |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
Opens and loads a term-document matrix (TDM) file to initialize the graph. |
300
|
|
|
|
|
|
|
The TDM encodes information about term-to-document links. |
301
|
|
|
|
|
|
|
This is a legacy method mainly for the convenience of the module author. |
302
|
|
|
|
|
|
|
For notes on the proper file format, see the README file. |
303
|
|
|
|
|
|
|
=cut |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub load_from_tdm { |
306
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $file ) = @_; |
307
|
0
|
0
|
|
|
|
0
|
croak "TDM file $file does not exist" unless -f $file; |
308
|
0
|
0
|
|
|
|
0
|
return if $self->{'loaded'}; |
309
|
0
|
|
|
|
|
0
|
$self->_read_tdm( $file ); |
310
|
0
|
|
|
|
|
0
|
$self->{'loaded'} = 1; |
311
|
0
|
|
|
|
|
0
|
$self->reweight_graph(); |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=item rename OLD, NEW |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Renames a document. Will return undef if the new name is already in use. |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=cut |
320
|
|
|
|
|
|
|
sub rename { |
321
|
|
|
|
|
|
|
|
322
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $old, $new ) = @_; |
323
|
0
|
0
|
0
|
|
|
0
|
croak "rename method needs two arguments" unless |
324
|
|
|
|
|
|
|
defined $old and defined $new; |
325
|
0
|
0
|
|
|
|
0
|
croak "document $old not found" unless |
326
|
|
|
|
|
|
|
exists $self->{neighbors}{ _nodeify('D', $old ) }; |
327
|
|
|
|
|
|
|
|
328
|
0
|
|
|
|
|
0
|
my $bad = _nodeify( 'D', $old ); |
329
|
0
|
|
|
|
|
0
|
my $good = _nodeify( 'D', $new ); |
330
|
|
|
|
|
|
|
|
331
|
0
|
0
|
|
|
|
0
|
return if exists $self->{neighbors}{$good}; |
332
|
|
|
|
|
|
|
|
333
|
0
|
|
|
|
|
0
|
my $s = $self->{neighbors}; |
334
|
0
|
|
|
|
|
0
|
foreach my $n ( keys %{ $s->{$bad} } ) { |
|
0
|
|
|
|
|
0
|
|
335
|
0
|
|
|
|
|
0
|
$s->{$good}{$n} = |
336
|
|
|
|
|
|
|
$s->{$n}{$good} = |
337
|
|
|
|
|
|
|
$s->{$bad}{$n}; |
338
|
0
|
|
|
|
|
0
|
delete $s->{$bad}{$n}; |
339
|
0
|
|
|
|
|
0
|
delete $s->{$n}{$bad}; |
340
|
|
|
|
|
|
|
} |
341
|
0
|
|
|
|
|
0
|
delete $s->{$bad}; |
342
|
0
|
|
|
|
|
0
|
return 1; |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item retrieve FILENAME |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Loads a previously stored graph from disk, using Storable. |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=cut |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
sub retrieve { |
355
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $file ) = @_; |
356
|
0
|
0
|
|
|
|
0
|
croak "Must provide a filename to retrieve graph" |
357
|
|
|
|
|
|
|
unless $file; |
358
|
0
|
0
|
|
|
|
0
|
croak "'$file' is not a file" unless |
359
|
|
|
|
|
|
|
-f $file; |
360
|
|
|
|
|
|
|
|
361
|
0
|
|
|
|
|
0
|
Storable::retrieve( $file ); |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=back |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=head1 ACCESSORS |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=over |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=item [get|set]_activate_threshold |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Accessor for node activation threshold value. This value determines how far |
374
|
|
|
|
|
|
|
energy can spread in the graph. Lower it to increase the number of results. |
375
|
|
|
|
|
|
|
Default is 1. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
378
|
|
|
|
|
|
|
|
379
|
2
|
|
|
2
|
0
|
1147
|
sub get_activate_threshold { $_[0]->{'ACTIVATE_THRESHOLD'} } |
380
|
|
|
|
|
|
|
sub set_activate_threshold { |
381
|
3
|
|
|
3
|
0
|
818
|
my ( $self, $threshold ) = @_; |
382
|
3
|
100
|
|
|
|
123
|
croak "Can't set activate threshold to zero" |
383
|
|
|
|
|
|
|
unless $threshold; |
384
|
2
|
100
|
|
|
|
110
|
croak "Can't set activate threshold to negative value" |
385
|
|
|
|
|
|
|
unless $threshold > 0; |
386
|
1
|
|
|
|
|
6
|
$self->{'ACTIVATE_THRESHOLD'} = $_[1]; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=item [get|set]_auto_reweight |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
Accessor for auto reweight flag. If true, edge weights will be recalculated |
393
|
|
|
|
|
|
|
every time a document is added, updated or removed. This can significantly slow |
394
|
|
|
|
|
|
|
down large graphs. On by default. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=cut |
397
|
|
|
|
|
|
|
|
398
|
0
|
|
|
0
|
0
|
0
|
sub get_auto_reweight{ $_[0]->{auto_reweight} } |
399
|
0
|
|
|
0
|
0
|
0
|
sub set_auto_reweight{ $_[0]->{auto_reweight} = $_[0]->[1]; } |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item [get|set]_collect_threshold |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Accessor for collection threshold value. This determines how much energy a |
405
|
|
|
|
|
|
|
node must have to make it into the result set. Lower it to increase the |
406
|
|
|
|
|
|
|
number of results. Default is 1. |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=cut |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
sub get_collect_threshold { |
411
|
2
|
50
|
|
2
|
0
|
21
|
return ( $_[0]->{'xs'} ? |
412
|
|
|
|
|
|
|
$_[0]->{Graph}->collectionThreshold : |
413
|
|
|
|
|
|
|
$_[0]->{'COLLECT_THRESHOLD'}) |
414
|
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub set_collect_threshold { |
417
|
2
|
|
|
2
|
0
|
5
|
my ( $self, $newval ) = @_; |
418
|
|
|
|
|
|
|
|
419
|
2
|
|
100
|
|
|
10
|
$newval ||=0; |
420
|
|
|
|
|
|
|
|
421
|
2
|
50
|
|
|
|
7
|
$self->{Graph}->collectionThreshold( $newval ) |
422
|
|
|
|
|
|
|
if $self->{'xs'}; |
423
|
|
|
|
|
|
|
|
424
|
2
|
|
100
|
|
|
10
|
$self->{'COLLECT_THRESHOLD'} = $newval || 0; |
425
|
2
|
|
|
|
|
6
|
return 1; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item [get|set]_debug_mode LEVEL |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
Turns debugging on or off. 1 is verbose, 2 is very verbose, 0 is off. |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=cut |
433
|
|
|
|
|
|
|
|
434
|
0
|
|
|
0
|
0
|
0
|
sub get_debug_mode { $_[0]->{debug} } |
435
|
|
|
|
|
|
|
sub set_debug_mode { |
436
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $mode ) = @_; |
437
|
0
|
|
|
|
|
0
|
$self->{'debug'} = $mode; |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=item [get|set]_initial_energy |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Accessor for initial energy value at the query node. This controls how |
445
|
|
|
|
|
|
|
much energy gets poured into the graph at the start of the search. |
446
|
|
|
|
|
|
|
Increase this value to get more results from your queries. |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=cut |
449
|
|
|
|
|
|
|
|
450
|
2
|
|
|
2
|
0
|
15
|
sub get_initial_energy { $_[0]->{'START_ENERGY'} } |
451
|
|
|
|
|
|
|
sub set_initial_energy { |
452
|
2
|
|
|
2
|
0
|
6
|
my ( $self, $start_energy ) = @_; |
453
|
2
|
50
|
|
|
|
9
|
croak "Can't set initial energy to zero" |
454
|
|
|
|
|
|
|
unless $start_energy; |
455
|
2
|
100
|
|
|
|
115
|
croak "Can't set initial energy to negative value" |
456
|
|
|
|
|
|
|
unless $start_energy > 0; |
457
|
1
|
|
|
|
|
30
|
$self->{'START_ENERGY'} = $start_energy ; |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=item [get|set]_max_depth LEVEL |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
You can tell the graph to cut off searches after a certain distance from |
463
|
|
|
|
|
|
|
the query node. This can speed up searches on very large graphs, and has |
464
|
|
|
|
|
|
|
little adverse effect, especially if you are interested in just the first |
465
|
|
|
|
|
|
|
few search results. Set this value to undef to restore the default (10^8). |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=cut |
468
|
|
|
|
|
|
|
|
469
|
3
|
|
|
3
|
0
|
450
|
sub get_max_depth { $_[0]->{max_depth} } |
470
|
3
|
100
|
|
3
|
0
|
215
|
sub set_max_depth { croak "Tried to set maximum depth to an undefined value" |
471
|
|
|
|
|
|
|
unless defined $_[1]; |
472
|
2
|
|
100
|
|
|
19
|
$_[0]->{max_depth} = $_[1] || 100000000 |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=back |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=head1 METHODS |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=over |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=item add DOC, WORDS |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
Add a document to the search engine. Takes as arguments a unique doc |
487
|
|
|
|
|
|
|
identifier and a reference to an array or hash of words in the |
488
|
|
|
|
|
|
|
document. |
489
|
|
|
|
|
|
|
For example: |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
TITLE => { WORD1 => COUNT1, WORD2 => COUNT2 ... } |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
or |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
TITLE => [ WORD1, WORD2, WORD3 ] |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
Use L if you want to pass in a bunch of docs all at once. |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=cut |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub add { |
503
|
|
|
|
|
|
|
|
504
|
420
|
|
|
420
|
1
|
28619
|
my ( $self, $title, $words ) = @_; |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
|
507
|
420
|
50
|
|
|
|
1228
|
croak "Please provide a word list" unless defined $words; |
508
|
420
|
50
|
66
|
|
|
3263
|
croak "Word list is not a reference to an array or hash" |
|
|
|
66
|
|
|
|
|
509
|
|
|
|
|
|
|
unless ref $words and ref $words eq "HASH" or ref $words eq "ARRAY"; |
510
|
|
|
|
|
|
|
|
511
|
420
|
50
|
|
|
|
934
|
croak "Please provide a document identifier" unless defined $title; |
512
|
|
|
|
|
|
|
|
513
|
420
|
|
|
|
|
853
|
my $dnode = _nodeify( 'D', $title ); |
514
|
420
|
50
|
|
|
|
1461
|
croak "Tried to add document with duplicate identifier: '$title'\n" |
515
|
|
|
|
|
|
|
if exists $self->{neighbors}{$dnode}; |
516
|
|
|
|
|
|
|
|
517
|
420
|
|
|
|
|
668
|
my @list; |
518
|
420
|
100
|
|
|
|
1104
|
if ( ref $words eq 'ARRAY' ) { |
519
|
419
|
|
|
|
|
489
|
@list = @{$words}; |
|
419
|
|
|
|
|
3733
|
|
520
|
|
|
|
|
|
|
} else { |
521
|
1
|
|
|
|
|
3
|
@list = keys %{$words}; |
|
1
|
|
|
|
|
46
|
|
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
420
|
50
|
|
|
|
1048
|
croak "Tried to add a document with no content" unless scalar @list; |
525
|
|
|
|
|
|
|
|
526
|
420
|
|
|
|
|
493
|
my @edges; |
527
|
420
|
|
|
|
|
905
|
foreach my $term ( @list ) { |
528
|
13489
|
|
|
|
|
27818
|
my $tnode = _nodeify( 'T', lc( $term ) ); |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# Local weight for the document |
531
|
13489
|
100
|
|
|
|
27162
|
my $lcount = ( ref $words eq 'HASH' ? $words->{$term} : 1 ); |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# Update number of docs this word occurs in |
534
|
13489
|
|
|
|
|
35123
|
my $gcount = ++$self->{term_count}{lc( $term )}; |
535
|
|
|
|
|
|
|
|
536
|
13489
|
|
|
|
|
13877
|
my $final_weight = 1; |
537
|
13489
|
|
|
|
|
33581
|
push @edges, [ $dnode, $tnode, $final_weight, $lcount ]; |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
} |
540
|
420
|
|
|
|
|
787
|
$self->{reweight_flag} = 1; |
541
|
420
|
|
|
|
|
908
|
__normalize( \@edges ); |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=cut |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
DEVELOPMENT |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
if ( $self->{supersize} ) { |
549
|
|
|
|
|
|
|
my $n = $self->{neighbors}; |
550
|
|
|
|
|
|
|
foreach my $e ( @edges ) { |
551
|
|
|
|
|
|
|
#warn "adding edge $e->[0], $e->[1]\n"; |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
$n->{$e->[0]} = {} unless exists $n->{$e->[0]}; |
554
|
|
|
|
|
|
|
$n->{$e->[1]} = {} unless exists $n->{$e->[1]}; |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
my $tmp = $n->{$e->[0]}; |
557
|
|
|
|
|
|
|
$tmp->{$e->[1]} = join ',', $e->[2], $e->[3]; |
558
|
|
|
|
|
|
|
$tmp = $n->{$e->[1]}; |
559
|
|
|
|
|
|
|
$tmp->{$e->[0]} = join ',', $e->[2], $e->[3]; |
560
|
|
|
|
|
|
|
} |
561
|
|
|
|
|
|
|
=cut |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# PURE PERL VERSION |
565
|
|
|
|
|
|
|
#} else { |
566
|
420
|
|
|
|
|
698
|
foreach my $e ( @edges ) { |
567
|
13489
|
|
|
|
|
72348
|
$self->{neighbors}{$e->[0]}{$e->[1]} = join ',', $e->[2], $e->[3]; |
568
|
13489
|
|
|
|
|
75829
|
$self->{neighbors}{$e->[1]}{$e->[0]} = join ',', $e->[2], $e->[3]; |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
#} |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
#print "Reweighting graph\n"; |
574
|
420
|
100
|
|
|
|
2151
|
$self->reweight_graph() if $self->{auto_reweight}; |
575
|
420
|
|
|
|
|
15729
|
return 1; |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=item add_file PATH [, name => NAME, parse => CODE] |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
Adds a document from a file. By default, uses the PATH provided as the document |
583
|
|
|
|
|
|
|
identifier, and parses the file by splitting on whitespace. If a fancier title, |
584
|
|
|
|
|
|
|
or more elegant parsing behavior is desired, pass in named arguments as indicated. |
585
|
|
|
|
|
|
|
NAME can be any string, CODE should be a reference to a subroutine that takes one |
586
|
|
|
|
|
|
|
argument (the contents of the file) and returns an array of tokens, or a hash in the |
587
|
|
|
|
|
|
|
form TOKEN => COUNT, or a reference to the same. |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=cut |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
sub add_file { |
592
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $path, %params ) = @_; |
593
|
|
|
|
|
|
|
|
594
|
0
|
0
|
0
|
|
|
0
|
croak "Invalid file '$path' provided to add_file method." |
595
|
|
|
|
|
|
|
unless defined $path and -f $path; |
596
|
|
|
|
|
|
|
|
597
|
0
|
0
|
|
|
|
0
|
my $title = ( exists $params{name} ? $params{name} : $path ); |
598
|
|
|
|
|
|
|
|
599
|
0
|
|
|
|
|
0
|
local $/; |
600
|
0
|
0
|
|
|
|
0
|
open my $fh, $path or croak "Unable to open $path: $!"; |
601
|
0
|
|
|
|
|
0
|
my $content = <$fh>; |
602
|
|
|
|
|
|
|
|
603
|
0
|
|
|
|
|
0
|
my $ref; |
604
|
|
|
|
|
|
|
|
605
|
0
|
0
|
|
|
|
0
|
if ( exists $params{parse} ) { |
606
|
0
|
0
|
|
|
|
0
|
croak "code provided is not a reference" unless |
607
|
|
|
|
|
|
|
ref $params{parse}; |
608
|
0
|
0
|
|
|
|
0
|
croak "code provided is not a subroutine" unless |
609
|
|
|
|
|
|
|
ref $params{parse} eq 'CODE'; |
610
|
|
|
|
|
|
|
|
611
|
0
|
|
|
|
|
0
|
$ref = $params{parse}->( $content ); |
612
|
0
|
0
|
0
|
|
|
0
|
croak "did not get an appropriate reference back after parsing" |
613
|
|
|
|
|
|
|
unless ref $ref and ref $ref =~ /(HASH|ARRAY)/; |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
} else { |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
my $code = sub { |
619
|
0
|
|
|
0
|
|
0
|
my $txt = shift; |
620
|
0
|
|
|
|
|
0
|
$txt =~ s/\W/ /g; |
621
|
0
|
|
|
|
|
0
|
my @toks = split m/\s+/, $txt; |
622
|
0
|
|
|
|
|
0
|
\@toks; |
623
|
0
|
|
|
|
|
0
|
}; |
624
|
0
|
|
|
|
|
0
|
$ref = $code->($content); |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
|
627
|
0
|
0
|
|
|
|
0
|
return unless $ref; |
628
|
0
|
|
|
|
|
0
|
$self->add( $title, $ref ); |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=item bulk_add DOCS |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
Add documents to the graph in bulk. Takes as an argument a hash |
635
|
|
|
|
|
|
|
whose keys are document identifiers, and values are references |
636
|
|
|
|
|
|
|
to hashes in the form { WORD1 => COUNT, WORD2 => COUNT...} |
637
|
|
|
|
|
|
|
This method is faster than adding in documents one by one if |
638
|
|
|
|
|
|
|
you have auto_rebalance turned on. |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=cut |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub bulk_add { |
643
|
|
|
|
|
|
|
|
644
|
0
|
|
|
0
|
1
|
0
|
my ( $self, %incoming_docs ) = @_; |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
# Disable graph rebalancing until we've added everything |
647
|
|
|
|
|
|
|
{ |
648
|
0
|
|
|
|
|
0
|
local $self->{auto_reweight} = 0; |
|
0
|
|
|
|
|
0
|
|
649
|
|
|
|
|
|
|
|
650
|
0
|
|
|
|
|
0
|
foreach my $doc ( keys %incoming_docs ) { |
651
|
0
|
|
|
|
|
0
|
$self->add( $doc, $incoming_docs{$doc}); |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
} |
654
|
0
|
0
|
|
|
|
0
|
$self->reweight_graph() if $self->{auto_reweight}; |
655
|
|
|
|
|
|
|
} |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=item degree NODE |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
Given a raw node, returns the degree (raw node means the node must |
661
|
|
|
|
|
|
|
be prefixed with 'D:' or 'T:' depending on type ) |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=cut |
664
|
|
|
|
|
|
|
|
665
|
5
|
|
|
5
|
1
|
3434
|
sub degree { scalar keys %{$_[0]->{neighbors}{$_[1]}} } |
|
5
|
|
|
|
|
46
|
|
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=item delete DOC |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
Remove a document from the graph. Takes a document identifier |
671
|
|
|
|
|
|
|
as an argument. Returns 1 if successful, undef otherwise. |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
=cut |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
sub delete { |
676
|
|
|
|
|
|
|
|
677
|
37
|
|
|
37
|
1
|
282
|
my ( $self, $type, $name ) = @_; |
678
|
|
|
|
|
|
|
|
679
|
37
|
50
|
|
|
|
153
|
croak "Must provide a node type to delete() method" unless defined $type; |
680
|
37
|
50
|
|
|
|
215
|
croak "Invalid type $type passed to delete method. Must be one of [TD]" |
681
|
|
|
|
|
|
|
unless $type =~ /^[TD]$/io; |
682
|
37
|
50
|
|
|
|
84
|
croak "Please provide a node name" unless defined $name; |
683
|
|
|
|
|
|
|
|
684
|
37
|
50
|
|
|
|
97
|
return unless defined $name; |
685
|
37
|
|
|
|
|
87
|
my $node = _nodeify( $type, $name); |
686
|
|
|
|
|
|
|
|
687
|
37
|
|
|
|
|
87
|
my $n = $self->{neighbors}; |
688
|
37
|
50
|
|
|
|
148
|
croak "Found a neighborless node $node" |
689
|
|
|
|
|
|
|
unless exists $n->{$node}; |
690
|
|
|
|
|
|
|
|
691
|
37
|
|
|
|
|
60
|
my @terms = keys %{ $n->{$node} }; |
|
37
|
|
|
|
|
462
|
|
692
|
|
|
|
|
|
|
|
693
|
37
|
50
|
|
|
|
179
|
warn "found ", scalar @terms, " neighbors attached to $node\n" |
694
|
|
|
|
|
|
|
if $self->{debug}; |
695
|
|
|
|
|
|
|
# Check to see if we have orphaned any terms |
696
|
37
|
|
|
|
|
61
|
foreach my $t ( @terms ) { |
697
|
|
|
|
|
|
|
|
698
|
1205
|
|
|
|
|
2144
|
delete $n->{$node}{$t}; |
699
|
1205
|
|
|
|
|
2124
|
delete $n->{$t}{$node}; |
700
|
|
|
|
|
|
|
|
701
|
1205
|
100
|
|
|
|
1260
|
if ( scalar keys %{ $n->{$t} } == 0 ) { |
|
1205
|
|
|
|
|
2994
|
|
702
|
976
|
50
|
|
|
|
1922
|
warn "\tdeleting orphaned node $t" if $self->{debug}; |
703
|
976
|
|
|
|
|
3124
|
my ( $subtype, $name ) = $t =~ /^(.):(.*)$/; |
704
|
|
|
|
|
|
|
#$self->delete( $subtype, $name ); |
705
|
976
|
|
|
|
|
2199
|
delete $n->{$t}; |
706
|
|
|
|
|
|
|
} |
707
|
|
|
|
|
|
|
} |
708
|
|
|
|
|
|
|
|
709
|
37
|
|
|
|
|
113
|
delete $n->{$node}; |
710
|
37
|
|
|
|
|
156
|
$self->check_consistency(); |
711
|
37
|
|
|
|
|
921
|
$self->{reweight_flag} = 1; |
712
|
37
|
50
|
|
|
|
308
|
$self->reweight_graph if $self->{auto_reweight}; |
713
|
37
|
|
|
|
|
570
|
1; |
714
|
|
|
|
|
|
|
} |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
=item has_doc DOC |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
Returns true if the document with identifier DOC is in the collection |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
=cut |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
sub has_doc { |
725
|
4
|
|
|
4
|
1
|
15
|
my ( $self, $doc ) = @_; |
726
|
4
|
50
|
|
|
|
10
|
carp "Received undefined value for has_doc" unless defined $doc; |
727
|
4
|
|
|
|
|
9
|
my $node = _nodeify( 'D', $doc ); |
728
|
4
|
|
100
|
|
|
34
|
return exists $self->{neighbors}{$node} || undef; |
729
|
|
|
|
|
|
|
} |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item has_term TERM |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
Returns true if the term TERM is in the collection |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=cut |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
sub has_term { |
738
|
4
|
|
|
4
|
1
|
6
|
my ( $self, $term ) = @_; |
739
|
4
|
50
|
|
|
|
11
|
carp "Received undefined value for has_term" unless defined $term; |
740
|
4
|
|
|
|
|
8
|
my $node = _nodeify( 'T', $term ); |
741
|
4
|
|
100
|
|
|
31
|
return exists $self->{neighbors}{$node} || undef; |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
=item distance NODE1, NODE2, TYPE |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
Calculates the distance between two nodes of the same type (D or T) |
749
|
|
|
|
|
|
|
using the formula: |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
distance = ... |
752
|
|
|
|
|
|
|
=cut |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
sub distance { |
755
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $n1, $n2, $type ) = @_; |
756
|
0
|
0
|
|
|
|
0
|
croak unless $type; |
757
|
0
|
|
|
|
|
0
|
$type = lc( $type ); |
758
|
0
|
0
|
|
|
|
0
|
croak unless $type =~ /^[dt]$/; |
759
|
0
|
0
|
|
|
|
0
|
my $key = ( $type eq 't' ? 'terms' : 'documents' ); |
760
|
0
|
|
|
|
|
0
|
my @shared = $self->intersection( $key => [ $n1, $n2 ] ); |
761
|
0
|
0
|
|
|
|
0
|
return 0 unless @shared; |
762
|
|
|
|
|
|
|
#warn "Found ", scalar @shared, " nodes shared between $n1 and $n2\n"; |
763
|
|
|
|
|
|
|
|
764
|
0
|
|
|
|
|
0
|
my $node1 = _nodeify( $type, $n1 ); |
765
|
0
|
|
|
|
|
0
|
my $node2 = _nodeify( $type, $n2 ); |
766
|
|
|
|
|
|
|
# formula is w(t1,d1)/deg(d1) + w(t1,d2)/deg(d2) ... ) /deg( t1 ) |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
#warn "Calculating distance\n"; |
769
|
0
|
|
|
|
|
0
|
my $sum1 = 0; |
770
|
0
|
|
|
|
|
0
|
my $sum2 = 0; |
771
|
0
|
|
|
|
|
0
|
foreach my $next ( @shared ) { |
772
|
0
|
|
|
|
|
0
|
my ( undef, $lcount1) = split m/,/, $self->{neighbors}{$node1}{$next}; |
773
|
0
|
|
|
|
|
0
|
my ( undef, $lcount2) = split m/,/, $self->{neighbors}{$node2}{$next}; |
774
|
|
|
|
|
|
|
|
775
|
0
|
|
|
|
|
0
|
my $degree = $self->degree( $next ); |
776
|
|
|
|
|
|
|
#warn "\t degree of $next is $degree\n"; |
777
|
0
|
|
|
|
|
0
|
my $elem1 = $lcount1 / $degree; |
778
|
0
|
|
|
|
|
0
|
$sum1 += $elem1; |
779
|
0
|
|
|
|
|
0
|
my $elem2 = $lcount2 / $degree; |
780
|
0
|
|
|
|
|
0
|
$sum2 += $elem2; |
781
|
|
|
|
|
|
|
} |
782
|
|
|
|
|
|
|
#warn "sum is $sum1, $sum2\n"; |
783
|
0
|
|
|
|
|
0
|
my $final = ($sum1 / $self->degree( $node1 )) + ( $sum2 / $self->degree( $node2 )); |
784
|
|
|
|
|
|
|
#warn "final is $final\n"; |
785
|
0
|
|
|
|
|
0
|
return $final; |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
} |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=item distance_matrix TYPE LIMIT |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Used for clustering using linear local embedding. Produces a similarity matrix |
793
|
|
|
|
|
|
|
in a format I'm too tired to document right now. LIMIT is the maximum number |
794
|
|
|
|
|
|
|
of neighbors to keep for each node. |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
=cut |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
sub distance_matrix { |
799
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $type, $limit ) = @_; |
800
|
0
|
0
|
|
|
|
0
|
croak "Must provide type argument to distance_matrix()" |
801
|
|
|
|
|
|
|
unless defined $type; |
802
|
0
|
0
|
|
|
|
0
|
croak "must provide limit" unless $limit; |
803
|
0
|
|
|
|
|
0
|
my @nodes; |
804
|
0
|
0
|
|
|
|
0
|
if ( lc( $type ) eq 'd' ) { |
|
|
0
|
|
|
|
|
|
805
|
0
|
|
|
|
|
0
|
@nodes = $self->doc_list(); |
806
|
|
|
|
|
|
|
} elsif ( lc( $type ) eq 't' ) { |
807
|
0
|
|
|
|
|
0
|
@nodes = $self->term_list(); |
808
|
|
|
|
|
|
|
} else { |
809
|
0
|
|
|
|
|
0
|
croak "Unsupported type $type"; |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
|
812
|
0
|
|
|
|
|
0
|
my @ret; |
813
|
0
|
|
|
|
|
0
|
my $count = 0; |
814
|
0
|
|
|
|
|
0
|
foreach my $from ( @nodes ) { |
815
|
0
|
|
|
|
|
0
|
warn $from, " - $count\n"; |
816
|
0
|
|
|
|
|
0
|
$count++; |
817
|
0
|
|
|
|
|
0
|
my $index = -1; |
818
|
0
|
|
|
|
|
0
|
my @found; |
819
|
0
|
|
|
|
|
0
|
foreach my $to ( @nodes ) { |
820
|
0
|
|
|
|
|
0
|
$index++; |
821
|
0
|
0
|
|
|
|
0
|
next if $from eq $to; |
822
|
0
|
|
|
|
|
0
|
my $dist = $self->distance( $from, $to, $type ); |
823
|
0
|
0
|
|
|
|
0
|
push @found, [ $index, $dist ] if $dist; |
824
|
|
|
|
|
|
|
#print( $index++, ' ', $dist, " " ) if $dist; |
825
|
|
|
|
|
|
|
} |
826
|
0
|
|
|
|
|
0
|
my @sorted = sort { $b->[1] <=> $a->[1] } @found; |
|
0
|
|
|
|
|
0
|
|
827
|
0
|
|
|
|
|
0
|
my @final = splice ( @sorted, 0, $limit ); |
828
|
0
|
|
|
|
|
0
|
push @ret, join " ", ( map { join ' ', $_->[0], substr($_->[1], 0, 7) } |
|
0
|
|
|
|
|
0
|
|
829
|
0
|
|
|
|
|
0
|
sort { $a->[0] <=> $b->[0] } |
830
|
|
|
|
|
|
|
@final), "\n"; |
831
|
|
|
|
|
|
|
#print "\n"; |
832
|
|
|
|
|
|
|
} |
833
|
0
|
|
|
|
|
0
|
return join "\n", @ret; |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=item intersection @NODES |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
Returns a list of neighbor nodes that all the given nodes share in common |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=cut |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
sub intersection { |
844
|
2
|
|
|
2
|
1
|
1324
|
my ( $self, %nodes ) = @_; |
845
|
2
|
|
|
|
|
3
|
my @nodes; |
846
|
2
|
100
|
|
|
|
17
|
if ( exists $nodes{documents} ) { |
847
|
1
|
|
|
|
|
3
|
push @nodes, map { _nodeify( 'D', $_ ) } @{ $nodes{documents}}; |
|
2
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
4
|
|
848
|
|
|
|
|
|
|
} |
849
|
2
|
100
|
|
|
|
33
|
if ( exists $nodes{terms} ) { |
850
|
1
|
|
|
|
|
1
|
push @nodes, map { _nodeify( 'T', $_ ) } @{ $nodes{terms}}; |
|
2
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
3
|
|
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
|
853
|
2
|
|
|
|
|
4
|
my %seen; |
854
|
2
|
|
|
|
|
4
|
foreach my $n ( @nodes ) { |
855
|
4
|
|
|
|
|
9
|
my @neighbors = $self->_neighbors( $n ); |
856
|
4
|
|
|
|
|
101
|
$seen{ $_ }++ foreach @neighbors; |
857
|
|
|
|
|
|
|
} |
858
|
4
|
|
|
|
|
12
|
return map { s/^[DT]://; $_ } |
|
4
|
|
|
|
|
23
|
|
|
131
|
|
|
|
|
169
|
|
859
|
2
|
|
|
|
|
14
|
grep { $seen{$_} == scalar @nodes } |
860
|
|
|
|
|
|
|
keys %seen; |
861
|
|
|
|
|
|
|
} |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
=item raw_search @NODES |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
Given a list of nodes, returns a hash of nearest nodes with relevance values, |
866
|
|
|
|
|
|
|
in the format NODE => RELEVANCE, for all nodes above the threshold value. |
867
|
|
|
|
|
|
|
(You probably want one of L, L, or L instead). |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
=cut |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
sub raw_search { |
872
|
12
|
|
|
12
|
1
|
27
|
my ( $self, @query ) = @_; |
873
|
|
|
|
|
|
|
|
874
|
12
|
|
|
|
|
46
|
$self->_clear(); |
875
|
12
|
|
|
|
|
37
|
foreach ( @query ) { |
876
|
12
|
|
|
|
|
51
|
$self->_energize( $_, $self->{'START_ENERGY'}); |
877
|
|
|
|
|
|
|
} |
878
|
12
|
|
|
|
|
36
|
my $results_ref = $self->_collect(); |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
|
881
|
12
|
|
|
|
|
30
|
return $results_ref; |
882
|
|
|
|
|
|
|
} |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
=item reweight_graph |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
Iterates through the graph, calculating edge weights and normalizing |
890
|
|
|
|
|
|
|
around nodes. This method is automatically called every time a |
891
|
|
|
|
|
|
|
document is added, removed, or updated, unless you turn the option |
892
|
|
|
|
|
|
|
off with auto_reweight(0). When adding a lot of docs, this can be |
893
|
|
|
|
|
|
|
time consuming, so either set auto_reweight to off or use the |
894
|
|
|
|
|
|
|
L method to add lots of docs at once |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=cut |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
sub reweight_graph { |
899
|
294
|
|
|
294
|
1
|
1615
|
my ( $self ) = @_; |
900
|
|
|
|
|
|
|
|
901
|
294
|
|
|
|
|
543
|
my $n = $self->{neighbors}; #shortcut |
902
|
294
|
|
|
|
|
766
|
my $doc_count = $self->doc_count(); |
903
|
|
|
|
|
|
|
#print "Renormalizing for doc count $doc_count\n" if $self->{debug}; |
904
|
294
|
|
|
|
|
7166
|
foreach my $node ( keys %{$n} ) { |
|
294
|
|
|
|
|
16117
|
|
905
|
|
|
|
|
|
|
|
906
|
150286
|
100
|
|
|
|
325623
|
next unless $node =~ /^D:/o; |
907
|
5210
|
50
|
|
|
|
24072
|
warn "reweighting at node $node\n" if $self->{debug} > 1; |
908
|
5210
|
|
|
|
|
6133
|
my @terms = keys %{ $n->{$node} }; |
|
5210
|
|
|
|
|
73395
|
|
909
|
5210
|
|
|
|
|
19343
|
my @edges; |
910
|
5210
|
|
|
|
|
7402
|
foreach my $t ( @terms ) { |
911
|
|
|
|
|
|
|
|
912
|
173163
|
|
|
|
|
357962
|
my $pair = $n->{$node}{$t}; |
913
|
173163
|
|
|
|
|
411221
|
my ( undef, $lcount ) = split /,/, $pair; |
914
|
173163
|
|
|
|
|
639016
|
( my $term = $t ) =~ s/^T://; |
915
|
173163
|
50
|
|
|
|
375967
|
croak "did not receive a local count" unless $lcount; |
916
|
173163
|
|
|
|
|
174757
|
my $weight; |
917
|
173163
|
50
|
|
|
|
349496
|
if ( $self->{use_global_weights} ) { |
918
|
|
|
|
|
|
|
|
919
|
173163
|
|
|
|
|
349136
|
my $gweight = log( $doc_count / $self->doc_count( $term ) ) + 1; |
920
|
173163
|
|
|
|
|
384076
|
my $lweight = log( $lcount ) + 1; |
921
|
173163
|
|
|
|
|
265720
|
$weight = ( $gweight * $lweight ); |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
} else { |
924
|
|
|
|
|
|
|
|
925
|
0
|
|
|
|
|
0
|
$weight = log( $lcount ) + 1; |
926
|
|
|
|
|
|
|
} |
927
|
173163
|
|
|
|
|
574606
|
push @edges, [ $node, $t, $weight, $lcount ]; |
928
|
|
|
|
|
|
|
} |
929
|
|
|
|
|
|
|
|
930
|
5210
|
|
|
|
|
13666
|
__normalize( \@edges ); |
931
|
|
|
|
|
|
|
|
932
|
5210
|
|
|
|
|
9523
|
foreach my $e ( @edges ) { |
933
|
173163
|
|
|
|
|
571156
|
my $pair = join ',', $e->[2], $e->[3]; |
934
|
173163
|
|
|
|
|
566737
|
$n->{$node}{$e->[1]} = $n->{$e->[1]}{$node} = $pair; |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
} |
937
|
294
|
|
|
|
|
17793
|
$self->{reweight_flag} = 0; |
938
|
294
|
|
|
|
|
648
|
return 1; |
939
|
|
|
|
|
|
|
} |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=item update ID, WORDS |
945
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
Given a document identifier and a word list, updates the information for |
947
|
|
|
|
|
|
|
that document in the graph. Returns the number of changes made |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
=cut |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
sub update { |
952
|
|
|
|
|
|
|
|
953
|
2
|
|
|
2
|
1
|
68
|
my ( $self, $id, $words ) = @_; |
954
|
|
|
|
|
|
|
|
955
|
2
|
50
|
|
|
|
10
|
croak "update not implemented in XS" if $self->{xs}; |
956
|
2
|
50
|
|
|
|
7
|
croak "Must provide a document identifier to update_document" unless defined $id; |
957
|
2
|
|
|
|
|
8
|
my $dnode = _nodeify( 'D', $id ); |
958
|
|
|
|
|
|
|
|
959
|
2
|
100
|
|
|
|
13
|
return unless exists $self->{neighbors}{$dnode}; |
960
|
1
|
50
|
33
|
|
|
19
|
croak "must provide a word list " |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
961
|
|
|
|
|
|
|
unless defined $words and |
962
|
|
|
|
|
|
|
ref $words and |
963
|
|
|
|
|
|
|
( ref $words eq 'HASH' or |
964
|
|
|
|
|
|
|
ref $words eq 'ARRAY' ); |
965
|
|
|
|
|
|
|
|
966
|
1
|
|
|
|
|
32
|
my $n = $self->{neighbors}{$dnode}; |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
# Get the current word list |
969
|
1
|
|
|
|
|
2
|
my @terms = keys %{ $n }; |
|
1
|
|
|
|
|
6
|
|
970
|
|
|
|
|
|
|
|
971
|
1
|
50
|
|
|
|
6
|
if ( ref $words eq 'ARRAY' ) { |
972
|
1
|
|
|
|
|
2
|
my %words; |
973
|
1
|
|
|
|
|
8
|
$words{$_}++ foreach @$words; |
974
|
1
|
|
|
|
|
4
|
$words = \%words; |
975
|
|
|
|
|
|
|
} |
976
|
|
|
|
|
|
|
|
977
|
1
|
|
|
|
|
4
|
local $self->{auto_reweight} = 0; |
978
|
|
|
|
|
|
|
|
979
|
1
|
|
|
|
|
2
|
my $must_reweight = 0; |
980
|
1
|
|
|
|
|
2
|
my %seen; |
981
|
|
|
|
|
|
|
|
982
|
1
|
|
|
|
|
2
|
foreach my $term ( keys %{$words} ) { |
|
1
|
|
|
|
|
4
|
|
983
|
|
|
|
|
|
|
|
984
|
4
|
|
|
|
|
10
|
my $t = _nodeify( 'T', $term ); |
985
|
|
|
|
|
|
|
|
986
|
4
|
100
|
|
|
|
11
|
if ( exists $n->{$t} ){ |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
# Update the local count, if necessary |
989
|
3
|
|
|
|
|
7
|
my $curr_val = $n->{$t}; |
990
|
3
|
|
|
|
|
9
|
my ( undef, $loc ) = split m/,/, $curr_val; |
991
|
|
|
|
|
|
|
|
992
|
3
|
50
|
|
|
|
12
|
unless ( $loc == $words->{$term} ) { |
993
|
0
|
|
|
|
|
0
|
$n->{$t} = join ',', 1, $words->{$term}; |
994
|
0
|
|
|
|
|
0
|
$must_reweight++; |
995
|
|
|
|
|
|
|
} |
996
|
|
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
else { |
999
|
|
|
|
|
|
|
|
1000
|
1
|
|
|
|
|
8
|
$n->{$t} = |
1001
|
|
|
|
|
|
|
$self->{neighbors}{$t}{$dnode} = |
1002
|
|
|
|
|
|
|
join ',', 1, $words->{$term}; |
1003
|
1
|
|
|
|
|
2
|
$must_reweight++; |
1004
|
|
|
|
|
|
|
} |
1005
|
|
|
|
|
|
|
|
1006
|
4
|
|
|
|
|
12
|
$seen{$t}++; |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
# Check for deleted words |
1010
|
1
|
|
|
|
|
5
|
foreach my $t ( @terms ) { |
1011
|
3
|
50
|
|
|
|
9
|
$must_reweight++ |
1012
|
|
|
|
|
|
|
unless exists $seen{$t}; |
1013
|
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
|
|
1015
|
1
|
50
|
|
|
|
16
|
$self->reweight_graph() if |
1016
|
|
|
|
|
|
|
$must_reweight; |
1017
|
|
|
|
|
|
|
|
1018
|
1
|
|
|
|
|
17
|
return $must_reweight; |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
} |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
=item doc_count [TERM] |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
Returns a count of all documents that TERM occurs in. |
1026
|
|
|
|
|
|
|
If no argument is provided, returns a document count |
1027
|
|
|
|
|
|
|
for the entire collection. |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
=cut |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
sub doc_count { |
1032
|
173490
|
|
|
173490
|
1
|
254700
|
my ( $self, $term ) = @_; |
1033
|
173490
|
100
|
|
|
|
301573
|
if ( defined $term ) { |
1034
|
173186
|
50
|
|
|
|
498440
|
$term = _nodeify( 'T', $term ) unless $term =~ /^T:/; |
1035
|
173186
|
|
|
|
|
395011
|
my $node = $self->{neighbors}{$term}; |
1036
|
173186
|
100
|
|
|
|
326908
|
return 0 unless defined $node; |
1037
|
173183
|
|
|
|
|
199690
|
return scalar keys %{$node}; |
|
173183
|
|
|
|
|
568622
|
|
1038
|
|
|
|
|
|
|
} else { |
1039
|
304
|
|
|
|
|
61002
|
return scalar grep /D:/, |
1040
|
304
|
|
|
|
|
414
|
keys %{ $self->{'neighbors'} }; |
1041
|
|
|
|
|
|
|
} |
1042
|
|
|
|
|
|
|
} |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
=item doc_list [TERM] |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
Returns a sorted list of document identifiers that contain |
1048
|
|
|
|
|
|
|
TERM, in ASCII-betical order. If no argument is given, |
1049
|
|
|
|
|
|
|
returns a sorted document list for the whole collection. |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
=cut |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
sub doc_list { |
1054
|
4
|
|
|
4
|
1
|
74
|
my ( $self, $term ) = @_; |
1055
|
4
|
|
|
|
|
5
|
my $t; |
1056
|
4
|
50
|
33
|
|
|
16
|
if ( defined $term and $term !~ /T:/) { |
1057
|
0
|
|
|
|
|
0
|
$t = _nodeify( 'T', $term ); |
1058
|
|
|
|
|
|
|
} |
1059
|
4
|
50
|
|
|
|
14
|
my $hash = ( defined $term ? |
1060
|
|
|
|
|
|
|
$self->{neighbors}{$t} : |
1061
|
|
|
|
|
|
|
$self->{neighbors} ); |
1062
|
|
|
|
|
|
|
|
1063
|
136
|
|
|
|
|
216
|
sort map { s/^D://o; $_ } |
|
136
|
|
|
|
|
235
|
|
|
4
|
|
|
|
|
862
|
|
1064
|
4
|
|
|
|
|
6
|
grep /^D:/, keys %{ $hash }; |
1065
|
|
|
|
|
|
|
} |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
sub dump { |
1069
|
0
|
|
|
0
|
0
|
0
|
my ( $self ) = @_; |
1070
|
0
|
|
|
|
|
0
|
my @docs = $self->doc_list(); |
1071
|
|
|
|
|
|
|
|
1072
|
0
|
|
|
|
|
0
|
foreach my $d ( @docs ) { |
1073
|
0
|
|
|
|
|
0
|
print $self->dump_node( $d ); |
1074
|
|
|
|
|
|
|
} |
1075
|
|
|
|
|
|
|
} |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
=item dump_node NODE |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
Lists all of the neighbors of a node, together with edge |
1080
|
|
|
|
|
|
|
weights connecting to them |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
=cut |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
sub dump_node { |
1085
|
2
|
|
|
2
|
1
|
10
|
my ( $self, $node ) = @_; |
1086
|
|
|
|
|
|
|
|
1087
|
2
|
|
|
|
|
4
|
my @lines; |
1088
|
2
|
|
|
|
|
8
|
push @lines, join "\t", "COUNT", "WEIGHT", "NEIGHBOR"; |
1089
|
|
|
|
|
|
|
|
1090
|
2
|
|
|
|
|
5
|
foreach my $n ( keys %{ $self->{neighbors}{$node} } ) { |
|
2
|
|
|
|
|
33
|
|
1091
|
116
|
|
|
|
|
180
|
my $v = $self->{neighbors}{$node}{$n}; |
1092
|
116
|
|
|
|
|
188
|
my ( $weight, $count ) = split /,/, $v; |
1093
|
116
|
|
|
|
|
237
|
push @lines, join "\t", $count, substr( $weight, 0, 8 ), $n; |
1094
|
|
|
|
|
|
|
} |
1095
|
2
|
|
|
|
|
31
|
return @lines; |
1096
|
|
|
|
|
|
|
} |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=item dump_tdm [FILE] |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
Dumps internal state in term-document matrix (TDM) format, which looks |
1103
|
|
|
|
|
|
|
like this: |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
A B C B C B C |
1106
|
|
|
|
|
|
|
A B C B C B C |
1107
|
|
|
|
|
|
|
A B C B C B C |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
Where each row represents a document, A is the number of terms in the |
1110
|
|
|
|
|
|
|
document, B is the term node and C is the edge weight between the doc |
1111
|
|
|
|
|
|
|
node and B. Mostly used as a legacy format by the module author. |
1112
|
|
|
|
|
|
|
Doc and term nodes are printed in ASCII-betical sorted order, zero-based |
1113
|
|
|
|
|
|
|
indexing. Up to you to keep track of the ID => title mappings, neener-neener! |
1114
|
|
|
|
|
|
|
Use doc_list and term_list to get an equivalently sorted list |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
=cut |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
sub dump_tdm { |
1119
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $file ) = @_; |
1120
|
|
|
|
|
|
|
|
1121
|
0
|
|
|
|
|
0
|
my $counter = 0; |
1122
|
0
|
|
|
|
|
0
|
my %lookup; |
1123
|
0
|
|
|
|
|
0
|
$lookup{$_} = $counter++ foreach $self->term_list; |
1124
|
|
|
|
|
|
|
|
1125
|
0
|
|
|
|
|
0
|
my @docs = $self->doc_list; |
1126
|
|
|
|
|
|
|
|
1127
|
0
|
|
|
|
|
0
|
my $fh; |
1128
|
0
|
0
|
|
|
|
0
|
if ( defined $file ) { |
1129
|
0
|
0
|
|
|
|
0
|
open $fh, "> $file" or croak |
1130
|
|
|
|
|
|
|
"Could not open TDM output file: $!"; |
1131
|
|
|
|
|
|
|
} else { |
1132
|
0
|
|
|
|
|
0
|
*fh = *STDOUT; |
1133
|
|
|
|
|
|
|
} |
1134
|
0
|
|
|
|
|
0
|
foreach my $doc ( @docs ) { |
1135
|
0
|
|
|
|
|
0
|
my $n = $self->{neighbors}{$doc}; |
1136
|
|
|
|
|
|
|
|
1137
|
0
|
|
|
|
|
0
|
my $row_count = scalar keys %{$n}; |
|
0
|
|
|
|
|
0
|
|
1138
|
0
|
|
|
|
|
0
|
print $fh $row_count; |
1139
|
|
|
|
|
|
|
|
1140
|
0
|
|
|
|
|
0
|
foreach my $t ( sort keys %{$doc} ) { |
|
0
|
|
|
|
|
0
|
|
1141
|
0
|
|
|
|
|
0
|
my $index = $lookup{$t}; |
1142
|
0
|
|
|
|
|
0
|
my ( $weight, undef ) = split m/,/, $n->{$t}; |
1143
|
0
|
|
|
|
|
0
|
print $fh ' ', $index, ' ', $weight; |
1144
|
|
|
|
|
|
|
} |
1145
|
0
|
|
|
|
|
0
|
print $fh "\n"; |
1146
|
|
|
|
|
|
|
} |
1147
|
|
|
|
|
|
|
} |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
=item near_neighbors [NODE] |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
Returns a list of neighbor nodes of the same type (doc/doc, or term/term) two |
1154
|
|
|
|
|
|
|
hops away. |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=cut |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
sub near_neighbors { |
1159
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $name, $type ) = @_; |
1160
|
|
|
|
|
|
|
|
1161
|
0
|
|
|
|
|
0
|
my $node = _nodeify( $type, $name ); |
1162
|
|
|
|
|
|
|
|
1163
|
0
|
|
|
|
|
0
|
my $n = $self->{neighbors}{$node}; |
1164
|
|
|
|
|
|
|
|
1165
|
0
|
|
|
|
|
0
|
my %found; |
1166
|
0
|
|
|
|
|
0
|
foreach my $next ( keys %{$n} ) { |
|
0
|
|
|
|
|
0
|
|
1167
|
0
|
|
|
|
|
0
|
foreach my $mynext ( keys %{ $self->{neighbors}{$next} }){ |
|
0
|
|
|
|
|
0
|
|
1168
|
0
|
|
|
|
|
0
|
$found{$mynext}++; |
1169
|
|
|
|
|
|
|
} |
1170
|
|
|
|
|
|
|
} |
1171
|
0
|
|
|
|
|
0
|
delete $found{$node}; |
1172
|
0
|
|
|
|
|
0
|
return keys %found; |
1173
|
|
|
|
|
|
|
} |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
=item term_count [DOC] |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
Returns the number of unique terms in a document or, |
1179
|
|
|
|
|
|
|
if no document is specified, in the entire collection. |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
=cut |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
sub term_count { |
1184
|
9
|
|
|
9
|
1
|
2187
|
my ( $self, $doc ) = @_; |
1185
|
9
|
50
|
|
|
|
34
|
if ( defined $doc ) { |
1186
|
0
|
|
|
|
|
0
|
my $node = $self->{neighbors}{ _nodeify( 'D', $doc) }; |
1187
|
0
|
0
|
|
|
|
0
|
return 0 unless defined $node; |
1188
|
0
|
|
|
|
|
0
|
return scalar keys %{$node}; |
|
0
|
|
|
|
|
0
|
|
1189
|
|
|
|
|
|
|
} else { |
1190
|
9
|
|
|
|
|
8036
|
return scalar grep /T:/, |
1191
|
9
|
|
|
|
|
17
|
keys %{ $self->{neighbors} }; |
1192
|
|
|
|
|
|
|
} |
1193
|
|
|
|
|
|
|
} |
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
=item term_list [DOC] |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
Returns a sorted list of unique terms appearing in the document |
1199
|
|
|
|
|
|
|
with identifier DOC, in ASCII-betical order. If no argument is |
1200
|
|
|
|
|
|
|
given, returns a sorted term list for the whole collection. |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
=cut |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
sub term_list { |
1205
|
3
|
|
|
3
|
1
|
1345
|
my ( $self, $doc ) = @_; |
1206
|
|
|
|
|
|
|
|
1207
|
3
|
100
|
|
|
|
16
|
my $node = ( defined $doc ? |
1208
|
|
|
|
|
|
|
$self->{neighbors}{ _nodeify( 'D', $doc) } : |
1209
|
|
|
|
|
|
|
$self->{neighbors} |
1210
|
|
|
|
|
|
|
); |
1211
|
|
|
|
|
|
|
|
1212
|
1782
|
|
|
|
|
3740
|
sort map { s/^T://o; $_ } |
|
1782
|
|
|
|
|
4800
|
|
|
3
|
|
|
|
|
1120
|
|
1213
|
3
|
|
|
|
|
218
|
grep /^T:/, keys %{ $node }; |
1214
|
|
|
|
|
|
|
} |
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
=item word_count [TERM] |
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
Returns the total occurence count for a term, or if no argument is given, |
1221
|
|
|
|
|
|
|
a word count for the entire collection. The word count is always greater than |
1222
|
|
|
|
|
|
|
or equal to the term count. |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=cut |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
sub word_count { |
1227
|
|
|
|
|
|
|
|
1228
|
4
|
|
|
4
|
1
|
13
|
my ( $self, $term ) = @_; |
1229
|
|
|
|
|
|
|
|
1230
|
4
|
|
|
|
|
12
|
my $n = $self->{neighbors}; # shortcut |
1231
|
|
|
|
|
|
|
|
1232
|
4
|
|
|
|
|
9
|
my $count = 0; |
1233
|
4
|
|
|
|
|
9
|
my @terms; |
1234
|
4
|
100
|
|
|
|
17
|
if ( defined $term ) { |
1235
|
3
|
|
|
|
|
8
|
push @terms, $term; |
1236
|
|
|
|
|
|
|
} else { |
1237
|
1
|
|
|
|
|
3
|
@terms = $self->term_list(); |
1238
|
|
|
|
|
|
|
} |
1239
|
|
|
|
|
|
|
|
1240
|
4
|
|
|
|
|
138
|
foreach my $term (@terms ) { |
1241
|
879
|
50
|
|
|
|
2252
|
$term = _nodeify( 'T', $term) unless $term =~/^T:/o; |
1242
|
879
|
|
|
|
|
930
|
foreach my $doc ( keys %{ $n->{$term} } ) { |
|
879
|
|
|
|
|
2124
|
|
1243
|
1092
|
|
|
|
|
2615
|
( undef, my $lcount ) = split /,/, $n->{$term}{$doc}; |
1244
|
1092
|
|
|
|
|
2603
|
$count += $lcount; |
1245
|
|
|
|
|
|
|
} |
1246
|
|
|
|
|
|
|
} |
1247
|
|
|
|
|
|
|
|
1248
|
4
|
|
|
|
|
94
|
return $count; |
1249
|
|
|
|
|
|
|
} |
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
=item search @QUERY |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
Searches the graph for all of the words in @QUERY. Use find_similar if you |
1258
|
|
|
|
|
|
|
want to do a document similarity instead, or mixed_search if you want |
1259
|
|
|
|
|
|
|
to search on any combination of words and documents. Returns a pair of hashrefs: |
1260
|
|
|
|
|
|
|
the first a reference to a hash of docs and relevance values, the second to |
1261
|
|
|
|
|
|
|
a hash of words and relevance values. |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
=cut |
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
sub search { |
1266
|
9
|
|
|
9
|
1
|
2906
|
my ( $self, @query ) = @_; |
1267
|
9
|
|
|
|
|
27
|
my @nodes = _nodeify( 'T', @query ); |
1268
|
9
|
|
|
|
|
40
|
my $results = $self->raw_search( @nodes ); |
1269
|
9
|
|
|
|
|
28
|
my ($docs, $words) = _partition( $results ); |
1270
|
9
|
|
|
|
|
50
|
return ( $docs, $words); |
1271
|
|
|
|
|
|
|
} |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
=item simple_search QUERY |
1276
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
This is the DWIM method - takes a query string as its argument, and returns an array |
1278
|
|
|
|
|
|
|
of documents, sorted by relevance. |
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=cut |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
sub simple_search { |
1283
|
1
|
|
|
1
|
1
|
12
|
my ( $self, $query ) = @_; |
1284
|
1
|
|
|
|
|
3
|
my @words = map { s/\W+//g; lc($_) } |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
1285
|
|
|
|
|
|
|
split m/\s+/, $query; |
1286
|
1
|
|
|
|
|
4
|
my @nodes = _nodeify( 'T', @words ); |
1287
|
1
|
|
|
|
|
4
|
my $results = $self->raw_search( @nodes ); |
1288
|
1
|
|
|
|
|
3
|
my ($docs, $words) = _partition( $results ); |
1289
|
1
|
|
|
|
|
3
|
my @sorted_docs = sort { $docs->{$b} <=> $docs->{$a} } keys %{$docs}; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
4
|
|
1290
|
1
|
|
|
|
|
7
|
return @sorted_docs; |
1291
|
|
|
|
|
|
|
} |
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
=item find_by_title @TITLES |
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
Given a list of patterns, searches for documents with matching titles |
1296
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
=cut |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
sub find_by_title { |
1300
|
3
|
|
|
3
|
1
|
1035
|
my ( $self, @titles ) = @_; |
1301
|
3
|
|
|
|
|
4
|
my @found; |
1302
|
3
|
|
|
|
|
9
|
my @docs = $self->doc_list(); |
1303
|
3
|
|
|
|
|
109
|
my $pattern = join '|', @titles; |
1304
|
3
|
|
|
|
|
66
|
my $match_me = qr/$pattern/i; |
1305
|
|
|
|
|
|
|
#warn $match_me, "\n"; |
1306
|
3
|
|
|
|
|
6
|
foreach my $d ( @docs ) { |
1307
|
|
|
|
|
|
|
# warn $d, "\n"; |
1308
|
102
|
100
|
|
|
|
318
|
push @found, $d if $d =~ $match_me; |
1309
|
|
|
|
|
|
|
} |
1310
|
3
|
|
|
|
|
27
|
return @found; |
1311
|
|
|
|
|
|
|
} |
1312
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
=item find_similar @DOCS |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
Given an array of document identifiers, performs a similarity search |
1317
|
|
|
|
|
|
|
and returns a pair of hashrefs. First hashref is to a hash of docs and relevance |
1318
|
|
|
|
|
|
|
values, second is to a hash of words and relevance values. |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
=cut |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
sub find_similar { |
1323
|
2
|
|
|
2
|
1
|
2082
|
my ( $self, @docs ) = @_; |
1324
|
2
|
|
|
|
|
6
|
my @nodes = _nodeify( 'D', @docs ); |
1325
|
2
|
|
|
|
|
6
|
my $results = $self->raw_search( @nodes ); |
1326
|
2
|
|
|
|
|
5
|
my ($docs, $words) = _partition( $results ); |
1327
|
2
|
|
|
|
|
12
|
return ( $docs, $words); |
1328
|
|
|
|
|
|
|
} |
1329
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
=item merge TYPE, GOOD, @BAD |
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
Combine all the nodes in @BAD into the node with identifier GOOD. |
1334
|
|
|
|
|
|
|
First argument must be one of 'T' or 'D' to indicate term or |
1335
|
|
|
|
|
|
|
document nodes. Used to combine synonyms in the graph. |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
=cut |
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
sub merge { |
1340
|
2
|
|
|
2
|
1
|
1349
|
my ( $self, $type, $good, @bad ) = @_; |
1341
|
2
|
50
|
|
|
|
14
|
croak "must provide a type argument to merge" |
1342
|
|
|
|
|
|
|
unless defined $type; |
1343
|
2
|
50
|
|
|
|
11
|
croak "Invalid type argument $type to merge [must be one of (D,T)]" |
1344
|
|
|
|
|
|
|
unless $type =~ /^[DT]/io; |
1345
|
|
|
|
|
|
|
|
1346
|
2
|
|
|
|
|
6
|
my $target = _nodeify( $type, $good ); |
1347
|
2
|
|
|
|
|
5
|
my @sources = _nodeify( $type, @bad ); |
1348
|
|
|
|
|
|
|
|
1349
|
2
|
|
|
|
|
7
|
my $tnode = $self->{neighbors}{$target}; |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
|
1352
|
2
|
|
|
|
|
3
|
foreach my $bad_node ( @sources ) { |
1353
|
|
|
|
|
|
|
#print "Examining $bad_node\n"; |
1354
|
2
|
50
|
|
|
|
6
|
next if $bad_node eq $target; |
1355
|
2
|
|
|
|
|
4
|
my %neighbors = %{$self->{neighbors}{$bad_node}}; |
|
2
|
|
|
|
|
20
|
|
1356
|
|
|
|
|
|
|
|
1357
|
2
|
|
|
|
|
9
|
foreach my $n ( keys %neighbors ) { |
1358
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
#print "\t $target ($bad_node) neighbor $n\n"; |
1360
|
14
|
100
|
|
|
|
35
|
if ( exists $self->{neighbors}{$target}{$n} ) { |
1361
|
|
|
|
|
|
|
#print "\t\t$n has link to $bad_node\n"; |
1362
|
|
|
|
|
|
|
# combine the local counts for the term members of the edge |
1363
|
5
|
|
|
|
|
9
|
my $curr_val = $tnode->{$n}; |
1364
|
5
|
|
|
|
|
11
|
my $aug_val = $self->{neighbors}{$bad_node}{$n}; |
1365
|
5
|
|
|
|
|
14
|
my ($w1, $c1) = split m/,/, $curr_val; |
1366
|
5
|
|
|
|
|
11
|
my ($w2, $c2) = split m/,/, $aug_val; |
1367
|
5
|
|
|
|
|
8
|
my $new_count = $c1 + $c2; |
1368
|
5
|
|
|
|
|
25
|
$curr_val =~ s/,\d+$/,$new_count/; |
1369
|
5
|
|
|
|
|
14
|
$tnode->{$n} = $curr_val; |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
} else { |
1373
|
|
|
|
|
|
|
|
1374
|
9
|
50
|
|
|
|
21
|
die "sanity check failed for existence test" |
1375
|
|
|
|
|
|
|
if exists $self->{neighbors}{$target}{$n}; |
1376
|
|
|
|
|
|
|
|
1377
|
9
|
|
|
|
|
15
|
my $val = $self->{neighbors}{$bad_node}{$n}; |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
#print "\tno existing link -- reassigning $target -- $n\n"; |
1380
|
|
|
|
|
|
|
# reassign the current value of this edge |
1381
|
|
|
|
|
|
|
|
1382
|
9
|
|
|
|
|
22
|
$self->{neighbors}{$n}{$target} = $val; |
1383
|
9
|
|
|
|
|
20
|
$self->{neighbors}{$target}{$n} = $val; |
1384
|
|
|
|
|
|
|
} |
1385
|
|
|
|
|
|
|
|
1386
|
14
|
|
|
|
|
23
|
delete $self->{neighbors}{$bad_node}{$n}; |
1387
|
14
|
|
|
|
|
35
|
delete $self->{neighbors}{$n}{$bad_node}; |
1388
|
|
|
|
|
|
|
} |
1389
|
2
|
|
|
|
|
14
|
delete $self->{neighbors}{$bad_node}; |
1390
|
|
|
|
|
|
|
} |
1391
|
|
|
|
|
|
|
} |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
=item mixed_search @DOCS |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
Given a hashref in the form: |
1396
|
|
|
|
|
|
|
{ docs => [ 'Title 1', 'Title 2' ], |
1397
|
|
|
|
|
|
|
terms => ['buffalo', 'fox' ], } |
1398
|
|
|
|
|
|
|
} |
1399
|
|
|
|
|
|
|
Runs a combined search on the terms and documents provided, and |
1400
|
|
|
|
|
|
|
returns a pair of hashrefs. The first hashref is to a hash of docs |
1401
|
|
|
|
|
|
|
and relevance values, second is to a hash of words and relevance values. |
1402
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
=cut |
1404
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
sub mixed_search { |
1406
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $incoming ) = @_; |
1407
|
|
|
|
|
|
|
|
1408
|
0
|
0
|
0
|
|
|
0
|
croak "must provide hash ref to mixed_search method" |
|
|
|
0
|
|
|
|
|
1409
|
|
|
|
|
|
|
unless defined $incoming && |
1410
|
|
|
|
|
|
|
ref( $incoming ) && |
1411
|
|
|
|
|
|
|
ref( $incoming ) eq 'HASH'; |
1412
|
|
|
|
|
|
|
|
1413
|
0
|
|
0
|
|
|
0
|
my $tref = $incoming->{'terms'} || []; |
1414
|
0
|
|
0
|
|
|
0
|
my $dref = $incoming->{'docs'} || []; |
1415
|
|
|
|
|
|
|
|
1416
|
0
|
|
|
|
|
0
|
my @dnodes = _nodeify( 'D', @{$dref} ); |
|
0
|
|
|
|
|
0
|
|
1417
|
0
|
|
|
|
|
0
|
my @tnodes = _nodeify( 'T', @{$tref} ); |
|
0
|
|
|
|
|
0
|
|
1418
|
|
|
|
|
|
|
|
1419
|
0
|
|
|
|
|
0
|
my $results = $self->raw_search( @dnodes, @tnodes ); |
1420
|
0
|
|
|
|
|
0
|
my ($docs, $words) = _partition( $results ); |
1421
|
0
|
|
|
|
|
0
|
return ( $docs, $words); |
1422
|
|
|
|
|
|
|
} |
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
|
1425
|
|
|
|
|
|
|
=item store FILENAME |
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
Stores the object to a file for later use. Not compatible (yet) |
1428
|
|
|
|
|
|
|
with compiled XS version, which will give a fatal error. |
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
=cut |
1431
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
sub store { |
1433
|
0
|
|
|
0
|
1
|
0
|
my ( $self, @args ) = @_; |
1434
|
0
|
0
|
|
|
|
0
|
if ( $self->{'xs'} ) { |
1435
|
0
|
|
|
|
|
0
|
croak "Cannot store object when running in XS mode."; |
1436
|
|
|
|
|
|
|
} else { |
1437
|
0
|
|
|
|
|
0
|
$self->SUPER::nstore(@args); |
1438
|
|
|
|
|
|
|
} |
1439
|
|
|
|
|
|
|
} |
1440
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
# Partition - internal method. |
1443
|
|
|
|
|
|
|
# Takes a result set and splits it into two hashrefs - one for |
1444
|
|
|
|
|
|
|
# words and one for documents |
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
sub _partition { |
1447
|
12
|
|
|
12
|
|
19
|
my ( $e ) = @_; |
1448
|
12
|
|
|
|
|
17
|
my ( $docs, $words ); |
1449
|
12
|
|
|
|
|
16
|
foreach my $k ( sort { $e->{$b} <=> $e->{$a} } |
|
209
|
|
|
|
|
277
|
|
|
12
|
|
|
|
|
83
|
|
1450
|
|
|
|
|
|
|
keys %{ $e } ) { |
1451
|
|
|
|
|
|
|
|
1452
|
81
|
|
|
|
|
219
|
(my $name = $k ) =~ s/^[DT]://o; |
1453
|
81
|
100
|
|
|
|
285
|
$k =~ /^D:/ ? |
1454
|
|
|
|
|
|
|
$docs->{$name} = $e->{$k} : |
1455
|
|
|
|
|
|
|
$words->{$name} = $e->{$k} ; |
1456
|
|
|
|
|
|
|
} |
1457
|
12
|
|
|
|
|
37
|
return ( $docs, $words ); |
1458
|
|
|
|
|
|
|
} |
1459
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
# return a list of all neighbor nodes |
1461
|
|
|
|
|
|
|
sub _neighbors { |
1462
|
4
|
|
|
4
|
|
8
|
my ( $self, $node ) = @_; |
1463
|
4
|
50
|
|
|
|
13
|
return unless exists $self->{neighbors}{$node}; |
1464
|
4
|
|
|
|
|
5
|
return keys %{ $self->{neighbors}{$node} }; |
|
4
|
|
|
|
|
44
|
|
1465
|
|
|
|
|
|
|
} |
1466
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
sub _nodeify { |
1469
|
188046
|
|
|
188046
|
|
383716
|
my ( $prefix, @list ) = @_; |
1470
|
188046
|
|
|
|
|
206997
|
my @nodes; |
1471
|
188046
|
|
|
|
|
275645
|
foreach my $item ( @list ) { |
1472
|
188046
|
|
|
|
|
531772
|
push @nodes, uc($prefix).':'.$item; |
1473
|
|
|
|
|
|
|
} |
1474
|
188046
|
100
|
|
|
|
578801
|
( wantarray ? @nodes : $nodes[0] ); |
1475
|
|
|
|
|
|
|
} |
1476
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
sub _read_tdm { |
1480
|
0
|
|
|
0
|
|
0
|
my ( $self, $file ) = @_; |
1481
|
0
|
0
|
|
|
|
0
|
print "Loading TDM...\n" if $self->{'debug'} > 1; |
1482
|
|
|
|
|
|
|
|
1483
|
0
|
0
|
|
|
|
0
|
croak "File does not exist" unless -f $file; |
1484
|
0
|
0
|
|
|
|
0
|
open my $fh, $file or croak "Could not open $file: $!"; |
1485
|
0
|
|
|
|
|
0
|
for ( 1..4 ){ |
1486
|
0
|
|
|
|
|
0
|
my $skip = <$fh>; |
1487
|
|
|
|
|
|
|
} |
1488
|
0
|
|
|
|
|
0
|
my %neighbors; |
1489
|
0
|
|
|
|
|
0
|
my $doc = 0; |
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
######### XS VERSION ############## |
1493
|
0
|
0
|
|
|
|
0
|
if ( $self->{'xs'} ) { |
1494
|
|
|
|
|
|
|
|
1495
|
0
|
|
|
|
|
0
|
my $map = $self->{'node_map'}; # shortcut alias |
1496
|
0
|
|
|
|
|
0
|
while (<$fh>) { |
1497
|
0
|
|
|
|
|
0
|
chomp; |
1498
|
0
|
|
|
|
|
0
|
my $dindex = $self->_add_node( "D:$doc", 2 ); |
1499
|
|
|
|
|
|
|
#warn "Added node $doc\n"; |
1500
|
0
|
|
|
|
|
0
|
my ( $count, %vals ) = split; |
1501
|
0
|
|
|
|
|
0
|
while ( my ( $term, $edge ) = each %vals ) { |
1502
|
0
|
|
|
|
|
0
|
$self->{'term_count'}{$term}++; |
1503
|
0
|
|
|
|
|
0
|
my $tnode = "T:$term"; |
1504
|
|
|
|
|
|
|
|
1505
|
0
|
0
|
|
|
|
0
|
my $tindex = ( defined $map->{$tnode} ? |
1506
|
|
|
|
|
|
|
$map->{$tnode} : |
1507
|
|
|
|
|
|
|
$self->_add_node( $tnode, 1 ) |
1508
|
|
|
|
|
|
|
); |
1509
|
0
|
|
|
|
|
0
|
$self->{Graph}->set_edge( $dindex, $tindex, $edge ); |
1510
|
|
|
|
|
|
|
} |
1511
|
0
|
|
|
|
|
0
|
$doc++; |
1512
|
|
|
|
|
|
|
} |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
####### PURE PERL VERSION ########## |
1515
|
|
|
|
|
|
|
} else { |
1516
|
0
|
|
|
|
|
0
|
while (<$fh>) { |
1517
|
0
|
|
|
|
|
0
|
chomp; |
1518
|
0
|
|
|
|
|
0
|
my $dnode = "D:$doc"; |
1519
|
0
|
|
|
|
|
0
|
my ( $count, %vals ) = split; |
1520
|
0
|
|
|
|
|
0
|
while ( my ( $term, $edge ) = each %vals ) { |
1521
|
0
|
|
|
|
|
0
|
$self->{'term_count'}{$term}++; |
1522
|
0
|
|
|
|
|
0
|
my $tnode = "T:$term"; |
1523
|
|
|
|
|
|
|
|
1524
|
0
|
|
|
|
|
0
|
$neighbors{$dnode}{$tnode} = $edge.',1'; |
1525
|
0
|
|
|
|
|
0
|
$neighbors{$tnode}{$dnode} = $edge.',1'; |
1526
|
|
|
|
|
|
|
} |
1527
|
0
|
|
|
|
|
0
|
$doc++; |
1528
|
|
|
|
|
|
|
} |
1529
|
0
|
|
|
|
|
0
|
$self->{'neighbors'} = \%neighbors; |
1530
|
|
|
|
|
|
|
} |
1531
|
|
|
|
|
|
|
|
1532
|
0
|
0
|
|
|
|
0
|
print "Loaded.\n" if $self->{'debug'} > 1; |
1533
|
0
|
|
|
|
|
0
|
$self->{'from_TDM'} = 1; |
1534
|
0
|
|
|
|
|
0
|
$self->{'doc_count'} = $doc; |
1535
|
|
|
|
|
|
|
} |
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
|
1539
|
|
|
|
|
|
|
# XS version only |
1540
|
|
|
|
|
|
|
# |
1541
|
|
|
|
|
|
|
# This sub maintains a mapping between node names and integer index |
1542
|
|
|
|
|
|
|
# values. |
1543
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
sub _add_node { |
1545
|
0
|
|
|
0
|
|
0
|
my ( $self, $node_name, $type ) = @_; |
1546
|
0
|
0
|
|
|
|
0
|
croak "Must provide a type" unless $type; |
1547
|
0
|
0
|
|
|
|
0
|
croak "Must provide a node name" unless $node_name; |
1548
|
0
|
0
|
|
|
|
0
|
croak "This node already exists" if |
1549
|
|
|
|
|
|
|
$self->{'node_map'}{$node_name}; |
1550
|
|
|
|
|
|
|
|
1551
|
0
|
|
|
|
|
0
|
my $new_id = $self->{'next_free_id'}++; |
1552
|
0
|
|
|
|
|
0
|
$self->{'node_map'}{$node_name} = $new_id; |
1553
|
0
|
|
|
|
|
0
|
$self->{'id_map'}[$new_id] = $node_name; |
1554
|
0
|
|
|
|
|
0
|
$self->{'Graph'}->add_node( $new_id, $type ); |
1555
|
|
|
|
|
|
|
|
1556
|
0
|
|
|
|
|
0
|
return $new_id; |
1557
|
|
|
|
|
|
|
} |
1558
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
# |
1562
|
|
|
|
|
|
|
# INTERNAL METHODS |
1563
|
|
|
|
|
|
|
# |
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
# each node should have the same number of inbound |
1566
|
|
|
|
|
|
|
# and outbound links |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
sub check_consistency { |
1569
|
|
|
|
|
|
|
|
1570
|
37
|
|
|
37
|
0
|
60
|
my ( $self ) = @_; |
1571
|
37
|
|
|
|
|
57
|
my %inbound; |
1572
|
|
|
|
|
|
|
my %outbound; |
1573
|
|
|
|
|
|
|
|
1574
|
|
|
|
|
|
|
|
1575
|
37
|
|
|
|
|
62
|
foreach my $node ( keys %{$self->{neighbors}} ) { |
|
37
|
|
|
|
|
3748
|
|
1576
|
17194
|
50
|
|
|
|
48162
|
next unless $node =~ /^[DT]:/; # for MLDBM compatibility |
1577
|
17194
|
|
|
|
|
18401
|
$outbound{$node} = scalar keys %{$self->{neighbors}{$node}}; |
|
17194
|
|
|
|
|
40965
|
|
1578
|
17194
|
|
|
|
|
19908
|
foreach my $neighbor ( keys %{ $self->{neighbors}{$node} } ) { |
|
17194
|
|
|
|
|
43491
|
|
1579
|
39868
|
|
|
|
|
74583
|
$inbound{$neighbor}++; |
1580
|
|
|
|
|
|
|
} |
1581
|
|
|
|
|
|
|
} |
1582
|
|
|
|
|
|
|
|
1583
|
37
|
|
|
|
|
1361
|
my $in = scalar keys %inbound; |
1584
|
37
|
|
|
|
|
76
|
my $out = scalar keys %outbound; |
1585
|
37
|
50
|
|
|
|
128
|
carp "number of nodes with inbound links ($in) does not match number of nodes with outbound links ( $out )" |
1586
|
|
|
|
|
|
|
unless scalar keys %inbound == scalar keys %outbound; |
1587
|
|
|
|
|
|
|
|
1588
|
37
|
|
|
|
|
2195
|
foreach my $node ( keys %inbound ) { |
1589
|
17194
|
|
50
|
|
|
28993
|
$outbound{$node} ||= 0; |
1590
|
17194
|
50
|
|
|
|
37030
|
carp "$node has $inbound{$node} inbound links, $outbound{$node} outbound links\n" |
1591
|
|
|
|
|
|
|
unless $inbound{$node} == $outbound{$node}; |
1592
|
|
|
|
|
|
|
} |
1593
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
} |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
=item have_edge RAWNODE1, RAWNODE2 |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
Returns true if the nodes share an edge. Node names must be prefixed with 'D' or 'T' |
1600
|
|
|
|
|
|
|
as appropriate. |
1601
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
=cut |
1603
|
|
|
|
|
|
|
|
1604
|
|
|
|
|
|
|
sub have_edge { |
1605
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $node1, $node2 ) = @_; |
1606
|
0
|
|
|
|
|
0
|
return exists $self->{neighbors}{$node1}{$node2}; |
1607
|
|
|
|
|
|
|
} |
1608
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
{ |
1611
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
my %visited; |
1613
|
|
|
|
|
|
|
my %component; |
1614
|
|
|
|
|
|
|
my $depth; |
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
=item connected_components |
1617
|
|
|
|
|
|
|
|
1618
|
|
|
|
|
|
|
Returns an array of connected components in the graph. Each component is a list |
1619
|
|
|
|
|
|
|
of nodes that are mutually accessible by traveling along edges. |
1620
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
=cut |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
sub connected_components { |
1624
|
0
|
|
|
0
|
1
|
0
|
my ( $self ) = @_; |
1625
|
|
|
|
|
|
|
|
1626
|
0
|
|
|
|
|
0
|
%visited = (); # clear any old info |
1627
|
0
|
|
|
|
|
0
|
%component = (); |
1628
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
|
1630
|
0
|
|
|
|
|
0
|
my $n = $self->{neighbors}; |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
|
1633
|
0
|
|
|
|
|
0
|
my @node_list = keys %{$n}; |
|
0
|
|
|
|
|
0
|
|
1634
|
0
|
|
|
|
|
0
|
my @components; |
1635
|
|
|
|
|
|
|
|
1636
|
0
|
|
|
|
|
0
|
while ( @node_list ) { |
1637
|
0
|
|
|
|
|
0
|
my $start = shift @node_list; |
1638
|
0
|
0
|
|
|
|
0
|
next if exists $visited{$start}; |
1639
|
|
|
|
|
|
|
|
1640
|
0
|
0
|
|
|
|
0
|
last unless $start; |
1641
|
0
|
|
|
|
|
0
|
warn "Visiting neighbors for $start\n"; |
1642
|
0
|
|
|
|
|
0
|
visit_neighbors( $n, $start ); |
1643
|
0
|
|
|
|
|
0
|
push @components, [ keys %component ]; |
1644
|
0
|
|
|
|
|
0
|
%component = (); |
1645
|
|
|
|
|
|
|
} |
1646
|
|
|
|
|
|
|
|
1647
|
0
|
|
|
|
|
0
|
warn "Found ", scalar @components, " connected components\n"; |
1648
|
0
|
|
|
|
|
0
|
return @components; |
1649
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
} |
1652
|
|
|
|
|
|
|
|
1653
|
|
|
|
|
|
|
sub visit_neighbors { |
1654
|
0
|
|
|
0
|
0
|
0
|
my ( $g, $l ) = @_; |
1655
|
0
|
0
|
|
|
|
0
|
return if $visited{$l}; |
1656
|
0
|
|
|
|
|
0
|
$depth++; |
1657
|
0
|
|
|
|
|
0
|
$visited{$l}++; $component{$l}++; |
|
0
|
|
|
|
|
0
|
|
1658
|
0
|
|
|
|
|
0
|
warn $depth, " $l\n"; |
1659
|
0
|
|
|
|
|
0
|
my @neigh = keys %{ $g->{$l} }; |
|
0
|
|
|
|
|
0
|
|
1660
|
0
|
|
|
|
|
0
|
foreach my $n ( @neigh ) { |
1661
|
0
|
|
|
|
|
0
|
visit_neighbors( $g, $n ); |
1662
|
|
|
|
|
|
|
} |
1663
|
0
|
|
|
|
|
0
|
$depth--; |
1664
|
|
|
|
|
|
|
} |
1665
|
|
|
|
|
|
|
} |
1666
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
|
1668
|
|
|
|
|
|
|
# Wipe the graph free of stored energies |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
sub _clear { |
1671
|
12
|
|
|
12
|
|
18
|
my ( $self ) = @_; |
1672
|
12
|
|
|
|
|
31
|
$self->{'energy'} = undef; |
1673
|
|
|
|
|
|
|
} |
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
|
1676
|
|
|
|
|
|
|
# Gather the stored energy values from the graph |
1677
|
|
|
|
|
|
|
|
1678
|
|
|
|
|
|
|
sub _collect { |
1679
|
12
|
|
|
12
|
|
15
|
my ( $self ) = @_; |
1680
|
12
|
|
|
|
|
17
|
my $e = $self->{'energy'}; |
1681
|
12
|
|
|
|
|
20
|
my $result = {}; |
1682
|
12
|
|
|
|
|
18
|
foreach my $k ( keys %{$self->{'energy'}} ) { |
|
12
|
|
|
|
|
40
|
|
1683
|
81
|
50
|
|
|
|
159
|
next unless $e->{$k} > $self->{'COLLECT_THRESHOLD'}; |
1684
|
81
|
|
|
|
|
130
|
$result->{$k} = $e->{$k}; |
1685
|
|
|
|
|
|
|
} |
1686
|
12
|
|
|
|
|
28
|
return $result; |
1687
|
|
|
|
|
|
|
} |
1688
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
|
1692
|
|
|
|
|
|
|
# Assign a starting energy ENERGY to NODE, and recursively distribute the |
1693
|
|
|
|
|
|
|
# energy to neighbor nodes. Singleton nodes get special treatment |
1694
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
sub _energize { |
1696
|
|
|
|
|
|
|
|
1697
|
81
|
|
|
81
|
|
113
|
my ( $self, $node, $energy ) = @_; |
1698
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
|
1700
|
81
|
50
|
|
|
|
206
|
return unless defined $self->{neighbors}{$node}; |
1701
|
81
|
|
50
|
|
|
340
|
my $orig = $self->{energy}{$node} || 0; |
1702
|
81
|
|
|
|
|
154
|
$self->{energy}->{$node} += $energy; |
1703
|
81
|
50
|
|
|
|
180
|
return if ( $self->{depth} == $self->{max_depth} ); |
1704
|
81
|
|
|
|
|
83
|
$self->{depth}++; |
1705
|
|
|
|
|
|
|
|
1706
|
81
|
50
|
|
|
|
189
|
if ( $self->{'debug'} > 1 ) { |
1707
|
0
|
|
|
|
|
0
|
print ' ' x $self->{'depth'}; |
1708
|
0
|
|
|
|
|
0
|
print "$node: energizing $orig + $energy\n"; |
1709
|
|
|
|
|
|
|
} |
1710
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
|
1712
|
81
|
|
|
|
|
98
|
my $n = $self->{neighbors}; |
1713
|
|
|
|
|
|
|
|
1714
|
|
|
|
|
|
|
#sleep 1; |
1715
|
81
|
|
|
|
|
78
|
my $degree = scalar keys %{ $n->{$node} }; |
|
81
|
|
|
|
|
138
|
|
1716
|
|
|
|
|
|
|
|
1717
|
|
|
|
|
|
|
|
1718
|
81
|
50
|
|
|
|
147
|
if ( $degree == 0 ) { |
1719
|
|
|
|
|
|
|
|
1720
|
0
|
|
|
|
|
0
|
carp "WARNING: reached a node without neighbors: $node at search depth $self->{depth}\n"; |
1721
|
0
|
|
|
|
|
0
|
$self->{depth}--; |
1722
|
0
|
|
|
|
|
0
|
return; |
1723
|
|
|
|
|
|
|
} |
1724
|
|
|
|
|
|
|
|
1725
|
|
|
|
|
|
|
|
1726
|
81
|
|
|
|
|
152
|
my $subenergy = $energy / (log($degree)+1); |
1727
|
|
|
|
|
|
|
|
1728
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
# At singleton nodes (words that appear in only one document, for example) |
1730
|
|
|
|
|
|
|
# Don't spread energy any further. This avoids a "reflection" back and |
1731
|
|
|
|
|
|
|
# forth from singleton nodes to their neighbors. |
1732
|
|
|
|
|
|
|
|
1733
|
81
|
100
|
100
|
|
|
298
|
if ( $degree == 1 and $energy < $self->{'START_ENERGY'} ) { |
|
|
100
|
|
|
|
|
|
1734
|
|
|
|
|
|
|
|
1735
|
|
|
|
|
|
|
#do nothing |
1736
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
} elsif ( $subenergy > $self->{ACTIVATE_THRESHOLD} ) { |
1738
|
12
|
50
|
|
|
|
33
|
print ' ' x $self->{'depth'}, |
1739
|
|
|
|
|
|
|
"$node: propagating subenergy $subenergy to $degree neighbors\n" |
1740
|
|
|
|
|
|
|
if $self->{'debug'} > 1; |
1741
|
12
|
|
|
|
|
15
|
foreach my $neighbor ( keys %{ $n->{$node} } ) { |
|
12
|
|
|
|
|
44
|
|
1742
|
69
|
|
|
|
|
148
|
my $pair = $n->{$node}{$neighbor}; |
1743
|
69
|
|
|
|
|
166
|
my ( $edge, undef ) = split /,/, $pair; |
1744
|
69
|
|
|
|
|
149
|
my $weighted_energy = $subenergy * $edge; |
1745
|
69
|
50
|
|
|
|
135
|
print ' ' x $self->{'depth'}, |
1746
|
|
|
|
|
|
|
" edge $edge ($node, $neighbor)\n" |
1747
|
|
|
|
|
|
|
if $self->{'debug'} > 1; |
1748
|
69
|
|
|
|
|
140
|
$self->_energize( $neighbor, $weighted_energy ); |
1749
|
|
|
|
|
|
|
} |
1750
|
|
|
|
|
|
|
} |
1751
|
81
|
|
|
|
|
101
|
$self->{'depth'}--; |
1752
|
81
|
|
|
|
|
153
|
return 1; |
1753
|
|
|
|
|
|
|
} |
1754
|
|
|
|
|
|
|
|
1755
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
# Given an array, normalize using cosine normalization |
1757
|
|
|
|
|
|
|
|
1758
|
|
|
|
|
|
|
sub __normalize { |
1759
|
5630
|
|
|
5630
|
|
7511
|
my ( $arr ) = @_; |
1760
|
|
|
|
|
|
|
|
1761
|
5630
|
50
|
33
|
|
|
41861
|
croak "Must provide array ref to __normalize" unless |
|
|
|
33
|
|
|
|
|
1762
|
|
|
|
|
|
|
defined $arr and |
1763
|
|
|
|
|
|
|
ref $arr and |
1764
|
|
|
|
|
|
|
ref $arr eq 'ARRAY'; |
1765
|
|
|
|
|
|
|
|
1766
|
5630
|
|
|
|
|
6060
|
my $sum; |
1767
|
5630
|
|
|
|
|
6009
|
$sum += $_->[2] foreach @{$arr}; |
|
5630
|
|
|
|
|
81879
|
|
1768
|
5630
|
|
|
|
|
8577
|
$_->[2]/= $sum foreach @{$arr}; |
|
5630
|
|
|
|
|
68705
|
|
1769
|
5630
|
|
|
|
|
15044
|
return 1; |
1770
|
|
|
|
|
|
|
} |
1771
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
|
1773
|
|
|
|
|
|
|
|
1774
|
|
|
|
|
|
|
|
1775
|
|
|
|
|
|
|
sub DESTROY { |
1776
|
15
|
|
|
15
|
|
35508
|
undef $_[0]->{Graph} |
1777
|
|
|
|
|
|
|
} |
1778
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
1; |
1780
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
__END__ |