package Algorithm::VSM;

#---------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::VSM is a Perl module for retrieving the documents from a software
# library that match a list of words in a query.  The matching criterion used depends
# on whether you ask the module to construct a full-dimensionality VSM or a
# reduced-dimensionality LSA model for the library.
# ---------------------------------------------------------------------------
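
# A minimal usage sketch (an assumption pieced together from the methods
# defined in this file, not a verbatim example from the module's docs):
# construct a model, build the vocabulary and document vectors, then retrieve
# with either the full-dimensionality VSM or the reduced-dimensionality LSA
# criterion.  The corpus directory, file types, and query words are made up.
#
#     use Algorithm::VSM;
#     my $vsm = Algorithm::VSM->new( corpus_directory => "corpus",
#                                    file_types       => ['.txt', '.java'] );
#     $vsm->get_corpus_vocabulary_and_word_counts();
#     $vsm->generate_document_vectors();
#     my $retrievals = $vsm->retrieve_with_vsm( [qw(string memory heap)] );
#     # or, for LSA:  $vsm->construct_lsa_model();
#     #               $retrievals = $vsm->retrieve_with_lsa( [qw(string memory heap)] );
#     $vsm->display_retrievals( $retrievals );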

use 5.10.0;
use strict;
use warnings;
use Carp;
use SDBM_File;
use PDL::Lite;
use PDL::MatrixOps;
use File::Basename;
use File::Spec::Functions qw(rel2abs);
use Fcntl;
use Storable;
use Cwd;

our $VERSION = '1.61';

# for camelcase splits (from perlmonks):
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
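
# For example (an illustration, not from the original source), matching this
# regex globally against a camelcased identifier splits it at the case
# boundaries; the retrieval code later lowercases the fragments:
#
#     my @parts = "MaxPooledConnections" =~ /$_regex/g;   # ("Max", "Pooled", "Connections")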

################################### Constructor #######################################

# Constructor for creating a VSM or LSA model of a corpus.  The model instance
# returned by the constructor can be used for retrieving documents from the corpus
# in response to queries.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params(@params) == 0;
    bless {
        _corpus_directory                 => $args{corpus_directory} || "",
        _save_model_on_disk               => $args{save_model_on_disk} || 0,
        _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
                                             $args{break_camelcased_and_underscored} : 1,
        _corpus_vocab_db                  => $args{corpus_vocab_db} || "corpus_vocab_db",
        _doc_vectors_db                   => $args{doc_vectors_db} || "doc_vectors_db",
        _normalized_doc_vecs_db           => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
        _stop_words_file                  => $args{stop_words_file} || "",
        _case_sensitive                   => $args{case_sensitive} || 0,
        _query_file                       => $args{query_file} || "",
        _file_types                       => $args{file_types} || [],
        _min_word_length                  => $args{min_word_length} || 4,
        _want_stemming                    => $args{want_stemming} || 0,
        _idf_filter_option                => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
        _max_number_retrievals            => $args{max_number_retrievals} || 30,
        _lsa_svd_threshold                => $args{lsa_svd_threshold} || 0.01,
        _relevancy_threshold              => exists $args{relevancy_threshold} ?
                                             $args{relevancy_threshold} : 1,
        _relevancy_file                   => $args{relevancy_file} || "",
        _debug                            => $args{debug} || 0,
        _working_directory                => cwd,
        _vocab_hist_on_disk               => {},
        _vocab_hist                       => {},
        _doc_hist_template                => {},
        _corpus_doc_vectors               => {},
        _normalized_doc_vecs              => {},
        _query_vector                     => {},
        _stop_words                       => [],
        _term_document_matrix             => [],
        _corpus_vocab_done                => 0,
        _scan_dir_for_rels                => 0,
        _vocab_size                       => undef,
        _doc_vecs_trunc_lsa               => {},
        _lsa_vec_truncator                => undef,
        _queries_for_relevancy            => {},
        _relevancy_estimates              => {},
        _precision_for_queries            => {},
        _recall_for_queries               => {},
        _map_vals_for_queries             => {},
        _vocab_idf_hist                   => {},
        _idf_t                            => {},
        _total_num_of_docs                => 0,
    }, $class;
}
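
# A note on the 'exists' ternaries above: for parameters that default to 1
# (break_camelcased_and_underscored, use_idf_filter, relevancy_threshold), the
# usual '$args{...} || 1' idiom would silently discard an explicit 0 supplied
# by the caller, since 0 is false in Perl; testing with 'exists' preserves a
# user-supplied 0.  A hypothetical call that relies on this:
#
#     my $vsm = Algorithm::VSM->new( corpus_directory => "corpus",
#                                    file_types       => ['.txt'],
#                                    use_idf_filter   => 0 );   # idf filter off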

###################### Get corpus vocabulary and word counts #########################

sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
          " model construction\n\n" if $self->{_debug};
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    if ($self->{_save_model_on_disk}) {
        unlink glob "$self->{_corpus_vocab_db}.*";
        unlink glob "$self->{_doc_vectors_db}.*";
        unlink glob "$self->{_normalized_doc_vecs_db}.*";
        tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
            $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";
        foreach (keys %{$self->{_vocab_hist}}) {
            $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
        }
        untie %{$self->{_vocab_hist_on_disk}};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    # Calculate idf(t):
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
                                              /
                                              (1 + $self->{_vocab_idf_hist}->{$_})))
                                     / log(10) );
    }
}
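
# A worked instance of the idf(t) formula above (the numbers are illustrative):
# with D = _total_num_of_docs = 100 documents and d(t) = _vocab_idf_hist = 9
# documents containing term t,
#
#     idf(t) = |(1 + ln(100 / (1 + 9))) / ln(10)|
#            = (1 + ln 10) / ln 10  =  1/ln(10) + log10(10)  =  1.434...
#
# Note that Perl's log() is the natural log; dividing by log(10) converts the
# result to base 10.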

sub display_corpus_vocab {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    print "\n\nDisplaying corpus vocabulary:\n\n";
    foreach (sort keys %{$self->{_vocab_hist}}) {
        my $outstring = sprintf("%30s %d", $_, $self->{_vocab_hist}->{$_});
        print "$outstring\n";
    }
}

sub display_corpus_vocab_size {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
    print "\nSize of the corpus vocabulary: $vocab_size\n\n";
}

sub write_corpus_vocab_to_file {
    my $self = shift;
    my $file = shift;
    die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
    open OUT, "> $file"
        or die "unable to open for output a file with name `$file': $!";
    foreach (sort keys %{$self->{_vocab_hist}}) {
        my $outstring = sprintf("%30s %d", $_, $self->{_vocab_hist}->{$_});
        print OUT "$outstring\n";
    }
    close OUT;
}

sub display_inverse_document_frequencies {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_idf_hist}};
    print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
        unless $self->{_idf_filter_option};
    print "\n\nDisplaying inverse document frequencies:\n";
    foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
        my $outstring = sprintf("%30s %d",
                                $_, $self->{_vocab_idf_hist}->{$_});
        print "$outstring\n";
    }
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
    foreach ( sort keys %{$self->{_idf_t}} ) {
        my $outstring = sprintf("%30s %f", $_, $self->{_idf_t}->{$_});
        print "$outstring\n";
    }
}

sub get_all_document_names {
    my $self = shift;
    my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
    return \@all_files;
}

############################ Generate Document Vectors #################################

sub generate_document_vectors {
    my $self = shift;
    chdir $self->{_working_directory};
    foreach ( sort keys %{$self->{_vocab_hist}} ) {
        $self->{_doc_hist_template}->{$_} = 0;
    }
    $self->_scan_directory( $self->{_corpus_directory} );
    chdir $self->{_working_directory};
    if ($self->{_save_model_on_disk}) {
        die "You did not specify in the constructor call the names for the diskfiles " .
            "for storing the disk-based hash tables consisting of document vectors " .
            "and their normalized versions"
            unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
        eval {
            store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
        };
        if ($@) {
            print "Something went wrong with disk storage of document vectors: $@";
        }
        eval {
            store( $self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db} );
        };
        if ($@) {
            print "Something wrong with disk storage of normalized doc vecs: $@";
        }
    }
}

sub display_doc_vectors {
    my $self = shift;
    die "document vectors not yet constructed"
        unless keys %{$self->{_corpus_doc_vectors}};
    foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
        print "\n\ndisplay doc vec for $file:\n";
        foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
            print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
        }
        my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
        print "\nSize of vector for $file: $docvec_size\n";
    }
}

sub display_normalized_doc_vectors {
    my $self = shift;
    die "normalized document vectors not yet constructed"
        unless keys %{$self->{_normalized_doc_vecs}};
    unless ($self->{_idf_filter_option}) {
        print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
        return;
    }
    foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
        print "\n\ndisplay normalized doc vec for $file:\n";
        foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
            print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
        }
        my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
        print "\nSize of normalized vector for $file: $docvec_size\n";
    }
}

######################## Calculate Pairwise Document Similarities ######################

# Returns the similarity score for two documents whose actual names are supplied
# as its two arguments.
sub pairwise_similarity_for_docs {
    my $self = shift;
    my $doc1 = shift;
    my $doc2 = shift;
    my @all_files = keys %{$self->{_corpus_doc_vectors}};
    croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
    my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}

sub pairwise_similarity_for_normalized_docs {
    my $self = shift;
    my $doc1 = shift;
    my $doc2 = shift;
    my @all_files = keys %{$self->{_corpus_doc_vectors}};
    croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
    my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}
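
# Both routines above compute the cosine similarity between two document
# vectors: dot(v1, v2) / (|v1| * |v2|).  An illustrative check with a made-up
# two-word vocabulary: for v1 = (1, 2) and v2 = (2, 4),
#
#     dot(v1, v2) = 1*2 + 2*4 = 10,   |v1| = sqrt(5),   |v2| = sqrt(20),
#     similarity  = 10 / (sqrt(5) * sqrt(20)) = 10 / 10 = 1.0
#
# i.e. parallel vectors score 1.0 regardless of their lengths.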

############################### Retrieve with VSM Model ################################

sub retrieve_with_vsm {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};

    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ?
            grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? $1 : ''} @clean_words :
            grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? "\L$1" : ''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour query words are: @$query\n" if $self->{_debug};
    if ($self->{_idf_filter_option}) {
        die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}})
                && scalar(keys %{$self->{_normalized_doc_vecs}});
    } else {
        die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}})
                && scalar(keys %{$self->{_corpus_doc_vectors}});
    }
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        if ($self->{_case_sensitive}) {
            $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
        } else {
            $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
        }
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "\nYour query does not contain corpus words. Nothing retrieved.\n"
        unless $query_word_count_total;
    my %retrievals;
    if ($self->{_idf_filter_option}) {
        print "\n\nUsing idf filter option for retrieval:\n\n"
            if $self->{_debug};
        foreach (sort {$self->_doc_vec_comparator}
                 keys %{$self->{_normalized_doc_vecs}}) {
            $retrievals{$_} = $self->_similarity_to_query($_);
        }
    } else {
        print "\n\nNOT using idf filter option for retrieval:\n\n"
            if $self->{_debug};
        foreach (sort {$self->_doc_vec_comparator}
                 keys %{$self->{_corpus_doc_vectors}}) {
            $retrievals{$_} = $self->_similarity_to_query($_);
        }
    }
    if ($self->{_debug}) {
        print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
    }
    return \%retrievals;
}
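
# Illustrative call (the query words are hypothetical): the method takes a
# reference to an array of query words and returns a hash reference mapping
# each corpus document to its similarity score, which display_retrievals()
# can then print in descending order:
#
#     my $retrievals = $vsm->retrieve_with_vsm( [qw(heap allocation routine)] );
#     $vsm->display_retrievals( $retrievals );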

######################### Upload a Previously Constructed Model #########################

sub upload_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_doc_vectors_db};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    untie %{$self->{_vocab_hist_on_disk}};
}

sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_normalized_doc_vecs_db};
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

############################## Display Retrieval Results ################################

sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $iter = 0;
    foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals) {
        print "$_ => $retrievals->{$_}\n";
        $iter++;
        last if $iter > $self->{_max_number_retrievals};
    }
    print "\n\n";
}

############################### Directory Scanner ################################

sub _scan_directory {
    my $self = shift;
    my $dir = rel2abs( shift );
    my $current_dir = cwd;
    chdir $dir or die "Unable to change directory to $dir: $!";
    foreach ( glob "*" ) {
        if ( -d and !(-l) ) {
            $self->_scan_directory( $_ );
            chdir $dir
                or die "Unable to change directory to $dir: $!";
        } elsif (-r _ and
                 -T _ and
                 -M _ > 0.00001 and    # modification age is at least 1 sec
                 !( -l $_ ) and
                 $self->ok_to_filetype($_) ) {
            $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
            $self->_scan_file($_) unless $self->{_corpus_vocab_done};
            $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
        }
    }
    chdir $current_dir;
}
sub _scan_file {
    my $self = shift;
    my $file = shift;
    open IN, $file;
    my $min = $self->{_min_word_length};
    my %uniques = ();
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @clean_words;
        if ($self->{_break_camelcased_and_underscored}) {
            my @brokenup = grep $_, split /\W|_|\s+/, $_;
            @clean_words = map {$_ =~ /$_regex/g} @brokenup;
            @clean_words = $self->{_case_sensitive} ?
                grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? $1 : ''} @clean_words :
                grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? "\L$1" : ''} @clean_words;
        } else {
            my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
            @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
        }
        next unless @clean_words;
        @clean_words = grep $_, map &simple_stemmer($_), @clean_words
            if $self->{_want_stemming};
        if ($self->{_case_sensitive}) {
            map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words;
        } else {
            map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
        }
        for (@clean_words) { $uniques{$_}++ }
    }
    close( IN );
    map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
    $self->{_total_num_of_docs}++;
}

sub ok_to_filetype {
    my $self = shift;
    my $filename = shift;
    my ($base, $dir, $suffix) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;
    return 1 if contained_in($suffix, @{$self->{_file_types}});
    return 0;
}

############################## LSA Modeling and Retrieval ################################

sub construct_lsa_model {
    my $self = shift;
    if ($self->{_idf_filter_option}) {
        if (!$self->{_normalized_doc_vecs} and
            -s $self->{_normalized_doc_vecs_db}) {
            $self->{_normalized_doc_vecs} =
                retrieve($self->{_normalized_doc_vecs_db});
        }
        foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys
                              %{$self->{_normalized_doc_vecs}->{$_}}) {
                push @$term_frequency_vec,
                    $self->{_normalized_doc_vecs}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    } else {
        if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
            $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
        }
        foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}) {
                push @$term_frequency_vec,
                    $self->{_corpus_doc_vectors}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    }
    my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
    my ($U,$SIGMA,$V) = svd $A;
    print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
    print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
    my $index = return_index_of_last_value_above_threshold($SIGMA,
                                                $self->{_lsa_svd_threshold});
    my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
    print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
    # When you measure the size of a matrix in PDL, the zeroth dimension
    # is considered to be along the horizontal and the one-th dimension
    # along the rows.  This is opposite of how we want to look at
    # matrices.  For a matrix of size MxN, we mean M rows and N columns.
    # With this 'rows x columns' convention for matrix size, if you had
    # to check the size of, say, U matrix, you would call
    #   my @size = ( $U->getdim(1), $U->getdim(0) );
    #   print "\nsize of U: @size\n";
    my $U_trunc = $U->slice("0:$index,:")->sever;
    my $V_trunc = $V->slice("0:$index,0:$index")->sever;
    $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
                                  PDL::Basic::transpose($U_trunc);
    print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
        if $self->{_debug};
    my @sorted_doc_names = $self->{_idf_filter_option} ?
                           sort keys %{$self->{_normalized_doc_vecs}} :
                           sort keys %{$self->{_corpus_doc_vectors}};
    my $i = 0;
    foreach (@{$self->{_term_document_matrix}}) {
        my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
                                PDL::Basic::transpose(pdl($_));
        my $doc_name = $sorted_doc_names[$i++];
        print "\n\nTruncated doc vec for $doc_name: " .
              $truncated_doc_vec . "\n" if $self->{_debug};
        $self->{_doc_vecs_trunc_lsa}->{$doc_name}
            = $truncated_doc_vec;
    }
    chdir $self->{_working_directory};
}
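
# The linear algebra above, in brief (notation mine, not from the original
# comments): the SVD factors the term-document matrix A as A = U * SIGMA * V^T.
# Keeping only the singular values above lsa_svd_threshold gives the rank-k
# truncation A_k = U_k * SIGMA_k * V_k^T, and the stored truncator
#
#     SIGMA_k^{-1} * U_k^T
#
# maps any vocabulary-length vector d (a document vector above, or a query
# vector in retrieve_with_lsa() below) into the k-dimensional latent space,
# where the dot products for retrieval are computed.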

sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? "\L$1" : ''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
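
# Illustrative call (the query words are again hypothetical).  Unlike
# retrieve_with_vsm(), the scores here are plain dot products in the truncated
# LSA space rather than cosine similarities:
#
#     $vsm->construct_lsa_model();
#     my $retrievals = $vsm->retrieve_with_lsa( [qw(heap allocation routine)] );
#     $vsm->display_retrievals( $retrievals );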

sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {
        $document_vector{$_} = 0;
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    unless (open IN, $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
            map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        $self->{_case_sensitive} ?
            map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
            map { $document_vector{"\L$_"}++ }
                grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
    }
    close IN;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {
        $total_words_in_doc += $document_vector{$_};
    }
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} = $document_vector{$_}
                                      *
                                      $self->{_idf_t}->{$_}
                                      /
                                      $total_words_in_doc;
        }
    }
    my $pwd = cwd;
    $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name;
    unless ( $1 eq "" ) {
        $file_path_name = "$1/$file";
    } else {
        $file_path_name = $file;
    }
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}

################################### Drop Stop Words ###################################

sub _drop_stop_words {
    my $self = shift;
    open( IN, "$self->{_working_directory}/$self->{_stop_words_file}")
        or die "unable to open stop words file: $!";
    while (<IN>) {
        next if /^#/;
        next if /^[ ]*$/;
        chomp;
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
}

################################### Support Methods ####################################

sub _doc_vec_comparator {
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    my $vec1_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$a} :
                        $self->{_corpus_doc_vectors}->{$a};
    my $vec2_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$b} :
                        $self->{_corpus_doc_vectors}->{$b};
    my @vec1 = ();
    my @vec2 = ();
    my @qvec = ();
    foreach my $word (sort keys %{$self->{_vocab_hist}}) {
        push @vec1, $vec1_hash_ref->{$word};
        push @vec2, $vec2_hash_ref->{$word};
        push @qvec, $query_vector{$word};
    }
    my $vec1_mag = vec_magnitude(\@vec1);
    my $vec2_mag = vec_magnitude(\@vec2);
    my $qvec_mag = vec_magnitude(\@qvec);
    my $product1 = vec_scalar_product(\@vec1, \@qvec);
    $product1 /= $vec1_mag * $qvec_mag;
    my $product2 = vec_scalar_product(\@vec2, \@qvec);
    $product2 /= $vec2_mag * $qvec_mag;
    return  1 if $product1 < $product2;
    return  0 if $product1 == $product2;
    return -1 if $product1 > $product2;
}

sub _similarity_to_query {
    my $self = shift;
    my $doc_name = shift;
    my $vec_hash_ref = $self->{_idf_filter_option} ?
                       $self->{_normalized_doc_vecs}->{$doc_name} :
                       $self->{_corpus_doc_vectors}->{$doc_name};
    my @vec = ();
    my @qvec = ();
    foreach my $word (sort keys %$vec_hash_ref) {
        push @vec, $vec_hash_ref->{$word};
        push @qvec, $self->{_query_vector}->{$word};
    }
    my $vec_mag = vec_magnitude(\@vec);
    my $qvec_mag = vec_magnitude(\@qvec);
    my $product = vec_scalar_product(\@vec, \@qvec);
    $product /= $vec_mag * $qvec_mag;
    return $product;
}

###################### Relevance Judgments for Testing Purposes #######################

##  IMPORTANT:  This estimation of document relevancies to queries is NOT for
##              serious work.  A document is considered to be relevant to a
##              query if it contains several of the query words.  The minimum
##              number of query words that must exist in a document in order
##              for the latter to be considered relevant is determined by the
##              relevancy_threshold parameter in the VSM constructor.  (See
##              the relevancy and precision-recall related scripts in the
##              'examples' directory.)  The reason why the function shown
##              below is not for serious work is that ultimately it is the
##              humans who are the best judges of the relevancies of documents
##              to queries.  The humans bring to bear semantic considerations
##              on the relevancy determination problem that are beyond the
##              scope of this module.
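
# Query files parsed by estimate_doc_relevancies() must contain one query per
# line in the format enforced by the regex below: a label 'qN', a colon, and
# the query words.  A hypothetical illustration (the contents are made up):
#
#     q1: string functions for manipulating text
#     q2: hash tables and symbol lookup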

sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    open( IN, $self->{_query_file} )
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file\n" .
          "in which the relevancy judgments will be dumped."
          unless $self->{_relevancy_file};
    while (<IN>) {
        chomp;
        next if /^#/;
        next if /^[ ]*$/;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open(OUT, ">$self->{_relevancy_file}")
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    foreach (sort
             {get_integer_suffix($a) <=> get_integer_suffix($b)}
             keys %{$self->{_relevancy_estimates}}) {
        @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$_}};
        print OUT "$_ => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $_: " .
              scalar(@relevancy_list_for_query) . "\n";
    }
}

# If human-supplied relevancy judgments are available in a disk file, use this
# method to upload that information.  One of the scripts in the 'examples'
# directory carries out the precision-recall analysis by using this approach.
# IMPORTANT: The human-supplied relevancy judgments must be in the format that
#            is shown in the sample file relevancy.txt in the 'examples'
#            directory.
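
# Per the parsing regex below, the expected format is one line per query, with
# the label, '=>', and a space-separated list of relevant documents.  A
# hypothetical illustration (the file names are made up):
#
#     q1 => Doc3.txt Doc7.txt Doc12.txt
#     q2 => Doc5.txt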

sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    open( IN, $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<IN>) {
        chomp;
        next if /^#/;
        next if /^[ ]*$/;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}

sub display_doc_relevancies {
    my $self = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    print "\nDisplaying relevancy judgments:\n\n";
    foreach my $query (sort keys %{$self->{_relevancy_estimates}}) {
        print "Query $query\n";
        foreach my $file (sort {
                              $self->{_relevancy_estimates}->{$query}->{$b}
                              <=>
                              $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            print "    $file => $self->{_relevancy_estimates}->{$query}->{$file}\n";
        }
    }
}

sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    open IN, $file;
    my @all_text = <IN>;
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            my @matches = $all_text =~ /$word/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                  scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}

######################### Calculate Precision versus Recall ##########################

sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
            or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            chomp;
            next if /^#/;
            next if /^[ ]*$/;
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} = $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        my $retrievals;
        croak "\n\nYou have not specified the retrieval type for " .
              "precision-recall calculation.  See code in 'examples' " .
              "directory:" if !defined $retrieval_type;
        if ($retrieval_type eq 'vsm') {
            $retrievals = $self->retrieve_with_vsm( \@query_words );
        } elsif ($retrieval_type eq 'lsa') {
            $retrievals = $self->retrieve_with_lsa( \@query_words );
        }
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
                 keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_ => $ranked_retrievals{$_}\n";
            }
        }
        # At this time, ranking of relevant documents based on their
        # relevancy counts serves no particular purpose since all we want
        # for the calculation of Precision and Recall is the total
        # number of relevant documents.  However, I believe such a
        # ranking will play an important role in the future.
        # IMPORTANT:  The relevancy judgments are ranked only when
        #             estimated by the method estimate_doc_relevancies()
        #             of the VSM class.  When relevancies are supplied
        #             directly through a disk file, they all carry the
        #             same rank.
947
|
0
|
|
|
|
|
0
|
my %ranked_relevancies; |
948
|
0
|
|
|
|
|
0
|
$i = 1; |
949
|
0
|
|
|
|
|
0
|
foreach my $file (sort { |
|
0
|
|
|
|
|
0
|
|
950
|
0
|
|
|
|
|
0
|
$self->{_relevancy_estimates}->{$query}->{$b} |
951
|
|
|
|
|
|
|
<=> |
952
|
|
|
|
|
|
|
$self->{_relevancy_estimates}->{$query}->{$a} |
953
|
|
|
|
|
|
|
} |
954
|
|
|
|
|
|
|
keys %{$self->{_relevancy_estimates}->{$query}}) { |
955
|
0
|
|
|
|
|
0
|
$ranked_relevancies{$i++} = $file; |
956
|
|
|
|
|
|
|
} |
957
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
958
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying ranked relevancies for query $query:\n\n"; |
959
|
0
|
|
|
|
|
0
|
foreach (sort {$a <=> $b} keys %ranked_relevancies) { |
|
0
|
|
|
|
|
0
|
|
960
|
0
|
|
|
|
|
0
|
print "$_ => $ranked_relevancies{$_}\n"; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
} |
963
|
0
|
|
|
|
|
0
|
my @relevant_set = values %ranked_relevancies; |
964
|
0
|
0
|
|
|
|
0
|
warn "\n\nNo relevant docs found for query $query.\n" . |
965
|
|
|
|
|
|
|
"Will skip over this query for precision and\n" . |
966
|
|
|
|
|
|
|
"recall calculations\n\n" unless @relevant_set; |
967
|
0
|
0
|
|
|
|
0
|
next unless @relevant_set; |
968
|
0
|
0
|
|
|
|
0
|
print "\n\nRelevant set for query $query: @relevant_set\n\n" |
969
|
|
|
|
|
|
|
if $self->{_debug}; |
970
|
0
|
|
|
|
|
0
|
my @retrieved; |
971
|
0
|
|
|
|
|
0
|
foreach (sort keys %ranked_retrievals) { |
972
|
0
|
|
|
|
|
0
|
push @retrieved, $ranked_retrievals{$_}; |
973
|
|
|
|
|
|
|
} |
974
|
0
|
0
|
|
|
|
0
|
print "\n\nRetrieved set for query $query: @retrieved\n\n" |
975
|
|
|
|
|
|
|
if $self->{_debug}; |
976
|
0
|
|
|
|
|
0
|
my @Precision_values = (); |
977
|
0
|
|
|
|
|
0
|
my @Recall_values = (); |
978
|
0
|
|
|
|
|
0
|
my $rank = 1; |
979
|
0
|
|
|
|
|
0
|
while ($rank < @retrieved + 1) { |
980
|
0
|
|
|
|
|
0
|
my $index = 1; |
981
|
0
|
|
|
|
|
0
|
my @retrieved_at_rank = (); |
982
|
0
|
|
|
|
|
0
|
while ($index <= $rank) { |
983
|
0
|
|
|
|
|
0
|
push @retrieved_at_rank, $ranked_retrievals{$index}; |
984
|
0
|
|
|
|
|
0
|
$index++; |
985
|
|
|
|
|
|
|
} |
986
|
0
|
|
|
|
|
0
|
my $intersection =set_intersection(\@retrieved_at_rank, |
987
|
|
|
|
|
|
|
\@relevant_set); |
988
|
0
|
0
|
|
|
|
0
|
my $precision_at_rank = @retrieved_at_rank ? |
989
|
|
|
|
|
|
|
(@$intersection / @retrieved_at_rank) : 0; |
990
|
0
|
|
|
|
|
0
|
push @Precision_values, $precision_at_rank; |
991
|
0
|
|
|
|
|
0
|
my $recall_at_rank = @$intersection / @relevant_set; |
992
|
0
|
|
|
|
|
0
|
push @Recall_values, $recall_at_rank; |
993
|
0
|
|
|
|
|
0
|
$rank++; |
994
|
|
|
|
|
|
|
} |
995
|
0
|
0
|
|
|
|
0
|
print "\n\nFor query $query, precision values: @Precision_values\n" |
996
|
|
|
|
|
|
|
if $self->{_debug}; |
997
|
0
|
0
|
|
|
|
0
|
print "\nFor query $query, recall values: @Recall_values\n" |
998
|
|
|
|
|
|
|
if $self->{_debug}; |
999
|
0
|
|
|
|
|
0
|
$self->{_precision_for_queries}->{$query} = \@Precision_values; |
1000
|
0
|
|
|
|
|
0
|
$self->{_recall_for_queries}->{$query} = \@Recall_values; |
1001
|
0
|
|
|
|
|
0
|
my $area = 0; |
1002
|
|
|
|
|
|
|
# Use trapezoidal rule to find the area under the precision-recall |
1003
|
|
|
|
|
|
|
# curve: |
1004
|
0
|
|
|
|
|
0
|
for my $j (1..@Precision_values-1) { |
1005
|
0
|
|
|
|
|
0
|
my $height = ($Precision_values[$j]+$Precision_values[$j-1])/2.0; |
1006
|
0
|
|
|
|
|
0
|
my $base = ($Recall_values[$j] - $Recall_values[$j-1]); |
1007
|
0
|
|
|
|
|
0
|
$area += $base * $height; |
1008
|
|
|
|
|
|
|
} |
1009
|
0
|
|
|
|
|
0
|
my $map_for_query = $area; |
1010
|
0
|
0
|
|
|
|
0
|
print "\nMAP for query $query: $map_for_query\n" if $self->{_debug}; |
1011
|
0
|
|
|
|
|
0
|
$self->{_map_vals_for_queries}->{$query} = $map_for_query; |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
} |
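
#  The following lines are a minimal standalone sketch (not part of the
#  module's API) of the precision-at-rank and recall-at-rank logic implemented
#  above, shown for a made-up ranked list and a made-up relevant set:
#
#      my @retrieved = qw/ doc3 doc7 doc1 doc9 /;    # ranked retrievals
#      my @relevant  = qw/ doc1 doc3 /;              # relevant documents
#      my %is_relevant = map {$_ => 1} @relevant;
#      my ($hits, @Precision, @Recall) = (0);
#      foreach my $rank (1 .. @retrieved) {
#          $hits++ if $is_relevant{ $retrieved[$rank - 1] };
#          push @Precision, $hits / $rank;           # precision at this rank
#          push @Recall,    $hits / @relevant;       # recall at this rank
#      }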

sub display_map_values_for_queries {
    my $self = shift;
    die "You must first invoke the precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my $map = 0;
    print "\n\nDisplaying average precision for different queries:\n\n";
    foreach my $query (sort
                       {get_integer_suffix($a) <=> get_integer_suffix($b)}
                       keys %{$self->{_map_vals_for_queries}}) {
        my $output = sprintf "Query %s  =>  %.3f",
                             $query, $self->{_map_vals_for_queries}->{$query};
        print "$output\n";
        $map += $self->{_map_vals_for_queries}->{$query};
    }
    print "\n\n";
    my $avg_map_for_all_queries =
        $map / scalar(keys %{$self->{_map_vals_for_queries}});
    print "MAP value: $avg_map_for_all_queries\n\n";
}

sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke the precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    foreach my $query (sort
                       {get_integer_suffix($a) <=> get_integer_suffix($b)}
                       keys %{$self->{_map_vals_for_queries}}) {
        print "\n\nQuery $query:\n";
        print "\n   (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precision_vals = @{$self->{_precision_for_queries}->{$query}};
        @precision_vals = map {sprintf "%.3f", $_} @precision_vals;
        print "   Precision at rank  =>  @precision_vals\n";
        my @recall_vals = @{$self->{_recall_for_queries}->{$query}};
        @recall_vals = map {sprintf "%.3f", $_} @recall_vals;
        print "\n   Recall at rank     =>  @recall_vals\n";
    }
    print "\n\n";
}

sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke the precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my @average_precisions_for_queries = ();
    foreach my $query (sort
                       {get_integer_suffix($a) <=> get_integer_suffix($b)}
                       keys %{$self->{_map_vals_for_queries}}) {
        my $output = sprintf "%.3f", $self->{_map_vals_for_queries}->{$query};
        push @average_precisions_for_queries, $output;
    }
    return \@average_precisions_for_queries;
}

################################### Utility Routines ###################################

sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

# checks whether an element is in an array:
sub contained_in {
    my $ele = shift;
    my @array = @_;
    my $count = 0;
    map {$count++ if $ele eq $_} @array;
    return $count;
}

# Meant only for an un-nested hash:
sub deep_copy_hash {
    my $ref_in = shift;
    my $ref_out = {};
    foreach ( keys %{$ref_in} ) {
        $ref_out->{$_} = $ref_in->{$_};
    }
    return $ref_out;
}

sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}

sub vec_magnitude {
    my $vec = shift;
    my $mag_squared = 0;
    foreach my $num (@$vec) {
        $mag_squared += $num ** 2;
    }
    return sqrt $mag_squared;
}
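
#  The two subroutines above are the ingredients of cosine similarity: the dot
#  product of two equal-sized vectors divided by the product of their
#  magnitudes.  A minimal sketch with two made-up term-frequency vectors:
#
#      my @vec1 = (1, 0, 2, 3);
#      my @vec2 = (0, 1, 2, 1);
#      my $cosine_sim = vec_scalar_product(\@vec1, \@vec2) /
#                       (vec_magnitude(\@vec1) * vec_magnitude(\@vec2));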

sub reduce {
    my $vec = shift;
    my $result;
    for my $item (@$vec) {
        $result += $item;
    }
    return $result;
}

sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word: $word\n" if $debug;
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;
    $word =~ s/(.*[a-z]s)es$/$1/i;
    $word =~ s/(.*[a-z][ck])es$/$1e/i;
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;
    $word =~ s/(.*[a-z]+)mming$/$1m/i;
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;
    $word =~ s/(.*[a-z]+)sed$/$1se/i;
    $word =~ s/(.*[a-z]+)ed$/$1/i;
    $word =~ s/(.*[a-z]+)tation$/$1t/i;
    print "Stemmed word: $word\n\n" if $debug;
    return $word;
}
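
#  Tracing a couple of words through the rewrite rules above (these are
#  illustrative inputs, not output from an actual run):
#
#      simple_stemmer("programs");       # the first rule yields "program"
#      simple_stemmer("programming");    # the "mming" rule yields "program"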

# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    my $i = 0;
    while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) { $i++; }
    return $i - 1;
}

sub set_intersection {
    my $set1 = shift;
    my $set2 = shift;
    my %hset1 = map {$_ => 1} @$set1;
    my @common_elements = grep {$hset1{$_}} @$set2;
    return @common_elements ? \@common_elements : [];
}

sub get_integer_suffix {
    my $label = shift;
    $label =~ /(\d*)$/;
    return $1;
}

1;

=pod

=head1 NAME

Algorithm::VSM --- A Perl module for retrieving files and documents from a software
library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
algorithms in response to search words and phrases.

=head1 SYNOPSIS

    # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:

    use Algorithm::VSM;

    my $corpus_dir = "corpus";
    my @query = qw/ program ListIterator add ArrayList args /;
    my $stop_words_file = "stop_words.txt";
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   max_number_retrievals    => 10,
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   use_idf_filter           => 1,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->display_corpus_vocab();
    $vsm->display_corpus_vocab_size();
    $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
    $vsm->display_inverse_document_frequencies();
    $vsm->generate_document_vectors();
    $vsm->display_doc_vectors();
    $vsm->display_normalized_doc_vectors();
    my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
    $vsm->display_retrievals( $retrievals );

The purpose of each constructor option and what is accomplished by the method
calls should be obvious from their names.  If not, they are explained in greater
detail elsewhere in this documentation page.  Note that the methods
display_corpus_vocab() and display_doc_vectors() are there only for testing
purposes with small corpora.  If you must use them for large libraries/corpora,
you might wish to redirect the output to a file.

By default, a call to a constructor calculates normalized term-frequency vectors
for the documents.  Normalization consists of first calculating the term
frequency tf(t) of a term t in a document as a proportion of the total number
of words in the document and then multiplying it by idf(t), where idf(t) stands
for the inverse document frequency associated with that term.  Note that 'word'
and 'term' mean the same thing.
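
As an illustration of this normalization, here is a small sketch with made-up
numbers (this is not code from the module; the module computes the same
quantities internally):

    # a term that appears 5 times in a 200-word document, in a corpus of
    # 100 documents of which 10 contain the term:
    my $tf     = 5 / 200;          # term frequency as a proportion
    my $idf    = log(100 / 10);    # inverse document frequency
    my $weight = $tf * $idf;       # one element of the normalized doc vector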

    # FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:

    my $lsa = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   lsa_svd_threshold        => 0.01,
                   max_number_retrievals    => 10,
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   use_idf_filter           => 1,
                   want_stemming            => 1,
    );
    $lsa->get_corpus_vocabulary_and_word_counts();
    $lsa->display_corpus_vocab();
    $lsa->display_corpus_vocab_size();
    $lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
    $lsa->generate_document_vectors();
    $lsa->construct_lsa_model();
    my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
    $lsa->display_retrievals( $retrievals );

The initialization code before the constructor call and the calls for displaying
the vocabulary and the vectors after the call remain the same as for the VSM case
shown previously in this Synopsis.  In the call above, the constructor parameter
'lsa_svd_threshold' determines how many of the singular values will be retained
after we have carried out an SVD decomposition of the term-frequency matrix for
the documents in the corpus.  Singular values smaller than this threshold
fraction of the largest value are rejected.
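
As a sketch of what this thresholding amounts to (the singular values below are
made up):

    my @singular_values = (9.2, 4.1, 0.8, 0.05);   # descending order from SVD
    my $lsa_svd_threshold = 0.01;
    my $cutoff = $lsa_svd_threshold * $singular_values[0];    # 0.092
    my @retained = grep { $_ > $cutoff } @singular_values;    # drops only 0.05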

    # FOR MEASURING PRECISION VERSUS RECALL FOR VSM:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $query_file = "test_queries.txt";
    my $relevancy_file = "relevancy.txt";    # All relevancy judgments
                                             # will be stored in this file
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   relevancy_threshold      => 5,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    $vsm->estimate_doc_relevancies();
    $vsm->display_doc_relevancies();               # used only for testing
    $vsm->precision_and_recall_calculator('vsm');
    $vsm->display_precision_vs_recall_for_queries();
    $vsm->display_map_values_for_queries();

Measuring precision and recall requires a set of queries.  These are supplied
through the constructor parameter 'query_file'.  The format of this file
must be according to the sample file 'test_queries.txt' in the 'examples'
directory.  The module estimates the relevancies of the documents to the
queries and dumps the relevancies in a file named by the 'relevancy_file'
constructor parameter.  The constructor parameter 'relevancy_threshold' is used
to decide which of the documents are considered to be relevant to a query.  A
document must contain at least the 'relevancy_threshold' occurrences of query
words in order to be considered relevant to a query.
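
Going by the query parser in this module --- which skips blank lines and lines
that begin with C<#>, and expects each query on a line of the form C<qN: ...> ---
a query file along the following lines would be acceptable (the entries here are
made up; see 'test_queries.txt' for the actual sample):

    # comment lines and blank lines are ignored
    q1: program ListIterator add ArrayList args
    q2: string token parser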

    # FOR MEASURING PRECISION VERSUS RECALL FOR LSA:

    my $lsa = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   lsa_svd_threshold        => 0.01,
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   relevancy_threshold      => 5,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $lsa->get_corpus_vocabulary_and_word_counts();
    $lsa->generate_document_vectors();
    $lsa->construct_lsa_model();
    $lsa->estimate_doc_relevancies();
    $lsa->display_doc_relevancies();
    $lsa->precision_and_recall_calculator('lsa');
    $lsa->display_precision_vs_recall_for_queries();
    $lsa->display_map_values_for_queries();

We have already explained the purpose of the constructor parameter 'query_file'
and about the constraints on the format of queries in the file named through
this parameter.  As mentioned earlier, the module estimates the relevancies of
the documents to the queries and dumps the relevancies in a file named by the
'relevancy_file' constructor parameter.  The constructor parameter
'relevancy_threshold' is used in deciding which of the documents are considered
to be relevant to a query.  A document must contain at least the
'relevancy_threshold' occurrences of query words in order to be considered
relevant to a query.  We have previously explained the role of the constructor
parameter 'lsa_svd_threshold'.

    # FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $query_file = "test_queries.txt";
    my $relevancy_file = "relevancy.txt";
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    $vsm->upload_document_relevancies_from_file();
    $vsm->display_doc_relevancies();
    $vsm->precision_and_recall_calculator('vsm');
    $vsm->display_precision_vs_recall_for_queries();
    $vsm->display_map_values_for_queries();

Now the filename supplied through the constructor parameter 'relevancy_file' must
contain relevance judgments for the queries that are named in the file supplied
through the parameter 'query_file'.  The format of these two files must be
according to what is shown in the sample files 'test_queries.txt' and
'relevancy.txt' in the 'examples' directory.

    # FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $query_file = "test_queries.txt";
    my $relevancy_file = "relevancy.txt";
    my $lsa = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   lsa_svd_threshold        => 0.01,
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $lsa->get_corpus_vocabulary_and_word_counts();
    $lsa->generate_document_vectors();
    $lsa->construct_lsa_model();
    $lsa->upload_document_relevancies_from_file();
    $lsa->display_doc_relevancies();
    $lsa->precision_and_recall_calculator('lsa');
    $lsa->display_precision_vs_recall_for_queries();
    $lsa->display_map_values_for_queries();

As mentioned for the previous code block, the filename supplied through the
constructor parameter 'relevancy_file' must contain relevance judgments for the
queries that are named in the file supplied through the parameter 'query_file'.
The format of this file must be according to what is shown in the sample file
'relevancy.txt' in the 'examples' directory.  We have already explained the roles
played by the constructor parameters such as 'lsa_svd_threshold'.

    # FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    #  code for calculating pairwise similarities as shown in the
    #  script calculate_similarity_matrix_for_all_docs.pl in the
    #  examples directory.  This script makes calls to
    #
    #     $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
    #
    #  for every pair of documents.
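
Although that script is not reproduced here, a doubly-nested loop along the
following lines is one way to make those calls (a sketch, not the actual
script):

    my @docs = @{$vsm->get_all_document_names()};
    foreach my $i (0..$#docs) {
        foreach my $j ($i+1..$#docs) {
            my $similarity =
                $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
            print "$docs[$i] vs. $docs[$j]  =>  $similarity\n";
        }
    }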

=head1 CHANGES

Version 1.61 improves the implementation of the directory scanner to make it more
platform independent.  Additionally, you are now required to specify in the
constructor call the file types to be considered for computing the database model.
If, say, you have a large software library and you want only Java and text files to
be scanned for creating the VSM (or the LSA) model, you must supply that information
to the module by setting the constructor parameter C<file_types> to the anonymous
list C<['.java', '.txt']>.  An additional constructor parameter introduced in this
version is C<case_sensitive>.  If you set it to 1, that will force the database model
and query matching to become case sensitive.

Version 1.60 reflects the fact that people are now more likely to use this module by
keeping the model constructed for a corpus in the fast memory (as opposed to storing
the models in disk-based hash tables) for its repeated invocation for different
queries.  As a result, the default value for the constructor option
C<save_model_on_disk> was changed from 1 to 0.  For those who still wish to store on
a disk the model that is constructed, a script in the C<examples> directory shows how
you can do that.  Other changes in 1.60 include a slight reorganization of the
scripts in the C<examples> directory.  Most scripts now do not by default store their
models in disk-based hash tables.  This reorganization is reflected in the
description of the C<examples> directory in this documentation.  The basic logic of
constructing VSM and LSA models and how these are used for retrievals remains
unchanged.

Version 1.50 incorporates a couple of new features: (1) You now have the option to
split camel-cased and underscored words for constructing your vocabulary set; and (2)
Storing the VSM and LSA models in database files on the disk is now optional.  The
second feature, in particular, should prove useful to those who are using this module
for large collections of documents.

Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
You can get a better sense of the overall vocabulary being used by the module for
file retrieval by examining the contents of a dump file whose name is supplied as an
argument to C<write_corpus_vocab_to_file()>.

Version 1.41 downshifts the required version of the PDL module.  Also cleaned up are
the dependencies between this module and the submodules of PDL.

Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
documents in the corpus.  The elements of such a matrix express pairwise similarities
between the documents.  The pairwise similarities are based on the dot product of two
document vectors divided by the product of the vector magnitudes.  The 'examples'
directory contains two scripts to illustrate how such matrices can be calculated by
the user.  The similarity matrix is output as a CSV file.

Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
document file.  What that means is that the words that appear in most of the documents
get reduced weighting since such words are non-discriminatory with respect to the
retrieval of the documents.  A typical formula that is used to calculate the IDF
weight for a word is the logarithm of the ratio of the total number of documents to
the number of documents in which the word appears.  So if a word were to appear in
all the documents, its IDF multiplier would be zero in the vector representation of a
document.  If so desired, you can turn off the IDF weighting of the words by
explicitly setting the constructor parameter C<use_idf_filter> to zero.

Version 1.2 includes a code correction and some general code and documentation
cleanup.

With Version 1.1, you can access the retrieval precision results so that you can
compare two different retrieval algorithms (VSM or LSA with different choices for
some of the constructor parameters) with significance testing.  (Version 1.0 merely
sent those results to standard output, typically your terminal window.)  In Version
1.1, a new script in the 'examples' directory illustrates significance testing with
Randomization and with Student's Paired t-Test.

=head1 DESCRIPTION

B<Algorithm::VSM> is a Perl module for constructing a Vector Space Model (VSM) or
a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
to as a corpus, and then retrieving the documents in response to search words in a
query.

VSM and LSA models have been around for a long time in the Information Retrieval (IR)
community.  More recently such models have been shown to be effective in retrieving
files/documents from software libraries.  An account of this research was presented
by Shivani Rao and the author of this module at the 2011 Mining Software
Repositories conference.

VSM modeling consists of: (1) Extracting the vocabulary used in a corpus.  (2)
Stemming the words so extracted and eliminating the designated stop words from the
vocabulary.  Stemming means that closely related words like 'programming' and
'programs' are reduced to the common root word 'program' and the stop words are the
non-discriminating words that can be expected to exist in virtually all the
documents.  (3) Constructing document vectors for the individual files in the corpus
--- the document vectors taken together constitute what is usually referred to as a
'term-frequency' matrix for the corpus.  (4) Normalizing the document vectors to
factor out the effect of document size and, if desired, multiplying the term
frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
the weight of the words that appear in a large number of documents.  (5) Constructing
a query vector for the search query after the query is subject to the same stemming
and stop-word elimination rules that were applied to the corpus.  And, lastly, (6)
Using a similarity metric to return the set of documents that are most similar to the
query vector.  The commonly used similarity metric is one based on the cosine
distance between two vectors.  Also note that all the vectors mentioned here are of
the same size, the size of the vocabulary.  An element of a vector is the frequency
of occurrence of the word corresponding to that position in the vector.

LSA modeling is a small variation on VSM modeling.  Now you take VSM modeling one
step further by subjecting the term-frequency matrix for the corpus to singular value
decomposition (SVD).  By retaining only a subset of the singular values (usually the
N largest for some value of N), you can construct reduced-dimensionality vectors for
the documents and the queries.  In VSM, as mentioned above, the size of the document
and the query vectors is equal to the size of the vocabulary.  For large corpora,
this size may involve tens of thousands of words --- this can slow down the VSM
modeling and retrieval process.  So you are very likely to get faster performance
with retrieval based on LSA modeling, especially if you store the model once
constructed in a database file on the disk and carry out retrievals using the
disk-based model.

=head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?

This module has only been tested for software retrieval.  For more general text
retrieval, you would need to replace the simple stemmer used in the module by one
based on, say, Porter's Stemming Algorithm.  You would also need to vastly expand the
list of stop words appropriate to the text corpora of interest to you.  As previously
mentioned, the stop words are the commonly occurring words that do not carry much
discriminatory power from the standpoint of distinguishing between the documents.
See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
formatted.

=head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?

It is not uncommon for large software libraries to consist of tens of thousands of
documents that include source-code files, documentation files, README files,
configuration files, etc.  The bug-localization work presented recently by Shivani
Rao and this author at the 2011 Mining Software Repositories conference (MSR11) was
based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
size of 7553 unique words.  (Note that the iBUGS dataset was originally put together
by V. Dallmeier and T. Zimmermann for the evaluation of automated bug detection and
localization tools.)  If C<V> is the size of the vocabulary and C<M> the number of
the documents in the corpus, the size of each vector will be C<V> and the size of
the term-frequency matrix for the entire corpus will be C<V>xC<M>.  So if you were
to duplicate the bug localization experiments mentioned above, you would be dealing
with vectors of size 7553 and a term-frequency matrix of size 7553x6546.
Extrapolating these numbers to really large libraries/corpora, we are obviously
talking about very large matrices for SVD decomposition.  For large
libraries/corpora, it would be best to store away the model in a disk file and to
base all subsequent retrievals on the disk-stored models.  The 'examples' directory
contains scripts that carry out retrievals on the basis of disk-based models.
Further speedup in retrieval can be achieved by using LSA to create
reduced-dimensionality representations for the documents and by basing retrievals
on the stored versions of such reduced-dimensionality representations.

=head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS

The performance of a retrieval algorithm is typically measured by two properties:
C<Precision at rank> and C<Recall at rank>.  At a given rank C<r>, C<Precision> is
the ratio of the number of retrieved documents that are relevant to the total
number of retrieved documents up to that rank.  And, along the same lines,
C<Recall> at a given rank C<r> is the ratio of the number of retrieved documents
that are relevant to the total number of relevant documents.  The area under the
C<Precision>--C<Recall> curve is called the C<Average Precision> for a query.  When
the C<Average Precision> is averaged over all the queries, we obtain what is known
as C<Mean Average Precision> (MAP).  For an oracle, the value of MAP should be 1.0.
On the other hand, for purely random retrieval from a corpus, the value of MAP will
be inversely proportional to the size of the corpus.  This module includes methods
that allow you to carry out these retrieval accuracy measurements using the
relevancy judgments supplied through a disk file.  If human-supplied relevancy
judgments are not available, the module will be happy to estimate relevancies for
you just by determining the number of query words that exist in a document.  Note,
however, that relevancy judgments estimated in this manner cannot be trusted.  That
is because ultimately it is the humans who are the best judges of the relevancies
of documents to queries.  The humans bring to bear semantic considerations on the
relevancy determination problem that are beyond the scope of this module.

=head1 METHODS

The module provides the following methods for constructing VSM and LSA models of a
corpus, for using the models thus constructed for retrieval, and for carrying out
precision versus recall calculations for the determination of retrieval accuracy on
the corpora of interest to you.

=over

=item B<new()>

A call to C<new()> constructs a new instance of the C<Algorithm::VSM> class:

    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => "",
                   corpus_vocab_db          => "corpus_vocab_db",
                   doc_vectors_db           => "doc_vectors_db",
                   file_types               => $my_file_types,
                   lsa_svd_threshold        => 0.01,
                   max_number_retrievals    => 10,
                   min_word_length          => 4,
                   normalized_doc_vecs_db   => "normalized_doc_vecs_db",
                   query_file               => "",
                   relevancy_file           => $relevancy_file,
                   relevancy_threshold      => 5,
                   save_model_on_disk       => 0,
                   stop_words_file          => "",
                   use_idf_filter           => 1,
                   want_stemming            => 1,
    );

The values shown on the right side of the big arrows are the B<default values for
the parameters>.  The value supplied through the variable C<$my_file_types> would be
something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
be included in creating the database model.  The following nested list will now
describe each of the constructor parameters shown above:

=over 16

=item I<break_camelcased_and_underscored:>

The parameter B<break_camelcased_and_underscored> when set causes the
underscored and camel-cased words to be split.  By default the parameter is
set.  So if you don't want such words to be split, you must set it
explicitly to 0.

=item I<corpus_directory:>

The parameter B<corpus_directory> points to the root of the directory of documents
for which you want to create a VSM or LSA model.

=item I<corpus_vocab_db:>

The parameter B<corpus_vocab_db> is for naming the DBM in which the corpus vocabulary
will be stored after it is subject to stemming and the elimination of stop words.
Once a disk-based VSM model is created and stored away in the file named by this
parameter and the parameter to be described next, it can subsequently be used
directly for speedier retrieval.

=item I<case_sensitive:>

When set to 1, this parameter forces the module to maintain the case of the terms in
the corpus files when creating the vocabulary and the document vectors.  Setting
C<case_sensitive> to 1 also causes the query matching to become case sensitive.
(This constructor parameter was introduced in Version 1.61.)

=item I<doc_vectors_db:>

The database named by B<doc_vectors_db> stores the document vector representation for
each document in the corpus.  Each document vector has the same size as the
corpus-wide vocabulary; each element of such a vector is the number of occurrences of
the word that corresponds to that position in the vocabulary vector.

=item I<file_types:>

This parameter tells the module what types of files in the corpus directory you want
scanned for creating the database model.  The value supplied for this parameter is an
anonymous list of the file suffixes for the file types.  For example, if you wanted
only Java and text files to be scanned, you will set this parameter to C<['.java',
'.txt']>.  The module throws an exception if this parameter is left unspecified.
(This constructor parameter was introduced in Version 1.61.)

=item I<lsa_svd_threshold:>

The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
smaller than this threshold fraction of the largest singular value.  This plays a
critical role in creating reduced-dimensionality document vectors in LSA modeling of
a corpus.

=item I<max_number_retrievals:>

The constructor parameter B<max_number_retrievals> stands for what it means.

=item I<min_word_length:>

The parameter B<min_word_length> sets the minimum number of characters in a
word in order for it to be included in the corpus vocabulary.

=item I<normalized_doc_vecs_db:>

The database named by B<normalized_doc_vecs_db> stores the normalized document
vectors.  Normalization consists of factoring out the size of the documents by
dividing the term frequency for each word in a document by the number of words in the
document, and then multiplying the result by the idf (Inverse Document Frequency)
value for the word.

=item I<query_file:>

The parameter B<query_file> points to a file that contains the queries to be used for
calculating retrieval performance with C<Precision> and C<Recall> numbers.  The
format of the query file must be as shown in the sample file C<test_queries.txt> in
the 'examples' directory.

=item I<relevancy_file:>

This option names the disk file for storing the relevancy judgments.

=item I<relevancy_threshold:>

The constructor parameter B<relevancy_threshold> is used for automatic determination
of document relevancies to queries on the basis of the number of occurrences of query
words in a document.  You can exercise control over the process of determining
relevancy of a document to a query by giving a suitable value to this constructor
parameter.  A document is considered relevant to a query only
when the document contains at least B<relevancy_threshold> number of query words.

=item I<save_model_on_disk:>

The constructor parameter B<save_model_on_disk> will cause the basic
information about the VSM and the LSA models to be stored on the disk.
Subsequently, any retrievals can be carried out from the disk-based model.

=item I<stop_words_file:>

The parameter B<stop_words_file> is for naming the file that contains the stop words
that you do not wish to include in the corpus vocabulary.  The format of this file
must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.

=item I<use_idf_filter:>

The constructor parameter B<use_idf_filter> is set by default.  If you want
to turn off the normalization of the document vectors, including turning
off the weighting of the term frequencies of the words by their idf values,
you must set this parameter explicitly to 0.

=item I<want_stemming:>

The boolean parameter B<want_stemming> determines whether or not the words extracted
from the documents would be subject to stemming.  As mentioned elsewhere, stemming
means that related words like 'programming' and 'programs' would both be reduced to
the root word 'program'.

=back

=item B<construct_lsa_model()>

You call this subroutine for constructing an LSA model for your corpus
after you have extracted the corpus vocabulary and constructed document
vectors:

    $vsm->construct_lsa_model();

The SVD decomposition that is carried out in LSA model construction uses the
constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
to retain for the LSA model.  A singular value is retained only if it is larger than
the C<lsa_svd_threshold> fraction of the largest singular value.

=item B<display_corpus_vocab()>

If you would like to see corpus vocabulary as constructed by the previous call, make
the call

    $vsm->display_corpus_vocab();

Note that this is a useful thing to do only on small test corpora.  If you need
to examine the vocabulary for a large corpus, call the two methods listed below.

=item B<display_corpus_vocab_size()>

If you would like for the module to print out in your terminal window the size of the
vocabulary, make the call

    $vsm->display_corpus_vocab_size();

=item B<display_doc_relevancies()>

If you would like to see the document relevancies generated by the previous method,
you can call

    $vsm->display_doc_relevancies()

=item B<display_doc_vectors()>

If you would like to see the document vectors constructed by the previous call, make
the call:

    $vsm->display_doc_vectors();

Note that this is a useful thing to do only on small test corpora.  If you must call
this method on a large corpus, you might wish to direct the output to a file.

=item B<display_inverse_document_frequencies()>

You can display the idf value associated with each word in the corpus by

    $vsm->display_inverse_document_frequencies();

The idf of a word in the corpus is calculated typically as the logarithm of the ratio
of the total number of documents in the corpus to the number of documents in which
the word appears (with protection built in to prevent division by zero).  Ideally, if
a word appears in all the documents, its idf would be small, close to zero.  Words
with small idf values are non-discriminatory and should get reduced weighting in
document retrieval.
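
One common way to write this calculation --- with the division-by-zero protection
supplied by an additive 1 in the denominator --- looks like the following sketch
(the module's exact formula may differ in its details):

    my $num_docs           = 100;     # documents in the corpus
    my $docs_with_the_word = 10;      # documents containing the word
    my $idf = log( $num_docs / (1 + $docs_with_the_word) );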
1859
|
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
|
|
1861
|
|
|
|
|
|
|
=item B |
1862
|
|
|
|
|
|
|
|
1863
|
|
|
|
|
|
|
The area under the precision vs. recall curve for a given query is called C
|
1864
|
|
|
|
|
|
|
Precision> for that query. When this area is averaged over all the queries, you get |
1865
|
|
|
|
|
|
|
C |
1866
|
|
|
|
|
|
|
algorithm. The C values for the queries and the overall C |
1867
|
|
|
|
|
|
|
can be printed out by calling |
1868
|
|
|
|
|
|
|
|
1869
|
|
|
|
|
|
|
$vsm->display_map_values_for_queries(); |
1870
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
|
1872
|
|
|
|
|
|
|
=item B<display_normalized_doc_vectors()>

If you would like to see the normalized document vectors, make the call:

    $vsm->display_normalized_doc_vectors();

See the comment made previously as to what is meant by the normalization of a
document vector.

=item B<display_precision_vs_recall_for_queries()>

A call to C<precision_and_recall_calculator()> will normally be followed by the
following call

    $vsm->display_precision_vs_recall_for_queries();

for displaying the C<Precision@rank> and C<Recall@rank> values.

=item B<display_retrievals()>

You can display the retrieved document names by calling this method using the syntax:

    $vsm->display_retrievals( $retrievals );

where C<$retrievals> is a reference to the hash returned by a call to one of the
C<retrieve_with_vsm()> or C<retrieve_with_lsa()> methods. The display method shown
here respects the retrieval size constraints expressed by the constructor parameter
C<max_number_retrievals>.

=item B<estimate_doc_relevancies()>

Before you can carry out precision and recall calculations to test the accuracy of
VSM and LSA based retrievals from a corpus, you need to have available the relevancy
judgments for the queries. (A relevancy judgment for a query is simply the list of
documents relevant to that query.) Relevancy judgments are commonly supplied by
humans who are familiar with the corpus. But if such human-supplied relevancy
judgments are not available, you can invoke the following method to estimate them:

    $vsm->estimate_doc_relevancies();

For the above method call, a document is considered to be relevant to a query if it
contains several of the query words. The minimum number of query words that must
exist in a document in order for the latter to be considered relevant is determined
by the C<relevancy_threshold> parameter in the VSM constructor.
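In sketch form, the criterion just described amounts to a word-overlap count. The
helper below is hypothetical and only illustrates the idea:

    # Hypothetical helper: a document counts as relevant to a query when at
    # least $relevancy_threshold of the query words occur in the document.
    sub is_relevant {
        my ($doc_words, $query_words, $relevancy_threshold) = @_;
        my %in_doc  = map { $_ => 1 } @$doc_words;
        my $matches = grep { $in_doc{$_} } @$query_words;
        return $matches >= $relevancy_threshold;
    }
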
But note that this estimation of document relevancies to queries is NOT for serious
work. The reason is that, ultimately, it is humans who are the best judges of the
relevancies of documents to queries. Humans bring to bear semantic considerations on
the relevancy determination problem that are beyond the scope of this module.

The generated relevancies are deposited in a file named by the constructor parameter
C<relevancy_file>.

=item B<get_all_document_names()>

If you want to get hold of all the filenames in the corpus in your own script, you
can call

    my @docs = @{$vsm->get_all_document_names()};

The array on the left will contain an alphabetized list of the files.

=item B<generate_document_vectors()>

This is a necessary step after the vocabulary used by a corpus is constructed. (Of
course, if you will be doing document retrieval through a disk-stored VSM or LSA
model, then you do not need to call this method.) You construct document vectors
through the following call:

    $vsm->generate_document_vectors();

=item B<get_corpus_vocabulary_and_word_counts()>

After you have constructed a new instance of the C<Algorithm::VSM> class, you must
now scan the corpus documents for constructing the corpus vocabulary. This you do by:

    $vsm->get_corpus_vocabulary_and_word_counts();

The only time you do NOT need to call this method is when you are using a previously
constructed disk-stored VSM model for retrieval.

=item B<get_query_sorted_average_precision_for_queries()>

If you want to run significance tests on the retrieval accuracies you obtain on a
given corpus with different algorithms (VSM or LSA with different choices for the
constructor parameters), your own script would need access to the average precision
data for a set of queries. You can get hold of this data by calling

    $vsm->get_query_sorted_average_precision_for_queries();

The script C<significance_testing.pl> in the 'examples' directory shows how you can
use this method for significance testing.

=item B<pairwise_similarity_for_docs()>

=item B<pairwise_similarity_for_normalized_docs()>

If you would like to compare in your own script any two documents in the corpus, you
can call

    my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");

or

    my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");

Both of these calls return a number that is the dot product of the two document
vectors normalized by the product of their magnitudes. The first call uses the
regular document vectors and the second the normalized document vectors.
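The normalized dot product that both methods return is ordinary cosine similarity.
Here is a minimal free-standing sketch, with hypothetical word-frequency hashes
standing in for the module's document vectors:

    # Cosine similarity of two term-frequency vectors, given as hash refs
    # mapping word => frequency. Illustrative only.
    sub cosine_similarity {
        my ($vec1, $vec2) = @_;
        my ($dot, $mag1_sq, $mag2_sq) = (0, 0, 0);
        foreach my $word (keys %$vec1) {
            $dot += $vec1->{$word} * ( $vec2->{$word} || 0 );
        }
        $mag1_sq += $_ ** 2 for values %$vec1;
        $mag2_sq += $_ ** 2 for values %$vec2;
        return 0 unless $mag1_sq && $mag2_sq;
        return $dot / ( sqrt($mag1_sq) * sqrt($mag2_sq) );
    }
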
=item B<precision_and_recall_calculator()>

After you have created or obtained the relevancy judgments for your test queries, you
can make the following call to calculate the C<Precision@rank> and C<Recall@rank>
values:

    $vsm->precision_and_recall_calculator('vsm');

or

    $vsm->precision_and_recall_calculator('lsa');

depending on whether you are testing VSM-based retrieval or LSA-based retrieval.

=item B<retrieve_with_lsa()>

After you have built an LSA model through the call to C<construct_lsa_model()>, you
can retrieve the document names most similar to the query by:

    my $retrievals = $vsm->retrieve_with_lsa( \@query );

Subsequently, you can display the retrievals by calling the C<display_retrievals()>
method described previously.

=item B<retrieve_with_vsm()>

After you have constructed a VSM model, you call this method for document retrieval
for a given query C<@query>. The call syntax is:

    my $retrievals = $vsm->retrieve_with_vsm( \@query );

The argument, C<@query>, is simply a list of words that you wish to use for
retrieval. The method returns a reference to a hash whose keys are the document
names and whose values are the similarity distance between the document and the
query. As is commonly the case with VSM, this module uses the cosine similarity
distance when comparing a document vector with the query vector.
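If you would rather process the retrievals yourself instead of calling
C<display_retrievals()>, one way to walk the returned hash in order of decreasing
similarity (illustrative, assuming C<$vsm> and C<@query> as above):

    my $retrievals = $vsm->retrieve_with_vsm( \@query );
    foreach my $doc ( sort { $retrievals->{$b} <=> $retrievals->{$a} }
                      keys %$retrievals ) {
        printf "%-40s  %.4f\n", $doc, $retrievals->{$doc};
    }
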
=item B<upload_document_relevancies_from_file()>

When human-supplied relevancies are available, you can upload them into the program
by calling

    $vsm->upload_document_relevancies_from_file();

These relevancy judgments will be read from a file that is named with the
C<relevancy_file> constructor parameter.

=item B<upload_normalized_vsm_model_from_disk()>

When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
C<generate_document_vectors()>, the VSM model is automatically deposited in the
database files named with the constructor parameters C<corpus_vocab_db>,
C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out
retrieval by directly using this disk-based VSM model for speedier performance. In
order to do so, you must upload the disk-based model by

    $vsm->upload_normalized_vsm_model_from_disk();

Subsequently you call

    my $retrievals = $vsm->retrieve_with_vsm( \@query );
    $vsm->display_retrievals( $retrievals );

for retrieval and for displaying the results.

=item B<write_corpus_vocab_to_file()>

This is the method to call for large text corpora if you would like to examine the
vocabulary created. The call syntax is

    $vsm->write_corpus_vocab_to_file($filename);

where C<$filename> is the name of the file that you want the vocabulary to be written
out to. This call will also show the frequency of each vocabulary word in your
corpus.

=back

=head1 REQUIRED

This module requires the following modules:

    SDBM_File
    Storable
    PDL
    File::Basename
    File::Spec::Functions

The first two of these are needed for creating disk-based database records for the
VSM and LSA models. The third is needed for calculating the SVD of the
term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
by the directory scanner to make pathnames platform independent.

=head1 EXAMPLES

See the 'examples' directory in the distribution for the scripts listed below:
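Before turning to the scripts, here is a minimal end-to-end driver assembled from
the method calls documented above; the corpus directory and the query words are
placeholders you would replace with your own:

    #!/usr/bin/perl -w
    use strict;
    use Algorithm::VSM;

    # Placeholder corpus directory and query words:
    my $vsm = Algorithm::VSM->new( corpus_directory => 'corpus' );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    my @query      = qw( string getAllChars );
    my $retrievals = $vsm->retrieve_with_vsm( \@query );
    $vsm->display_retrievals( $retrievals );
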
=over

=item B<For Basic VSM-Based Retrieval:>

For basic VSM-based model construction and retrieval, run the script:

    retrieve_with_VSM.pl

Starting with version 1.60, this script does not store away the VSM model in
disk-based hash tables. If you want your model to be stored on the disk, you must
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.

=item B<For Running an Infinite Loop for Repeated Retrievals from a VSM Model:>

If you want to run an infinite loop for repeated retrievals from a VSM model, run the
script

    continuously_running_VSM_retrieval_engine.pl

You can create a script similar to this for doing the same with LSA models.

=item B<For Storing the Model Information in Disk-Based DBM Files:>

For storing the model information in disk-based DBM files that can subsequently be
used for both VSM and LSA retrieval, run the script:

    retrieve_with_VSM_and_also_create_disk_based_model.pl

=item B<For Basic LSA-Based Retrieval:>

For basic LSA-based model construction and retrieval, run the script:

    retrieve_with_LSA.pl

Starting with version 1.60, this script does not store away the model information in
disk-based hash tables. If you want your model to be stored on the disk, you must
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.

=item B<For VSM-Based Retrieval with a Disk-Stored Model:>

If you have previously run a script like
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script

    retrieve_with_disk_based_VSM.pl

for repeated VSM-based retrievals from a disk-based model.

=item B<For LSA-Based Retrieval with a Disk-Stored Model:>

If you have previously run a script like
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script

    retrieve_with_disk_based_LSA.pl

for repeated LSA-based retrievals from a disk-based model.

=item B<For Precision and Recall Calculations for VSM:>

To experiment with precision and recall calculations for VSM retrieval, run the
script:

    calculate_precision_and_recall_for_VSM.pl

Note that this script will carry out its own estimation of relevancy judgments ---
which in most cases would not be a safe thing to do.

=item B<For Precision and Recall Calculations for LSA:>

To experiment with precision and recall calculations for LSA retrieval, run the
script:

    calculate_precision_and_recall_for_LSA.pl

Note that this script will carry out its own estimation of relevancy judgments ---
which in most cases would not be a safe thing to do.

=item B<For Precision and Recall Calculations for VSM Using File-Based Queries and
Human-Supplied Relevancies:>

Precision and recall calculations for retrieval accuracy determination are best
carried out with human-supplied judgments of the relevancies of the documents to
queries. If such judgments are available, run the script:

    calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl

This script will print out the average precisions for the different test queries and
calculate the C<MAP> metric of retrieval accuracy.

=item B<For Precision and Recall Calculations for LSA Using File-Based Queries and
Human-Supplied Relevancies:>

If human-supplied relevancy judgments are available and you wish to experiment with
precision and recall calculations for LSA-based retrieval, run the script:

    calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl

This script will print out the average precisions for the different test queries and
calculate the C<MAP> metric of retrieval accuracy.

=item B<To Carry Out Significance Tests on the Retrieval Precision Results with
Randomization or with Student's Paired t-Test:>

    significance_testing.pl randomization

or

    significance_testing.pl t-test

Significance testing consists of forming a null hypothesis that the two retrieval
algorithms you are considering are the same from a black-box perspective and then
calculating what is known as a C<p-value>. If the C<p-value> is less than, say,
0.05, you reject the null hypothesis.
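A hedged sketch of the randomization flavor of this test, operating on made-up
per-query C<Average Precision> values for two algorithms (the actual script works
from the data returned by C<get_query_sorted_average_precision_for_queries()>):

    # Paired randomization test: under the null hypothesis the sign of each
    # per-query difference is arbitrary, so flip signs at random and count
    # how often the mean difference is at least as extreme as observed.
    my @ap_A = ( 0.62, 0.40, 0.81, 0.55 );   # assumed AP values, algorithm A
    my @ap_B = ( 0.58, 0.35, 0.80, 0.49 );   # assumed AP values, algorithm B
    my @diffs    = map { $ap_A[$_] - $ap_B[$_] } 0 .. $#ap_A;
    my $observed = abs( mean(@diffs) );
    my ($trials, $as_extreme) = (10_000, 0);
    for (1 .. $trials) {
        my @flipped = map { rand() < 0.5 ? $_ : -$_ } @diffs;
        $as_extreme++ if abs( mean(@flipped) ) >= $observed;
    }
    my $p_value = $as_extreme / $trials;
    sub mean { my $sum = 0; $sum += $_ for @_; return $sum / @_; }
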
=item B<To Calculate a Similarity Matrix for All the Documents in Your Corpus:>

Run either the script

    calculate_similarity_matrix_for_all_docs.pl

or

    calculate_similarity_matrix_for_all_normalized_docs.pl

The former uses regular document vectors for calculating the similarity between every
pair of documents in the corpus. And the latter uses normalized document vectors for
the same purpose. The document order used for row and column indexing of the matrix
corresponds to the alphabetic ordering of the document names in the corpus directory.

=back

=head1 EXPORT

None by design.

=head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS

You have to be careful when carrying out Precision versus Recall calculations if you
do not wish to lose the previously created relevancy judgments. Invoking the method
C<estimate_doc_relevancies()> in your own script will cause the file C<relevancy.txt>
to be overwritten. If you have created a relevancy database and stored it in a file
called, say, C<relevancy.txt>, you should make a backup copy of this file before
executing a script that calls C<estimate_doc_relevancies()>.
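One defensive habit, sketched here with the core C<File::Copy> module and the file
name used above purely as an example:

    use File::Copy;

    # Keep a backup so that a later call to estimate_doc_relevancies()
    # cannot clobber your human-supplied relevancy judgments.
    copy( 'relevancy.txt', 'relevancy.txt.bak' )
        or die "Could not back up relevancy file: $!";
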
=head1 BUGS

Please notify the author if you encounter any bugs. When sending email, please place
the string 'VSM' in the subject line to get past my spam filter.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-VSM-1.61.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-VSM-1.61>. Enter this directory and execute the following commands for a
standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module on
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

=head1 THANKS

Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in
similarity matrix characterization of a corpus. Thanks, Zahn! Several of the recent
changes to the module are a result of the feedback I have received from Naveen
Kulkarni of Infosys Labs. Thanks, Naveen!

=head1 AUTHOR

Avinash Kak, kak@purdue.edu

If you send email, please place the string "VSM" in your subject line to get past my
spam filter.

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2015 Avinash Kak

=cut