| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Algorithm::VSM; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
|
4
|
|
|
|
|
|
|
# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free |
|
5
|
|
|
|
|
|
|
# software. You may modify and/or distribute it under the same terms as Perl itself. |
|
6
|
|
|
|
|
|
|
# This copyright notice must remain attached to the file. |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Algorithm::VSM is a Perl module for retrieving the documents from a software |
|
9
|
|
|
|
|
|
|
# library that match a list of words in a query. The matching criterion used depends |
|
10
|
|
|
|
|
|
|
# on whether you ask the module to construct a full-dimensionality VSM or a |
|
11
|
|
|
|
|
|
|
# reduced-dimensionality LSA model for the library. |
|
12
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
#use 5.10.0; |
|
15
|
1
|
|
|
1
|
|
33328
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
37
|
|
|
16
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
37
|
|
|
17
|
1
|
|
|
1
|
|
7
|
use Carp; |
|
|
1
|
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
103
|
|
|
18
|
1
|
|
|
1
|
|
882
|
use SDBM_File; |
|
|
1
|
|
|
|
|
29572
|
|
|
|
1
|
|
|
|
|
104
|
|
|
19
|
1
|
|
|
1
|
|
1082
|
use PDL::Lite; |
|
|
1
|
|
|
|
|
217243
|
|
|
|
1
|
|
|
|
|
33
|
|
|
20
|
1
|
|
|
1
|
|
9
|
use PDL::MatrixOps; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
9
|
|
|
21
|
1
|
|
|
1
|
|
217
|
use File::Basename; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
94
|
|
|
22
|
1
|
|
|
1
|
|
837
|
use File::Spec::Functions qw(rel2abs); |
|
|
1
|
|
|
|
|
929
|
|
|
|
1
|
|
|
|
|
68
|
|
|
23
|
1
|
|
|
1
|
|
13
|
use Fcntl; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
272
|
|
|
24
|
1
|
|
|
1
|
|
5464
|
use Storable; |
|
|
1
|
|
|
|
|
15157
|
|
|
|
1
|
|
|
|
|
83
|
|
|
25
|
1
|
|
|
1
|
|
8
|
use Cwd; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
13366
|
|
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
our $VERSION = '1.70'; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# for camelcase splits (from perlmonks): |
|
30
|
|
|
|
|
|
|
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/; |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
################################### Constructor ####################################### |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Constructor for creating a VSM or LSA model of a corpus. The model instance |
|
35
|
|
|
|
|
|
|
# returned by the constructor can be used for retrieving documents from the corpus |
|
36
|
|
|
|
|
|
|
# in response to queries. |
|
37
|
|
|
|
|
|
|
sub new { |
|
38
|
2
|
|
|
2
|
1
|
1315
|
my ($class, %args) = @_; |
|
39
|
2
|
|
|
|
|
13
|
my @params = keys %args; |
|
40
|
2
|
50
|
|
|
|
12
|
croak "\nYou have used a wrong name for a keyword argument " . |
|
41
|
|
|
|
|
|
|
"--- perhaps a misspelling\n" |
|
42
|
|
|
|
|
|
|
if _check_for_illegal_params(@params) == 0; |
|
43
|
|
|
|
|
|
|
bless { |
|
44
|
|
|
|
|
|
|
_corpus_directory => $args{corpus_directory} || "", |
|
45
|
|
|
|
|
|
|
_save_model_on_disk => $args{save_model_on_disk} || 0, |
|
46
|
|
|
|
|
|
|
_break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ? |
|
47
|
|
|
|
|
|
|
$args{break_camelcased_and_underscored} : 1, |
|
48
|
|
|
|
|
|
|
_corpus_vocab_db => $args{corpus_vocab_db} || "corpus_vocab_db", |
|
49
|
|
|
|
|
|
|
_doc_vectors_db => $args{doc_vectors_db} || "doc_vectors_db", |
|
50
|
|
|
|
|
|
|
_normalized_doc_vecs_db => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db", |
|
51
|
|
|
|
|
|
|
_stop_words_file => $args{stop_words_file} || "", |
|
52
|
|
|
|
|
|
|
_case_sensitive => $args{case_sensitive} || 0, |
|
53
|
|
|
|
|
|
|
_query_file => $args{query_file} || "", |
|
54
|
|
|
|
|
|
|
_file_types => $args{file_types} || [], |
|
55
|
|
|
|
|
|
|
_min_word_length => $args{min_word_length} || 4, |
|
56
|
|
|
|
|
|
|
_want_stemming => $args{want_stemming} || 0, |
|
57
|
|
|
|
|
|
|
_idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1, |
|
58
|
|
|
|
|
|
|
_max_number_retrievals => $args{max_number_retrievals} || 30, |
|
59
|
|
|
|
|
|
|
_lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01, |
|
60
|
|
|
|
|
|
|
_relevancy_threshold => exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1, |
|
61
|
|
|
|
|
|
|
_relevancy_file => $args{relevancy_file} || "", |
|
62
|
2
|
50
|
50
|
|
|
9796
|
_debug => $args{debug} || 0, |
|
|
|
50
|
50
|
|
|
|
|
|
|
|
50
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
63
|
|
|
|
|
|
|
_working_directory => cwd, |
|
64
|
|
|
|
|
|
|
_vocab_hist_on_disk => {}, |
|
65
|
|
|
|
|
|
|
_vocab_hist => {}, |
|
66
|
|
|
|
|
|
|
_doc_hist_template => {}, |
|
67
|
|
|
|
|
|
|
_corpus_doc_vectors => {}, |
|
68
|
|
|
|
|
|
|
_normalized_doc_vecs => {}, |
|
69
|
|
|
|
|
|
|
_query_vector => {}, |
|
70
|
|
|
|
|
|
|
_stop_words => [], |
|
71
|
|
|
|
|
|
|
_term_document_matrix => [], |
|
72
|
|
|
|
|
|
|
_corpus_vocab_done => 0, |
|
73
|
|
|
|
|
|
|
_scan_dir_for_rels => 0, |
|
74
|
|
|
|
|
|
|
_vocab_size => undef, |
|
75
|
|
|
|
|
|
|
_doc_vecs_trunc_lsa => {}, |
|
76
|
|
|
|
|
|
|
_lsa_vec_truncator => undef, |
|
77
|
|
|
|
|
|
|
_queries_for_relevancy => {}, |
|
78
|
|
|
|
|
|
|
_relevancy_estimates => {}, |
|
79
|
|
|
|
|
|
|
_precision_for_queries => {}, |
|
80
|
|
|
|
|
|
|
_avg_precision_for_queries => {}, |
|
81
|
|
|
|
|
|
|
_recall_for_queries => {}, |
|
82
|
|
|
|
|
|
|
_map => undef, |
|
83
|
|
|
|
|
|
|
_vocab_idf_hist => {}, |
|
84
|
|
|
|
|
|
|
_idf_t => {}, |
|
85
|
|
|
|
|
|
|
_total_num_of_docs => 0, |
|
86
|
|
|
|
|
|
|
}, $class; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
###################### Get corpus vocabulary and word counts ######################### |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub get_corpus_vocabulary_and_word_counts { |
|
93
|
2
|
|
|
2
|
1
|
27
|
my $self = shift; |
|
94
|
|
|
|
|
|
|
die "You must supply the name of the corpus directory to the constructor" |
|
95
|
2
|
50
|
|
|
|
22
|
unless $self->{_corpus_directory}; |
|
96
|
|
|
|
|
|
|
print "Scanning the directory '$self->{_corpus_directory}' for\n" . |
|
97
|
2
|
50
|
|
|
|
14
|
" model construction\n\n" if $self->{_debug}; |
|
98
|
2
|
|
|
|
|
28
|
$self->_scan_directory( $self->{_corpus_directory} ); |
|
99
|
2
|
50
|
|
|
|
10
|
$self->_drop_stop_words() if $self->{_stop_words_file}; |
|
100
|
2
|
50
|
|
|
|
10
|
if ($self->{_debug}) { |
|
101
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
102
|
0
|
|
|
|
|
0
|
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} ); |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
} |
|
105
|
2
|
50
|
|
|
|
8
|
if ($self->{_save_model_on_disk}) { |
|
106
|
0
|
|
|
|
|
0
|
unlink glob "$self->{_corpus_vocab_db}.*"; |
|
107
|
0
|
|
|
|
|
0
|
unlink glob "$self->{_doc_vectors_db}.*"; |
|
108
|
0
|
|
|
|
|
0
|
unlink glob "$self->{_normalized_doc_vecs_db}.*"; |
|
109
|
0
|
|
|
|
|
0
|
tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', |
|
110
|
0
|
0
|
|
|
|
0
|
$self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640 |
|
111
|
|
|
|
|
|
|
or die "Can't create DBM files: $!"; |
|
112
|
0
|
|
|
|
|
0
|
foreach (keys %{$self->{_vocab_hist}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
113
|
0
|
|
|
|
|
0
|
$self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_}; |
|
114
|
|
|
|
|
|
|
} |
|
115
|
0
|
|
|
|
|
0
|
untie %{$self->{_vocab_hist_on_disk}}; |
|
|
0
|
|
|
|
|
0
|
|
|
116
|
|
|
|
|
|
|
} |
|
117
|
2
|
|
|
|
|
8
|
$self->{_corpus_vocab_done} = 1; |
|
118
|
2
|
|
|
|
|
5
|
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} ); |
|
|
2
|
|
|
|
|
17
|
|
|
119
|
|
|
|
|
|
|
print "\n\nVocabulary size: $self->{_vocab_size}\n\n" |
|
120
|
2
|
50
|
|
|
|
8
|
if $self->{_debug}; |
|
121
|
|
|
|
|
|
|
# Calculate idf(t): |
|
122
|
2
|
|
|
|
|
3
|
foreach (keys %{$self->{_vocab_idf_hist}}) { |
|
|
2
|
|
|
|
|
33
|
|
|
123
|
|
|
|
|
|
|
$self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs} |
|
124
|
|
|
|
|
|
|
/ |
|
125
|
218
|
|
|
|
|
576
|
(1 + $self->{_vocab_idf_hist}->{$_}))) |
|
126
|
|
|
|
|
|
|
/ log(10) ); |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub display_corpus_vocab { |
|
131
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
132
|
|
|
|
|
|
|
die "corpus vocabulary not yet constructed" |
|
133
|
0
|
0
|
|
|
|
0
|
unless keys %{$self->{_vocab_hist}}; |
|
|
0
|
|
|
|
|
0
|
|
|
134
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying corpus vocabulary:\n\n"; |
|
135
|
0
|
|
|
|
|
0
|
foreach (sort keys %{$self->{_vocab_hist}}){ |
|
|
0
|
|
|
|
|
0
|
|
|
136
|
0
|
|
|
|
|
0
|
my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_}); |
|
137
|
0
|
|
|
|
|
0
|
print "$outstring\n"; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
sub display_corpus_vocab_size { |
|
142
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
143
|
|
|
|
|
|
|
die "corpus vocabulary not yet constructed" |
|
144
|
0
|
0
|
|
|
|
0
|
unless keys %{$self->{_vocab_hist}}; |
|
|
0
|
|
|
|
|
0
|
|
|
145
|
0
|
|
|
|
|
0
|
my $vocab_size = scalar( keys %{$self->{_vocab_hist}} ); |
|
|
0
|
|
|
|
|
0
|
|
|
146
|
0
|
|
|
|
|
0
|
print "\nSize of the corpus vocabulary: $vocab_size\n\n"; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub write_corpus_vocab_to_file { |
|
150
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
151
|
0
|
|
|
|
|
0
|
my $file = shift; |
|
152
|
0
|
0
|
|
|
|
0
|
die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}}; |
|
|
0
|
|
|
|
|
0
|
|
|
153
|
0
|
0
|
|
|
|
0
|
open OUT, "> $file" |
|
154
|
|
|
|
|
|
|
or die "unable to open for output a file with name `$file': $!"; |
|
155
|
0
|
|
|
|
|
0
|
foreach (sort keys %{$self->{_vocab_hist}}){ |
|
|
0
|
|
|
|
|
0
|
|
|
156
|
0
|
|
|
|
|
0
|
my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_}); |
|
157
|
0
|
|
|
|
|
0
|
print OUT "$outstring\n"; |
|
158
|
|
|
|
|
|
|
} |
|
159
|
0
|
|
|
|
|
0
|
close OUT; |
|
160
|
|
|
|
|
|
|
} |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub display_inverse_document_frequencies { |
|
163
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
164
|
|
|
|
|
|
|
die "corpus vocabulary not yet constructed" |
|
165
|
0
|
0
|
|
|
|
0
|
unless keys %{$self->{_vocab_idf_hist}}; |
|
|
0
|
|
|
|
|
0
|
|
|
166
|
|
|
|
|
|
|
print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n" |
|
167
|
0
|
0
|
|
|
|
0
|
unless $self->{_idf_filter_option}; |
|
168
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying inverse document frequencies:\n"; |
|
169
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_vocab_idf_hist}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
170
|
|
|
|
|
|
|
my $outstring = sprintf("%30s %d", |
|
171
|
0
|
|
|
|
|
0
|
$_, $self->{_vocab_idf_hist}->{$_}); |
|
172
|
0
|
|
|
|
|
0
|
print "$outstring\n"; |
|
173
|
|
|
|
|
|
|
} |
|
174
|
0
|
|
|
|
|
0
|
print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n"; |
|
175
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_idf_t}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
176
|
0
|
|
|
|
|
0
|
my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_}); |
|
177
|
0
|
|
|
|
|
0
|
print "$outstring\n"; |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub get_all_document_names { |
|
182
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
183
|
0
|
|
|
|
|
0
|
my @all_files = sort keys %{$self->{_corpus_doc_vectors}}; |
|
|
0
|
|
|
|
|
0
|
|
|
184
|
0
|
|
|
|
|
0
|
return \@all_files; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
############################ Generate Document Vectors ################################# |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub generate_document_vectors { |
|
190
|
2
|
|
|
2
|
1
|
34
|
my $self = shift; |
|
191
|
2
|
|
|
|
|
28
|
chdir $self->{_working_directory}; |
|
192
|
2
|
|
|
|
|
6
|
foreach ( sort keys %{$self->{_vocab_hist}} ) { |
|
|
2
|
|
|
|
|
108
|
|
|
193
|
218
|
|
|
|
|
353
|
$self->{_doc_hist_template}->{$_} = 0; |
|
194
|
|
|
|
|
|
|
} |
|
195
|
2
|
|
|
|
|
26
|
$self->_scan_directory( $self->{_corpus_directory} ); |
|
196
|
2
|
|
|
|
|
36
|
chdir $self->{_working_directory}; |
|
197
|
2
|
50
|
|
|
|
40
|
if ($self->{_save_model_on_disk}) { |
|
198
|
|
|
|
|
|
|
die "You did not specify in the constructor call the names for the diskfiles " . |
|
199
|
|
|
|
|
|
|
"for storing the disk-based hash tables consisting of document vectors " . |
|
200
|
|
|
|
|
|
|
"and their normalized versions" |
|
201
|
0
|
0
|
0
|
|
|
0
|
unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db}; |
|
202
|
0
|
|
|
|
|
0
|
eval { |
|
203
|
0
|
|
|
|
|
0
|
store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} ); |
|
204
|
|
|
|
|
|
|
}; |
|
205
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
206
|
0
|
|
|
|
|
0
|
print "Something went wrong with disk storage of document vectors: $@"; |
|
207
|
|
|
|
|
|
|
} |
|
208
|
0
|
|
|
|
|
0
|
eval { |
|
209
|
0
|
|
|
|
|
0
|
store($self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db}); |
|
210
|
|
|
|
|
|
|
}; |
|
211
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
212
|
0
|
|
|
|
|
0
|
print "Something wrong with disk storage of normalized doc vecs: $@"; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub display_doc_vectors { |
|
218
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
219
|
|
|
|
|
|
|
die "document vectors not yet constructed" |
|
220
|
0
|
0
|
|
|
|
0
|
unless keys %{$self->{_corpus_doc_vectors}}; |
|
|
0
|
|
|
|
|
0
|
|
|
221
|
0
|
|
|
|
|
0
|
foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
222
|
0
|
|
|
|
|
0
|
print "\n\ndisplay doc vec for $file:\n"; |
|
223
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
224
|
0
|
|
|
|
|
0
|
print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n"; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
0
|
|
|
|
|
0
|
my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}}; |
|
|
0
|
|
|
|
|
0
|
|
|
227
|
0
|
|
|
|
|
0
|
print "\nSize of vector for $file: $docvec_size\n"; |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
sub display_normalized_doc_vectors { |
|
232
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
233
|
|
|
|
|
|
|
die "normalized document vectors not yet constructed" |
|
234
|
0
|
0
|
|
|
|
0
|
unless keys %{$self->{_normalized_doc_vecs}}; |
|
|
0
|
|
|
|
|
0
|
|
|
235
|
0
|
0
|
|
|
|
0
|
unless ($self->{_idf_filter_option}) { |
|
236
|
0
|
|
|
|
|
0
|
print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n"; |
|
237
|
0
|
|
|
|
|
0
|
return; |
|
238
|
|
|
|
|
|
|
} |
|
239
|
0
|
|
|
|
|
0
|
foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
240
|
0
|
|
|
|
|
0
|
print "\n\ndisplay normalized doc vec for $file:\n"; |
|
241
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
242
|
0
|
|
|
|
|
0
|
print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n"; |
|
243
|
|
|
|
|
|
|
} |
|
244
|
0
|
|
|
|
|
0
|
my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}}; |
|
|
0
|
|
|
|
|
0
|
|
|
245
|
0
|
|
|
|
|
0
|
print "\nSize of normalized vector for $file: $docvec_size\n"; |
|
246
|
|
|
|
|
|
|
} |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
######################## Calculate Pairwise Document Similarities ###################### |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
# Returns the similarity score for two documents whose actual names are are supplied |
|
252
|
|
|
|
|
|
|
# as its two arguments. |
|
253
|
|
|
|
|
|
|
sub pairwise_similarity_for_docs { |
|
254
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
255
|
0
|
|
|
|
|
0
|
my $doc1 = shift; |
|
256
|
0
|
|
|
|
|
0
|
my $doc2 = shift; |
|
257
|
0
|
|
|
|
|
0
|
my @all_files = keys %{$self->{_corpus_doc_vectors}}; |
|
|
0
|
|
|
|
|
0
|
|
|
258
|
0
|
0
|
|
|
|
0
|
croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files); |
|
259
|
0
|
0
|
|
|
|
0
|
croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files); |
|
260
|
0
|
|
|
|
|
0
|
my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1}; |
|
261
|
0
|
|
|
|
|
0
|
my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2}; |
|
262
|
0
|
|
|
|
|
0
|
my @vec1 = (); |
|
263
|
0
|
|
|
|
|
0
|
my @vec2 = (); |
|
264
|
0
|
|
|
|
|
0
|
foreach my $word (sort keys %$vec_hash_ref1) { |
|
265
|
0
|
|
|
|
|
0
|
push @vec1, $vec_hash_ref1->{$word}; |
|
266
|
0
|
|
|
|
|
0
|
push @vec2, $vec_hash_ref2->{$word}; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
0
|
|
|
|
|
0
|
my $vec_mag1 = vec_magnitude(\@vec1); |
|
269
|
0
|
|
|
|
|
0
|
my $vec_mag2 = vec_magnitude(\@vec2); |
|
270
|
0
|
|
|
|
|
0
|
my $product = vec_scalar_product(\@vec1, \@vec2); |
|
271
|
0
|
|
|
|
|
0
|
$product /= $vec_mag1 * $vec_mag2; |
|
272
|
0
|
|
|
|
|
0
|
return $product; |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub pairwise_similarity_for_normalized_docs { |
|
276
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
277
|
0
|
|
|
|
|
0
|
my $doc1 = shift; |
|
278
|
0
|
|
|
|
|
0
|
my $doc2 = shift; |
|
279
|
0
|
|
|
|
|
0
|
my @all_files = keys %{$self->{_corpus_doc_vectors}}; |
|
|
0
|
|
|
|
|
0
|
|
|
280
|
0
|
0
|
|
|
|
0
|
croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files); |
|
281
|
0
|
0
|
|
|
|
0
|
croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files); |
|
282
|
0
|
|
|
|
|
0
|
my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1}; |
|
283
|
0
|
|
|
|
|
0
|
my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2}; |
|
284
|
0
|
|
|
|
|
0
|
my @vec1 = (); |
|
285
|
0
|
|
|
|
|
0
|
my @vec2 = (); |
|
286
|
0
|
|
|
|
|
0
|
foreach my $word (sort keys %$vec_hash_ref1) { |
|
287
|
0
|
|
|
|
|
0
|
push @vec1, $vec_hash_ref1->{$word}; |
|
288
|
0
|
|
|
|
|
0
|
push @vec2, $vec_hash_ref2->{$word}; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
0
|
|
|
|
|
0
|
my $vec_mag1 = vec_magnitude(\@vec1); |
|
291
|
0
|
|
|
|
|
0
|
my $vec_mag2 = vec_magnitude(\@vec2); |
|
292
|
0
|
|
|
|
|
0
|
my $product = vec_scalar_product(\@vec1, \@vec2); |
|
293
|
0
|
|
|
|
|
0
|
$product /= $vec_mag1 * $vec_mag2; |
|
294
|
0
|
|
|
|
|
0
|
return $product; |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
############################### Retrieve with VSM Model ################################ |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
sub retrieve_with_vsm { |
|
300
|
1
|
|
|
1
|
1
|
18
|
my $self = shift; |
|
301
|
1
|
|
|
|
|
6
|
my $query = shift; |
|
302
|
1
|
|
|
|
|
8
|
my @clean_words; |
|
303
|
1
|
|
|
|
|
5
|
my $min = $self->{_min_word_length}; |
|
304
|
|
|
|
|
|
|
|
|
305
|
1
|
50
|
|
|
|
6
|
if ($self->{_break_camelcased_and_underscored}) { |
|
306
|
1
|
|
|
|
|
41
|
my @brokenup = grep $_, split /\W|_|\s+/, "@$query"; |
|
307
|
1
|
|
|
|
|
6
|
@clean_words = map {$_ =~ /$_regex/g} @brokenup; |
|
|
8
|
|
|
|
|
41
|
|
|
308
|
|
|
|
|
|
|
@clean_words = $self->{_case_sensitive} ? |
|
309
|
0
|
0
|
|
|
|
0
|
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words : |
|
|
0
|
|
|
|
|
0
|
|
|
310
|
1
|
100
|
|
|
|
12
|
grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words; |
|
|
8
|
50
|
|
|
|
82
|
|
|
|
8
|
|
|
|
|
43
|
|
|
311
|
|
|
|
|
|
|
} else { |
|
312
|
0
|
|
|
|
|
0
|
my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query"; |
|
313
|
0
|
|
|
|
|
0
|
@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
314
|
|
|
|
|
|
|
} |
|
315
|
1
|
|
|
|
|
6
|
$query = \@clean_words; |
|
316
|
1
|
50
|
|
|
|
6
|
print "\nYour query words are: @$query\n" if $self->{_debug}; |
|
317
|
1
|
50
|
|
|
|
16
|
if ($self->{_idf_filter_option}) { |
|
318
|
|
|
|
|
|
|
die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()" |
|
319
|
1
|
|
|
|
|
82
|
unless scalar(keys %{$self->{_vocab_hist}}) |
|
320
|
1
|
50
|
50
|
|
|
10
|
&& scalar(keys %{$self->{_normalized_doc_vecs}}); |
|
|
1
|
|
|
|
|
6
|
|
|
321
|
|
|
|
|
|
|
} else { |
|
322
|
|
|
|
|
|
|
die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()" |
|
323
|
0
|
|
|
|
|
0
|
unless scalar(keys %{$self->{_vocab_hist}}) |
|
324
|
0
|
0
|
0
|
|
|
0
|
&& scalar(keys %{$self->{_corpus_doc_vectors}}); |
|
|
0
|
|
|
|
|
0
|
|
|
325
|
|
|
|
|
|
|
} |
|
326
|
1
|
|
|
|
|
2
|
foreach ( keys %{$self->{_vocab_hist}} ) { |
|
|
1
|
|
|
|
|
60
|
|
|
327
|
109
|
|
|
|
|
184
|
$self->{_query_vector}->{$_} = 0; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
1
|
|
|
|
|
11
|
foreach (@$query) { |
|
330
|
7
|
50
|
|
|
|
18
|
if ($self->{_case_sensitive}) { |
|
331
|
0
|
0
|
|
|
|
0
|
$self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_}; |
|
332
|
|
|
|
|
|
|
} else { |
|
333
|
7
|
100
|
|
|
|
26
|
$self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"}; |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
} |
|
336
|
1
|
|
|
|
|
3
|
my @query_word_counts = values %{$self->{_query_vector}}; |
|
|
1
|
|
|
|
|
12
|
|
|
337
|
1
|
|
|
|
|
13
|
my $query_word_count_total = sum(\@query_word_counts); |
|
338
|
1
|
50
|
|
|
|
10
|
die "\nYour query does not contain corpus words. Nothing retrieved.\n" |
|
339
|
|
|
|
|
|
|
unless $query_word_count_total; |
|
340
|
1
|
|
|
|
|
2
|
my %retrievals; |
|
341
|
1
|
50
|
|
|
|
6
|
if ($self->{_idf_filter_option}) { |
|
342
|
|
|
|
|
|
|
print "\n\nUsing idf filter option for retrieval:\n\n" |
|
343
|
1
|
50
|
|
|
|
6
|
if $self->{_debug}; |
|
344
|
1
|
|
|
|
|
2
|
foreach (sort {$self->_doc_vec_comparator} |
|
|
24
|
|
|
|
|
62
|
|
|
345
|
1
|
|
|
|
|
21
|
keys %{$self->{_normalized_doc_vecs}}) { |
|
346
|
10
|
100
|
|
|
|
28
|
$retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0; |
|
347
|
|
|
|
|
|
|
} |
|
348
|
|
|
|
|
|
|
} else { |
|
349
|
|
|
|
|
|
|
print "\n\nNOT using idf filter option for retrieval:\n\n" |
|
350
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
351
|
0
|
|
|
|
|
0
|
foreach (sort {$self->_doc_vec_comparator} |
|
|
0
|
|
|
|
|
0
|
|
|
352
|
0
|
|
|
|
|
0
|
keys %{$self->{_corpus_doc_vectors}}) { |
|
353
|
0
|
0
|
|
|
|
0
|
$retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0; |
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
} |
|
356
|
1
|
50
|
|
|
|
6
|
if ($self->{_debug}) { |
|
357
|
0
|
|
|
|
|
0
|
print "\n\nShowing the VSM retrievals and the similarity scores:\n\n"; |
|
358
|
0
|
|
|
|
|
0
|
foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) { |
|
|
0
|
|
|
|
|
0
|
|
|
359
|
0
|
|
|
|
|
0
|
print "$_ => $retrievals{$_}\n"; |
|
360
|
|
|
|
|
|
|
} |
|
361
|
|
|
|
|
|
|
} |
|
362
|
1
|
|
|
|
|
11
|
return \%retrievals; |
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
######################### Upload a Previously Constructed Model ######################### |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub upload_vsm_model_from_disk { |
|
368
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
|
369
|
|
|
|
|
|
|
die "\nCannot find the database files for the VSM model" |
|
370
|
|
|
|
|
|
|
unless -s "$self->{_corpus_vocab_db}.pag" |
|
371
|
0
|
0
|
0
|
|
|
0
|
&& -s $self->{_doc_vectors_db}; |
|
372
|
0
|
|
|
|
|
0
|
$self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db}); |
|
373
|
0
|
|
|
|
|
0
|
tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', |
|
374
|
0
|
0
|
|
|
|
0
|
$self->{_corpus_vocab_db}, O_RDONLY, 0640 |
|
375
|
|
|
|
|
|
|
or die "Can't open DBM file: $!"; |
|
376
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
|
377
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
378
|
0
|
|
|
|
|
0
|
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} ); |
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
} |
|
381
|
0
|
|
|
|
|
0
|
foreach (keys %{$self->{_vocab_hist_on_disk}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
382
|
0
|
|
|
|
|
0
|
$self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_}; |
|
383
|
|
|
|
|
|
|
} |
|
384
|
0
|
|
|
|
|
0
|
$self->{_corpus_vocab_done} = 1; |
|
385
|
0
|
|
|
|
|
0
|
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} ); |
|
|
0
|
|
|
|
|
0
|
|
|
386
|
|
|
|
|
|
|
print "\n\nVocabulary size: $self->{_vocab_size}\n\n" |
|
387
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
388
|
0
|
|
|
|
|
0
|
$self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db}); |
|
389
|
0
|
|
|
|
|
0
|
untie %{$self->{_vocab_hist_on_disk}}; |
|
|
0
|
|
|
|
|
0
|
|
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
sub upload_normalized_vsm_model_from_disk { |
|
393
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
394
|
|
|
|
|
|
|
die "\nCannot find the database files for the VSM model" |
|
395
|
|
|
|
|
|
|
unless -s "$self->{_corpus_vocab_db}.pag" |
|
396
|
0
|
0
|
0
|
|
|
0
|
&& -s $self->{_normalized_doc_vecs_db}; |
|
397
|
0
|
|
|
|
|
0
|
$self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db}); |
|
398
|
0
|
|
|
|
|
0
|
tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File', |
|
399
|
0
|
0
|
|
|
|
0
|
$self->{_corpus_vocab_db}, O_RDONLY, 0640 |
|
400
|
|
|
|
|
|
|
or die "Can't open DBM file: $!"; |
|
401
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
|
402
|
0
|
|
|
|
|
0
|
foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
403
|
0
|
|
|
|
|
0
|
printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} ); |
|
404
|
|
|
|
|
|
|
} |
|
405
|
|
|
|
|
|
|
} |
|
406
|
0
|
|
|
|
|
0
|
foreach (keys %{$self->{_vocab_hist_on_disk}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
407
|
0
|
|
|
|
|
0
|
$self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_}; |
|
408
|
|
|
|
|
|
|
} |
|
409
|
0
|
|
|
|
|
0
|
$self->{_corpus_vocab_done} = 1; |
|
410
|
0
|
|
|
|
|
0
|
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} ); |
|
|
0
|
|
|
|
|
0
|
|
|
411
|
|
|
|
|
|
|
print "\n\nVocabulary size: $self->{_vocab_size}\n\n" |
|
412
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
413
|
0
|
|
|
|
|
0
|
untie %{$self->{_vocab_hist_on_disk}}; |
|
|
0
|
|
|
|
|
0
|
|
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
############################## Display Retrieval Results ################################ |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
sub display_retrievals { |
|
419
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
420
|
0
|
|
|
|
|
0
|
my $retrievals = shift; |
|
421
|
0
|
|
|
|
|
0
|
print "\n\nShowing the retrievals and the similarity scores:\n\n"; |
|
422
|
0
|
|
|
|
|
0
|
my $iter = 0; |
|
423
|
0
|
|
|
|
|
0
|
foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){ |
|
|
0
|
|
|
|
|
0
|
|
|
424
|
0
|
|
|
|
|
0
|
print "$_ => $retrievals->{$_}\n"; |
|
425
|
0
|
|
|
|
|
0
|
$iter++; |
|
426
|
0
|
0
|
|
|
|
0
|
last if $iter > $self->{_max_number_retrievals}; |
|
427
|
|
|
|
|
|
|
} |
|
428
|
0
|
|
|
|
|
0
|
print "\n\n"; |
|
429
|
|
|
|
|
|
|
} |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
############################### Directory Scanner ################################ |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
sub _scan_directory { |
|
434
|
4
|
|
|
4
|
|
9
|
my $self = shift; |
|
435
|
4
|
|
|
|
|
46
|
my $dir = rel2abs( shift ); |
|
436
|
4
|
|
|
|
|
18346
|
my $current_dir = cwd; |
|
437
|
4
|
50
|
|
|
|
245
|
chdir $dir or die "Unable to change directory to $dir: $!"; |
|
438
|
4
|
|
|
|
|
942
|
foreach ( glob "*" ) { |
|
439
|
40
|
50
|
33
|
|
|
30743
|
if ( -d and !(-l) ) { |
|
|
|
50
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
440
|
0
|
|
|
|
|
0
|
$self->_scan_directory( $_ ); |
|
441
|
0
|
0
|
|
|
|
0
|
chdir $dir |
|
442
|
|
|
|
|
|
|
or die "Unable to change directory to $dir: $!"; |
|
443
|
|
|
|
|
|
|
} elsif (-r _ and |
|
444
|
|
|
|
|
|
|
-T _ and |
|
445
|
|
|
|
|
|
|
-M _ > 0.00001 and # modification age is at least 1 sec |
|
446
|
|
|
|
|
|
|
!( -l $_ ) and |
|
447
|
|
|
|
|
|
|
$self->ok_to_filetype($_) ) { |
|
448
|
40
|
50
|
|
|
|
100
|
$self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels}; |
|
449
|
40
|
100
|
|
|
|
174
|
$self->_scan_file($_) unless $self->{_corpus_vocab_done}; |
|
450
|
40
|
100
|
|
|
|
199
|
$self->_construct_doc_vector($_) if $self->{_corpus_vocab_done}; |
|
451
|
|
|
|
|
|
|
} |
|
452
|
|
|
|
|
|
|
} |
|
453
|
4
|
|
|
|
|
149
|
chdir $current_dir; |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
sub _scan_file {
    # Scans one corpus file during the vocabulary-construction pass:
    # tokenizes its contents, updates the corpus-wide word histogram
    # (_vocab_hist), records each distinct word once per document for the
    # IDF histogram (_vocab_idf_hist), and bumps _total_num_of_docs.
    #
    # Fixes over the original: three-arg open on a lexical filehandle with
    # an explicit failure check (the original used an unchecked two-arg
    # bareword open), and for-loops instead of map-in-void-context.
    my $self = shift;
    my $file = shift;
    my $min = $self->{_min_word_length};
    my %uniques = ();
    # On open failure we still fall through so that the document count is
    # incremented, matching the original's behavior for unreadable files.
    if (open my $fh, '<', $file) {
        while (<$fh>) {
            next if /^[ ]*\r?\n?$/;             # skip blank lines
            $_ =~ s/\r?\n?$//;                  # strip CR/LF line terminator
            my @clean_words;
            if ($self->{_break_camelcased_and_underscored}) {
                # Split on non-word characters, underscores, and whitespace,
                # then break camel-cased tokens with the package-level $_regex.
                my @brokenup = grep $_, split /\W|_|\s+/, $_;
                @clean_words = map {$_ =~ /$_regex/g} @brokenup;
                # Keep only tokens of at least $min chars; lowercase them
                # unless the engine is in case-sensitive mode.
                @clean_words = $self->{_case_sensitive} ?
                    grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
                    grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
            } else {
                my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
                @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
            }
            next unless @clean_words;
            @clean_words = grep $_, map &simple_stemmer($_), @clean_words
                if $self->{_want_stemming};
            if ($self->{_case_sensitive}) {
                $self->{_vocab_hist}->{$_}++ for grep $_, @clean_words;
                $uniques{$_}++ for @clean_words;
            } else {
                $self->{_vocab_hist}->{"\L$_"}++ for grep $_, @clean_words;
                $uniques{"\L$_"}++ for @clean_words;
            }
        }
        close $fh;
    }
    # One count per document per distinct word -- the document-frequency
    # statistic used later for IDF weighting.
    $self->{_vocab_idf_hist}->{$_}++ for keys %uniques;
    $self->{_total_num_of_docs}++;
}
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
sub ok_to_filetype {
    # Decides whether a corpus file should be scanned, based on its suffix.
    # Returns 1 when the suffix is one of the file types supplied to the
    # constructor, 0 otherwise.  Croaks when no file types were specified.
    my ($self, $filename) = @_;
    my ($stem, $path, $extension) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;
    return contained_in($extension, @{$self->{_file_types}}) ? 1 : 0;
}
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
############################## LSA Modeling and Retrieval ################################ |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
sub construct_lsa_model {
    # Builds the reduced-dimensionality LSA model for the library:
    # (1) assembles the term-document matrix from the per-document word
    #     vectors (normalized ones when idf_filter_option is set, raw
    #     corpus vectors otherwise), lazily restoring them from the
    #     Storable DB file when not already in memory;
    # (2) runs SVD on the matrix and truncates the singular values using
    #     the 'lsa_svd_threshold' constructor option;
    # (3) stores a truncated (reduced-dimensionality) representation of
    #     every corpus document in $self->{_doc_vecs_trunc_lsa}.
    my $self = shift;
    if ($self->{_idf_filter_option}) {
        if (!$self->{_normalized_doc_vecs} and
                             -s $self->{_normalized_doc_vecs_db}) {
            $self->{_normalized_doc_vecs} =
                                   retrieve($self->{_normalized_doc_vecs_db});
        }
        # Each document contributes one column-to-be: its word counts in
        # sorted-vocabulary order (sorting keeps rows aligned across docs).
        foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys
                       %{$self->{_normalized_doc_vecs}->{$_}}){
                push @$term_frequency_vec,
                          $self->{_normalized_doc_vecs}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    } else {
        if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
            $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
        }
        foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
                push @$term_frequency_vec,
                           $self->{_corpus_doc_vectors}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    }
    # Transpose so that terms index the rows and documents the columns.
    my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
    my ($U,$SIGMA,$V) = svd $A;
    print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
    print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
    # Index of the last singular value above the user-set threshold; all
    # later (smaller) singular values are discarded.
    my $index = return_index_of_last_value_above_threshold($SIGMA,
                                          $self->{_lsa_svd_threshold});
    my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
    print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
    # When you measure the size of a matrix in PDL, the zeroth dimension
    # is considered to be along the horizontal and the one-th dimension
    # along the rows.  This is opposite of how we want to look at
    # matrices.  For a matrix of size MxN, we mean M rows and N columns.
    # With this 'rows x columns' convention for matrix size, if you had
    # to check the size of, say, U matrix, you would call
    #   my @size = ( $U->getdim(1), $U->getdim(0) );
    #   print "\nsize of U: @size\n";
    my $U_trunc = $U->slice("0:$index,:")->sever;
    my $V_trunc = $V->slice("0:$index,0:$index")->sever;
    # The truncator maps a full-dimensionality doc/query vector into the
    # reduced LSA space: inverse(SIGMA_trunc) * transpose(U_trunc).
    $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
                                             PDL::Basic::transpose($U_trunc);
    print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
        if $self->{_debug};
    # NOTE(review): doc names are sorted the same way the matrix columns
    # were built above, so $sorted_doc_names[$i] matches column $i.
    my @sorted_doc_names = $self->{_idf_filter_option} ?
                        sort keys %{$self->{_normalized_doc_vecs}} :
                        sort keys %{$self->{_corpus_doc_vectors}};
    my $i = 0;
    foreach (@{$self->{_term_document_matrix}}) {
        my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
                                            PDL::Basic::transpose(pdl($_));
        my $doc_name = $sorted_doc_names[$i++];
        print "\n\nTruncated doc vec for $doc_name: " .
                            $truncated_doc_vec . "\n" if $self->{_debug};
        $self->{_doc_vecs_trunc_lsa}->{$doc_name}
                                            = $truncated_doc_vec;
    }
    chdir $self->{_working_directory};
}
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
sub retrieve_with_lsa {
    # Retrieves corpus documents that match the query (an arrayref of
    # words) using the previously constructed LSA model.  The query is
    # cleaned and tokenized with the same rules used for corpus files,
    # turned into a vocabulary-ordered count vector, projected into the
    # reduced LSA space with _lsa_vec_truncator, and compared against each
    # truncated document vector by dot product.  Returns a hashref mapping
    # document name => similarity score (only positive scores retained).
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        # Same camel-case/underscore tokenization applied to corpus files.
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    # Zero a slot for every vocabulary word, then count query occurrences
    # of words that exist in the corpus vocabulary.
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = sum(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    # Vocabulary-sorted query vector, same ordering as the doc vectors.
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    # Project the query into the reduced-dimensionality LSA space.
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        # Similarity = dot product in the truncated space; only documents
        # with strictly positive similarity are reported.
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
            " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
sub _construct_doc_vector {
    # Builds the word-count vector for one corpus file against the corpus
    # vocabulary, and (when idf_filter_option is set) its IDF-normalized
    # version, storing both in _corpus_doc_vectors / _normalized_doc_vecs
    # keyed by the file's path relative to the corpus directory.
    #
    # Fixes over the original: three-arg open on a lexical filehandle;
    # the subdirectory extraction no longer reads $1 unconditionally after
    # the match (a failed match would leave $1 holding a stale capture
    # from some earlier regex); for-loops instead of map-in-void-context.
    my $self = shift;
    my $file = shift;
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {
        $document_vector{$_} = 0;
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    my $fh;
    unless (open $fh, '<', $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<$fh>) {
        next if /^[ ]*\r?\n?$/;             # skip blank lines
        $_ =~ s/\r?\n?$//;                  # strip CR/LF
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
                       map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        # Count only words that belong to the corpus vocabulary.
        if ($self->{_case_sensitive}) {
            $document_vector{$_}++
                for grep {exists $self->{_vocab_hist}->{$_}} @clean_words;
        } else {
            $document_vector{"\L$_"}++
                for grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
        }
    }
    close $fh;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {
        $total_words_in_doc += $document_vector{$_};
    }
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        # tf * idf, normalized by the document's total word count.
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} = $document_vector{$_}
                                      * $self->{_idf_t}->{$_}
                                      / $total_words_in_doc;
        }
    }
    # Key the stored vectors by the file's path relative to the corpus
    # root, derived from the current working directory.
    my $pwd = cwd;
    my ($subdir) = $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name;
    if (defined $subdir and $subdir ne "") {
        $file_path_name = "$subdir/$file";
    } else {
        $file_path_name = $file;
    }
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
################################### Drop Stop Words ################################### |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
sub _drop_stop_words {
    # Reads the user-supplied stop-words file and removes each stop word
    # from the corpus vocabulary histogram, recording the words seen in
    # the _stop_words list.
    #
    # Fixes over the original: three-arg open on a lexical filehandle
    # (original used a two-arg bareword open) and the handle is now
    # explicitly closed (original leaked it).
    my $self = shift;
    open( my $fh, '<', "$self->{_working_directory}/$self->{_stop_words_file}")
        or die "unable to open stop words file: $!";
    while (<$fh>) {
        next if /^#/;                        # comment lines in the stop file
        next if /^[ ]*\r?\n?$/;              # blank lines
        $_ =~ s/\r?\n?$//;                   # strip CR/LF
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
    close $fh;
}
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
################################### Support Methods #################################### |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
sub _doc_vec_comparator {
    # sort() comparator for ranking two documents (named by the package
    # sort globals $a and $b) by descending cosine similarity to the
    # current query vector.  Uses normalized doc vectors when the IDF
    # filter is on, raw corpus vectors otherwise.
    #
    # Fix over the original: the final three-branch return (1 / 0 / -1)
    # is replaced by a single numeric spaceship comparison, which is
    # equivalent for the defined numeric products computed here and
    # removes the implicit-undef fall-through path.
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    my $vec1_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$a} :
                        $self->{_corpus_doc_vectors}->{$a};
    my $vec2_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$b} :
                        $self->{_corpus_doc_vectors}->{$b};
    my @vec1 = ();
    my @vec2 = ();
    my @qvec = ();
    # Walk the vocabulary in sorted order so all three vectors align.
    foreach my $word (sort keys %{$self->{_vocab_hist}}) {
        push @vec1, $vec1_hash_ref->{$word};
        push @vec2, $vec2_hash_ref->{$word};
        push @qvec, $query_vector{$word};
    }
    my $vec1_mag = vec_magnitude(\@vec1);
    my $vec2_mag = vec_magnitude(\@vec2);
    my $qvec_mag = vec_magnitude(\@qvec);
    # Cosine similarity of each document against the query.
    my $product1 = vec_scalar_product(\@vec1, \@qvec);
    $product1 /= $vec1_mag * $qvec_mag;
    my $product2 = vec_scalar_product(\@vec2, \@qvec);
    $product2 /= $vec2_mag * $qvec_mag;
    # Larger similarity sorts first (descending order).
    return $product2 <=> $product1;
}
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
sub _similarity_to_query {
    # Computes the cosine similarity between the named document's vector
    # and the current query vector (_query_vector).  Uses the normalized
    # doc vectors when the IDF filter option is on, raw corpus vectors
    # otherwise.
    my ($self, $doc_name) = @_;
    my $doc_vec_ref = $self->{_idf_filter_option}
                    ? $self->{_normalized_doc_vecs}->{$doc_name}
                    : $self->{_corpus_doc_vectors}->{$doc_name};
    my (@doc_vals, @query_vals);
    # Walk the words in sorted order so both vectors stay aligned.
    for my $word (sort keys %$doc_vec_ref) {
        push @doc_vals,   $doc_vec_ref->{$word};
        push @query_vals, $self->{_query_vector}->{$word};
    }
    my $doc_mag   = vec_magnitude(\@doc_vals);
    my $query_mag = vec_magnitude(\@query_vals);
    my $cosine    = vec_scalar_product(\@doc_vals, \@query_vals);
    $cosine /= $doc_mag * $query_mag;
    return $cosine;
}
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
###################### Relevance Judgments for Testing Purposes ####################### |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
## IMPORTANT: This estimation of document relevancies to queries is NOT for |
|
755
|
|
|
|
|
|
|
## serious work. A document is considered to be relevant to a |
|
756
|
|
|
|
|
|
|
## query if it contains several of the query words. As to the |
|
757
|
|
|
|
|
|
|
## minimum number of query words that must exist in a document |
|
758
|
|
|
|
|
|
|
## in order for the latter to be considered relevant is |
|
759
|
|
|
|
|
|
|
## determined by the relevancy_threshold parameter in the VSM |
|
760
|
|
|
|
|
|
|
## constructor. (See the relevancy and precision-recall related |
|
761
|
|
|
|
|
|
|
## scripts in the 'examples' directory.) The reason for why the |
|
762
|
|
|
|
|
|
|
## function shown below is not for serious work is because |
|
763
|
|
|
|
|
|
|
## ultimately it is the humans who are the best judges of the |
|
764
|
|
|
|
|
|
|
## relevancies of documents to queries. The humans bring to |
|
765
|
|
|
|
|
|
|
## bear semantic considerations on the relevancy determination |
|
766
|
|
|
|
|
|
|
## problem that are beyond the scope of this module. |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
sub estimate_doc_relevancies {
    # Crude, automated relevancy estimation for testing: reads the queries
    # from the 'query_file', scans the corpus counting query-word hits per
    # document (via _scan_directory with _scan_dir_for_rels set), and dumps
    # the resulting relevancy judgments into 'relevancy_file'.  NOT meant
    # for serious work -- human judgments are authoritative.
    #
    # Fixes over the original: three-arg opens on lexical filehandles
    # (original used two-arg bareword opens) and both handles are now
    # explicitly closed -- the original never closed OUT, risking loss of
    # buffered output.
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    open( my $in, '<', $self->{_query_file} )
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file in \n" .
          " in which the relevancy judgments will be dumped."
        unless $self->{_relevancy_file};
    while (<$in>) {
        next if /^#/;                        # comment lines
        next if /^[ ]*\r?\n?$/;              # blank lines
        $_ =~ s/\r?\n?$//;                   # strip CR/LF
        # Lines must look like "q3: word word word".
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    close $in;
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    # Rescan the corpus in relevancy-counting mode.
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open(my $out, '>', $self->{_relevancy_file})
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    # Emit queries in numeric order of their integer suffix (q1, q2, ...).
    foreach (sort
             {get_integer_suffix($a) <=> get_integer_suffix($b)}
             keys %{$self->{_relevancy_estimates}}) {
        @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$_}};
        print {$out} "$_ => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $_: " .
            scalar(@relevancy_list_for_query) . "\n";
    }
    close $out;
}
|
810
|
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
# If there are available human-supplied relevancy judgments in a disk |
|
812
|
|
|
|
|
|
|
# file, use this script to upload that information. One of the scripts |
|
813
|
|
|
|
|
|
|
# in the 'examples' directory carries out the precision-recall analysis |
|
814
|
|
|
|
|
|
|
# by using this approach. IMPORTANT: The human-supplied relevancy |
|
815
|
|
|
|
|
|
|
# judgments must be in a format that is shown in the sample file |
|
816
|
|
|
|
|
|
|
# relevancy.txt in the 'examples' directory. |
|
817
|
|
|
|
|
|
|
sub upload_document_relevancies_from_file {
    # Loads human-supplied relevancy judgments from 'relevancy_file' into
    # _relevancy_estimates.  Each line must look like
    # "q3 => doc1.txt doc2.txt"; every listed doc gets relevancy value 1.
    #
    # Fixes over the original: three-arg open on a lexical filehandle
    # (original used a two-arg bareword open) and the handle is now
    # explicitly closed (original leaked it).
    my $self = shift;
    chdir $self->{_working_directory};
    open( my $fh, '<', $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<$fh>) {
        next if /^#/;                        # comment lines
        next if /^[ ]*\r?\n?$/;              # blank lines
        $_ =~ s/\r?\n?$//;                   # strip CR/LF
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        # All human-supplied judgments carry the same rank (value 1).
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    close $fh;
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
sub display_doc_relevancies {
    # Prints the stored relevancy judgments, one query at a time, with the
    # documents for each query listed in descending order of their
    # relevancy counts.  Dies when no judgments have been loaded yet.
    my $self = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    print "\nDisplaying relevancy judgments:\n\n";
    for my $query (sort keys %{$self->{_relevancy_estimates}}) {
        print "Query $query\n";
        my $rels_for_query = $self->{_relevancy_estimates}->{$query};
        my @by_descending_count =
            sort { $rels_for_query->{$b} <=> $rels_for_query->{$a} }
            keys %$rels_for_query;
        for my $file (@by_descending_count) {
            print " $file => $self->{_relevancy_estimates}->{$query}->{$file}\n";
        }
    }
}
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
sub _scan_file_for_rels {
    # For each query loaded into _queries_for_relevancy, counts how many
    # times the query's words occur (case-insensitively) in $file, and
    # records the file as relevant to the query when the count reaches
    # _relevancy_threshold.
    #
    # Fixes over the original: three-arg open on a lexical filehandle with
    # an explicit check (original used an unchecked two-arg bareword open)
    # and the handle is now closed (original leaked it).  On open failure
    # we proceed with empty text, matching the original's behavior.
    my $self = shift;
    my $file = shift;
    my @all_text;
    if (open my $fh, '<', $file) {
        @all_text = <$fh>;
        close $fh;
    }
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            # Count every case-insensitive occurrence of the word.
            my @matches = $all_text =~ /$word/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}
|
885
|
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
######################### Calculate Precision versus Recall ########################## |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
sub precision_and_recall_calculator { |
|
889
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
890
|
0
|
|
|
|
|
0
|
my $retrieval_type = shift; |
|
891
|
0
|
0
|
|
|
|
0
|
die "You must specify the retrieval type through an argument to the method " . |
|
892
|
|
|
|
|
|
|
"precision_and_recall_calculator(). The retrieval type must either be 'vsm' " . |
|
893
|
|
|
|
|
|
|
"or 'lsa' \n" unless $retrieval_type; |
|
894
|
|
|
|
|
|
|
die "You must first estimate or provide the doc relevancies" |
|
895
|
0
|
0
|
|
|
|
0
|
unless scalar(keys %{$self->{_relevancy_estimates}}); |
|
|
0
|
|
|
|
|
0
|
|
|
896
|
0
|
0
|
|
|
|
0
|
unless (scalar(keys %{$self->{_queries_for_relevancy}})) { |
|
|
0
|
|
|
|
|
0
|
|
|
897
|
|
|
|
|
|
|
open( IN, $self->{_query_file}) |
|
898
|
0
|
0
|
|
|
|
0
|
or die "unable to open the query file $self->{_query_file}: $!"; |
|
899
|
0
|
|
|
|
|
0
|
while () { |
|
900
|
0
|
0
|
|
|
|
0
|
next if /^#/; |
|
901
|
0
|
0
|
|
|
|
0
|
next if /^[ ]*\r?\n?$/; |
|
902
|
0
|
|
|
|
|
0
|
$_ =~ s/\r?\n?$//; |
|
903
|
0
|
0
|
|
|
|
0
|
die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/; |
|
904
|
0
|
|
|
|
|
0
|
/^[ ]*(q[0-9]+):[ ]*(.*)/; |
|
905
|
0
|
|
|
|
|
0
|
my $query_label = $1; |
|
906
|
0
|
|
|
|
|
0
|
my $query = $2; |
|
907
|
0
|
0
|
|
|
|
0
|
next unless $query; |
|
908
|
0
|
|
|
|
|
0
|
$self->{_queries_for_relevancy}->{$query_label} = $query; |
|
909
|
|
|
|
|
|
|
} |
|
910
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
|
911
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying queries in the query file:\n\n"; |
|
912
|
0
|
|
|
|
|
0
|
foreach (sort keys %{$self->{_queries_for_relevancy}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
913
|
0
|
|
|
|
|
0
|
print "$_ => $self->{_queries_for_relevancy}->{$_}\n"; |
|
914
|
|
|
|
|
|
|
} |
|
915
|
|
|
|
|
|
|
} |
|
916
|
|
|
|
|
|
|
} |
|
917
|
0
|
|
|
|
|
0
|
foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
918
|
|
|
|
|
|
|
print "\n\n====================================== query: $query ========================================\n\n" |
|
919
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
920
|
0
|
0
|
|
|
|
0
|
print "\n\n\nQuery $query:\n" if $self->{_debug}; |
|
921
|
|
|
|
|
|
|
my @query_words = grep $_, |
|
922
|
0
|
|
|
|
|
0
|
split /\s+/, $self->{_queries_for_relevancy}->{$query}; |
|
923
|
0
|
0
|
|
|
|
0
|
croak "\n\nYou have not specified the retrieval type for " . |
|
924
|
|
|
|
|
|
|
"precision-recall calculation. See code in 'examples'" . |
|
925
|
|
|
|
|
|
|
"directory:" if !defined $retrieval_type; |
|
926
|
0
|
|
|
|
|
0
|
my $retrievals; |
|
927
|
0
|
|
|
|
|
0
|
eval { |
|
928
|
0
|
0
|
|
|
|
0
|
if ($retrieval_type eq 'vsm') { |
|
|
|
0
|
|
|
|
|
|
|
929
|
0
|
|
|
|
|
0
|
$retrievals = $self->retrieve_with_vsm( \@query_words ); |
|
930
|
|
|
|
|
|
|
} elsif ($retrieval_type eq 'lsa') { |
|
931
|
0
|
|
|
|
|
0
|
$retrievals = $self->retrieve_with_lsa( \@query_words ); |
|
932
|
|
|
|
|
|
|
} |
|
933
|
|
|
|
|
|
|
}; |
|
934
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
935
|
0
|
|
|
|
|
0
|
warn "\n\nNo relevant docs found for query $query.\n" . |
|
936
|
|
|
|
|
|
|
"Will skip over this query for precision and\n" . |
|
937
|
|
|
|
|
|
|
"recall calculations\n\n"; |
|
938
|
0
|
|
|
|
|
0
|
next; |
|
939
|
|
|
|
|
|
|
} |
|
940
|
0
|
|
|
|
|
0
|
my %ranked_retrievals; |
|
941
|
0
|
|
|
|
|
0
|
my $i = 1; |
|
942
|
0
|
|
|
|
|
0
|
foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} |
|
|
0
|
|
|
|
|
0
|
|
|
943
|
|
|
|
|
|
|
keys %$retrievals) { |
|
944
|
0
|
|
|
|
|
0
|
$ranked_retrievals{$i++} = $_; |
|
945
|
|
|
|
|
|
|
} |
|
946
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
|
947
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying ranked retrievals for query $query:\n\n"; |
|
948
|
0
|
|
|
|
|
0
|
foreach (sort {$a <=> $b} keys %ranked_retrievals) { |
|
|
0
|
|
|
|
|
0
|
|
|
949
|
0
|
|
|
|
|
0
|
print "$_ => $ranked_retrievals{$_}\n"; |
|
950
|
|
|
|
|
|
|
} |
|
951
|
|
|
|
|
|
|
} |
|
952
|
|
|
|
|
|
|
# At this time, ranking of relevant documents based on their |
|
953
|
|
|
|
|
|
|
# relevancy counts serves no particular purpose since all we want |
|
954
|
|
|
|
|
|
|
# for the calculation of Precision and Recall are the total |
|
955
|
|
|
|
|
|
|
# number of relevant documents. However, I believe such a |
|
956
|
|
|
|
|
|
|
# ranking will play an important role in the future. |
|
957
|
|
|
|
|
|
|
# IMPORTANT: The relevancy judgments are ranked only when |
|
958
|
|
|
|
|
|
|
# estimated by the method estimate_doc_relevancies() |
|
959
|
|
|
|
|
|
|
# of the VSM class. When relevancies are supplied |
|
960
|
|
|
|
|
|
|
# directly through a disk file, they all carry the |
|
961
|
|
|
|
|
|
|
# same rank. |
|
962
|
0
|
|
|
|
|
0
|
my %ranked_relevancies; |
|
963
|
0
|
|
|
|
|
0
|
$i = 1; |
|
964
|
0
|
|
|
|
|
0
|
foreach my $file (sort { |
|
965
|
|
|
|
|
|
|
$self->{_relevancy_estimates}->{$query}->{$b} |
|
966
|
|
|
|
|
|
|
<=> |
|
967
|
0
|
|
|
|
|
0
|
$self->{_relevancy_estimates}->{$query}->{$a} |
|
968
|
|
|
|
|
|
|
} |
|
969
|
0
|
|
|
|
|
0
|
keys %{$self->{_relevancy_estimates}->{$query}}) { |
|
970
|
0
|
|
|
|
|
0
|
$ranked_relevancies{$i++} = $file; |
|
971
|
|
|
|
|
|
|
} |
|
972
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
|
973
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying ranked relevancies for query $query:\n\n"; |
|
974
|
0
|
|
|
|
|
0
|
foreach (sort {$a <=> $b} keys %ranked_relevancies) { |
|
|
0
|
|
|
|
|
0
|
|
|
975
|
0
|
|
|
|
|
0
|
print "$_ => $ranked_relevancies{$_}\n"; |
|
976
|
|
|
|
|
|
|
} |
|
977
|
|
|
|
|
|
|
} |
|
978
|
0
|
|
|
|
|
0
|
my @relevant_set = values %ranked_relevancies; |
|
979
|
0
|
0
|
|
|
|
0
|
warn "\n\nNo relevant docs found for query $query.\n" . |
|
980
|
|
|
|
|
|
|
"Will skip over this query for precision and\n" . |
|
981
|
|
|
|
|
|
|
"recall calculations\n\n" unless @relevant_set; |
|
982
|
0
|
0
|
|
|
|
0
|
next unless @relevant_set; |
|
983
|
|
|
|
|
|
|
print "\n\nRelevant set for query $query: @relevant_set\n\n" |
|
984
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
985
|
|
|
|
|
|
|
# @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed. |
|
986
|
0
|
|
|
|
|
0
|
my @retrieved; |
|
987
|
0
|
|
|
|
|
0
|
foreach (keys %ranked_retrievals) { |
|
988
|
0
|
|
|
|
|
0
|
push @retrieved, $ranked_retrievals{$_}; |
|
989
|
|
|
|
|
|
|
} |
|
990
|
|
|
|
|
|
|
print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n" |
|
991
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
992
|
0
|
|
|
|
|
0
|
my @Precision_values = (); |
|
993
|
0
|
|
|
|
|
0
|
my @Recall_values = (); |
|
994
|
0
|
|
|
|
|
0
|
my $rank = 1; |
|
995
|
0
|
|
|
|
|
0
|
while ($rank < @retrieved + 1) { |
|
996
|
0
|
|
|
|
|
0
|
my $index = 1; |
|
997
|
0
|
|
|
|
|
0
|
my @retrieved_at_rank = (); |
|
998
|
0
|
|
|
|
|
0
|
while ($index <= $rank) { |
|
999
|
0
|
|
|
|
|
0
|
push @retrieved_at_rank, $ranked_retrievals{$index}; |
|
1000
|
0
|
|
|
|
|
0
|
$index++; |
|
1001
|
|
|
|
|
|
|
} |
|
1002
|
0
|
|
|
|
|
0
|
my $intersection =set_intersection(\@retrieved_at_rank, |
|
1003
|
|
|
|
|
|
|
\@relevant_set); |
|
1004
|
0
|
0
|
|
|
|
0
|
my $precision_at_rank = @retrieved_at_rank ? |
|
1005
|
|
|
|
|
|
|
(@$intersection / @retrieved_at_rank) : 0; |
|
1006
|
0
|
|
|
|
|
0
|
push @Precision_values, $precision_at_rank; |
|
1007
|
0
|
|
|
|
|
0
|
my $recall_at_rank = @$intersection / @relevant_set; |
|
1008
|
0
|
|
|
|
|
0
|
push @Recall_values, $recall_at_rank; |
|
1009
|
0
|
|
|
|
|
0
|
$rank++; |
|
1010
|
|
|
|
|
|
|
} |
|
1011
|
|
|
|
|
|
|
print "\n\nFor query $query, precision values: @Precision_values\n" |
|
1012
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
1013
|
|
|
|
|
|
|
print "\nFor query $query, recall values: @Recall_values\n" |
|
1014
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
1015
|
0
|
|
|
|
|
0
|
$self->{_precision_for_queries}->{$query} = \@Precision_values; |
|
1016
|
0
|
|
|
|
|
0
|
my $avg_precision; |
|
1017
|
0
|
|
|
|
|
0
|
$avg_precision += $_ for @Precision_values; |
|
1018
|
0
|
|
|
|
|
0
|
$self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values); |
|
1019
|
0
|
|
|
|
|
0
|
$self->{_recall_for_queries}->{$query} = \@Recall_values; |
|
1020
|
|
|
|
|
|
|
} |
|
1021
|
|
|
|
|
|
|
print "\n\n========= query by query processing for Precision vs. Recall calculations finished ========\n\n" |
|
1022
|
0
|
0
|
|
|
|
0
|
if $self->{_debug}; |
|
1023
|
0
|
|
|
|
|
0
|
my @avg_precisions; |
|
1024
|
0
|
|
|
|
|
0
|
foreach (keys %{$self->{_avg_precision_for_queries}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
1025
|
0
|
|
|
|
|
0
|
push @avg_precisions, $self->{_avg_precision_for_queries}->{$_}; |
|
1026
|
|
|
|
|
|
|
} |
|
1027
|
0
|
|
|
|
|
0
|
$self->{_map} += $_ for @avg_precisions; |
|
1028
|
0
|
|
|
|
|
0
|
$self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}}; |
|
|
0
|
|
|
|
|
0
|
|
|
1029
|
|
|
|
|
|
|
} |
|
1030
|
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
#  Prints, one line per query, the average precision achieved for that query
#  (queries ordered by the integer suffix in their labels), followed by the
#  overall MAP (Mean Average Precision) value.  Dies unless
#  precision_and_recall_calculator() has already populated the relevant state.
sub display_average_precision_for_queries_and_map {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_avg_precision_for_queries}});
    print "\n\nDisplaying average precision for different queries:\n\n";
    my @queries_in_order = sort
            {get_integer_suffix($a) <=> get_integer_suffix($b)}
            keys %{$self->{_avg_precision_for_queries}};
    for my $query (@queries_in_order) {
        # printf replaces the original sprintf-then-print two-step.
        printf "Query %s => %.3f\n",
               $query, $self->{_avg_precision_for_queries}->{$query};
    }
    print "\n\nMAP value: $self->{_map}\n\n";
}
|
1045
|
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
#  Displays, for each query, the precision and recall values at successive
#  ranks (the first value is at rank 1).  Values are formatted to three
#  decimal places.  NOTE(review): the query list is taken from the keys of
#  _avg_precision_for_queries (not _precision_for_queries), exactly as in
#  the original implementation -- confirm that both hashes always share keys.
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    for my $query (sort {get_integer_suffix($a) <=> get_integer_suffix($b)}
                        keys %{$self->{_avg_precision_for_queries}}) {
        print "\n\nQuery $query:\n";
        print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precisions = map {sprintf "%.3f", $_}
                             @{$self->{_precision_for_queries}->{$query}};
        print " Precision at rank => @precisions\n";
        my @recalls = map {sprintf "%.3f", $_}
                          @{$self->{_recall_for_queries}->{$query}};
        print "\n Recall at rank => @recalls\n";
    }
    print "\n\n";
}
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
#  Returns a reference to an array of the average-precision values for the
#  queries, sorted by the integer suffix of each query label, with every
#  value formatted as a "%.3f" string.  Dies unless
#  precision_and_recall_calculator() has been invoked first.
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_avg_precision_for_queries}});
    # A single map/sort pipeline replaces the original explicit foreach loop.
    my @avg_precisions =
        map  { sprintf "%.3f", $self->{_avg_precision_for_queries}->{$_} }
        sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
        keys %{$self->{_avg_precision_for_queries}};
    return \@avg_precisions;
}
|
1079
|
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
################################### Utility Routines ################################### |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
#  Verifies that every name in @params is one of the constructor options this
#  module recognizes.  Returns 1 when all names are legal, 0 as soon as an
#  illegal name is encountered, and undef when @params is empty (matching the
#  original behavior exactly).
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag;
    # Labelled loops replace the original flag-and-double-last construction.
    PARAM: for my $param (@params) {
        $found_match_flag = 0;
        for my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                next PARAM;
            }
        }
        last PARAM;                     # first illegal param ends the scan
    }
    return $found_match_flag;
}
|
1116
|
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
# checks whether an element is in an array: |
|
1118
|
|
|
|
|
|
|
#  Returns the number of times $ele occurs (by string equality) in the rest
#  of the argument list.  The count doubles as a boolean "is contained in"
#  answer at the call sites.
sub contained_in {
    my ($ele, @array) = @_;
    # grep in scalar context yields the match count directly; this replaces
    # the original void-context map-with-side-effect idiom, which perlcritic
    # flags ("map used in void context").
    my $count = grep { $ele eq $_ } @array;
    return $count;
}
|
1125
|
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
# Meant only for an un-nested hash: |
|
1127
|
|
|
|
|
|
|
#  Meant only for an un-nested hash: returns a new hashref whose top-level
#  keys and values are copied from $ref_in.  (Values that are themselves
#  references remain shared, just as in the original key-by-key loop.)
sub deep_copy_hash {
    my $ref_in = shift;
    # A list-flattening hash copy replaces the original explicit foreach loop.
    my %copy = %{$ref_in};
    return \%copy;
}
|
1135
|
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
#  Returns the scalar (dot) product of the two equal-length numeric vectors
#  passed in as array references.  Croaks when the lengths differ.
sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    # Initialize the accumulator so that two empty vectors yield a defined 0
    # instead of undef (the original left $product uninitialized).
    my $product = 0;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
|
1147
|
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
#  Returns the Euclidean (L2) magnitude of the numeric vector passed in as
#  an array reference.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    # Postfix for with self-multiplication replaces the foreach/** loop.
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}
|
1156
|
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
#  Returns the sum of the numeric elements of the array reference passed in.
sub sum {
    my $vec = shift;
    # Start the accumulator at 0 so an empty vector returns a defined 0
    # rather than undef (the original left $result uninitialized).
    my $result = 0;
    $result += $_ for @$vec;
    return $result;
}
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
#  A crude rule-based stemmer: strips common English suffixes (plural -s/-es,
#  -ing, -ed, -tions, etc.) from $word with a fixed sequence of substitutions
#  and returns the stemmed word.  When $debug is true, prints the word before
#  and after stemming.
#
#  NOTE: the substitutions below are ORDER-SENSITIVE -- each rule sees the
#  result of the rules above it (e.g. "-sed" must be handled before the
#  generic "-ed" rule).  Do not reorder them.
sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word: $word\n" if $debug;
    # plural -s, but not after a/e/i/o/u/s (keeps e.g. "class", "radios")
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;
    # "-ses" -> "-s" (e.g. "classes" -> "class")
    $word =~ s/(.*[a-z]s)es$/$1/i;
    # "-ces"/"-kes" -> keep trailing "e" (e.g. "spaces" -> "space")
    $word =~ s/(.*[a-z][ck])es$/$1e/i;
    # "-tions" -> "-tion"
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;
    # "-mming" -> "-m" (e.g. "programming" -> "program")
    $word =~ s/(.*[a-z]+)mming$/$1m/i;
    # generic "-ing" removal, except after r/l (keeps e.g. "ring", "sling" forms)
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
    # "-osing"/"-oning" -> restore the dropped "e" (e.g. "closing" -> "close")
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
    # "-tices" -> "-tex" (e.g. "vertices" -> "vertex")
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;
    # "-pes" -> "-pe" (e.g. "types" -> "type")
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;
    # "-sed" -> "-se" must run before the generic "-ed" rule below
    $word =~ s/(.*[a-z]+)sed$/$1se/i;
    # generic past-tense "-ed" removal
    $word =~ s/(.*[a-z]+)ed$/$1/i;
    # "-tation" -> "-t" (e.g. "computation" -> "comput")
    $word =~ s/(.*[a-z]+)tation$/$1t/i;
    print "Stemmed word: $word\n\n" if $debug;
    return $word;
}
|
1185
|
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
# Assumes the array is sorted in a descending order, as would be the |
|
1187
|
|
|
|
|
|
|
# case with an array of singular values produced by an SVD algorithm |
|
1188
|
|
|
|
|
|
|
#  Assumes the 1-D PDL object holds values sorted in descending order, as
#  would be the case for singular values produced by SVD.  Returns the index
#  of the last value that is strictly greater than $threshold times the
#  first (largest) value; returns -1 when even the first value fails.
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    my $idx = 0;
    # Postfix while replaces the original block-form scan loop.
    $idx++ while $idx < $size && $pdl_obj->slice($idx)->sclr > $lower_bound;
    return $idx - 1;
}
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
#  Returns a reference to the list of elements of @$set2 that also occur in
#  @$set1 (elements of $set2 are kept in order, duplicates included).  The
#  result is an empty arrayref when there is no overlap.
sub set_intersection {
    my ($set1, $set2) = @_;
    # Hash-slice assignment builds the membership lookup in one statement.
    my %in_first;
    @in_first{@$set1} = (1) x @$set1;
    my @common = grep { $in_first{$_} } @$set2;
    return \@common;
}
|
1205
|
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
#  Extracts the trailing run of digits from a query label (e.g. "query21"
#  yields "21").  Returns the empty string when the label ends in a
#  non-digit, exactly as the original /(\d*)$/ capture does.
sub get_integer_suffix {
    my $label = shift;
    # Capture directly into a lexical instead of relying on $1 across
    # statements.
    my ($suffix) = $label =~ /(\d*)$/;
    return $suffix;
}
|
1211
|
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
1; |
|
1213
|
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
=pod |
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
=head1 NAME |
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
Algorithm::VSM --- A Perl module for retrieving files and documents from a software |
|
1219
|
|
|
|
|
|
|
library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis) |
|
1220
|
|
|
|
|
|
|
algorithms in response to search words and phrases. |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
1223
|
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
# FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL: |
|
1225
|
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
use Algorithm::VSM; |
|
1227
|
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
my $corpus_dir = "corpus"; |
|
1229
|
|
|
|
|
|
|
my @query = qw/ program ListIterator add ArrayList args /; |
|
1230
|
|
|
|
|
|
|
my $stop_words_file = "stop_words.txt"; |
|
1231
|
|
|
|
|
|
|
my $vsm = Algorithm::VSM->new( |
|
1232
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1233
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1234
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1235
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1236
|
|
|
|
|
|
|
max_number_retrievals => 10, |
|
1237
|
|
|
|
|
|
|
min_word_length => 4, |
|
1238
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1239
|
|
|
|
|
|
|
use_idf_filter => 1, |
|
1240
|
|
|
|
|
|
|
want_stemming => 1, |
|
1241
|
|
|
|
|
|
|
); |
|
1242
|
|
|
|
|
|
|
$vsm->get_corpus_vocabulary_and_word_counts(); |
|
1243
|
|
|
|
|
|
|
$vsm->display_corpus_vocab(); |
|
1244
|
|
|
|
|
|
|
$vsm->display_corpus_vocab_size(); |
|
1245
|
|
|
|
|
|
|
$vsm->write_corpus_vocab_to_file("vocabulary_dump.txt"); |
|
1246
|
|
|
|
|
|
|
$vsm->display_inverse_document_frequencies(); |
|
1247
|
|
|
|
|
|
|
$vsm->generate_document_vectors(); |
|
1248
|
|
|
|
|
|
|
$vsm->display_doc_vectors(); |
|
1249
|
|
|
|
|
|
|
$vsm->display_normalized_doc_vectors(); |
|
1250
|
|
|
|
|
|
|
my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query ); |
|
1251
|
|
|
|
|
|
|
$vsm->display_retrievals( $retrievals ); |
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
The purpose of each constructor option and what is accomplished by the method |
|
1254
|
|
|
|
|
|
|
calls should be obvious by their names. If not, they are explained in greater |
|
1255
|
|
|
|
|
|
|
detail elsewhere in this documentation page. Note that the methods |
|
1256
|
|
|
|
|
|
|
display_corpus_vocab() and display_doc_vectors() are there only for testing |
|
1257
|
|
|
|
|
|
|
purposes with small corpora. If you must use them for large libraries/corpora, |
|
1258
|
|
|
|
|
|
|
you might wish to redirect the output to a file. |
|
1259
|
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
By default, a call to a constructor calculates normalized term-frequency vectors |
|
1261
|
|
|
|
|
|
|
for the documents. Normalization consists of first calculating the term |
|
1262
|
|
|
|
|
|
|
frequency tf(t) of a term t in a document as a proportion of the total numbers |
|
1263
|
|
|
|
|
|
|
of words in the document and then multiplying it by idf(t), where idf(t) stands |
|
1264
|
|
|
|
|
|
|
for the inverse document frequency associated with that term. Note that 'word' |
|
1265
|
|
|
|
|
|
|
and 'term' mean the same thing. |
|
1266
|
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
# FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL: |
|
1270
|
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
my $lsa = Algorithm::VSM->new( |
|
1272
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1273
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1274
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1275
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1276
|
|
|
|
|
|
|
lsa_svd_threshold => 0.01, |
|
1277
|
|
|
|
|
|
|
max_number_retrievals => 10, |
|
1278
|
|
|
|
|
|
|
min_word_length => 4, |
|
1279
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1280
|
|
|
|
|
|
|
use_idf_filter => 1, |
|
1281
|
|
|
|
|
|
|
want_stemming => 1, |
|
1282
|
|
|
|
|
|
|
); |
|
1283
|
|
|
|
|
|
|
$lsa->get_corpus_vocabulary_and_word_counts(); |
|
1284
|
|
|
|
|
|
|
$lsa->display_corpus_vocab(); |
|
1285
|
|
|
|
|
|
|
$lsa->display_corpus_vocab_size(); |
|
1286
|
|
|
|
|
|
|
$lsa->write_corpus_vocab_to_file("vocabulary_dump.txt"); |
|
1287
|
|
|
|
|
|
|
$lsa->generate_document_vectors(); |
|
1288
|
|
|
|
|
|
|
$lsa->construct_lsa_model(); |
|
1289
|
|
|
|
|
|
|
my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query ); |
|
1290
|
|
|
|
|
|
|
$lsa->display_retrievals( $retrievals ); |
|
1291
|
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
The initialization code before the constructor call and the calls for displaying |
|
1293
|
|
|
|
|
|
|
the vocabulary and the vectors after the call remain the same as for the VSM case |
|
1294
|
|
|
|
|
|
|
shown previously in this Synopsis. In the call above, the constructor parameter |
|
1295
|
|
|
|
|
|
|
'lsa_svd_threshold' determines how many of the singular values will be retained |
|
1296
|
|
|
|
|
|
|
after we have carried out an SVD decomposition of the term-frequency matrix for |
|
1297
|
|
|
|
|
|
|
the documents in the corpus. Singular values smaller than this threshold |
|
1298
|
|
|
|
|
|
|
fraction of the largest value are rejected. |
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
# FOR MEASURING PRECISION VERSUS RECALL FOR VSM: |
|
1303
|
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
my $corpus_dir = "corpus"; |
|
1305
|
|
|
|
|
|
|
my $stop_words_file = "stop_words.txt"; |
|
1306
|
|
|
|
|
|
|
my $query_file = "test_queries.txt"; |
|
1307
|
|
|
|
|
|
|
my $relevancy_file = "relevancy.txt"; # All relevancy judgments |
|
1308
|
|
|
|
|
|
|
# will be stored in this file |
|
1309
|
|
|
|
|
|
|
my $vsm = Algorithm::VSM->new( |
|
1310
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1311
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1312
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1313
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1314
|
|
|
|
|
|
|
min_word_length => 4, |
|
1315
|
|
|
|
|
|
|
query_file => $query_file, |
|
1316
|
|
|
|
|
|
|
relevancy_file => $relevancy_file, |
|
1317
|
|
|
|
|
|
|
relevancy_threshold => 5, |
|
1318
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1319
|
|
|
|
|
|
|
want_stemming => 1, |
|
1320
|
|
|
|
|
|
|
); |
|
1321
|
|
|
|
|
|
|
$vsm->get_corpus_vocabulary_and_word_counts(); |
|
1322
|
|
|
|
|
|
|
$vsm->generate_document_vectors(); |
|
1323
|
|
|
|
|
|
|
$vsm->estimate_doc_relevancies(); |
|
1324
|
|
|
|
|
|
|
$vsm->display_doc_relevancies(); # used only for testing |
|
1325
|
|
|
|
|
|
|
$vsm->precision_and_recall_calculator('vsm'); |
|
1326
|
|
|
|
|
|
|
$vsm->display_precision_vs_recall_for_queries(); |
|
1327
|
|
|
|
|
|
|
$vsm->display_average_precision_for_queries_and_map(); |
|
1328
|
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
Measuring precision and recall requires a set of queries. These are supplied |
|
1330
|
|
|
|
|
|
|
through the constructor parameter 'query_file'. The format of this file
|
1331
|
|
|
|
|
|
|
must be according to the sample file 'test_queries.txt' in the 'examples' |
|
1332
|
|
|
|
|
|
|
directory. The module estimates the relevancies of the documents to the |
|
1333
|
|
|
|
|
|
|
queries and dumps the relevancies in a file named by the 'relevancy_file' |
|
1334
|
|
|
|
|
|
|
constructor parameter. The constructor parameter 'relevancy_threshold' is used |
|
1335
|
|
|
|
|
|
|
to decide which of the documents are considered to be relevant to a query. A |
|
1336
|
|
|
|
|
|
|
document must contain at least the 'relevancy_threshold' occurrences of query |
|
1337
|
|
|
|
|
|
|
words in order to be considered relevant to a query. |
|
1338
|
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
# FOR MEASURING PRECISION VERSUS RECALL FOR LSA: |
|
1342
|
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
my $lsa = Algorithm::VSM->new( |
|
1344
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1345
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1346
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1347
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1348
|
|
|
|
|
|
|
lsa_svd_threshold => 0.01, |
|
1349
|
|
|
|
|
|
|
min_word_length => 4, |
|
1350
|
|
|
|
|
|
|
query_file => $query_file, |
|
1351
|
|
|
|
|
|
|
relevancy_file => $relevancy_file, |
|
1352
|
|
|
|
|
|
|
relevancy_threshold => 5, |
|
1353
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1354
|
|
|
|
|
|
|
want_stemming => 1, |
|
1355
|
|
|
|
|
|
|
); |
|
1356
|
|
|
|
|
|
|
$lsa->get_corpus_vocabulary_and_word_counts(); |
|
1357
|
|
|
|
|
|
|
$lsa->generate_document_vectors(); |
|
1358
|
|
|
|
|
|
|
$lsa->construct_lsa_model(); |
|
1359
|
|
|
|
|
|
|
$lsa->estimate_doc_relevancies(); |
|
1360
|
|
|
|
|
|
|
$lsa->display_doc_relevancies(); |
|
1361
|
|
|
|
|
|
|
$lsa->precision_and_recall_calculator('lsa'); |
|
1362
|
|
|
|
|
|
|
$lsa->display_precision_vs_recall_for_queries(); |
|
1363
|
|
|
|
|
|
|
$lsa->display_average_precision_for_queries_and_map(); |
|
1364
|
|
|
|
|
|
|
|
|
1365
|
|
|
|
|
|
|
We have already explained the purpose of the constructor parameter 'query_file' |
|
1366
|
|
|
|
|
|
|
and about the constraints on the format of queries in the file named through |
|
1367
|
|
|
|
|
|
|
this parameter. As mentioned earlier, the module estimates the relevancies of |
|
1368
|
|
|
|
|
|
|
the documents to the queries and dumps the relevancies in a file named by the |
|
1369
|
|
|
|
|
|
|
'relevancy_file' constructor parameter. The constructor parameter |
|
1370
|
|
|
|
|
|
|
'relevancy_threshold' is used in deciding which of the documents are considered |
|
1371
|
|
|
|
|
|
|
to be relevant to a query. A document must contain at least the |
|
1372
|
|
|
|
|
|
|
'relevancy_threshold' occurrences of query words in order to be considered |
|
1373
|
|
|
|
|
|
|
relevant to a query. We have previously explained the role of the constructor |
|
1374
|
|
|
|
|
|
|
parameter 'lsa_svd_threshold'. |
|
1375
|
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
# FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS: |
|
1379
|
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
my $corpus_dir = "corpus"; |
|
1381
|
|
|
|
|
|
|
my $stop_words_file = "stop_words.txt"; |
|
1382
|
|
|
|
|
|
|
my $query_file = "test_queries.txt"; |
|
1383
|
|
|
|
|
|
|
my $relevancy_file = "relevancy.txt"; |
|
1384
|
|
|
|
|
|
|
my $vsm = Algorithm::VSM->new( |
|
1385
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1386
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1387
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1388
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1389
|
|
|
|
|
|
|
min_word_length => 4, |
|
1390
|
|
|
|
|
|
|
query_file => $query_file, |
|
1391
|
|
|
|
|
|
|
relevancy_file => $relevancy_file, |
|
1392
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1393
|
|
|
|
|
|
|
want_stemming => 1, |
|
1394
|
|
|
|
|
|
|
); |
|
1395
|
|
|
|
|
|
|
$vsm->get_corpus_vocabulary_and_word_counts(); |
|
1396
|
|
|
|
|
|
|
$vsm->generate_document_vectors(); |
|
1397
|
|
|
|
|
|
|
$vsm->upload_document_relevancies_from_file(); |
|
1398
|
|
|
|
|
|
|
$vsm->display_doc_relevancies(); |
|
1399
|
|
|
|
|
|
|
$vsm->precision_and_recall_calculator('vsm'); |
|
1400
|
|
|
|
|
|
|
$vsm->display_precision_vs_recall_for_queries(); |
|
1401
|
|
|
|
|
|
|
$vsm->display_average_precision_for_queries_and_map(); |
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
Now the filename supplied through the constructor parameter 'relevancy_file' must |
|
1404
|
|
|
|
|
|
|
contain relevance judgments for the queries that are named in the file supplied |
|
1405
|
|
|
|
|
|
|
through the parameter 'query_file'. The format of these two files must be |
|
1406
|
|
|
|
|
|
|
according to what is shown in the sample files 'test_queries.txt' and |
|
1407
|
|
|
|
|
|
|
'relevancy.txt' in the 'examples' directory. |
|
1408
|
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
# FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS: |
|
1412
|
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
my $corpus_dir = "corpus"; |
|
1414
|
|
|
|
|
|
|
my $stop_words_file = "stop_words.txt"; |
|
1415
|
|
|
|
|
|
|
my $query_file = "test_queries.txt"; |
|
1416
|
|
|
|
|
|
|
my $relevancy_file = "relevancy.txt"; |
|
1417
|
|
|
|
|
|
|
my $lsa = Algorithm::VSM->new( |
|
1418
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1419
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1420
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1421
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1422
|
|
|
|
|
|
|
lsa_svd_threshold => 0.01, |
|
1423
|
|
|
|
|
|
|
min_word_length => 4, |
|
1424
|
|
|
|
|
|
|
query_file => $query_file, |
|
1425
|
|
|
|
|
|
|
relevancy_file => $relevancy_file, |
|
1426
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1427
|
|
|
|
|
|
|
want_stemming => 1, |
|
1428
|
|
|
|
|
|
|
); |
|
1429
|
|
|
|
|
|
|
$lsa->get_corpus_vocabulary_and_word_counts(); |
|
1430
|
|
|
|
|
|
|
$lsa->generate_document_vectors(); |
|
1431
|
|
|
|
|
|
|
$lsa->upload_document_relevancies_from_file(); |
|
1432
|
|
|
|
|
|
|
$lsa->display_doc_relevancies(); |
|
1433
|
|
|
|
|
|
|
$lsa->precision_and_recall_calculator('lsa');
|
1434
|
|
|
|
|
|
|
$lsa->display_precision_vs_recall_for_queries(); |
|
1435
|
|
|
|
|
|
|
$lsa->display_average_precision_for_queries_and_map(); |
|
1436
|
|
|
|
|
|
|
|
|
1437
|
|
|
|
|
|
|
As mentioned for the previous code block, the filename supplied through the |
|
1438
|
|
|
|
|
|
|
constructor parameter 'relevancy_file' must contain relevance judgments for the |
|
1439
|
|
|
|
|
|
|
queries that are named in the file supplied through the parameter 'query_file'. |
|
1440
|
|
|
|
|
|
|
The format of this file must be according to what is shown in the sample file |
|
1441
|
|
|
|
|
|
|
'relevancy.txt' in the 'examples' directory. We have already explained the roles |
|
1442
|
|
|
|
|
|
|
played by the constructor parameters such as 'lsa_svd_threshold'. |
|
1443
|
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
# FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS: |
|
1447
|
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
my $corpus_dir = "corpus"; |
|
1449
|
|
|
|
|
|
|
my $stop_words_file = "stop_words.txt"; |
|
1450
|
|
|
|
|
|
|
my $vsm = Algorithm::VSM->new( |
|
1451
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1452
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1453
|
|
|
|
|
|
|
corpus_directory => $corpus_dir, |
|
1454
|
|
|
|
|
|
|
file_types => ['.txt', '.java'], |
|
1455
|
|
|
|
|
|
|
min_word_length => 4, |
|
1456
|
|
|
|
|
|
|
stop_words_file => $stop_words_file, |
|
1457
|
|
|
|
|
|
|
want_stemming => 1, |
|
1458
|
|
|
|
|
|
|
); |
|
1459
|
|
|
|
|
|
|
$vsm->get_corpus_vocabulary_and_word_counts(); |
|
1460
|
|
|
|
|
|
|
$vsm->generate_document_vectors(); |
|
1461
|
|
|
|
|
|
|
# code for calculating pairwise similarities as shown in the |
|
1462
|
|
|
|
|
|
|
# script calculate_similarity_matrix_for_all_docs.pl in the |
|
1463
|
|
|
|
|
|
|
# examples directory. This script makes calls to |
|
1464
|
|
|
|
|
|
|
# |
|
1465
|
|
|
|
|
|
|
# $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]); |
|
1466
|
|
|
|
|
|
|
# |
|
1467
|
|
|
|
|
|
|
# for every pair of documents. |
|
1468
|
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
=head1 CHANGES |
|
1470
|
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
Version 1.70: All of the changes made in this version affect only that part of the |
|
1472
|
|
|
|
|
|
|
module that is used for calculating precision-vs.-recall curve for the estimation of |
|
1473
|
|
|
|
|
|
|
MAP (Mean Average Precision). The new formulas that go into estimating MAP are |
|
1474
|
|
|
|
|
|
|
presented in the author's tutorial on significance testing. Additionally, when |
|
1475
|
|
|
|
|
|
|
estimating the average retrieval precision for a query, this version explicitly |
|
1476
|
|
|
|
|
|
|
disregards all documents that have zero similarity with the query. |
|
1477
|
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
Version 1.62 removes the Perl version restriction on the module. This version also |
|
1479
|
|
|
|
|
|
|
fixes two bugs, one in the file scanner code and the other in the |
|
1480
|
|
|
|
|
|
|
precision-and-recall calculator. The file scanner bug was related to the new |
|
1481
|
|
|
|
|
|
|
constructor parameter C<file_types> that was introduced in Version 1.61. And the
|
1482
|
|
|
|
|
|
|
precision-and-recall calculator bug was triggered if a query consisted solely of |
|
1483
|
|
|
|
|
|
|
non-vocabulary words. |
|
1484
|
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
Version 1.61 improves the implementation of the directory scanner to make it more |
|
1486
|
|
|
|
|
|
|
platform independent. Additionally, you are now required to specify in the |
|
1487
|
|
|
|
|
|
|
constructor call the file types to be considered for computing the database model. |
|
1488
|
|
|
|
|
|
|
If, say, you have a large software library and you want only Java and text files to |
|
1489
|
|
|
|
|
|
|
be scanned for creating the VSM (or the LSA) model, you must supply that information |
|
1490
|
|
|
|
|
|
|
to the module by setting the constructor parameter C<file_types> to the anonymous
|
1491
|
|
|
|
|
|
|
list C<['.java', '.txt']>. An additional constructor parameter introduced in this |
|
1492
|
|
|
|
|
|
|
version is C<case_sensitive>. If you set it to 1, that will force the database model
|
1493
|
|
|
|
|
|
|
and query matching to become case sensitive. |
|
1494
|
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
Version 1.60 reflects the fact that people are now more likely to use this module by |
|
1496
|
|
|
|
|
|
|
keeping the model constructed for a corpus in the fast memory (as opposed to storing |
|
1497
|
|
|
|
|
|
|
the models in disk-based hash tables) for its repeated invocation for different |
|
1498
|
|
|
|
|
|
|
queries. As a result, the default value for the constructor option |
|
1499
|
|
|
|
|
|
|
C<save_model_on_disk> was changed from 1 to 0. For those who still wish to store on
|
1500
|
|
|
|
|
|
|
a disk the model that is constructed, the script |
|
1501
|
|
|
|
|
|
|
C shows how you can do that. |
|
1502
|
|
|
|
|
|
|
Other changes in 1.60 include a slight reorganization of the scripts in the |
|
1503
|
|
|
|
|
|
|
C<examples> directory. Most scripts now do not by default store their models in
|
1504
|
|
|
|
|
|
|
disk-based hash tables. This reorganization is reflected in the description of the |
|
1505
|
|
|
|
|
|
|
C<examples> directory in this documentation. The basic logic of constructing VSM and
|
1506
|
|
|
|
|
|
|
LSA models and how these are used for retrievals remains unchanged. |
|
1507
|
|
|
|
|
|
|
|
|
1508
|
|
|
|
|
|
|
Version 1.50 incorporates a couple of new features: (1) You now have the option to |
|
1509
|
|
|
|
|
|
|
split camel-cased and underscored words for constructing your vocabulary set; and (2) |
|
1510
|
|
|
|
|
|
|
Storing the VSM and LSA models in database files on the disk is now optional. The |
|
1511
|
|
|
|
|
|
|
second feature, in particular, should prove useful to those who are using this module |
|
1512
|
|
|
|
|
|
|
for large collections of documents. |
|
1513
|
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
|
1515
|
|
|
|
|
|
|
C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
|
1516
|
|
|
|
|
|
|
You can get a better sense of the overall vocabulary being used by the module for |
|
1517
|
|
|
|
|
|
|
file retrieval by examining the contents of a dump file whose name is supplied as an |
|
1518
|
|
|
|
|
|
|
argument to C<write_corpus_vocab_to_file()>.
|
1519
|
|
|
|
|
|
|
|
|
1520
|
|
|
|
|
|
|
Version 1.41 downshifts the required version of the PDL module. Also cleaned up are |
|
1521
|
|
|
|
|
|
|
the dependencies between this module and the submodules of PDL. |
|
1522
|
|
|
|
|
|
|
|
|
1523
|
|
|
|
|
|
|
Version 1.4 makes it easier for a user to calculate a similarity matrix over all the |
|
1524
|
|
|
|
|
|
|
documents in the corpus. The elements of such a matrix express pairwise similarities |
|
1525
|
|
|
|
|
|
|
between the documents. The pairwise similarities are based on the dot product of two |
|
1526
|
|
|
|
|
|
|
document vectors divided by the product of the vector magnitudes. The 'examples' |
|
1527
|
|
|
|
|
|
|
directory contains two scripts to illustrate how such matrices can be calculated by |
|
1528
|
|
|
|
|
|
|
the user. The similarity matrix is output as a CSV file. |
|
1529
|
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a |
|
1531
|
|
|
|
|
|
|
document file. What that means is that the words that appear in most of the documents |
|
1532
|
|
|
|
|
|
|
get reduced weighting since such words are non-discriminatory with respect to the |
|
1533
|
|
|
|
|
|
|
retrieval of the documents. A typical formula that is used to calculate the IDF |
|
1534
|
|
|
|
|
|
|
weight for a word is the logarithm of the ratio of the total number of documents to |
|
1535
|
|
|
|
|
|
|
the number of documents in which the word appears. So if a word were to appear in |
|
1536
|
|
|
|
|
|
|
all the documents, its IDF multiplier would be zero in the vector representation of a |
|
1537
|
|
|
|
|
|
|
document. If so desired, you can turn off the IDF weighting of the words by |
|
1538
|
|
|
|
|
|
|
explicitly setting the constructor parameter C<use_idf_filter> to zero.
|
1539
|
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
Version 1.2 includes a code correction and some general code and documentation |
|
1541
|
|
|
|
|
|
|
cleanup. |
|
1542
|
|
|
|
|
|
|
|
|
1543
|
|
|
|
|
|
|
With Version 1.1, you can access the retrieval precision results so that you can |
|
1544
|
|
|
|
|
|
|
compare two different retrieval algorithms (VSM or LSA with different choices for |
|
1545
|
|
|
|
|
|
|
some of the constructor parameters) with significance testing. (Version 1.0 merely |
|
1546
|
|
|
|
|
|
|
sent those results to standard output, typically your terminal window.) In Version |
|
1547
|
|
|
|
|
|
|
1.1, the new script B<significance_testing.pl> in the 'examples' directory
|
1548
|
|
|
|
|
|
|
illustrates significance testing with Randomization and with Student's Paired t-Test. |
|
1549
|
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
1551
|
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
B<Algorithm::VSM> is a I<perl5> module for constructing a Vector Space Model (VSM) or
|
1553
|
|
|
|
|
|
|
a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred |
|
1554
|
|
|
|
|
|
|
to as a corpus, and then retrieving the documents in response to search words in a |
|
1555
|
|
|
|
|
|
|
query. |
|
1556
|
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
VSM and LSA models have been around for a long time in the Information Retrieval (IR) |
|
1558
|
|
|
|
|
|
|
community. More recently such models have been shown to be effective in retrieving |
|
1559
|
|
|
|
|
|
|
files/documents from software libraries. For an account of this research that was |
|
1560
|
|
|
|
|
|
|
presented by Shivani Rao and the author of this module at the 2011 Mining Software |
|
1561
|
|
|
|
|
|
|
Repositories conference, see L. |
|
1562
|
|
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2) |
|
1564
|
|
|
|
|
|
|
Stemming the words so extracted and eliminating the designated stop words from the |
|
1565
|
|
|
|
|
|
|
vocabulary. Stemming means that closely related words like 'programming' and |
|
1566
|
|
|
|
|
|
|
'programs' are reduced to the common root word 'program' and the stop words are the |
|
1567
|
|
|
|
|
|
|
non-discriminating words that can be expected to exist in virtually all the |
|
1568
|
|
|
|
|
|
|
documents. (3) Constructing document vectors for the individual files in the corpus |
|
1569
|
|
|
|
|
|
|
--- the document vectors taken together constitute what is usually referred to as a |
|
1570
|
|
|
|
|
|
|
'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to |
|
1571
|
|
|
|
|
|
|
factor out the effect of document size and, if desired, multiplying the term |
|
1572
|
|
|
|
|
|
|
frequencies by the IDF (Inverse Document Frequency) values for the words to reduce |
|
1573
|
|
|
|
|
|
|
the weight of the words that appear in a large number of documents. (5) Constructing |
|
1574
|
|
|
|
|
|
|
a query vector for the search query after the query is subject to the same stemming |
|
1575
|
|
|
|
|
|
|
and stop-word elimination rules that were applied to the corpus. And, lastly, (6) |
|
1576
|
|
|
|
|
|
|
Using a similarity metric to return the set of documents that are most similar to the |
|
1577
|
|
|
|
|
|
|
query vector. The commonly used similarity metric is one based on the cosine |
|
1578
|
|
|
|
|
|
|
distance between two vectors. Also note that all the vectors mentioned here are of |
|
1579
|
|
|
|
|
|
|
the same size, the size of the vocabulary. An element of a vector is the frequency |
|
1580
|
|
|
|
|
|
|
of occurrence of the word corresponding to that position in the vector. |
|
1581
|
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one |
|
1583
|
|
|
|
|
|
|
step further by subjecting the term-frequency matrix for the corpus to singular value |
|
1584
|
|
|
|
|
|
|
decomposition (SVD). By retaining only a subset of the singular values (usually the |
|
1585
|
|
|
|
|
|
|
N largest for some value of N), you can construct reduced-dimensionality vectors for |
|
1586
|
|
|
|
|
|
|
the documents and the queries. In VSM, as mentioned above, the size of the document |
|
1587
|
|
|
|
|
|
|
and the query vectors is equal to the size of the vocabulary. For large corpora, |
|
1588
|
|
|
|
|
|
|
this size may involve tens of thousands of words --- this can slow down the VSM |
|
1589
|
|
|
|
|
|
|
modeling and retrieval process. So you are very likely to get faster performance |
|
1590
|
|
|
|
|
|
|
with retrieval based on LSA modeling, especially if you store the model once |
|
1591
|
|
|
|
|
|
|
constructed in a database file on the disk and carry out retrievals using the |
|
1592
|
|
|
|
|
|
|
disk-based model. |
|
1593
|
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
|
|
1595
|
|
|
|
|
|
|
=head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL? |
|
1596
|
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
This module has only been tested for software retrieval. For more general text |
|
1598
|
|
|
|
|
|
|
retrieval, you would need to replace the simple stemmer used in the module by one |
|
1599
|
|
|
|
|
|
|
based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the |
|
1600
|
|
|
|
|
|
|
list of stop words appropriate to the text corpora of interest to you. As previously |
|
1601
|
|
|
|
|
|
|
mentioned, the stop words are the commonly occurring words that do not carry much |
|
1602
|
|
|
|
|
|
|
discriminatory power from the standpoint of distinguishing between the documents. |
|
1603
|
|
|
|
|
|
|
See the file 'stop_words.txt' in the 'examples' directory for how such a file must be |
|
1604
|
|
|
|
|
|
|
formatted. |
|
1605
|
|
|
|
|
|
|
|
|
1606
|
|
|
|
|
|
|
|
|
1607
|
|
|
|
|
|
|
=head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA? |
|
1608
|
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
It is not uncommon for large software libraries to consist of tens of thousands of |
|
1610
|
|
|
|
|
|
|
documents that include source-code files, documentation files, README files, |
|
1611
|
|
|
|
|
|
|
configuration files, etc. The bug-localization work presented recently by Shivani |
|
1612
|
|
|
|
|
|
|
Rao and this author at the 2011 Mining Software Repository conference (MSR11) was |
|
1613
|
|
|
|
|
|
|
based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary |
|
1614
|
|
|
|
|
|
|
size of 7553 unique words. (Here is a link to this work: |
|
1615
|
|
|
|
|
|
|
L. Also note that the iBUGS dataset |
|
1616
|
|
|
|
|
|
|
was originally put together by V. Dallmeier and T. Zimmermann for the evaluation of |
|
1617
|
|
|
|
|
|
|
automated bug detection and localization tools.) If C<V> is the size of the
|
1618
|
|
|
|
|
|
|
vocabulary and C<M> the number of the documents in the corpus, the size of each
|
1619
|
|
|
|
|
|
|
vector will be C<V> and size of the term-frequency matrix for the entire corpus will
|
1620
|
|
|
|
|
|
|
be C<V>xC<M>. So if you were to duplicate the bug localization experiments in
|
1621
|
|
|
|
|
|
|
L you would be dealing with vectors of |
|
1622
|
|
|
|
|
|
|
size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers |
|
1623
|
|
|
|
|
|
|
to really large libraries/corpora, we are obviously talking about very large matrices |
|
1624
|
|
|
|
|
|
|
for SVD decomposition. For large libraries/corpora, it would be best to store away |
|
1625
|
|
|
|
|
|
|
the model in a disk file and to base all subsequent retrievals on the disk-stored |
|
1626
|
|
|
|
|
|
|
models. The 'examples' directory contains scripts that carry out retrievals on the |
|
1627
|
|
|
|
|
|
|
basis of disk-based models. Further speedup in retrieval can be achieved by using |
|
1628
|
|
|
|
|
|
|
LSA to create reduced-dimensionality representations for the documents and by basing |
|
1629
|
|
|
|
|
|
|
retrievals on the stored versions of such reduced-dimensionality representations. |
|
1630
|
|
|
|
|
|
|
|
|
1631
|
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
=head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS |
|
1633
|
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
The performance of a retrieval algorithm is typically measured by two properties: |
|
1635
|
|
|
|
|
|
|
C<Precision> and C<Recall>. As mentioned in my tutorial
|
1636
|
|
|
|
|
|
|
L, at a given |
|
1637
|
|
|
|
|
|
|
rank C<r>, C<Precision> is the ratio of the number of retrieved documents that are
|
1638
|
|
|
|
|
|
|
relevant to the total number of retrieved documents up to that rank. And, along the |
|
1639
|
|
|
|
|
|
|
same lines, C<Recall> at a given rank C<r> is the ratio of the number of retrieved
|
1640
|
|
|
|
|
|
|
documents that are relevant to the total number of relevant documents. The Average |
|
1641
|
|
|
|
|
|
|
Precision associated with a query is the average of all the Precision-at-rank values |
|
1642
|
|
|
|
|
|
|
for all the documents relevant to that query. When query specific Average Precision |
|
1643
|
|
|
|
|
|
|
is averaged over all the queries, you get Mean Average Precision (MAP) as a |
|
1644
|
|
|
|
|
|
|
single-number characterizer of the retrieval power of an algorithm for a given |
|
1645
|
|
|
|
|
|
|
corpus. For an oracle, the value of MAP should be 1.0. On the other hand, for |
|
1646
|
|
|
|
|
|
|
purely random retrieval from a corpus, the value of MAP will be inversely |
|
1647
|
|
|
|
|
|
|
proportional to the size of the corpus. (See the discussion in |
|
1648
|
|
|
|
|
|
|
L for further |
|
1649
|
|
|
|
|
|
|
explanation on these retrieval precision evaluators.) |
|
1650
|
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
This module includes methods that allow you to carry out these retrieval accuracy |
|
1652
|
|
|
|
|
|
|
measurements using the relevancy judgments supplied through a disk file. If |
|
1653
|
|
|
|
|
|
|
human-supplied relevancy judgments are not available, the module will be happy to |
|
1654
|
|
|
|
|
|
|
estimate relevancies for you just by determining the number of query words that exist |
|
1655
|
|
|
|
|
|
|
in a document. Note, however, that relevancy judgments estimated in this manner |
|
1656
|
|
|
|
|
|
|
cannot be trusted. That is because ultimately it is the humans who are the best |
|
1657
|
|
|
|
|
|
|
judges of the relevancies of documents to queries. The humans bring to bear semantic |
|
1658
|
|
|
|
|
|
|
considerations on the relevancy determination problem that are beyond the scope of |
|
1659
|
|
|
|
|
|
|
this module. |
|
1660
|
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
=head1 METHODS |
|
1663
|
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
The module provides the following methods for constructing VSM and LSA models of a |
|
1665
|
|
|
|
|
|
|
corpus, for using the models thus constructed for retrieval, and for carrying out |
|
1666
|
|
|
|
|
|
|
precision versus recall calculations for the determination of retrieval accuracy on |
|
1667
|
|
|
|
|
|
|
the corpora of interest to you. |
|
1668
|
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
=over |
|
1670
|
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
=item B<new()>
|
1672
|
|
|
|
|
|
|
|
|
1673
|
|
|
|
|
|
|
A call to C<new()> constructs a new instance of the C<Algorithm::VSM> class:
|
1674
|
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
my $vsm = Algorithm::VSM->new( |
|
1676
|
|
|
|
|
|
|
break_camelcased_and_underscored => 1, |
|
1677
|
|
|
|
|
|
|
case_sensitive => 0, |
|
1678
|
|
|
|
|
|
|
corpus_directory => "", |
|
1679
|
|
|
|
|
|
|
corpus_vocab_db => "corpus_vocab_db", |
|
1680
|
|
|
|
|
|
|
doc_vectors_db => "doc_vectors_db", |
|
1681
|
|
|
|
|
|
|
file_types => $my_file_types, |
|
1682
|
|
|
|
|
|
|
lsa_svd_threshold => 0.01, |
|
1683
|
|
|
|
|
|
|
max_number_retrievals => 10, |
|
1684
|
|
|
|
|
|
|
min_word_length => 4, |
|
1685
|
|
|
|
|
|
|
normalized_doc_vecs_db => "normalized_doc_vecs_db", |
|
1686
|
|
|
|
|
|
|
query_file => "", |
|
1687
|
|
|
|
|
|
|
relevancy_file => $relevancy_file, |
|
1688
|
|
|
|
|
|
|
relevancy_threshold => 5, |
|
1689
|
|
|
|
|
|
|
save_model_on_disk => 0, |
|
1690
|
|
|
|
|
|
|
stop_words_file => "", |
|
1691
|
|
|
|
|
|
|
use_idf_filter => 1, |
|
1692
|
|
|
|
|
|
|
want_stemming => 1, |
|
1693
|
|
|
|
|
|
|
); |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
The values shown on the right side of the big arrows are the B<default values for the
|
|
1696
|
|
|
|
|
|
|
parameters>. The value supplied through the variable C<$my_file_types> would be |
|
1697
|
|
|
|
|
|
|
something like C<['.java', '.txt']> if, say, you wanted only Java and text files to |
|
1698
|
|
|
|
|
|
|
be included in creating the database model. The following nested list will now |
|
1699
|
|
|
|
|
|
|
describe each of the constructor parameters shown above: |
|
1700
|
|
|
|
|
|
|
|
|
1701
|
|
|
|
|
|
|
=over 16 |
|
1702
|
|
|
|
|
|
|
|
|
1703
|
|
|
|
|
|
|
=item I<break_camelcased_and_underscored>
|
1704
|
|
|
|
|
|
|
|
|
1705
|
|
|
|
|
|
|
The parameter B<break_camelcased_and_underscored> when set causes the
|
1706
|
|
|
|
|
|
|
underscored and camel-cased words to be split. By default the parameter is |
|
1707
|
|
|
|
|
|
|
set. So if you don't want such words to be split, you must set it |
|
1708
|
|
|
|
|
|
|
explicitly to 0. |
|
1709
|
|
|
|
|
|
|
|
|
1710
|
|
|
|
|
|
|
=item I<case_sensitive>
|
1711
|
|
|
|
|
|
|
|
|
1712
|
|
|
|
|
|
|
When set to 1, this parameter forces the module to maintain the case of the terms in |
|
1713
|
|
|
|
|
|
|
the corpus files when creating the vocabulary and the document vectors. Setting |
|
1714
|
|
|
|
|
|
|
C<case_sensitive> to 1 also causes the query matching to become case sensitive.
|
1715
|
|
|
|
|
|
|
(This constructor parameter was introduced in Version 1.61.) |
|
1716
|
|
|
|
|
|
|
|
|
1717
|
|
|
|
|
|
|
=item I<corpus_directory>
|
1718
|
|
|
|
|
|
|
|
|
1719
|
|
|
|
|
|
|
The parameter B<corpus_directory> points to the root of the directory of documents
|
1720
|
|
|
|
|
|
|
for which you want to create a VSM or LSA model. |
|
1721
|
|
|
|
|
|
|
|
|
1722
|
|
|
|
|
|
|
=item I<corpus_vocab_db>
|
1723
|
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
The parameter B<corpus_vocab_db> is for naming the DBM in which the corpus vocabulary
|
1725
|
|
|
|
|
|
|
will be stored after it is subject to stemming and the elimination of stop words. |
|
1726
|
|
|
|
|
|
|
Once a disk-based VSM model is created and stored away in the file named by this |
|
1727
|
|
|
|
|
|
|
parameter and the parameter to be described next, it can subsequently be used |
|
1728
|
|
|
|
|
|
|
directly for speedier retrieval. |
|
1729
|
|
|
|
|
|
|
|
|
1730
|
|
|
|
|
|
|
=item I<doc_vectors_db>
|
1731
|
|
|
|
|
|
|
|
|
1732
|
|
|
|
|
|
|
The database named by B<doc_vectors_db> stores the document vector representation for
|
1733
|
|
|
|
|
|
|
each document in the corpus. Each document vector has the same size as the |
|
1734
|
|
|
|
|
|
|
corpus-wide vocabulary; each element of such a vector is the number of occurrences of |
|
1735
|
|
|
|
|
|
|
the word that corresponds to that position in the vocabulary vector. |
|
1736
|
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
=item I<file_types>
|
1738
|
|
|
|
|
|
|
|
|
1739
|
|
|
|
|
|
|
This parameter tells the module what types of files in the corpus directory you want |
|
1740
|
|
|
|
|
|
|
scanned for creating the database model. The value supplied for this parameter is an |
|
1741
|
|
|
|
|
|
|
anonymous list of the file suffixes for the file types. For example, if you wanted |
|
1742
|
|
|
|
|
|
|
only Java and text files to be scanned, you will set this parameter to C<['.java', |
|
1743
|
|
|
|
|
|
|
'.txt']>. The module throws an exception if this parameter is left unspecified. |
|
1744
|
|
|
|
|
|
|
(This constructor parameter was introduced in Version 1.61.) |
|
1745
|
|
|
|
|
|
|
|
|
1746
|
|
|
|
|
|
|
=item I<lsa_svd_threshold>
|
1747
|
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
|
1749
|
|
|
|
|
|
|
smaller than this threshold fraction of the largest singular value. This plays a |
|
1750
|
|
|
|
|
|
|
critical role in creating reduced-dimensionality document vectors in LSA modeling of |
|
1751
|
|
|
|
|
|
|
a corpus. |
|
1752
|
|
|
|
|
|
|
|
|
1753
|
|
|
|
|
|
|
=item I<max_number_retrievals>
|
1754
|
|
|
|
|
|
|
|
|
1755
|
|
|
|
|
|
|
The constructor parameter B<max_number_retrievals> stands for what it means.
|
1756
|
|
|
|
|
|
|
|
|
1757
|
|
|
|
|
|
|
=item I<min_word_length>
|
1758
|
|
|
|
|
|
|
|
|
1759
|
|
|
|
|
|
|
The parameter B<min_word_length> sets the minimum number of characters in a
|
1760
|
|
|
|
|
|
|
word in order for it to be included in the corpus vocabulary. |
|
1761
|
|
|
|
|
|
|
|
|
1762
|
|
|
|
|
|
|
=item I<normalized_doc_vecs_db>
|
1763
|
|
|
|
|
|
|
|
|
1764
|
|
|
|
|
|
|
The database named by B<normalized_doc_vecs_db> stores the normalized document
|
1765
|
|
|
|
|
|
|
vectors. Normalization consists of factoring out the size of the documents by |
|
1766
|
|
|
|
|
|
|
dividing the term frequency for each word in a document by the number of words in the |
|
1767
|
|
|
|
|
|
|
document, and then multiplying the result by the idf (Inverse Document Frequency) |
|
1768
|
|
|
|
|
|
|
value for the word. |
|
1769
|
|
|
|
|
|
|
|
|
1770
|
|
|
|
|
|
|
=item I<query_file>
|
1771
|
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
The parameter B<query_file> points to a file that contains the queries to be used for
|
1773
|
|
|
|
|
|
|
calculating retrieval performance with C and C numbers. The format |
|
1774
|
|
|
|
|
|
|
of the query file must be as shown in the sample file C in the |
|
1775
|
|
|
|
|
|
|
'examples' directory. |
|
1776
|
|
|
|
|
|
|
|
|
1777
|
|
|
|
|
|
|
=item I<relevancy_file>
|
1778
|
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
This option names the disk file for storing the relevancy judgments. |
|
1780
|
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
=item I<relevancy_threshold>
|
1782
|
|
|
|
|
|
|
|
|
1783
|
|
|
|
|
|
|
The constructor parameter B<relevancy_threshold> is used for automatic determination
|
1784
|
|
|
|
|
|
|
of document relevancies to queries on the basis of the number of occurrences of query |
|
1785
|
|
|
|
|
|
|
words in a document. You can exercise control over the process of determining |
|
1786
|
|
|
|
|
|
|
relevancy of a document to a query by giving a suitable value to the constructor |
|
1787
|
|
|
|
|
|
|
parameter B. A document is considered relevant to a query only |
|
1788
|
|
|
|
|
|
|
when the document contains at least B number of query words. |
|
1789
|
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
=item I<save_model_on_disk>
|
1791
|
|
|
|
|
|
|
|
|
1792
|
|
|
|
|
|
|
The constructor parameter B<save_model_on_disk> will cause the basic
|
1793
|
|
|
|
|
|
|
information about the VSM and the LSA models to be stored on the disk. |
|
1794
|
|
|
|
|
|
|
Subsequently, any retrievals can be carried out from the disk-based model. |
|
1795
|
|
|
|
|
|
|
|
|
1796
|
|
|
|
|
|
|
=item I<stop_words_file>
|
1797
|
|
|
|
|
|
|
|
|
1798
|
|
|
|
|
|
|
The parameter B<stop_words_file> is for naming the file that contains the stop words
|
1799
|
|
|
|
|
|
|
that you do not wish to include in the corpus vocabulary. The format of this file |
|
1800
|
|
|
|
|
|
|
must be as shown in the sample file C in the 'examples' directory. |
|
1801
|
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
=item I<use_idf_filter>
|
1803
|
|
|
|
|
|
|
|
|
1804
|
|
|
|
|
|
|
The constructor parameter B<use_idf_filter> is set by default. If you want
|
1805
|
|
|
|
|
|
|
to turn off the normalization of the document vectors, including turning |
|
1806
|
|
|
|
|
|
|
off the weighting of the term frequencies of the words by their idf values, |
|
1807
|
|
|
|
|
|
|
you must set this parameter explicitly to 0. |
|
1808
|
|
|
|
|
|
|
|
|
1809
|
|
|
|
|
|
|
=item I<want_stemming>
|
1810
|
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
The boolean parameter B<want_stemming> determines whether or not the words extracted
|
1812
|
|
|
|
|
|
|
from the documents would be subject to stemming. As mentioned elsewhere, stemming |
|
1813
|
|
|
|
|
|
|
means that related words like 'programming' and 'programs' would both be reduced to |
|
1814
|
|
|
|
|
|
|
the root word 'program'. |
|
1815
|
|
|
|
|
|
|
|
|
1816
|
|
|
|
|
|
|
=back |
|
1817
|
|
|
|
|
|
|
|
|
1818
|
|
|
|
|
|
|
=begin html |
|
1819
|
|
|
|
|
|
|
|
|
1820
|
|
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
|
|
|
1822
|
|
|
|
|
|
|
=end html |
|
1823
|
|
|
|
|
|
|
|
|
1824
|
|
|
|
|
|
|
=item B<construct_lsa_model()>
|
1825
|
|
|
|
|
|
|
|
|
1826
|
|
|
|
|
|
|
You call this subroutine for constructing an LSA model for your corpus |
|
1827
|
|
|
|
|
|
|
after you have extracted the corpus vocabulary and constructed document |
|
1828
|
|
|
|
|
|
|
vectors: |
|
1829
|
|
|
|
|
|
|
|
|
1830
|
|
|
|
|
|
|
$vsm->construct_lsa_model(); |
|
1831
|
|
|
|
|
|
|
|
|
1832
|
|
|
|
|
|
|
The SVD decomposition that is carried out in LSA model construction uses the |
|
1833
|
|
|
|
|
|
|
constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
|
1834
|
|
|
|
|
|
|
to retain for the LSA model. A singular value is retained only if it is larger than the
|
1835
|
|
|
|
|
|
|
C<lsa_svd_threshold> fraction of the largest singular value.
|
1836
|
|
|
|
|
|
|
|
|
1837
|
|
|
|
|
|
|
|
|
1838
|
|
|
|
|
|
|
=item B<display_average_precision_for_queries_and_map()>
|
1839
|
|
|
|
|
|
|
|
|
1840
|
|
|
|
|
|
|
The Average Precision for a query is the average of the Precision-at-rank values |
|
1841
|
|
|
|
|
|
|
associated with each of the corpus documents relevant to the query. The mean of the |
|
1842
|
|
|
|
|
|
|
Average Precision values for all the queries is the Mean Average Precision (MAP). |
|
1843
|
|
|
|
|
|
|
The C<Average Precision> values for the queries and the overall C<MAP> can be printed
|
1844
|
|
|
|
|
|
|
out by calling |
|
1845
|
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
$vsm->display_average_precision_for_queries_and_map(); |
|
1847
|
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
|
|
1849
|
|
|
|
|
|
|
=item B<display_corpus_vocab()>
|
1850
|
|
|
|
|
|
|
|
|
1851
|
|
|
|
|
|
|
If you would like to see corpus vocabulary as constructed by the previous call, make |
|
1852
|
|
|
|
|
|
|
the call |
|
1853
|
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
$vsm->display_corpus_vocab(); |
|
1855
|
|
|
|
|
|
|
|
|
1856
|
|
|
|
|
|
|
Note that this is a useful thing to do only on small test corpora. If you need |
|
1857
|
|
|
|
|
|
|
to examine the vocabulary for a large corpus, call the two methods listed below. |
|
1858
|
|
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
|
=item B<display_corpus_vocab_size()>
|
1861
|
|
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
If you would like for the module to print out in your terminal window the size of the |
|
1863
|
|
|
|
|
|
|
vocabulary, make the call |
|
1864
|
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
$vsm->display_corpus_vocab_size(); |
|
1866
|
|
|
|
|
|
|
|
|
1867
|
|
|
|
|
|
|
|
|
1868
|
|
|
|
|
|
|
=item B<display_doc_relevancies()>
|
1869
|
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
If you would like to see the document relevancies generated by the previous method, |
|
1871
|
|
|
|
|
|
|
you can call |
|
1872
|
|
|
|
|
|
|
|
|
1873
|
|
|
|
|
|
|
$vsm->display_doc_relevancies() |
|
1874
|
|
|
|
|
|
|
|
|
1875
|
|
|
|
|
|
|
|
|
1876
|
|
|
|
|
|
|
=item B<display_doc_vectors()>
|
1877
|
|
|
|
|
|
|
|
|
1878
|
|
|
|
|
|
|
If you would like to see the document vectors constructed by the previous call, make |
|
1879
|
|
|
|
|
|
|
the call: |
|
1880
|
|
|
|
|
|
|
|
|
1881
|
|
|
|
|
|
|
$vsm->display_doc_vectors(); |
|
1882
|
|
|
|
|
|
|
|
|
1883
|
|
|
|
|
|
|
Note that this is a useful thing to do only on small test corpora. If you must call |
|
1884
|
|
|
|
|
|
|
this method on a large corpus, you might wish to direct the output to a file. |
|
1885
|
|
|
|
|
|
|
|
|
1886
|
|
|
|
|
|
|
|
|
1887
|
|
|
|
|
|
|
=item B<display_inverse_document_frequencies()>
|
1888
|
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
You can display the idf value associated with each word in the corpus by |
|
1890
|
|
|
|
|
|
|
|
|
1891
|
|
|
|
|
|
|
$vsm->display_inverse_document_frequencies(); |
|
1892
|
|
|
|
|
|
|
|
|
1893
|
|
|
|
|
|
|
The idf of a word in the corpus is calculated typically as the logarithm of the ratio |
|
1894
|
|
|
|
|
|
|
of the total number of documents in the corpus to the number of documents in which |
|
1895
|
|
|
|
|
|
|
the word appears (with protection built in to prevent division by zero). Ideally, if |
|
1896
|
|
|
|
|
|
|
a word appears in all the documents, its idf would be small, close to zero. Words |
|
1897
|
|
|
|
|
|
|
with small idf values are non-discriminatory and should get reduced weighting in |
|
1898
|
|
|
|
|
|
|
document retrieval. |
|
1899
|
|
|
|
|
|
|
|
|
1900
|
|
|
|
|
|
|
|
|
1901
|
|
|
|
|
|
|
=item B<display_normalized_doc_vectors()>
|
1902
|
|
|
|
|
|
|
|
|
1903
|
|
|
|
|
|
|
If you would like to see the normalized document vectors, make the call: |
|
1904
|
|
|
|
|
|
|
|
|
1905
|
|
|
|
|
|
|
$vsm->display_normalized_doc_vectors(); |
|
1906
|
|
|
|
|
|
|
|
|
1907
|
|
|
|
|
|
|
See the comment made previously as to what is meant by the normalization of a |
|
1908
|
|
|
|
|
|
|
document vector. |
|
1909
|
|
|
|
|
|
|
|
|
1910
|
|
|
|
|
|
|
|
|
1911
|
|
|
|
|
|
|
=item B<display_precision_vs_recall_for_queries()>
|
1912
|
|
|
|
|
|
|
|
|
1913
|
|
|
|
|
|
|
A call to C<precision_and_recall_calculator()> will normally be followed by the
|
1914
|
|
|
|
|
|
|
following call |
|
1915
|
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
$vsm->display_precision_vs_recall_for_queries(); |
|
1917
|
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
for displaying the C<Precision> and C<Recall> values.
|
1919
|
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
|
|
1921
|
|
|
|
|
|
|
=item B<display_retrievals()>
|
1922
|
|
|
|
|
|
|
|
|
1923
|
|
|
|
|
|
|
You can display the retrieved document names by calling this method using the syntax: |
|
1924
|
|
|
|
|
|
|
|
|
1925
|
|
|
|
|
|
|
$vsm->display_retrievals( $retrievals ); |
|
1926
|
|
|
|
|
|
|
|
|
1927
|
|
|
|
|
|
|
where C<$retrievals> is a reference to the hash returned by a call to one of the |
|
1928
|
|
|
|
|
|
|
C methods. The display method shown here respects the retrieval size |
|
1929
|
|
|
|
|
|
|
constraints expressed by the constructor parameter C<max_number_retrievals>.
|
1930
|
|
|
|
|
|
|
|
|
1931
|
|
|
|
|
|
|
|
|
1932
|
|
|
|
|
|
|
=item B<estimate_doc_relevancies()>
|
1933
|
|
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
|
Before you can carry out precision and recall calculations to test the accuracy of |
|
1935
|
|
|
|
|
|
|
VSM and LSA based retrievals from a corpus, you need to have available the relevancy |
|
1936
|
|
|
|
|
|
|
judgments for the queries. (A relevancy judgment for a query is simply the list of |
|
1937
|
|
|
|
|
|
|
documents relevant to that query.) Relevancy judgments are commonly supplied by the |
|
1938
|
|
|
|
|
|
|
humans who are familiar with the corpus. But if such human-supplied relevance |
|
1939
|
|
|
|
|
|
|
judgments are not available, you can invoke the following method to estimate them: |
|
1940
|
|
|
|
|
|
|
|
|
1941
|
|
|
|
|
|
|
$vsm->estimate_doc_relevancies(); |
|
1942
|
|
|
|
|
|
|
|
|
1943
|
|
|
|
|
|
|
For the above method call, a document is considered to be relevant to a query if it |
|
1944
|
|
|
|
|
|
|
contains several of the query words. As to the minimum number of query words that |
|
1945
|
|
|
|
|
|
|
must exist in a document in order for the latter to be considered relevant, that is |
|
1946
|
|
|
|
|
|
|
determined by the C<relevancy_threshold> parameter in the VSM constructor.
|
1947
|
|
|
|
|
|
|
|
|
1948
|
|
|
|
|
|
|
But note that this estimation of document relevancies to queries is NOT for serious |
|
1949
|
|
|
|
|
|
|
work. The reason for that is because ultimately it is the humans who are the best |
|
1950
|
|
|
|
|
|
|
judges of the relevancies of documents to queries. The humans bring to bear semantic |
|
1951
|
|
|
|
|
|
|
considerations on the relevancy determination problem that are beyond the scope of |
|
1952
|
|
|
|
|
|
|
this module. |
|
1953
|
|
|
|
|
|
|
|
|
1954
|
|
|
|
|
|
|
The generated relevancies are deposited in a file named by the constructor parameter |
|
1955
|
|
|
|
|
|
|
C<relevancy_file>.
|
1956
|
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
|
|
1958
|
|
|
|
|
|
|
=item B<get_all_document_names()>
|
1959
|
|
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
|
If you want to get hold of all the filenames in the corpus in your own script, you |
|
1961
|
|
|
|
|
|
|
can call |
|
1962
|
|
|
|
|
|
|
|
|
1963
|
|
|
|
|
|
|
my @docs = @{$vsm->get_all_document_names()}; |
|
1964
|
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
The array on the left will contain an alphabetized list of the files. |
|
1966
|
|
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
|
|
|
1968
|
|
|
|
|
|
|
=item B<generate_document_vectors()>
|
1969
|
|
|
|
|
|
|
|
|
1970
|
|
|
|
|
|
|
This is a necessary step after the vocabulary used by a corpus is constructed. (Of |
|
1971
|
|
|
|
|
|
|
course, if you will be doing document retrieval through a disk-stored VSM or LSA |
|
1972
|
|
|
|
|
|
|
model, then you do not need to call this method. You construct document vectors |
|
1973
|
|
|
|
|
|
|
through the following call: |
|
1974
|
|
|
|
|
|
|
|
|
1975
|
|
|
|
|
|
|
$vsm->generate_document_vectors(); |
|
1976
|
|
|
|
|
|
|
|
|
1977
|
|
|
|
|
|
|
|
|
1978
|
|
|
|
|
|
|
=item B<get_corpus_vocabulary_and_word_counts()>
|
1979
|
|
|
|
|
|
|
|
|
1980
|
|
|
|
|
|
|
After you have constructed a new instance of the C<Algorithm::VSM> class, you must
|
1981
|
|
|
|
|
|
|
now scan the corpus documents for constructing the corpus vocabulary. This you do by: |
|
1982
|
|
|
|
|
|
|
|
|
1983
|
|
|
|
|
|
|
$vsm->get_corpus_vocabulary_and_word_counts(); |
|
1984
|
|
|
|
|
|
|
|
|
1985
|
|
|
|
|
|
|
The only time you do NOT need to call this method is when you are using a previously |
|
1986
|
|
|
|
|
|
|
constructed disk-stored VSM model for retrieval. |
|
1987
|
|
|
|
|
|
|
|
|
1988
|
|
|
|
|
|
|
|
|
1989
|
|
|
|
|
|
|
=item B<get_query_sorted_average_precision_for_queries()>
|
1990
|
|
|
|
|
|
|
|
|
1991
|
|
|
|
|
|
|
If you want to run significance tests on the retrieval accuracies you obtain on a |
|
1992
|
|
|
|
|
|
|
given corpus and with different algorithms (VSM or LSA with different choices for the |
|
1993
|
|
|
|
|
|
|
constructor parameters), your own script would need access to the average precision |
|
1994
|
|
|
|
|
|
|
data for a set of queries. You can get hold of this data by calling |
|
1995
|
|
|
|
|
|
|
|
|
1996
|
|
|
|
|
|
|
$vsm->get_query_sorted_average_precision_for_queries(); |
|
1997
|
|
|
|
|
|
|
|
|
1998
|
|
|
|
|
|
|
The script C<significance_testing.pl> in the 'examples' directory shows how you can
|
1999
|
|
|
|
|
|
|
use this method for significance testing. |
|
2000
|
|
|
|
|
|
|
|
|
2001
|
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
=item B<pairwise_similarity_for_docs():> |
|
2003
|
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
=item B<pairwise_similarity_for_normalized_docs():> |
|
2005
|
|
|
|
|
|
|
|
|
2006
|
|
|
|
|
|
|
If you would like to compare in your own script any two documents in the corpus, you |
|
2007
|
|
|
|
|
|
|
can call |
|
2008
|
|
|
|
|
|
|
|
|
2009
|
|
|
|
|
|
|
my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2"); |
|
2010
|
|
|
|
|
|
|
or |
|
2011
|
|
|
|
|
|
|
my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2"); |
|
2012
|
|
|
|
|
|
|
|
|
2013
|
|
|
|
|
|
|
Both these calls return a number that is the dot product of the two document vectors |
|
2014
|
|
|
|
|
|
|
normalized by the product of their magnitudes. The first call uses the regular |
|
2015
|
|
|
|
|
|
|
document vectors and the second the normalized document vectors. |
|
2016
|
|
|
|
|
|
|
|
|
2017
|
|
|
|
|
|
|
|
|
2018
|
|
|
|
|
|
|
=item B<precision_and_recall_calculator():> |
|
2019
|
|
|
|
|
|
|
|
|
2020
|
|
|
|
|
|
|
After you have created or obtained the relevancy judgments for your test queries, you |
|
2021
|
|
|
|
|
|
|
can make the following call to calculate C<Precision> and C<Recall>: |
|
2022
|
|
|
|
|
|
|
|
|
2023
|
|
|
|
|
|
|
$vsm->precision_and_recall_calculator('vsm'); |
|
2024
|
|
|
|
|
|
|
or |
|
2025
|
|
|
|
|
|
|
$vsm->precision_and_recall_calculator('lsa'); |
|
2026
|
|
|
|
|
|
|
|
|
2027
|
|
|
|
|
|
|
depending on whether you are testing VSM-based retrieval or LSA-based retrieval. |
|
2028
|
|
|
|
|
|
|
|
|
2029
|
|
|
|
|
|
|
=item B<retrieve_with_lsa():> |
|
2030
|
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
After you have built an LSA model through the call to C<construct_lsa_model()>, you |
|
2032
|
|
|
|
|
|
|
can retrieve the document names most similar to the query by: |
|
2033
|
|
|
|
|
|
|
|
|
2034
|
|
|
|
|
|
|
my $retrievals = $vsm->retrieve_with_lsa( \@query ); |
|
2035
|
|
|
|
|
|
|
|
|
2036
|
|
|
|
|
|
|
Subsequently, you can display the retrievals by calling the |
|
2037
|
|
|
|
|
|
|
C method described previously. |
|
2038
|
|
|
|
|
|
|
|
|
2039
|
|
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
|
=item B<retrieve_with_vsm():> |
|
2041
|
|
|
|
|
|
|
|
|
2042
|
|
|
|
|
|
|
After you have constructed a VSM model, you call this method for document retrieval |
|
2043
|
|
|
|
|
|
|
for a given query C<@query>. The call syntax is: |
|
2044
|
|
|
|
|
|
|
|
|
2045
|
|
|
|
|
|
|
my $retrievals = $vsm->retrieve_with_vsm( \@query ); |
|
2046
|
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
The argument, C<@query>, is simply a list of words that you wish to use for |
|
2048
|
|
|
|
|
|
|
retrieval. The method returns a hash whose keys are the document names and whose |
|
2049
|
|
|
|
|
|
|
values the similarity distance between the document and the query. As is commonly |
|
2050
|
|
|
|
|
|
|
the case with VSM, this module uses the cosine similarity distance when comparing a |
|
2051
|
|
|
|
|
|
|
document vector with the query vector. |
|
2052
|
|
|
|
|
|
|
|
|
2053
|
|
|
|
|
|
|
|
|
2054
|
|
|
|
|
|
|
=item B<upload_document_relevancies_from_file():> |
|
2055
|
|
|
|
|
|
|
|
|
2056
|
|
|
|
|
|
|
When human-supplied relevancies are available, you can upload them into the program |
|
2057
|
|
|
|
|
|
|
by calling |
|
2058
|
|
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
|
$vsm->upload_document_relevancies_from_file(); |
|
2060
|
|
|
|
|
|
|
|
|
2061
|
|
|
|
|
|
|
These relevance judgments will be read from a file that is named with the |
|
2062
|
|
|
|
|
|
|
C<relevancy_file> constructor parameter. |
|
2063
|
|
|
|
|
|
|
|
|
2064
|
|
|
|
|
|
|
|
|
2065
|
|
|
|
|
|
|
=item B<upload_normalized_vsm_model_from_disk():> |
|
2066
|
|
|
|
|
|
|
|
|
2067
|
|
|
|
|
|
|
When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and |
|
2068
|
|
|
|
|
|
|
C<generate_document_vectors()>, that automatically deposits the VSM model in the |
|
2069
|
|
|
|
|
|
|
database files named with the constructor parameters C<corpus_vocab_db>, |
|
2070
|
|
|
|
|
|
|
C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out |
|
2071
|
|
|
|
|
|
|
retrieval by directly using this disk-based VSM model for speedier performance. In |
|
2072
|
|
|
|
|
|
|
order to do so, you must upload the disk-based model by |
|
2073
|
|
|
|
|
|
|
|
|
2074
|
|
|
|
|
|
|
$vsm->upload_normalized_vsm_model_from_disk(); |
|
2075
|
|
|
|
|
|
|
|
|
2076
|
|
|
|
|
|
|
Subsequently you call |
|
2077
|
|
|
|
|
|
|
|
|
2078
|
|
|
|
|
|
|
my $retrievals = $vsm->retrieve_with_vsm( \@query ); |
|
2079
|
|
|
|
|
|
|
$vsm->display_retrievals( $retrievals ); |
|
2080
|
|
|
|
|
|
|
|
|
2081
|
|
|
|
|
|
|
for retrieval and for displaying the results. |
|
2082
|
|
|
|
|
|
|
|
|
2083
|
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
=item B<write_corpus_vocab_to_file():> |
|
2085
|
|
|
|
|
|
|
|
|
2086
|
|
|
|
|
|
|
This is the method to call for large text corpora if you would like to examine the |
|
2087
|
|
|
|
|
|
|
vocabulary created. The call syntax is |
|
2088
|
|
|
|
|
|
|
|
|
2089
|
|
|
|
|
|
|
$vsm->write_corpus_vocab_to_file($filename); |
|
2090
|
|
|
|
|
|
|
|
|
2091
|
|
|
|
|
|
|
where C<$filename> is the name of the file that you want the vocabulary to be written |
|
2092
|
|
|
|
|
|
|
out to. This call will also show the frequency of each vocabulary word in your |
|
2093
|
|
|
|
|
|
|
corpus. |
|
2094
|
|
|
|
|
|
|
|
|
2095
|
|
|
|
|
|
|
|
|
2096
|
|
|
|
|
|
|
=back |
|
2097
|
|
|
|
|
|
|
|
|
2098
|
|
|
|
|
|
|
|
|
2099
|
|
|
|
|
|
|
=head1 REQUIRED |
|
2100
|
|
|
|
|
|
|
|
|
2101
|
|
|
|
|
|
|
This module requires the following modules: |
|
2102
|
|
|
|
|
|
|
|
|
2103
|
|
|
|
|
|
|
SDBM_File |
|
2104
|
|
|
|
|
|
|
Storable |
|
2105
|
|
|
|
|
|
|
PDL |
|
2106
|
|
|
|
|
|
|
File::Basename |
|
2107
|
|
|
|
|
|
|
File::Spec::Functions |
|
2108
|
|
|
|
|
|
|
|
|
2109
|
|
|
|
|
|
|
The first two of these are needed for creating disk-based database records for the |
|
2110
|
|
|
|
|
|
|
VSM and LSA models. The third is needed for calculating the SVD of the |
|
2111
|
|
|
|
|
|
|
term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed |
|
2112
|
|
|
|
|
|
|
by the directory scanner to make pathnames platform independent. |
|
2113
|
|
|
|
|
|
|
|
|
2114
|
|
|
|
|
|
|
=head1 EXAMPLES |
|
2115
|
|
|
|
|
|
|
|
|
2116
|
|
|
|
|
|
|
See the 'examples' directory in the distribution for the scripts listed below: |
|
2117
|
|
|
|
|
|
|
|
|
2118
|
|
|
|
|
|
|
=over |
|
2119
|
|
|
|
|
|
|
|
|
2120
|
|
|
|
|
|
|
=item B |
|
2121
|
|
|
|
|
|
|
|
|
2122
|
|
|
|
|
|
|
For basic VSM-based model construction and retrieval, run the script: |
|
2123
|
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
retrieve_with_VSM.pl |
|
2125
|
|
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
|
Starting with version 1.60, this script does not store away the VSM model in |
|
2127
|
|
|
|
|
|
|
disk-based hash tables. If you want your model to be stored on the disk, you must |
|
2128
|
|
|
|
|
|
|
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that. |
|
2129
|
|
|
|
|
|
|
|
|
2130
|
|
|
|
|
|
|
=item B |
|
2131
|
|
|
|
|
|
|
|
|
2132
|
|
|
|
|
|
|
If you want to run an infinite loop for repeated retrievals from a VSM model, run the |
|
2133
|
|
|
|
|
|
|
script |
|
2134
|
|
|
|
|
|
|
|
|
2135
|
|
|
|
|
|
|
continuously_running_VSM_retrieval_engine.pl |
|
2136
|
|
|
|
|
|
|
|
|
2137
|
|
|
|
|
|
|
You can create a script similar to this for doing the same with LSA models. |
|
2138
|
|
|
|
|
|
|
|
|
2139
|
|
|
|
|
|
|
=item B |
|
2140
|
|
|
|
|
|
|
|
|
2141
|
|
|
|
|
|
|
For storing the model information in disk-based DBM files that can subsequently be |
|
2142
|
|
|
|
|
|
|
used for both VSM and LSA retrieval, run the script: |
|
2143
|
|
|
|
|
|
|
|
|
2144
|
|
|
|
|
|
|
retrieve_with_VSM_and_also_create_disk_based_model.pl |
|
2145
|
|
|
|
|
|
|
|
|
2146
|
|
|
|
|
|
|
=item B |
|
2147
|
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
For basic LSA-based model construction and retrieval, run the script: |
|
2149
|
|
|
|
|
|
|
|
|
2150
|
|
|
|
|
|
|
retrieve_with_LSA.pl |
|
2151
|
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
Starting with version 1.60, this script does not store away the model information in |
|
2153
|
|
|
|
|
|
|
disk-based hash tables. If you want your model to be stored on the disk, you must |
|
2154
|
|
|
|
|
|
|
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that. |
|
2155
|
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
=item B |
|
2157
|
|
|
|
|
|
|
|
|
2158
|
|
|
|
|
|
|
If you have previously run a script like |
|
2159
|
|
|
|
|
|
|
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script |
|
2160
|
|
|
|
|
|
|
|
|
2161
|
|
|
|
|
|
|
retrieve_with_disk_based_VSM.pl |
|
2162
|
|
|
|
|
|
|
|
|
2163
|
|
|
|
|
|
|
for repeated VSM-based retrievals from a disk-based model. |
|
2164
|
|
|
|
|
|
|
|
|
2165
|
|
|
|
|
|
|
=item B |
|
2166
|
|
|
|
|
|
|
|
|
2167
|
|
|
|
|
|
|
If you have previously run a script like |
|
2168
|
|
|
|
|
|
|
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script |
|
2169
|
|
|
|
|
|
|
|
|
2170
|
|
|
|
|
|
|
retrieve_with_disk_based_LSA.pl |
|
2171
|
|
|
|
|
|
|
|
|
2172
|
|
|
|
|
|
|
for repeated LSA-based retrievals from a disk-based model. |
|
2173
|
|
|
|
|
|
|
|
|
2174
|
|
|
|
|
|
|
=item B |
|
2175
|
|
|
|
|
|
|
|
|
2176
|
|
|
|
|
|
|
To experiment with precision and recall calculations for VSM retrieval, run the |
|
2177
|
|
|
|
|
|
|
script: |
|
2178
|
|
|
|
|
|
|
|
|
2179
|
|
|
|
|
|
|
calculate_precision_and_recall_for_VSM.pl |
|
2180
|
|
|
|
|
|
|
|
|
2181
|
|
|
|
|
|
|
Note that this script will carry out its own estimation of relevancy judgments --- |
|
2182
|
|
|
|
|
|
|
which in most cases would not be a safe thing to do. |
|
2183
|
|
|
|
|
|
|
|
|
2184
|
|
|
|
|
|
|
=item B |
|
2185
|
|
|
|
|
|
|
|
|
2186
|
|
|
|
|
|
|
To experiment with precision and recall calculations for LSA retrieval, run the |
|
2187
|
|
|
|
|
|
|
script: |
|
2188
|
|
|
|
|
|
|
|
|
2189
|
|
|
|
|
|
|
calculate_precision_and_recall_for_LSA.pl |
|
2190
|
|
|
|
|
|
|
|
|
2191
|
|
|
|
|
|
|
Note that this script will carry out its own estimation of relevancy judgments --- |
|
2192
|
|
|
|
|
|
|
which in most cases would not be a safe thing to do. |
|
2193
|
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
=item B
|
|
2195
|
|
|
|
|
|
|
Human-Supplied Relevancies:> |
|
2196
|
|
|
|
|
|
|
|
|
2197
|
|
|
|
|
|
|
Precision and recall calculations for retrieval accuracy determination are best |
|
2198
|
|
|
|
|
|
|
carried out with human-supplied judgments of relevancies of the documents to queries. |
|
2199
|
|
|
|
|
|
|
If such judgments are available, run the script: |
|
2200
|
|
|
|
|
|
|
|
|
2201
|
|
|
|
|
|
|
calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl |
|
2202
|
|
|
|
|
|
|
|
|
2203
|
|
|
|
|
|
|
This script will print out the average precisions for the different test queries and |
|
2204
|
|
|
|
|
|
|
calculate the MAP metric of retrieval accuracy. |
|
2205
|
|
|
|
|
|
|
|
|
2206
|
|
|
|
|
|
|
=item B
|
|
2207
|
|
|
|
|
|
|
Human-Supplied Relevancies:> |
|
2208
|
|
|
|
|
|
|
|
|
2209
|
|
|
|
|
|
|
If human-supplied relevancy judgments are available and you wish to experiment with |
|
2210
|
|
|
|
|
|
|
precision and recall calculations for LSA-based retrieval, run the script: |
|
2211
|
|
|
|
|
|
|
|
|
2212
|
|
|
|
|
|
|
calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl |
|
2213
|
|
|
|
|
|
|
|
|
2214
|
|
|
|
|
|
|
This script will print out the average precisions for the different test queries and |
|
2215
|
|
|
|
|
|
|
calculate the MAP metric of retrieval accuracy. |
|
2216
|
|
|
|
|
|
|
|
|
2217
|
|
|
|
|
|
|
=item B
|
|
2218
|
|
|
|
|
|
|
Randomization or with Student's Paired t-Test:> |
|
2219
|
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
significance_testing.pl randomization |
|
2221
|
|
|
|
|
|
|
|
|
2222
|
|
|
|
|
|
|
or |
|
2223
|
|
|
|
|
|
|
|
|
2224
|
|
|
|
|
|
|
significance_testing.pl t-test |
|
2225
|
|
|
|
|
|
|
|
|
2226
|
|
|
|
|
|
|
Significance testing consists of forming a null hypothesis that the two retrieval |
|
2227
|
|
|
|
|
|
|
algorithms you are considering are the same from a black-box perspective and then |
|
2228
|
|
|
|
|
|
|
calculating what is known as a C. If the C is less than, say, |
|
2229
|
|
|
|
|
|
|
0.05, you reject the null hypothesis. |
|
2230
|
|
|
|
|
|
|
|
|
2231
|
|
|
|
|
|
|
=item B |
|
2232
|
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
calculate_similarity_matrix_for_all_docs.pl |
|
2234
|
|
|
|
|
|
|
|
|
2235
|
|
|
|
|
|
|
or |
|
2236
|
|
|
|
|
|
|
|
|
2237
|
|
|
|
|
|
|
calculate_similarity_matrix_for_all_normalized_docs.pl |
|
2238
|
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
The former uses regular document vectors for calculating the similarity between every |
|
2240
|
|
|
|
|
|
|
pair of documents in the corpus. And the latter uses normalized document vectors for |
|
2241
|
|
|
|
|
|
|
the same purpose. The document order used for row and column indexing of the matrix |
|
2242
|
|
|
|
|
|
|
corresponds to the alphabetic ordering of the document names in the corpus directory. |
|
2243
|
|
|
|
|
|
|
|
|
2244
|
|
|
|
|
|
|
=back |
|
2245
|
|
|
|
|
|
|
|
|
2246
|
|
|
|
|
|
|
|
|
2247
|
|
|
|
|
|
|
=head1 EXPORT |
|
2248
|
|
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
|
None by design. |
|
2250
|
|
|
|
|
|
|
|
|
2251
|
|
|
|
|
|
|
=head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS |
|
2252
|
|
|
|
|
|
|
|
|
2253
|
|
|
|
|
|
|
You have to be careful when carrying out Precision versus Recall calculations if you |
|
2254
|
|
|
|
|
|
|
do not wish to lose the previously created relevancy judgments. Invoking the method |
|
2255
|
|
|
|
|
|
|
C<estimate_doc_relevancies()> in your own script will cause the file C<relevancy.txt> |
|
2256
|
|
|
|
|
|
|
to be overwritten. If you have created a relevancy database and stored it in a file |
|
2257
|
|
|
|
|
|
|
called, say, C<relevancy.txt>, you should make a backup copy of this file before |
|
2258
|
|
|
|
|
|
|
executing a script that calls C<estimate_doc_relevancies()>. |
|
2259
|
|
|
|
|
|
|
|
|
2260
|
|
|
|
|
|
|
=head1 BUGS |
|
2261
|
|
|
|
|
|
|
|
|
2262
|
|
|
|
|
|
|
Please notify the author if you encounter any bugs. When sending email, please place |
|
2263
|
|
|
|
|
|
|
the string 'VSM' in the subject line to get past my spam filter. |
|
2264
|
|
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
|
=head1 INSTALLATION |
|
2266
|
|
|
|
|
|
|
|
|
2267
|
|
|
|
|
|
|
Download the archive from CPAN in any directory of your choice. Unpack the archive |
|
2268
|
|
|
|
|
|
|
with a command that on a Linux machine would look like: |
|
2269
|
|
|
|
|
|
|
|
|
2270
|
|
|
|
|
|
|
tar zxvf Algorithm-VSM-1.70.tar.gz |
|
2271
|
|
|
|
|
|
|
|
|
2272
|
|
|
|
|
|
|
This will create an installation directory for you whose name will be |
|
2273
|
|
|
|
|
|
|
C<Algorithm-VSM-1.70>. Enter this directory and execute the following commands for a |
|
2274
|
|
|
|
|
|
|
standard install of the module if you have root privileges: |
|
2275
|
|
|
|
|
|
|
|
|
2276
|
|
|
|
|
|
|
perl Makefile.PL |
|
2277
|
|
|
|
|
|
|
make |
|
2278
|
|
|
|
|
|
|
make test |
|
2279
|
|
|
|
|
|
|
sudo make install |
|
2280
|
|
|
|
|
|
|
|
|
2281
|
|
|
|
|
|
|
If you do not have root privileges, you can carry out a non-standard install of the |
|
2282
|
|
|
|
|
|
|
module in any directory of your choice by: |
|
2283
|
|
|
|
|
|
|
|
|
2284
|
|
|
|
|
|
|
perl Makefile.PL prefix=/some/other/directory/ |
|
2285
|
|
|
|
|
|
|
make |
|
2286
|
|
|
|
|
|
|
make test |
|
2287
|
|
|
|
|
|
|
make install |
|
2288
|
|
|
|
|
|
|
|
|
2289
|
|
|
|
|
|
|
With a non-standard install, you may also have to set your PERL5LIB environment |
|
2290
|
|
|
|
|
|
|
variable so that this module can find the required other modules. How you do that |
|
2291
|
|
|
|
|
|
|
would depend on what platform you are working on. In order to install this module in |
|
2292
|
|
|
|
|
|
|
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment |
|
2293
|
|
|
|
|
|
|
variable by |
|
2294
|
|
|
|
|
|
|
|
|
2295
|
|
|
|
|
|
|
setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ |
|
2296
|
|
|
|
|
|
|
|
|
2297
|
|
|
|
|
|
|
If I used bash, I'd need to declare: |
|
2298
|
|
|
|
|
|
|
|
|
2299
|
|
|
|
|
|
|
export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/ |
|
2300
|
|
|
|
|
|
|
|
|
2301
|
|
|
|
|
|
|
|
|
2302
|
|
|
|
|
|
|
=head1 THANKS |
|
2303
|
|
|
|
|
|
|
|
|
2304
|
|
|
|
|
|
|
Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their |
|
2305
|
|
|
|
|
|
|
deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in |
|
2306
|
|
|
|
|
|
|
similarity matrix characterization of a corpus. Thanks, Zahn! |
|
2307
|
|
|
|
|
|
|
|
|
2308
|
|
|
|
|
|
|
Several of the recent changes to the module are a result of the feedback I have |
|
2309
|
|
|
|
|
|
|
received from Naveen Kulkarni of Infosys Labs. Thanks, Naveen! |
|
2310
|
|
|
|
|
|
|
|
|
2311
|
|
|
|
|
|
|
Version 1.62 was a result of Slaven Rezic's recommendation that I remove the Perl |
|
2312
|
|
|
|
|
|
|
version restriction on the module since he was able to run it with Perl version |
|
2313
|
|
|
|
|
|
|
5.8.9. Another important reason for v. 1.62 was the discovery of the two bugs |
|
2314
|
|
|
|
|
|
|
mentioned in Changes, one of them brought to my attention by Naveen Kulkarni. |
|
2315
|
|
|
|
|
|
|
|
|
2316
|
|
|
|
|
|
|
=head1 AUTHOR |
|
2317
|
|
|
|
|
|
|
|
|
2318
|
|
|
|
|
|
|
The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy" project |
|
2319
|
|
|
|
|
|
|
with the publication of the book "B<Designing with Objects>" by John-Wiley. If |
|
2320
|
|
|
|
|
|
|
interested, check out his web page at Purdue to find out what the Objects Trilogy |
|
2321
|
|
|
|
|
|
|
project was all about. You might like "B<Designing with Objects>" especially if you |
|
2322
|
|
|
|
|
|
|
enjoyed reading Harry Potter as a kid (or even as an adult, for that matter). The |
|
2323
|
|
|
|
|
|
|
other two books in the trilogy are "B<Programming with Objects>" and "B<Scripting |
|
|
2324
|
|
|
|
|
|
|
with Objects>". |
|
2325
|
|
|
|
|
|
|
|
|
2326
|
|
|
|
|
|
|
For all issues related to this module, contact the author at C<kak@purdue.edu> |
|
2327
|
|
|
|
|
|
|
|
|
2328
|
|
|
|
|
|
|
If you send email, please place the string "VSM" in your subject line to get past the |
|
2329
|
|
|
|
|
|
|
author's spam filter. |
|
2330
|
|
|
|
|
|
|
|
|
2331
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
2332
|
|
|
|
|
|
|
|
|
2333
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it under the |
|
2334
|
|
|
|
|
|
|
same terms as Perl itself. |
|
2335
|
|
|
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
Copyright 2015 Avinash Kak |
|
2337
|
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
=cut |
|
2339
|
|
|
|
|
|
|
|
|
2340
|
|
|
|
|
|
|
|