File Coverage

blib/lib/Algorithm/VSM.pm
Criterion Covered Total %
statement 354 820 43.1
branch 91 324 28.0
condition 25 67 37.3
subroutine 32 55 58.1
pod 24 35 68.5
total 526 1301 40.4


line stmt bran cond sub pod time code
1             package Algorithm::VSM;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::VSM is a Perl module for retrieving the documents from a software
9             # library that match a list of words in a query. The matching criterion used depends
10             # on whether you ask the module to construct a full-dimensionality VSM or a
11             # reduced-dimensionality LSA model for the library.
12             # ---------------------------------------------------------------------------
13              
14 1     1   12635 use 5.10.0;
  1         3  
  1         32  
15 1     1   4 use strict;
  1         1  
  1         25  
16 1     1   3 use warnings;
  1         4  
  1         27  
17 1     1   3 use Carp;
  1         1  
  1         70  
18 1     1   386 use SDBM_File;
  1         1624  
  1         38  
19 1     1   372 use PDL::Lite;
  1         127119  
  1         28  
20 1     1   6 use PDL::MatrixOps;
  1         2  
  1         6  
21 1     1   151 use File::Basename;
  1         1  
  1         68  
22 1     1   441 use File::Spec::Functions qw(rel2abs);
  1         612  
  1         55  
23 1     1   5 use Fcntl;
  1         1  
  1         267  
24 1     1   544 use Storable;
  1         2352  
  1         46  
25 1     1   4 use Cwd;
  1         1  
  1         8326  
26              
27             our $VERSION = '1.61';
28              
29             # for camelcase splits (from perlmonks):
30             my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
31              
32             ################################### Constructor #######################################
33              
34             # Constructor for creating a VSM or LSA model of a corpus. The model instance
35             # returned by the constructor can be used for retrieving documents from the corpus
36             # in response to queries.
37             sub new {
38 2     2 1 490 my ($class, %args) = @_;
39 2         11 my @params = keys %args;
40 2 50       11 croak "\nYou have used a wrong name for a keyword argument " .
41             "--- perhaps a misspelling\n"
42             if _check_for_illegal_params(@params) == 0;
43 2 50 50     5447 bless {
    50 50        
    50 50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      100        
      50        
      50        
44             _corpus_directory => $args{corpus_directory} || "",
45             _save_model_on_disk => $args{save_model_on_disk} || 0,
46             _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
47             $args{break_camelcased_and_underscored} : 1,
48             _corpus_vocab_db => $args{corpus_vocab_db} || "corpus_vocab_db",
49             _doc_vectors_db => $args{doc_vectors_db} || "doc_vectors_db",
50             _normalized_doc_vecs_db => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
51             _stop_words_file => $args{stop_words_file} || "",
52             _case_sensitive => $args{case_sensitive} || 0,
53             _query_file => $args{query_file} || "",
54             _file_types => $args{file_types} || [],
55             _min_word_length => $args{min_word_length} || 4,
56             _want_stemming => $args{want_stemming} || 0,
57             _idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
58             _max_number_retrievals => $args{max_number_retrievals} || 30,
59             _lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01,
60             _relevancy_threshold => exists $args{relevancy_threshold} ?
61             $args{relevancy_threshold} : 1,
62             _relevancy_file => $args{relevancy_file} || "",
63             _debug => $args{debug} || 0,
64             _working_directory => cwd,
65             _vocab_hist_on_disk => {},
66             _vocab_hist => {},
67             _doc_hist_template => {},
68             _corpus_doc_vectors => {},
69             _normalized_doc_vecs => {},
70             _query_vector => {},
71             _stop_words => [],
72             _term_document_matrix => [],
73             _corpus_vocab_done => 0,
74             _scan_dir_for_rels => 0,
75             _vocab_size => undef,
76             _doc_vecs_trunc_lsa => {},
77             _lsa_vec_truncator => undef,
78             _queries_for_relevancy => {},
79             _relevancy_estimates => {},
80             _precision_for_queries => {},
81             _recall_for_queries => {},
82             _map_vals_for_queries => {},
83             _vocab_idf_hist => {},
84             _idf_t => {},
85             _total_num_of_docs => 0,
86             }, $class;
87             }
88              
89              
90             ###################### Get corpus vocabulary and word counts #########################
91              
92             sub get_corpus_vocabulary_and_word_counts {
93 2     2 1 32 my $self = shift;
94 2 50       25 die "You must supply the name of the corpus directory to the constructor"
95             unless $self->{_corpus_directory};
96 2 50       12 print "Scanning the directory '$self->{_corpus_directory}' for\n" .
97             " model construction\n\n" if $self->{_debug};
98 2         18 $self->_scan_directory( $self->{_corpus_directory} );
99 2 50       9 $self->_drop_stop_words() if $self->{_stop_words_file};
100 2 50       6 if ($self->{_debug}) {
101 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
102 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
103             }
104             }
105 2 50       7 if ($self->{_save_model_on_disk}) {
106 0         0 unlink glob "$self->{_corpus_vocab_db}.*";
107 0         0 unlink glob "$self->{_doc_vectors_db}.*";
108 0         0 unlink glob "$self->{_normalized_doc_vecs_db}.*";
109 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
110             $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
111             or die "Can't create DBM files: $!";
112 0         0 foreach (keys %{$self->{_vocab_hist}}) {
  0         0  
113 0         0 $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
114             }
115 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
116             }
117 2         6 $self->{_corpus_vocab_done} = 1;
118 2         3 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  2         13  
119 2 50       6 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
120             if $self->{_debug};
121             # Calculate idf(t):
122 2         2 foreach (keys %{$self->{_vocab_idf_hist}}) {
  2         25  
123 200         373 $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
124             /
125             (1 + $self->{_vocab_idf_hist}->{$_})))
126             / log(10) );
127             }
128             }
129              
130             sub display_corpus_vocab {
131 0     0 1 0 my $self = shift;
132 0         0 die "corpus vocabulary not yet constructed"
133 0 0       0 unless keys %{$self->{_vocab_hist}};
134 0         0 print "\n\nDisplaying corpus vocabulary:\n\n";
135 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
136 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
137 0         0 print "$outstring\n";
138             }
139             }
140              
141             sub display_corpus_vocab_size {
142 0     0 1 0 my $self = shift;
143 0         0 die "corpus vocabulary not yet constructed"
144 0 0       0 unless keys %{$self->{_vocab_hist}};
145 0         0 my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
146 0         0 print "\nSize of the corpus vocabulary: $vocab_size\n\n";
147             }
148              
149             sub write_corpus_vocab_to_file {
150 0     0 1 0 my $self = shift;
151 0         0 my $file = shift;
152 0 0       0 die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
  0         0  
153 0 0       0 open OUT, "> $file"
154             or die "unable to open for output a file with name `$file': $!";
155 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
156 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
157 0         0 print OUT "$outstring\n";
158             }
159 0         0 close OUT;
160             }
161              
162             sub display_inverse_document_frequencies {
163 0     0 1 0 my $self = shift;
164 0         0 die "corpus vocabulary not yet constructed"
165 0 0       0 unless keys %{$self->{_vocab_idf_hist}};
166 0 0       0 print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
167             unless $self->{_idf_filter_option};
168 0         0 print "\n\nDisplaying inverse document frequencies:\n";
169 0         0 foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
  0         0  
170 0         0 my $outstring = sprintf("%30s %d",
171             $_, $self->{_vocab_idf_hist}->{$_});
172 0         0 print "$outstring\n";
173             }
174 0         0 print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
175 0         0 foreach ( sort keys %{$self->{_idf_t}} ) {
  0         0  
176 0         0 my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
177 0         0 print "$outstring\n";
178             }
179             }
180              
181             sub get_all_document_names {
182 0     0 1 0 my $self = shift;
183 0         0 my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
  0         0  
184 0         0 return \@all_files;
185             }
186              
187             ############################ Generate Document Vectors #################################
188              
189             sub generate_document_vectors {
190 2     2 1 28 my $self = shift;
191 2         23 chdir $self->{_working_directory};
192 2         3 foreach ( sort keys %{$self->{_vocab_hist}} ) {
  2         95  
193 200         211 $self->{_doc_hist_template}->{$_} = 0;
194             }
195 2         19 $self->_scan_directory( $self->{_corpus_directory} );
196 2         30 chdir $self->{_working_directory};
197 2 50       27 if ($self->{_save_model_on_disk}) {
198 0 0 0     0 die "You did not specify in the constructor call the names for the diskfiles " .
199             "for storing the disk-based hash tables consisting of document vectors " .
200             "and their normalized versions"
201             unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
202 0         0 eval {
203 0         0 store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
204             };
205 0 0       0 if ($@) {
206 0         0 print "Something went wrong with disk storage of document vectors: $@";
207             }
208 0         0 eval {
209 0         0 store($self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db});
210             };
211 0 0       0 if ($@) {
212 0         0 print "Something wrong with disk storage of normalized doc vecs: $@";
213             }
214             }
215             }
216              
217             sub display_doc_vectors {
218 0     0 1 0 my $self = shift;
219 0         0 die "document vectors not yet constructed"
220 0 0       0 unless keys %{$self->{_corpus_doc_vectors}};
221 0         0 foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
222 0         0 print "\n\ndisplay doc vec for $file:\n";
223 0         0 foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
  0         0  
224 0         0 print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
225             }
226 0         0 my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
  0         0  
227 0         0 print "\nSize of vector for $file: $docvec_size\n";
228             }
229             }
230              
231             sub display_normalized_doc_vectors {
232 0     0 1 0 my $self = shift;
233 0         0 die "normalized document vectors not yet constructed"
234 0 0       0 unless keys %{$self->{_normalized_doc_vecs}};
235 0 0       0 unless ($self->{_idf_filter_option}) {
236 0         0 print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
237 0         0 return;
238             }
239 0         0 foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
  0         0  
240 0         0 print "\n\ndisplay normalized doc vec for $file:\n";
241 0         0 foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
  0         0  
242 0         0 print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
243             }
244 0         0 my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
  0         0  
245 0         0 print "\nSize of normalized vector for $file: $docvec_size\n";
246             }
247             }
248              
249             ######################## Calculate Pairwise Document Similarities ######################
250              
251             # Returns the similarity score for two documents whose actual names are are supplied
252             # as its two arguments.
253             sub pairwise_similarity_for_docs {
254 0     0 1 0 my $self = shift;
255 0         0 my $doc1 = shift;
256 0         0 my $doc2 = shift;
257 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
258 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
259 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
260 0         0 my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
261 0         0 my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
262 0         0 my @vec1 = ();
263 0         0 my @vec2 = ();
264 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
265 0         0 push @vec1, $vec_hash_ref1->{$word};
266 0         0 push @vec2, $vec_hash_ref2->{$word};
267             }
268 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
269 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
270 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
271 0         0 $product /= $vec_mag1 * $vec_mag2;
272 0         0 return $product;
273             }
274              
275             sub pairwise_similarity_for_normalized_docs {
276 0     0 1 0 my $self = shift;
277 0         0 my $doc1 = shift;
278 0         0 my $doc2 = shift;
279 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
280 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
281 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
282 0         0 my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
283 0         0 my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
284 0         0 my @vec1 = ();
285 0         0 my @vec2 = ();
286 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
287 0         0 push @vec1, $vec_hash_ref1->{$word};
288 0         0 push @vec2, $vec_hash_ref2->{$word};
289             }
290 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
291 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
292 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
293 0         0 $product /= $vec_mag1 * $vec_mag2;
294 0         0 return $product;
295             }
296              
297             ############################### Retrieve with VSM Model ################################
298              
299             sub retrieve_with_vsm {
300 1     1 1 18 my $self = shift;
301 1         4 my $query = shift;
302 1         1 my @clean_words;
303 1         5 my $min = $self->{_min_word_length};
304              
305 1 50       10 if ($self->{_break_camelcased_and_underscored}) {
306 1         31 my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
307 1         4 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  8         30  
308 0         0 @clean_words = $self->{_case_sensitive} ?
309 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  8         63  
310 1 100       7 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  8 50       30  
311             } else {
312 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
313 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
314             }
315 1         4 $query = \@clean_words;
316 1 50       51 print "\nYour query words are: @$query\n" if $self->{_debug};
317 1 50       4 if ($self->{_idf_filter_option}) {
318 1         9 die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
319 1         6 unless scalar(keys %{$self->{_vocab_hist}})
320 1 50 50     2 && scalar(keys %{$self->{_normalized_doc_vecs}});
321             } else {
322 0         0 die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
323 0         0 unless scalar(keys %{$self->{_vocab_hist}})
324 0 0 0     0 && scalar(keys %{$self->{_corpus_doc_vectors}});
325             }
326 1         7 foreach ( keys %{$self->{_vocab_hist}} ) {
  1         34  
327 100         127 $self->{_query_vector}->{$_} = 0;
328             }
329 1         7 foreach (@$query) {
330 7 50       12 if ($self->{_case_sensitive}) {
331 0 0       0 $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
332             } else {
333 7 100       21 $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
334             }
335             }
336 1         1 my @query_word_counts = values %{$self->{_query_vector}};
  1         13  
337 1         6 my $query_word_count_total = reduce(\@query_word_counts);
338 1 50       15 die "\nYour query does not contain corpus words. Nothing retrieved.\n"
339             unless $query_word_count_total;
340 1         3 my %retrievals;
341 1 50       4 if ($self->{_idf_filter_option}) {
342 1 50       5 print "\n\nUsing idf filter option for retrieval:\n\n"
343             if $self->{_debug};
344 1         2 foreach (sort {$self->_doc_vec_comparator}
  15         34  
  1         12  
345             keys %{$self->{_normalized_doc_vecs}}) {
346 8         17 $retrievals{$_} = $self->_similarity_to_query($_);
347             }
348             } else {
349 0 0       0 print "\n\nNOT using idf filter option for retrieval:\n\n"
350             if $self->{_debug};
351 0         0 foreach (sort {$self->_doc_vec_comparator}
  0         0  
  0         0  
352             keys %{$self->{_corpus_doc_vectors}}) {
353 0         0 $retrievals{$_} = $self->_similarity_to_query($_);
354             }
355             }
356 1 50       5 if ($self->{_debug}) {
357 0         0 print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
358 0         0 foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
  0         0  
359 0         0 print "$_ => $retrievals{$_}\n";
360             }
361             }
362 1         12 return \%retrievals;
363             }
364              
365             ######################### Upload a Previously Constructed Model #########################
366              
367             sub upload_vsm_model_from_disk {
368 0     0 0 0 my $self = shift;
369 0 0 0     0 die "\nCannot find the database files for the VSM model"
370             unless -s "$self->{_corpus_vocab_db}.pag"
371             && -s $self->{_doc_vectors_db};
372 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
373 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
374             $self->{_corpus_vocab_db}, O_RDONLY, 0640
375             or die "Can't open DBM file: $!";
376 0 0       0 if ($self->{_debug}) {
377 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
378 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
379             }
380             }
381 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
382 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
383             }
384 0         0 $self->{_corpus_vocab_done} = 1;
385 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
386 0 0       0 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
387             if $self->{_debug};
388 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
389 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
390             }
391              
392             sub upload_normalized_vsm_model_from_disk {
393 0     0 1 0 my $self = shift;
394 0 0 0     0 die "\nCannot find the database files for the VSM model"
395             unless -s "$self->{_corpus_vocab_db}.pag"
396             && -s $self->{_normalized_doc_vecs_db};
397 0         0 $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
398 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
399             $self->{_corpus_vocab_db}, O_RDONLY, 0640
400             or die "Can't open DBM file: $!";
401 0 0       0 if ($self->{_debug}) {
402 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
403 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
404             }
405             }
406 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
407 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
408             }
409 0         0 $self->{_corpus_vocab_done} = 1;
410 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
411 0 0       0 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
412             if $self->{_debug};
413 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
414             }
415              
416             ############################## Display Retrieval Results ################################
417              
418             sub display_retrievals {
419 0     0 1 0 my $self = shift;
420 0         0 my $retrievals = shift;
421 0         0 print "\n\nShowing the retrievals and the similarity scores:\n\n";
422 0         0 my $iter = 0;
423 0         0 foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
  0         0  
424 0         0 print "$_ => $retrievals->{$_}\n";
425 0         0 $iter++;
426 0 0       0 last if $iter > $self->{_max_number_retrievals};
427             }
428 0         0 print "\n\n";
429             }
430              
431             ############################### Directory Scanner ################################
432              
433             sub _scan_directory {
434 4     4   17 my $self = shift;
435 4         32 my $dir = rel2abs( shift );
436 4         9878 my $current_dir = cwd;
437 4 50       136 chdir $dir or die "Unable to change directory to $dir: $!";
438 4         558 foreach ( glob "*" ) {
439 32 50 33     2567 if ( -d and !(-l) ) {
    50 33        
      33        
      33        
      33        
440 0         0 $self->_scan_directory( $_ );
441 0 0       0 chdir $dir
442             or die "Unable to change directory to $dir: $!";
443             } elsif (-r _ and
444             -T _ and
445             -M _ > 0.00001 and # modification age is at least 1 sec
446             !( -l $_ ) and
447             $self->ok_to_filetype($_) ) {
448 32 50       77 $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
449 32 100       101 $self->_scan_file($_) unless $self->{_corpus_vocab_done};
450 32 100       129 $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
451             }
452             }
453 4         100 chdir $current_dir;
454             }
455              
456             sub _scan_file {
457 16     16   17 my $self = shift;
458 16         18 my $file = shift;
459 16         381 open IN, $file;
460 16         31 my $min = $self->{_min_word_length};
461 16         34 my %uniques = ();
462 16         142 while () {
463 834 100       2524 next if /^[ ]*\r?\n?$/;
464 710         1913 $_ =~ s/\r?\n?$//;
465 710         610 my @clean_words;
466 710 50       929 if ($self->{_break_camelcased_and_underscored}) {
467 710         6819 my @brokenup = grep $_, split /\W|_|\s+/, $_;
468 710         1131 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  2574         6930  
469 0         0 @clean_words = $self->{_case_sensitive} ?
470 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  2712         4892  
471 710 100       1374 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  2712 50       6549  
472             } else {
473 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
474 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
475             }
476 710 100       2029 next unless @clean_words;
477 426 50       1042 @clean_words = grep $_, map &simple_stemmer($_), @clean_words
478             if $self->{_want_stemming};
479 426 50       863 if ($self->{_case_sensitive}) {
480 0         0 map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words;
  0         0  
481             } else {
482 426         551 map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
  1718         2444  
483             }
484 426         636 for (@clean_words) { $uniques{$_}++ };
  1718         3004  
485             }
486 16         149 close( IN );
487 16         107 map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
  820         816  
488 16         110 $self->{_total_num_of_docs}++;
489             }
490              
491             sub ok_to_filetype {
492 32     32 0 61 my $self = shift;
493 32         88 my $filename = shift;
494 32         1525 my ($base, $dir, $suffix) = fileparse($filename, '\..*');
495 32         126 croak "You called this module without specifying the file types in the constructor"
496 32 50       59 unless @{$self->{_file_types}} > 0;
497 32 50       61 return 1 if contained_in($suffix, @{$self->{_file_types}});
  32         169  
498 0         0 return 0;
499             }
500              
501             ############################## LSA Modeling and Retrieval ################################
502              
503             sub construct_lsa_model {
504 1     1 1 18 my $self = shift;
505 1 50       10 if ($self->{_idf_filter_option}) {
506 1 50 33     15 if (!$self->{_normalized_doc_vecs} and
507             -s $self->{_normalized_doc_vecs_db}) {
508 0         0 $self->{_normalized_doc_vecs} =
509             retrieve($self->{_normalized_doc_vecs_db});
510             }
511 1         4 foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
  1         18  
512 8         10 my $term_frequency_vec;
513 8         11 foreach my $word (sort keys
  8         504  
514             %{$self->{_normalized_doc_vecs}->{$_}}){
515 800         1354 push @$term_frequency_vec,
516             $self->{_normalized_doc_vecs}->{$_}->{$word};
517             }
518 8         60 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  8         28  
519             }
520             } else {
521 0 0 0     0 if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
522 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
523             }
524 0         0 foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
525 0         0 my $term_frequency_vec;
526 0         0 foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
  0         0  
527 0         0 push @$term_frequency_vec,
528             $self->{_corpus_doc_vectors}->{$_}->{$word};
529             }
530 0         0 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  0         0  
531             }
532             }
533 1         3 my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
  1         12  
534 1         497 my ($U,$SIGMA,$V) = svd $A;
535 1 50       12 print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
536 1 50       6 print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
537 1         11 my $index = return_index_of_last_value_above_threshold($SIGMA,
538             $self->{_lsa_svd_threshold});
539 1         8 my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
540 1 50       32 print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
541             # When you measure the size of a matrix in PDL, the zeroth dimension
542             # is considered to be along the horizontal and the one-th dimension
543             # along the rows. This is opposite of how we want to look at
544             # matrices. For a matrix of size MxN, we mean M rows and N columns.
545             # With this 'rows x columns' convention for matrix size, if you had
546             # to check the size of, say, U matrix, you would call
547             # my @size = ( $U->getdim(1), $U->getdim(0) );
548             # print "\nsize of U: @size\n";
549 1         7 my $U_trunc = $U->slice("0:$index,:")->sever;
550 1         49 my $V_trunc = $V->slice("0:$index,0:$index")->sever;
551 1         29 $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
552             PDL::Basic::transpose($U_trunc);
553 1 50       10129 print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
554             if $self->{_debug};
555 1         10 my @sorted_doc_names = $self->{_idf_filter_option} ?
556 0         0 sort keys %{$self->{_normalized_doc_vecs}} :
557 1 50       7 sort keys %{$self->{_corpus_doc_vectors}};
558 1         2 my $i = 0;
559 1         2 foreach (@{$self->{_term_document_matrix}}) {
  1         3  
560 8         17 my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
561             PDL::Basic::transpose(pdl($_));
562 8         883 my $doc_name = $sorted_doc_names[$i++];
563 8 50       24 print "\n\nTruncated doc vec for $doc_name: " .
564             $truncated_doc_vec . "\n" if $self->{_debug};
565 8         20 $self->{_doc_vecs_trunc_lsa}->{$doc_name}
566             = $truncated_doc_vec;
567             }
568 1         39 chdir $self->{_working_directory};
569             }
570              
##  Retrieve corpus documents for a query using the reduced-dimensionality LSA
##  model.  Takes a ref to a list of query words.  The query is cleaned, turned
##  into a vocabulary-sized term-frequency vector, projected into the LSA space
##  with the truncator computed at model-construction time, and compared with
##  each document's truncated vector by dot product.  Returns a hashref mapping
##  document name => similarity score.  Dies if the vocabulary histogram is
##  empty or the LSA model has not yet been constructed.
sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        # Split on non-word chars and underscores, break camelCased tokens
        # apart with the module-level $_regex, then lowercase and keep only
        # tokens at least $min characters long.
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    # Zero out a query vector over the full vocabulary ...
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    # ... then count occurrences of each query word known to the vocabulary.
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    # Serialize in sorted-key order so the components line up with the
    # columns of the term-document matrix used to build the LSA model.
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    # Project the query into the reduced LSA space.
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    my %retrievals;
    # Similarity score for each doc is the dot product of the two truncated
    # vectors.
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
628              
##  Build a term-frequency vector for a single corpus file.  The vector is
##  keyed on the corpus vocabulary; a normalized (tf * idf / doc-length)
##  version is also computed when the idf filter option is on.  Results are
##  stored in $self->{_corpus_doc_vectors} and $self->{_normalized_doc_vecs},
##  keyed on the file's path relative to the corpus root.
##
##  NOTE(review): the coverage rendering had stripped the <IN> read from the
##  while loop; restored here.
sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    # Start from a zeroed copy of the vocabulary template so that every
    # corpus word has an entry even when absent from this document.
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {
        $document_vector{$_} = 0;
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    unless (open IN, $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
                       map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        # Count only words present in the corpus vocabulary; fold case
        # unless the corpus was built case-sensitively.
        $self->{_case_sensitive} ?
            map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
            map { $document_vector{"\L$_"}++ }
                grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
    }
    close IN;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {
        $total_words_in_doc += $document_vector{$_};
    }
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        # tf * idf, with tf expressed as a proportion of the doc word count.
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} = $document_vector{$_}
                                      *
                                      $self->{_idf_t}->{$_}
                                      /
                                      $total_words_in_doc;
        }
    }
    # Key the stored vectors on the file's path relative to the corpus root.
    # Capture into a lexical so a failed match cannot leave a stale $1.
    my $pwd = cwd;
    my ($subdir) = $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name;
    if ( defined $subdir && $subdir ne "" ) {
        $file_path_name = "$subdir/$file";
    } else {
        $file_path_name = $file;
    }
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}
684              
685             ################################### Drop Stop Words ###################################
686              
##  Remove every word listed in the stop-words file from the vocabulary
##  histogram, remembering the stop words in $self->{_stop_words}.  Lines
##  beginning with '#' and blank lines in the stop-words file are ignored.
##  (The coverage rendering had stripped the <IN> read; restored here.)
sub _drop_stop_words {
    my $self = shift;
    open( IN, "$self->{_working_directory}/$self->{_stop_words_file}")
        or die "unable to open stop words file: $!";
    while (<IN>) {
        next if /^#/;
        next if /^[ ]*$/;
        chomp;
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
}
699              
700             ################################### Support Methods ####################################
701              
##  Comparator used when sorting document names by cosine similarity to the
##  current query ($self->{_query_vector}).  Relies on the sort package
##  globals $a and $b holding two document names, so it must only be called
##  from inside a sort.  Returns 1/0/-1 such that a sort using it ranks the
##  MORE similar document first.
sub _doc_vec_comparator {
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    # Use the normalized (tf-idf) vectors when the idf filter is on, the raw
    # term-frequency vectors otherwise.
    my $vec1_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$a} :
                        $self->{_corpus_doc_vectors}->{$a};
    my $vec2_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$b} :
                        $self->{_corpus_doc_vectors}->{$b};
    my @vec1 = ();
    my @vec2 = ();
    my @qvec = ();
    # Serialize all three vectors in the same (sorted vocabulary) key order
    # so corresponding components line up.
    foreach my $word (sort keys %{$self->{_vocab_hist}}) {
        push @vec1, $vec1_hash_ref->{$word};
        push @vec2, $vec2_hash_ref->{$word};
        push @qvec, $query_vector{$word};
    }
    my $vec1_mag = vec_magnitude(\@vec1);
    my $vec2_mag = vec_magnitude(\@vec2);
    my $qvec_mag = vec_magnitude(\@qvec);
    # Cosine similarity of each document with the query:
    my $product1 = vec_scalar_product(\@vec1, \@qvec);
    $product1 /= $vec1_mag * $qvec_mag;
    my $product2 = vec_scalar_product(\@vec2, \@qvec);
    $product2 /= $vec2_mag * $qvec_mag;
    return 1 if $product1 < $product2;
    return 0 if $product1 == $product2;
    return -1 if $product1 > $product2;
}
730              
##  Cosine similarity between one document's vector and the current query
##  vector ($self->{_query_vector}).  The normalized (tf-idf) document
##  vector is used when the idf filter option is on.
sub _similarity_to_query {
    my ($self, $doc_name) = @_;
    my $doc_vec_ref = $self->{_idf_filter_option}
                          ? $self->{_normalized_doc_vecs}->{$doc_name}
                          : $self->{_corpus_doc_vectors}->{$doc_name};
    # Serialize both vectors in the same (sorted) key order.
    my (@doc_vec, @query_vec);
    for my $word (sort keys %$doc_vec_ref) {
        push @doc_vec,   $doc_vec_ref->{$word};
        push @query_vec, $self->{_query_vector}->{$word};
    }
    my $cosine = vec_scalar_product(\@doc_vec, \@query_vec)
                 / (vec_magnitude(\@doc_vec) * vec_magnitude(\@query_vec));
    return $cosine;
}
749              
750             ###################### Relevance Judgments for Testing Purposes #######################
751              
## IMPORTANT: This estimation of document relevancies to queries is NOT for
##            serious work.  A document is considered to be relevant to a
##            query if it contains several of the query words.  The minimum
##            number of query words that must exist in a document in order
##            for the latter to be considered relevant is determined by the
##            relevancy_threshold parameter in the VSM constructor.  (See
##            the relevancy and precision-recall related scripts in the
##            'examples' directory.)  The reason the function shown below is
##            not for serious work is that ultimately it is the humans who
##            are the best judges of the relevancies of documents to
##            queries.  The humans bring to bear semantic considerations on
##            the relevancy determination problem that are beyond the scope
##            of this module.
765              
##  Estimate which corpus documents are relevant to each query in the query
##  file by counting query-word occurrences (see the IMPORTANT caveat in the
##  comments preceding this sub -- this is no substitute for human relevancy
##  judgments).  The judgments are written to the file named by the
##  'relevancy_file' constructor parameter.  Dies unless both 'query_file'
##  and 'relevancy_file' were supplied to the constructor.
##  (The coverage rendering had stripped the <IN> read; restored here.)
sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    open( IN, $self->{_query_file} )
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file in \n" .
          " in which the relevancy judgments will be dumped."
          unless $self->{_relevancy_file};
    # Each non-comment line of the query file looks like:  q3: word1 word2 ...
    while (<IN>) {
        chomp;
        next if /^#/;
        next if /^[ ]*$/;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    # Walk the corpus with the directory scanner switched into
    # relevancy-estimation mode.
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open(OUT, ">$self->{_relevancy_file}")
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    # Dump the queries in numeric order of their integer suffixes.
    foreach (sort
             {get_integer_suffix($a) <=> get_integer_suffix($b)}
             keys %{$self->{_relevancy_estimates}}) {
        @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$_}};
        print OUT "$_ => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $_: " .
              scalar(@relevancy_list_for_query) . "\n";
    }
}
808              
809             # If there are available human-supplied relevancy judgments in a disk
810             # file, use this script to upload that information. One of the scripts
811             # in the 'examples' directory carries out the precision-recall analysis
812             # by using this approach. IMPORTANT: The human-supplied relevancy
813             # judgments must be in a format that is shown in the sample file
814             # relevancy.txt in the 'examples' directory.
##  Upload human-supplied relevancy judgments from the disk file named by the
##  'relevancy_file' constructor parameter.  Each non-comment line must look
##  like:  q3 => doc1 doc2 ...   The judgments are stored in
##  $self->{_relevancy_estimates} with every listed doc given relevancy 1.
##  (The coverage rendering had stripped the <IN> read; restored here.)
sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    open( IN, $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<IN>) {
        chomp;
        next if /^#/;
        next if /^[ ]*$/;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        # All human-supplied judgments carry the same (unit) rank.
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}
840              
##  Print the stored relevancy judgments: for every query, the relevant
##  documents in descending order of their relevancy counts.  Dies if no
##  relevancies have been estimated or uploaded yet.
sub display_doc_relevancies {
    my $self = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    print "\nDisplaying relevancy judgments:\n\n";
    for my $query (sort keys %{$self->{_relevancy_estimates}}) {
        print "Query $query\n";
        my $rels_for_query = $self->{_relevancy_estimates}->{$query};
        my @files_by_count = sort {
            $rels_for_query->{$b} <=> $rels_for_query->{$a}
        } keys %$rels_for_query;
        for my $file (@files_by_count) {
            print " $file => $rels_for_query->{$file}\n";
        }
    }
}
858              
##  Count, for each query, how many query-word occurrences the given file
##  contains; record the file as relevant to the query when the count reaches
##  $self->{_relevancy_threshold}.  Matching is case-insensitive over the
##  whole file joined into one string.
##  (The coverage rendering had stripped the <IN> slurp; restored here.)
sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    open IN, $file
        or die "unable to open file $file: $!";
    my @all_text = <IN>;
    close IN;
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
                  split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            # \Q..\E so regex metacharacters in a query word are matched
            # literally instead of corrupting (or crashing) the match.
            my @matches = $all_text =~ /\Q$word\E/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                  scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}
883              
884             ######################### Calculate Precision versus Recall ##########################
885              
##  Compute Precision@rank and Recall@rank for every query, together with the
##  average precision for each query (area under the precision-recall curve,
##  found with the trapezoidal rule).  $retrieval_type must be 'vsm' or 'lsa'.
##  Results go into _precision_for_queries, _recall_for_queries and
##  _map_vals_for_queries.  Relevancy judgments must already have been
##  estimated or uploaded.
##  (The coverage rendering had stripped the <IN> read; restored here.  The
##  @retrieved list is now built with a numeric sort -- the original string
##  sort mis-ordered ranks past 9, though only debug output was affected.)
sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    # If the queries were not read in yet (as when relevancies were uploaded
    # from a file), parse them from the query file now.
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
            or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            chomp;
            next if /^#/;
            next if /^[ ]*$/;
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} = $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_,
                 split /\s+/, $self->{_queries_for_relevancy}->{$query};
        my $retrievals;
        croak "\n\nYou have not specified the retrieval type for " .
              "precision-recall calculation. See code in 'examples'" .
              "directory:" if !defined $retrieval_type;
        if ($retrieval_type eq 'vsm') {
            $retrievals = $self->retrieve_with_vsm( \@query_words );
        } elsif ($retrieval_type eq 'lsa') {
            $retrievals = $self->retrieve_with_lsa( \@query_words );
        }
        # Rank the retrievals by descending similarity score.
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
                 keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_ => $ranked_retrievals{$_}\n";
            }
        }
        # At this time, ranking of relevant documents based on their
        # relevancy counts serves no particular purpose since all we want
        # for the calculation of Precision and Recall are the total
        # number of relevant documents.  However, such a ranking may play
        # a role in the future.  IMPORTANT: The relevancy judgments are
        # ranked only when estimated by estimate_doc_relevancies(); when
        # supplied through a disk file they all carry the same rank.
        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                              $self->{_relevancy_estimates}->{$query}->{$b}
                              <=>
                              $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_ => $ranked_relevancies{$_}\n";
            }
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;
        print "\n\nRelevant set for query $query: @relevant_set\n\n"
            if $self->{_debug};
        # Numeric sort: the rank keys are integers, so a string sort would
        # put rank 10 before rank 2.
        my @retrieved;
        foreach (sort {$a <=> $b} keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved set for query $query: @retrieved\n\n"
            if $self->{_debug};
        # Precision and recall at every rank cutoff from 1 to the number of
        # retrievals:
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection = set_intersection(\@retrieved_at_rank,
                                                \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ?
                                    (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
        # Use the trapezoidal rule to find the area under the
        # precision-recall curve -- the average precision for this query:
        my $area = 0;
        for my $j (1..@Precision_values-1) {
            my $height = ($Precision_values[$j]+$Precision_values[$j-1])/2.0;
            my $base = ($Recall_values[$j] - $Recall_values[$j-1]);
            $area += $base * $height;
        }
        my $map_for_query = $area;
        print "\nMAP for query $query: $map_for_query\n" if $self->{_debug};
        $self->{_map_vals_for_queries}->{$query} = $map_for_query;
    }
}
1014              
##  Print the average precision for every query (in numeric order of the
##  query labels' integer suffixes) followed by the overall MAP value.  Must
##  be called after precision_and_recall_calculator().
sub display_map_values_for_queries {
    my $self = shift;
    my $avg_precisions = $self->{_map_vals_for_queries};
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %$avg_precisions);
    print "\n\nDisplaying average precision for different queries:\n\n";
    my @ordered_queries = sort {
        get_integer_suffix($a) <=> get_integer_suffix($b)
    } keys %$avg_precisions;
    my $running_sum = 0;
    for my $query (@ordered_queries) {
        my $output = sprintf "Query %s => %.3f",
                     $query, $avg_precisions->{$query};
        print "$output\n";
        $running_sum += $avg_precisions->{$query};
    }
    print "\n\n";
    my $avg_map_for_all_queries = $running_sum / scalar(keys %$avg_precisions);
    print "MAP value: $avg_map_for_all_queries\n\n";
}
1034              
##  Print, for every query, the precision and recall values at each rank
##  (rank 1 first).  Must be called after precision_and_recall_calculator().
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    my @ordered_queries = sort {
        get_integer_suffix($a) <=> get_integer_suffix($b)
    } keys %{$self->{_map_vals_for_queries}};
    for my $query (@ordered_queries) {
        print "\n\nQuery $query:\n";
        print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @formatted_precisions =
            map { sprintf "%.3f", $_ } @{$self->{_precision_for_queries}->{$query}};
        print " Precision at rank => @formatted_precisions\n";
        my @formatted_recalls =
            map { sprintf "%.3f", $_ } @{$self->{_recall_for_queries}->{$query}};
        print "\n Recall at rank => @formatted_recalls\n";
    }
    print "\n\n";
}
1054              
##  Return a ref to a list of the average-precision values, one per query,
##  each formatted to three decimal places and ordered by the queries'
##  integer suffixes.  Must be called after precision_and_recall_calculator().
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my @average_precisions_for_queries =
        map  { sprintf "%.3f", $self->{_map_vals_for_queries}->{$_} }
        sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
        keys %{$self->{_map_vals_for_queries}};
    return \@average_precisions_for_queries;
}
1068              
1069             ################################### Utility Routines ###################################
1070              
##  Return 1 when every name in @params is a recognized constructor
##  parameter, 0 as soon as one is not.  An empty parameter list is legal.
##  (The original left $found_match_flag uninitialized for an empty list,
##  returning undef and warning on the numeric comparison; now initialized.)
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag = 1;
    foreach my $param (@params) {
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        # Bail out on the first illegal parameter.
        last unless $found_match_flag;
    }
    return $found_match_flag;
}
1105              
1106             # checks whether an element is in an array:
# Returns the number of times $ele occurs (string equality) in the rest of
# the argument list; 0 when absent.  (Replaces the original's map-with-side-
# effects in void context with grep in scalar context.)
sub contained_in {
    my $ele = shift;
    my @array = @_;
    my $count = grep { $_ eq $ele } @array;
    return $count;
}
1114              
1115             # Meant only for an un-nested hash:
# Shallow-copies an un-nested hash: returns a fresh hashref with the same
# key/value pairs.  (Values that are references would still be shared.)
sub deep_copy_hash {
    my $ref_in = shift;
    my %copy;
    @copy{ keys %$ref_in } = values %$ref_in;
    return \%copy;
}
1124              
# Dot product of two equal-length numeric vectors (array refs).  Croaks on a
# length mismatch.  (The accumulator is now initialized so two empty vectors
# yield 0 instead of undef.)
sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product = 0;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
1136              
# Euclidean (L2) norm of a numeric vector given as an array ref.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}
1145              
# Sum of the numbers in the referenced array.  (The accumulator is now
# initialized so an empty array yields 0 instead of undef; callers that only
# test truthiness are unaffected.)
sub reduce {
    my $vec = shift;
    my $result = 0;
    for my $item (@$vec) {
        $result += $item;
    }
    return $result;
}
1154              
##  A quick-and-dirty suffix stripper -- NOT a real Porter stemmer.  The
##  substitutions are applied in order and are order-dependent: each rule may
##  rewrite the word that the following rules then see.  All matches are
##  case-insensitive.  Returns the (possibly unchanged) stemmed word.
sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word: $word\n" if $debug;
    # Drop a plural 's' unless preceded by a/e/i/o/u/s.
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;
    # 'classes' -> 'class' style: strip 'es' after a word ending in 's'.
    $word =~ s/(.*[a-z]s)es$/$1/i;
    # 'places'/'cakes' style: 'es' after c/k becomes 'e'.
    $word =~ s/(.*[a-z][ck])es$/$1e/i;
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;
    # 'programming' -> 'program'.
    $word =~ s/(.*[a-z]+)mming$/$1m/i;
    # Strip 'ing' unless preceded by r/l (e.g. keep 'string', 'sibling').
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
    # 'closing'/'reasoning' style: restore the trailing 'e'.
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
    # 'vertices' -> 'vertex'.
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;
    $word =~ s/(.*[a-z]+)sed$/$1se/i;
    $word =~ s/(.*[a-z]+)ed$/$1/i;
    $word =~ s/(.*[a-z]+)tation$/$1t/i;
    print "Stemmed word: $word\n\n" if $debug;
    return $word;
}
1174              
1175             # Assumes the array is sorted in a descending order, as would be the
1176             # case with an array of singular values produced by an SVD algorithm
# Given a 1-D PDL vector of singular values sorted in descending order and a
# threshold expressed as a fraction of the largest value, return the index of
# the last entry still strictly above (largest * threshold).  Note the
# argument order: the pdl object first, then the threshold.
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;
    # Absolute cutoff derived from the first (largest) singular value.
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    my $i = 0;
    while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) {$i++;}
    # $i is now the first index at or below the bound; back up by one.
    return $i-1;
}
1186              
# Intersection of two sets given as array refs; returns an array ref of the
# elements of the second set that also appear in the first (in the second
# set's order), or an empty array ref when there is no overlap.
sub set_intersection {
    my ($set1, $set2) = @_;
    my %seen_in_first = map { $_ => 1 } @$set1;
    my @common_elements = grep { $seen_in_first{$_} } @$set2;
    return \@common_elements;
}
1194              
# Returns the run of digits (possibly empty) at the end of a label such as
# 'q12'; used to sort query labels numerically.
sub get_integer_suffix {
    my $label = shift;
    my ($suffix) = $label =~ /(\d*)$/;
    return $suffix;
}
1200              
1201             1;
1202              
1203             =pod
1204              
1205             =head1 NAME
1206              
1207             Algorithm::VSM --- A Perl module for retrieving files and documents from a software
1208             library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
1209             algorithms in response to search words and phrases.
1210              
1211             =head1 SYNOPSIS
1212              
1213             # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:
1214              
1215             use Algorithm::VSM;
1216              
1217             my $corpus_dir = "corpus";
1218             my @query = qw/ program ListIterator add ArrayList args /;
1219             my $stop_words_file = "stop_words.txt";
1220             my $vsm = Algorithm::VSM->new(
1221             break_camelcased_and_underscored => 1,
1222             case_sensitive => 0,
1223             corpus_directory => $corpus_dir,
1224             file_types => ['.txt', '.java'],
1225             max_number_retrievals => 10,
1226             min_word_length => 4,
1227             stop_words_file => $stop_words_file,
1228             use_idf_filter => 1,
1229             want_stemming => 1,
1230             );
1231             $vsm->get_corpus_vocabulary_and_word_counts();
1232             $vsm->display_corpus_vocab();
1233             $vsm->display_corpus_vocab_size();
1234             $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
1235             $vsm->display_inverse_document_frequencies();
1236             $vsm->generate_document_vectors();
1237             $vsm->display_doc_vectors();
1238             $vsm->display_normalized_doc_vectors();
1239             my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
1240             $vsm->display_retrievals( $retrievals );
1241              
1242             The purpose of each constructor option and what is accomplished by the method
1243             calls should be obvious by their names. If not, they are explained in greater
1244             detail elsewhere in this documentation page. Note that the methods
1245             display_corpus_vocab() and display_doc_vectors() are there only for testing
1246             purposes with small corpora. If you must use them for large libraries/corpora,
1247             you might wish to redirect the output to a file.
1248              
1249             By default, a call to a constructor calculates normalized term-frequency vectors
1250             for the documents. Normalization consists of first calculating the term
1251             frequency tf(t) of a term t in a document as a proportion of the total numbers
1252             of words in the document and then multiplying it by idf(t), where idf(t) stands
1253             for the inverse document frequency associated with that term. Note that 'word'
1254             and 'term' mean the same thing.
1255              
1256              
1257              
1258             # FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:
1259              
1260             my $lsa = Algorithm::VSM->new(
1261             break_camelcased_and_underscored => 1,
1262             case_sensitive => 0,
1263             corpus_directory => $corpus_dir,
1264             file_types => ['.txt', '.java'],
1265             lsa_svd_threshold => 0.01,
1266             max_number_retrievals => 10,
1267             min_word_length => 4,
1268             stop_words_file => $stop_words_file,
1269             use_idf_filter => 1,
1270             want_stemming => 1,
1271             );
1272             $lsa->get_corpus_vocabulary_and_word_counts();
1273             $lsa->display_corpus_vocab();
1274             $lsa->display_corpus_vocab_size();
1275             $lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
1276             $lsa->generate_document_vectors();
1277             $lsa->construct_lsa_model();
1278             my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
1279             $lsa->display_retrievals( $retrievals );
1280              
1281             The initialization code before the constructor call and the calls for displaying
1282             the vocabulary and the vectors after the call remain the same as for the VSM case
1283             shown previously in this Synopsis. In the call above, the constructor parameter
1284             'lsa_svd_threshold' determines how many of the singular values will be retained
1285             after we have carried out an SVD decomposition of the term-frequency matrix for
1286             the documents in the corpus. Singular values smaller than this threshold
1287             fraction of the largest value are rejected.
1288              
1289              
1290              
1291             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM:
1292              
1293             my $corpus_dir = "corpus";
1294             my $stop_words_file = "stop_words.txt";
1295             my $query_file = "test_queries.txt";
1296             my $relevancy_file = "relevancy.txt"; # All relevancy judgments
1297             # will be stored in this file
1298             my $vsm = Algorithm::VSM->new(
1299             break_camelcased_and_underscored => 1,
1300             case_sensitive => 0,
1301             corpus_directory => $corpus_dir,
1302             file_types => ['.txt', '.java'],
1303             min_word_length => 4,
1304             query_file => $query_file,
1305             relevancy_file => $relevancy_file,
1306             relevancy_threshold => 5,
1307             stop_words_file => $stop_words_file,
1308             want_stemming => 1,
1309             );
1310             $vsm->get_corpus_vocabulary_and_word_counts();
1311             $vsm->generate_document_vectors();
1312             $vsm->estimate_doc_relevancies();
1313             $vsm->display_doc_relevancies(); # used only for testing
1314             $vsm->precision_and_recall_calculator('vsm');
1315             $vsm->display_precision_vs_recall_for_queries();
1316             $vsm->display_map_values_for_queries();
1317              
1318             Measuring precision and recall requires a set of queries. These are supplied
1319             through the constructor parameter 'query_file'. The format of the this file
1320             must be according to the sample file 'test_queries.txt' in the 'examples'
1321             directory. The module estimates the relevancies of the documents to the
1322             queries and dumps the relevancies in a file named by the 'relevancy_file'
1323             constructor parameter. The constructor parameter 'relevancy_threshold' is used
1324             to decide which of the documents are considered to be relevant to a query. A
1325             document must contain at least the 'relevancy_threshold' occurrences of query
1326             words in order to be considered relevant to a query.
1327              
1328              
1329              
1330             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA:
1331              
1332             my $lsa = Algorithm::VSM->new(
1333             break_camelcased_and_underscored => 1,
1334             case_sensitive => 0,
1335             corpus_directory => $corpus_dir,
1336             file_types => ['.txt', '.java'],
1337             lsa_svd_threshold => 0.01,
1338             min_word_length => 4,
1339             query_file => $query_file,
1340             relevancy_file => $relevancy_file,
1341             relevancy_threshold => 5,
1342             stop_words_file => $stop_words_file,
1343             want_stemming => 1,
1344             );
1345             $lsa->get_corpus_vocabulary_and_word_counts();
1346             $lsa->generate_document_vectors();
1347             $lsa->construct_lsa_model();
1348             $lsa->estimate_doc_relevancies();
1349             $lsa->display_doc_relevancies();
1350             $lsa->precision_and_recall_calculator('lsa');
1351             $lsa->display_precision_vs_recall_for_queries();
1352             $lsa->display_map_values_for_queries();
1353              
1354             We have already explained the purpose of the constructor parameter 'query_file'
1355             and about the constraints on the format of queries in the file named through
1356             this parameter. As mentioned earlier, the module estimates the relevancies of
1357             the documents to the queries and dumps the relevancies in a file named by the
1358             'relevancy_file' constructor parameter. The constructor parameter
1359             'relevancy_threshold' is used in deciding which of the documents are considered
1360             to be relevant to a query. A document must contain at least the
1361             'relevancy_threshold' occurrences of query words in order to be considered
1362             relevant to a query. We have previously explained the role of the constructor
1363             parameter 'lsa_svd_threshold'.
1364              
1365              
1366              
1367             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:
1368              
1369             my $corpus_dir = "corpus";
1370             my $stop_words_file = "stop_words.txt";
1371             my $query_file = "test_queries.txt";
1372             my $relevancy_file = "relevancy.txt";
1373             my $vsm = Algorithm::VSM->new(
1374             break_camelcased_and_underscored => 1,
1375             case_sensitive => 0,
1376             corpus_directory => $corpus_dir,
1377             file_types => ['.txt', '.java'],
1378             min_word_length => 4,
1379             query_file => $query_file,
1380             relevancy_file => $relevancy_file,
1381             stop_words_file => $stop_words_file,
1382             want_stemming => 1,
1383             );
1384             $vsm->get_corpus_vocabulary_and_word_counts();
1385             $vsm->generate_document_vectors();
1386             $vsm->upload_document_relevancies_from_file();
1387             $vsm->display_doc_relevancies();
1388             $vsm->precision_and_recall_calculator('vsm');
1389             $vsm->display_precision_vs_recall_for_queries();
1390             $vsm->display_map_values_for_queries();
1391              
1392             Now the filename supplied through the constructor parameter 'relevancy_file' must
1393             contain relevance judgments for the queries that are named in the file supplied
1394             through the parameter 'query_file'. The format of these two files must be
1395             according to what is shown in the sample files 'test_queries.txt' and
1396             'relevancy.txt' in the 'examples' directory.
1397              
1398              
1399              
1400             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:
1401              
1402             my $corpus_dir = "corpus";
1403             my $stop_words_file = "stop_words.txt";
1404             my $query_file = "test_queries.txt";
1405             my $relevancy_file = "relevancy.txt";
1406             my $lsa = Algorithm::VSM->new(
1407             break_camelcased_and_underscored => 1,
1408             case_sensitive => 0,
1409             corpus_directory => $corpus_dir,
1410             file_types => ['.txt', '.java'],
1411             lsa_svd_threshold => 0.01,
1412             min_word_length => 4,
1413             query_file => $query_file,
1414             relevancy_file => $relevancy_file,
1415             stop_words_file => $stop_words_file,
1416             want_stemming => 1,
1417             );
1418             $lsa->get_corpus_vocabulary_and_word_counts();
1419             $lsa->generate_document_vectors();
1420             $lsa->upload_document_relevancies_from_file();
1421             $lsa->display_doc_relevancies();
1422             $lsa->precision_and_recall_calculator('vsm');
1423             $lsa->display_precision_vs_recall_for_queries();
1424             $lsa->display_map_values_for_queries();
1425              
1426             As mentioned for the previous code block, the filename supplied through the
1427             constructor parameter 'relevancy_file' must contain relevance judgments for the
1428             queries that are named in the file supplied through the parameter 'query_file'.
1429             The format of this file must be according to what is shown in the sample file
1430             'relevancy.txt' in the 'examples' directory. We have already explained the roles
1431             played by the constructor parameters such as 'lsa_svd_threshold'.
1432              
1433              
1434              
1435             # FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:
1436              
1437             my $corpus_dir = "corpus";
1438             my $stop_words_file = "stop_words.txt";
1439             my $vsm = Algorithm::VSM->new(
1440             break_camelcased_and_underscored => 1,
1441             case_sensitive => 0,
1442             corpus_directory => $corpus_dir,
1443             file_types => ['.txt', '.java'],
1444             min_word_length => 4,
1445             stop_words_file => $stop_words_file,
1446             want_stemming => 1,
1447             );
1448             $vsm->get_corpus_vocabulary_and_word_counts();
1449             $vsm->generate_document_vectors();
1450             # code for calculating pairwise similarities as shown in the
1451             # script calculate_similarity_matrix_for_all_docs.pl in the
1452             # examples directory. This script makes calls to
1453             #
1454             # $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
1455             #
1456             # for every pair of documents.
1457              
1458             =head1 CHANGES
1459              
1460             Version 1.61 improves the implementation of the directory scanner to make it more
1461             platform independent. Additionally, you are now required to specify in the
1462             constructor call the file types to be considered for computing the database model.
1463             If, say, you have a large software library and you want only Java and text files to
1464             be scanned for creating the VSM (or the LSA) model, you must supply that information
1465             to the module by setting the constructor parameter C<file_types> to the anonymous
1466             list C<['.java', '.txt']>. An additional constructor parameter introduced in this
1467             version is C<case_sensitive>. If you set it to 1, that will force the database model
1468             and query matching to become case sensitive.
1469              
1470             Version 1.60 reflects the fact that people are now more likely to use this module by
1471             keeping the model constructed for a corpus in the fast memory (as opposed to storing
1472             the models in disk-based hash tables) for its repeated invocation for different
1473             queries. As a result, the default value for the constructor option
1474             C<save_model_on_disk> was changed from 1 to 0. For those who still wish to store on
1475             a disk the model that is constructed, the script
1476             C<retrieve_with_VSM_and_also_create_disk_based_model.pl> shows how you can do that.
1477             Other changes in 1.60 include a slight reorganization of the scripts in the
1478             C<examples> directory. Most scripts now do not by default store their models in
1479             disk-based hash tables. This reorganization is reflected in the description of the
1480             C<examples> directory in this documentation. The basic logic of constructing VSM and
1481             LSA models and how these are used for retrievals remains unchanged.
1482              
1483             Version 1.50 incorporates a couple of new features: (1) You now have the option to
1484             split camel-cased and underscored words for constructing your vocabulary set; and (2)
1485             Storing the VSM and LSA models in database files on the disk is now optional. The
1486             second feature, in particular, should prove useful to those who are using this module
1487             for large collections of documents.
1488              
1489             Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
1490             C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
1491             You can get a better sense of the overall vocabulary being used by the module for
1492             file retrieval by examining the contents of a dump file whose name is supplied as an
1493             argument to C<write_corpus_vocab_to_file()>.
1494              
1495             Version 1.41 downshifts the required version of the PDL module. Also cleaned up are
1496             the dependencies between this module and the submodules of PDL.
1497              
1498             Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
1499             documents in the corpus. The elements of such a matrix express pairwise similarities
1500             between the documents. The pairwise similarities are based on the dot product of two
1501             document vectors divided by the product of the vector magnitudes. The 'examples'
1502             directory contains two scripts to illustrate how such matrices can be calculated by
1503             the user. The similarity matrix is output as a CSV file.
1504              
1505             Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
1506             document file. What that means is that the words that appear in most of the documents
1507             get reduced weighting since such words are non-discriminatory with respect to the
1508             retrieval of the documents. A typical formula that is used to calculate the IDF
1509             weight for a word is the logarithm of the ratio of the total number of documents to
1510             the number of documents in which the word appears. So if a word were to appear in
1511             all the documents, its IDF multiplier would be zero in the vector representation of a
1512             document. If so desired, you can turn off the IDF weighting of the words by
1513             explicitly setting the constructor parameter C<use_idf_filter> to zero.
1514              
1515             Version 1.2 includes a code correction and some general code and documentation
1516             cleanup.
1517              
1518             With Version 1.1, you can access the retrieval precision results so that you can
1519             compare two different retrieval algorithms (VSM or LSA with different choices for
1520             some of the constructor parameters) with significance testing. (Version 1.0 merely
1521             sent those results to standard output, typically your terminal window.) In Version
1522             1.1, the new script B<significance_testing.pl> in the 'examples' directory
1523             illustrates significance testing with Randomization and with Student's Paired t-Test.
1524              
1525             =head1 DESCRIPTION
1526              
1527             B<Algorithm::VSM> is a I<Perl5> module for constructing a Vector Space Model (VSM) or
1528             a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
1529             to as a corpus, and then retrieving the documents in response to search words in a
1530             query.
1531              
1532             VSM and LSA models have been around for a long time in the Information Retrieval (IR)
1533             community. More recently such models have been shown to be effective in retrieving
1534             files/documents from software libraries. For an account of this research that was
1535             presented by Shivani Rao and the author of this module at the 2011 Mining Software
1536             Repositories conference, see L.
1537              
1538             VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2)
1539             Stemming the words so extracted and eliminating the designated stop words from the
1540             vocabulary. Stemming means that closely related words like 'programming' and
1541             'programs' are reduced to the common root word 'program' and the stop words are the
1542             non-discriminating words that can be expected to exist in virtually all the
1543             documents. (3) Constructing document vectors for the individual files in the corpus
1544             --- the document vectors taken together constitute what is usually referred to as a
1545             'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
1546             factor out the effect of document size and, if desired, multiplying the term
1547             frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
1548             the weight of the words that appear in a large number of documents. (5) Constructing
1549             a query vector for the search query after the query is subject to the same stemming
1550             and stop-word elimination rules that were applied to the corpus. And, lastly, (6)
1551             Using a similarity metric to return the set of documents that are most similar to the
1552             query vector. The commonly used similarity metric is one based on the cosine
1553             distance between two vectors. Also note that all the vectors mentioned here are of
1554             the same size, the size of the vocabulary. An element of a vector is the frequency
1555             of occurrence of the word corresponding to that position in the vector.
1556              
1557             LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one
1558             step further by subjecting the term-frequency matrix for the corpus to singular value
1559             decomposition (SVD). By retaining only a subset of the singular values (usually the
1560             N largest for some value of N), you can construct reduced-dimensionality vectors for
1561             the documents and the queries. In VSM, as mentioned above, the size of the document
1562             and the query vectors is equal to the size of the vocabulary. For large corpora,
1563             this size may involve tens of thousands of words --- this can slow down the VSM
1564             modeling and retrieval process. So you are very likely to get faster performance
1565             with retrieval based on LSA modeling, especially if you store the model once
1566             constructed in a database file on the disk and carry out retrievals using the
1567             disk-based model.
1568              
1569              
1570             =head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?
1571              
1572             This module has only been tested for software retrieval. For more general text
1573             retrieval, you would need to replace the simple stemmer used in the module by one
1574             based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the
1575             list of stop words appropriate to the text corpora of interest to you. As previously
1576             mentioned, the stop words are the commonly occurring words that do not carry much
1577             discriminatory power from the standpoint of distinguishing between the documents.
1578             See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
1579             formatted.
1580              
1581              
1582             =head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?
1583              
1584             It is not uncommon for large software libraries to consist of tens of thousands of
1585             documents that include source-code files, documentation files, README files,
1586             configuration files, etc. The bug-localization work presented recently by Shivani
1587             Rao and this author at the 2011 Mining Software Repository conference (MSR11) was
1588             based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
1589             size of 7553 unique words. (Here is a link to this work:
1590             L. Also note that the iBUGS dataset
1591             was originally put together by V. Dallmeier and T. Zimmermann for the evaluation of
1592             automated bug detection and localization tools.) If C<V> is the size of the
1593             vocabulary and C<M> the number of the documents in the corpus, the size of each
1594             vector will be C<V> and size of the term-frequency matrix for the entire corpus will
1595             be C<V>xC<M>. So if you were to duplicate the bug localization experiments in
1596             L you would be dealing with vectors of
1597             size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers
1598             to really large libraries/corpora, we are obviously talking about very large matrices
1599             for SVD decomposition. For large libraries/corpora, it would be best to store away
1600             the model in a disk file and to base all subsequent retrievals on the disk-stored
1601             models. The 'examples' directory contains scripts that carry out retrievals on the
1602             basis of disk-based models. Further speedup in retrieval can be achieved by using
1603             LSA to create reduced-dimensionality representations for the documents and by basing
1604             retrievals on the stored versions of such reduced-dimensionality representations.
1605              
1606              
1607             =head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS
1608              
1609             The performance of a retrieval algorithm is typically measured by two properties:
1610             C<Precision at rank> and C<Recall at rank>. As mentioned in the
1611             I<MSR11> publication cited above, at a given rank C<r>,
1612             C<Precision> is the ratio of the number of retrieved documents that are relevant to
1613             the total number of retrieved documents up to that rank. And, along the same lines,
1614             C<Recall> at a given rank C<r> is the ratio of the number of retrieved documents that
1615             are relevant to the total number of relevant documents. The area under the
1616             C<Precision>--C<Recall> curve is called the C<Average Precision> for a query. When
1617             the C<Average Precision> is averaged over all the queries, we obtain what is known as
1618             C<Mean Average Precision> (MAP). For an oracle, the value of MAP should be 1.0. On
1619             the other hand, for purely random retrieval from a corpus, the value of MAP will be
1620             inversely proportional to the size of the corpus. (See the discussion in
1621             L for further
1622             explanation on these retrieval precision evaluators.) This module includes methods
1623             that allow you to carry out these retrieval accuracy measurements using the relevancy
1624             judgments supplied through a disk file. If human-supplied relevancy judgments are
1625             not available, the module will be happy to estimate relevancies for you just by
1626             determining the number of query words that exist in a document. Note, however, that
1627             relevancy judgments estimated in this manner cannot be trusted. That is because
1628             ultimately it is the humans who are the best judges of the relevancies of documents
1629             to queries. The humans bring to bear semantic considerations on the relevancy
1630             determination problem that are beyond the scope of this module.
1631              
1632              
1633             =head1 METHODS
1634              
1635             The module provides the following methods for constructing VSM and LSA models of a
1636             corpus, for using the models thus constructed for retrieval, and for carrying out
1637             precision versus recall calculations for the determination of retrieval accuracy on
1638             the corpora of interest to you.
1639              
1640             =over
1641              
1642             =item B
1643              
1644             A call to C constructs a new instance of the C class:
1645              
1646             my $vsm = Algorithm::VSM->new(
1647             break_camelcased_and_underscored => 1,
1648             case_sensitive => 0,
1649             corpus_directory => "",
1650             corpus_vocab_db => "corpus_vocab_db",
1651             doc_vectors_db => "doc_vectors_db",
1652             file_types => $my_file_types,
1653             lsa_svd_threshold => 0.01,
1654             max_number_retrievals => 10,
1655             min_word_length => 4,
1656             normalized_doc_vecs_db => "normalized_doc_vecs_db",
1657             query_file => "",
1658             relevancy_file => $relevancy_file,
1659             relevancy_threshold => 5,
1660             save_model_on_disk => 0,
1661             stop_words_file => "",
1662             use_idf_filter => 1,
1663             want_stemming => 1,
1664             );
1665              
1666             The values shown on the right side of the big arrows are the B<default values for the
1667             constructor parameters>. The value supplied through the variable C<$my_file_types> would be
1668             something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
1669             be included in creating the database model. The following nested list will now
1670             describe each of the constructor parameters shown above:
1671              
1672             =over 16
1673              
1674             =item I<break_camelcased_and_underscored>
1675              
1676             The parameter B<break_camelcased_and_underscored> when set causes the
1677             underscored and camel-cased words to be split. By default the parameter is
1678             set. So if you don't want such words to be split, you must set it
1679             explicitly to 0.
1680              
1681             =item I<corpus_directory>
1682              
1683             The parameter B<corpus_directory> points to the root of the directory of documents
1684             for which you want to create a VSM or LSA model.
1685              
1686             =item I<corpus_vocab_db>
1687              
1688             The parameter B<corpus_vocab_db> is for naming the DBM in which the corpus vocabulary
1689             will be stored after it is subject to stemming and the elimination of stop words.
1690             Once a disk-based VSM model is created and stored away in the file named by this
1691             parameter and the parameter to be described next, it can subsequently be used
1692             directly for speedier retrieval.
1693              
1694             =item I<case_sensitive>
1695              
1696             When set to 1, this parameter forces the module to maintain the case of the terms in
1697             the corpus files when creating the vocabulary and the document vectors. Setting
1698             C<case_sensitive> to 1 also causes the query matching to become case sensitive.
1699             (This constructor parameter was introduced in Version 1.61.)
1700              
1701             =item I<doc_vectors_db>
1702              
1703             The database named by B<doc_vectors_db> stores the document vector representation for
1704             each document in the corpus. Each document vector has the same size as the
1705             corpus-wide vocabulary; each element of such a vector is the number of occurrences of
1706             the word that corresponds to that position in the vocabulary vector.
1707              
1708             =item I<file_types>
1709              
1710             This parameter tells the module what types of files in the corpus directory you want
1711             scanned for creating the database model. The value supplied for this parameter is an
1712             anonymous list of the file suffixes for the file types. For example, if you wanted
1713             only Java and text files to be scanned, you will set this parameter to C<['.java',
1714             '.txt']>. The module throws an exception if this parameter is left unspecified.
1715             (This constructor parameter was introduced in Version 1.61.)
1716              
1717             =item I<lsa_svd_threshold>
1718              
1719             The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
1720             smaller than this threshold fraction of the largest singular value. This plays a
1721             critical role in creating reduced-dimensionality document vectors in LSA modeling of
1722             a corpus.
1723              
1724             =item I<max_number_retrievals>
1725              
1726             The constructor parameter B<max_number_retrievals> stands for what it means.
1727              
1728             =item I<min_word_length>
1729              
1730             The parameter B<min_word_length> sets the minimum number of characters in a
1731             word in order for it to be included in the corpus vocabulary.
1732              
1733             =item I<normalized_doc_vecs_db>
1734              
1735             The database named by B<normalized_doc_vecs_db> stores the normalized document
1736             vectors. Normalization consists of factoring out the size of the documents by
1737             dividing the term frequency for each word in a document by the number of words in the
1738             document, and then multiplying the result by the idf (Inverse Document Frequency)
1739             value for the word.
1740              
1741             =item I<query_file>
1742              
1743             The parameter B<query_file> points to a file that contains the queries to be used for
1744             calculating retrieval performance with C<Precision> and C<Recall> numbers. The format
1745             of the query file must be as shown in the sample file C<test_queries.txt> in the
1746             'examples' directory.
1747              
1748             =item I<relevancy_file>
1749              
1750             This option names the disk file for storing the relevancy judgments.
1751              
1752             =item I<relevancy_threshold>
1753              
1754             The constructor parameter B<relevancy_threshold> is used for automatic determination
1755             of document relevancies to queries on the basis of the number of occurrences of query
1756             words in a document. You can exercise control over the process of determining
1757             relevancy of a document to a query by giving a suitable value to the constructor
1758             parameter B<relevancy_threshold>. A document is considered relevant to a query only
1759             when the document contains at least B<relevancy_threshold> number of query words.
1760              
1761             =item I<save_model_on_disk>
1762              
1763             The constructor parameter B<save_model_on_disk> will cause the basic
1764             information about the VSM and the LSA models to be stored on the disk.
1765             Subsequently, any retrievals can be carried out from the disk-based model.
1766              
1767             =item I<stop_words_file>
1768              
1769             The parameter B<stop_words_file> is for naming the file that contains the stop words
1770             that you do not wish to include in the corpus vocabulary. The format of this file
1771             must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.
1772              
1773             =item I<use_idf_filter>
1774              
1775             The constructor parameter B<use_idf_filter> is set by default. If you want
1776             to turn off the normalization of the document vectors, including turning
1777             off the weighting of the term frequencies of the words by their idf values,
1778             you must set this parameter explicitly to 0.
1779              
1780             =item I<want_stemming>
1781              
1782             The boolean parameter B<want_stemming> determines whether or not the words extracted
1783             from the documents would be subject to stemming. As mentioned elsewhere, stemming
1784             means that related words like 'programming' and 'programs' would both be reduced to
1785             the root word 'program'.
1786              
1787             =back
1788              
1789             =begin html
1790              
1791            
1792              
1793             =end html
1794              
1795             =item B<construct_lsa_model()>
1796              
1797             You call this subroutine for constructing an LSA model for your corpus
1798             after you have extracted the corpus vocabulary and constructed document
1799             vectors:
1800              
1801             $vsm->construct_lsa_model();
1802              
1803             The SVD decomposition that is carried out in LSA model construction uses the
1804             constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
1805             to retain for the LSA model. A singular value is retained only if it is larger than
1806             the C<lsa_svd_threshold> fraction of the largest singular value.
1807              
1808              
1809             =item B<display_corpus_vocab()>
1810              
1811             If you would like to see corpus vocabulary as constructed by the previous call, make
1812             the call
1813              
1814             $vsm->display_corpus_vocab();
1815              
1816             Note that this is a useful thing to do only on small test corpora. If you need
1817             to examine the vocabulary for a large corpus, call the two methods listed below.
1818              
1819              
1820             =item B<display_corpus_vocab_size()>
1821              
1822             If you would like for the module to print out in your terminal window the size of the
1823             vocabulary, make the call
1824              
1825             $vsm->display_corpus_vocab_size();
1826              
1827              
1828             =item B<display_doc_relevancies()>
1829              
1830             If you would like to see the document relevancies generated by the previous method,
1831             you can call
1832              
1833             $vsm->display_doc_relevancies()
1834              
1835              
1836             =item B<display_doc_vectors()>
1837              
1838             If you would like to see the document vectors constructed by the previous call, make
1839             the call:
1840              
1841             $vsm->display_doc_vectors();
1842              
1843             Note that this is a useful thing to do only on small test corpora. If you must call
1844             this method on a large corpus, you might wish to direct the output to a file.
1845              
1846              
1847             =item B<display_inverse_document_frequencies()>
1848              
1849             You can display the idf value associated with each word in the corpus by
1850              
1851             $vsm->display_inverse_document_frequencies();
1852              
1853             The idf of a word in the corpus is calculated typically as the logarithm of the ratio
1854             of the total number of documents in the corpus to the number of documents in which
1855             the word appears (with protection built in to prevent division by zero). Ideally, if
1856             a word appears in all the documents, its idf would be small, close to zero. Words
1857             with small idf values are non-discriminatory and should get reduced weighting in
1858             document retrieval.
1859              
1860              
1861             =item B<display_map_values_for_queries()>
1862              
1863             The area under the precision vs. recall curve for a given query is called C<Average
1864             Precision> for that query. When this area is averaged over all the queries, you get
1865             C<MAP> (Mean Average Precision) as a measure of the accuracy of the retrieval
1866             algorithm. The C<Average Precision> values for the queries and the overall C<MAP>
1867             can be printed out by calling
1868              
1869             $vsm->display_map_values_for_queries();
1870              
1871              
1872             =item B<display_normalized_doc_vectors()>
1873              
1874             If you would like to see the normalized document vectors, make the call:
1875              
1876             $vsm->display_normalized_doc_vectors();
1877              
1878             See the comment made previously as to what is meant by the normalization of a
1879             document vector.
1880              
1881              
1882             =item B<display_precision_vs_recall_for_queries()>
1883              
1884             A call to C<precision_and_recall_calculator()> will normally be followed by the
1885             following call
1886              
1887             $vsm->display_precision_vs_recall_for_queries();
1888              
1889             for displaying the C<Precision> and C<Recall> values.
1890              
1891              
1892             =item B<display_retrievals( $retrievals )>
1893              
1894             You can display the retrieved document names by calling this method using the syntax:
1895              
1896             $vsm->display_retrievals( $retrievals );
1897              
1898             where C<$retrievals> is a reference to the hash returned by a call to one of the
1899             C<retrieve_with_vsm()> or C<retrieve_with_lsa()> methods. The display method shown
1900             here respects the retrieval size constraints expressed by the constructor parameter
1900             C<max_number_retrievals>.
1901              
1902              
1903             =item B<estimate_doc_relevancies()>
1904              
1905             Before you can carry out precision and recall calculations to test the accuracy of
1906             VSM and LSA based retrievals from a corpus, you need to have available the relevancy
1907             judgments for the queries. (A relevancy judgment for a query is simply the list of
1908             documents relevant to that query.) Relevancy judgments are commonly supplied by the
1909             humans who are familiar with the corpus. But if such human-supplied relevance
1910             judgments are not available, you can invoke the following method to estimate them:
1911              
1912             $vsm->estimate_doc_relevancies();
1913              
1914             For the above method call, a document is considered to be relevant to a query if it
1915             contains several of the query words. As to the minimum number of query words that
1916             must exist in a document in order for the latter to be considered relevant, that is
1917             determined by the C<relevancy_threshold> parameter in the VSM constructor.
1918              
1919             But note that this estimation of document relevancies to queries is NOT for serious
1920             work. The reason for that is because ultimately it is the humans who are the best
1921             judges of the relevancies of documents to queries. The humans bring to bear semantic
1922             considerations on the relevancy determination problem that are beyond the scope of
1923             this module.
1924              
1925             The generated relevancies are deposited in a file named by the constructor parameter
1926             C<relevancy_file>.
1927              
1928              
1929             =item B<get_all_document_names()>
1930              
1931             If you want to get hold of all the filenames in the corpus in your own script, you
1932             can call
1933              
1934             my @docs = @{$vsm->get_all_document_names()};
1935              
1936             The array on the left will contain an alphabetized list of the files.
1937              
1938              
1939             =item B<generate_document_vectors()>
1940              
1941             This is a necessary step after the vocabulary used by a corpus is constructed. (Of
1942             course, if you will be doing document retrieval through a disk-stored VSM or LSA
1943             model, then you do not need to call this method. You construct document vectors
1944             through the following call:
1945              
1946             $vsm->generate_document_vectors();
1947              
1948              
1949             =item B<get_corpus_vocabulary_and_word_counts()>
1950              
1951             After you have constructed a new instance of the C class, you must
1952             now scan the corpus documents for constructing the corpus vocabulary. This you do by:
1953              
1954             $vsm->get_corpus_vocabulary_and_word_counts();
1955              
1956             The only time you do NOT need to call this method is when you are using a previously
1957             constructed disk-stored VSM model for retrieval.
1958              
1959              
1960             =item B<get_query_sorted_average_precision_for_queries()>
1961              
1962             If you want to run significance tests on the retrieval accuracies you obtain on a
1963             given corpus and with different algorithms (VSM or LSA with different choices for the
1964             constructor parameters), your own script would need access to the average precision
1965             data for a set of queries. You can get hold of this data by calling
1966              
1967             $vsm->get_query_sorted_average_precision_for_queries();
1968              
1969             The script C<significance_testing.pl> in the 'examples' directory shows how you can
1970             use this method for significance testing.
1971              
1972              
1973             =item B<pairwise_similarity_for_docs()>
1974              
1975             =item B<pairwise_similarity_for_normalized_docs()>
1976              
1977             If you would like to compare in your own script any two documents in the corpus, you
1978             can call
1979              
1980             my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");
1981             or
1982             my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");
1983              
1984             Both these calls return a number that is the dot product of the two document vectors
1985             normalized by the product of their magnitudes. The first call uses the regular
1986             document vectors and the second the normalized document vectors.
1987              
1988              
1989             =item B<precision_and_recall_calculator()>
1990              
1991             After you have created or obtained the relevancy judgments for your test queries, you
1992             can make the following call to calculate C<Precision> and C<Recall>:
1993              
1994             $vsm->precision_and_recall_calculator('vsm');
1995             or
1996             $vsm->precision_and_recall_calculator('lsa');
1997              
1998             depending on whether you are testing VSM-based retrieval or LSA-based retrieval.
1999              
2000             =item B<retrieve_with_lsa( \@query )>
2001              
2002             After you have built an LSA model through the call to C<construct_lsa_model()>, you
2003             can retrieve the document names most similar to the query by:
2004              
2005             my $retrievals = $vsm->retrieve_with_lsa( \@query );
2006              
2007             Subsequently, you can display the retrievals by calling the
2008             C<display_retrievals()> method described previously.
2009              
2010              
2011             =item B<retrieve_with_vsm( \@query )>
2012              
2013             After you have constructed a VSM model, you call this method for document retrieval
2014             for a given query C<@query>. The call syntax is:
2015              
2016             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2017              
2018             The argument, C<@query>, is simply a list of words that you wish to use for
2019             retrieval. The method returns a hash whose keys are the document names and whose
2020             values the similarity distance between the document and the query. As is commonly
2021             the case with VSM, this module uses the cosine similarity distance when comparing a
2022             document vector with the query vector.
2023              
2024              
2025             =item B<upload_document_relevancies_from_file()>
2026              
2027             When human-supplied relevancies are available, you can upload them into the program
2028             by calling
2029              
2030             $vsm->upload_document_relevancies_from_file();
2031              
2032             These relevance judgments will be read from a file that is named with the
2033             C<relevancy_file> constructor parameter.
2034              
2035              
2036             =item B<upload_normalized_vsm_model_from_disk()>
2037              
2038             When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
2039             C<generate_document_vectors()>, that automatically deposits the VSM model in the
2040             database files named with the constructor parameters C<corpus_vocab_db>,
2041             C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out
2042             retrieval by directly using this disk-based VSM model for speedier performance. In
2043             order to do so, you must upload the disk-based model by
2044              
2045             $vsm->upload_normalized_vsm_model_from_disk();
2046              
2047             Subsequently you call
2048              
2049             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2050             $vsm->display_retrievals( $retrievals );
2051              
2052             for retrieval and for displaying the results.
2053              
2054              
2055             =item B<write_corpus_vocab_to_file($filename)>
2056              
2057             This is the method to call for large text corpora if you would like to examine the
2058             vocabulary created. The call syntax is
2059              
2060             $vsm->write_corpus_vocab_to_file($filename);
2061              
2062             where C<$filename> is the name of the file that you want the vocabulary to be written
2063             out to. This call will also show the frequency of each vocabulary word in your
2064             corpus.
2065              
2066              
2067             =back
2068              
2069              
2070             =head1 REQUIRED
2071              
2072             This module requires the following modules:
2073              
2074             SDBM_File
2075             Storable
2076             PDL
2077             File::Basename
2078             File::Spec::Functions
2079              
2080             The first two of these are needed for creating disk-based database records for the
2081             VSM and LSA models. The third is needed for calculating the SVD of the
2082             term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
2083             by the directory scanner to make pathnames platform independent.
2084              
2085             =head1 EXAMPLES
2086              
2087             See the 'examples' directory in the distribution for the scripts listed below:
2088              
2089             =over
2090              
2091             =item B
2092              
2093             For basic VSM-based model construction and retrieval, run the script:
2094              
2095             retrieve_with_VSM.pl
2096              
2097             Starting with version 1.60, this script does not store away the VSM model in
2098             disk-based hash tables. If you want your model to be stored on the disk, you must
2099             run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.
2100              
2101             =item B
2102              
2103             If you want to run an infinite loop for repeated retrievals from a VSM model, run the
2104             script
2105              
2106             continuously_running_VSM_retrieval_engine.pl
2107              
2108             You can create a script similar to this for doing the same with LSA models.
2109              
2110             =item B
2111              
2112             For storing the model information in disk-based DBM files that can subsequently be
2113             used for both VSM and LSA retrieval, run the script:
2114              
2115             retrieve_with_VSM_and_also_create_disk_based_model.pl
2116              
2117             =item B
2118              
2119             For basic LSA-based model construction and retrieval, run the script:
2120              
2121             retrieve_with_LSA.pl
2122              
2123             Starting with version 1.60, this script does not store away the model information in
2124             disk-based hash tables. If you want your model to be stored on the disk, you must
2125             run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.
2126              
2127             =item B
2128              
2129             If you have previously run a script like
2130             C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script
2131              
2132             retrieve_with_disk_based_VSM.pl
2133              
2134             for repeated VSM-based retrievals from a disk-based model.
2135              
2136             =item B
2137              
2138             If you have previously run a script like
2139             C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script
2140              
2141             retrieve_with_disk_based_LSA.pl
2142              
2143             for repeated LSA-based retrievals from a disk-based model.
2144              
2145             =item B
2146              
2147             To experiment with precision and recall calculations for VSM retrieval, run the
2148             script:
2149              
2150             calculate_precision_and_recall_for_VSM.pl
2151              
2152             Note that this script will carry out its own estimation of relevancy judgments ---
2153             which in most cases would not be a safe thing to do.
2154              
2155             =item B
2156              
2157             To experiment with precision and recall calculations for LSA retrieval, run the
2158             script:
2159              
2160             calculate_precision_and_recall_for_LSA.pl
2161              
2162             Note that this script will carry out its own estimation of relevancy judgments ---
2163             which in most cases would not be a safe thing to do.
2164              
2165             =item B<For Precision and Recall Calculations for VSM Retrieval with
2166             Human-Supplied Relevancies:>
2167              
2168             Precision and recall calculations for retrieval accuracy determination are best
2169             carried out with human-supplied judgments of relevancies of the documents to queries.
2170             If such judgments are available, run the script:
2171              
2172             calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl
2173              
2174             This script will print out the average precisions for the different test queries and
2175             calculate the MAP metric of retrieval accuracy.
2176              
2177             =item B<For Precision and Recall Calculations for LSA Retrieval with
2178             Human-Supplied Relevancies:>
2179              
2180             If human-supplied relevancy judgments are available and you wish to experiment with
2181             precision and recall calculations for LSA-based retrieval, run the script:
2182              
2183             calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl
2184              
2185             This script will print out the average precisions for the different test queries and
2186             calculate the MAP metric of retrieval accuracy.
2187              
2188             =item B<For Significance Testing of the Difference Between VSM and LSA with
2189             Randomization or with Student's Paired t-Test:>
2190              
2191             significance_testing.pl randomization
2192              
2193             or
2194              
2195             significance_testing.pl t-test
2196              
2197             Significance testing consists of forming a null hypothesis that the two retrieval
2198             algorithms you are considering are the same from a black-box perspective and then
2199             calculating what is known as a C. If the C is less than, say,
2200             0.05, you reject the null hypothesis.
2201              
2202             =item B
2203              
2204             calculate_similarity_matrix_for_all_docs.pl
2205              
2206             or
2207              
2208             calculate_similarity_matrix_for_all_normalized_docs.pl
2209              
2210             The former uses regular document vectors for calculating the similarity between every
2211             pair of documents in the corpus. And the latter uses normalized document vectors for
2212             the same purpose. The document order used for row and column indexing of the matrix
2213             corresponds to the alphabetic ordering of the document names in the corpus directory.
2214              
2215             =back
2216              
2217              
2218             =head1 EXPORT
2219              
2220             None by design.
2221              
2222             =head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS
2223              
2224             You have to be careful when carrying out Precision versus Recall calculations if you
2225             do not wish to lose the previously created relevancy judgments. Invoking the method
2226             C<estimate_doc_relevancies()> in your own script will cause the file C<relevancy.txt>
2227             to be overwritten. If you have created a relevancy database and stored it in a file
2228             called, say, C<relevancy.txt>, you should make a backup copy of this file before
2229             executing a script that calls C<estimate_doc_relevancies()>.
2230              
2231             =head1 BUGS
2232              
2233             Please notify the author if you encounter any bugs. When sending email, please place
2234             the string 'VSM' in the subject line to get past my spam filter.
2235              
2236             =head1 INSTALLATION
2237              
2238             Download the archive from CPAN in any directory of your choice. Unpack the archive
2239             with a command that on a Linux machine would look like:
2240              
2241             tar zxvf Algorithm-VSM-1.61.tar.gz
2242              
2243             This will create an installation directory for you whose name will be
2244             C<Algorithm-VSM-1.61>. Enter this directory and execute the following commands for a
2245             standard install of the module if you have root privileges:
2246              
2247             perl Makefile.PL
2248             make
2249             make test
2250             sudo make install
2251              
2252             If you do not have root privileges, you can carry out a non-standard install the
2253             module in any directory of your choice by:
2254              
2255             perl Makefile.PL prefix=/some/other/directory/
2256             make
2257             make test
2258             make install
2259              
2260             With a non-standard install, you may also have to set your PERL5LIB environment
2261             variable so that this module can find the required other modules. How you do that
2262             would depend on what platform you are working on. In order to install this module in
2263             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2264             variable by
2265              
2266             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2267              
2268             If I used bash, I'd need to declare:
2269              
2270             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2271              
2272              
2273             =head1 THANKS
2274              
2275             Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
2276             deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in
2277             similarity matrix characterization of a corpus. Thanks, Zahn! Several of the recent
2278             changes to the module are a result of the feedback I have received from Naveen
2279             Kulkarni of Infosys Labs. Thanks, Naveen!
2280              
2281             =head1 AUTHOR
2282              
2283             Avinash Kak, kak@purdue.edu
2284              
2285             If you send email, please place the string "VSM" in your subject line to get past my
2286             spam filter.
2287              
2288             =head1 COPYRIGHT
2289              
2290             This library is free software; you can redistribute it and/or modify it under the
2291             same terms as Perl itself.
2292              
2293             Copyright 2015 Avinash Kak
2294              
2295             =cut
2296              
2297