File Coverage

blib/lib/Algorithm/VSM.pm
Criterion Covered Total %
statement 351 822 42.7
branch 92 328 28.0
condition 25 67 37.3
subroutine 31 54 57.4
pod 24 35 68.5
total 523 1306 40.0


line stmt bran cond sub pod time code
1             package Algorithm::VSM;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::VSM is a Perl module for retrieving the documents from a software
9             # library that match a list of words in a query. The matching criterion used depends
10             # on whether you ask the module to construct a full-dimensionality VSM or a
11             # reduced-dimensionality LSA model for the library.
12             # ---------------------------------------------------------------------------
13              
14             #use 5.10.0;
15 1     1   13497 use strict;
  1         2  
  1         29  
16 1     1   3 use warnings;
  1         2  
  1         27  
17 1     1   4 use Carp;
  1         4  
  1         73  
18 1     1   387 use SDBM_File;
  1         1569  
  1         39  
19 1     1   389 use PDL::Lite;
  1         139578  
  1         27  
20 1     1   7 use PDL::MatrixOps;
  1         1  
  1         8  
21 1     1   165 use File::Basename;
  1         1  
  1         76  
22 1     1   572 use File::Spec::Functions qw(rel2abs);
  1         737  
  1         58  
23 1     1   5 use Fcntl;
  1         2  
  1         199  
24 1     1   556 use Storable;
  1         2595  
  1         56  
25 1     1   7 use Cwd;
  1         1  
  1         10284  
26              
27             our $VERSION = '1.62';
28              
29             # for camelcase splits (from perlmonks):
30             my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
31              
32             ################################### Constructor #######################################
33              
34             # Constructor for creating a VSM or LSA model of a corpus. The model instance
35             # returned by the constructor can be used for retrieving documents from the corpus
36             # in response to queries.
37             sub new {
38 2     2 1 514 my ($class, %args) = @_;
39 2         17 my @params = keys %args;
40 2 50       13 croak "\nYou have used a wrong name for a keyword argument " .
41             "--- perhaps a misspelling\n"
42             if _check_for_illegal_params(@params) == 0;
43 2 50 50     7035 bless {
    50 50        
    50 50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      100        
      50        
      50        
44             _corpus_directory => $args{corpus_directory} || "",
45             _save_model_on_disk => $args{save_model_on_disk} || 0,
46             _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
47             $args{break_camelcased_and_underscored} : 1,
48             _corpus_vocab_db => $args{corpus_vocab_db} || "corpus_vocab_db",
49             _doc_vectors_db => $args{doc_vectors_db} || "doc_vectors_db",
50             _normalized_doc_vecs_db => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
51             _stop_words_file => $args{stop_words_file} || "",
52             _case_sensitive => $args{case_sensitive} || 0,
53             _query_file => $args{query_file} || "",
54             _file_types => $args{file_types} || [],
55             _min_word_length => $args{min_word_length} || 4,
56             _want_stemming => $args{want_stemming} || 0,
57             _idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
58             _max_number_retrievals => $args{max_number_retrievals} || 30,
59             _lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01,
60             _relevancy_threshold => exists $args{relevancy_threshold} ?
61             $args{relevancy_threshold} : 1,
62             _relevancy_file => $args{relevancy_file} || "",
63             _debug => $args{debug} || 0,
64             _working_directory => cwd,
65             _vocab_hist_on_disk => {},
66             _vocab_hist => {},
67             _doc_hist_template => {},
68             _corpus_doc_vectors => {},
69             _normalized_doc_vecs => {},
70             _query_vector => {},
71             _stop_words => [],
72             _term_document_matrix => [],
73             _corpus_vocab_done => 0,
74             _scan_dir_for_rels => 0,
75             _vocab_size => undef,
76             _doc_vecs_trunc_lsa => {},
77             _lsa_vec_truncator => undef,
78             _queries_for_relevancy => {},
79             _relevancy_estimates => {},
80             _precision_for_queries => {},
81             _recall_for_queries => {},
82             _map_vals_for_queries => {},
83             _vocab_idf_hist => {},
84             _idf_t => {},
85             _total_num_of_docs => 0,
86             }, $class;
87             }
88              
89              
90             ###################### Get corpus vocabulary and word counts #########################
91              
92             sub get_corpus_vocabulary_and_word_counts {
93 2     2 1 28 my $self = shift;
94 2 50       25 die "You must supply the name of the corpus directory to the constructor"
95             unless $self->{_corpus_directory};
96 2 50       11 print "Scanning the directory '$self->{_corpus_directory}' for\n" .
97             " model construction\n\n" if $self->{_debug};
98 2         23 $self->_scan_directory( $self->{_corpus_directory} );
99 2 50       9 $self->_drop_stop_words() if $self->{_stop_words_file};
100 2 50       8 if ($self->{_debug}) {
101 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
102 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
103             }
104             }
105 2 50       13 if ($self->{_save_model_on_disk}) {
106 0         0 unlink glob "$self->{_corpus_vocab_db}.*";
107 0         0 unlink glob "$self->{_doc_vectors_db}.*";
108 0         0 unlink glob "$self->{_normalized_doc_vecs_db}.*";
109 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
110             $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
111             or die "Can't create DBM files: $!";
112 0         0 foreach (keys %{$self->{_vocab_hist}}) {
  0         0  
113 0         0 $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
114             }
115 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
116             }
117 2         6 $self->{_corpus_vocab_done} = 1;
118 2         5 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  2         25  
119 2 50       6 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
120             if $self->{_debug};
121             # Calculate idf(t):
122 2         5 foreach (keys %{$self->{_vocab_idf_hist}}) {
  2         32  
123 200         429 $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
124             /
125             (1 + $self->{_vocab_idf_hist}->{$_})))
126             / log(10) );
127             }
128             }
129              
130             sub display_corpus_vocab {
131 0     0 1 0 my $self = shift;
132 0         0 die "corpus vocabulary not yet constructed"
133 0 0       0 unless keys %{$self->{_vocab_hist}};
134 0         0 print "\n\nDisplaying corpus vocabulary:\n\n";
135 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
136 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
137 0         0 print "$outstring\n";
138             }
139             }
140              
141             sub display_corpus_vocab_size {
142 0     0 1 0 my $self = shift;
143 0         0 die "corpus vocabulary not yet constructed"
144 0 0       0 unless keys %{$self->{_vocab_hist}};
145 0         0 my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
146 0         0 print "\nSize of the corpus vocabulary: $vocab_size\n\n";
147             }
148              
149             sub write_corpus_vocab_to_file {
150 0     0 1 0 my $self = shift;
151 0         0 my $file = shift;
152 0 0       0 die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
  0         0  
153 0 0       0 open OUT, "> $file"
154             or die "unable to open for output a file with name `$file': $!";
155 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
156 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
157 0         0 print OUT "$outstring\n";
158             }
159 0         0 close OUT;
160             }
161              
162             sub display_inverse_document_frequencies {
163 0     0 1 0 my $self = shift;
164 0         0 die "corpus vocabulary not yet constructed"
165 0 0       0 unless keys %{$self->{_vocab_idf_hist}};
166 0 0       0 print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
167             unless $self->{_idf_filter_option};
168 0         0 print "\n\nDisplaying inverse document frequencies:\n";
169 0         0 foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
  0         0  
170 0         0 my $outstring = sprintf("%30s %d",
171             $_, $self->{_vocab_idf_hist}->{$_});
172 0         0 print "$outstring\n";
173             }
174 0         0 print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
175 0         0 foreach ( sort keys %{$self->{_idf_t}} ) {
  0         0  
176 0         0 my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
177 0         0 print "$outstring\n";
178             }
179             }
180              
181             sub get_all_document_names {
182 0     0 1 0 my $self = shift;
183 0         0 my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
  0         0  
184 0         0 return \@all_files;
185             }
186              
187             ############################ Generate Document Vectors #################################
188              
189             sub generate_document_vectors {
190 2     2 1 33 my $self = shift;
191 2         23 chdir $self->{_working_directory};
192 2         4 foreach ( sort keys %{$self->{_vocab_hist}} ) {
  2         84  
193 200         352 $self->{_doc_hist_template}->{$_} = 0;
194             }
195 2         27 $self->_scan_directory( $self->{_corpus_directory} );
196 2         24 chdir $self->{_working_directory};
197 2 50       27 if ($self->{_save_model_on_disk}) {
198 0 0 0     0 die "You did not specify in the constructor call the names for the diskfiles " .
199             "for storing the disk-based hash tables consisting of document vectors " .
200             "and their normalized versions"
201             unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
202 0         0 eval {
203 0         0 store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
204             };
205 0 0       0 if ($@) {
206 0         0 print "Something went wrong with disk storage of document vectors: $@";
207             }
208 0         0 eval {
209 0         0 store($self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db});
210             };
211 0 0       0 if ($@) {
212 0         0 print "Something wrong with disk storage of normalized doc vecs: $@";
213             }
214             }
215             }
216              
217             sub display_doc_vectors {
218 0     0 1 0 my $self = shift;
219 0         0 die "document vectors not yet constructed"
220 0 0       0 unless keys %{$self->{_corpus_doc_vectors}};
221 0         0 foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
222 0         0 print "\n\ndisplay doc vec for $file:\n";
223 0         0 foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
  0         0  
224 0         0 print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
225             }
226 0         0 my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
  0         0  
227 0         0 print "\nSize of vector for $file: $docvec_size\n";
228             }
229             }
230              
231             sub display_normalized_doc_vectors {
232 0     0 1 0 my $self = shift;
233 0         0 die "normalized document vectors not yet constructed"
234 0 0       0 unless keys %{$self->{_normalized_doc_vecs}};
235 0 0       0 unless ($self->{_idf_filter_option}) {
236 0         0 print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
237 0         0 return;
238             }
239 0         0 foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
  0         0  
240 0         0 print "\n\ndisplay normalized doc vec for $file:\n";
241 0         0 foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
  0         0  
242 0         0 print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
243             }
244 0         0 my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
  0         0  
245 0         0 print "\nSize of normalized vector for $file: $docvec_size\n";
246             }
247             }
248              
249             ######################## Calculate Pairwise Document Similarities ######################
250              
251             # Returns the similarity score for two documents whose actual names are supplied
252             # as its two arguments.
253             sub pairwise_similarity_for_docs {
254 0     0 1 0 my $self = shift;
255 0         0 my $doc1 = shift;
256 0         0 my $doc2 = shift;
257 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
258 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
259 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
260 0         0 my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
261 0         0 my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
262 0         0 my @vec1 = ();
263 0         0 my @vec2 = ();
264 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
265 0         0 push @vec1, $vec_hash_ref1->{$word};
266 0         0 push @vec2, $vec_hash_ref2->{$word};
267             }
268 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
269 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
270 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
271 0         0 $product /= $vec_mag1 * $vec_mag2;
272 0         0 return $product;
273             }
274              
275             sub pairwise_similarity_for_normalized_docs {
276 0     0 1 0 my $self = shift;
277 0         0 my $doc1 = shift;
278 0         0 my $doc2 = shift;
279 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
280 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
281 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
282 0         0 my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
283 0         0 my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
284 0         0 my @vec1 = ();
285 0         0 my @vec2 = ();
286 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
287 0         0 push @vec1, $vec_hash_ref1->{$word};
288 0         0 push @vec2, $vec_hash_ref2->{$word};
289             }
290 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
291 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
292 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
293 0         0 $product /= $vec_mag1 * $vec_mag2;
294 0         0 return $product;
295             }
296              
297             ############################### Retrieve with VSM Model ################################
298              
299             sub retrieve_with_vsm {
300 1     1 1 12 my $self = shift;
301 1         3 my $query = shift;
302 1         2 my @clean_words;
303 1         5 my $min = $self->{_min_word_length};
304              
305 1 50       5 if ($self->{_break_camelcased_and_underscored}) {
306 1         25 my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
307 1         3 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  8         25  
308 0         0 @clean_words = $self->{_case_sensitive} ?
309 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  8         51  
310 1 100       5 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  8 50       26  
311             } else {
312 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
313 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
314             }
315 1         4 $query = \@clean_words;
316 1 50       7 print "\nYour query words are: @$query\n" if $self->{_debug};
317 1 50       9 if ($self->{_idf_filter_option}) {
318 1         73 die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
319 1         10 unless scalar(keys %{$self->{_vocab_hist}})
320 1 50 50     1 && scalar(keys %{$self->{_normalized_doc_vecs}});
321             } else {
322 0         0 die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
323 0         0 unless scalar(keys %{$self->{_vocab_hist}})
324 0 0 0     0 && scalar(keys %{$self->{_corpus_doc_vectors}});
325             }
326 1         2 foreach ( keys %{$self->{_vocab_hist}} ) {
  1         42  
327 100         124 $self->{_query_vector}->{$_} = 0;
328             }
329 1         6 foreach (@$query) {
330 7 50       10 if ($self->{_case_sensitive}) {
331 0 0       0 $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
332             } else {
333 7 100       17 $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
334             }
335             }
336 1         2 my @query_word_counts = values %{$self->{_query_vector}};
  1         14  
337 1         9 my $query_word_count_total = reduce(\@query_word_counts);
338 1 50       10 die "\nYour query does not contain corpus words. Nothing retrieved.\n"
339             unless $query_word_count_total;
340 1         3 my %retrievals;
341 1 50       5 if ($self->{_idf_filter_option}) {
342 1 50       3 print "\n\nUsing idf filter option for retrieval:\n\n"
343             if $self->{_debug};
344 1         3 foreach (sort {$self->_doc_vec_comparator}
  16         38  
  1         15  
345             keys %{$self->{_normalized_doc_vecs}}) {
346 8         17 $retrievals{$_} = $self->_similarity_to_query($_);
347             }
348             } else {
349 0 0       0 print "\n\nNOT using idf filter option for retrieval:\n\n"
350             if $self->{_debug};
351 0         0 foreach (sort {$self->_doc_vec_comparator}
  0         0  
  0         0  
352             keys %{$self->{_corpus_doc_vectors}}) {
353 0         0 $retrievals{$_} = $self->_similarity_to_query($_);
354             }
355             }
356 1 50       5 if ($self->{_debug}) {
357 0         0 print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
358 0         0 foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
  0         0  
359 0         0 print "$_ => $retrievals{$_}\n";
360             }
361             }
362 1         12 return \%retrievals;
363             }
364              
365             ######################### Upload a Previously Constructed Model #########################
366              
367             sub upload_vsm_model_from_disk {
368 0     0 0 0 my $self = shift;
369 0 0 0     0 die "\nCannot find the database files for the VSM model"
370             unless -s "$self->{_corpus_vocab_db}.pag"
371             && -s $self->{_doc_vectors_db};
372 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
373 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
374             $self->{_corpus_vocab_db}, O_RDONLY, 0640
375             or die "Can't open DBM file: $!";
376 0 0       0 if ($self->{_debug}) {
377 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
378 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
379             }
380             }
381 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
382 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
383             }
384 0         0 $self->{_corpus_vocab_done} = 1;
385 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
386 0 0       0 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
387             if $self->{_debug};
388 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
389 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
390             }
391              
392             sub upload_normalized_vsm_model_from_disk {
393 0     0 1 0 my $self = shift;
394 0 0 0     0 die "\nCannot find the database files for the VSM model"
395             unless -s "$self->{_corpus_vocab_db}.pag"
396             && -s $self->{_normalized_doc_vecs_db};
397 0         0 $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
398 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
399             $self->{_corpus_vocab_db}, O_RDONLY, 0640
400             or die "Can't open DBM file: $!";
401 0 0       0 if ($self->{_debug}) {
402 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
403 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
404             }
405             }
406 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
407 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
408             }
409 0         0 $self->{_corpus_vocab_done} = 1;
410 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
411 0 0       0 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
412             if $self->{_debug};
413 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
414             }
415              
416             ############################## Display Retrieval Results ################################
417              
418             sub display_retrievals {
419 0     0 1 0 my $self = shift;
420 0         0 my $retrievals = shift;
421 0         0 print "\n\nShowing the retrievals and the similarity scores:\n\n";
422 0         0 my $iter = 0;
423 0         0 foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
  0         0  
424 0         0 print "$_ => $retrievals->{$_}\n";
425 0         0 $iter++;
426 0 0       0 last if $iter > $self->{_max_number_retrievals};
427             }
428 0         0 print "\n\n";
429             }
430              
431             ############################### Directory Scanner ################################
432              
433             sub _scan_directory {
434 4     4   12 my $self = shift;
435 4         43 my $dir = rel2abs( shift );
436 4         10709 my $current_dir = cwd;
437 4 50       147 chdir $dir or die "Unable to change directory to $dir: $!";
438 4         611 foreach ( glob "*" ) {
439 32 50 33     2908 if ( -d and !(-l) ) {
    50 33        
      33        
      33        
      33        
440 0         0 $self->_scan_directory( $_ );
441 0 0       0 chdir $dir
442             or die "Unable to change directory to $dir: $!";
443             } elsif (-r _ and
444             -T _ and
445             -M _ > 0.00001 and # modification age is at least 1 sec
446             !( -l $_ ) and
447             $self->ok_to_filetype($_) ) {
448 32 50       168 $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
449 32 100       112 $self->_scan_file($_) unless $self->{_corpus_vocab_done};
450 32 100       148 $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
451             }
452             }
453 4         128 chdir $current_dir;
454             }
455              
456             sub _scan_file {
457 16     16   68 my $self = shift;
458 16         19 my $file = shift;
459 16         398 open IN, $file;
460 16         48 my $min = $self->{_min_word_length};
461 16         37 my %uniques = ();
462 16         165 while (<IN>) {
463 834 100       2814 next if /^[ ]*\r?\n?$/;
464 710         2272 $_ =~ s/\r?\n?$//;
465 710         693 my @clean_words;
466 710 50       1067 if ($self->{_break_camelcased_and_underscored}) {
467 710         7567 my @brokenup = grep $_, split /\W|_|\s+/, $_;
468 710         1350 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  2574         8207  
469 0         0 @clean_words = $self->{_case_sensitive} ?
470 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  2712         5482  
471 710 100       1366 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  2712 50       7312  
472             } else {
473 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
474 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
475             }
476 710 100       2184 next unless @clean_words;
477 426 50       1312 @clean_words = grep $_, map &simple_stemmer($_), @clean_words
478             if $self->{_want_stemming};
479 0         0 $self->{_case_sensitive} ?
480 1718         2858 map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
481 426 50       1211 map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
482 426 50       572 if ($self->{_case_sensitive}) {
483 0         0 for (@clean_words) { $uniques{$_}++ }
  0         0  
484             } else {
485 426         549 for (@clean_words) { $uniques{"\L$_"}++ }
  1718         3512  
486             }
487             }
488 16         311 close( IN );
489 16         174 map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
  820         881  
490 16         141 $self->{_total_num_of_docs}++;
491             }
492              
493             sub ok_to_filetype {
494 32     32 0 62 my $self = shift;
495 32         59 my $filename = shift;
496 32         1560 my ($base, $dir, $suffix) = fileparse($filename, '\..*');
497 32         121 croak "You called this module without specifying the file types in the constructor"
498 32 50       49 unless @{$self->{_file_types}} > 0;
499 32 50       41 return 1 if contained_in($suffix, @{$self->{_file_types}});
  32         179  
500 0         0 return 0;
501             }
502              
503             ############################## LSA Modeling and Retrieval ################################
504              
505             sub construct_lsa_model {
506 1     1 1 17 my $self = shift;
507 1 50       10 if ($self->{_idf_filter_option}) {
508 1 50 33     15 if (!$self->{_normalized_doc_vecs} and
509             -s $self->{_normalized_doc_vecs_db}) {
510 0         0 $self->{_normalized_doc_vecs} =
511             retrieve($self->{_normalized_doc_vecs_db});
512             }
513 1         4 foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
  1         22  
514 8         10 my $term_frequency_vec;
515 8         12 foreach my $word (sort keys
  8         529  
516             %{$self->{_normalized_doc_vecs}->{$_}}){
517 800         1243 push @$term_frequency_vec,
518             $self->{_normalized_doc_vecs}->{$_}->{$word};
519             }
520 8         48 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  8         23  
521             }
522             } else {
523 0 0 0     0 if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
524 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
525             }
526 0         0 foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
527 0         0 my $term_frequency_vec;
528 0         0 foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
  0         0  
529 0         0 push @$term_frequency_vec,
530             $self->{_corpus_doc_vectors}->{$_}->{$word};
531             }
532 0         0 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  0         0  
533             }
534             }
535 1         6 my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
  1         19  
536 1         1023 my ($U,$SIGMA,$V) = svd $A;
537 1 50       13 print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
538 1 50       8 print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
539 1         10 my $index = return_index_of_last_value_above_threshold($SIGMA,
540             $self->{_lsa_svd_threshold});
541 1         11 my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
542 1 50       64 print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
543             # When you measure the size of a matrix in PDL, the zeroth dimension
544             # is considered to be along the horizontal and the one-th dimension
545             # along the rows. This is opposite of how we want to look at
546             # matrices. For a matrix of size MxN, we mean M rows and N columns.
547             # With this 'rows x columns' convention for matrix size, if you had
548             # to check the size of, say, U matrix, you would call
549             # my @size = ( $U->getdim(1), $U->getdim(0) );
550             # print "\nsize of U: @size\n";
551 1         9 my $U_trunc = $U->slice("0:$index,:")->sever;
552 1         59 my $V_trunc = $V->slice("0:$index,0:$index")->sever;
553 1         44 $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
554             PDL::Basic::transpose($U_trunc);
555 1 50       11047 print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
556             if $self->{_debug};
557 1         14 my @sorted_doc_names = $self->{_idf_filter_option} ?
558 0         0 sort keys %{$self->{_normalized_doc_vecs}} :
559 1 50       4 sort keys %{$self->{_corpus_doc_vectors}};
560 1         4 my $i = 0;
561 1         2 foreach (@{$self->{_term_document_matrix}}) {
  1         3  
562 8         21 my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
563             PDL::Basic::transpose(pdl($_));
564 8         897 my $doc_name = $sorted_doc_names[$i++];
565 8 50       20 print "\n\nTruncated doc vec for $doc_name: " .
566             $truncated_doc_vec . "\n" if $self->{_debug};
567 8         22 $self->{_doc_vecs_trunc_lsa}->{$doc_name}
568             = $truncated_doc_vec;
569             }
570 1         41 chdir $self->{_working_directory};
571             }
572              
##  Retrieves documents for a query using the previously constructed LSA
##  (reduced-dimensionality) model.  Takes a ref to a list of query words and
##  returns a hashref mapping each document name to its similarity score
##  (dot product in the truncated LSA space).  Dies if the vocabulary is
##  empty, if no LSA model has been built, or if the query shares no words
##  with the corpus.
sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;                          # ref to the list of raw query words
    my @clean_words;
    my $min = $self->{_min_word_length};
    # Normalize the query words with the same rules that were applied to the
    # corpus documents, so query and document vectors live in the same space.
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    # Build a term-frequency vector for the query over the full vocabulary,
    # zeroing it first since _query_vector persists between calls.
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    # Vector components in sorted-key order, matching the document vectors.
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    # Project the query into the reduced-dimensionality LSA space with the
    # truncator matrix computed during model construction.
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    # Score each document by the dot product of truncated query and
    # truncated document vectors.
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
630              
##  Builds the term-frequency vector for one corpus file and, when the IDF
##  filter is on, its normalized (tf-idf / doc-length) counterpart.  Both are
##  stored on $self keyed by the file's path relative to the corpus root.
##  Returns nothing useful; silently skips (with an optional debug message)
##  files that cannot be opened.
sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    # Start from a zeroed histogram over the full corpus vocabulary.  (The
    # original deep-copied the template and then zeroed every entry; building
    # the zeroed hash directly is equivalent and avoids the extra pass.)
    my %document_vector = map { $_ => 0 } keys %{$self->{_doc_hist_template}};
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    # Three-arg open with a lexical filehandle (original used a bareword
    # two-arg open).
    my $fh;
    unless (open $fh, '<', $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<$fh>) {
        next if /^[ ]*\r?\n?$/;                 # skip blank lines
        $_ =~ s/\r?\n?$//;                      # strip DOS or Unix line ending
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
                       map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        # Count only words that belong to the corpus vocabulary.
        if ($self->{_case_sensitive}) {
            $document_vector{$_}++
                for grep { exists $self->{_vocab_hist}->{$_} } @clean_words;
        } else {
            $document_vector{"\L$_"}++
                for grep { exists $self->{_vocab_hist}->{"\L$_"} } @clean_words;
        }
    }
    close $fh;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    $total_words_in_doc += $_ for values %document_vector;
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        # Guard against division by zero for a document that contains no
        # vocabulary words at all (every normalized component is then 0).
        my $doc_len = $total_words_in_doc || 1;
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} =
                $document_vector{$_} * $self->{_idf_t}->{$_} / $doc_len;
        }
    }
    # Key the stored vectors by the file path relative to the corpus root.
    # Capture into a lexical instead of relying on $1, which would hold a
    # stale value if the match failed (e.g. corpus dir with regex metachars).
    my $pwd = cwd;
    my ($subpath) = $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name =
        (defined $subpath && $subpath ne "") ? "$subpath/$file" : $file;
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}
686              
687             ################################### Drop Stop Words ###################################
688              
##  Removes every word listed in the user-supplied stop-words file from the
##  corpus vocabulary histogram, and remembers the stop words on
##  $self->{_stop_words}.  Lines starting with '#' and blank lines in the
##  stop-words file are ignored.
sub _drop_stop_words {
    my $self = shift;
    # Three-arg open with a lexical filehandle (original used a bareword
    # two-arg open and never closed the handle).
    open( my $fh, '<', "$self->{_working_directory}/$self->{_stop_words_file}" )
        or die "unable to open stop words file: $!";
    while (<$fh>) {
        next if /^#/;                           # comment line
        next if /^[ ]*\r?\n?$/;                 # blank line
        $_ =~ s/\r?\n?$//;                      # strip DOS or Unix line ending
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
    close $fh;
}
701              
702             ################################### Support Methods ####################################
703              
##  Sort comparator (used via the implicit $a/$b package variables) that
##  orders two document names in DESCENDING order of their cosine similarity
##  to the current query vector.  Returns -1/0/1 like any sort comparator.
sub _doc_vec_comparator {
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    # Compare against normalized or raw doc vectors depending on the option.
    my $doc_vecs = $self->{_idf_filter_option}
                       ? $self->{_normalized_doc_vecs}
                       : $self->{_corpus_doc_vectors};
    my $first_doc  = $doc_vecs->{$a};
    my $second_doc = $doc_vecs->{$b};
    # Assemble the three vectors in a fixed (sorted-vocabulary) order.
    my (@first, @second, @qvec);
    for my $word (sort keys %{$self->{_vocab_hist}}) {
        push @first,  $first_doc->{$word};
        push @second, $second_doc->{$word};
        push @qvec,   $query_vector{$word};
    }
    my $qvec_mag = vec_magnitude(\@qvec);
    my $sim_first  = vec_scalar_product(\@first, \@qvec)
                     / (vec_magnitude(\@first) * $qvec_mag);
    my $sim_second = vec_scalar_product(\@second, \@qvec)
                     / (vec_magnitude(\@second) * $qvec_mag);
    # Higher similarity sorts earlier.
    return $sim_first < $sim_second ?  1
         : $sim_first > $sim_second ? -1
         :                             0;
}
732              
##  Cosine similarity between one document's vector (normalized when the IDF
##  filter is on) and the current query vector stored on $self.
sub _similarity_to_query {
    my ($self, $doc_name) = @_;
    my $doc_vec = $self->{_idf_filter_option}
                      ? $self->{_normalized_doc_vecs}->{$doc_name}
                      : $self->{_corpus_doc_vectors}->{$doc_name};
    # Walk the vocabulary in a fixed order so both vectors line up.
    my (@vec, @qvec);
    foreach my $word (sort keys %$doc_vec) {
        push @vec,  $doc_vec->{$word};
        push @qvec, $self->{_query_vector}->{$word};
    }
    my $dot = vec_scalar_product(\@vec, \@qvec);
    return $dot / (vec_magnitude(\@vec) * vec_magnitude(\@qvec));
}
751              
752             ###################### Relevance Judgments for Testing Purposes #######################
753              
754             ## IMPORTANT: This estimation of document relevancies to queries is NOT for
755             ## serious work. A document is considered to be relevant to a
756             ## query if it contains several of the query words. As to the
757             ## minimum number of query words that must exist in a document
758             ## in order for the latter to be considered relevant is
759             ## determined by the relevancy_threshold parameter in the VSM
760             ## constructor. (See the relevancy and precision-recall related
761             ## scripts in the 'examples' directory.) The reason for why the
762             ## function shown below is not for serious work is because
763             ## ultimately it is the humans who are the best judges of the
764             ## relevancies of documents to queries. The humans bring to
765             ## bear semantic considerations on the relevancy determination
766             ## problem that are beyond the scope of this module.
767              
##  Estimates which corpus documents are relevant to each query in the query
##  file by counting query-word occurrences in every document (a document is
##  relevant when the count reaches the constructor's relevancy_threshold).
##  The judgments are written to the relevancy file.  NOT meant for serious
##  work -- humans are the real judges of relevancy (see the comment block
##  preceding this sub in the original source).
sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    # Lexical filehandles and three-arg opens (original used bareword
    # handles, a two-arg open for output, and never closed either handle).
    open( my $qfh, '<', $self->{_query_file} )
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file in \n" .
          " in which the relevancy judgments will be dumped."
        unless $self->{_relevancy_file};
    # Parse lines of the form "qN: word word ..." into the query table.
    while (<$qfh>) {
        next if /^#/;                           # comment line
        next if /^[ ]*\r?\n?$/;                 # blank line
        $_ =~ s/\r?\n?$//;                      # strip line terminator
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    close $qfh;
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    # Scan the corpus; _scan_file_for_rels() fills _relevancy_estimates
    # while the _scan_dir_for_rels flag is up.
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open( my $out, '>', $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    foreach (sort
             {get_integer_suffix($a) <=> get_integer_suffix($b)}
             keys %{$self->{_relevancy_estimates}}) {
        @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$_}};
        print $out "$_ => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $_: " .
              scalar(@relevancy_list_for_query) . "\n";
    }
    close $out;
}
810              
811             # If there are available human-supplied relevancy judgments in a disk
812             # file, use this script to upload that information. One of the scripts
813             # in the 'examples' directory carries out the precision-recall analysis
814             # by using this approach. IMPORTANT: The human-supplied relevancy
815             # judgments must be in a format that is shown in the sample file
816             # relevancy.txt in the 'examples' directory.
##  Loads human-supplied relevancy judgments from the relevancy file instead
##  of estimating them.  Each line must look like "qN => doc1 doc2 ...";
##  every listed document gets the same (unit) relevancy rank.
sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    # Three-arg open with a lexical filehandle (original used a bareword
    # two-arg open and never closed the handle).
    open( my $fh, '<', $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<$fh>) {
        next if /^#/;                           # comment line
        next if /^[ ]*\r?\n?$/;                 # blank line
        $_ =~ s/\r?\n?$//;                      # strip line terminator
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    close $fh;
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}
842              
##  Prints the stored relevancy judgments, one query at a time, with the
##  documents for each query listed in descending order of relevancy count.
sub display_doc_relevancies {
    my $self = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    print "\nDisplaying relevancy judgments:\n\n";
    foreach my $query (sort keys %{$self->{_relevancy_estimates}}) {
        print "Query $query\n";
        my $rels = $self->{_relevancy_estimates}->{$query};
        foreach my $file (sort { $rels->{$b} <=> $rels->{$a} } keys %$rels) {
            print " $file => $rels->{$file}\n";
        }
    }
}
860              
##  Counts, for every query, how many times its words occur in $file; when
##  the count reaches the relevancy threshold, the file is recorded as
##  relevant to that query in $self->{_relevancy_estimates}.
sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    # Checked three-arg open with a lexical handle (original used an
    # unchecked bareword open); an unreadable file is simply skipped,
    # matching the original's effective behavior (no text, no counts).
    my $fh;
    unless (open $fh, '<', $file) {
        print "Unable to open file $file: $!\n" if $self->{_debug};
        return;
    }
    my @all_text = <$fh>;
    close $fh;
    # Join the whole file into one string so counts span line boundaries.
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
                          split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            # \Q...\E so a query word containing regex metacharacters is
            # matched literally instead of being interpreted as a pattern.
            my @matches = $all_text =~ /\Q$word\E/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                  scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}
885              
886             ######################### Calculate Precision versus Recall ##########################
887              
##  Computes Precision@rank and Recall@rank arrays plus the average precision
##  (area under the precision-recall curve) for every query, using either the
##  'vsm' or 'lsa' retrieval engine as selected by $retrieval_type.  Results
##  are stored on $self (_precision_for_queries, _recall_for_queries,
##  _map_vals_for_queries).  Requires relevancy judgments to already exist.
sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;                 # 'vsm' or 'lsa'
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    # If the queries were not already loaded (by estimate_doc_relevancies()
    # or upload_document_relevancies_from_file()), read them from the
    # query file now.  Lines look like "qN: word word ...".
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
            or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            next if /^#/;                       # comment line
            next if /^[ ]*\r?\n?$/;             # blank line
            $_ =~ s/\r?\n?$//;                  # strip line terminator
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} = $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_,
                          split /\s+/, $self->{_queries_for_relevancy}->{$query};
        croak "\n\nYou have not specified the retrieval type for " .
              "precision-recall calculation. See code in 'examples'" .
              "directory:" if !defined $retrieval_type;
        my $retrievals;
        # The retrieval methods die when a query shares no words with the
        # corpus; trap that and skip the query rather than aborting the run.
        eval {
            if ($retrieval_type eq 'vsm') {
                $retrievals = $self->retrieve_with_vsm( \@query_words );
            } elsif ($retrieval_type eq 'lsa') {
                $retrievals = $self->retrieve_with_lsa( \@query_words );
            }
        };
        if ($@) {
            warn "\n\nNo relevant docs found for query $query.\n" .
                 "Will skip over this query for precision and\n" .
                 "recall calculations\n\n";
            next;
        }
        # Rank the retrieved documents in descending order of similarity.
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
                 keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_ => $ranked_retrievals{$_}\n";
            }
        }
        # At this time, ranking of relevant documents based on their
        # relevancy counts serves no particular purpose since all we want
        # for the calculation of Precision and Recall are the total
        # number of relevant documents. However, I believe such a
        # ranking will play an important role in the future.
        # IMPORTANT: The relevancy judgments are ranked only when
        #            estimated by the method estimate_doc_relevancies()
        #            of the VSM class. When relevancies are supplied
        #            directly through a disk file, they all carry the
        #            same rank.
        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                              $self->{_relevancy_estimates}->{$query}->{$b}
                              <=>
                              $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_ => $ranked_relevancies{$_}\n";
            }
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;
        print "\n\nRelevant set for query $query: @relevant_set\n\n"
            if $self->{_debug};
        my @retrieved;
        foreach (sort keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved set for query $query: @retrieved\n\n"
            if $self->{_debug};
        # Precision@rank = |retrieved[1..rank] ∩ relevant| / rank
        # Recall@rank    = |retrieved[1..rank] ∩ relevant| / |relevant|
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection = set_intersection(\@retrieved_at_rank,
                                                \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ?
                                    (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
        my $area = 0;
        # Use trapezoidal rule to find the area under the precision-recall
        # curve; this area is the average precision for the query.
        for my $j (1..@Precision_values-1) {
            my $height = ($Precision_values[$j]+$Precision_values[$j-1])/2.0;
            my $base = ($Recall_values[$j] - $Recall_values[$j-1]);
            $area += $base * $height;
        }
        my $map_for_query = $area;
        print "\nMAP for query $query: $map_for_query\n" if $self->{_debug};
        $self->{_map_vals_for_queries}->{$query} = $map_for_query;
    }
}
1024              
##  Prints the average precision for each query (three decimal places) and
##  the overall MAP value, with queries ordered by their integer suffix.
sub display_map_values_for_queries {
    my $self = shift;
    my $map_vals = $self->{_map_vals_for_queries};
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %$map_vals);
    my $map = 0;
    print "\n\nDisplaying average precision for different queries:\n\n";
    foreach my $query (sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                       keys %$map_vals) {
        printf "Query %s => %.3f\n", $query, $map_vals->{$query};
        $map += $map_vals->{$query};
    }
    print "\n\n";
    my $avg_map_for_all_queries = $map / scalar(keys %$map_vals);
    print "MAP value: $avg_map_for_all_queries\n\n";
}
1044              
##  Prints, per query, the precision-at-rank and recall-at-rank arrays
##  rounded to three decimal places (index i corresponds to rank i+1).
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    foreach my $query (sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                       keys %{$self->{_map_vals_for_queries}}) {
        print "\n\nQuery $query:\n";
        print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precision_vals = map { sprintf "%.3f", $_ }
                             @{$self->{_precision_for_queries}->{$query}};
        print " Precision at rank => @precision_vals\n";
        my @recall_vals = map { sprintf "%.3f", $_ }
                          @{$self->{_recall_for_queries}->{$query}};
        print "\n Recall at rank => @recall_vals\n";
    }
    print "\n\n";
}
1064              
##  Returns a ref to the list of per-query average-precision values (as
##  "%.3f"-formatted strings), ordered by the queries' integer suffixes.
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my @average_precisions_for_queries =
        map  { sprintf "%.3f", $self->{_map_vals_for_queries}->{$_} }
        sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
        keys %{$self->{_map_vals_for_queries}};
    return \@average_precisions_for_queries;
}
1078              
1079             ################################### Utility Routines ###################################
1080              
##  Validates constructor option names.  Returns 1 when every supplied name
##  is a recognized option, 0 as soon as an unrecognized one is found (and,
##  like the original, undef when called with no names at all).
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag;
    # Stop at the first option name that is not in the legal list.
    foreach my $param (@params) {
        $found_match_flag = (grep { $param eq $_ } @legal_params) ? 1 : 0;
        last unless $found_match_flag;
    }
    return $found_match_flag;
}
1115              
1116             # checks whether an element is in an array:
##  Returns the number of times $ele appears in the remaining arguments
##  (0 when absent), so the result doubles as a boolean membership test.
##  The original used map in a pseudo-void context for its side effect;
##  grep in scalar context expresses the count directly.
sub contained_in {
    my $ele = shift;
    my @array = @_;
    return scalar grep { $_ eq $ele } @array;
}
1124              
1125             # Meant only for an un-nested hash:
1126             sub deep_copy_hash {
1127 16     16 0 22 my $ref_in = shift;
1128 16         35 my $ref_out = {};
1129 16         24 foreach ( keys %{$ref_in} ) {
  16         649  
1130 1600         1934 $ref_out->{$_} = $ref_in->{$_};
1131             }
1132 16         592 return $ref_out;
1133             }
1134              
##  Dot product of two equal-length numeric vectors (array refs).  Croaks on
##  a length mismatch.  $product is initialized to 0 so empty vectors yield
##  0 rather than undef (the original left it uninitialized).
sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product = 0;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
1146              
##  Euclidean (L2) norm of a numeric vector passed as an array ref.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}
1155              
##  Sum of the elements of an array ref.  $result is initialized to 0 so an
##  empty array yields 0 rather than undef (the original left it
##  uninitialized).  (Local helper; unrelated to List::Util::reduce.)
sub reduce {
    my $vec = shift;
    my $result = 0;
    $result += $_ for @$vec;
    return $result;
}
1164              
1165             sub simple_stemmer {
1166 3310     3310 0 3262 my $word = shift;
1167 3310         2243 my $debug = shift;
1168 3310 50       4352 print "\nStemming the word: $word\n" if $debug;
1169 3310         12280 $word =~ s/(.*[a-z][^aeious])s$/$1/i;
1170 3310         4460 $word =~ s/(.*[a-z]s)es$/$1/i;
1171 3310         4185 $word =~ s/(.*[a-z][ck])es$/$1e/i;
1172 3310         11463 $word =~ s/(.*[a-z]+)tions$/$1tion/i;
1173 3310         9965 $word =~ s/(.*[a-z]+)mming$/$1m/i;
1174 3310         18276 $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1175 3310         10122 $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1176 3310         9483 $word =~ s/(.*[a-z]+)tices$/$1tex/i;
1177 3310         14624 $word =~ s/(.*[a-z]+)pes$/$1pe/i;
1178 3310         13976 $word =~ s/(.*[a-z]+)sed$/$1se/i;
1179 3310         16598 $word =~ s/(.*[a-z]+)ed$/$1/i;
1180 3310         7283 $word =~ s/(.*[a-z]+)tation$/$1t/i;
1181 3310 50       4433 print "Stemmed word: $word\n\n" if $debug;
1182 3310         8614 return $word;
1183             }
1184              
# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;                # 1-D piddle of (descending) singular values
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;              # fraction of the largest singular value
    # Values below (largest value * threshold) are treated as negligible.
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    # Walk forward while values stay above the bound; $i ends one past the
    # last qualifying index.
    my $i = 0;
    while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) {$i++;}
    # NOTE(review): for threshold < 1 the first element always qualifies, so
    # the return value is >= 0; a threshold >= 1 would yield -1 here.
    return $i-1;
}
1196              
##  Intersection of two sets given as array refs; returns a ref to the
##  elements of the second set that also occur in the first (an empty
##  array ref when the sets are disjoint).
sub set_intersection {
    my ($set1, $set2) = @_;
    my %seen;
    @seen{@$set1} = (1) x @$set1;
    my @common_elements = grep { $seen{$_} } @$set2;
    return @common_elements ? \@common_elements : [];
}
1204              
##  Trailing run of digits in a label, e.g. "q17" -> 17.  Returns the empty
##  string when the label does not end in a digit (the \d* always matches).
sub get_integer_suffix {
    my $label = shift;
    my ($suffix) = $label =~ /(\d*)$/;
    return $suffix;
}
1210              
1211             1;
1212              
1213             =pod
1214              
1215             =head1 NAME
1216              
1217             Algorithm::VSM --- A Perl module for retrieving files and documents from a software
1218             library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
1219             algorithms in response to search words and phrases.
1220              
1221             =head1 SYNOPSIS
1222              
1223             # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:
1224              
1225             use Algorithm::VSM;
1226              
1227             my $corpus_dir = "corpus";
1228             my @query = qw/ program ListIterator add ArrayList args /;
1229             my $stop_words_file = "stop_words.txt";
1230             my $vsm = Algorithm::VSM->new(
1231             break_camelcased_and_underscored => 1,
1232             case_sensitive => 0,
1233             corpus_directory => $corpus_dir,
1234             file_types => ['.txt', '.java'],
1235             max_number_retrievals => 10,
1236             min_word_length => 4,
1237             stop_words_file => $stop_words_file,
1238             use_idf_filter => 1,
1239             want_stemming => 1,
1240             );
1241             $vsm->get_corpus_vocabulary_and_word_counts();
1242             $vsm->display_corpus_vocab();
1243             $vsm->display_corpus_vocab_size();
1244             $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
1245             $vsm->display_inverse_document_frequencies();
1246             $vsm->generate_document_vectors();
1247             $vsm->display_doc_vectors();
1248             $vsm->display_normalized_doc_vectors();
1249             my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
1250             $vsm->display_retrievals( $retrievals );
1251              
1252             The purpose of each constructor option and what is accomplished by the method
1253             calls should be obvious by their names. If not, they are explained in greater
1254             detail elsewhere in this documentation page. Note that the methods
1255             display_corpus_vocab() and display_doc_vectors() are there only for testing
1256             purposes with small corpora. If you must use them for large libraries/corpora,
1257             you might wish to redirect the output to a file.
1258              
1259             By default, a call to a constructor calculates normalized term-frequency vectors
1260             for the documents. Normalization consists of first calculating the term
1261             frequency tf(t) of a term t in a document as a proportion of the total numbers
1262             of words in the document and then multiplying it by idf(t), where idf(t) stands
1263             for the inverse document frequency associated with that term. Note that 'word'
1264             and 'term' mean the same thing.
1265              
1266              
1267              
1268             # FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:
1269              
1270             my $lsa = Algorithm::VSM->new(
1271             break_camelcased_and_underscored => 1,
1272             case_sensitive => 0,
1273             corpus_directory => $corpus_dir,
1274             file_types => ['.txt', '.java'],
1275             lsa_svd_threshold => 0.01,
1276             max_number_retrievals => 10,
1277             min_word_length => 4,
1278             stop_words_file => $stop_words_file,
1279             use_idf_filter => 1,
1280             want_stemming => 1,
1281             );
1282             $lsa->get_corpus_vocabulary_and_word_counts();
1283             $lsa->display_corpus_vocab();
1284             $lsa->display_corpus_vocab_size();
1285             $lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
1286             $lsa->generate_document_vectors();
1287             $lsa->construct_lsa_model();
1288             my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
1289             $lsa->display_retrievals( $retrievals );
1290              
1291             The initialization code before the constructor call and the calls for displaying
1292             the vocabulary and the vectors after the call remain the same as for the VSM case
1293             shown previously in this Synopsis. In the call above, the constructor parameter
1294             'lsa_svd_threshold' determines how many of the singular values will be retained
1295             after we have carried out an SVD decomposition of the term-frequency matrix for
1296             the documents in the corpus. Singular values smaller than this threshold
1297             fraction of the largest value are rejected.
1298              
1299              
1300              
1301             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM:
1302              
1303             my $corpus_dir = "corpus";
1304             my $stop_words_file = "stop_words.txt";
1305             my $query_file = "test_queries.txt";
1306             my $relevancy_file = "relevancy.txt"; # All relevancy judgments
1307             # will be stored in this file
1308             my $vsm = Algorithm::VSM->new(
1309             break_camelcased_and_underscored => 1,
1310             case_sensitive => 0,
1311             corpus_directory => $corpus_dir,
1312             file_types => ['.txt', '.java'],
1313             min_word_length => 4,
1314             query_file => $query_file,
1315             relevancy_file => $relevancy_file,
1316             relevancy_threshold => 5,
1317             stop_words_file => $stop_words_file,
1318             want_stemming => 1,
1319             );
1320             $vsm->get_corpus_vocabulary_and_word_counts();
1321             $vsm->generate_document_vectors();
1322             $vsm->estimate_doc_relevancies();
1323             $vsm->display_doc_relevancies(); # used only for testing
1324             $vsm->precision_and_recall_calculator('vsm');
1325             $vsm->display_precision_vs_recall_for_queries();
1326             $vsm->display_map_values_for_queries();
1327              
1328             Measuring precision and recall requires a set of queries. These are supplied
1329             through the constructor parameter 'query_file'. The format of the this file
1330             must be according to the sample file 'test_queries.txt' in the 'examples'
1331             directory. The module estimates the relevancies of the documents to the
1332             queries and dumps the relevancies in a file named by the 'relevancy_file'
1333             constructor parameter. The constructor parameter 'relevancy_threshold' is used
1334             to decide which of the documents are considered to be relevant to a query. A
1335             document must contain at least the 'relevancy_threshold' occurrences of query
1336             words in order to be considered relevant to a query.
1337              
1338              
1339              
1340             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA:
1341              
1342             my $lsa = Algorithm::VSM->new(
1343             break_camelcased_and_underscored => 1,
1344             case_sensitive => 0,
1345             corpus_directory => $corpus_dir,
1346             file_types => ['.txt', '.java'],
1347             lsa_svd_threshold => 0.01,
1348             min_word_length => 4,
1349             query_file => $query_file,
1350             relevancy_file => $relevancy_file,
1351             relevancy_threshold => 5,
1352             stop_words_file => $stop_words_file,
1353             want_stemming => 1,
1354             );
1355             $lsa->get_corpus_vocabulary_and_word_counts();
1356             $lsa->generate_document_vectors();
1357             $lsa->construct_lsa_model();
1358             $lsa->estimate_doc_relevancies();
1359             $lsa->display_doc_relevancies();
1360             $lsa->precision_and_recall_calculator('lsa');
1361             $lsa->display_precision_vs_recall_for_queries();
1362             $lsa->display_map_values_for_queries();
1363              
1364             We have already explained the purpose of the constructor parameter 'query_file'
1365             and about the constraints on the format of queries in the file named through
1366             this parameter. As mentioned earlier, the module estimates the relevancies of
1367             the documents to the queries and dumps the relevancies in a file named by the
1368             'relevancy_file' constructor parameter. The constructor parameter
1369             'relevancy_threshold' is used in deciding which of the documents are considered
1370             to be relevant to a query. A document must contain at least the
1371             'relevancy_threshold' occurrences of query words in order to be considered
1372             relevant to a query. We have previously explained the role of the constructor
1373             parameter 'lsa_svd_threshold'.
1374              
1375              
1376              
1377             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:
1378              
1379             my $corpus_dir = "corpus";
1380             my $stop_words_file = "stop_words.txt";
1381             my $query_file = "test_queries.txt";
1382             my $relevancy_file = "relevancy.txt";
1383             my $vsm = Algorithm::VSM->new(
1384             break_camelcased_and_underscored => 1,
1385             case_sensitive => 0,
1386             corpus_directory => $corpus_dir,
1387             file_types => ['.txt', '.java'],
1388             min_word_length => 4,
1389             query_file => $query_file,
1390             relevancy_file => $relevancy_file,
1391             stop_words_file => $stop_words_file,
1392             want_stemming => 1,
1393             );
1394             $vsm->get_corpus_vocabulary_and_word_counts();
1395             $vsm->generate_document_vectors();
1396             $vsm->upload_document_relevancies_from_file();
1397             $vsm->display_doc_relevancies();
1398             $vsm->precision_and_recall_calculator('vsm');
1399             $vsm->display_precision_vs_recall_for_queries();
1400             $vsm->display_map_values_for_queries();
1401              
1402             Now the filename supplied through the constructor parameter 'relevancy_file' must
1403             contain relevance judgments for the queries that are named in the file supplied
1404             through the parameter 'query_file'. The format of these two files must be
1405             according to what is shown in the sample files 'test_queries.txt' and
1406             'relevancy.txt' in the 'examples' directory.
1407              
1408              
1409              
1410             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:
1411              
1412             my $corpus_dir = "corpus";
1413             my $stop_words_file = "stop_words.txt";
1414             my $query_file = "test_queries.txt";
1415             my $relevancy_file = "relevancy.txt";
1416             my $lsa = Algorithm::VSM->new(
1417             break_camelcased_and_underscored => 1,
1418             case_sensitive => 0,
1419             corpus_directory => $corpus_dir,
1420             file_types => ['.txt', '.java'],
1421             lsa_svd_threshold => 0.01,
1422             min_word_length => 4,
1423             query_file => $query_file,
1424             relevancy_file => $relevancy_file,
1425             stop_words_file => $stop_words_file,
1426             want_stemming => 1,
1427             );
1428             $lsa->get_corpus_vocabulary_and_word_counts();
1429             $lsa->generate_document_vectors();
1430             $lsa->upload_document_relevancies_from_file();
1431             $lsa->display_doc_relevancies();
1432             $lsa->precision_and_recall_calculator('lsa');
1433             $lsa->display_precision_vs_recall_for_queries();
1434             $lsa->display_map_values_for_queries();
1435              
1436             As mentioned for the previous code block, the filename supplied through the
1437             constructor parameter 'relevancy_file' must contain relevance judgments for the
1438             queries that are named in the file supplied through the parameter 'query_file'.
1439             The format of this file must be according to what is shown in the sample file
1440             'relevancy.txt' in the 'examples' directory. We have already explained the roles
1441             played by the constructor parameters such as 'lsa_svd_threshold'.
1442              
1443              
1444              
1445             # FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:
1446              
1447             my $corpus_dir = "corpus";
1448             my $stop_words_file = "stop_words.txt";
1449             my $vsm = Algorithm::VSM->new(
1450             break_camelcased_and_underscored => 1,
1451             case_sensitive => 0,
1452             corpus_directory => $corpus_dir,
1453             file_types => ['.txt', '.java'],
1454             min_word_length => 4,
1455             stop_words_file => $stop_words_file,
1456             want_stemming => 1,
1457             );
1458             $vsm->get_corpus_vocabulary_and_word_counts();
1459             $vsm->generate_document_vectors();
1460             # code for calculating pairwise similarities as shown in the
1461             # script calculate_similarity_matrix_for_all_docs.pl in the
1462             # examples directory. This script makes calls to
1463             #
1464             # $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
1465             #
1466             # for every pair of documents.
1467              
1468             =head1 CHANGES
1469              
1470 Version 1.62 removes the Perl version restriction on the module. This version also
1471 fixes two bugs, one in the file scanner code and the other in the
1472 precision-and-recall calculator. The file scanner bug was related to the new
1473 constructor parameter C<file_types> that was introduced in Version 1.61. And the
1474 precision-and-recall calculator bug was triggered if a query consisted solely of
1475 non-vocabulary words.
1476              
1477             Version 1.61 improves the implementation of the directory scanner to make it more
1478             platform independent. Additionally, you are now required to specify in the
1479             constructor call the file types to be considered for computing the database model.
1480             If, say, you have a large software library and you want only Java and text files to
1481 be scanned for creating the VSM (or the LSA) model, you must supply that information
1482 to the module by setting the constructor parameter C<file_types> to the anonymous
1483 list C<['.java', '.txt']>. An additional constructor parameter introduced in this
1484 version is C<case_sensitive>. If you set it to 1, that will force the database model
1485 and query matching to become case sensitive.
1486              
1487             Version 1.60 reflects the fact that people are now more likely to use this module by
1488             keeping the model constructed for a corpus in the fast memory (as opposed to storing
1489             the models in disk-based hash tables) for its repeated invocation for different
1490             queries. As a result, the default value for the constructor option
1491 C<save_model_on_disk> was changed from 1 to 0. For those who still wish to store on
1492             a disk the model that is constructed, the script
1493             C shows how you can do that.
1494             Other changes in 1.60 include a slight reorganization of the scripts in the
1495             C directory. Most scripts now do not by default store their models in
1496             disk-based hash tables. This reorganization is reflected in the description of the
1497             C directory in this documentation. The basic logic of constructing VSM and
1498             LSA models and how these are used for retrievals remains unchanged.
1499              
1500             Version 1.50 incorporates a couple of new features: (1) You now have the option to
1501             split camel-cased and underscored words for constructing your vocabulary set; and (2)
1502             Storing the VSM and LSA models in database files on the disk is now optional. The
1503             second feature, in particular, should prove useful to those who are using this module
1504             for large collections of documents.
1505              
1506 Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
1507 C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
1508 You can get a better sense of the overall vocabulary being used by the module for
1509 file retrieval by examining the contents of a dump file whose name is supplied as an
1510 argument to C<write_corpus_vocab_to_file()>.
1511              
1512             Version 1.41 downshifts the required version of the PDL module. Also cleaned up are
1513             the dependencies between this module and the submodules of PDL.
1514              
1515             Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
1516             documents in the corpus. The elements of such a matrix express pairwise similarities
1517             between the documents. The pairwise similarities are based on the dot product of two
1518             document vectors divided by the product of the vector magnitudes. The 'examples'
1519             directory contains two scripts to illustrate how such matrices can be calculated by
1520             the user. The similarity matrix is output as a CSV file.
1521              
1522             Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
1523             document file. What that means is that the words that appear in most of the documents
1524             get reduced weighting since such words are non-discriminatory with respect to the
1525             retrieval of the documents. A typical formula that is used to calculate the IDF
1526             weight for a word is the logarithm of the ratio of the total number of documents to
1527             the number of documents in which the word appears. So if a word were to appear in
1528             all the documents, its IDF multiplier would be zero in the vector representation of a
1529             document. If so desired, you can turn off the IDF weighting of the words by
1530 explicitly setting the constructor parameter C<use_idf_filter> to zero.
1531              
1532             Version 1.2 includes a code correction and some general code and documentation
1533             cleanup.
1534              
1535             With Version 1.1, you can access the retrieval precision results so that you can
1536             compare two different retrieval algorithms (VSM or LSA with different choices for
1537             some of the constructor parameters) with significance testing. (Version 1.0 merely
1538             sent those results to standard output, typically your terminal window.) In Version
1539             1.1, the new script B in the 'examples' directory
1540             illustrates significance testing with Randomization and with Student's Paired t-Test.
1541              
1542             =head1 DESCRIPTION
1543              
1544             B is a I module for constructing a Vector Space Model (VSM) or
1545             a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
1546             to as a corpus, and then retrieving the documents in response to search words in a
1547             query.
1548              
1549             VSM and LSA models have been around for a long time in the Information Retrieval (IR)
1550             community. More recently such models have been shown to be effective in retrieving
1551             files/documents from software libraries. For an account of this research that was
1552             presented by Shivani Rao and the author of this module at the 2011 Mining Software
1553             Repositories conference, see L.
1554              
1555             VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2)
1556             Stemming the words so extracted and eliminating the designated stop words from the
1557             vocabulary. Stemming means that closely related words like 'programming' and
1558             'programs' are reduced to the common root word 'program' and the stop words are the
1559             non-discriminating words that can be expected to exist in virtually all the
1560             documents. (3) Constructing document vectors for the individual files in the corpus
1561             --- the document vectors taken together constitute what is usually referred to as a
1562             'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
1563             factor out the effect of document size and, if desired, multiplying the term
1564             frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
1565             the weight of the words that appear in a large number of documents. (5) Constructing
1566             a query vector for the search query after the query is subject to the same stemming
1567             and stop-word elimination rules that were applied to the corpus. And, lastly, (6)
1568             Using a similarity metric to return the set of documents that are most similar to the
1569             query vector. The commonly used similarity metric is one based on the cosine
1570             distance between two vectors. Also note that all the vectors mentioned here are of
1571             the same size, the size of the vocabulary. An element of a vector is the frequency
1572             of occurrence of the word corresponding to that position in the vector.
1573              
1574             LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one
1575             step further by subjecting the term-frequency matrix for the corpus to singular value
1576             decomposition (SVD). By retaining only a subset of the singular values (usually the
1577             N largest for some value of N), you can construct reduced-dimensionality vectors for
1578             the documents and the queries. In VSM, as mentioned above, the size of the document
1579             and the query vectors is equal to the size of the vocabulary. For large corpora,
1580             this size may involve tens of thousands of words --- this can slow down the VSM
1581             modeling and retrieval process. So you are very likely to get faster performance
1582             with retrieval based on LSA modeling, especially if you store the model once
1583             constructed in a database file on the disk and carry out retrievals using the
1584             disk-based model.
1585              
1586              
1587             =head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?
1588              
1589             This module has only been tested for software retrieval. For more general text
1590             retrieval, you would need to replace the simple stemmer used in the module by one
1591             based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the
1592             list of stop words appropriate to the text corpora of interest to you. As previously
1593             mentioned, the stop words are the commonly occurring words that do not carry much
1594             discriminatory power from the standpoint of distinguishing between the documents.
1595             See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
1596             formatted.
1597              
1598              
1599             =head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?
1600              
1601             It is not uncommon for large software libraries to consist of tens of thousands of
1602             documents that include source-code files, documentation files, README files,
1603             configuration files, etc. The bug-localization work presented recently by Shivani
1604             Rao and this author at the 2011 Mining Software Repository conference (MSR11) was
1605             based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
1606             size of 7553 unique words. (Here is a link to this work:
1607             L. Also note that the iBUGS dataset
1608             was originally put together by V. Dallmeier and T. Zimmermann for the evaluation of
1609             automated bug detection and localization tools.) If C is the size of the
1610             vocabulary and C the number of the documents in the corpus, the size of each
1611             vector will be C and size of the term-frequency matrix for the entire corpus will
1612             be CxC. So if you were to duplicate the bug localization experiments in
1613             L you would be dealing with vectors of
1614             size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers
1615             to really large libraries/corpora, we are obviously talking about very large matrices
1616             for SVD decomposition. For large libraries/corpora, it would be best to store away
1617             the model in a disk file and to base all subsequent retrievals on the disk-stored
1618             models. The 'examples' directory contains scripts that carry out retrievals on the
1619             basis of disk-based models. Further speedup in retrieval can be achieved by using
1620             LSA to create reduced-dimensionality representations for the documents and by basing
1621             retrievals on the stored versions of such reduced-dimensionality representations.
1622              
1623              
1624             =head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS
1625              
1626             The performance of a retrieval algorithm is typically measured by two properties:
1627             C and C. As mentioned in the
1628             L publication, at a given rank C,
1629             C is the ratio of the number of retrieved documents that are relevant to
1630             the total number of retrieved documents up to that rank. And, along the same lines,
1631             C at a given rank C is the ratio of the number of retrieved documents that
1632             are relevant to the total number of relevant documents. The area under the
1633             C--C curve is called the C for a query. When
1634             the C is averaged over all the queries, we obtain what is known as
1635             C (MAP). For an oracle, the value of MAP should be 1.0. On
1636             the other hand, for purely random retrieval from a corpus, the value of MAP will be
1637             inversely proportional to the size of the corpus. (See the discussion in
1638             L for further
1639             explanation on these retrieval precision evaluators.) This module includes methods
1640             that allow you to carry out these retrieval accuracy measurements using the relevancy
1641             judgments supplied through a disk file. If human-supplied relevancy judgments are
1642             not available, the module will be happy to estimate relevancies for you just by
1643             determining the number of query words that exist in a document. Note, however, that
1644             relevancy judgments estimated in this manner cannot be trusted. That is because
1645             ultimately it is the humans who are the best judges of the relevancies of documents
1646             to queries. The humans bring to bear semantic considerations on the relevancy
1647             determination problem that are beyond the scope of this module.
1648              
1649              
1650             =head1 METHODS
1651              
1652             The module provides the following methods for constructing VSM and LSA models of a
1653             corpus, for using the models thus constructed for retrieval, and for carrying out
1654             precision versus recall calculations for the determination of retrieval accuracy on
1655             the corpora of interest to you.
1656              
1657             =over
1658              
1659             =item B
1660              
1661             A call to C constructs a new instance of the C class:
1662              
1663             my $vsm = Algorithm::VSM->new(
1664             break_camelcased_and_underscored => 1,
1665             case_sensitive => 0,
1666             corpus_directory => "",
1667             corpus_vocab_db => "corpus_vocab_db",
1668             doc_vectors_db => "doc_vectors_db",
1669             file_types => $my_file_types,
1670             lsa_svd_threshold => 0.01,
1671             max_number_retrievals => 10,
1672             min_word_length => 4,
1673             normalized_doc_vecs_db => "normalized_doc_vecs_db",
1674             query_file => "",
1675             relevancy_file => $relevancy_file,
1676             relevancy_threshold => 5,
1677             save_model_on_disk => 0,
1678             stop_words_file => "",
1679             use_idf_filter => 1,
1680             want_stemming => 1,
1681             );
1682              
1683             The values shown on the right side of the big arrows are the B
1684             parameters>. The value supplied through the variable C<$my_file_types> would be
1685             something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
1686             be included in creating the database model. The following nested list will now
1687             describe each of the constructor parameters shown above:
1688              
1689             =over 16
1690              
1691             =item I<break_camelcased_and_underscored>
1692              
1693             The parameter B<break_camelcased_and_underscored> when set causes the
1694             underscored and camel-cased words to be split. By default the parameter is
1695             set. So if you don't want such words to be split, you must set it
1696             explicitly to 0.
1697              
1698             =item I
1699              
1700             When set to 1, this parameter forces the module to maintain the case of the terms in
1701             the corpus files when creating the vocabulary and the document vectors. Setting
1702             C to 1 also causes the query matching to become case sensitive.
1703             (This constructor parameter was introduced in Version 1.61.)
1704              
1705             =item I
1706              
1707             The parameter B points to the root of the directory of documents
1708             for which you want to create a VSM or LSA model.
1709              
1710             =item I
1711              
1712             The parameter B is for naming the DBM in which the corpus vocabulary
1713             will be stored after it is subject to stemming and the elimination of stop words.
1714             Once a disk-based VSM model is created and stored away in the file named by this
1715             parameter and the parameter to be described next, it can subsequently be used
1716             directly for speedier retrieval.
1717              
1718             =item I
1719              
1720             The database named by B stores the document vector representation for
1721             each document in the corpus. Each document vector has the same size as the
1722             corpus-wide vocabulary; each element of such a vector is the number of occurrences of
1723             the word that corresponds to that position in the vocabulary vector.
1724              
1725             =item I
1726              
1727             This parameter tells the module what types of files in the corpus directory you want
1728             scanned for creating the database model. The value supplied for this parameter is an
1729             anonymous list of the file suffixes for the file types. For example, if you wanted
1730             only Java and text files to be scanned, you will set this parameter to C<['.java',
1731             '.txt']>. The module throws an exception if this parameter is left unspecified.
1732             (This constructor parameter was introduced in Version 1.61.)
1733              
1734             =item I<lsa_svd_threshold:>
1735              
1736             The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
1737             smaller than this threshold fraction of the largest singular value. This plays a
1738             critical role in creating reduced-dimensionality document vectors in LSA modeling of
1739             a corpus.
1740              
1741             =item I<max_number_retrievals:>
1742              
1743             The constructor parameter B<max_number_retrievals> stands for what it means.
1744              
1745             =item I<min_word_length:>
1746              
1747             The parameter B<min_word_length> sets the minimum number of characters in a
1748             word in order for it to be included in the corpus vocabulary.
1749              
1750             =item I<normalized_doc_vecs_db:>
1751              
1752             The database named by B<normalized_doc_vecs_db> stores the normalized document
1753             vectors. Normalization consists of factoring out the size of the documents by
1754             dividing the term frequency for each word in a document by the number of words in the
1755             document, and then multiplying the result by the idf (Inverse Document Frequency)
1756             value for the word.
1757              
1758             =item I<query_file:>
1759              
1760             The parameter B<query_file> points to a file that contains the queries to be used for
1761             calculating retrieval performance with C<Precision> and C<Recall> numbers. The format
1762             of the query file must be as shown in the sample file C<test_queries.txt> in the
1763             'examples' directory.
1764              
1765             =item I<relevancy_file:>
1766              
1767             This option names the disk file for storing the relevancy judgments.
1768              
1769             =item I<relevancy_threshold:>
1770              
1771             The constructor parameter B<relevancy_threshold> is used for automatic determination
1772             of document relevancies to queries on the basis of the number of occurrences of query
1773             words in a document. You can exercise control over the process of determining
1774             relevancy of a document to a query by giving a suitable value to the constructor
1775             parameter B<relevancy_threshold>. A document is considered relevant to a query only
1776             when the document contains at least B<relevancy_threshold> number of query words.
1777              
1778             =item I<save_model_on_disk:>
1779              
1780             The constructor parameter B<save_model_on_disk> will cause the basic
1781             information about the VSM and the LSA models to be stored on the disk.
1782             Subsequently, any retrievals can be carried out from the disk-based model.
1783              
1784             =item I<stop_words_file:>
1785              
1786             The parameter B<stop_words_file> is for naming the file that contains the stop words
1787             that you do not wish to include in the corpus vocabulary. The format of this file
1788             must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.
1789              
1790             =item I<use_idf_filter:>
1791              
1792             The constructor parameter B<use_idf_filter> is set by default. If you want
1793             to turn off the normalization of the document vectors, including turning
1794             off the weighting of the term frequencies of the words by their idf values,
1795             you must set this parameter explicitly to 0.
1796              
1797             =item I<want_stemming:>
1798              
1799             The boolean parameter B<want_stemming> determines whether or not the words extracted
1800             from the documents would be subject to stemming. As mentioned elsewhere, stemming
1801             means that related words like 'programming' and 'programs' would both be reduced to
1802             the root word 'program'.
1803              
1804             =back
1805              
1806             =begin html
1807              
1808            
1809              
1810             =end html
1811              
1812             =item B<construct_lsa_model()>
1813              
1814             You call this subroutine for constructing an LSA model for your corpus
1815             after you have extracted the corpus vocabulary and constructed document
1816             vectors:
1817              
1818             $vsm->construct_lsa_model();
1819              
1820             The SVD decomposition that is carried out in LSA model construction uses the
1821             constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
1822             to retain for the LSA model. A singular value is retained only if it is larger than the
1823             C<lsa_svd_threshold> fraction of the largest singular value.
1824              
1825              
1826             =item B<display_corpus_vocab()>
1827              
1828             If you would like to see corpus vocabulary as constructed by the previous call, make
1829             the call
1830              
1831             $vsm->display_corpus_vocab();
1832              
1833             Note that this is a useful thing to do only on small test corpora. If you need
1834             to examine the vocabulary for a large corpus, call the two methods listed below.
1835              
1836              
1837             =item B<display_corpus_vocab_size()>
1838              
1839             If you would like for the module to print out in your terminal window the size of the
1840             vocabulary, make the call
1841              
1842             $vsm->display_corpus_vocab_size();
1843              
1844              
1845             =item B<display_doc_relevancies()>
1846              
1847             If you would like to see the document relevancies generated by the previous method,
1848             you can call
1849              
1850             $vsm->display_doc_relevancies()
1851              
1852              
1853             =item B<display_doc_vectors()>
1854              
1855             If you would like to see the document vectors constructed by the previous call, make
1856             the call:
1857              
1858             $vsm->display_doc_vectors();
1859              
1860             Note that this is a useful thing to do only on small test corpora. If you must call
1861             this method on a large corpus, you might wish to direct the output to a file.
1862              
1863              
1864             =item B<display_inverse_document_frequencies()>
1865              
1866             You can display the idf value associated with each word in the corpus by
1867              
1868             $vsm->display_inverse_document_frequencies();
1869              
1870             The idf of a word in the corpus is calculated typically as the logarithm of the ratio
1871             of the total number of documents in the corpus to the number of documents in which
1872             the word appears (with protection built in to prevent division by zero). Ideally, if
1873             a word appears in all the documents, its idf would be small, close to zero. Words
1874             with small idf values are non-discriminatory and should get reduced weighting in
1875             document retrieval.
1876              
1877              
1878             =item B<display_map_values_for_queries()>
1879              
1880             The area under the precision vs. recall curve for a given query is called C<Average
1881             Precision> for that query. When this area is averaged over all the queries, you get
1882             C<MAP> (Mean Average Precision) as a measure of the accuracy of the retrieval
1883             algorithm. The C<Average Precision> values for the queries and the overall C<MAP>
1884             can be printed out by calling
1885              
1886             $vsm->display_map_values_for_queries();
1887              
1888              
1889             =item B<display_normalized_doc_vectors()>
1890              
1891             If you would like to see the normalized document vectors, make the call:
1892              
1893             $vsm->display_normalized_doc_vectors();
1894              
1895             See the comment made previously as to what is meant by the normalization of a
1896             document vector.
1897              
1898              
1899             =item B<display_precision_vs_recall_for_queries()>
1900              
1901             A call to C<precision_and_recall_calculator()> will normally be followed by the
1902             following call
1903              
1904             $vsm->display_precision_vs_recall_for_queries();
1905              
1906             for displaying the C<Precision> and C<Recall> values.
1907              
1908              
1909             =item B<display_retrievals( $retrievals )>
1910              
1911             You can display the retrieved document names by calling this method using the syntax:
1912              
1913             $vsm->display_retrievals( $retrievals );
1914              
1915             where C<$retrievals> is a reference to the hash returned by a call to one of the
1916             C<retrieve> methods. The display method shown here respects the retrieval size
1917             constraints expressed by the constructor parameter C<max_number_retrievals>.
1918              
1919              
1920             =item B<estimate_doc_relevancies()>
1921              
1922             Before you can carry out precision and recall calculations to test the accuracy of
1923             VSM and LSA based retrievals from a corpus, you need to have available the relevancy
1924             judgments for the queries. (A relevancy judgment for a query is simply the list of
1925             documents relevant to that query.) Relevancy judgments are commonly supplied by the
1926             humans who are familiar with the corpus. But if such human-supplied relevance
1927             judgments are not available, you can invoke the following method to estimate them:
1928              
1929             $vsm->estimate_doc_relevancies();
1930              
1931             For the above method call, a document is considered to be relevant to a query if it
1932             contains several of the query words. As to the minimum number of query words that
1933             must exist in a document in order for the latter to be considered relevant, that is
1934             determined by the C<relevancy_threshold> parameter in the VSM constructor.
1935              
1936             But note that this estimation of document relevancies to queries is NOT for serious
1937             work. The reason for that is because ultimately it is the humans who are the best
1938             judges of the relevancies of documents to queries. The humans bring to bear semantic
1939             considerations on the relevancy determination problem that are beyond the scope of
1940             this module.
1941              
1942             The generated relevancies are deposited in a file named by the constructor parameter
1943             C<relevancy_file>.
1944              
1945              
1946             =item B<get_all_document_names()>
1947              
1948             If you want to get hold of all the filenames in the corpus in your own script, you
1949             can call
1950              
1951             my @docs = @{$vsm->get_all_document_names()};
1952              
1953             The array on the left will contain an alphabetized list of the files.
1954              
1955              
1956             =item B<generate_document_vectors()>
1957              
1958             This is a necessary step after the vocabulary used by a corpus is constructed. (Of
1959             course, if you will be doing document retrieval through a disk-stored VSM or LSA
1960             model, then you do not need to call this method.) You construct document vectors
1961             through the following call:
1962              
1963             $vsm->generate_document_vectors();
1964              
1965              
1966             =item B<get_corpus_vocabulary_and_word_counts()>
1967              
1968             After you have constructed a new instance of the C class, you must
1969             now scan the corpus documents for constructing the corpus vocabulary. This you do by:
1970              
1971             $vsm->get_corpus_vocabulary_and_word_counts();
1972              
1973             The only time you do NOT need to call this method is when you are using a previously
1974             constructed disk-stored VSM model for retrieval.
1975              
1976              
1977             =item B<get_query_sorted_average_precision_for_queries()>
1978              
1979             If you want to run significance tests on the retrieval accuracies you obtain on a
1980             given corpus and with different algorithms (VSM or LSA with different choices for the
1981             constructor parameters), your own script would need access to the average precision
1982             data for a set of queries. You can get hold of this data by calling
1983              
1984             $vsm->get_query_sorted_average_precision_for_queries();
1985              
1986             The script C<significance_testing.pl> in the 'examples' directory shows how you can
1987             use this method for significance testing.
1988              
1989              
1990             =item B<pairwise_similarity_for_docs()>
1991              
1992             =item B<pairwise_similarity_for_normalized_docs()>
1993              
1994             If you would like to compare in your own script any two documents in the corpus, you
1995             can call
1996              
1997             my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");
1998             or
1999             my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");
2000              
2001             Both these calls return a number that is the dot product of the two document vectors
2002             normalized by the product of their magnitudes. The first call uses the regular
2003             document vectors and the second the normalized document vectors.
2004              
2005              
2006             =item B<precision_and_recall_calculator()>
2007              
2008             After you have created or obtained the relevancy judgments for your test queries, you
2009             can make the following call to calculate C<Precision> and C<Recall>:
2010              
2011             $vsm->precision_and_recall_calculator('vsm');
2012             or
2013             $vsm->precision_and_recall_calculator('lsa');
2014              
2015             depending on whether you are testing VSM-based retrieval or LSA-based retrieval.
2016              
2017             =item B<retrieve_with_lsa()>
2018              
2019             After you have built an LSA model through the call to C<construct_lsa_model()>, you
2020             can retrieve the document names most similar to the query by:
2021              
2022             my $retrievals = $vsm->retrieve_with_lsa( \@query );
2023              
2024             Subsequently, you can display the retrievals by calling the
2025             C<display_retrievals()> method described previously.
2026              
2027              
2028             =item B<retrieve_with_vsm()>
2029              
2030             After you have constructed a VSM model, you call this method for document retrieval
2031             for a given query C<@query>. The call syntax is:
2032              
2033             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2034              
2035             The argument, C<@query>, is simply a list of words that you wish to use for
2036             retrieval. The method returns a hash whose keys are the document names and whose
2037             values the similarity distance between the document and the query. As is commonly
2038             the case with VSM, this module uses the cosine similarity distance when comparing a
2039             document vector with the query vector.
2040              
2041              
2042             =item B<upload_document_relevancies_from_file()>
2043              
2044             When human-supplied relevancies are available, you can upload them into the program
2045             by calling
2046              
2047             $vsm->upload_document_relevancies_from_file();
2048              
2049             These relevance judgments will be read from a file that is named with the
2050             C<relevancy_file> constructor parameter.
2051              
2052              
2053             =item B<upload_normalized_vsm_model_from_disk()>
2054              
2055             When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
2056             C<generate_document_vectors()>, that automatically deposits the VSM model in the
2057             database files named with the constructor parameters C<corpus_vocab_db>,
2058             C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out
2059             retrieval by directly using this disk-based VSM model for speedier performance. In
2060             order to do so, you must upload the disk-based model by
2061              
2062             $vsm->upload_normalized_vsm_model_from_disk();
2063              
2064             Subsequently you call
2065              
2066             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2067             $vsm->display_retrievals( $retrievals );
2068              
2069             for retrieval and for displaying the results.
2070              
2071              
2072             =item B<write_corpus_vocab_to_file($filename)>
2073              
2074             This is the method to call for large text corpora if you would like to examine the
2075             vocabulary created. The call syntax is
2076              
2077             $vsm->write_corpus_vocab_to_file($filename);
2078              
2079             where C<$filename> is the name of the file that you want the vocabulary to be written
2080             out to. This call will also show the frequency of each vocabulary word in your
2081             corpus.
2082              
2083              
2084             =back
2085              
2086              
2087             =head1 REQUIRED
2088              
2089             This module requires the following modules:
2090              
2091             SDBM_File
2092             Storable
2093             PDL
2094             File::Basename
2095             File::Spec::Functions
2096              
2097             The first two of these are needed for creating disk-based database records for the
2098             VSM and LSA models. The third is needed for calculating the SVD of the
2099             term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
2100             by the directory scanner to make pathnames platform independent.
2101              
2102             =head1 EXAMPLES
2103              
2104             See the 'examples' directory in the distribution for the scripts listed below:
2105              
2106             =over
2107              
2108             =item B
2109              
2110             For basic VSM-based model construction and retrieval, run the script:
2111              
2112             retrieve_with_VSM.pl
2113              
2114             Starting with version 1.60, this script does not store away the VSM model in
2115             disk-based hash tables. If you want your model to be stored on the disk, you must
2116             run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.
2117              
2118             =item B
2119              
2120             If you want to run an infinite loop for repeated retrievals from a VSM model, run the
2121             script
2122              
2123             continuously_running_VSM_retrieval_engine.pl
2124              
2125             You can create a script similar to this for doing the same with LSA models.
2126              
2127             =item B
2128              
2129             For storing the model information in disk-based DBM files that can subsequently be
2130             used for both VSM and LSA retrieval, run the script:
2131              
2132             retrieve_with_VSM_and_also_create_disk_based_model.pl
2133              
2134             =item B
2135              
2136             For basic LSA-based model construction and retrieval, run the script:
2137              
2138             retrieve_with_LSA.pl
2139              
2140             Starting with version 1.60, this script does not store away the model information in
2141             disk-based hash tables. If you want your model to be stored on the disk, you must
2142             run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.
2143              
2144             =item B
2145              
2146             If you have previously run a script like
2147             C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script
2148              
2149             retrieve_with_disk_based_VSM.pl
2150              
2151             for repeated VSM-based retrievals from a disk-based model.
2152              
2153             =item B
2154              
2155             If you have previously run a script like
2156             C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script
2157              
2158             retrieve_with_disk_based_LSA.pl
2159              
2160             for repeated LSA-based retrievals from a disk-based model.
2161              
2162             =item B
2163              
2164             To experiment with precision and recall calculations for VSM retrieval, run the
2165             script:
2166              
2167             calculate_precision_and_recall_for_VSM.pl
2168              
2169             Note that this script will carry out its own estimation of relevancy judgments ---
2170             which in most cases would not be a safe thing to do.
2171              
2172             =item B
2173              
2174             To experiment with precision and recall calculations for LSA retrieval, run the
2175             script:
2176              
2177             calculate_precision_and_recall_for_LSA.pl
2178              
2179             Note that this script will carry out its own estimation of relevancy judgments ---
2180             which in most cases would not be a safe thing to do.
2181              
2182             =item B<For Precision and Recall Calculations for VSM with
2183             Human-Supplied Relevancies:>
2184              
2185             Precision and recall calculations for retrieval accuracy determination are best
2186             carried out with human-supplied judgments of relevancies of the documents to queries.
2187             If such judgments are available, run the script:
2188              
2189             calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl
2190              
2191             This script will print out the average precisions for the different test queries and
2192             calculate the MAP metric of retrieval accuracy.
2193              
2194             =item B<For Precision and Recall Calculations for LSA with
2195             Human-Supplied Relevancies:>
2196              
2197             If human-supplied relevancy judgments are available and you wish to experiment with
2198             precision and recall calculations for LSA-based retrieval, run the script:
2199              
2200             calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl
2201              
2202             This script will print out the average precisions for the different test queries and
2203             calculate the MAP metric of retrieval accuracy.
2204              
2205             =item B<To carry out significance tests on the retrieval accuracies with
2206             Randomization or with Student's Paired t-Test:>
2207              
2208             significance_testing.pl randomization
2209              
2210             or
2211              
2212             significance_testing.pl t-test
2213              
2214             Significance testing consists of forming a null hypothesis that the two retrieval
2215             algorithms you are considering are the same from a black-box perspective and then
2216             calculating what is known as a C<p-value>. If the C<p-value> is less than, say,
2217             0.05, you reject the null hypothesis.
2218              
2219             =item B
2220              
2221             calculate_similarity_matrix_for_all_docs.pl
2222              
2223             or
2224              
2225             calculate_similarity_matrix_for_all_normalized_docs.pl
2226              
2227             The former uses regular document vectors for calculating the similarity between every
2228             pair of documents in the corpus. And the latter uses normalized document vectors for
2229             the same purpose. The document order used for row and column indexing of the matrix
2230             corresponds to the alphabetic ordering of the document names in the corpus directory.
2231              
2232             =back
2233              
2234              
2235             =head1 EXPORT
2236              
2237             None by design.
2238              
2239             =head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS
2240              
2241             You have to be careful when carrying out Precision versus Recall calculations if you
2242             do not wish to lose the previously created relevancy judgments. Invoking the method
2243             C<estimate_doc_relevancies()> in your own script will cause the file C<relevancy.txt>
2244             to be overwritten. If you have created a relevancy database and stored it in a file
2245             called, say, C<relevancy.txt>, you should make a backup copy of this file before
2246             executing a script that calls C<estimate_doc_relevancies()>.
2247              
2248             =head1 BUGS
2249              
2250             Please notify the author if you encounter any bugs. When sending email, please place
2251             the string 'VSM' in the subject line to get past my spam filter.
2252              
2253             =head1 INSTALLATION
2254              
2255             Download the archive from CPAN in any directory of your choice. Unpack the archive
2256             with a command that on a Linux machine would look like:
2257              
2258             tar zxvf Algorithm-VSM-1.62.tar.gz
2259              
2260             This will create an installation directory for you whose name will be
2261             C<Algorithm-VSM-1.62>. Enter this directory and execute the following commands for a
2262             standard install of the module if you have root privileges:
2263              
2264             perl Makefile.PL
2265             make
2266             make test
2267             sudo make install
2268              
2269             If you do not have root privileges, you can carry out a non-standard install the
2270             module in any directory of your choice by:
2271              
2272             perl Makefile.PL prefix=/some/other/directory/
2273             make
2274             make test
2275             make install
2276              
2277             With a non-standard install, you may also have to set your PERL5LIB environment
2278             variable so that this module can find the required other modules. How you do that
2279             would depend on what platform you are working on. In order to install this module in
2280             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2281             variable by
2282              
2283             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2284              
2285             If I used bash, I'd need to declare:
2286              
2287             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2288              
2289              
2290             =head1 THANKS
2291              
2292             Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
2293             deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in
2294             similarity matrix characterization of a corpus. Thanks, Zahn!
2295              
2296             Several of the recent changes to the module are a result of the feedback I have
2297             received from Naveen Kulkarni of Infosys Labs. Thanks, Naveen!
2298              
2299             Version 1.62 was a result of Slaven Rezic's recommendation that I remove the Perl
2300             version restriction on the module since he was able to run it with Perl version
2301             5.8.9. Another important reason for v. 1.62 was the discovery of the two bugs
2302             mentioned in Changes, one of them brought to my attention by Naveen Kulkarni.
2303              
2304             =head1 AUTHOR
2305              
2306             Avinash Kak, kak@purdue.edu
2307              
2308             If you send email, please place the string "VSM" in your subject line to get past my
2309             spam filter.
2310              
2311             =head1 COPYRIGHT
2312              
2313             This library is free software; you can redistribute it and/or modify it under the
2314             same terms as Perl itself.
2315              
2316             Copyright 2015 Avinash Kak
2317              
2318             =cut
2319              
2320