File Coverage

blib/lib/Algorithm/VSM.pm
Criterion Covered Total %
statement 351 822 42.7
branch 97 338 28.7
condition 25 67 37.3
subroutine 31 54 57.4
pod 24 35 68.5
total 528 1316 40.1


line stmt bran cond sub pod time code
1             package Algorithm::VSM;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::VSM is a Perl module for retrieving the documents from a software
9             # library that match a list of words in a query. The matching criterion used depends
10             # on whether you ask the module to construct a full-dimensionality VSM or a
11             # reduced-dimensionality LSA model for the library.
12             # ---------------------------------------------------------------------------
13              
14             #use 5.10.0;
15 1     1   33328 use strict;
  1         3  
  1         37  
16 1     1   6 use warnings;
  1         2  
  1         37  
17 1     1   7 use Carp;
  1         7  
  1         103  
18 1     1   882 use SDBM_File;
  1         29572  
  1         104  
19 1     1   1082 use PDL::Lite;
  1         217243  
  1         33  
20 1     1   9 use PDL::MatrixOps;
  1         2  
  1         9  
21 1     1   217 use File::Basename;
  1         2  
  1         94  
22 1     1   837 use File::Spec::Functions qw(rel2abs);
  1         929  
  1         68  
23 1     1   13 use Fcntl;
  1         3  
  1         272  
24 1     1   5464 use Storable;
  1         15157  
  1         83  
25 1     1   8 use Cwd;
  1         2  
  1         13366  
26              
27             our $VERSION = '1.70';
28              
29             # for camelcase splits (from perlmonks):
30             my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
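
As a quick aside, here is a minimal sketch of how this regex splits identifiers once underscores and other non-word characters have been removed; the identifier below is a made-up example, not taken from the module:

    # Hypothetical demo of the camelcase split used throughout this module:
    my @brokenup = grep $_, split /\W|_|\s+/, "getHTTPResponse_code2";
    # @brokenup is now ("getHTTPResponse", "code2")
    my @clean_words = map { $_ =~ /$_regex/g } @brokenup;
    # @clean_words is now ("get", "HTTP", "Response", "code2")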
31              
32             ################################### Constructor #######################################
33              
34             # Constructor for creating a VSM or LSA model of a corpus. The model instance
35             # returned by the constructor can be used for retrieving documents from the corpus
36             # in response to queries.
37             sub new {
38 2     2 1 1315 my ($class, %args) = @_;
39 2         13 my @params = keys %args;
40 2 50       12 croak "\nYou have used a wrong name for a keyword argument " .
41             "--- perhaps a misspelling\n"
42             if _check_for_illegal_params(@params) == 0;
43             bless {
44             _corpus_directory => $args{corpus_directory} || "",
45             _save_model_on_disk => $args{save_model_on_disk} || 0,
46             _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
47             $args{break_camelcased_and_underscored} : 1,
48             _corpus_vocab_db => $args{corpus_vocab_db} || "corpus_vocab_db",
49             _doc_vectors_db => $args{doc_vectors_db} || "doc_vectors_db",
50             _normalized_doc_vecs_db => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
51             _stop_words_file => $args{stop_words_file} || "",
52             _case_sensitive => $args{case_sensitive} || 0,
53             _query_file => $args{query_file} || "",
54             _file_types => $args{file_types} || [],
55             _min_word_length => $args{min_word_length} || 4,
56             _want_stemming => $args{want_stemming} || 0,
57             _idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
58             _max_number_retrievals => $args{max_number_retrievals} || 30,
59             _lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01,
60             _relevancy_threshold => exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
61             _relevancy_file => $args{relevancy_file} || "",
62 2 50 50     9796 _debug => $args{debug} || 0,
    50 50        
    50 50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      100        
      50        
      50        
63             _working_directory => cwd,
64             _vocab_hist_on_disk => {},
65             _vocab_hist => {},
66             _doc_hist_template => {},
67             _corpus_doc_vectors => {},
68             _normalized_doc_vecs => {},
69             _query_vector => {},
70             _stop_words => [],
71             _term_document_matrix => [],
72             _corpus_vocab_done => 0,
73             _scan_dir_for_rels => 0,
74             _vocab_size => undef,
75             _doc_vecs_trunc_lsa => {},
76             _lsa_vec_truncator => undef,
77             _queries_for_relevancy => {},
78             _relevancy_estimates => {},
79             _precision_for_queries => {},
80             _avg_precision_for_queries => {},
81             _recall_for_queries => {},
82             _map => undef,
83             _vocab_idf_hist => {},
84             _idf_t => {},
85             _total_num_of_docs => 0,
86             }, $class;
87             }
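
For orientation, a minimal, hypothetical constructor call is sketched below; the directory name and file types are illustrative assumptions, not defaults:

    use Algorithm::VSM;
    my $vsm = Algorithm::VSM->new(
        corpus_directory => 'corpus',    # assumed corpus location
        file_types       => ['.txt'],    # suffixes of the files to index
        min_word_length  => 4,
        want_stemming    => 1,
    );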
88              
89              
90             ###################### Get corpus vocabulary and word counts #########################
91              
92             sub get_corpus_vocabulary_and_word_counts {
93 2     2 1 27 my $self = shift;
94             die "You must supply the name of the corpus directory to the constructor"
95 2 50       22 unless $self->{_corpus_directory};
96             print "Scanning the directory '$self->{_corpus_directory}' for\n" .
97 2 50       14 " model construction\n\n" if $self->{_debug};
98 2         28 $self->_scan_directory( $self->{_corpus_directory} );
99 2 50       10 $self->_drop_stop_words() if $self->{_stop_words_file};
100 2 50       10 if ($self->{_debug}) {
101 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
102 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
103             }
104             }
105 2 50       8 if ($self->{_save_model_on_disk}) {
106 0         0 unlink glob "$self->{_corpus_vocab_db}.*";
107 0         0 unlink glob "$self->{_doc_vectors_db}.*";
108 0         0 unlink glob "$self->{_normalized_doc_vecs_db}.*";
109 0         0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
110 0 0       0 $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
111             or die "Can't create DBM files: $!";
112 0         0 foreach (keys %{$self->{_vocab_hist}}) {
  0         0  
113 0         0 $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
114             }
115 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
116             }
117 2         8 $self->{_corpus_vocab_done} = 1;
118 2         5 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  2         17  
119             print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
120 2 50       8 if $self->{_debug};
121             # Calculate idf(t):
122 2         3 foreach (keys %{$self->{_vocab_idf_hist}}) {
  2         33  
123             $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
124             /
125 218         576 (1 + $self->{_vocab_idf_hist}->{$_})))
126             / log(10) );
127             }
128             }
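
The idf computed in the loop above is a smoothed, absolute-value, base-10 variant of log(D/d(t)), obtained through the log($x)/log(10) identity, with 1 added to d(t) to avoid division by zero. A toy check under assumed counts:

    # Hypothetical numbers: D = 100 documents, d(t) = 9 documents contain t.
    # idf(t) = |(1 + ln(100/10)) / ln(10)| = 3.302585/2.302585, about 1.434
    my ($D, $dt) = (100, 9);
    my $idf = abs( (1 + log($D / (1 + $dt))) / log(10) );
    printf "%.3f\n", $idf;    # prints 1.434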
129              
130             sub display_corpus_vocab {
131 0     0 1 0 my $self = shift;
132             die "corpus vocabulary not yet constructed"
133 0 0       0 unless keys %{$self->{_vocab_hist}};
  0         0  
134 0         0 print "\n\nDisplaying corpus vocabulary:\n\n";
135 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
136 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
137 0         0 print "$outstring\n";
138             }
139             }
140              
141             sub display_corpus_vocab_size {
142 0     0 1 0 my $self = shift;
143             die "corpus vocabulary not yet constructed"
144 0 0       0 unless keys %{$self->{_vocab_hist}};
  0         0  
145 0         0 my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
146 0         0 print "\nSize of the corpus vocabulary: $vocab_size\n\n";
147             }
148              
149             sub write_corpus_vocab_to_file {
150 0     0 1 0 my $self = shift;
151 0         0 my $file = shift;
152 0 0       0 die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
  0         0  
153 0 0       0 open OUT, "> $file"
154             or die "unable to open for output a file with name `$file': $!";
155 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
156 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
157 0         0 print OUT "$outstring\n";
158             }
159 0         0 close OUT;
160             }
161              
162             sub display_inverse_document_frequencies {
163 0     0 1 0 my $self = shift;
164             die "corpus vocabulary not yet constructed"
165 0 0       0 unless keys %{$self->{_vocab_idf_hist}};
  0         0  
166             print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
167 0 0       0 unless $self->{_idf_filter_option};
168 0         0 print "\n\nDisplaying inverse document frequencies:\n";
169 0         0 foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
  0         0  
170             my $outstring = sprintf("%30s %d",
171 0         0 $_, $self->{_vocab_idf_hist}->{$_});
172 0         0 print "$outstring\n";
173             }
174 0         0 print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
175 0         0 foreach ( sort keys %{$self->{_idf_t}} ) {
  0         0  
176 0         0 my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
177 0         0 print "$outstring\n";
178             }
179             }
180              
181             sub get_all_document_names {
182 0     0 1 0 my $self = shift;
183 0         0 my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
  0         0  
184 0         0 return \@all_files;
185             }
186              
187             ############################ Generate Document Vectors #################################
188              
189             sub generate_document_vectors {
190 2     2 1 34 my $self = shift;
191 2         28 chdir $self->{_working_directory};
192 2         6 foreach ( sort keys %{$self->{_vocab_hist}} ) {
  2         108  
193 218         353 $self->{_doc_hist_template}->{$_} = 0;
194             }
195 2         26 $self->_scan_directory( $self->{_corpus_directory} );
196 2         36 chdir $self->{_working_directory};
197 2 50       40 if ($self->{_save_model_on_disk}) {
198             die "You did not specify in the constructor call the names for the diskfiles " .
199             "for storing the disk-based hash tables consisting of document vectors " .
200             "and their normalized versions"
201 0 0 0     0 unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
202 0         0 eval {
203 0         0 store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
204             };
205 0 0       0 if ($@) {
206 0         0 print "Something went wrong with disk storage of document vectors: $@";
207             }
208 0         0 eval {
209 0         0 store($self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db});
210             };
211 0 0       0 if ($@) {
212 0         0 print "Something wrong with disk storage of normalized doc vecs: $@";
213             }
214             }
215             }
216              
217             sub display_doc_vectors {
218 0     0 1 0 my $self = shift;
219             die "document vectors not yet constructed"
220 0 0       0 unless keys %{$self->{_corpus_doc_vectors}};
  0         0  
221 0         0 foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
222 0         0 print "\n\ndisplay doc vec for $file:\n";
223 0         0 foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
  0         0  
224 0         0 print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
225             }
226 0         0 my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
  0         0  
227 0         0 print "\nSize of vector for $file: $docvec_size\n";
228             }
229             }
230              
231             sub display_normalized_doc_vectors {
232 0     0 1 0 my $self = shift;
233             die "normalized document vectors not yet constructed"
234 0 0       0 unless keys %{$self->{_normalized_doc_vecs}};
  0         0  
235 0 0       0 unless ($self->{_idf_filter_option}) {
236 0         0 print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
237 0         0 return;
238             }
239 0         0 foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
  0         0  
240 0         0 print "\n\ndisplay normalized doc vec for $file:\n";
241 0         0 foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
  0         0  
242 0         0 print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
243             }
244 0         0 my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
  0         0  
245 0         0 print "\nSize of normalized vector for $file: $docvec_size\n";
246             }
247             }
248              
249             ######################## Calculate Pairwise Document Similarities ######################
250              
251             # Returns the similarity score for two documents whose actual names are supplied
252             # as its two arguments.
253             sub pairwise_similarity_for_docs {
254 0     0 1 0 my $self = shift;
255 0         0 my $doc1 = shift;
256 0         0 my $doc2 = shift;
257 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
258 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
259 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
260 0         0 my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
261 0         0 my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
262 0         0 my @vec1 = ();
263 0         0 my @vec2 = ();
264 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
265 0         0 push @vec1, $vec_hash_ref1->{$word};
266 0         0 push @vec2, $vec_hash_ref2->{$word};
267             }
268 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
269 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
270 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
271 0         0 $product /= $vec_mag1 * $vec_mag2;
272 0         0 return $product;
273             }
274              
275             sub pairwise_similarity_for_normalized_docs {
276 0     0 1 0 my $self = shift;
277 0         0 my $doc1 = shift;
278 0         0 my $doc2 = shift;
279 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
280 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
281 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
282 0         0 my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
283 0         0 my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
284 0         0 my @vec1 = ();
285 0         0 my @vec2 = ();
286 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
287 0         0 push @vec1, $vec_hash_ref1->{$word};
288 0         0 push @vec2, $vec_hash_ref2->{$word};
289             }
290 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
291 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
292 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
293 0         0 $product /= $vec_mag1 * $vec_mag2;
294 0         0 return $product;
295             }
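
Both pairwise methods above compute the standard cosine similarity, dot(v1, v2) / (|v1| * |v2|). A hypothetical call, assuming a model has already been built over a corpus containing these two files:

    my $sim = $vsm->pairwise_similarity_for_docs("doc1.txt", "doc2.txt");
    print "cosine similarity: $sim\n";    # in [0, 1] for nonnegative count vectors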
296              
297             ############################### Retrieve with VSM Model ################################
298              
299             sub retrieve_with_vsm {
300 1     1 1 18 my $self = shift;
301 1         6 my $query = shift;
302 1         8 my @clean_words;
303 1         5 my $min = $self->{_min_word_length};
304              
305 1 50       6 if ($self->{_break_camelcased_and_underscored}) {
306 1         41 my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
307 1         6 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  8         41  
308             @clean_words = $self->{_case_sensitive} ?
309 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  0         0  
310 1 100       12 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  8 50       82  
  8         43  
311             } else {
312 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
313 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
314             }
315 1         6 $query = \@clean_words;
316 1 50       6 print "\nYour query words are: @$query\n" if $self->{_debug};
317 1 50       16 if ($self->{_idf_filter_option}) {
318             die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
319 1         82 unless scalar(keys %{$self->{_vocab_hist}})
320 1 50 50     10 && scalar(keys %{$self->{_normalized_doc_vecs}});
  1         6  
321             } else {
322             die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
323 0         0 unless scalar(keys %{$self->{_vocab_hist}})
324 0 0 0     0 && scalar(keys %{$self->{_corpus_doc_vectors}});
  0         0  
325             }
326 1         2 foreach ( keys %{$self->{_vocab_hist}} ) {
  1         60  
327 109         184 $self->{_query_vector}->{$_} = 0;
328             }
329 1         11 foreach (@$query) {
330 7 50       18 if ($self->{_case_sensitive}) {
331 0 0       0 $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
332             } else {
333 7 100       26 $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
334             }
335             }
336 1         3 my @query_word_counts = values %{$self->{_query_vector}};
  1         12  
337 1         13 my $query_word_count_total = sum(\@query_word_counts);
338 1 50       10 die "\nYour query does not contain corpus words. Nothing retrieved.\n"
339             unless $query_word_count_total;
340 1         2 my %retrievals;
341 1 50       6 if ($self->{_idf_filter_option}) {
342             print "\n\nUsing idf filter option for retrieval:\n\n"
343 1 50       6 if $self->{_debug};
344 1         2 foreach (sort {$self->_doc_vec_comparator}
  24         62  
345 1         21 keys %{$self->{_normalized_doc_vecs}}) {
346 10 100       28 $retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
347             }
348             } else {
349             print "\n\nNOT using idf filter option for retrieval:\n\n"
350 0 0       0 if $self->{_debug};
351 0         0 foreach (sort {$self->_doc_vec_comparator}
  0         0  
352 0         0 keys %{$self->{_corpus_doc_vectors}}) {
353 0 0       0 $retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
354             }
355             }
356 1 50       6 if ($self->{_debug}) {
357 0         0 print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
358 0         0 foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
  0         0  
359 0         0 print "$_ => $retrievals{$_}\n";
360             }
361             }
362 1         11 return \%retrievals;
363             }
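
A minimal end-to-end retrieval sketch, assuming $vsm was constructed as in the earlier example; the query words are illustrative:

    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    my $retrievals = $vsm->retrieve_with_vsm( [qw(string pattern matching)] );
    $vsm->display_retrievals($retrievals);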
364              
365             ######################### Upload a Previously Constructed Model #########################
366              
367             sub upload_vsm_model_from_disk {
368 0     0 0 0 my $self = shift;
369             die "\nCannot find the database files for the VSM model"
370             unless -s "$self->{_corpus_vocab_db}.pag"
371 0 0 0     0 && -s $self->{_doc_vectors_db};
372 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
373 0         0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
374 0 0       0 $self->{_corpus_vocab_db}, O_RDONLY, 0640
375             or die "Can't open DBM file: $!";
376 0 0       0 if ($self->{_debug}) {
377 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
378 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
379             }
380             }
381 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
382 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
383             }
384 0         0 $self->{_corpus_vocab_done} = 1;
385 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
386             print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
387 0 0       0 if $self->{_debug};
388 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
389 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
390             }
391              
392             sub upload_normalized_vsm_model_from_disk {
393 0     0 1 0 my $self = shift;
394             die "\nCannot find the database files for the VSM model"
395             unless -s "$self->{_corpus_vocab_db}.pag"
396 0 0 0     0 && -s $self->{_normalized_doc_vecs_db};
397 0         0 $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
398 0         0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
399 0 0       0 $self->{_corpus_vocab_db}, O_RDONLY, 0640
400             or die "Can't open DBM file: $!";
401 0 0       0 if ($self->{_debug}) {
402 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
403 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
404             }
405             }
406 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
407 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
408             }
409 0         0 $self->{_corpus_vocab_done} = 1;
410 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
411             print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
412 0 0       0 if $self->{_debug};
413 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
414             }
415              
416             ############################## Display Retrieval Results ################################
417              
418             sub display_retrievals {
419 0     0 1 0 my $self = shift;
420 0         0 my $retrievals = shift;
421 0         0 print "\n\nShowing the retrievals and the similarity scores:\n\n";
422 0         0 my $iter = 0;
423 0         0 foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
  0         0  
424 0         0 print "$_ => $retrievals->{$_}\n";
425 0         0 $iter++;
426 0 0       0 last if $iter > $self->{_max_number_retrievals};
427             }
428 0         0 print "\n\n";
429             }
430              
431             ############################### Directory Scanner ################################
432              
433             sub _scan_directory {
434 4     4   9 my $self = shift;
435 4         46 my $dir = rel2abs( shift );
436 4         18346 my $current_dir = cwd;
437 4 50       245 chdir $dir or die "Unable to change directory to $dir: $!";
438 4         942 foreach ( glob "*" ) {
439 40 50 33     30743 if ( -d and !(-l) ) {
    50 33        
      33        
      33        
      33        
440 0         0 $self->_scan_directory( $_ );
441 0 0       0 chdir $dir
442             or die "Unable to change directory to $dir: $!";
443             } elsif (-r _ and
444             -T _ and
445             -M _ > 0.00001 and # modification age is at least ~1 sec (0.00001 day)
446             !( -l $_ ) and
447             $self->ok_to_filetype($_) ) {
448 40 50       100 $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
449 40 100       174 $self->_scan_file($_) unless $self->{_corpus_vocab_done};
450 40 100       199 $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
451             }
452             }
453 4         149 chdir $current_dir;
454             }
455              
456             sub _scan_file {
457 20     20   32 my $self = shift;
458 20         39 my $file = shift;
459 20         493 open IN, $file;
460 20         51 my $min = $self->{_min_word_length};
461 20         49 my %uniques = ();
462 20         218 while (<IN>) {
463 856 100       3855 next if /^[ ]*\r?\n?$/;
464 730         3101 $_ =~ s/\r?\n?$//;
465 730         1177 my @clean_words;
466 730 50       1563 if ($self->{_break_camelcased_and_underscored}) {
467 730         10261 my @brokenup = grep $_, split /\W|_|\s+/, $_;
468 730         1993 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  2594         11704  
469             @clean_words = $self->{_case_sensitive} ?
470 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  0         0  
471 730 100       2140 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  2732 50       8037  
  2732         10985  
472             } else {
473 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
474 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
475             }
476 730 100       3037 next unless @clean_words;
477             @clean_words = grep $_, map &simple_stemmer($_), @clean_words
478 444 50       2255 if $self->{_want_stemming};
479             $self->{_case_sensitive} ?
480 0         0 map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
481 444 50       1697 map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
  1736         3999  
482 444 50       905 if ($self->{_case_sensitive}) {
483 0         0 for (@clean_words) { $uniques{$_}++ }
  0         0  
484             } else {
485 444         965 for (@clean_words) { $uniques{"\L$_"}++ }
  1736         4630  
486             }
487             }
488 20         567 close( IN );
489 20         139 map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
  838         1431  
490 20         175 $self->{_total_num_of_docs}++;
491             }
492              
493             sub ok_to_filetype {
494 40     40 0 81 my $self = shift;
495 40         186 my $filename = shift;
496 40         2215 my ($base, $dir, $suffix) = fileparse($filename, '\..*');
497             croak "You called this module without specifying the file types in the constructor"
498 40 50       73 unless @{$self->{_file_types}} > 0;
  40         166  
499 40 50       71 return 1 if contained_in($suffix, @{$self->{_file_types}});
  40         317  
500 0         0 return 0;
501             }
502              
503             ############################## LSA Modeling and Retrieval ################################
504              
505             sub construct_lsa_model {
506 1     1 1 21 my $self = shift;
507 1 50       12 if ($self->{_idf_filter_option}) {
508 1 50 33     19 if (!$self->{_normalized_doc_vecs} and
509             -s $self->{_normalized_doc_vecs_db}) {
510             $self->{_normalized_doc_vecs} =
511 0         0 retrieve($self->{_normalized_doc_vecs_db});
512             }
513 1         6 foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
  1         18  
514 10         15 my $term_frequency_vec;
515 10         14 foreach my $word (sort keys
516 10         576 %{$self->{_normalized_doc_vecs}->{$_}}){
517             push @$term_frequency_vec,
518 1090         1895 $self->{_normalized_doc_vecs}->{$_}->{$word};
519             }
520 10         57 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  10         31  
521             }
522             } else {
523 0 0 0     0 if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
524 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
525             }
526 0         0 foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
527 0         0 my $term_frequency_vec;
528 0         0 foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
  0         0  
529             push @$term_frequency_vec,
530 0         0 $self->{_corpus_doc_vectors}->{$_}->{$word};
531             }
532 0         0 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  0         0  
533             }
534             }
535 1         5 my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
  1         19  
536 1         485 my ($U,$SIGMA,$V) = svd $A;
537 1 50       8 print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
538 1 50       7 print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
539             my $index = return_index_of_last_value_above_threshold($SIGMA,
540 1         10 $self->{_lsa_svd_threshold});
541 1         14 my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
542 1 50       32 print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
543             # When you measure the size of a matrix in PDL, the zeroth dimension
544             # is considered to be along the horizontal and the first dimension
545             # along the rows. This is the opposite of how we want to look at
546             # matrices. For a matrix of size MxN, we mean M rows and N columns.
547             # With this 'rows x columns' convention for matrix size, if you had
548             # to check the size of, say, the U matrix, you would call
549             # my @size = ( $U->getdim(1), $U->getdim(0) );
550             # print "\nsize of U: @size\n";
551 1         7 my $U_trunc = $U->slice("0:$index,:")->sever;
552 1         46 my $V_trunc = $V->slice("0:$index,0:$index")->sever;
553 1         33 $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
554             PDL::Basic::transpose($U_trunc);
555             print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
556 1 50       14151 if $self->{_debug};
557             my @sorted_doc_names = $self->{_idf_filter_option} ?
558 1         10 sort keys %{$self->{_normalized_doc_vecs}} :
559 1 50       7 sort keys %{$self->{_corpus_doc_vectors}};
  0         0  
560 1         3 my $i = 0;
561 1         5 foreach (@{$self->{_term_document_matrix}}) {
  1         4  
562             my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
563 10         31 PDL::Basic::transpose(pdl($_));
564 10         1384 my $doc_name = $sorted_doc_names[$i++];
565             print "\n\nTruncated doc vec for $doc_name: " .
566 10 50       26 $truncated_doc_vec . "\n" if $self->{_debug};
567 10         30 $self->{_doc_vecs_trunc_lsa}->{$doc_name}
568             = $truncated_doc_vec;
569             }
570 1         88 chdir $self->{_working_directory};
571             }
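
A minimal sketch of the PDL dimension convention described in the comment above; the matrix is a made-up example:

    use PDL;
    my $M = pdl([[1, 2, 3],
                 [4, 5, 6]]);      # 2 rows, 3 columns in 'rows x columns' terms
    print $M->getdim(0), "\n";     # 3 -- PDL's zeroth dimension runs along a row
    print $M->getdim(1), "\n";     # 2 -- the first dimension runs down a column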
572              
573             sub retrieve_with_lsa {
574 1     1 1 11 my $self = shift;
575 1         5 my $query = shift;
576 1         2 my @clean_words;
577 1         6 my $min = $self->{_min_word_length};
578 1 50       5 if ($self->{_break_camelcased_and_underscored}) {
579 1         33 my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
580 1         4 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  8         38  
581 1 100       3 @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  8         57  
  8         37  
582             } else {
583 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
584 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
585             }
586 1         3 $query = \@clean_words;
587 1 50       6 print "\nYour processed query words are: @$query\n" if $self->{_debug};
588             die "Your vocabulary histogram is empty"
589 1 50       3 unless scalar(keys %{$self->{_vocab_hist}});
  1         7  
590             die "You must first construct an LSA model"
591 1 50       2 unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
  1         10  
592 1         2 foreach ( keys %{$self->{_vocab_hist}} ) {
  1         21  
593 109         190 $self->{_query_vector}->{$_} = 0;
594             }
595 1         8 foreach (@$query) {
596             $self->{_query_vector}->{"\L$_"}++
597 7 100       24 if exists $self->{_vocab_hist}->{"\L$_"};
598             }
599 1         2 my @query_word_counts = values %{$self->{_query_vector}};
  1         13  
600 1         11 my $query_word_count_total = sum(\@query_word_counts);
601 1 50       4 die "Query does not contain corpus words. Nothing retrieved."
602             unless $query_word_count_total;
603 1         2 my $query_vec;
604 1         4 foreach (sort keys %{$self->{_query_vector}}) {
  1         50  
605 109         180 push @$query_vec, $self->{_query_vector}->{$_};
606             }
607 1 50       8 print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
608             my $truncated_query_vec = $self->{_lsa_vec_truncator} x
609 1         5 PDL::Basic::transpose(pdl($query_vec));
610             print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
611 1 50       163 if $self->{_debug};
612 1         2 my %retrievals;
613 1         2 foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
  1         9  
614             my $dot_product = PDL::Basic::transpose($truncated_query_vec)
615 10         175 x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
616             print "\n\nLSA: dot product of truncated query and\n" .
617             " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
618 10 50       886 if $self->{_debug};
619 10 100       30 $retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
620             }
621 1 50       15 if ($self->{_debug}) {
622 0         0 print "\n\nShowing LSA retrievals and similarity scores:\n\n";
623 0         0 foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
  0         0  
624 0         0 print "$_ => $retrievals{$_}\n";
625             }
626 0         0 print "\n\n";
627             }
628 1         13 return \%retrievals;
629             }
630              
631             sub _construct_doc_vector {
632 20     20   34 my $self = shift;
633 20         44 my $file = shift;
634 20         35 my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
  20         77  
635 20         318 foreach ( sort keys %{$self->{_doc_hist_template}} ) {
  20         1129  
636 2180         3237 $document_vector{$_} = 0;
637             }
638 20         179 my $min = $self->{_min_word_length};
639 20         65 my $total_words_in_doc = 0;
640 20 50       557 unless (open IN, $file) {
641             print "Unable to open file $file in the corpus: $!\n"
642 0 0       0 if $self->{_debug};
643 0         0 return;
644             }
645 20         432 while (<IN>) {
646 856 100       3682 next if /^[ ]*\r?\n?$/;
647 730         2846 $_ =~ s/\r?\n?$//;
648 730         6639 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
649 730         1238 my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  4726         10785  
  4726         10663  
650 730 100       2848 next unless @clean_words;
651             @clean_words = grep $_,
652             map &simple_stemmer($_, $self->{_debug}), @clean_words
653 444 50       1719 if $self->{_want_stemming};
654             $self->{_case_sensitive} ?
655 0         0 map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
  0         0  
656 1488         4267 map { $document_vector{"\L$_"}++ }
657 444 50       1354 grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
  1610         4037  
658             }
659 20         201 close IN;
660             die "Something went wrong. Doc vector size unequal to vocab size"
661 20 50       98 unless $self->{_vocab_size} == scalar(keys %document_vector);
662 20         266 foreach (keys %document_vector) {
663 2180         3083 $total_words_in_doc += $document_vector{$_};
664             }
665 20         121 my %normalized_doc_vec;
666 20 50       57 if ($self->{_idf_filter_option}) {
667 20         238 foreach (keys %document_vector) {
668             $normalized_doc_vec{$_} = $document_vector{$_}
669             *
670 2180         4973 $self->{_idf_t}->{$_}
671             /
672             $total_words_in_doc;
673             }
674             }
675 20         81749 my $pwd = cwd;
676 20         567 $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
677 20         43 my $file_path_name;
678 20 50       201 unless ( $1 eq "" ) {
679 0         0 $file_path_name = "$1/$file";
680             } else {
681 20         71 $file_path_name = $file;
682             }
683 20         232 $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
684 20         507 $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
685             }
686              
687             ################################### Drop Stop Words ###################################
688              
689             sub _drop_stop_words {
690 0     0   0 my $self = shift;
691 0 0       0 open( IN, "$self->{_working_directory}/$self->{_stop_words_file}")
692             or die "unable to open stop words file: $!";
693 0         0 while (<IN>) {
694 0 0       0 next if /^#/;
695 0 0       0 next if /^[ ]*\r?\n?$/;
696 0         0 $_ =~ s/\r?\n?$//;
697 0 0       0 delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
698 0         0 unshift @{$self->{_stop_words}}, $_;
  0         0  
699             }
700             }
701              
702             ################################### Support Methods ####################################
703              
704             sub _doc_vec_comparator {
705 24     24   32 my $self = shift;
706 24         30 my %query_vector = %{$self->{_query_vector}};
  24         642  
707             my $vec1_hash_ref = $self->{_idf_filter_option} ?
708             $self->{_normalized_doc_vecs}->{$a} :
709 24 50       198 $self->{_corpus_doc_vectors}->{$a};
710             my $vec2_hash_ref = $self->{_idf_filter_option} ?
711             $self->{_normalized_doc_vecs}->{$b} :
712 24 50       61 $self->{_corpus_doc_vectors}->{$b};
713 24         38 my @vec1 = ();
714 24         31 my @vec2 = ();
715 24         35 my @qvec = ();
716 24         30 foreach my $word (sort keys %{$self->{_vocab_hist}}) {
  24         1255  
717 2616         3808 push @vec1, $vec1_hash_ref->{$word};
718 2616         3607 push @vec2, $vec2_hash_ref->{$word};
719 2616         4249 push @qvec, $query_vector{$word};
720             }
721 24         185 my $vec1_mag = vec_magnitude(\@vec1);
722 24         55 my $vec2_mag = vec_magnitude(\@vec2);
723 24         45 my $qvec_mag = vec_magnitude(\@qvec);
724 24         54 my $product1 = vec_scalar_product(\@vec1, \@qvec);
725 24         51 $product1 /= $vec1_mag * $qvec_mag;
726 24         57 my $product2 = vec_scalar_product(\@vec2, \@qvec);
727 24         37 $product2 /= $vec2_mag * $qvec_mag;
728 24 100       230 return 1 if $product1 < $product2;
729 14 100       49 return 0 if $product1 == $product2;
730 13 50       253 return -1 if $product1 > $product2;
731             }
732              
733             sub _similarity_to_query {
734 18     18   24 my $self = shift;
735 18         26 my $doc_name = shift;
736             my $vec_hash_ref = $self->{_idf_filter_option} ?
737             $self->{_normalized_doc_vecs}->{$doc_name} :
738 18 50       50 $self->{_corpus_doc_vectors}->{$doc_name};
739 18         29 my @vec = ();
740 18         22 my @qvec = ();
741 18         925 foreach my $word (sort keys %$vec_hash_ref) {
742 1962         2684 push @vec, $vec_hash_ref->{$word};
743 1962         3059 push @qvec, $self->{_query_vector}->{$word};
744             }
745 18         134 my $vec_mag = vec_magnitude(\@vec);
746 18         42 my $qvec_mag = vec_magnitude(\@qvec);
747 18         40 my $product = vec_scalar_product(\@vec, \@qvec);
748 18         32 $product /= $vec_mag * $qvec_mag;
749 18         178 return $product;
750             }
751              
752             ###################### Relevance Judgments for Testing Purposes #######################
753              
754             ## IMPORTANT: This estimation of document relevancies to queries is NOT for
755             ## serious work. A document is considered to be relevant to a
756             ## query if it contains several of the query words. The minimum
757             ## number of query words that must exist in a document in
758             ## order for it to be considered relevant is determined by
759             ## the relevancy_threshold parameter in the VSM constructor.
760             ## (See the relevancy and precision-recall related scripts
761             ## in the 'examples' directory.) The function shown below is
762             ## not for serious work because, ultimately, humans are the
763             ## best judges of the relevancies of documents to queries.
764             ## Humans bring to bear semantic considerations on the
765             ## relevancy determination problem that are beyond the
766             ## scope of this module.
767              
768             sub estimate_doc_relevancies {
769 0     0 1 0 my $self = shift;
770             die "You did not set the 'query_file' parameter in the constructor"
771 0 0       0 unless $self->{_query_file};
772             open( IN, $self->{_query_file} )
773 0 0       0 or die "unable to open the query file $self->{_query_file}: $!";
774             croak "\n\nYou need to specify a name for the relevancy file\n" .
775             "in which the relevancy judgments will be dumped."
776 0 0       0 unless $self->{_relevancy_file};
777 0         0 while (<IN>) {
778 0 0       0 next if /^#/;
779 0 0       0 next if /^[ ]*\r?\n?$/;
780 0         0 $_ =~ s/\r?\n?$//;
781 0 0       0 die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
782 0         0 /^[ ]*(q[0-9]+):[ ]*(.*)/;
783 0         0 my $query_label = $1;
784 0         0 my $query = $2;
785 0 0       0 next unless $query;
786 0         0 $self->{_queries_for_relevancy}->{$query_label} = $query;
787             }
788 0 0       0 if ($self->{_debug}) {
789 0         0 foreach (sort keys %{$self->{_queries_for_relevancy}}) {
  0         0  
790 0         0 print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
791             }
792             }
793 0         0 $self->{_scan_dir_for_rels} = 1;
794 0         0 $self->_scan_directory($self->{_corpus_directory});
795 0         0 $self->{_scan_dir_for_rels} = 0;
796 0         0 chdir $self->{_working_directory};
797 0 0       0 open(OUT, ">$self->{_relevancy_file}")
798             or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
799 0         0 my @relevancy_list_for_query;
800 0         0 foreach (sort
801 0         0 {get_integer_suffix($a) <=> get_integer_suffix($b)}
802 0         0 keys %{$self->{_relevancy_estimates}}) {
803             @relevancy_list_for_query =
804 0         0 keys %{$self->{_relevancy_estimates}->{$_}};
  0         0  
805 0         0 print OUT "$_ => @relevancy_list_for_query\n\n";
806 0         0 print "Number of relevant docs for query $_: " .
807             scalar(@relevancy_list_for_query) . "\n";
808             }
809             }
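
Going by the parsing regex above (^[ ]*q[0-9]+:), the query file is expected to contain one labeled query per line; the query words below are made-up examples:

    # Hypothetical query_file contents ('#' lines and blank lines are skipped):
    q1: string functions in perl
    q2: camelcased and underscored tokens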
810              
811             # If human-supplied relevancy judgments are available in a disk
812             # file, use this method to upload that information. One of the scripts
813             # in the 'examples' directory carries out the precision-recall analysis
814             # by using this approach. IMPORTANT: The human-supplied relevancy
815             # judgments must be in a format that is shown in the sample file
816             # relevancy.txt in the 'examples' directory.
817             sub upload_document_relevancies_from_file {
818 0     0 1 0 my $self = shift;
819 0         0 chdir $self->{_working_directory};
820             open( IN, $self->{_relevancy_file} )
821 0 0       0 or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
822 0         0 while (<IN>) {
823 0 0       0 next if /^#/;
824 0 0       0 next if /^[ ]*\r?\n?$/;
825 0         0 $_ =~ s/\r?\n?$//;
826 0 0       0 die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
827 0         0 /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
828 0         0 my $query_label = $1;
829 0         0 my $relevancy_docs_string = $2;
830 0 0       0 next unless $relevancy_docs_string;
831 0         0 my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
832 0         0 my %relevancies = map {$_ => 1} @relevancy_docs;
  0         0  
833 0         0 $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
834             }
835 0 0       0 if ($self->{_debug}) {
836 0         0 for (sort keys %{$self->{_relevancy_estimates}}) {
  0         0  
837 0         0 my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
  0         0  
838 0         0 print "$_ => @rels\n";
839             }
840             }
841             }
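
Going by the parsing regex above (q[0-9]+ followed by =>), the relevancy file is expected to hold one space-separated list of relevant documents per query; the file names below are illustrative:

    # Hypothetical relevancy_file contents:
    q1 => doc1.txt doc7.txt doc12.txt
    q2 => doc3.txt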
842              
843             sub display_doc_relevancies {
844 0     0 1 0 my $self = shift;
845             die "You must first estimate or provide the doc relevancies"
846 0 0       0 unless scalar(keys %{$self->{_relevancy_estimates}});
  0         0  
847 0         0 print "\nDisplaying relevancy judgments:\n\n";
848 0         0 foreach my $query (sort keys %{$self->{_relevancy_estimates}}) {
  0         0  
849 0         0 print "Query $query\n";
850 0         0 foreach my $file (sort {
851             $self->{_relevancy_estimates}->{$query}->{$b}
852             <=>
853 0         0 $self->{_relevancy_estimates}->{$query}->{$a}
854             }
855 0         0 keys %{$self->{_relevancy_estimates}->{$query}}){
856 0         0 print " $file => $self->{_relevancy_estimates}->{$query}->{$file}\n";
857             }
858             }
859             }
860              
861             sub _scan_file_for_rels {
862 0     0   0 my $self = shift;
863 0         0 my $file = shift;
864 0         0 open IN, $file;
865 0         0 my @all_text = <IN>;
866 0         0 @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
  0         0  
  0         0  
867 0         0 my $all_text = join ' ', @all_text;
868 0         0 foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
  0         0  
869 0         0 my $count = 0;
870             my @query_words = grep $_,
871 0         0 split /\s+/, $self->{_queries_for_relevancy}->{$query};
872 0 0       0 print "Query words for $query: @query_words\n" if $self->{_debug};
873 0         0 foreach my $word (@query_words) {
874 0         0 my @matches = $all_text =~ /$word/gi;
875             print "Number of occurrences for word '$word' in file $file: " .
876 0 0       0 scalar(@matches) . "\n" if $self->{_debug};
877 0 0       0 $count += @matches if @matches;
878             }
879             print "\nRelevancy count for query $query and file $file: $count\n\n"
880 0 0       0 if $self->{_debug};
881             $self->{_relevancy_estimates}->{$query}->{$file} = $count
882 0 0       0 if $count >= $self->{_relevancy_threshold};
883             }
884             }
885              
886             ######################### Calculate Precision versus Recall ##########################
887              
888             sub precision_and_recall_calculator {
889 0     0 1 0 my $self = shift;
890 0         0 my $retrieval_type = shift;
891 0 0       0 die "You must specify the retrieval type through an argument to the method " .
892             "precision_and_recall_calculator(). The retrieval type must be either 'vsm' " .
893             "or 'lsa' \n" unless $retrieval_type;
894             die "You must first estimate or provide the doc relevancies"
895 0 0       0 unless scalar(keys %{$self->{_relevancy_estimates}});
  0         0  
896 0 0       0 unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
  0         0  
897             open( IN, $self->{_query_file})
898 0 0       0 or die "unable to open the query file $self->{_query_file}: $!";
899 0         0 while (<IN>) {
900 0 0       0 next if /^#/;
901 0 0       0 next if /^[ ]*\r?\n?$/;
902 0         0 $_ =~ s/\r?\n?$//;
903 0 0       0 die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
904 0         0 /^[ ]*(q[0-9]+):[ ]*(.*)/;
905 0         0 my $query_label = $1;
906 0         0 my $query = $2;
907 0 0       0 next unless $query;
908 0         0 $self->{_queries_for_relevancy}->{$query_label} = $query;
909             }
910 0 0       0 if ($self->{_debug}) {
911 0         0 print "\n\nDisplaying queries in the query file:\n\n";
912 0         0 foreach (sort keys %{$self->{_queries_for_relevancy}}) {
  0         0  
913 0         0 print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
914             }
915             }
916             }
917 0         0 foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
  0         0  
918             print "\n\n====================================== query: $query ========================================\n\n"
919 0 0       0 if $self->{_debug};
920 0 0       0 print "\n\n\nQuery $query:\n" if $self->{_debug};
921             my @query_words = grep $_,
922 0         0 split /\s+/, $self->{_queries_for_relevancy}->{$query};
923 0 0       0 croak "\n\nYou have not specified the retrieval type for " .
924             "precision-recall calculation. See code in the 'examples' " .
925             "directory." if !defined $retrieval_type;
926 0         0 my $retrievals;
927 0         0 eval {
928 0 0       0 if ($retrieval_type eq 'vsm') {
    0          
929 0         0 $retrievals = $self->retrieve_with_vsm( \@query_words );
930             } elsif ($retrieval_type eq 'lsa') {
931 0         0 $retrievals = $self->retrieve_with_lsa( \@query_words );
932             }
933             };
934 0 0       0 if ($@) {
935 0         0 warn "\n\nNo relevant docs found for query $query.\n" .
936             "Will skip over this query for precision and\n" .
937             "recall calculations\n\n";
938 0         0 next;
939             }
940 0         0 my %ranked_retrievals;
941 0         0 my $i = 1;
942 0         0 foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
  0         0  
943             keys %$retrievals) {
944 0         0 $ranked_retrievals{$i++} = $_;
945             }
946 0 0       0 if ($self->{_debug}) {
947 0         0 print "\n\nDisplaying ranked retrievals for query $query:\n\n";
948 0         0 foreach (sort {$a <=> $b} keys %ranked_retrievals) {
  0         0  
949 0         0 print "$_ => $ranked_retrievals{$_}\n";
950             }
951             }
952             # At this time, ranking of relevant documents based on their
953             # relevancy counts serves no particular purpose since all we want
954             # for the calculation of Precision and Recall is the total
955             # number of relevant documents. However, I believe such a
956             # ranking will play an important role in the future.
957             # IMPORTANT: The relevancy judgments are ranked only when
958             # estimated by the method estimate_doc_relevancies()
959             # of the VSM class. When relevancies are supplied
960             # directly through a disk file, they all carry the
961             # same rank.
962 0         0 my %ranked_relevancies;
963 0         0 $i = 1;
964 0         0 foreach my $file (sort {
965             $self->{_relevancy_estimates}->{$query}->{$b}
966             <=>
967 0         0 $self->{_relevancy_estimates}->{$query}->{$a}
968             }
969 0         0 keys %{$self->{_relevancy_estimates}->{$query}}) {
970 0         0 $ranked_relevancies{$i++} = $file;
971             }
972 0 0       0 if ($self->{_debug}) {
973 0         0 print "\n\nDisplaying ranked relevancies for query $query:\n\n";
974 0         0 foreach (sort {$a <=> $b} keys %ranked_relevancies) {
  0         0  
975 0         0 print "$_ => $ranked_relevancies{$_}\n";
976             }
977             }
978 0         0 my @relevant_set = values %ranked_relevancies;
979 0 0       0 warn "\n\nNo relevant docs found for query $query.\n" .
980             "Will skip over this query for precision and\n" .
981             "recall calculations\n\n" unless @relevant_set;
982 0 0       0 next unless @relevant_set;
983             print "\n\nRelevant set for query $query: @relevant_set\n\n"
984 0 0       0 if $self->{_debug};
985             # @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.
986 0         0 my @retrieved;
987 0         0 foreach (keys %ranked_retrievals) {
988 0         0 push @retrieved, $ranked_retrievals{$_};
989             }
990             print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
991 0 0       0 if $self->{_debug};
992 0         0 my @Precision_values = ();
993 0         0 my @Recall_values = ();
994 0         0 my $rank = 1;
995 0         0 while ($rank < @retrieved + 1) {
996 0         0 my $index = 1;
997 0         0 my @retrieved_at_rank = ();
998 0         0 while ($index <= $rank) {
999 0         0 push @retrieved_at_rank, $ranked_retrievals{$index};
1000 0         0 $index++;
1001             }
1002 0         0 my $intersection = set_intersection(\@retrieved_at_rank,
1003             \@relevant_set);
1004 0 0       0 my $precision_at_rank = @retrieved_at_rank ?
1005             (@$intersection / @retrieved_at_rank) : 0;
1006 0         0 push @Precision_values, $precision_at_rank;
1007 0         0 my $recall_at_rank = @$intersection / @relevant_set;
1008 0         0 push @Recall_values, $recall_at_rank;
1009 0         0 $rank++;
1010             }
1011             print "\n\nFor query $query, precision values: @Precision_values\n"
1012 0 0       0 if $self->{_debug};
1013             print "\nFor query $query, recall values: @Recall_values\n"
1014 0 0       0 if $self->{_debug};
1015 0         0 $self->{_precision_for_queries}->{$query} = \@Precision_values;
1016 0         0 my $avg_precision;
1017 0         0 $avg_precision += $_ for @Precision_values;
1018 0         0 $self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
1019 0         0 $self->{_recall_for_queries}->{$query} = \@Recall_values;
1020             }
1021             print "\n\n========= query by query processing for Precision vs. Recall calculations finished ========\n\n"
1022 0 0       0 if $self->{_debug};
1023 0         0 my @avg_precisions;
1024 0         0 foreach (keys %{$self->{_avg_precision_for_queries}}) {
  0         0  
1025 0         0 push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
1026             }
1027 0         0 $self->{_map} += $_ for @avg_precisions;
1028 0         0 $self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
  0         0  
1029             }
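
To make the rank loop above concrete, here is a toy sketch under assumed data: three retrieved documents, two of which are relevant:

    my @ranked   = qw(docA docB docC);              # hypothetical ranked retrievals
    my %relevant = map { $_ => 1 } qw(docA docC);   # hypothetical relevant set
    foreach my $rank (1 .. @ranked) {
        my @at_rank   = @ranked[0 .. $rank - 1];
        my $hits      = grep { $relevant{$_} } @at_rank;
        my $precision = $hits / $rank;                      # P@1=1.00, P@2=0.50, P@3=0.67
        my $recall    = $hits / scalar(keys %relevant);     # R@1=0.50, R@2=0.50, R@3=1.00
        printf "rank %d: precision %.2f  recall %.2f\n", $rank, $precision, $recall;
    }
    # Average precision for this query = (1.00 + 0.50 + 0.67) / 3, about 0.72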
1030              
1031             sub display_average_precision_for_queries_and_map {
1032 0     0 1 0 my $self = shift;
1033             die "You must first invoke precision_and_recall_calculator function"
1034 0 0       0 unless scalar(keys %{$self->{_avg_precision_for_queries}});
  0         0  
1035 0         0 print "\n\nDisplaying average precision for different queries:\n\n";
1036 0         0 foreach my $query (sort
1037 0         0 {get_integer_suffix($a) <=> get_integer_suffix($b)}
1038 0         0 keys %{$self->{_avg_precision_for_queries}}) {
1039             my $output = sprintf "Query %s => %.3f",
1040 0         0 $query, $self->{_avg_precision_for_queries}->{$query};
1041 0         0 print "$output\n";
1042             }
1043 0         0 print "\n\nMAP value: $self->{_map}\n\n";
1044             }
1045              
1046             sub display_precision_vs_recall_for_queries {
1047 0     0 1 0 my $self = shift;
1048             die "You must first invoke precision_and_recall_calculator function"
1049 0 0       0 unless scalar(keys %{$self->{_precision_for_queries}});
  0         0  
1050 0         0 print "\n\nDisplaying precision and recall values for different queries:\n\n";
1051 0         0 foreach my $query (sort
1052 0         0 {get_integer_suffix($a) <=> get_integer_suffix($b)}
1053 0         0 keys %{$self->{_avg_precision_for_queries}}) {
1054 0         0 print "\n\nQuery $query:\n";
1055 0         0 print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
1056 0         0 my @precision_vals = @{$self->{_precision_for_queries}->{$query}};
  0         0  
1057 0         0 @precision_vals = map {sprintf "%.3f", $_} @precision_vals;
  0         0  
1058 0         0 print " Precision at rank => @precision_vals\n";
1059 0         0 my @recall_vals = @{$self->{_recall_for_queries}->{$query}};
  0         0  
1060 0         0 @recall_vals = map {sprintf "%.3f", $_} @recall_vals;
  0         0  
1061 0         0 print "\n Recall at rank => @recall_vals\n";
1062             }
1063 0         0 print "\n\n";
1064             }
1065              
1066             sub get_query_sorted_average_precision_for_queries {
1067 0     0 1 0 my $self = shift;
1068             die "You must first invoke precision_and_recall_calculator function"
1069 0 0       0 unless scalar(keys %{$self->{_avg_precision_for_queries}});
  0         0  
1070 0         0 my @average_precisions_for_queries = ();
1071 0         0 foreach my $query (sort
1072 0         0 {get_integer_suffix($a) <=> get_integer_suffix($b)}
1073 0         0 keys %{$self->{_avg_precision_for_queries}}) {
1074 0         0 my $output = sprintf "%.3f", $self->{_avg_precision_for_queries}->{$query};
1075 0         0 push @average_precisions_for_queries, $output;
1076             }
1077 0         0 return \@average_precisions_for_queries;
1078             }
1079              
1080             ################################### Utility Routines ###################################
1081              
1082             sub _check_for_illegal_params {
1083 2     2   7 my @params = @_;
1084 2         26 my @legal_params = qw / corpus_directory
1085             corpus_vocab_db
1086             doc_vectors_db
1087             normalized_doc_vecs_db
1088             use_idf_filter
1089             stop_words_file
1090             file_types
1091             case_sensitive
1092             max_number_retrievals
1093             query_file
1094             relevancy_file
1095             min_word_length
1096             want_stemming
1097             lsa_svd_threshold
1098             relevancy_threshold
1099             break_camelcased_and_underscored
1100             save_model_on_disk
1101             debug
1102             /;
1103 2         4 my $found_match_flag;
1104 2         4 foreach my $param (@params) {
1105 17         26 foreach my $legal (@legal_params) {
1106 102         116 $found_match_flag = 0;
1107 102 100       232 if ($param eq $legal) {
1108 17         23 $found_match_flag = 1;
1109 17         23 last;
1110             }
1111             }
1112 17 50       41 last if $found_match_flag == 0;
1113             }
1114 2         10 return $found_match_flag;
1115             }
1116              
1117             # checks whether an element is in an array:
1118             sub contained_in {
1119 40     40 0 68 my $ele = shift;
1120 40         178 my @array = @_;
1121 40         99 my $count = 0;
1122 40 100       78 map {$count++ if $ele eq $_} @array;
  80         440  
1123 40         278 return $count;
1124             }
1125              
1126             # Meant only for an un-nested hash:
1127             sub deep_copy_hash {
1128 20     20 0 31 my $ref_in = shift;
1129 20         46 my $ref_out = {};
1130 20         34 foreach ( keys %{$ref_in} ) {
  20         1324  
1131 2180         4245 $ref_out->{$_} = $ref_in->{$_};
1132             }
1133 20         1001 return $ref_out;
1134             }
1135              
1136             sub vec_scalar_product {
1137 66     66 0 85 my $vec1 = shift;
1138 66         92 my $vec2 = shift;
1139 66 50       167 croak "Something is wrong --- the two vectors are of unequal length"
1140             unless @$vec1 == @$vec2;
1141 66         68 my $product;
1142 66         134 for my $i (0..@$vec1-1) {
1143 7194         10516 $product += $vec1->[$i] * $vec2->[$i];
1144             }
1145 66         130 return $product;
1146             }
1147              
1148             sub vec_magnitude {
1149 108     108 0 144 my $vec = shift;
1150 108         129 my $mag_squared = 0;
1151 108         185 foreach my $num (@$vec) {
1152 11772         17491 $mag_squared += $num ** 2;
1153             }
1154 108         212 return sqrt $mag_squared;
1155             }
1156              
1157             sub sum {
1158 2     2 0 5 my $vec = shift;
1159 2         3 my $result;
1160 2         9 for my $item (@$vec) {
1161 218         297 $result += $item;
1162             }
1163 2         6 return $result;
1164             }
1165              
1166             sub simple_stemmer {
1167 3346     3346 0 5310 my $word = shift;
1168 3346         4952 my $debug = shift;
1169 3346 50       7034 print "\nStemming the word: $word\n" if $debug;
1170 3346         10255 $word =~ s/(.*[a-z][^aeious])s$/$1/i;
1171 3346         8246 $word =~ s/(.*[a-z]s)es$/$1/i;
1172 3346         7671 $word =~ s/(.*[a-z][ck])es$/$1e/i;
1173 3346         9771 $word =~ s/(.*[a-z]+)tions$/$1tion/i;
1174 3346         9045 $word =~ s/(.*[a-z]+)mming$/$1m/i;
1175 3346         13125 $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1176 3346         9297 $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1177 3346         8535 $word =~ s/(.*[a-z]+)tices$/$1tex/i;
1178 3346         11071 $word =~ s/(.*[a-z]+)pes$/$1pe/i;
1179 3346         10753 $word =~ s/(.*[a-z]+)sed$/$1se/i;
1180 3346         11089 $word =~ s/(.*[a-z]+)ed$/$1/i;
1181 3346         7270 $word =~ s/(.*[a-z]+)tation$/$1t/i;
1182 3346 50       6552 print "Stemmed word: $word\n\n" if $debug;
1183 3346         11969 return $word;
1184             }
1185              
1186             # Assumes the array is sorted in a descending order, as would be the
1187             # case with an array of singular values produced by an SVD algorithm
1188             sub return_index_of_last_value_above_threshold {
1189 1     1 0 5 my $pdl_obj = shift;
1190 1         7 my $size = $pdl_obj->getdim(0);
1191 1         4 my $threshold = shift;
1192 1         13 my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
1193 1         72 my $i = 0;
1194 1   66     11 while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) {$i++;}
  10         324  
1195 1         6 return $i-1;
1196             }
1197              
1198             sub set_intersection {
1199 0     0 0   my $set1 = shift;
1200 0           my $set2 = shift;
1201 0           my %hset1 = map {$_ => 1} @$set1;
  0            
1202 0           my @common_elements = grep {$hset1{$_}} @$set2;
  0            
1203 0 0         return @common_elements ? \@common_elements : [];
1204             }
1205              
1206             sub get_integer_suffix {
1207 0     0 0   my $label = shift;
1208 0           $label =~ /(\d*)$/;
1209 0           return $1;
1210             }
1211              
1212             1;
1213              
1214             =pod
1215              
1216             =head1 NAME
1217              
1218             Algorithm::VSM --- A Perl module for retrieving files and documents from a software
1219             library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
1220             algorithms in response to search words and phrases.
1221              
1222             =head1 SYNOPSIS
1223              
1224             # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:
1225              
1226             use Algorithm::VSM;
1227              
1228             my $corpus_dir = "corpus";
1229             my @query = qw/ program ListIterator add ArrayList args /;
1230             my $stop_words_file = "stop_words.txt";
1231             my $vsm = Algorithm::VSM->new(
1232             break_camelcased_and_underscored => 1,
1233             case_sensitive => 0,
1234             corpus_directory => $corpus_dir,
1235             file_types => ['.txt', '.java'],
1236             max_number_retrievals => 10,
1237             min_word_length => 4,
1238             stop_words_file => $stop_words_file,
1239             use_idf_filter => 1,
1240             want_stemming => 1,
1241             );
1242             $vsm->get_corpus_vocabulary_and_word_counts();
1243             $vsm->display_corpus_vocab();
1244             $vsm->display_corpus_vocab_size();
1245             $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
1246             $vsm->display_inverse_document_frequencies();
1247             $vsm->generate_document_vectors();
1248             $vsm->display_doc_vectors();
1249             $vsm->display_normalized_doc_vectors();
1250             my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
1251             $vsm->display_retrievals( $retrievals );
1252              
1253             The purpose of each constructor option and what is accomplished by the method
1254             calls should be obvious from their names. If not, they are explained in greater
1255             detail elsewhere in this documentation page. Note that the methods
1256             display_corpus_vocab() and display_doc_vectors() are there only for testing
1257             purposes with small corpora. If you must use them for large libraries/corpora,
1258             you might wish to redirect the output to a file.
1259              
1260             By default, a call to a constructor calculates normalized term-frequency vectors
1261             for the documents. Normalization consists of first calculating the term
1262             frequency tf(t) of a term t in a document as a proportion of the total number
1263             of words in the document and then multiplying it by idf(t), where idf(t) stands
1264             for the inverse document frequency associated with that term. Note that 'word'
1265             and 'term' mean the same thing.
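
             To make the normalization concrete, here is a minimal plain-Perl sketch of the
             tf-idf weight for one term; all of the variable names are hypothetical and not
             part of the module's API:

                 # tf(t): term frequency as a proportion of the document's size
                 my $tf  = $term_count_in_doc / $total_words_in_doc;
                 # idf(t): log of (corpus size / number of docs containing the term)
                 my $idf = log( $num_docs_in_corpus / $num_docs_containing_term );
                 my $normalized_weight = $tf * $idf;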
1266              
1267              
1268              
1269             # FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:
1270              
1271             my $lsa = Algorithm::VSM->new(
1272             break_camelcased_and_underscored => 1,
1273             case_sensitive => 0,
1274             corpus_directory => $corpus_dir,
1275             file_types => ['.txt', '.java'],
1276             lsa_svd_threshold => 0.01,
1277             max_number_retrievals => 10,
1278             min_word_length => 4,
1279             stop_words_file => $stop_words_file,
1280             use_idf_filter => 1,
1281             want_stemming => 1,
1282             );
1283             $lsa->get_corpus_vocabulary_and_word_counts();
1284             $lsa->display_corpus_vocab();
1285             $lsa->display_corpus_vocab_size();
1286             $lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
1287             $lsa->generate_document_vectors();
1288             $lsa->construct_lsa_model();
1289             my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
1290             $lsa->display_retrievals( $retrievals );
1291              
1292             The initialization code before the constructor call and the calls for displaying
1293             the vocabulary and the vectors after the call remain the same as for the VSM case
1294             shown previously in this Synopsis. In the call above, the constructor parameter
1295             'lsa_svd_threshold' determines how many of the singular values will be retained
1296             after we have carried out an SVD decomposition of the term-frequency matrix for
1297             the documents in the corpus. Singular values smaller than this threshold
1298             fraction of the largest value are rejected.
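
             For example, with 'lsa_svd_threshold' set to 0.01, any singular value smaller
             than one hundredth of the largest singular value is dropped. A minimal sketch of
             this cutoff logic, assuming @singular_values is sorted in descending order
             (hypothetical names, not the module's internals):

                 my $cutoff   = $singular_values[0] * $lsa_svd_threshold;
                 my @retained = grep { $_ > $cutoff } @singular_values;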
1299              
1300              
1301              
1302             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM:
1303              
1304             my $corpus_dir = "corpus";
1305             my $stop_words_file = "stop_words.txt";
1306             my $query_file = "test_queries.txt";
1307             my $relevancy_file = "relevancy.txt"; # All relevancy judgments
1308             # will be stored in this file
1309             my $vsm = Algorithm::VSM->new(
1310             break_camelcased_and_underscored => 1,
1311             case_sensitive => 0,
1312             corpus_directory => $corpus_dir,
1313             file_types => ['.txt', '.java'],
1314             min_word_length => 4,
1315             query_file => $query_file,
1316             relevancy_file => $relevancy_file,
1317             relevancy_threshold => 5,
1318             stop_words_file => $stop_words_file,
1319             want_stemming => 1,
1320             );
1321             $vsm->get_corpus_vocabulary_and_word_counts();
1322             $vsm->generate_document_vectors();
1323             $vsm->estimate_doc_relevancies();
1324             $vsm->display_doc_relevancies(); # used only for testing
1325             $vsm->precision_and_recall_calculator('vsm');
1326             $vsm->display_precision_vs_recall_for_queries();
1327             $vsm->display_average_precision_for_queries_and_map();
1328              
1329             Measuring precision and recall requires a set of queries. These are supplied
1330             through the constructor parameter 'query_file'. The format of this file
1331             must be according to the sample file 'test_queries.txt' in the 'examples'
1332             directory. The module estimates the relevancies of the documents to the
1333             queries and dumps the relevancies in a file named by the 'relevancy_file'
1334             constructor parameter. The constructor parameter 'relevancy_threshold' is used
1335             to decide which of the documents are considered to be relevant to a query. A
1336             document must contain at least 'relevancy_threshold' occurrences of query
1337             words in order to be considered relevant to a query.
1338              
1339              
1340              
1341             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA:
1342              
1343             my $lsa = Algorithm::VSM->new(
1344             break_camelcased_and_underscored => 1,
1345             case_sensitive => 0,
1346             corpus_directory => $corpus_dir,
1347             file_types => ['.txt', '.java'],
1348             lsa_svd_threshold => 0.01,
1349             min_word_length => 4,
1350             query_file => $query_file,
1351             relevancy_file => $relevancy_file,
1352             relevancy_threshold => 5,
1353             stop_words_file => $stop_words_file,
1354             want_stemming => 1,
1355             );
1356             $lsa->get_corpus_vocabulary_and_word_counts();
1357             $lsa->generate_document_vectors();
1358             $lsa->construct_lsa_model();
1359             $lsa->estimate_doc_relevancies();
1360             $lsa->display_doc_relevancies();
1361             $lsa->precision_and_recall_calculator('lsa');
1362             $lsa->display_precision_vs_recall_for_queries();
1363             $lsa->display_average_precision_for_queries_and_map();
1364              
1365             We have already explained the purpose of the constructor parameter 'query_file'
1366             and the constraints on the format of queries in the file named through
1367             this parameter. As mentioned earlier, the module estimates the relevancies of
1368             the documents to the queries and dumps the relevancies in a file named by the
1369             'relevancy_file' constructor parameter. The constructor parameter
1370             'relevancy_threshold' is used in deciding which of the documents are considered
1371             to be relevant to a query. A document must contain at least
1372             'relevancy_threshold' occurrences of query words in order to be considered
1373             relevant to a query. We have previously explained the role of the constructor
1374             parameter 'lsa_svd_threshold'.
1375              
1376              
1377              
1378             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:
1379              
1380             my $corpus_dir = "corpus";
1381             my $stop_words_file = "stop_words.txt";
1382             my $query_file = "test_queries.txt";
1383             my $relevancy_file = "relevancy.txt";
1384             my $vsm = Algorithm::VSM->new(
1385             break_camelcased_and_underscored => 1,
1386             case_sensitive => 0,
1387             corpus_directory => $corpus_dir,
1388             file_types => ['.txt', '.java'],
1389             min_word_length => 4,
1390             query_file => $query_file,
1391             relevancy_file => $relevancy_file,
1392             stop_words_file => $stop_words_file,
1393             want_stemming => 1,
1394             );
1395             $vsm->get_corpus_vocabulary_and_word_counts();
1396             $vsm->generate_document_vectors();
1397             $vsm->upload_document_relevancies_from_file();
1398             $vsm->display_doc_relevancies();
1399             $vsm->precision_and_recall_calculator('vsm');
1400             $vsm->display_precision_vs_recall_for_queries();
1401             $vsm->display_average_precision_for_queries_and_map();
1402              
1403             Now the filename supplied through the constructor parameter 'relevancy_file' must
1404             contain relevance judgments for the queries that are named in the file supplied
1405             through the parameter 'query_file'. The format of these two files must be
1406             according to what is shown in the sample files 'test_queries.txt' and
1407             'relevancy.txt' in the 'examples' directory.
1408              
1409              
1410              
1411             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:
1412              
1413             my $corpus_dir = "corpus";
1414             my $stop_words_file = "stop_words.txt";
1415             my $query_file = "test_queries.txt";
1416             my $relevancy_file = "relevancy.txt";
1417             my $lsa = Algorithm::VSM->new(
1418             break_camelcased_and_underscored => 1,
1419             case_sensitive => 0,
1420             corpus_directory => $corpus_dir,
1421             file_types => ['.txt', '.java'],
1422             lsa_svd_threshold => 0.01,
1423             min_word_length => 4,
1424             query_file => $query_file,
1425             relevancy_file => $relevancy_file,
1426             stop_words_file => $stop_words_file,
1427             want_stemming => 1,
1428             );
1429             $lsa->get_corpus_vocabulary_and_word_counts();
1430             $lsa->generate_document_vectors();
1431             $lsa->upload_document_relevancies_from_file();
1432             $lsa->display_doc_relevancies();
1433             $lsa->precision_and_recall_calculator('lsa');
1434             $lsa->display_precision_vs_recall_for_queries();
1435             $lsa->display_average_precision_for_queries_and_map();
1436              
1437             As mentioned for the previous code block, the filename supplied through the
1438             constructor parameter 'relevancy_file' must contain relevance judgments for the
1439             queries that are named in the file supplied through the parameter 'query_file'.
1440             The format of this file must be according to what is shown in the sample file
1441             'relevancy.txt' in the 'examples' directory. We have already explained the roles
1442             played by the constructor parameters such as 'lsa_svd_threshold'.
1443              
1444              
1445              
1446             # FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:
1447              
1448             my $corpus_dir = "corpus";
1449             my $stop_words_file = "stop_words.txt";
1450             my $vsm = Algorithm::VSM->new(
1451             break_camelcased_and_underscored => 1,
1452             case_sensitive => 0,
1453             corpus_directory => $corpus_dir,
1454             file_types => ['.txt', '.java'],
1455             min_word_length => 4,
1456             stop_words_file => $stop_words_file,
1457             want_stemming => 1,
1458             );
1459             $vsm->get_corpus_vocabulary_and_word_counts();
1460             $vsm->generate_document_vectors();
1461             # code for calculating pairwise similarities as shown in the
1462             # script calculate_similarity_matrix_for_all_docs.pl in the
1463             # examples directory. This script makes calls to
1464             #
1465             # $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
1466             #
1467             # for every pair of documents.
1468              
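
             A hedged sketch of such a doubly-nested loop, using only the methods documented
             on this page (the CSV formatting here is a simplification of what the example
             script produces):

                 my @docs = @{$vsm->get_all_document_names()};
                 foreach my $i (0..$#docs) {
                     my @row;
                     foreach my $j (0..$#docs) {
                         push @row, $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
                     }
                     print join(",", @row), "\n";    # one row of the similarity matrix
                 }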
1469             =head1 CHANGES
1470              
1471             Version 1.70: All of the changes made in this version affect only that part of the
1472             module that is used for calculating the precision-vs.-recall curve for the estimation of
1473             MAP (Mean Average Precision). The new formulas that go into estimating MAP are
1474             presented in the author's tutorial on significance testing. Additionally, when
1475             estimating the average retrieval precision for a query, this version explicitly
1476             disregards all documents that have zero similarity with the query.
1477              
1478             Version 1.62 removes the Perl version restriction on the module. This version also
1479             fixes two bugs, one in the file scanner code and the other in the
1480             precision-and-recall calculator. The file scanner bug was related to the new
1481             constructor parameter C<file_types> that was introduced in Version 1.61. And the
1482             precision-and-recall calculator bug was triggered if a query consisted solely of
1483             non-vocabulary words.
1484              
1485             Version 1.61 improves the implementation of the directory scanner to make it more
1486             platform independent. Additionally, you are now required to specify in the
1487             constructor call the file types to be considered for computing the database model.
1488             If, say, you have a large software library and you want only Java and text files to
1489             be scanned for creating the VSM (or the LSA) model, you must supply that information
1490             to the module by setting the constructor parameter C<file_types> to the anonymous
1491             list C<['.java', '.txt']>. An additional constructor parameter introduced in this
1492             version is C<case_sensitive>. If you set it to 1, that will force the database model
1493             and query matching to become case sensitive.
1494              
1495             Version 1.60 reflects the fact that people are now more likely to use this module by
1496             keeping the model constructed for a corpus in the fast memory (as opposed to storing
1497             the models in disk-based hash tables) for its repeated invocation for different
1498             queries. As a result, the default value for the constructor option
1499             C<save_model_on_disk> was changed from 1 to 0. For those who still wish to store on
1500             a disk the model that is constructed, the script
1501             C<retrieve_with_VSM_and_also_create_disk_based_model.pl> shows how you can do that.
1502             Other changes in 1.60 include a slight reorganization of the scripts in the
1503             C<examples> directory. Most scripts now do not by default store their models in
1504             disk-based hash tables. This reorganization is reflected in the description of the
1505             C<examples> directory in this documentation. The basic logic of constructing VSM and
1506             LSA models and how these are used for retrievals remains unchanged.
1507              
1508             Version 1.50 incorporates a couple of new features: (1) You now have the option to
1509             split camel-cased and underscored words for constructing your vocabulary set; and (2)
1510             Storing the VSM and LSA models in database files on the disk is now optional. The
1511             second feature, in particular, should prove useful to those who are using this module
1512             for large collections of documents.
1513              
1514             Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
1515             C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
1516             You can get a better sense of the overall vocabulary being used by the module for
1517             file retrieval by examining the contents of a dump file whose name is supplied as an
1518             argument to C<write_corpus_vocab_to_file()>.
1519              
1520             Version 1.41 downshifts the required version of the PDL module. Also cleaned up are
1521             the dependencies between this module and the submodules of PDL.
1522              
1523             Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
1524             documents in the corpus. The elements of such a matrix express pairwise similarities
1525             between the documents. The pairwise similarities are based on the dot product of two
1526             document vectors divided by the product of the vector magnitudes. The 'examples'
1527             directory contains two scripts to illustrate how such matrices can be calculated by
1528             the user. The similarity matrix is output as a CSV file.
1529              
1530             Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
1531             document file. What that means is that the words that appear in most of the documents
1532             get reduced weighting since such words are non-discriminatory with respect to the
1533             retrieval of the documents. A typical formula that is used to calculate the IDF
1534             weight for a word is the logarithm of the ratio of the total number of documents to
1535             the number of documents in which the word appears. So if a word were to appear in
1536             all the documents, its IDF multiplier would be zero in the vector representation of a
1537             document. If so desired, you can turn off the IDF weighting of the words by
1538             explicitly setting the constructor parameter C<use_idf_filter> to zero.
1539              
1540             Version 1.2 includes a code correction and some general code and documentation
1541             cleanup.
1542              
1543             With Version 1.1, you can access the retrieval precision results so that you can
1544             compare two different retrieval algorithms (VSM or LSA with different choices for
1545             some of the constructor parameters) with significance testing. (Version 1.0 merely
1546             sent those results to standard output, typically your terminal window.) In Version
1547             1.1, the new script B<significance_testing.pl> in the 'examples' directory
1548             illustrates significance testing with Randomization and with Student's Paired t-Test.
1549              
1550             =head1 DESCRIPTION
1551              
1552             B<Algorithm::VSM> is a I<Perl> module for constructing a Vector Space Model (VSM) or
1553             a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
1554             to as a corpus, and then retrieving the documents in response to search words in a
1555             query.
1556              
1557             VSM and LSA models have been around for a long time in the Information Retrieval (IR)
1558             community. More recently such models have been shown to be effective in retrieving
1559             files/documents from software libraries. For an account of this research that was
1560             presented by Shivani Rao and the author of this module at the 2011 Mining Software
1561             Repositories conference, see their MSR11 paper on retrieval from software libraries for bug localization.
1562              
1563             VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2)
1564             Stemming the words so extracted and eliminating the designated stop words from the
1565             vocabulary. Stemming means that closely related words like 'programming' and
1566             'programs' are reduced to the common root word 'program' and the stop words are the
1567             non-discriminating words that can be expected to exist in virtually all the
1568             documents. (3) Constructing document vectors for the individual files in the corpus
1569             --- the document vectors taken together constitute what is usually referred to as a
1570             'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
1571             factor out the effect of document size and, if desired, multiplying the term
1572             frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
1573             the weight of the words that appear in a large number of documents. (5) Constructing
1574             a query vector for the search query after the query is subjected to the same stemming
1575             and stop-word elimination rules that were applied to the corpus. And, lastly, (6)
1576             Using a similarity metric to return the set of documents that are most similar to the
1577             query vector. The commonly used similarity metric is one based on the cosine
1578             distance between two vectors. Also note that all the vectors mentioned here are of
1579             the same size, the size of the vocabulary. An element of a vector is the frequency
1580             of occurrence of the word corresponding to that position in the vector.
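
             The cosine similarity metric of step (6) is simply the dot product of the two
             vectors divided by the product of their magnitudes. A self-contained sketch in
             plain Perl (illustrative only; the module has its own internal routines for
             this):

                 sub cosine_similarity {
                     my ($vec1, $vec2) = @_;              # two equal-length numeric arrayrefs
                     my ($dot, $mag1, $mag2) = (0, 0, 0);
                     for my $i (0 .. @$vec1 - 1) {
                         $dot  += $vec1->[$i] * $vec2->[$i];
                         $mag1 += $vec1->[$i] ** 2;
                         $mag2 += $vec2->[$i] ** 2;
                     }
                     return 0 unless $mag1 && $mag2;      # guard against all-zero vectors
                     return $dot / (sqrt($mag1) * sqrt($mag2));
                 }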
1581              
1582             LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one
1583             step further by subjecting the term-frequency matrix for the corpus to singular value
1584             decomposition (SVD). By retaining only a subset of the singular values (usually the
1585             N largest for some value of N), you can construct reduced-dimensionality vectors for
1586             the documents and the queries. In VSM, as mentioned above, the size of the document
1587             and the query vectors is equal to the size of the vocabulary. For large corpora,
1588             this size may involve tens of thousands of words --- this can slow down the VSM
1589             modeling and retrieval process. So you are very likely to get faster performance
1590             with retrieval based on LSA modeling, especially if you store the model once
1591             constructed in a database file on the disk and carry out retrievals using the
1592             disk-based model.
1593              
1594              
1595             =head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?
1596              
1597             This module has only been tested for software retrieval. For more general text
1598             retrieval, you would need to replace the simple stemmer used in the module by one
1599             based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the
1600             list of stop words appropriate to the text corpora of interest to you. As previously
1601             mentioned, the stop words are the commonly occurring words that do not carry much
1602             discriminatory power from the standpoint of distinguishing between the documents.
1603             See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
1604             formatted.
1605              
1606              
1607             =head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?
1608              
1609             It is not uncommon for large software libraries to consist of tens of thousands of
1610             documents that include source-code files, documentation files, README files,
1611             configuration files, etc. The bug-localization work presented recently by Shivani
1612             Rao and this author at the 2011 Mining Software Repository conference (MSR11) was
1613             based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
1614             size of 7553 unique words. (Also note that the iBUGS dataset was originally put
1615             together by V. Dallmeier and T. Zimmermann for the evaluation of automated bug
1616             detection and localization tools.) If C<V> is the size of the vocabulary and C<M>
1617             the number of documents in the corpus, the size of each vector will be C<V> and
1618             the size of the term-frequency matrix for the entire corpus will be C<V>xC<M>.
1619             So if you were to duplicate the bug localization experiments reported in the
1620             MSR11 work mentioned above, you would be dealing with vectors of
1622             size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers
1623             to really large libraries/corpora, we are obviously talking about very large matrices
1624             for SVD decomposition. For large libraries/corpora, it would be best to store away
1625             the model in a disk file and to base all subsequent retrievals on the disk-stored
1626             models. The 'examples' directory contains scripts that carry out retrievals on the
1627             basis of disk-based models. Further speedup in retrieval can be achieved by using
1628             LSA to create reduced-dimensionality representations for the documents and by basing
1629             retrievals on the stored versions of such reduced-dimensionality representations.
1630              
1631              
1632             =head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS
1633              
1634             The performance of a retrieval algorithm is typically measured by two properties:
1635             C<Precision at rank> and C<Recall at rank>. As mentioned in the author's tutorial
1636             on significance testing, at a given
1637             rank C<r>, C<Precision> is the ratio of the number of retrieved documents that are
1638             relevant to the total number of retrieved documents up to that rank. And, along the
1639             same lines, C<Recall> at a given rank C<r> is the ratio of the number of retrieved
1640             documents that are relevant to the total number of relevant documents. The Average
1641             Precision associated with a query is the average of all the Precision-at-rank values
1642             for all the documents relevant to that query. When query-specific Average Precision
1643             is averaged over all the queries, you get Mean Average Precision (MAP) as a
1644             single-number characterizer of the retrieval power of an algorithm for a given
1645             corpus. For an oracle, the value of MAP should be 1.0. On the other hand, for
1646             purely random retrieval from a corpus, the value of MAP will be inversely
1647             proportional to the size of the corpus. (See the author's tutorial mentioned above
1648             for further
1649             explanation of these retrieval precision evaluators.)
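
             The following minimal sketch shows how these definitions translate into code for
             a single query; @retrieved is the ranked list of retrieved documents and
             %relevant marks the relevant ones (both names hypothetical, and the module's own
             bookkeeping differs in its details):

                 my $num_relevant_seen = 0;
                 my @precision_at_relevant_ranks;
                 for my $rank (1 .. @retrieved) {
                     next unless $relevant{ $retrieved[$rank - 1] };
                     $num_relevant_seen++;
                     # precision at this rank = relevant documents seen so far / rank
                     push @precision_at_relevant_ranks, $num_relevant_seen / $rank;
                 }
                 my $avg_precision = 0;
                 $avg_precision += $_ / @precision_at_relevant_ranks for @precision_at_relevant_ranks;
                 # MAP is then the mean of $avg_precision over all the queries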
1650              
1651             This module includes methods that allow you to carry out these retrieval accuracy
1652             measurements using the relevancy judgments supplied through a disk file. If
1653             human-supplied relevancy judgments are not available, the module will be happy to
1654             estimate relevancies for you just by determining the number of query words that exist
1655             in a document. Note, however, that relevancy judgments estimated in this manner
1656             cannot be trusted. That is because ultimately it is the humans who are the best
1657             judges of the relevancies of documents to queries. The humans bring to bear semantic
1658             considerations on the relevancy determination problem that are beyond the scope of
1659             this module.
1660              
1661              
1662             =head1 METHODS
1663              
1664             The module provides the following methods for constructing VSM and LSA models of a
1665             corpus, for using the models thus constructed for retrieval, and for carrying out
1666             precision versus recall calculations for the determination of retrieval accuracy on
1667             the corpora of interest to you.
1668              
1669             =over
1670              
1671             =item B<new()>
1672              
1673             A call to C<new()> constructs a new instance of the C<Algorithm::VSM> class:
1674              
1675             my $vsm = Algorithm::VSM->new(
1676             break_camelcased_and_underscored => 1,
1677             case_sensitive => 0,
1678             corpus_directory => "",
1679             corpus_vocab_db => "corpus_vocab_db",
1680             doc_vectors_db => "doc_vectors_db",
1681             file_types => $my_file_types,
1682             lsa_svd_threshold => 0.01,
1683             max_number_retrievals => 10,
1684             min_word_length => 4,
1685             normalized_doc_vecs_db => "normalized_doc_vecs_db",
1686             query_file => "",
1687             relevancy_file => $relevancy_file,
1688             relevancy_threshold => 5,
1689             save_model_on_disk => 0,
1690             stop_words_file => "",
1691             use_idf_filter => 1,
1692             want_stemming => 1,
1693             );
1694              
1695             The values shown on the right side of the big arrows are the B<default values for
1696             the constructor parameters>. The value supplied through the variable C<$my_file_types> would be
1697             something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
1698             be included in creating the database model. The following nested list will now
1699             describe each of the constructor parameters shown above:
1700              
1701             =over 16
1702              
1703             =item I<break_camelcased_and_underscored>
1704
1705             The parameter B<break_camelcased_and_underscored> when set causes the
1706             underscored and camel-cased words to be split. By default the parameter is
1707             set. So if you don't want such words to be split, you must set it
1708             explicitly to 0.
1709              
1710             =item I<case_sensitive>
1711
1712             When set to 1, this parameter forces the module to maintain the case of the terms in
1713             the corpus files when creating the vocabulary and the document vectors. Setting
1714             C<case_sensitive> to 1 also causes the query matching to become case sensitive.
1715             (This constructor parameter was introduced in Version 1.61.)
1716              
1717             =item I<corpus_directory>
1718
1719             The parameter B<corpus_directory> points to the root of the directory of documents
1720             for which you want to create a VSM or LSA model.
1721              
1722             =item I<corpus_vocab_db>
1723
1724             The parameter B<corpus_vocab_db> is for naming the DBM in which the corpus vocabulary
1725             will be stored after it is subject to stemming and the elimination of stop words.
1726             Once a disk-based VSM model is created and stored away in the file named by this
1727             parameter and the parameter to be described next, it can subsequently be used
1728             directly for speedier retrieval.
1729              
1730             =item I<doc_vectors_db>
1731
1732             The database named by B<doc_vectors_db> stores the document vector representation for
1733             each document in the corpus. Each document vector has the same size as the
1734             corpus-wide vocabulary; each element of such a vector is the number of occurrences of
1735             the word that corresponds to that position in the vocabulary vector.
1736              
1737             =item I<file_types>
1738              
1739             This parameter tells the module what types of files in the corpus directory you want
1740             scanned for creating the database model. The value supplied for this parameter is an
1741             anonymous list of the file suffixes for the file types. For example, if you wanted
1742             only Java and text files to be scanned, you will set this parameter to C<['.java',
1743             '.txt']>. The module throws an exception if this parameter is left unspecified.
1744             (This constructor parameter was introduced in Version 1.61.)
1745              
1746             =item I<lsa_svd_threshold>
1747
1748             The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
1749             smaller than this threshold fraction of the largest singular value. This plays a
1750             critical role in creating reduced-dimensionality document vectors in LSA modeling of
1751             a corpus.
1752              
1753             =item I<max_number_retrievals>
1754
1755             The constructor parameter B<max_number_retrievals> sets the maximum number of retrieved documents that are displayed.
1756              
1757             =item I<min_word_length>
1758
1759             The parameter B<min_word_length> sets the minimum number of characters in a
1760             word in order for it to be included in the corpus vocabulary.
1761              
1762             =item I<normalized_doc_vecs_db>
1763
1764             The database named by B<normalized_doc_vecs_db> stores the normalized document
1765             vectors. Normalization consists of factoring out the size of the documents by
1766             dividing the term frequency for each word in a document by the number of words in the
1767             document, and then multiplying the result by the idf (Inverse Document Frequency)
1768             value for the word.
1769              
1770             =item I<query_file>
1771
1772             The parameter B<query_file> points to a file that contains the queries to be used for
1773             calculating retrieval performance with C<Precision> and C<Recall> numbers. The format
1774             of the query file must be as shown in the sample file C<test_queries.txt> in the
1775             'examples' directory.
1776              
1777             =item I<relevancy_file>
1778              
1779             This option names the disk file for storing the relevancy judgments.
1780              
1781             =item I<relevancy_threshold>
1782
1783             The constructor parameter B<relevancy_threshold> is used for automatic determination
1784             of document relevancies to queries on the basis of the number of occurrences of query
1785             words in a document. You can exercise control over the process of determining
1786             relevancy of a document to a query by giving a suitable value to this constructor
1787             parameter. A document is considered relevant to a query only
1788             when the document contains at least B<relevancy_threshold> query words.
1789              
1790             =item I<save_model_on_disk>
1791
1792             The constructor parameter B<save_model_on_disk>, when set to 1, will cause the basic
1793             information about the VSM and the LSA models to be stored on the disk.
1794             Subsequently, any retrievals can be carried out from the disk-based model.
1795              
1796             =item I<stop_words_file>
1797
1798             The parameter B<stop_words_file> is for naming the file that contains the stop words
1799             that you do not wish to include in the corpus vocabulary. The format of this file
1800             must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.
1801              
1802             =item I<use_idf_filter>
1803
1804             The constructor parameter B<use_idf_filter> is set by default. If you want
1805             to turn off the normalization of the document vectors, including turning
1806             off the weighting of the term frequencies of the words by their idf values,
1807             you must set this parameter explicitly to 0.
1808              
1809             =item I<want_stemming>
1810
1811             The boolean parameter B<want_stemming> determines whether or not the words extracted
1812             from the documents would be subject to stemming. As mentioned elsewhere, stemming
1813             means that related words like 'programming' and 'programs' would both be reduced to
1814             the root word 'program'.
1815              
1816             =back
1817              
1823              
1824             =item B<construct_lsa_model()>
1825              
1826             You call this subroutine for constructing an LSA model for your corpus
1827             after you have extracted the corpus vocabulary and constructed document
1828             vectors:
1829              
1830             $vsm->construct_lsa_model();
1831              
1832             The SVD decomposition that is carried out in LSA model construction uses the
1833             constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
1834             to retain for the LSA model. A singular value is retained only if it is larger than
1835             the C<lsa_svd_threshold> fraction of the largest singular value.
1836              
1837              
1838             =item B<display_average_precision_for_queries_and_map()>
1839              
1840             The Average Precision for a query is the average of the Precision-at-rank values
1841             associated with each of the corpus documents relevant to the query. The mean of the
1842             Average Precision values for all the queries is the Mean Average Precision (MAP).
1843             The C<Average Precision> values for the queries and the overall C<MAP> value can be printed
1844             out by calling
1845              
1846             $vsm->display_average_precision_for_queries_and_map();
1847              
1848              
1849             =item B<display_corpus_vocab()>
1850
1851             If you would like to see the corpus vocabulary as constructed by the previous call, make
1852             the call
1853              
1854             $vsm->display_corpus_vocab();
1855              
1856             Note that this is a useful thing to do only on small test corpora. If you need
1857             to examine the vocabulary for a large corpus, call the two methods listed below.
1858              
1859              
1860             =item B<display_corpus_vocab_size()>
1861              
1862             If you would like the module to print out in your terminal window the size of the
1863             vocabulary, make the call
1864              
1865             $vsm->display_corpus_vocab_size();
1866              
1867              
1868             =item B<display_doc_relevancies()>
1869              
1870             If you would like to see the document relevancies generated by the previous method,
1871             you can call
1872              
1873             $vsm->display_doc_relevancies();
1874              
1875              
1876             =item B<display_doc_vectors()>
1877              
1878             If you would like to see the document vectors constructed by the previous call, make
1879             the call:
1880              
1881             $vsm->display_doc_vectors();
1882              
1883             Note that this is a useful thing to do only on small test corpora. If you must call
1884             this method on a large corpus, you might wish to direct the output to a file.
1885              
1886              
1887             =item B<display_inverse_document_frequencies()>
1888              
1889             You can display the idf value associated with each word in the corpus by
1890              
1891             $vsm->display_inverse_document_frequencies();
1892              
1893             The idf of a word in the corpus is typically calculated as the logarithm of the ratio
1894             of the total number of documents in the corpus to the number of documents in which
1895             the word appears (with protection built in to prevent division by zero). Ideally, if
1896             a word appears in all the documents, its idf would be small, close to zero. Words
1897             with small idf values are non-discriminatory and should get reduced weighting in
1898             document retrieval.
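
             As a one-line illustration of the typical formula (hypothetical variable names;
             the module's internal computation may differ in such details as the exact
             zero-protection used):

                 # the 1 added to the denominator protects against division by zero
                 my $idf = log( $total_docs / (1 + $docs_containing_word) );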
1899              
1900              
1901             =item B<display_normalized_doc_vectors()>
1902              
1903             If you would like to see the normalized document vectors, make the call:
1904              
1905             $vsm->display_normalized_doc_vectors();
1906              
1907             See the comment made previously as to what is meant by the normalization of a
1908             document vector.
1909              
1910              
1911             =item B<display_precision_vs_recall_for_queries()>
1912
1913             A call to C<precision_and_recall_calculator()> will normally be followed by the
1914             following call
1915              
1916             $vsm->display_precision_vs_recall_for_queries();
1917              
1918             for displaying the C<Precision> and C<Recall> values.
1919              
1920              
1921             =item B<display_retrievals()>
1922              
1923             You can display the retrieved document names by calling this method using the syntax:
1924              
1925             $vsm->display_retrievals( $retrievals );
1926              
1927             where C<$retrievals> is a reference to the hash returned by a call to one of the two
1928             retrieval methods described below. The display method shown here respects the retrieval size
1929             constraints expressed by the constructor parameter C<max_number_retrievals>.
1930              
1931              
1932             =item B<estimate_doc_relevancies()>
1933              
1934             Before you can carry out precision and recall calculations to test the accuracy of
1935             VSM and LSA based retrievals from a corpus, you need to have available the relevancy
1936             judgments for the queries. (A relevancy judgment for a query is simply the list of
1937             documents relevant to that query.) Relevancy judgments are commonly supplied by the
1938             humans who are familiar with the corpus. But if such human-supplied relevance
1939             judgments are not available, you can invoke the following method to estimate them:
1940              
1941             $vsm->estimate_doc_relevancies();
1942              
1943             For the above method call, a document is considered to be relevant to a query if it
1944             contains several of the query words. As to the minimum number of query words that
1945             must exist in a document in order for the latter to be considered relevant, that is
1946             determined by the C<relevancy_threshold> parameter in the VSM constructor.
1947              
1948             But note that this estimation of document relevancies to queries is NOT for serious
1949             work. The reason is that ultimately it is the humans who are the best
1950             judges of the relevancies of documents to queries. The humans bring to bear semantic
1951             considerations on the relevancy determination problem that are beyond the scope of
1952             this module.
1953              
1954             The generated relevancies are deposited in a file named by the constructor parameter
1955             C<relevancy_file>.
1956              
1957              
1958             =item B<get_all_document_names()>
1959              
1960             If you want to get hold of all the filenames in the corpus in your own script, you
1961             can call
1962              
1963             my @docs = @{$vsm->get_all_document_names()};
1964              
1965             The array on the left will contain an alphabetized list of the files.
1966              
1967              
1968             =item B<generate_document_vectors()>
1969              
1970             This is a necessary step after the vocabulary used by a corpus is constructed. (Of
1971             course, if you will be doing document retrieval through a disk-stored VSM or LSA
1972             model, then you do not need to call this method.) You construct document vectors
1973             through the following call:
1974              
1975             $vsm->generate_document_vectors();
1976              
1977              
1978             =item B<get_corpus_vocabulary_and_word_counts()>
1979              
1980             After you have constructed a new instance of the C<Algorithm::VSM> class, you must
1981             now scan the corpus documents for constructing the corpus vocabulary. This you do by:
1982              
1983             $vsm->get_corpus_vocabulary_and_word_counts();
1984              
1985             The only time you do NOT need to call this method is when you are using a previously
1986             constructed disk-stored VSM model for retrieval.
1987              
1988              
1989             =item B<get_query_sorted_average_precision_for_queries()>
1990              
1991             If you want to run significance tests on the retrieval accuracies you obtain on a
1992             given corpus and with different algorithms (VSM or LSA with different choices for the
1993             constructor parameters), your own script would need access to the average precision
1994             data for a set of queries. You can get hold of this data by calling
1995              
1996             $vsm->get_query_sorted_average_precision_for_queries();
1997              
1998             The script C<significance_testing.pl> in the 'examples' directory shows how you can
1999             use this method for significance testing.
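
             As a hedged illustration (this is not the example script itself), the
             query-sorted average precisions from two different models, here C<$vsm> and
             C<$lsa> built as shown in the Synopsis, can be turned into the paired
             differences that a Randomization test or a paired t-Test operates on:

                 my @ap_model_1 = @{$vsm->get_query_sorted_average_precision_for_queries()};
                 my @ap_model_2 = @{$lsa->get_query_sorted_average_precision_for_queries()};
                 # one paired difference per query, fed to the significance test
                 my @paired_diffs = map { $ap_model_1[$_] - $ap_model_2[$_] } 0 .. $#ap_model_1;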
2000              
2001              
2002             =item B<pairwise_similarity_for_docs()>
2003
2004             =item B<pairwise_similarity_for_normalized_docs()>
2005              
2006             If you would like to compare in your own script any two documents in the corpus, you
2007             can call
2008              
2009             my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");
2010             or
2011             my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");
2012              
2013             Both these calls return a number that is the dot product of the two document vectors
2014             normalized by the product of their magnitudes. The first call uses the regular
2015             document vectors and the second the normalized document vectors.
2016              
2017              
2018             =item B<precision_and_recall_calculator()>
2019              
2020             After you have created or obtained the relevancy judgments for your test queries, you
2021             can make the following call to calculate C<Precision> and C<Recall>:
2022              
2023             $vsm->precision_and_recall_calculator('vsm');
2024             or
2025             $vsm->precision_and_recall_calculator('lsa');
2026              
2027             depending on whether you are testing VSM-based retrieval or LSA-based retrieval.
2028              
2029             =item B<retrieve_with_lsa()>
2030
2031             After you have built an LSA model through the call to C<construct_lsa_model()>, you
2032             can retrieve the document names most similar to the query by:
2033              
2034             my $retrievals = $vsm->retrieve_with_lsa( \@query );
2035              
2036             Subsequently, you can display the retrievals by calling the
2037             C<display_retrievals()> method described previously.
2038              
2039              
2040             =item B<retrieve_with_vsm()>
2041              
2042             After you have constructed a VSM model, you call this method for document retrieval
2043             for a given query C<@query>. The call syntax is:
2044              
2045             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2046              
2047             The argument, C<@query>, is simply a list of words that you wish to use for
2048             retrieval. The method returns a hash whose keys are the document names and whose
2049             values are the similarity distances between the documents and the query. As is commonly
2050             the case with VSM, this module uses the cosine similarity distance when comparing a
2051             document vector with the query vector.
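
             Since the returned hash maps document names to similarity scores, you can also
             rank the retrievals yourself; a small sketch:

                 my $retrievals = $vsm->retrieve_with_vsm( \@query );
                 # documents in decreasing order of cosine similarity to the query
                 foreach my $doc (sort { $retrievals->{$b} <=> $retrievals->{$a} } keys %$retrievals) {
                     printf "%s => %.4f\n", $doc, $retrievals->{$doc};
                 }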
2052              
2053              
2054             =item B<upload_document_relevancies_from_file()>
2055              
2056             When human-supplied relevancies are available, you can upload them into the program
2057             by calling
2058              
2059             $vsm->upload_document_relevancies_from_file();
2060              
2061             These relevance judgments will be read from a file that is named with the
2062             C<relevancy_file> constructor parameter.
2063              
2064              
2065             =item B<upload_normalized_vsm_model_from_disk()>
2066
2067             When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
2068             C<generate_document_vectors()> with C<save_model_on_disk> set, that automatically deposits the VSM model in the
2069             database files named with the constructor parameters C<corpus_vocab_db>,
2070             C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out
2071             retrieval by directly using this disk-based VSM model for speedier performance. In
2072             order to do so, you must upload the disk-based model by
2073              
2074             $vsm->upload_normalized_vsm_model_from_disk();
2075              
2076             Subsequently you call
2077              
2078             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2079             $vsm->display_retrievals( $retrievals );
2080              
2081             for retrieval and for displaying the results.
2082              
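Pulling these steps together, a retrieval session that runs off the disk-based model
might look like the following sketch; the database file names shown are the default
values mentioned above:

    my $vsm = Algorithm::VSM->new(
                  corpus_directory       => "corpus",
                  corpus_vocab_db        => "corpus_vocab_db",
                  doc_vectors_db         => "doc_vectors_db",
                  normalized_doc_vecs_db => "normalized_doc_vecs_db",
              );
    $vsm->upload_normalized_vsm_model_from_disk();
    my $retrievals = $vsm->retrieve_with_vsm( [qw(find pattern)] );
    $vsm->display_retrievals( $retrievals );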

=item B<write_corpus_vocab_to_file():>

This is the method to call for large text corpora if you would like to examine the
vocabulary created. The call syntax is

    $vsm->write_corpus_vocab_to_file($filename);

where C<$filename> is the name of the file that you want the vocabulary to be written
out to. This call will also show the frequency of each vocabulary word in your
corpus.

=back


=head1 REQUIRED

This module requires the following modules:

    SDBM_File
    Storable
    PDL
    File::Basename
    File::Spec::Functions

The first two of these are needed for creating disk-based database records for the
VSM and LSA models. The third is needed for calculating the SVD of the
term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
by the directory scanner to make pathnames platform independent.

=head1 EXAMPLES

See the 'examples' directory in the distribution for the scripts listed below:

=over

=item B<For Basic VSM-Based Model Construction and Retrieval:>

For basic VSM-based model construction and retrieval, run the script:

    retrieve_with_VSM.pl

Starting with version 1.60, this script does not store away the VSM model in
disk-based hash tables. If you want your model to be stored on the disk, you must
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.

=item B<For Running Repeated VSM-Based Retrievals in an Infinite Loop:>

If you want to run an infinite loop for repeated retrievals from a VSM model, run the
script

    continuously_running_VSM_retrieval_engine.pl

You can create a script similar to this for doing the same with LSA models.

=item B<For Storing the Model Information on Disk:>

For storing the model information in disk-based DBM files that can subsequently be
used for both VSM and LSA retrieval, run the script:

    retrieve_with_VSM_and_also_create_disk_based_model.pl

=item B<For Basic LSA-Based Model Construction and Retrieval:>

For basic LSA-based model construction and retrieval, run the script:

    retrieve_with_LSA.pl

Starting with version 1.60, this script does not store away the model information in
disk-based hash tables. If you want your model to be stored on the disk, you must
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.

=item B<For VSM-Based Retrieval with a Disk-Stored Model:>

If you have previously run a script like
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script

    retrieve_with_disk_based_VSM.pl

for repeated VSM-based retrievals from a disk-based model.

=item B<For LSA-Based Retrieval with a Disk-Stored Model:>

If you have previously run a script like
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script

    retrieve_with_disk_based_LSA.pl

for repeated LSA-based retrievals from a disk-based model.

=item B<For Precision and Recall Calculations with VSM:>

To experiment with precision and recall calculations for VSM retrieval, run the
script:

    calculate_precision_and_recall_for_VSM.pl

Note that this script will carry out its own estimation of relevancy judgments ---
which in most cases would not be a safe thing to do.

=item B<For Precision and Recall Calculations with LSA:>

To experiment with precision and recall calculations for LSA retrieval, run the
script:

    calculate_precision_and_recall_for_LSA.pl

Note that this script will carry out its own estimation of relevancy judgments ---
which in most cases would not be a safe thing to do.

=item B<For Precision and Recall Calculations for VSM with
Human-Supplied Relevancies:>

Precision and recall calculations for retrieval accuracy determination are best
carried out with human-supplied judgments of relevancies of the documents to queries.
If such judgments are available, run the script:

    calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl

This script will print out the average precisions for the different test queries and
calculate the MAP metric of retrieval accuracy.

=item B<For Precision and Recall Calculations for LSA with
Human-Supplied Relevancies:>

If human-supplied relevancy judgments are available and you wish to experiment with
precision and recall calculations for LSA-based retrieval, run the script:

    calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl

This script will print out the average precisions for the different test queries and
calculate the MAP metric of retrieval accuracy.

=item B<To Carry Out Significance Testing on Retrieval Precision Results with
Randomization or with Student's Paired t-Test:>

    significance_testing.pl randomization

or

    significance_testing.pl t-test

Significance testing consists of forming a null hypothesis that the two retrieval
algorithms you are considering are the same from a black-box perspective and then
calculating what is known as a C<p-value>. If the C<p-value> is less than, say,
0.05, you reject the null hypothesis.

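The following self-contained sketch conveys the randomization idea with made-up
average-precision numbers; it is not the implementation used by
C<significance_testing.pl>:

    use strict;
    use warnings;
    use List::Util qw(sum);

    my @ap_vsm = (0.80, 0.65, 0.90, 0.55);    # made-up per-query APs
    my @ap_lsa = (0.75, 0.70, 0.85, 0.60);

    my @diffs    = map { $ap_vsm[$_] - $ap_lsa[$_] } 0 .. $#ap_vsm;
    my $observed = abs( sum(@diffs) / @diffs );

    # Under the null hypothesis, the sign of each per-query difference is
    # arbitrary; flip the signs at random and count how often the mean
    # difference is at least as large as the observed one.
    my $trials = 10_000;
    my $count  = 0;
    for (1 .. $trials) {
        my @flipped = map { rand() < 0.5 ? -$_ : $_ } @diffs;
        $count++ if abs( sum(@flipped) / @flipped ) >= $observed;
    }
    printf "estimated p-value: %.4f\n", $count / $trials;
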
=item B<For Calculating a Similarity Matrix for All the Documents in Your Corpus:>

    calculate_similarity_matrix_for_all_docs.pl

or

    calculate_similarity_matrix_for_all_normalized_docs.pl

The former uses regular document vectors for calculating the similarity between every
pair of documents in the corpus, and the latter uses normalized document vectors for
the same purpose. The document order used for row and column indexing of the matrix
corresponds to the alphabetic ordering of the document names in the corpus directory.

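A bare-bones version of what these scripts compute could look like the sketch below;
the document names are hypothetical and are sorted to match the row and column
ordering just described:

    my @doc_names = sort qw(Doc1.txt Doc2.txt Doc3.txt);   # hypothetical names
    my @matrix;
    foreach my $i (0 .. $#doc_names) {
        foreach my $j (0 .. $#doc_names) {
            $matrix[$i][$j] =
                $vsm->pairwise_similarity_for_docs($doc_names[$i], $doc_names[$j]);
        }
    }
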
=back


=head1 EXPORT

None by design.

=head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS

You have to be careful when carrying out Precision versus Recall calculations if you
do not wish to lose the previously created relevancy judgments. Invoking the method
C<estimate_doc_relevancies()> in your own script will cause the file
C<relevancy.txt> to be overwritten. If you have created a relevancy database and
stored it in a file called, say, C<relevancy.txt>, you should make a backup copy of
this file before executing a script that calls C<estimate_doc_relevancies()>.

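One simple safeguard is to copy the file aside before any such script runs, as in
this sketch, where C<relevancy.txt> stands for whatever name you gave to the file:

    use File::Copy;
    copy("relevancy.txt", "relevancy.txt.bak")
        or die "Could not back up the relevancy file: $!";
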
=head1 BUGS

Please notify the author if you encounter any bugs. When sending email, please place
the string 'VSM' in the subject line to get past my spam filter.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-VSM-1.70.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-VSM-1.70>. Enter this directory and execute the following commands for a
standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/


=head1 THANKS

Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
deep insights into IR. Version 1.4 was prompted by Zahn Bozanic's interest in
similarity matrix characterization of a corpus. Thanks, Zahn!

Several of the recent changes to the module are a result of the feedback I have
received from Naveen Kulkarni of Infosys Labs. Thanks, Naveen!

Version 1.62 was a result of Slaven Rezic's recommendation that I remove the Perl
version restriction on the module since he was able to run it with Perl version
5.8.9. Another important reason for v. 1.62 was the discovery of the two bugs
mentioned in Changes, one of them brought to my attention by Naveen Kulkarni.

=head1 AUTHOR

The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy" project
with the publication of the book "B<Designing with Objects>" by John Wiley. If
interested, check out his web page at Purdue to find out what the Objects Trilogy
project was all about. You might like "B<Designing with Objects>" especially if you
enjoyed reading Harry Potter as a kid (or even as an adult, for that matter). The
other two books in the trilogy are "B<Programming with Objects>" and "B<Scripting
with Objects>".

For all issues related to this module, contact the author at C<kak@purdue.edu>.

If you send email, please place the string "VSM" in your subject line to get past the
author's spam filter.

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2015 Avinash Kak

=cut