package Algorithm::VSM;

#---------------------------------------------------------------------------
# Copyright (c) 2015 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::VSM is a Perl module for retrieving the documents from a software
# library that match a list of words in a query.  The matching criterion used depends
# on whether you ask the module to construct a full-dimensionality VSM or a
# reduced-dimensionality LSA model for the library.
# ---------------------------------------------------------------------------
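
# A minimal usage sketch (an assumption pieced together from the methods
# defined in this file, not a verbatim example from the module's docs):
# construct a model, build the vocabulary and document vectors, then retrieve
# with either the full-dimensionality VSM or the reduced-dimensionality LSA
# criterion.  The corpus directory, file types, and query words are made up.
#
#     use Algorithm::VSM;
#     my $vsm = Algorithm::VSM->new( corpus_directory => "corpus",
#                                    file_types       => ['.txt', '.java'] );
#     $vsm->get_corpus_vocabulary_and_word_counts();
#     $vsm->generate_document_vectors();
#     my $retrievals = $vsm->retrieve_with_vsm( [qw(string memory heap)] );
#     # or, for LSA:  $vsm->construct_lsa_model();
#     #               $retrievals = $vsm->retrieve_with_lsa( [qw(string memory heap)] );
#     $vsm->display_retrievals( $retrievals );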

use 5.10.0;
use strict;
use warnings;
use Carp;
use SDBM_File;
use PDL::Lite;
use PDL::MatrixOps;
use File::Basename;
use File::Spec::Functions qw(rel2abs);
use Fcntl;
use Storable;
use Cwd;

our $VERSION = '1.61';

# for camelcase splits (from perlmonks):
my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
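
# For example (an illustration, not from the original source), matching this
# regex globally against a camelcased identifier splits it at the case
# boundaries; the retrieval code later lowercases the fragments:
#
#     my @parts = "MaxPooledConnections" =~ /$_regex/g;   # ("Max", "Pooled", "Connections")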

################################### Constructor #######################################

# Constructor for creating a VSM or LSA model of a corpus.  The model instance
# returned by the constructor can be used for retrieving documents from the corpus
# in response to queries.
sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if _check_for_illegal_params(@params) == 0;
    bless {
        _corpus_directory                 => $args{corpus_directory} || "",
        _save_model_on_disk               => $args{save_model_on_disk} || 0,
        _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
                                             $args{break_camelcased_and_underscored} : 1,
        _corpus_vocab_db                  => $args{corpus_vocab_db} || "corpus_vocab_db",
        _doc_vectors_db                   => $args{doc_vectors_db} || "doc_vectors_db",
        _normalized_doc_vecs_db           => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
        _stop_words_file                  => $args{stop_words_file} || "",
        _case_sensitive                   => $args{case_sensitive} || 0,
        _query_file                       => $args{query_file} || "",
        _file_types                       => $args{file_types} || [],
        _min_word_length                  => $args{min_word_length} || 4,
        _want_stemming                    => $args{want_stemming} || 0,
        _idf_filter_option                => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
        _max_number_retrievals            => $args{max_number_retrievals} || 30,
        _lsa_svd_threshold                => $args{lsa_svd_threshold} || 0.01,
        _relevancy_threshold              => exists $args{relevancy_threshold} ?
                                             $args{relevancy_threshold} : 1,
        _relevancy_file                   => $args{relevancy_file} || "",
        _debug                            => $args{debug} || 0,
        _working_directory                => cwd,
        _vocab_hist_on_disk               => {},
        _vocab_hist                       => {},
        _doc_hist_template                => {},
        _corpus_doc_vectors               => {},
        _normalized_doc_vecs              => {},
        _query_vector                     => {},
        _stop_words                       => [],
        _term_document_matrix             => [],
        _corpus_vocab_done                => 0,
        _scan_dir_for_rels                => 0,
        _vocab_size                       => undef,
        _doc_vecs_trunc_lsa               => {},
        _lsa_vec_truncator                => undef,
        _queries_for_relevancy            => {},
        _relevancy_estimates              => {},
        _precision_for_queries            => {},
        _recall_for_queries               => {},
        _map_vals_for_queries             => {},
        _vocab_idf_hist                   => {},
        _idf_t                            => {},
        _total_num_of_docs                => 0,
    }, $class;
}
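
# A note on the 'exists' ternaries above: for parameters that default to 1
# (break_camelcased_and_underscored, use_idf_filter, relevancy_threshold), the
# usual '$args{...} || 1' idiom would silently discard an explicit 0 supplied
# by the caller, since 0 is false in Perl; testing with 'exists' preserves a
# user-supplied 0.  A hypothetical call that relies on this:
#
#     my $vsm = Algorithm::VSM->new( corpus_directory => "corpus",
#                                    file_types       => ['.txt'],
#                                    use_idf_filter   => 0 );   # idf filter off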

###################### Get corpus vocabulary and word counts #########################

sub get_corpus_vocabulary_and_word_counts {
    my $self = shift;
    die "You must supply the name of the corpus directory to the constructor"
        unless $self->{_corpus_directory};
    print "Scanning the directory '$self->{_corpus_directory}' for\n" .
          " model construction\n\n" if $self->{_debug};
    $self->_scan_directory( $self->{_corpus_directory} );
    $self->_drop_stop_words() if $self->{_stop_words_file};
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    if ($self->{_save_model_on_disk}) {
        unlink glob "$self->{_corpus_vocab_db}.*";
        unlink glob "$self->{_doc_vectors_db}.*";
        unlink glob "$self->{_normalized_doc_vecs_db}.*";
        tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
            $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";
        foreach (keys %{$self->{_vocab_hist}}) {
            $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
        }
        untie %{$self->{_vocab_hist_on_disk}};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    # Calculate idf(t):
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
                                              /
                                              (1 + $self->{_vocab_idf_hist}->{$_})))
                                     / log(10) );
    }
}
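
# A worked instance of the idf(t) formula above (the numbers are illustrative):
# with D = _total_num_of_docs = 100 documents and d(t) = _vocab_idf_hist = 9
# documents containing term t,
#
#     idf(t) = |(1 + ln(100 / (1 + 9))) / ln(10)|
#            = (1 + ln 10) / ln 10  =  1/ln(10) + log10(10)  =  1.434...
#
# Note that Perl's log() is the natural log; dividing by log(10) converts the
# result to base 10.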

sub display_corpus_vocab {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    print "\n\nDisplaying corpus vocabulary:\n\n";
    foreach (sort keys %{$self->{_vocab_hist}}) {
        my $outstring = sprintf("%30s %d", $_, $self->{_vocab_hist}->{$_});
        print "$outstring\n";
    }
}

sub display_corpus_vocab_size {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
    print "\nSize of the corpus vocabulary: $vocab_size\n\n";
}

sub write_corpus_vocab_to_file {
    my $self = shift;
    my $file = shift;
    die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
    open OUT, "> $file"
        or die "unable to open for output a file with name `$file': $!";
    foreach (sort keys %{$self->{_vocab_hist}}) {
        my $outstring = sprintf("%30s %d", $_, $self->{_vocab_hist}->{$_});
        print OUT "$outstring\n";
    }
    close OUT;
}

sub display_inverse_document_frequencies {
    my $self = shift;
    die "corpus vocabulary not yet constructed"
        unless keys %{$self->{_vocab_idf_hist}};
    print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
        unless $self->{_idf_filter_option};
    print "\n\nDisplaying inverse document frequencies:\n";
    foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
        my $outstring = sprintf("%30s %d",
                                $_, $self->{_vocab_idf_hist}->{$_});
        print "$outstring\n";
    }
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
    foreach ( sort keys %{$self->{_idf_t}} ) {
        my $outstring = sprintf("%30s %f", $_, $self->{_idf_t}->{$_});
        print "$outstring\n";
    }
}

sub get_all_document_names {
    my $self = shift;
    my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
    return \@all_files;
}

############################ Generate Document Vectors #################################

sub generate_document_vectors {
    my $self = shift;
    chdir $self->{_working_directory};
    foreach ( sort keys %{$self->{_vocab_hist}} ) {
        $self->{_doc_hist_template}->{$_} = 0;
    }
    $self->_scan_directory( $self->{_corpus_directory} );
    chdir $self->{_working_directory};
    if ($self->{_save_model_on_disk}) {
        die "You did not specify in the constructor call the names for the diskfiles " .
            "for storing the disk-based hash tables consisting of document vectors " .
            "and their normalized versions"
            unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
        eval {
            store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
        };
        if ($@) {
            print "Something went wrong with disk storage of document vectors: $@";
        }
        eval {
            store( $self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db} );
        };
        if ($@) {
            print "Something wrong with disk storage of normalized doc vecs: $@";
        }
    }
}

sub display_doc_vectors {
    my $self = shift;
    die "document vectors not yet constructed"
        unless keys %{$self->{_corpus_doc_vectors}};
    foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
        print "\n\ndisplay doc vec for $file:\n";
        foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
            print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
        }
        my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
        print "\nSize of vector for $file: $docvec_size\n";
    }
}

sub display_normalized_doc_vectors {
    my $self = shift;
    die "normalized document vectors not yet constructed"
        unless keys %{$self->{_normalized_doc_vecs}};
    unless ($self->{_idf_filter_option}) {
        print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
        return;
    }
    foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
        print "\n\ndisplay normalized doc vec for $file:\n";
        foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
            print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
        }
        my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
        print "\nSize of normalized vector for $file: $docvec_size\n";
    }
}

######################## Calculate Pairwise Document Similarities ######################

# Returns the similarity score for two documents whose actual names are supplied
# as its two arguments.
sub pairwise_similarity_for_docs {
    my $self = shift;
    my $doc1 = shift;
    my $doc2 = shift;
    my @all_files = keys %{$self->{_corpus_doc_vectors}};
    croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
    my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}

sub pairwise_similarity_for_normalized_docs {
    my $self = shift;
    my $doc1 = shift;
    my $doc2 = shift;
    my @all_files = keys %{$self->{_corpus_doc_vectors}};
    croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
    croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
    my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
    my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
    my @vec1 = ();
    my @vec2 = ();
    foreach my $word (sort keys %$vec_hash_ref1) {
        push @vec1, $vec_hash_ref1->{$word};
        push @vec2, $vec_hash_ref2->{$word};
    }
    my $vec_mag1 = vec_magnitude(\@vec1);
    my $vec_mag2 = vec_magnitude(\@vec2);
    my $product = vec_scalar_product(\@vec1, \@vec2);
    $product /= $vec_mag1 * $vec_mag2;
    return $product;
}
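
# Both routines above compute the cosine similarity between two document
# vectors: dot(v1, v2) / (|v1| * |v2|).  An illustrative check with a made-up
# two-word vocabulary: for v1 = (1, 2) and v2 = (2, 4),
#
#     dot(v1, v2) = 1*2 + 2*4 = 10,   |v1| = sqrt(5),   |v2| = sqrt(20),
#     similarity  = 10 / (sqrt(5) * sqrt(20)) = 10 / 10 = 1.0
#
# i.e. parallel vectors score 1.0 regardless of their lengths.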

############################### Retrieve with VSM Model ################################

sub retrieve_with_vsm {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};

    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = $self->{_case_sensitive} ?
            grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? $1 : ''} @clean_words :
            grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? "\L$1" : ''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour query words are: @$query\n" if $self->{_debug};
    if ($self->{_idf_filter_option}) {
        die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}})
                && scalar(keys %{$self->{_normalized_doc_vecs}});
    } else {
        die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
            unless scalar(keys %{$self->{_vocab_hist}})
                && scalar(keys %{$self->{_corpus_doc_vectors}});
    }
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        if ($self->{_case_sensitive}) {
            $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
        } else {
            $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
        }
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "\nYour query does not contain corpus words. Nothing retrieved.\n"
        unless $query_word_count_total;
    my %retrievals;
    if ($self->{_idf_filter_option}) {
        print "\n\nUsing idf filter option for retrieval:\n\n"
            if $self->{_debug};
        foreach (sort {$self->_doc_vec_comparator}
                 keys %{$self->{_normalized_doc_vecs}}) {
            $retrievals{$_} = $self->_similarity_to_query($_);
        }
    } else {
        print "\n\nNOT using idf filter option for retrieval:\n\n"
            if $self->{_debug};
        foreach (sort {$self->_doc_vec_comparator}
                 keys %{$self->{_corpus_doc_vectors}}) {
            $retrievals{$_} = $self->_similarity_to_query($_);
        }
    }
    if ($self->{_debug}) {
        print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
    }
    return \%retrievals;
}
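
# Illustrative call (the query words are hypothetical): the method takes a
# reference to an array of query words and returns a hash reference mapping
# each corpus document to its similarity score, which display_retrievals()
# can then print in descending order:
#
#     my $retrievals = $vsm->retrieve_with_vsm( [qw(heap allocation routine)] );
#     $vsm->display_retrievals( $retrievals );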

######################### Upload a Previously Constructed Model #########################

sub upload_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_doc_vectors_db};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
    untie %{$self->{_vocab_hist_on_disk}};
}

sub upload_normalized_vsm_model_from_disk {
    my $self = shift;
    die "\nCannot find the database files for the VSM model"
        unless -s "$self->{_corpus_vocab_db}.pag"
            && -s $self->{_normalized_doc_vecs_db};
    $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
    tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
        $self->{_corpus_vocab_db}, O_RDONLY, 0640
        or die "Can't open DBM file: $!";
    if ($self->{_debug}) {
        foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
            printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
        }
    }
    foreach (keys %{$self->{_vocab_hist_on_disk}}) {
        $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
    }
    $self->{_corpus_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug};
    untie %{$self->{_vocab_hist_on_disk}};
}

############################## Display Retrieval Results ################################

sub display_retrievals {
    my $self = shift;
    my $retrievals = shift;
    print "\n\nShowing the retrievals and the similarity scores:\n\n";
    my $iter = 0;
    foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals) {
        print "$_ => $retrievals->{$_}\n";
        $iter++;
        last if $iter > $self->{_max_number_retrievals};
    }
    print "\n\n";
}

############################### Directory Scanner ################################

sub _scan_directory {
    my $self = shift;
    my $dir = rel2abs( shift );
    my $current_dir = cwd;
    chdir $dir or die "Unable to change directory to $dir: $!";
    foreach ( glob "*" ) {
        if ( -d and !(-l) ) {
            $self->_scan_directory( $_ );
            chdir $dir
                or die "Unable to change directory to $dir: $!";
        } elsif (-r _ and
                 -T _ and
                 -M _ > 0.00001 and    # modification age is at least 1 sec
                 !( -l $_ ) and
                 $self->ok_to_filetype($_) ) {
            $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
            $self->_scan_file($_) unless $self->{_corpus_vocab_done};
            $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
        }
    }
    chdir $current_dir;
}
sub _scan_file {
    my $self = shift;
    my $file = shift;
    open IN, $file;
    my $min = $self->{_min_word_length};
    my %uniques = ();
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @clean_words;
        if ($self->{_break_camelcased_and_underscored}) {
            my @brokenup = grep $_, split /\W|_|\s+/, $_;
            @clean_words = map {$_ =~ /$_regex/g} @brokenup;
            @clean_words = $self->{_case_sensitive} ?
                grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? $1 : ''} @clean_words :
                grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? "\L$1" : ''} @clean_words;
        } else {
            my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
            @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
        }
        next unless @clean_words;
        @clean_words = grep $_, map &simple_stemmer($_), @clean_words
            if $self->{_want_stemming};
        if ($self->{_case_sensitive}) {
            map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words;
        } else {
            map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
        }
        for (@clean_words) { $uniques{$_}++ }
    }
    close( IN );
    map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
    $self->{_total_num_of_docs}++;
}

sub ok_to_filetype {
    my $self = shift;
    my $filename = shift;
    my ($base, $dir, $suffix) = fileparse($filename, '\..*');
    croak "You called this module without specifying the file types in the constructor"
        unless @{$self->{_file_types}} > 0;
    return 1 if contained_in($suffix, @{$self->{_file_types}});
    return 0;
}

############################## LSA Modeling and Retrieval ################################

sub construct_lsa_model {
    my $self = shift;
    if ($self->{_idf_filter_option}) {
        if (!$self->{_normalized_doc_vecs} and
            -s $self->{_normalized_doc_vecs_db}) {
            $self->{_normalized_doc_vecs} =
                retrieve($self->{_normalized_doc_vecs_db});
        }
        foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys
                              %{$self->{_normalized_doc_vecs}->{$_}}) {
                push @$term_frequency_vec,
                    $self->{_normalized_doc_vecs}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    } else {
        if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
            $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
        }
        foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
            my $term_frequency_vec;
            foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}) {
                push @$term_frequency_vec,
                    $self->{_corpus_doc_vectors}->{$_}->{$word};
            }
            push @{$self->{_term_document_matrix}}, $term_frequency_vec;
        }
    }
    my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
    my ($U,$SIGMA,$V) = svd $A;
    print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
    print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
    my $index = return_index_of_last_value_above_threshold($SIGMA,
                                                $self->{_lsa_svd_threshold});
    my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
    print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
    # When you measure the size of a matrix in PDL, the zeroth dimension
    # is considered to be along the horizontal and the one-th dimension
    # along the rows.  This is opposite of how we want to look at
    # matrices.  For a matrix of size MxN, we mean M rows and N columns.
    # With this 'rows x columns' convention for matrix size, if you had
    # to check the size of, say, U matrix, you would call
    #   my @size = ( $U->getdim(1), $U->getdim(0) );
    #   print "\nsize of U: @size\n";
    my $U_trunc = $U->slice("0:$index,:")->sever;
    my $V_trunc = $V->slice("0:$index,0:$index")->sever;
    $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
                                  PDL::Basic::transpose($U_trunc);
    print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
        if $self->{_debug};
    my @sorted_doc_names = $self->{_idf_filter_option} ?
                           sort keys %{$self->{_normalized_doc_vecs}} :
                           sort keys %{$self->{_corpus_doc_vectors}};
    my $i = 0;
    foreach (@{$self->{_term_document_matrix}}) {
        my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
                                PDL::Basic::transpose(pdl($_));
        my $doc_name = $sorted_doc_names[$i++];
        print "\n\nTruncated doc vec for $doc_name: " .
              $truncated_doc_vec . "\n" if $self->{_debug};
        $self->{_doc_vecs_trunc_lsa}->{$doc_name}
            = $truncated_doc_vec;
    }
    chdir $self->{_working_directory};
}
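
# The linear algebra above, in brief (notation mine, not from the original
# comments): the SVD factors the term-document matrix A as A = U * SIGMA * V^T.
# Keeping only the singular values above lsa_svd_threshold gives the rank-k
# truncation A_k = U_k * SIGMA_k * V_k^T, and the stored truncator
#
#     SIGMA_k^{-1} * U_k^T
#
# maps any vocabulary-length vector d (a document vector above, or a query
# vector in retrieve_with_lsa() below) into the k-dimensional latent space,
# where the dot products for retrieval are computed.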

sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;
    my @clean_words;
    my $min = $self->{_min_word_length};
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i; $1 ? "\L$1" : ''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
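
# Illustrative call (the query words are again hypothetical).  Unlike
# retrieve_with_vsm(), the scores here are plain dot products in the truncated
# LSA space rather than cosine similarities:
#
#     $vsm->construct_lsa_model();
#     my $retrievals = $vsm->retrieve_with_lsa( [qw(heap allocation routine)] );
#     $vsm->display_retrievals( $retrievals );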

sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
    foreach ( sort keys %{$self->{_doc_hist_template}} ) {
        $document_vector{$_} = 0;
    }
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    unless (open IN, $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<IN>) {
        next if /^[ ]*\r?\n?$/;
        $_ =~ s/\r?\n?$//;
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
            map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        $self->{_case_sensitive} ?
            map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
            map { $document_vector{"\L$_"}++ }
                grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
    }
    close IN;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    foreach (keys %document_vector) {
        $total_words_in_doc += $document_vector{$_};
    }
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} = $document_vector{$_}
                                      *
                                      $self->{_idf_t}->{$_}
                                      /
                                      $total_words_in_doc;
        }
    }
    my $pwd = cwd;
    $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name;
    unless ( $1 eq "" ) {
        $file_path_name = "$1/$file";
    } else {
        $file_path_name = $file;
    }
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}

################################### Drop Stop Words ###################################

sub _drop_stop_words {
    my $self = shift;
    open( IN, "$self->{_working_directory}/$self->{_stop_words_file}")
        or die "unable to open stop words file: $!";
    while (<IN>) {
        next if /^#/;
        next if /^[ ]*$/;
        chomp;
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
}

################################### Support Methods ####################################

sub _doc_vec_comparator {
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    my $vec1_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$a} :
                        $self->{_corpus_doc_vectors}->{$a};
    my $vec2_hash_ref = $self->{_idf_filter_option} ?
                        $self->{_normalized_doc_vecs}->{$b} :
                        $self->{_corpus_doc_vectors}->{$b};
    my @vec1 = ();
    my @vec2 = ();
    my @qvec = ();
    foreach my $word (sort keys %{$self->{_vocab_hist}}) {
        push @vec1, $vec1_hash_ref->{$word};
        push @vec2, $vec2_hash_ref->{$word};
        push @qvec, $query_vector{$word};
    }
    my $vec1_mag = vec_magnitude(\@vec1);
    my $vec2_mag = vec_magnitude(\@vec2);
    my $qvec_mag = vec_magnitude(\@qvec);
    my $product1 = vec_scalar_product(\@vec1, \@qvec);
    $product1 /= $vec1_mag * $qvec_mag;
    my $product2 = vec_scalar_product(\@vec2, \@qvec);
    $product2 /= $vec2_mag * $qvec_mag;
    return  1 if $product1 < $product2;
    return  0 if $product1 == $product2;
    return -1 if $product1 > $product2;
}

sub _similarity_to_query {
    my $self = shift;
    my $doc_name = shift;
    my $vec_hash_ref = $self->{_idf_filter_option} ?
                       $self->{_normalized_doc_vecs}->{$doc_name} :
                       $self->{_corpus_doc_vectors}->{$doc_name};
    my @vec = ();
    my @qvec = ();
    foreach my $word (sort keys %$vec_hash_ref) {
        push @vec, $vec_hash_ref->{$word};
        push @qvec, $self->{_query_vector}->{$word};
    }
    my $vec_mag = vec_magnitude(\@vec);
    my $qvec_mag = vec_magnitude(\@qvec);
    my $product = vec_scalar_product(\@vec, \@qvec);
    $product /= $vec_mag * $qvec_mag;
    return $product;
}

###################### Relevance Judgments for Testing Purposes #######################

##  IMPORTANT:  This estimation of document relevancies to queries is NOT for
##              serious work.  A document is considered to be relevant to a
##              query if it contains several of the query words.  The minimum
##              number of query words that must exist in a document in order
##              for the latter to be considered relevant is determined by the
##              relevancy_threshold parameter in the VSM constructor.  (See
##              the relevancy and precision-recall related scripts in the
##              'examples' directory.)  The reason why the function shown
##              below is not for serious work is that ultimately it is the
##              humans who are the best judges of the relevancies of documents
##              to queries.  The humans bring to bear semantic considerations
##              on the relevancy determination problem that are beyond the
##              scope of this module.
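
# Query files parsed by estimate_doc_relevancies() must contain one query per
# line in the format enforced by the regex below: a label 'qN', a colon, and
# the query words.  A hypothetical illustration (the contents are made up):
#
#     q1: string functions for manipulating text
#     q2: hash tables and symbol lookup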

sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    open( IN, $self->{_query_file} )
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file\n" .
          "in which the relevancy judgments will be dumped."
          unless $self->{_relevancy_file};
    while (<IN>) {
        chomp;
        next if /^#/;
        next if /^[ ]*$/;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open(OUT, ">$self->{_relevancy_file}")
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    foreach (sort
             {get_integer_suffix($a) <=> get_integer_suffix($b)}
             keys %{$self->{_relevancy_estimates}}) {
        @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$_}};
        print OUT "$_ => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $_: " .
              scalar(@relevancy_list_for_query) . "\n";
    }
}

# If human-supplied relevancy judgments are available in a disk file, use this
# method to upload that information.  One of the scripts in the 'examples'
# directory carries out the precision-recall analysis by using this approach.
# IMPORTANT: The human-supplied relevancy judgments must be in the format that
#            is shown in the sample file relevancy.txt in the 'examples'
#            directory.
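
# Per the parsing regex below, the expected format is one line per query, with
# the label, '=>', and a space-separated list of relevant documents.  A
# hypothetical illustration (the file names are made up):
#
#     q1 => Doc3.txt Doc7.txt Doc12.txt
#     q2 => Doc5.txt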

sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    open( IN, $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<IN>) {
        chomp;
        next if /^#/;
        next if /^[ ]*$/;
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}

sub display_doc_relevancies {
    my $self = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    print "\nDisplaying relevancy judgments:\n\n";
    foreach my $query (sort keys %{$self->{_relevancy_estimates}}) {
        print "Query $query\n";
        foreach my $file (sort {
                              $self->{_relevancy_estimates}->{$query}->{$b}
                              <=>
                              $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            print "    $file => $self->{_relevancy_estimates}->{$query}->{$file}\n";
        }
    }
}

sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    open IN, $file;
    my @all_text = <IN>;
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            my @matches = $all_text =~ /$word/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                  scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}

######################### Calculate Precision versus Recall ##########################

sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
            or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            chomp;
            next if /^#/;
            next if /^[ ]*$/;
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} = $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_,
            split /\s+/, $self->{_queries_for_relevancy}->{$query};
        my $retrievals;
        croak "\n\nYou have not specified the retrieval type for " .
              "precision-recall calculation.  See code in 'examples' " .
              "directory:" if !defined $retrieval_type;
        if ($retrieval_type eq 'vsm') {
            $retrievals = $self->retrieve_with_vsm( \@query_words );
        } elsif ($retrieval_type eq 'lsa') {
            $retrievals = $self->retrieve_with_lsa( \@query_words );
        }
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
                 keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_ => $ranked_retrievals{$_}\n";
            }
        }
        # At this time, ranking of relevant documents based on their
        # relevancy counts serves no particular purpose since all we want
        # for the calculation of Precision and Recall is the total
        # number of relevant documents.  However, I believe such a
        # ranking will play an important role in the future.
        # IMPORTANT:  The relevancy judgments are ranked only when
        #             estimated by the method estimate_doc_relevancies()
        #             of the VSM class.  When relevancies are supplied
        #             directly through a disk file, they all carry the
        #             same rank.
947
|
0
|
|
|
|
|
0
|
my %ranked_relevancies; |
948
|
0
|
|
|
|
|
0
|
$i = 1; |
949
|
0
|
|
|
|
|
0
|
foreach my $file (sort { |
|
0
|
|
|
|
|
0
|
|
950
|
0
|
|
|
|
|
0
|
$self->{_relevancy_estimates}->{$query}->{$b} |
951
|
|
|
|
|
|
|
<=> |
952
|
|
|
|
|
|
|
$self->{_relevancy_estimates}->{$query}->{$a} |
953
|
|
|
|
|
|
|
} |
954
|
|
|
|
|
|
|
keys %{$self->{_relevancy_estimates}->{$query}}) { |
955
|
0
|
|
|
|
|
0
|
$ranked_relevancies{$i++} = $file; |
956
|
|
|
|
|
|
|
} |
957
|
0
|
0
|
|
|
|
0
|
if ($self->{_debug}) { |
958
|
0
|
|
|
|
|
0
|
print "\n\nDisplaying ranked relevancies for query $query:\n\n"; |
959
|
0
|
|
|
|
|
0
|
foreach (sort {$a <=> $b} keys %ranked_relevancies) { |
|
0
|
|
|
|
|
0
|
|
960
|
0
|
|
|
|
|
0
|
print "$_ => $ranked_relevancies{$_}\n"; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
} |
963
|
0
|
|
|
|
|
0
|
my @relevant_set = values %ranked_relevancies; |
964
|
0
|
0
|
|
|
|
0
|
warn "\n\nNo relevant docs found for query $query.\n" . |
965
|
|
|
|
|
|
|
"Will skip over this query for precision and\n" . |
966
|
|
|
|
|
|
|
"recall calculations\n\n" unless @relevant_set; |
967
|
0
|
0
|
|
|
|
0
|
next unless @relevant_set; |
968
|
0
|
0
|
|
|
|
0
|
print "\n\nRelevant set for query $query: @relevant_set\n\n" |
969
|
|
|
|
|
|
|
if $self->{_debug}; |
970
|
0
|
|
|
|
|
0
|
my @retrieved; |
971
|
0
|
|
|
|
|
0
|
foreach (sort keys %ranked_retrievals) { |
972
|
0
|
|
|
|
|
0
|
push @retrieved, $ranked_retrievals{$_}; |
973
|
|
|
|
|
|
|
} |
974
|
0
|
0
|
|
|
|
0
|
print "\n\nRetrieved set for query $query: @retrieved\n\n" |
975
|
|
|
|
|
|
|
if $self->{_debug}; |
976
|
0
|
|
|
|
|
0
|
my @Precision_values = (); |
977
|
0
|
|
|
|
|
0
|
my @Recall_values = (); |
978
|
0
|
|
|
|
|
0
|
my $rank = 1; |
979
|
0
|
|
|
|
|
0
|
while ($rank < @retrieved + 1) { |
980
|
0
|
|
|
|
|
0
|
my $index = 1; |
981
|
0
|
|
|
|
|
0
|
my @retrieved_at_rank = (); |
982
|
0
|
|
|
|
|
0
|
while ($index <= $rank) { |
983
|
0
|
|
|
|
|
0
|
push @retrieved_at_rank, $ranked_retrievals{$index}; |
984
|
0
|
|
|
|
|
0
|
$index++; |
985
|
|
|
|
|
|
|
} |
986
|
0
|
|
|
|
|
0
|
my $intersection =set_intersection(\@retrieved_at_rank, |
987
|
|
|
|
|
|
|
\@relevant_set); |
988
|
0
|
0
|
|
|
|
0
|
my $precision_at_rank = @retrieved_at_rank ? |
989
|
|
|
|
|
|
|
(@$intersection / @retrieved_at_rank) : 0; |
990
|
0
|
|
|
|
|
0
|
push @Precision_values, $precision_at_rank; |
991
|
0
|
|
|
|
|
0
|
my $recall_at_rank = @$intersection / @relevant_set; |
992
|
0
|
|
|
|
|
0
|
push @Recall_values, $recall_at_rank; |
993
|
0
|
|
|
|
|
0
|
$rank++; |
994
|
|
|
|
|
|
|
} |
995
|
0
|
0
|
|
|
|
0
|
print "\n\nFor query $query, precision values: @Precision_values\n" |
996
|
|
|
|
|
|
|
if $self->{_debug}; |
997
|
0
|
0
|
|
|
|
0
|
print "\nFor query $query, recall values: @Recall_values\n" |
998
|
|
|
|
|
|
|
if $self->{_debug}; |
999
|
0
|
|
|
|
|
0
|
$self->{_precision_for_queries}->{$query} = \@Precision_values; |
1000
|
0
|
|
|
|
|
0
|
$self->{_recall_for_queries}->{$query} = \@Recall_values; |
1001
|
0
|
|
|
|
|
0
|
my $area = 0; |
1002
|
|
|
|
|
|
|
# Use trapezoidal rule to find the area under the precision-recall |
1003
|
|
|
|
|
|
|
# curve: |
1004
|
0
|
|
|
|
|
0
|
for my $j (1..@Precision_values-1) { |
1005
|
0
|
|
|
|
|
0
|
my $height = ($Precision_values[$j]+$Precision_values[$j-1])/2.0; |
1006
|
0
|
|
|
|
|
0
|
my $base = ($Recall_values[$j] - $Recall_values[$j-1]); |
1007
|
0
|
|
|
|
|
0
|
$area += $base * $height; |
1008
|
|
|
|
|
|
|
} |
1009
|
0
|
|
|
|
|
0
|
my $map_for_query = $area; |
1010
|
0
|
0
|
|
|
|
0
|
print "\nMAP for query $query: $map_for_query\n" if $self->{_debug}; |
1011
|
0
|
|
|
|
|
0
|
$self->{_map_vals_for_queries}->{$query} = $map_for_query; |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
} |
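
#  The following lines are a minimal standalone sketch (not part of the
#  module's API) of the precision-at-rank and recall-at-rank logic implemented
#  above, shown for a made-up ranked list and a made-up relevant set:
#
#      my @retrieved = qw/ doc3 doc7 doc1 doc9 /;    # ranked retrievals
#      my @relevant  = qw/ doc1 doc3 /;              # relevant documents
#      my %is_relevant = map {$_ => 1} @relevant;
#      my ($hits, @Precision, @Recall) = (0);
#      foreach my $rank (1 .. @retrieved) {
#          $hits++ if $is_relevant{ $retrieved[$rank - 1] };
#          push @Precision, $hits / $rank;           # precision at this rank
#          push @Recall,    $hits / @relevant;       # recall at this rank
#      }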

sub display_map_values_for_queries {
    my $self = shift;
    die "You must first invoke the precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my $map = 0;
    print "\n\nDisplaying average precision for different queries:\n\n";
    foreach my $query (sort
                       {get_integer_suffix($a) <=> get_integer_suffix($b)}
                       keys %{$self->{_map_vals_for_queries}}) {
        my $output = sprintf "Query %s  =>  %.3f",
                             $query, $self->{_map_vals_for_queries}->{$query};
        print "$output\n";
        $map += $self->{_map_vals_for_queries}->{$query};
    }
    print "\n\n";
    my $avg_map_for_all_queries =
        $map / scalar(keys %{$self->{_map_vals_for_queries}});
    print "MAP value: $avg_map_for_all_queries\n\n";
}

sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke the precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    foreach my $query (sort
                       {get_integer_suffix($a) <=> get_integer_suffix($b)}
                       keys %{$self->{_map_vals_for_queries}}) {
        print "\n\nQuery $query:\n";
        print "\n   (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precision_vals = @{$self->{_precision_for_queries}->{$query}};
        @precision_vals = map {sprintf "%.3f", $_} @precision_vals;
        print "   Precision at rank  =>  @precision_vals\n";
        my @recall_vals = @{$self->{_recall_for_queries}->{$query}};
        @recall_vals = map {sprintf "%.3f", $_} @recall_vals;
        print "\n   Recall at rank     =>  @recall_vals\n";
    }
    print "\n\n";
}

sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke the precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my @average_precisions_for_queries = ();
    foreach my $query (sort
                       {get_integer_suffix($a) <=> get_integer_suffix($b)}
                       keys %{$self->{_map_vals_for_queries}}) {
        my $output = sprintf "%.3f", $self->{_map_vals_for_queries}->{$query};
        push @average_precisions_for_queries, $output;
    }
    return \@average_precisions_for_queries;
}

################################### Utility Routines ###################################

sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

# checks whether an element is in an array:
sub contained_in {
    my $ele = shift;
    my @array = @_;
    my $count = 0;
    map {$count++ if $ele eq $_} @array;
    return $count;
}

# Meant only for an un-nested hash:
sub deep_copy_hash {
    my $ref_in = shift;
    my $ref_out = {};
    foreach ( keys %{$ref_in} ) {
        $ref_out->{$_} = $ref_in->{$_};
    }
    return $ref_out;
}

sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}

sub vec_magnitude {
    my $vec = shift;
    my $mag_squared = 0;
    foreach my $num (@$vec) {
        $mag_squared += $num ** 2;
    }
    return sqrt $mag_squared;
}
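
#  The two subroutines above are the ingredients of cosine similarity: the dot
#  product of two equal-sized vectors divided by the product of their
#  magnitudes.  A minimal sketch with two made-up term-frequency vectors:
#
#      my @vec1 = (1, 0, 2, 3);
#      my @vec2 = (0, 1, 2, 1);
#      my $cosine_sim = vec_scalar_product(\@vec1, \@vec2) /
#                       (vec_magnitude(\@vec1) * vec_magnitude(\@vec2));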

sub reduce {
    my $vec = shift;
    my $result;
    for my $item (@$vec) {
        $result += $item;
    }
    return $result;
}

sub simple_stemmer {
    my $word = shift;
    my $debug = shift;
    print "\nStemming the word: $word\n" if $debug;
    $word =~ s/(.*[a-z][^aeious])s$/$1/i;
    $word =~ s/(.*[a-z]s)es$/$1/i;
    $word =~ s/(.*[a-z][ck])es$/$1e/i;
    $word =~ s/(.*[a-z]+)tions$/$1tion/i;
    $word =~ s/(.*[a-z]+)mming$/$1m/i;
    $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
    $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
    $word =~ s/(.*[a-z]+)tices$/$1tex/i;
    $word =~ s/(.*[a-z]+)pes$/$1pe/i;
    $word =~ s/(.*[a-z]+)sed$/$1se/i;
    $word =~ s/(.*[a-z]+)ed$/$1/i;
    $word =~ s/(.*[a-z]+)tation$/$1t/i;
    print "Stemmed word: $word\n\n" if $debug;
    return $word;
}
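
#  Tracing a couple of words through the rewrite rules above (these are
#  illustrative inputs, not output from an actual run):
#
#      simple_stemmer("programs");       # the first rule yields "program"
#      simple_stemmer("programming");    # the "mming" rule yields "program"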

# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    my $i = 0;
    while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) { $i++; }
    return $i - 1;
}

sub set_intersection {
    my $set1 = shift;
    my $set2 = shift;
    my %hset1 = map {$_ => 1} @$set1;
    my @common_elements = grep {$hset1{$_}} @$set2;
    return @common_elements ? \@common_elements : [];
}

sub get_integer_suffix {
    my $label = shift;
    $label =~ /(\d*)$/;
    return $1;
}

1;

=pod

=head1 NAME

Algorithm::VSM --- A Perl module for retrieving files and documents from a software
library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
algorithms in response to search words and phrases.

=head1 SYNOPSIS

    # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:

    use Algorithm::VSM;

    my $corpus_dir = "corpus";
    my @query = qw/ program ListIterator add ArrayList args /;
    my $stop_words_file = "stop_words.txt";
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   max_number_retrievals    => 10,
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   use_idf_filter           => 1,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->display_corpus_vocab();
    $vsm->display_corpus_vocab_size();
    $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
    $vsm->display_inverse_document_frequencies();
    $vsm->generate_document_vectors();
    $vsm->display_doc_vectors();
    $vsm->display_normalized_doc_vectors();
    my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
    $vsm->display_retrievals( $retrievals );

The purpose of each constructor option and what is accomplished by the method
calls should be obvious from their names.  If not, they are explained in greater
detail elsewhere in this documentation page.  Note that the methods
display_corpus_vocab() and display_doc_vectors() are there only for testing
purposes with small corpora.  If you must use them for large libraries/corpora,
you might wish to redirect the output to a file.

By default, a call to a constructor calculates normalized term-frequency vectors
for the documents.  Normalization consists of first calculating the term
frequency tf(t) of a term t in a document as a proportion of the total number
of words in the document and then multiplying it by idf(t), where idf(t) stands
for the inverse document frequency associated with that term.  Note that 'word'
and 'term' mean the same thing.
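
As an illustration of this normalization, here is a small sketch with made-up
numbers (this is not code from the module; the module computes the same
quantities internally):

    # a term that appears 5 times in a 200-word document, in a corpus of
    # 100 documents of which 10 contain the term:
    my $tf     = 5 / 200;          # term frequency as a proportion
    my $idf    = log(100 / 10);    # inverse document frequency
    my $weight = $tf * $idf;       # one element of the normalized doc vector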

    # FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:

    my $lsa = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   lsa_svd_threshold        => 0.01,
                   max_number_retrievals    => 10,
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   use_idf_filter           => 1,
                   want_stemming            => 1,
    );
    $lsa->get_corpus_vocabulary_and_word_counts();
    $lsa->display_corpus_vocab();
    $lsa->display_corpus_vocab_size();
    $lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
    $lsa->generate_document_vectors();
    $lsa->construct_lsa_model();
    my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
    $lsa->display_retrievals( $retrievals );

The initialization code before the constructor call and the calls for displaying
the vocabulary and the vectors after the call remain the same as for the VSM case
shown previously in this Synopsis.  In the call above, the constructor parameter
'lsa_svd_threshold' determines how many of the singular values will be retained
after we have carried out an SVD decomposition of the term-frequency matrix for
the documents in the corpus.  Singular values smaller than this threshold
fraction of the largest value are rejected.
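
As a sketch of what this thresholding amounts to (the singular values below are
made up):

    my @singular_values = (9.2, 4.1, 0.8, 0.05);   # descending order from SVD
    my $lsa_svd_threshold = 0.01;
    my $cutoff = $lsa_svd_threshold * $singular_values[0];    # 0.092
    my @retained = grep { $_ > $cutoff } @singular_values;    # drops only 0.05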

    # FOR MEASURING PRECISION VERSUS RECALL FOR VSM:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $query_file = "test_queries.txt";
    my $relevancy_file = "relevancy.txt";    # All relevancy judgments
                                             # will be stored in this file
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   relevancy_threshold      => 5,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    $vsm->estimate_doc_relevancies();
    $vsm->display_doc_relevancies();               # used only for testing
    $vsm->precision_and_recall_calculator('vsm');
    $vsm->display_precision_vs_recall_for_queries();
    $vsm->display_map_values_for_queries();

Measuring precision and recall requires a set of queries.  These are supplied
through the constructor parameter 'query_file'.  The format of this file
must be according to the sample file 'test_queries.txt' in the 'examples'
directory.  The module estimates the relevancies of the documents to the
queries and dumps the relevancies in a file named by the 'relevancy_file'
constructor parameter.  The constructor parameter 'relevancy_threshold' is used
to decide which of the documents are considered to be relevant to a query.  A
document must contain at least the 'relevancy_threshold' occurrences of query
words in order to be considered relevant to a query.
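
Going by the query parser in this module --- which skips blank lines and lines
that begin with C<#>, and expects each query on a line of the form C<qN: ...> ---
a query file along the following lines would be acceptable (the entries here are
made up; see 'test_queries.txt' for the actual sample):

    # comment lines and blank lines are ignored
    q1: program ListIterator add ArrayList args
    q2: string token parser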

    # FOR MEASURING PRECISION VERSUS RECALL FOR LSA:

    my $lsa = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   lsa_svd_threshold        => 0.01,
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   relevancy_threshold      => 5,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $lsa->get_corpus_vocabulary_and_word_counts();
    $lsa->generate_document_vectors();
    $lsa->construct_lsa_model();
    $lsa->estimate_doc_relevancies();
    $lsa->display_doc_relevancies();
    $lsa->precision_and_recall_calculator('lsa');
    $lsa->display_precision_vs_recall_for_queries();
    $lsa->display_map_values_for_queries();

We have already explained the purpose of the constructor parameter 'query_file'
and about the constraints on the format of queries in the file named through
this parameter.  As mentioned earlier, the module estimates the relevancies of
the documents to the queries and dumps the relevancies in a file named by the
'relevancy_file' constructor parameter.  The constructor parameter
'relevancy_threshold' is used in deciding which of the documents are considered
to be relevant to a query.  A document must contain at least the
'relevancy_threshold' occurrences of query words in order to be considered
relevant to a query.  We have previously explained the role of the constructor
parameter 'lsa_svd_threshold'.

    # FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $query_file = "test_queries.txt";
    my $relevancy_file = "relevancy.txt";
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    $vsm->upload_document_relevancies_from_file();
    $vsm->display_doc_relevancies();
    $vsm->precision_and_recall_calculator('vsm');
    $vsm->display_precision_vs_recall_for_queries();
    $vsm->display_map_values_for_queries();

Now the filename supplied through the constructor parameter 'relevancy_file' must
contain relevance judgments for the queries that are named in the file supplied
through the parameter 'query_file'.  The format of these two files must be
according to what is shown in the sample files 'test_queries.txt' and
'relevancy.txt' in the 'examples' directory.

    # FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $query_file = "test_queries.txt";
    my $relevancy_file = "relevancy.txt";
    my $lsa = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   lsa_svd_threshold        => 0.01,
                   min_word_length          => 4,
                   query_file               => $query_file,
                   relevancy_file           => $relevancy_file,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $lsa->get_corpus_vocabulary_and_word_counts();
    $lsa->generate_document_vectors();
    $lsa->construct_lsa_model();
    $lsa->upload_document_relevancies_from_file();
    $lsa->display_doc_relevancies();
    $lsa->precision_and_recall_calculator('lsa');
    $lsa->display_precision_vs_recall_for_queries();
    $lsa->display_map_values_for_queries();

As mentioned for the previous code block, the filename supplied through the
constructor parameter 'relevancy_file' must contain relevance judgments for the
queries that are named in the file supplied through the parameter 'query_file'.
The format of this file must be according to what is shown in the sample file
'relevancy.txt' in the 'examples' directory.  We have already explained the roles
played by the constructor parameters such as 'lsa_svd_threshold'.

    # FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:

    my $corpus_dir = "corpus";
    my $stop_words_file = "stop_words.txt";
    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => $corpus_dir,
                   file_types               => ['.txt', '.java'],
                   min_word_length          => 4,
                   stop_words_file          => $stop_words_file,
                   want_stemming            => 1,
    );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    #  code for calculating pairwise similarities as shown in the
    #  script calculate_similarity_matrix_for_all_docs.pl in the
    #  examples directory.  This script makes calls to
    #
    #     $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
    #
    #  for every pair of documents.
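
Although that script is not reproduced here, a doubly-nested loop along the
following lines is one way to make those calls (a sketch, not the actual
script):

    my @docs = @{$vsm->get_all_document_names()};
    foreach my $i (0..$#docs) {
        foreach my $j ($i+1..$#docs) {
            my $similarity =
                $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
            print "$docs[$i] vs. $docs[$j]  =>  $similarity\n";
        }
    }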

=head1 CHANGES

Version 1.61 improves the implementation of the directory scanner to make it more
platform independent.  Additionally, you are now required to specify in the
constructor call the file types to be considered for computing the database model.
If, say, you have a large software library and you want only Java and text files to
be scanned for creating the VSM (or the LSA) model, you must supply that information
to the module by setting the constructor parameter C<file_types> to the anonymous
list C<['.java', '.txt']>.  An additional constructor parameter introduced in this
version is C<case_sensitive>.  If you set it to 1, that will force the database model
and query matching to become case sensitive.

Version 1.60 reflects the fact that people are now more likely to use this module by
keeping the model constructed for a corpus in the fast memory (as opposed to storing
the models in disk-based hash tables) for its repeated invocation for different
queries.  As a result, the default value for the constructor option
C<save_model_on_disk> was changed from 1 to 0.  For those who still wish to store on
a disk the model that is constructed, a script in the C<examples> directory shows how
you can do that.  Other changes in 1.60 include a slight reorganization of the
scripts in the C<examples> directory.  Most scripts now do not by default store their
models in disk-based hash tables.  This reorganization is reflected in the
description of the C<examples> directory in this documentation.  The basic logic of
constructing VSM and LSA models and how these are used for retrievals remains
unchanged.

Version 1.50 incorporates a couple of new features: (1) You now have the option to
split camel-cased and underscored words for constructing your vocabulary set; and (2)
Storing the VSM and LSA models in database files on the disk is now optional.  The
second feature, in particular, should prove useful to those who are using this module
for large collections of documents.

Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
You can get a better sense of the overall vocabulary being used by the module for
file retrieval by examining the contents of a dump file whose name is supplied as an
argument to C<write_corpus_vocab_to_file()>.

Version 1.41 downshifts the required version of the PDL module.  Also cleaned up are
the dependencies between this module and the submodules of PDL.

Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
documents in the corpus.  The elements of such a matrix express pairwise similarities
between the documents.  The pairwise similarities are based on the dot product of two
document vectors divided by the product of the vector magnitudes.  The 'examples'
directory contains two scripts to illustrate how such matrices can be calculated by
the user.  The similarity matrix is output as a CSV file.

Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
document file.  What that means is that the words that appear in most of the documents
get reduced weighting since such words are non-discriminatory with respect to the
retrieval of the documents.  A typical formula that is used to calculate the IDF
weight for a word is the logarithm of the ratio of the total number of documents to
the number of documents in which the word appears.  So if a word were to appear in
all the documents, its IDF multiplier would be zero in the vector representation of a
document.  If so desired, you can turn off the IDF weighting of the words by
explicitly setting the constructor parameter C<use_idf_filter> to zero.

Version 1.2 includes a code correction and some general code and documentation
cleanup.

With Version 1.1, you can access the retrieval precision results so that you can
compare two different retrieval algorithms (VSM or LSA with different choices for
some of the constructor parameters) with significance testing.  (Version 1.0 merely
sent those results to standard output, typically your terminal window.)  In Version
1.1, a new script in the 'examples' directory illustrates significance testing with
Randomization and with Student's Paired t-Test.

=head1 DESCRIPTION

B<Algorithm::VSM> is a Perl module for constructing a Vector Space Model (VSM) or
a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
to as a corpus, and then retrieving the documents in response to search words in a
query.

VSM and LSA models have been around for a long time in the Information Retrieval (IR)
community.  More recently such models have been shown to be effective in retrieving
files/documents from software libraries.  An account of this research was presented
by Shivani Rao and the author of this module at the 2011 Mining Software
Repositories conference.

VSM modeling consists of: (1) Extracting the vocabulary used in a corpus.  (2)
Stemming the words so extracted and eliminating the designated stop words from the
vocabulary.  Stemming means that closely related words like 'programming' and
'programs' are reduced to the common root word 'program' and the stop words are the
non-discriminating words that can be expected to exist in virtually all the
documents.  (3) Constructing document vectors for the individual files in the corpus
--- the document vectors taken together constitute what is usually referred to as a
'term-frequency' matrix for the corpus.  (4) Normalizing the document vectors to
factor out the effect of document size and, if desired, multiplying the term
frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
the weight of the words that appear in a large number of documents.  (5) Constructing
a query vector for the search query after the query is subject to the same stemming
and stop-word elimination rules that were applied to the corpus.  And, lastly, (6)
Using a similarity metric to return the set of documents that are most similar to the
query vector.  The commonly used similarity metric is one based on the cosine
distance between two vectors.  Also note that all the vectors mentioned here are of
the same size, the size of the vocabulary.  An element of a vector is the frequency
of occurrence of the word corresponding to that position in the vector.

LSA modeling is a small variation on VSM modeling.  Now you take VSM modeling one
step further by subjecting the term-frequency matrix for the corpus to singular value
decomposition (SVD).  By retaining only a subset of the singular values (usually the
N largest for some value of N), you can construct reduced-dimensionality vectors for
the documents and the queries.  In VSM, as mentioned above, the size of the document
and the query vectors is equal to the size of the vocabulary.  For large corpora,
this size may involve tens of thousands of words --- this can slow down the VSM
modeling and retrieval process.  So you are very likely to get faster performance
with retrieval based on LSA modeling, especially if you store the model once
constructed in a database file on the disk and carry out retrievals using the
disk-based model.

=head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?

This module has only been tested for software retrieval.  For more general text
retrieval, you would need to replace the simple stemmer used in the module by one
based on, say, Porter's Stemming Algorithm.  You would also need to vastly expand the
list of stop words appropriate to the text corpora of interest to you.  As previously
mentioned, the stop words are the commonly occurring words that do not carry much
discriminatory power from the standpoint of distinguishing between the documents.
See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
formatted.

=head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?

It is not uncommon for large software libraries to consist of tens of thousands of
documents that include source-code files, documentation files, README files,
configuration files, etc.  The bug-localization work presented recently by Shivani
Rao and this author at the 2011 Mining Software Repositories conference (MSR11) was
based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
size of 7553 unique words.  (Note that the iBUGS dataset was originally put together
by V. Dallmeier and T. Zimmermann for the evaluation of automated bug detection and
localization tools.)  If C<V> is the size of the vocabulary and C<M> the number of
the documents in the corpus, the size of each vector will be C<V> and the size of
the term-frequency matrix for the entire corpus will be C<V>xC<M>.  So if you were
to duplicate the bug localization experiments mentioned above, you would be dealing
with vectors of size 7553 and a term-frequency matrix of size 7553x6546.
Extrapolating these numbers to really large libraries/corpora, we are obviously
talking about very large matrices for SVD decomposition.  For large
libraries/corpora, it would be best to store away the model in a disk file and to
base all subsequent retrievals on the disk-stored models.  The 'examples' directory
contains scripts that carry out retrievals on the basis of disk-based models.
Further speedup in retrieval can be achieved by using LSA to create
reduced-dimensionality representations for the documents and by basing retrievals
on the stored versions of such reduced-dimensionality representations.

=head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS

The performance of a retrieval algorithm is typically measured by two properties:
C<Precision at rank> and C<Recall at rank>.  At a given rank C<r>, C<Precision> is
the ratio of the number of retrieved documents that are relevant to the total
number of retrieved documents up to that rank.  And, along the same lines,
C<Recall> at a given rank C<r> is the ratio of the number of retrieved documents
that are relevant to the total number of relevant documents.  The area under the
C<Precision>--C<Recall> curve is called the C<Average Precision> for a query.  When
the C<Average Precision> is averaged over all the queries, we obtain what is known
as C<Mean Average Precision> (MAP).  For an oracle, the value of MAP should be 1.0.
On the other hand, for purely random retrieval from a corpus, the value of MAP will
be inversely proportional to the size of the corpus.  This module includes methods
that allow you to carry out these retrieval accuracy measurements using the
relevancy judgments supplied through a disk file.  If human-supplied relevancy
judgments are not available, the module will be happy to estimate relevancies for
you just by determining the number of query words that exist in a document.  Note,
however, that relevancy judgments estimated in this manner cannot be trusted.  That
is because ultimately it is the humans who are the best judges of the relevancies
of documents to queries.  The humans bring to bear semantic considerations on the
relevancy determination problem that are beyond the scope of this module.

=head1 METHODS

The module provides the following methods for constructing VSM and LSA models of a
corpus, for using the models thus constructed for retrieval, and for carrying out
precision versus recall calculations for the determination of retrieval accuracy on
the corpora of interest to you.

=over

=item B<new()>

A call to C<new()> constructs a new instance of the C<Algorithm::VSM> class:

    my $vsm = Algorithm::VSM->new(
                   break_camelcased_and_underscored  => 1,
                   case_sensitive           => 0,
                   corpus_directory         => "",
                   corpus_vocab_db          => "corpus_vocab_db",
                   doc_vectors_db           => "doc_vectors_db",
                   file_types               => $my_file_types,
                   lsa_svd_threshold        => 0.01,
                   max_number_retrievals    => 10,
                   min_word_length          => 4,
                   normalized_doc_vecs_db   => "normalized_doc_vecs_db",
                   query_file               => "",
                   relevancy_file           => $relevancy_file,
                   relevancy_threshold      => 5,
                   save_model_on_disk       => 0,
                   stop_words_file          => "",
                   use_idf_filter           => 1,
                   want_stemming            => 1,
    );

The values shown on the right side of the big arrows are the B<default values for
the parameters>.  The value supplied through the variable C<$my_file_types> would be
something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
be included in creating the database model.  The following nested list will now
describe each of the constructor parameters shown above:

=over 16

=item I<break_camelcased_and_underscored:>

The parameter B<break_camelcased_and_underscored> when set causes the
underscored and camel-cased words to be split.  By default the parameter is
set.  So if you don't want such words to be split, you must set it
explicitly to 0.

=item I<corpus_directory:>

The parameter B<corpus_directory> points to the root of the directory of documents
for which you want to create a VSM or LSA model.

=item I<corpus_vocab_db:>

The parameter B<corpus_vocab_db> is for naming the DBM in which the corpus vocabulary
will be stored after it is subject to stemming and the elimination of stop words.
Once a disk-based VSM model is created and stored away in the file named by this
parameter and the parameter to be described next, it can subsequently be used
directly for speedier retrieval.

=item I<case_sensitive:>

When set to 1, this parameter forces the module to maintain the case of the terms in
the corpus files when creating the vocabulary and the document vectors.  Setting
C<case_sensitive> to 1 also causes the query matching to become case sensitive.
(This constructor parameter was introduced in Version 1.61.)

=item I<doc_vectors_db:>

The database named by B<doc_vectors_db> stores the document vector representation for
each document in the corpus.  Each document vector has the same size as the
corpus-wide vocabulary; each element of such a vector is the number of occurrences of
the word that corresponds to that position in the vocabulary vector.

=item I<file_types:>

This parameter tells the module what types of files in the corpus directory you want
scanned for creating the database model.  The value supplied for this parameter is an
anonymous list of the file suffixes for the file types.  For example, if you wanted
only Java and text files to be scanned, you will set this parameter to C<['.java',
'.txt']>.  The module throws an exception if this parameter is left unspecified.
(This constructor parameter was introduced in Version 1.61.)

=item I<lsa_svd_threshold:>

The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
smaller than this threshold fraction of the largest singular value.  This plays a
critical role in creating reduced-dimensionality document vectors in LSA modeling of
a corpus.

=item I<max_number_retrievals:>

The constructor parameter B<max_number_retrievals> stands for what it means.

=item I<min_word_length:>

The parameter B<min_word_length> sets the minimum number of characters in a
word in order for it to be included in the corpus vocabulary.

=item I<normalized_doc_vecs_db:>

The database named by B<normalized_doc_vecs_db> stores the normalized document
vectors.  Normalization consists of factoring out the size of the documents by
dividing the term frequency for each word in a document by the number of words in the
document, and then multiplying the result by the idf (Inverse Document Frequency)
value for the word.

=item I<query_file:>

The parameter B<query_file> points to a file that contains the queries to be used for
calculating retrieval performance with C<Precision> and C<Recall> numbers.  The
format of the query file must be as shown in the sample file C<test_queries.txt> in
the 'examples' directory.

=item I<relevancy_file:>

This option names the disk file for storing the relevancy judgments.

=item I<relevancy_threshold:>

The constructor parameter B<relevancy_threshold> is used for automatic determination
of document relevancies to queries on the basis of the number of occurrences of query
words in a document.  You can exercise control over the process of determining
relevancy of a document to a query by giving a suitable value to this constructor
parameter.  A document is considered relevant to a query only
when the document contains at least B<relevancy_threshold> number of query words.

=item I<save_model_on_disk:>

The constructor parameter B<save_model_on_disk> will cause the basic
information about the VSM and the LSA models to be stored on the disk.
Subsequently, any retrievals can be carried out from the disk-based model.

=item I<stop_words_file:>

The parameter B<stop_words_file> is for naming the file that contains the stop words
that you do not wish to include in the corpus vocabulary.  The format of this file
must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.

=item I<use_idf_filter:>

The constructor parameter B<use_idf_filter> is set by default.  If you want
to turn off the normalization of the document vectors, including turning
off the weighting of the term frequencies of the words by their idf values,
you must set this parameter explicitly to 0.

=item I<want_stemming:>

The boolean parameter B<want_stemming> determines whether or not the words extracted
from the documents would be subject to stemming.  As mentioned elsewhere, stemming
means that related words like 'programming' and 'programs' would both be reduced to
the root word 'program'.

=back

=item B<construct_lsa_model()>

You call this subroutine for constructing an LSA model for your corpus
after you have extracted the corpus vocabulary and constructed document
vectors:

    $vsm->construct_lsa_model();

The SVD decomposition that is carried out in LSA model construction uses the
constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
to retain for the LSA model.  A singular value is retained only if it is larger than
the C<lsa_svd_threshold> fraction of the largest singular value.

=item B<display_corpus_vocab()>

If you would like to see corpus vocabulary as constructed by the previous call, make
the call

    $vsm->display_corpus_vocab();

Note that this is a useful thing to do only on small test corpora.  If you need
to examine the vocabulary for a large corpus, call the two methods listed below.

=item B<display_corpus_vocab_size()>

If you would like for the module to print out in your terminal window the size of the
vocabulary, make the call

    $vsm->display_corpus_vocab_size();

=item B<display_doc_relevancies()>

If you would like to see the document relevancies generated by the previous method,
you can call

    $vsm->display_doc_relevancies()

=item B<display_doc_vectors()>

If you would like to see the document vectors constructed by the previous call, make
the call:

    $vsm->display_doc_vectors();

Note that this is a useful thing to do only on small test corpora.  If you must call
this method on a large corpus, you might wish to direct the output to a file.

=item B<display_inverse_document_frequencies()>

You can display the idf value associated with each word in the corpus by

    $vsm->display_inverse_document_frequencies();

The idf of a word in the corpus is calculated typically as the logarithm of the ratio
of the total number of documents in the corpus to the number of documents in which
the word appears (with protection built in to prevent division by zero).  Ideally, if
a word appears in all the documents, its idf would be small, close to zero.  Words
with small idf values are non-discriminatory and should get reduced weighting in
document retrieval.
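
One common way to write this calculation --- with the division-by-zero protection
supplied by an additive 1 in the denominator --- looks like the following sketch
(the module's exact formula may differ in its details):

    my $num_docs           = 100;     # documents in the corpus
    my $docs_with_the_word = 10;      # documents containing the word
    my $idf = log( $num_docs / (1 + $docs_with_the_word) );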
1859
|
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
|
|
1861
|
|
|
|
|
|
|
=item B |
1862
|
|
|
|
|
|
|
|
1863
|
|
|
|
|
|
|
The area under the precision vs. recall curve for a given query is called C
|
1864
|
|
|
|
|
|
|
Precision> for that query. When this area is averaged over all the queries, you get |
1865
|
|
|
|
|
|
|
C |
1866
|
|
|
|
|
|
|
algorithm. The C values for the queries and the overall C |
1867
|
|
|
|
|
|
|
can be printed out by calling |
1868
|
|
|
|
|
|
|
|
1869
|
|
|
|
|
|
|
$vsm->display_map_values_for_queries(); |
1870
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
|
1872
|
|
|
|
|
|
|
=item B<display_normalized_doc_vectors()>

If you would like to see the normalized document vectors, make the call:

    $vsm->display_normalized_doc_vectors();

See the comment made previously as to what is meant by the normalization of a
document vector.

=item B<display_precision_vs_recall_for_queries()>

A call to C<precision_and_recall_calculator()> will normally be followed by the
following call

    $vsm->display_precision_vs_recall_for_queries();

for displaying the C<Precision@rank> and C<Recall@rank> values.

=item B<display_retrievals()>

You can display the retrieved document names by calling this method using the syntax:

    $vsm->display_retrievals( $retrievals );

where C<$retrievals> is a reference to the hash returned by a call to one of the
C<retrieve_with_vsm()> or C<retrieve_with_lsa()> methods. The display method shown
here respects the retrieval size constraints expressed by the constructor parameter
C<max_number_retrievals>.

=item B<estimate_doc_relevancies()>

Before you can carry out precision and recall calculations to test the accuracy of
VSM and LSA based retrievals from a corpus, you need to have available the relevancy
judgments for the queries. (A relevancy judgment for a query is simply the list of
documents relevant to that query.) Relevancy judgments are commonly supplied by
humans who are familiar with the corpus. But if such human-supplied relevancy
judgments are not available, you can invoke the following method to estimate them:

    $vsm->estimate_doc_relevancies();

For the above method call, a document is considered to be relevant to a query if it
contains several of the query words. The minimum number of query words that must
exist in a document in order for the latter to be considered relevant is determined
by the C<relevancy_threshold> parameter in the VSM constructor.
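In sketch form, the criterion just described amounts to a word-overlap count. The
helper below is hypothetical and only illustrates the idea:

    # Hypothetical helper: a document counts as relevant to a query when at
    # least $relevancy_threshold of the query words occur in the document.
    sub is_relevant {
        my ($doc_words, $query_words, $relevancy_threshold) = @_;
        my %in_doc  = map { $_ => 1 } @$doc_words;
        my $matches = grep { $in_doc{$_} } @$query_words;
        return $matches >= $relevancy_threshold;
    }
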
But note that this estimation of document relevancies to queries is NOT for serious
work. The reason is that, ultimately, it is humans who are the best judges of the
relevancies of documents to queries. Humans bring to bear semantic considerations on
the relevancy determination problem that are beyond the scope of this module.

The generated relevancies are deposited in a file named by the constructor parameter
C<relevancy_file>.

=item B<get_all_document_names()>

If you want to get hold of all the filenames in the corpus in your own script, you
can call

    my @docs = @{$vsm->get_all_document_names()};

The array on the left will contain an alphabetized list of the files.

=item B<generate_document_vectors()>

This is a necessary step after the vocabulary used by a corpus is constructed. (Of
course, if you will be doing document retrieval through a disk-stored VSM or LSA
model, then you do not need to call this method.) You construct document vectors
through the following call:

    $vsm->generate_document_vectors();

=item B<get_corpus_vocabulary_and_word_counts()>

After you have constructed a new instance of the C<Algorithm::VSM> class, you must
now scan the corpus documents for constructing the corpus vocabulary. This you do by:

    $vsm->get_corpus_vocabulary_and_word_counts();

The only time you do NOT need to call this method is when you are using a previously
constructed disk-stored VSM model for retrieval.

=item B<get_query_sorted_average_precision_for_queries()>

If you want to run significance tests on the retrieval accuracies you obtain on a
given corpus with different algorithms (VSM or LSA with different choices for the
constructor parameters), your own script would need access to the average precision
data for a set of queries. You can get hold of this data by calling

    $vsm->get_query_sorted_average_precision_for_queries();

The script C<significance_testing.pl> in the 'examples' directory shows how you can
use this method for significance testing.

=item B<pairwise_similarity_for_docs()>

=item B<pairwise_similarity_for_normalized_docs()>

If you would like to compare in your own script any two documents in the corpus, you
can call

    my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");

or

    my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");

Both of these calls return a number that is the dot product of the two document
vectors normalized by the product of their magnitudes. The first call uses the
regular document vectors and the second the normalized document vectors.
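The normalized dot product that both methods return is ordinary cosine similarity.
Here is a minimal free-standing sketch, with hypothetical word-frequency hashes
standing in for the module's document vectors:

    # Cosine similarity of two term-frequency vectors, given as hash refs
    # mapping word => frequency. Illustrative only.
    sub cosine_similarity {
        my ($vec1, $vec2) = @_;
        my ($dot, $mag1_sq, $mag2_sq) = (0, 0, 0);
        foreach my $word (keys %$vec1) {
            $dot += $vec1->{$word} * ( $vec2->{$word} || 0 );
        }
        $mag1_sq += $_ ** 2 for values %$vec1;
        $mag2_sq += $_ ** 2 for values %$vec2;
        return 0 unless $mag1_sq && $mag2_sq;
        return $dot / ( sqrt($mag1_sq) * sqrt($mag2_sq) );
    }
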
=item B<precision_and_recall_calculator()>

After you have created or obtained the relevancy judgments for your test queries, you
can make the following call to calculate the C<Precision@rank> and C<Recall@rank>
values:

    $vsm->precision_and_recall_calculator('vsm');

or

    $vsm->precision_and_recall_calculator('lsa');

depending on whether you are testing VSM-based retrieval or LSA-based retrieval.

=item B<retrieve_with_lsa()>

After you have built an LSA model through the call to C<construct_lsa_model()>, you
can retrieve the document names most similar to the query by:

    my $retrievals = $vsm->retrieve_with_lsa( \@query );

Subsequently, you can display the retrievals by calling the C<display_retrievals()>
method described previously.

=item B<retrieve_with_vsm()>

After you have constructed a VSM model, you call this method for document retrieval
for a given query C<@query>. The call syntax is:

    my $retrievals = $vsm->retrieve_with_vsm( \@query );

The argument, C<@query>, is simply a list of words that you wish to use for
retrieval. The method returns a reference to a hash whose keys are the document
names and whose values are the similarity distance between the document and the
query. As is commonly the case with VSM, this module uses the cosine similarity
distance when comparing a document vector with the query vector.
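If you would rather process the retrievals yourself instead of calling
C<display_retrievals()>, one way to walk the returned hash in order of decreasing
similarity (illustrative, assuming C<$vsm> and C<@query> as above):

    my $retrievals = $vsm->retrieve_with_vsm( \@query );
    foreach my $doc ( sort { $retrievals->{$b} <=> $retrievals->{$a} }
                      keys %$retrievals ) {
        printf "%-40s  %.4f\n", $doc, $retrievals->{$doc};
    }
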
=item B<upload_document_relevancies_from_file()>

When human-supplied relevancies are available, you can upload them into the program
by calling

    $vsm->upload_document_relevancies_from_file();

These relevancy judgments will be read from a file that is named with the
C<relevancy_file> constructor parameter.

=item B<upload_normalized_vsm_model_from_disk()>

When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
C<generate_document_vectors()>, the VSM model is automatically deposited in the
database files named with the constructor parameters C<corpus_vocab_db>,
C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out
retrieval by directly using this disk-based VSM model for speedier performance. In
order to do so, you must upload the disk-based model by

    $vsm->upload_normalized_vsm_model_from_disk();

Subsequently you call

    my $retrievals = $vsm->retrieve_with_vsm( \@query );
    $vsm->display_retrievals( $retrievals );

for retrieval and for displaying the results.

=item B<write_corpus_vocab_to_file()>

This is the method to call for large text corpora if you would like to examine the
vocabulary created. The call syntax is

    $vsm->write_corpus_vocab_to_file($filename);

where C<$filename> is the name of the file that you want the vocabulary to be written
out to. This call will also show the frequency of each vocabulary word in your
corpus.

=back

=head1 REQUIRED

This module requires the following modules:

    SDBM_File
    Storable
    PDL
    File::Basename
    File::Spec::Functions

The first two of these are needed for creating disk-based database records for the
VSM and LSA models. The third is needed for calculating the SVD of the
term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
by the directory scanner to make pathnames platform independent.

=head1 EXAMPLES

See the 'examples' directory in the distribution for the scripts listed below:
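Before turning to the scripts, here is a minimal end-to-end driver assembled from
the method calls documented above; the corpus directory and the query words are
placeholders you would replace with your own:

    #!/usr/bin/perl -w
    use strict;
    use Algorithm::VSM;

    # Placeholder corpus directory and query words:
    my $vsm = Algorithm::VSM->new( corpus_directory => 'corpus' );
    $vsm->get_corpus_vocabulary_and_word_counts();
    $vsm->generate_document_vectors();
    my @query      = qw( string getAllChars );
    my $retrievals = $vsm->retrieve_with_vsm( \@query );
    $vsm->display_retrievals( $retrievals );
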
=over

=item B<For Basic VSM-Based Retrieval:>

For basic VSM-based model construction and retrieval, run the script:

    retrieve_with_VSM.pl

Starting with version 1.60, this script does not store away the VSM model in
disk-based hash tables. If you want your model to be stored on the disk, you must
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.

=item B<For Running an Infinite Loop for Repeated Retrievals from a VSM Model:>

If you want to run an infinite loop for repeated retrievals from a VSM model, run the
script

    continuously_running_VSM_retrieval_engine.pl

You can create a script similar to this for doing the same with LSA models.

=item B<For Storing the Model Information in Disk-Based DBM Files:>

For storing the model information in disk-based DBM files that can subsequently be
used for both VSM and LSA retrieval, run the script:

    retrieve_with_VSM_and_also_create_disk_based_model.pl

=item B<For Basic LSA-Based Retrieval:>

For basic LSA-based model construction and retrieval, run the script:

    retrieve_with_LSA.pl

Starting with version 1.60, this script does not store away the model information in
disk-based hash tables. If you want your model to be stored on the disk, you must
run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.

=item B<For VSM-Based Retrieval with a Disk-Stored Model:>

If you have previously run a script like
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script

    retrieve_with_disk_based_VSM.pl

for repeated VSM-based retrievals from a disk-based model.

=item B<For LSA-Based Retrieval with a Disk-Stored Model:>

If you have previously run a script like
C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script

    retrieve_with_disk_based_LSA.pl

for repeated LSA-based retrievals from a disk-based model.

=item B<For Precision and Recall Calculations for VSM:>

To experiment with precision and recall calculations for VSM retrieval, run the
script:

    calculate_precision_and_recall_for_VSM.pl

Note that this script will carry out its own estimation of relevancy judgments ---
which in most cases would not be a safe thing to do.

=item B<For Precision and Recall Calculations for LSA:>

To experiment with precision and recall calculations for LSA retrieval, run the
script:

    calculate_precision_and_recall_for_LSA.pl

Note that this script will carry out its own estimation of relevancy judgments ---
which in most cases would not be a safe thing to do.

=item B<For Precision and Recall Calculations for VSM Using File-Based Queries and
Human-Supplied Relevancies:>

Precision and recall calculations for retrieval accuracy determination are best
carried out with human-supplied judgments of the relevancies of the documents to
queries. If such judgments are available, run the script:

    calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl

This script will print out the average precisions for the different test queries and
calculate the C<MAP> metric of retrieval accuracy.

=item B<For Precision and Recall Calculations for LSA Using File-Based Queries and
Human-Supplied Relevancies:>

If human-supplied relevancy judgments are available and you wish to experiment with
precision and recall calculations for LSA-based retrieval, run the script:

    calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl

This script will print out the average precisions for the different test queries and
calculate the C<MAP> metric of retrieval accuracy.

=item B<To Carry Out Significance Tests on the Retrieval Precision Results with
Randomization or with Student's Paired t-Test:>

    significance_testing.pl randomization

or

    significance_testing.pl t-test

Significance testing consists of forming a null hypothesis that the two retrieval
algorithms you are considering are the same from a black-box perspective and then
calculating what is known as a C<p-value>. If the C<p-value> is less than, say,
0.05, you reject the null hypothesis.
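A hedged sketch of the randomization flavor of this test, operating on made-up
per-query C<Average Precision> values for two algorithms (the actual script works
from the data returned by C<get_query_sorted_average_precision_for_queries()>):

    # Paired randomization test: under the null hypothesis the sign of each
    # per-query difference is arbitrary, so flip signs at random and count
    # how often the mean difference is at least as extreme as observed.
    my @ap_A = ( 0.62, 0.40, 0.81, 0.55 );   # assumed AP values, algorithm A
    my @ap_B = ( 0.58, 0.35, 0.80, 0.49 );   # assumed AP values, algorithm B
    my @diffs    = map { $ap_A[$_] - $ap_B[$_] } 0 .. $#ap_A;
    my $observed = abs( mean(@diffs) );
    my ($trials, $as_extreme) = (10_000, 0);
    for (1 .. $trials) {
        my @flipped = map { rand() < 0.5 ? $_ : -$_ } @diffs;
        $as_extreme++ if abs( mean(@flipped) ) >= $observed;
    }
    my $p_value = $as_extreme / $trials;
    sub mean { my $sum = 0; $sum += $_ for @_; return $sum / @_; }
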
=item B<To Calculate a Similarity Matrix for All the Documents in Your Corpus:>

Run either the script

    calculate_similarity_matrix_for_all_docs.pl

or

    calculate_similarity_matrix_for_all_normalized_docs.pl

The former uses regular document vectors for calculating the similarity between every
pair of documents in the corpus. And the latter uses normalized document vectors for
the same purpose. The document order used for row and column indexing of the matrix
corresponds to the alphabetic ordering of the document names in the corpus directory.

=back

=head1 EXPORT

None by design.

=head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS

You have to be careful when carrying out Precision versus Recall calculations if you
do not wish to lose the previously created relevancy judgments. Invoking the method
C<estimate_doc_relevancies()> in your own script will cause the file C<relevancy.txt>
to be overwritten. If you have created a relevancy database and stored it in a file
called, say, C<relevancy.txt>, you should make a backup copy of this file before
executing a script that calls C<estimate_doc_relevancies()>.
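One defensive habit, sketched here with the core C<File::Copy> module and the file
name used above purely as an example:

    use File::Copy;

    # Keep a backup so that a later call to estimate_doc_relevancies()
    # cannot clobber your human-supplied relevancy judgments.
    copy( 'relevancy.txt', 'relevancy.txt.bak' )
        or die "Could not back up relevancy file: $!";
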
=head1 BUGS

Please notify the author if you encounter any bugs. When sending email, please place
the string 'VSM' in the subject line to get past my spam filter.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-VSM-1.61.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-VSM-1.61>. Enter this directory and execute the following commands for a
standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module on
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

=head1 THANKS

Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in
similarity matrix characterization of a corpus. Thanks, Zahn! Several of the recent
changes to the module are a result of the feedback I have received from Naveen
Kulkarni of Infosys Labs. Thanks, Naveen!

=head1 AUTHOR

Avinash Kak, kak@purdue.edu

If you send email, please place the string "VSM" in your subject line to get past my
spam filter.

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2015 Avinash Kak

=cut