File Coverage

blib/lib/Algorithm/VSM.pm

Criterion	Covered	Total	%
statement	351	822	42.7
branch	97	338	28.7
condition	25	67	37.3
subroutine	31	54	57.4
pod	24	35	68.5
total	528	1316	40.1

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::VSM;
2
3							#---------------------------------------------------------------------------
4							# Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5							# software. You may modify and/or distribute it under the same terms as Perl itself.
6							# This copyright notice must remain attached to the file.
7							#
8							# Algorithm::VSM is a Perl module for retrieving the documents from a software
9							# library that match a list of words in a query. The matching criterion used depends
10							# on whether you ask the module to construct a full-dimensionality VSM or a
11							# reduced-dimensionality LSA model for the library.
12							# ---------------------------------------------------------------------------
13
14							#use 5.10.0;
15	1			1		33328	use strict;
	1					3
	1					37
16	1			1		6	use warnings;
	1					2
	1					37
17	1			1		7	use Carp;
	1					7
	1					103
18	1			1		882	use SDBM_File;
	1					29572
	1					104
19	1			1		1082	use PDL::Lite;
	1					217243
	1					33
20	1			1		9	use PDL::MatrixOps;
	1					2
	1					9
21	1			1		217	use File::Basename;
	1					2
	1					94
22	1			1		837	use File::Spec::Functions qw(rel2abs);
	1					929
	1					68
23	1			1		13	use Fcntl;
	1					3
	1					272
24	1			1		5464	use Storable;
	1					15157
	1					83
25	1			1		8	use Cwd;
	1					2
	1					13366
26
27							our $VERSION = '1.70';
28
29							# for camelcase splits (from perlmonks):
30							my $_regex = qr/[[:lower:]0-9]+\|[[:upper:]0-9](?:[[:upper:]0-9]+\|[[:lower:]0-9]*)(?=$\|[[:upper:]0-9])/;
31
32							################################### Constructor #######################################
33
34							# Constructor for creating a VSM or LSA model of a corpus. The model instance
35							# returned by the constructor can be used for retrieving documents from the corpus
36							# in response to queries.
37							sub new {
38	2			2	1	1315	my ($class, %args) = @_;
39	2					13	my @params = keys %args;
40	2	50				12	croak "\nYou have used a wrong name for a keyword argument " .
41							"--- perhaps a misspelling\n"
42							if _check_for_illegal_params(@params) == 0;
43							bless {
44							_corpus_directory => $args{corpus_directory} \|\| "",
45							_save_model_on_disk => $args{save_model_on_disk} \|\| 0,
46							_break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
47							$args{break_camelcased_and_underscored} : 1,
48							_corpus_vocab_db => $args{corpus_vocab_db} \|\| "corpus_vocab_db",
49							_doc_vectors_db => $args{doc_vectors_db} \|\| "doc_vectors_db",
50							_normalized_doc_vecs_db => $args{normalized_doc_vecs_db} \|\| "normalized_doc_vecs_db",
51							_stop_words_file => $args{stop_words_file} \|\| "",
52							_case_sensitive => $args{case_sensitive} \|\| 0,
53							_query_file => $args{query_file} \|\| "",
54							_file_types => $args{file_types} \|\| [],
55							_min_word_length => $args{min_word_length} \|\| 4,
56							_want_stemming => $args{want_stemming} \|\| 0,
57							_idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
58							_max_number_retrievals => $args{max_number_retrievals} \|\| 30,
59							_lsa_svd_threshold => $args{lsa_svd_threshold} \|\| 0.01,
60							_relevancy_threshold => exists $args{relevancy_threshold} ? $args{relevancy_threshold} : 1,
61							_relevancy_file => $args{relevancy_file} \|\| "",
62	2	50	50			9796	_debug => $args{debug} \|\| 0,
		50	50
		50	50
			50
			50
			50
			50
			50
			50
			50
			50
			50
			100
			50
			50
63							_working_directory => cwd,
64							_vocab_hist_on_disk => {},
65							_vocab_hist => {},
66							_doc_hist_template => {},
67							_corpus_doc_vectors => {},
68							_normalized_doc_vecs => {},
69							_query_vector => {},
70							_stop_words => [],
71							_term_document_matrix => [],
72							_corpus_vocab_done => 0,
73							_scan_dir_for_rels => 0,
74							_vocab_size => undef,
75							_doc_vecs_trunc_lsa => {},
76							_lsa_vec_truncator => undef,
77							_queries_for_relevancy => {},
78							_relevancy_estimates => {},
79							_precision_for_queries => {},
80							_avg_precision_for_queries => {},
81							_recall_for_queries => {},
82							_map => undef,
83							_vocab_idf_hist => {},
84							_idf_t => {},
85							_total_num_of_docs => 0,
86							}, $class;
87							}
88
89
90							###################### Get corpus vocabulary and word counts #########################
91
92							sub get_corpus_vocabulary_and_word_counts {
93	2			2	1	27	my $self = shift;
94							die "You must supply the name of the corpus directory to the constructor"
95	2	50				22	unless $self->{_corpus_directory};
96							print "Scanning the directory '$self->{_corpus_directory}' for\n" .
97	2	50				14	" model construction\n\n" if $self->{_debug};
98	2					28	$self->_scan_directory( $self->{_corpus_directory} );
99	2	50				10	$self->_drop_stop_words() if $self->{_stop_words_file};
100	2	50				10	if ($self->{_debug}) {
101	0					0	foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
	0					0
102	0					0	printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
103							}
104							}
105	2	50				8	if ($self->{_save_model_on_disk}) {
106	0					0	unlink glob "$self->{_corpus_vocab_db}.*";
107	0					0	unlink glob "$self->{_doc_vectors_db}.*";
108	0					0	unlink glob "$self->{_normalized_doc_vecs_db}.*";
109	0					0	tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
110	0	0				0	$self->{_corpus_vocab_db}, O_RDWR\|O_CREAT, 0640
111							or die "Can't create DBM files: $!";
112	0					0	foreach (keys %{$self->{_vocab_hist}}) {
	0					0
113	0					0	$self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
114							}
115	0					0	untie %{$self->{_vocab_hist_on_disk}};
	0					0
116							}
117	2					8	$self->{_corpus_vocab_done} = 1;
118	2					5	$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
	2					17
119							print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
120	2	50				8	if $self->{_debug};
121							# Calculate idf(t):
122	2					3	foreach (keys %{$self->{_vocab_idf_hist}}) {
	2					33
123							$self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
124							/
125	218					576	(1 + $self->{_vocab_idf_hist}->{$_})))
126							/ log(10) );
127							}
128							}
129
130							sub display_corpus_vocab {
131	0			0	1	0	my $self = shift;
132							die "corpus vocabulary not yet constructed"
133	0	0				0	unless keys %{$self->{_vocab_hist}};
	0					0
134	0					0	print "\n\nDisplaying corpus vocabulary:\n\n";
135	0					0	foreach (sort keys %{$self->{_vocab_hist}}){
	0					0
136	0					0	my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
137	0					0	print "$outstring\n";
138							}
139							}
140
141							sub display_corpus_vocab_size {
142	0			0	1	0	my $self = shift;
143							die "corpus vocabulary not yet constructed"
144	0	0				0	unless keys %{$self->{_vocab_hist}};
	0					0
145	0					0	my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
	0					0
146	0					0	print "\nSize of the corpus vocabulary: $vocab_size\n\n";
147							}
148
149							sub write_corpus_vocab_to_file {
150	0			0	1	0	my $self = shift;
151	0					0	my $file = shift;
152	0	0				0	die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
	0					0
153	0	0				0	open OUT, "> $file"
154							or die "unable to open for output a file with name `$file': $!";
155	0					0	foreach (sort keys %{$self->{_vocab_hist}}){
	0					0
156	0					0	my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
157	0					0	print OUT "$outstring\n";
158							}
159	0					0	close OUT;
160							}
161
162							sub display_inverse_document_frequencies {
163	0			0	1	0	my $self = shift;
164							die "corpus vocabulary not yet constructed"
165	0	0				0	unless keys %{$self->{_vocab_idf_hist}};
	0					0
166							print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
167	0	0				0	unless $self->{_idf_filter_option};
168	0					0	print "\n\nDisplaying inverse document frequencies:\n";
169	0					0	foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
	0					0
170							my $outstring = sprintf("%30s %d",
171	0					0	$_, $self->{_vocab_idf_hist}->{$_});
172	0					0	print "$outstring\n";
173							}
174	0					0	print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
175	0					0	foreach ( sort keys %{$self->{_idf_t}} ) {
	0					0
176	0					0	my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
177	0					0	print "$outstring\n";
178							}
179							}
180
181							sub get_all_document_names {
182	0			0	1	0	my $self = shift;
183	0					0	my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
	0					0
184	0					0	return \@all_files;
185							}
186
187							############################ Generate Document Vectors #################################
188
189							sub generate_document_vectors {
190	2			2	1	34	my $self = shift;
191	2					28	chdir $self->{_working_directory};
192	2					6	foreach ( sort keys %{$self->{_vocab_hist}} ) {
	2					108
193	218					353	$self->{_doc_hist_template}->{$_} = 0;
194							}
195	2					26	$self->_scan_directory( $self->{_corpus_directory} );
196	2					36	chdir $self->{_working_directory};
197	2	50				40	if ($self->{_save_model_on_disk}) {
198							die "You did not specify in the constructor call the names for the diskfiles " .
199							"for storing the disk-based hash tables consisting of document vectors " .
200							"and their normalized versions"
201	0	0	0			0	unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
202	0					0	eval {
203	0					0	store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
204							};
205	0	0				0	if ($@) {
206	0					0	print "Something went wrong with disk storage of document vectors: $@";
207							}
208	0					0	eval {
209	0					0	store($self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db});
210							};
211	0	0				0	if ($@) {
212	0					0	print "Something wrong with disk storage of normalized doc vecs: $@";
213							}
214							}
215							}
216
217							sub display_doc_vectors {
218	0			0	1	0	my $self = shift;
219							die "document vectors not yet constructed"
220	0	0				0	unless keys %{$self->{_corpus_doc_vectors}};
	0					0
221	0					0	foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
	0					0
222	0					0	print "\n\ndisplay doc vec for $file:\n";
223	0					0	foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
	0					0
224	0					0	print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
225							}
226	0					0	my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
	0					0
227	0					0	print "\nSize of vector for $file: $docvec_size\n";
228							}
229							}
230
231							sub display_normalized_doc_vectors {
232	0			0	1	0	my $self = shift;
233							die "normalized document vectors not yet constructed"
234	0	0				0	unless keys %{$self->{_normalized_doc_vecs}};
	0					0
235	0	0				0	unless ($self->{_idf_filter_option}) {
236	0					0	print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
237	0					0	return;
238							}
239	0					0	foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
	0					0
240	0					0	print "\n\ndisplay normalized doc vec for $file:\n";
241	0					0	foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
	0					0
242	0					0	print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
243							}
244	0					0	my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
	0					0
245	0					0	print "\nSize of normalized vector for $file: $docvec_size\n";
246							}
247							}
248
249							######################## Calculate Pairwise Document Similarities ######################
250
251							# Returns the similarity score for two documents whose actual names are are supplied
252							# as its two arguments.
253							sub pairwise_similarity_for_docs {
254	0			0	1	0	my $self = shift;
255	0					0	my $doc1 = shift;
256	0					0	my $doc2 = shift;
257	0					0	my @all_files = keys %{$self->{_corpus_doc_vectors}};
	0					0
258	0	0				0	croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
259	0	0				0	croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
260	0					0	my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
261	0					0	my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
262	0					0	my @vec1 = ();
263	0					0	my @vec2 = ();
264	0					0	foreach my $word (sort keys %$vec_hash_ref1) {
265	0					0	push @vec1, $vec_hash_ref1->{$word};
266	0					0	push @vec2, $vec_hash_ref2->{$word};
267							}
268	0					0	my $vec_mag1 = vec_magnitude(\@vec1);
269	0					0	my $vec_mag2 = vec_magnitude(\@vec2);
270	0					0	my $product = vec_scalar_product(\@vec1, \@vec2);
271	0					0	$product /= $vec_mag1 * $vec_mag2;
272	0					0	return $product;
273							}
274
275							sub pairwise_similarity_for_normalized_docs {
276	0			0	1	0	my $self = shift;
277	0					0	my $doc1 = shift;
278	0					0	my $doc2 = shift;
279	0					0	my @all_files = keys %{$self->{_corpus_doc_vectors}};
	0					0
280	0	0				0	croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
281	0	0				0	croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
282	0					0	my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
283	0					0	my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
284	0					0	my @vec1 = ();
285	0					0	my @vec2 = ();
286	0					0	foreach my $word (sort keys %$vec_hash_ref1) {
287	0					0	push @vec1, $vec_hash_ref1->{$word};
288	0					0	push @vec2, $vec_hash_ref2->{$word};
289							}
290	0					0	my $vec_mag1 = vec_magnitude(\@vec1);
291	0					0	my $vec_mag2 = vec_magnitude(\@vec2);
292	0					0	my $product = vec_scalar_product(\@vec1, \@vec2);
293	0					0	$product /= $vec_mag1 * $vec_mag2;
294	0					0	return $product;
295							}
296
297							############################### Retrieve with VSM Model ################################
298
299							sub retrieve_with_vsm {
300	1			1	1	18	my $self = shift;
301	1					6	my $query = shift;
302	1					8	my @clean_words;
303	1					5	my $min = $self->{_min_word_length};
304
305	1	50				6	if ($self->{_break_camelcased_and_underscored}) {
306	1					41	my @brokenup = grep $_, split /\W\|_\|\s+/, "@$query";
307	1					6	@clean_words = map {$_ =~ /$_regex/g} @brokenup;
	8					41
308							@clean_words = $self->{_case_sensitive} ?
309	0	0				0	grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
	0					0
310	1	100				12	grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
	8	50				82
	8					43
311							} else {
312	0					0	my @brokenup = split /\"\|\'\|\.\|$\|$\|\[\|\]\|\\\|\/\|\s+/, "@$query";
313	0					0	@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
	0					0
	0					0
314							}
315	1					6	$query = \@clean_words;
316	1	50				6	print "\nYour query words are: @$query\n" if $self->{_debug};
317	1	50				16	if ($self->{_idf_filter_option}) {
318							die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
319	1					82	unless scalar(keys %{$self->{_vocab_hist}})
320	1	50	50			10	&& scalar(keys %{$self->{_normalized_doc_vecs}});
	1					6
321							} else {
322							die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
323	0					0	unless scalar(keys %{$self->{_vocab_hist}})
324	0	0	0			0	&& scalar(keys %{$self->{_corpus_doc_vectors}});
	0					0
325							}
326	1					2	foreach ( keys %{$self->{_vocab_hist}} ) {
	1					60
327	109					184	$self->{_query_vector}->{$_} = 0;
328							}
329	1					11	foreach (@$query) {
330	7	50				18	if ($self->{_case_sensitive}) {
331	0	0				0	$self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
332							} else {
333	7	100				26	$self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
334							}
335							}
336	1					3	my @query_word_counts = values %{$self->{_query_vector}};
	1					12
337	1					13	my $query_word_count_total = sum(\@query_word_counts);
338	1	50				10	die "\nYour query does not contain corpus words. Nothing retrieved.\n"
339							unless $query_word_count_total;
340	1					2	my %retrievals;
341	1	50				6	if ($self->{_idf_filter_option}) {
342							print "\n\nUsing idf filter option for retrieval:\n\n"
343	1	50				6	if $self->{_debug};
344	1					2	foreach (sort {$self->_doc_vec_comparator}
	24					62
345	1					21	keys %{$self->{_normalized_doc_vecs}}) {
346	10	100				28	$retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
347							}
348							} else {
349							print "\n\nNOT using idf filter option for retrieval:\n\n"
350	0	0				0	if $self->{_debug};
351	0					0	foreach (sort {$self->_doc_vec_comparator}
	0					0
352	0					0	keys %{$self->{_corpus_doc_vectors}}) {
353	0	0				0	$retrievals{$_} = $self->_similarity_to_query($_) if $self->_similarity_to_query($_) > 0;
354							}
355							}
356	1	50				6	if ($self->{_debug}) {
357	0					0	print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
358	0					0	foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
	0					0
359	0					0	print "$_ => $retrievals{$_}\n";
360							}
361							}
362	1					11	return \%retrievals;
363							}
364
365							######################### Upload a Previously Constructed Model #########################
366
367							sub upload_vsm_model_from_disk {
368	0			0	0	0	my $self = shift;
369							die "\nCannot find the database files for the VSM model"
370							unless -s "$self->{_corpus_vocab_db}.pag"
371	0	0	0			0	&& -s $self->{_doc_vectors_db};
372	0					0	$self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
373	0					0	tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
374	0	0				0	$self->{_corpus_vocab_db}, O_RDONLY, 0640
375							or die "Can't open DBM file: $!";
376	0	0				0	if ($self->{_debug}) {
377	0					0	foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
	0					0
378	0					0	printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
379							}
380							}
381	0					0	foreach (keys %{$self->{_vocab_hist_on_disk}}) {
	0					0
382	0					0	$self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
383							}
384	0					0	$self->{_corpus_vocab_done} = 1;
385	0					0	$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
	0					0
386							print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
387	0	0				0	if $self->{_debug};
388	0					0	$self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
389	0					0	untie %{$self->{_vocab_hist_on_disk}};
	0					0
390							}
391
392							sub upload_normalized_vsm_model_from_disk {
393	0			0	1	0	my $self = shift;
394							die "\nCannot find the database files for the VSM model"
395							unless -s "$self->{_corpus_vocab_db}.pag"
396	0	0	0			0	&& -s $self->{_normalized_doc_vecs_db};
397	0					0	$self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
398	0					0	tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
399	0	0				0	$self->{_corpus_vocab_db}, O_RDONLY, 0640
400							or die "Can't open DBM file: $!";
401	0	0				0	if ($self->{_debug}) {
402	0					0	foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
	0					0
403	0					0	printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
404							}
405							}
406	0					0	foreach (keys %{$self->{_vocab_hist_on_disk}}) {
	0					0
407	0					0	$self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
408							}
409	0					0	$self->{_corpus_vocab_done} = 1;
410	0					0	$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
	0					0
411							print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
412	0	0				0	if $self->{_debug};
413	0					0	untie %{$self->{_vocab_hist_on_disk}};
	0					0
414							}
415
416							############################## Display Retrieval Results ################################
417
418							sub display_retrievals {
419	0			0	1	0	my $self = shift;
420	0					0	my $retrievals = shift;
421	0					0	print "\n\nShowing the retrievals and the similarity scores:\n\n";
422	0					0	my $iter = 0;
423	0					0	foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
	0					0
424	0					0	print "$_ => $retrievals->{$_}\n";
425	0					0	$iter++;
426	0	0				0	last if $iter > $self->{_max_number_retrievals};
427							}
428	0					0	print "\n\n";
429							}
430
431							############################### Directory Scanner ################################
432
433							sub _scan_directory {
434	4			4		9	my $self = shift;
435	4					46	my $dir = rel2abs( shift );
436	4					18346	my $current_dir = cwd;
437	4	50				245	chdir $dir or die "Unable to change directory to $dir: $!";
438	4					942	foreach ( glob "*" ) {
439	40	50	33			30743	if ( -d and !(-l) ) {
		50	33
			33
			33
			33
440	0					0	$self->_scan_directory( $_ );
441	0	0				0	chdir $dir
442							or die "Unable to change directory to $dir: $!";
443							} elsif (-r _ and
444							-T _ and
445							-M _ > 0.00001 and # modification age is at least 1 sec
446							!( -l $_ ) and
447							$self->ok_to_filetype($_) ) {
448	40	50				100	$self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
449	40	100				174	$self->_scan_file($_) unless $self->{_corpus_vocab_done};
450	40	100				199	$self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
451							}
452							}
453	4					149	chdir $current_dir;
454							}
455
456							sub _scan_file {
457	20			20		32	my $self = shift;
458	20					39	my $file = shift;
459	20					493	open IN, $file;
460	20					51	my $min = $self->{_min_word_length};
461	20					49	my %uniques = ();
462	20					218	while () {
463	856	100				3855	next if /^[ ]*\r?\n?$/;
464	730					3101	$_ =~ s/\r?\n?$//;
465	730					1177	my @clean_words;
466	730	50				1563	if ($self->{_break_camelcased_and_underscored}) {
467	730					10261	my @brokenup = grep $_, split /\W\|_\|\s+/, $_;
468	730					1993	@clean_words = map {$_ =~ /$_regex/g} @brokenup;
	2594					11704
469							@clean_words = $self->{_case_sensitive} ?
470	0	0				0	grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
	0					0
471	730	100				2140	grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
	2732	50				8037
	2732					10985
472							} else {
473	0					0	my @brokenup = split /\"\|\'\|\.\|$\|$\|\[\|\]\|\\\|\/\|\s+/, $_;
474	0					0	@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
	0					0
	0					0
475							}
476	730	100				3037	next unless @clean_words;
477							@clean_words = grep $_, map &simple_stemmer($_), @clean_words
478	444	50				2255	if $self->{_want_stemming};
479							$self->{_case_sensitive} ?
480	0					0	map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
481	444	50				1697	map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
	1736					3999
482	444	50				905	if ($self->{_case_sensitive}) {
483	0					0	for (@clean_words) { $uniques{$_}++ }
	0					0
484							} else {
485	444					965	for (@clean_words) { $uniques{"\L$_"}++ }
	1736					4630
486							}
487							}
488	20					567	close( IN );
489	20					139	map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
	838					1431
490	20					175	$self->{_total_num_of_docs}++;
491							}
492
493							sub ok_to_filetype {
494	40			40	0	81	my $self = shift;
495	40					186	my $filename = shift;
496	40					2215	my ($base, $dir, $suffix) = fileparse($filename, '\..*');
497							croak "You called this module without specifying the file types in the constructor"
498	40	50				73	unless @{$self->{_file_types}} > 0;
	40					166
499	40	50				71	return 1 if contained_in($suffix, @{$self->{_file_types}});
	40					317
500	0					0	return 0;
501							}
502
503							############################## LSA Modeling and Retrieval ################################
504
505							sub construct_lsa_model {
506	1			1	1	21	my $self = shift;
507	1	50				12	if ($self->{_idf_filter_option}) {
508	1	50	33			19	if (!$self->{_normalized_doc_vecs} and
509							-s $self->{_normalized_doc_vecs_db}) {
510							$self->{_normalized_doc_vecs} =
511	0					0	retrieve($self->{_normalized_doc_vecs_db});
512							}
513	1					6	foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
	1					18
514	10					15	my $term_frequency_vec;
515	10					14	foreach my $word (sort keys
516	10					576	%{$self->{_normalized_doc_vecs}->{$_}}){
517							push @$term_frequency_vec,
518	1090					1895	$self->{_normalized_doc_vecs}->{$_}->{$word};
519							}
520	10					57	push @{$self->{_term_document_matrix}}, $term_frequency_vec;
	10					31
521							}
522							} else {
523	0	0	0			0	if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
524	0					0	$self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
525							}
526	0					0	foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
	0					0
527	0					0	my $term_frequency_vec;
528	0					0	foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
	0					0
529							push @$term_frequency_vec,
530	0					0	$self->{_corpus_doc_vectors}->{$_}->{$word};
531							}
532	0					0	push @{$self->{_term_document_matrix}}, $term_frequency_vec;
	0					0
533							}
534							}
535	1					5	my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
	1					19
536	1					485	my ($U,$SIGMA,$V) = svd $A;
537	1	50				8	print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
538	1	50				7	print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
539							my $index = return_index_of_last_value_above_threshold($SIGMA,
540	1					10	$self->{_lsa_svd_threshold});
541	1					14	my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
542	1	50				32	print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
543							# When you measure the size of a matrix in PDL, the zeroth dimension
544							# is considered to be along the horizontal and the one-th dimension
545							# along the rows. This is opposite of how we want to look at
546							# matrices. For a matrix of size MxN, we mean M rows and N columns.
547							# With this 'rows x columns' convention for matrix size, if you had
548							# to check the size of, say, U matrix, you would call
549							# my @size = ( $U->getdim(1), $U->getdim(0) );
550							# print "\nsize of U: @size\n";
551	1					7	my $U_trunc = $U->slice("0:$index,:")->sever;
552	1					46	my $V_trunc = $V->slice("0:$index,0:$index")->sever;
553	1					33	$self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
554							PDL::Basic::transpose($U_trunc);
555							print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
556	1	50				14151	if $self->{_debug};
557							my @sorted_doc_names = $self->{_idf_filter_option} ?
558	1					10	sort keys %{$self->{_normalized_doc_vecs}} :
559	1	50				7	sort keys %{$self->{_corpus_doc_vectors}};
	0					0
560	1					3	my $i = 0;
561	1					5	foreach (@{$self->{_term_document_matrix}}) {
	1					4
562							my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
563	10					31	PDL::Basic::transpose(pdl($_));
564	10					1384	my $doc_name = $sorted_doc_names[$i++];
565							print "\n\nTruncated doc vec for $doc_name: " .
566	10	50				26	$truncated_doc_vec . "\n" if $self->{_debug};
567	10					30	$self->{_doc_vecs_trunc_lsa}->{$doc_name}
568							= $truncated_doc_vec;
569							}
570	1					88	chdir $self->{_working_directory};
571							}
572
573							sub retrieve_with_lsa {
574	1			1	1	11	my $self = shift;
575	1					5	my $query = shift;
576	1					2	my @clean_words;
577	1					6	my $min = $self->{_min_word_length};
578	1	50				5	if ($self->{_break_camelcased_and_underscored}) {
579	1					33	my @brokenup = grep $_, split /\W\|_\|\s+/, "@$query";
580	1					4	@clean_words = map {$_ =~ /$_regex/g} @brokenup;
	8					38
581	1	100				3	@clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
	8					57
	8					37
582							} else {
583	0					0	my @brokenup = split /\"\|\'\|\.\|$\|$\|\[\|\]\|\\\|\/\|\s+/, "@$query";
584	0					0	@clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
	0					0
	0					0
585							}
586	1					3	$query = \@clean_words;
587	1	50				6	print "\nYour processed query words are: @$query\n" if $self->{_debug};
588							die "Your vocabulary histogram is empty"
589	1	50				3	unless scalar(keys %{$self->{_vocab_hist}});
	1					7
590							die "You must first construct an LSA model"
591	1	50				2	unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
	1					10
592	1					2	foreach ( keys %{$self->{_vocab_hist}} ) {
	1					21
593	109					190	$self->{_query_vector}->{$_} = 0;
594							}
595	1					8	foreach (@$query) {
596							$self->{_query_vector}->{"\L$_"}++
597	7	100				24	if exists $self->{_vocab_hist}->{"\L$_"};
598							}
599	1					2	my @query_word_counts = values %{$self->{_query_vector}};
	1					13
600	1					11	my $query_word_count_total = sum(\@query_word_counts);
601	1	50				4	die "Query does not contain corpus words. Nothing retrieved."
602							unless $query_word_count_total;
603	1					2	my $query_vec;
604	1					4	foreach (sort keys %{$self->{_query_vector}}) {
	1					50
605	109					180	push @$query_vec, $self->{_query_vector}->{$_};
606							}
607	1	50				8	print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
608							my $truncated_query_vec = $self->{_lsa_vec_truncator} x
609	1					5	PDL::Basic::transpose(pdl($query_vec));
610							print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
611	1	50				163	if $self->{_debug};
612	1					2	my %retrievals;
613	1					2	foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
	1					9
614							my $dot_product = PDL::Basic::transpose($truncated_query_vec)
615	10					175	x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
616							print "\n\nLSA: dot product of truncated query and\n" .
617							" truncated vec for doc $_ is " . $dot_product->sclr . "\n"
618	10	50				886	if $self->{_debug};
619	10	100				30	$retrievals{$_} = $dot_product->sclr if $dot_product->sclr > 0;
620							}
621	1	50				15	if ($self->{_debug}) {
622	0					0	print "\n\nShowing LSA retrievals and similarity scores:\n\n";
623	0					0	foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
	0					0
624	0					0	print "$_ => $retrievals{$_}\n";
625							}
626	0					0	print "\n\n";
627							}
628	1					13	return \%retrievals;
629							}
630
631							sub _construct_doc_vector {
632	20			20		34	my $self = shift;
633	20					44	my $file = shift;
634	20					35	my %document_vector = %{deep_copy_hash($self->{_doc_hist_template})};
	20					77
635	20					318	foreach ( sort keys %{$self->{_doc_hist_template}} ) {
	20					1129
636	2180					3237	$document_vector{$_} = 0;
637							}
638	20					179	my $min = $self->{_min_word_length};
639	20					65	my $total_words_in_doc = 0;
640	20	50				557	unless (open IN, $file) {
641							print "Unable to open file $file in the corpus: $!\n"
642	0	0				0	if $self->{_debug};
643	0					0	return;
644							}
645	20					432	while () {
646	856	100				3682	next if /^[ ]*\r?\n?$/;
647	730					2846	$_ =~ s/\r?\n?$//;
648	730					6639	my @brokenup = split /\"\|\'\|\.\|$\|$\|\[\|\]\|\\\|\/\|\s+/, $_;
649	730					1238	my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
	4726					10785
	4726					10663
650	730	100				2848	next unless @clean_words;
651							@clean_words = grep $_,
652							map &simple_stemmer($_, $self->{_debug}), @clean_words
653	444	50				1719	if $self->{_want_stemming};
654							$self->{_case_sensitive} ?
655	0					0	map { $document_vector{$_}++ } grep {exists $self->{_vocab_hist}->{$_}} @clean_words :
	0					0
656	1488					4267	map { $document_vector{"\L$_"}++ }
657	444	50				1354	grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
	1610					4037
658							}
659	20					201	close IN;
660							die "Something went wrong. Doc vector size unequal to vocab size"
661	20	50				98	unless $self->{_vocab_size} == scalar(keys %document_vector);
662	20					266	foreach (keys %document_vector) {
663	2180					3083	$total_words_in_doc += $document_vector{$_};
664							}
665	20					121	my %normalized_doc_vec;
666	20	50				57	if ($self->{_idf_filter_option}) {
667	20					238	foreach (keys %document_vector) {
668							$normalized_doc_vec{$_} = $document_vector{$_}
669							*
670	2180					4973	$self->{_idf_t}->{$_}
671							/
672							$total_words_in_doc;
673							}
674							}
675	20					81749	my $pwd = cwd;
676	20					567	$pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
677	20					43	my $file_path_name;
678	20	50				201	unless ( $1 eq "" ) {
679	0					0	$file_path_name = "$1/$file";
680							} else {
681	20					71	$file_path_name = $file;
682							}
683	20					232	$self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
684	20					507	$self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
685							}
686
687							################################### Drop Stop Words ###################################
688
689							sub _drop_stop_words {
690	0			0		0	my $self = shift;
691	0	0				0	open( IN, "$self->{_working_directory}/$self->{_stop_words_file}")
692							or die "unable to open stop words file: $!";
693	0					0	while () {
694	0	0				0	next if /^#/;
695	0	0				0	next if /^[ ]*\r?\n?$/;
696	0					0	$_ =~ s/\r?\n?$//;
697	0	0				0	delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
698	0					0	unshift @{$self->{_stop_words}}, $_;
	0					0
699							}
700							}
701
702							################################### Support Methods ####################################
703
704							sub _doc_vec_comparator {
705	24			24		32	my $self = shift;
706	24					30	my %query_vector = %{$self->{_query_vector}};
	24					642
707							my $vec1_hash_ref = $self->{_idf_filter_option} ?
708							$self->{_normalized_doc_vecs}->{$a} :
709	24	50				198	$self->{_corpus_doc_vectors}->{$a};
710							my $vec2_hash_ref = $self->{_idf_filter_option} ?
711							$self->{_normalized_doc_vecs}->{$b} :
712	24	50				61	$self->{_corpus_doc_vectors}->{$b};
713	24					38	my @vec1 = ();
714	24					31	my @vec2 = ();
715	24					35	my @qvec = ();
716	24					30	foreach my $word (sort keys %{$self->{_vocab_hist}}) {
	24					1255
717	2616					3808	push @vec1, $vec1_hash_ref->{$word};
718	2616					3607	push @vec2, $vec2_hash_ref->{$word};
719	2616					4249	push @qvec, $query_vector{$word};
720							}
721	24					185	my $vec1_mag = vec_magnitude(\@vec1);
722	24					55	my $vec2_mag = vec_magnitude(\@vec2);
723	24					45	my $qvec_mag = vec_magnitude(\@qvec);
724	24					54	my $product1 = vec_scalar_product(\@vec1, \@qvec);
725	24					51	$product1 /= $vec1_mag * $qvec_mag;
726	24					57	my $product2 = vec_scalar_product(\@vec2, \@qvec);
727	24					37	$product2 /= $vec2_mag * $qvec_mag;
728	24	100				230	return 1 if $product1 < $product2;
729	14	100				49	return 0 if $product1 == $product2;
730	13	50				253	return -1 if $product1 > $product2;
731							}
732
733							sub _similarity_to_query {
734	18			18		24	my $self = shift;
735	18					26	my $doc_name = shift;
736							my $vec_hash_ref = $self->{_idf_filter_option} ?
737							$self->{_normalized_doc_vecs}->{$doc_name} :
738	18	50				50	$self->{_corpus_doc_vectors}->{$doc_name};
739	18					29	my @vec = ();
740	18					22	my @qvec = ();
741	18					925	foreach my $word (sort keys %$vec_hash_ref) {
742	1962					2684	push @vec, $vec_hash_ref->{$word};
743	1962					3059	push @qvec, $self->{_query_vector}->{$word};
744							}
745	18					134	my $vec_mag = vec_magnitude(\@vec);
746	18					42	my $qvec_mag = vec_magnitude(\@qvec);
747	18					40	my $product = vec_scalar_product(\@vec, \@qvec);
748	18					32	$product /= $vec_mag * $qvec_mag;
749	18					178	return $product;
750							}
751
752							###################### Relevance Judgments for Testing Purposes #######################
753
754							## IMPORTANT: This estimation of document relevancies to queries is NOT for
755							## serious work. A document is considered to be relevant to a
756							## query if it contains several of the query words. As to the
757							## minimum number of query words that must exist in a document
758							## in order for the latter to be considered relevant is
759							## determined by the relevancy_threshold parameter in the VSM
760							## constructor. (See the relevancy and precision-recall related
761							## scripts in the 'examples' directory.) The reason for why the
762							## function shown below is not for serious work is because
763							## ultimately it is the humans who are the best judges of the
764							## relevancies of documents to queries. The humans bring to
765							## bear semantic considerations on the relevancy determination
766							## problem that are beyond the scope of this module.
767
768							sub estimate_doc_relevancies {
769	0			0	1	0	my $self = shift;
770							die "You did not set the 'query_file' parameter in the constructor"
771	0	0				0	unless $self->{_query_file};
772							open( IN, $self->{_query_file} )
773	0	0				0	or die "unable to open the query file $self->{_query_file}: $!";
774							croak "\n\nYou need to specify a name for the relevancy file in \n" .
775							" in which the relevancy judgments will be dumped."
776	0	0				0	unless $self->{_relevancy_file};
777	0					0	while () {
778	0	0				0	next if /^#/;
779	0	0				0	next if /^[ ]*\r?\n?$/;
780	0					0	$_ =~ s/\r?\n?$//;
781	0	0				0	die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
782	0					0	/^[ ](q[0-9]+):[ ](.*)/;
783	0					0	my $query_label = $1;
784	0					0	my $query = $2;
785	0	0				0	next unless $query;
786	0					0	$self->{_queries_for_relevancy}->{$query_label} = $query;
787							}
788	0	0				0	if ($self->{_debug}) {
789	0					0	foreach (sort keys %{$self->{_queries_for_relevancy}}) {
	0					0
790	0					0	print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
791							}
792							}
793	0					0	$self->{_scan_dir_for_rels} = 1;
794	0					0	$self->_scan_directory($self->{_corpus_directory});
795	0					0	$self->{_scan_dir_for_rels} = 0;
796	0					0	chdir $self->{_working_directory};
797	0	0				0	open(OUT, ">$self->{_relevancy_file}")
798							or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
799	0					0	my @relevancy_list_for_query;
800	0					0	foreach (sort
801	0					0	{get_integer_suffix($a) <=> get_integer_suffix($b)}
802	0					0	keys %{$self->{_relevancy_estimates}}) {
803							@relevancy_list_for_query =
804	0					0	keys %{$self->{_relevancy_estimates}->{$_}};
	0					0
805	0					0	print OUT "$_ => @relevancy_list_for_query\n\n";
806	0					0	print "Number of relevant docs for query $_: " .
807							scalar(@relevancy_list_for_query) . "\n";
808							}
809							}
810
811							# If there are available human-supplied relevancy judgments in a disk
812							# file, use this script to upload that information. One of the scripts
813							# in the 'examples' directory carries out the precision-recall analysis
814							# by using this approach. IMPORTANT: The human-supplied relevancy
815							# judgments must be in a format that is shown in the sample file
816							# relevancy.txt in the 'examples' directory.
817							sub upload_document_relevancies_from_file {
818	0			0	1	0	my $self = shift;
819	0					0	chdir $self->{_working_directory};
820							open( IN, $self->{_relevancy_file} )
821	0	0				0	or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
822	0					0	while () {
823	0	0				0	next if /^#/;
824	0	0				0	next if /^[ ]*\r?\n?$/;
825	0					0	$_ =~ s/\r?\n?$//;
826	0	0				0	die "Format of query file is not correct" unless /^[ ]q[0-9]+[ ]=>/;
827	0					0	/^[ ](q[0-9]+)[ ]=>[ ](.)/;
828	0					0	my $query_label = $1;
829	0					0	my $relevancy_docs_string = $2;
830	0	0				0	next unless $relevancy_docs_string;
831	0					0	my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
832	0					0	my %relevancies = map {$_ => 1} @relevancy_docs;
	0					0
833	0					0	$self->{_relevancy_estimates}->{$query_label} = \%relevancies;
834							}
835	0	0				0	if ($self->{_debug}) {
836	0					0	for (sort keys %{$self->{_relevancy_estimates}}) {
	0					0
837	0					0	my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
	0					0
838	0					0	print "$_ => @rels\n";
839							}
840							}
841							}
842
843							sub display_doc_relevancies {
844	0			0	1	0	my $self = shift;
845							die "You must first estimate or provide the doc relevancies"
846	0	0				0	unless scalar(keys %{$self->{_relevancy_estimates}});
	0					0
847	0					0	print "\nDisplaying relevancy judgments:\n\n";
848	0					0	foreach my $query (sort keys %{$self->{_relevancy_estimates}}) {
	0					0
849	0					0	print "Query $query\n";
850	0					0	foreach my $file (sort {
851							$self->{_relevancy_estimates}->{$query}->{$b}
852							<=>
853	0					0	$self->{_relevancy_estimates}->{$query}->{$a}
854							}
855	0					0	keys %{$self->{_relevancy_estimates}->{$query}}){
856	0					0	print " $file => $self->{_relevancy_estimates}->{$query}->{$file}\n";
857							}
858							}
859							}
860
861							sub _scan_file_for_rels {
862	0			0		0	my $self = shift;
863	0					0	my $file = shift;
864	0					0	open IN, $file;
865	0					0	my @all_text = ;
866	0					0	@all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
	0					0
	0					0
867	0					0	my $all_text = join ' ', @all_text;
868	0					0	foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
	0					0
869	0					0	my $count = 0;
870							my @query_words = grep $_,
871	0					0	split /\s+/, $self->{_queries_for_relevancy}->{$query};
872	0	0				0	print "Query words for $query: @query_words\n" if $self->{_debug};
873	0					0	foreach my $word (@query_words) {
874	0					0	my @matches = $all_text =~ /$word/gi;
875							print "Number of occurrences for word '$word' in file $file: " .
876	0	0				0	scalar(@matches) . "\n" if $self->{_debug};
877	0	0				0	$count += @matches if @matches;
878							}
879							print "\nRelevancy count for query $query and file $file: $count\n\n"
880	0	0				0	if $self->{_debug};
881							$self->{_relevancy_estimates}->{$query}->{$file} = $count
882	0	0				0	if $count >= $self->{_relevancy_threshold};
883							}
884							}
885
886							######################### Calculate Precision versus Recall ##########################
887
888							sub precision_and_recall_calculator {
889	0			0	1	0	my $self = shift;
890	0					0	my $retrieval_type = shift;
891	0	0				0	die "You must specify the retrieval type through an argument to the method " .
892							"precision_and_recall_calculator(). The retrieval type must either be 'vsm' " .
893							"or 'lsa' \n" unless $retrieval_type;
894							die "You must first estimate or provide the doc relevancies"
895	0	0				0	unless scalar(keys %{$self->{_relevancy_estimates}});
	0					0
896	0	0				0	unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
	0					0
897							open( IN, $self->{_query_file})
898	0	0				0	or die "unable to open the query file $self->{_query_file}: $!";
899	0					0	while () {
900	0	0				0	next if /^#/;
901	0	0				0	next if /^[ ]*\r?\n?$/;
902	0					0	$_ =~ s/\r?\n?$//;
903	0	0				0	die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
904	0					0	/^[ ](q[0-9]+):[ ](.*)/;
905	0					0	my $query_label = $1;
906	0					0	my $query = $2;
907	0	0				0	next unless $query;
908	0					0	$self->{_queries_for_relevancy}->{$query_label} = $query;
909							}
910	0	0				0	if ($self->{_debug}) {
911	0					0	print "\n\nDisplaying queries in the query file:\n\n";
912	0					0	foreach (sort keys %{$self->{_queries_for_relevancy}}) {
	0					0
913	0					0	print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
914							}
915							}
916							}
917	0					0	foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
	0					0
918							print "\n\n====================================== query: $query ========================================\n\n"
919	0	0				0	if $self->{_debug};
920	0	0				0	print "\n\n\nQuery $query:\n" if $self->{_debug};
921							my @query_words = grep $_,
922	0					0	split /\s+/, $self->{_queries_for_relevancy}->{$query};
923	0	0				0	croak "\n\nYou have not specified the retrieval type for " .
924							"precision-recall calculation. See code in 'examples'" .
925							"directory:" if !defined $retrieval_type;
926	0					0	my $retrievals;
927	0					0	eval {
928	0	0				0	if ($retrieval_type eq 'vsm') {
		0
929	0					0	$retrievals = $self->retrieve_with_vsm( \@query_words );
930							} elsif ($retrieval_type eq 'lsa') {
931	0					0	$retrievals = $self->retrieve_with_lsa( \@query_words );
932							}
933							};
934	0	0				0	if ($@) {
935	0					0	warn "\n\nNo relevant docs found for query $query.\n" .
936							"Will skip over this query for precision and\n" .
937							"recall calculations\n\n";
938	0					0	next;
939							}
940	0					0	my %ranked_retrievals;
941	0					0	my $i = 1;
942	0					0	foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
	0					0
943							keys %$retrievals) {
944	0					0	$ranked_retrievals{$i++} = $_;
945							}
946	0	0				0	if ($self->{_debug}) {
947	0					0	print "\n\nDisplaying ranked retrievals for query $query:\n\n";
948	0					0	foreach (sort {$a <=> $b} keys %ranked_retrievals) {
	0					0
949	0					0	print "$_ => $ranked_retrievals{$_}\n";
950							}
951							}
952							# At this time, ranking of relevant documents based on their
953							# relevancy counts serves no particular purpose since all we want
954							# for the calculation of Precision and Recall are the total
955							# number of relevant documents. However, I believe such a
956							# ranking will play an important role in the future.
957							# IMPORTANT: The relevancy judgments are ranked only when
958							# estimated by the method estimate_doc_relevancies()
959							# of the VSM class. When relevancies are supplied
960							# directly through a disk file, they all carry the
961							# same rank.
962	0					0	my %ranked_relevancies;
963	0					0	$i = 1;
964	0					0	foreach my $file (sort {
965							$self->{_relevancy_estimates}->{$query}->{$b}
966							<=>
967	0					0	$self->{_relevancy_estimates}->{$query}->{$a}
968							}
969	0					0	keys %{$self->{_relevancy_estimates}->{$query}}) {
970	0					0	$ranked_relevancies{$i++} = $file;
971							}
972	0	0				0	if ($self->{_debug}) {
973	0					0	print "\n\nDisplaying ranked relevancies for query $query:\n\n";
974	0					0	foreach (sort {$a <=> $b} keys %ranked_relevancies) {
	0					0
975	0					0	print "$_ => $ranked_relevancies{$_}\n";
976							}
977							}
978	0					0	my @relevant_set = values %ranked_relevancies;
979	0	0				0	warn "\n\nNo relevant docs found for query $query.\n" .
980							"Will skip over this query for precision and\n" .
981							"recall calculations\n\n" unless @relevant_set;
982	0	0				0	next unless @relevant_set;
983							print "\n\nRelevant set for query $query: @relevant_set\n\n"
984	0	0				0	if $self->{_debug};
985							# @retrieved is just to find out HOW MANY docs are retrieved. So no sorting needed.
986	0					0	my @retrieved;
987	0					0	foreach (keys %ranked_retrievals) {
988	0					0	push @retrieved, $ranked_retrievals{$_};
989							}
990							print "\n\nRetrieved items (in no particular order) for query $query: @retrieved\n\n"
991	0	0				0	if $self->{_debug};
992	0					0	my @Precision_values = ();
993	0					0	my @Recall_values = ();
994	0					0	my $rank = 1;
995	0					0	while ($rank < @retrieved + 1) {
996	0					0	my $index = 1;
997	0					0	my @retrieved_at_rank = ();
998	0					0	while ($index <= $rank) {
999	0					0	push @retrieved_at_rank, $ranked_retrievals{$index};
1000	0					0	$index++;
1001							}
1002	0					0	my $intersection =set_intersection(\@retrieved_at_rank,
1003							\@relevant_set);
1004	0	0				0	my $precision_at_rank = @retrieved_at_rank ?
1005							(@$intersection / @retrieved_at_rank) : 0;
1006	0					0	push @Precision_values, $precision_at_rank;
1007	0					0	my $recall_at_rank = @$intersection / @relevant_set;
1008	0					0	push @Recall_values, $recall_at_rank;
1009	0					0	$rank++;
1010							}
1011							print "\n\nFor query $query, precision values: @Precision_values\n"
1012	0	0				0	if $self->{_debug};
1013							print "\nFor query $query, recall values: @Recall_values\n"
1014	0	0				0	if $self->{_debug};
1015	0					0	$self->{_precision_for_queries}->{$query} = \@Precision_values;
1016	0					0	my $avg_precision;
1017	0					0	$avg_precision += $_ for @Precision_values;
1018	0					0	$self->{_avg_precision_for_queries}->{$query} += $avg_precision / (1.0 * @Precision_values);
1019	0					0	$self->{_recall_for_queries}->{$query} = \@Recall_values;
1020							}
1021							print "\n\n========= query by query processing for Precision vs. Recall calculations finished ========\n\n"
1022	0	0				0	if $self->{_debug};
1023	0					0	my @avg_precisions;
1024	0					0	foreach (keys %{$self->{_avg_precision_for_queries}}) {
	0					0
1025	0					0	push @avg_precisions, $self->{_avg_precision_for_queries}->{$_};
1026							}
1027	0					0	$self->{_map} += $_ for @avg_precisions;
1028	0					0	$self->{_map} /= scalar keys %{$self->{_queries_for_relevancy}};
	0					0
1029							}
1030
1031							sub display_average_precision_for_queries_and_map {
1032	0			0	1	0	my $self = shift;
1033							die "You must first invoke precision_and_recall_calculator function"
1034	0	0				0	unless scalar(keys %{$self->{_avg_precision_for_queries}});
	0					0
1035	0					0	print "\n\nDisplaying average precision for different queries:\n\n";
1036	0					0	foreach my $query (sort
1037	0					0	{get_integer_suffix($a) <=> get_integer_suffix($b)}
1038	0					0	keys %{$self->{_avg_precision_for_queries}}) {
1039							my $output = sprintf "Query %s => %.3f",
1040	0					0	$query, $self->{_avg_precision_for_queries}->{$query};
1041	0					0	print "$output\n";
1042							}
1043	0					0	print "\n\nMAP value: $self->{_map}\n\n";
1044							}
1045
1046							sub display_precision_vs_recall_for_queries {
1047	0			0	1	0	my $self = shift;
1048							die "You must first invoke precision_and_recall_calculator function"
1049	0	0				0	unless scalar(keys %{$self->{_precision_for_queries}});
	0					0
1050	0					0	print "\n\nDisplaying precision and recall values for different queries:\n\n";
1051	0					0	foreach my $query (sort
1052	0					0	{get_integer_suffix($a) <=> get_integer_suffix($b)}
1053	0					0	keys %{$self->{_avg_precision_for_queries}}) {
1054	0					0	print "\n\nQuery $query:\n";
1055	0					0	print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
1056	0					0	my @precision_vals = @{$self->{_precision_for_queries}->{$query}};
	0					0
1057	0					0	@precision_vals = map {sprintf "%.3f", $_} @precision_vals;
	0					0
1058	0					0	print " Precision at rank => @precision_vals\n";
1059	0					0	my @recall_vals = @{$self->{_recall_for_queries}->{$query}};
	0					0
1060	0					0	@recall_vals = map {sprintf "%.3f", $_} @recall_vals;
	0					0
1061	0					0	print "\n Recall at rank => @recall_vals\n";
1062							}
1063	0					0	print "\n\n";
1064							}
1065
1066							sub get_query_sorted_average_precision_for_queries {
1067	0			0	1	0	my $self = shift;
1068							die "You must first invoke precision_and_recall_calculator function"
1069	0	0				0	unless scalar(keys %{$self->{_avg_precision_for_queries}});
	0					0
1070	0					0	my @average_precisions_for_queries = ();
1071	0					0	foreach my $query (sort
1072	0					0	{get_integer_suffix($a) <=> get_integer_suffix($b)}
1073	0					0	keys %{$self->{_avg_precision_for_queries}}) {
1074	0					0	my $output = sprintf "%.3f", $self->{_avg_precision_for_queries}->{$query};
1075	0					0	push @average_precisions_for_queries, $output;
1076							}
1077	0					0	return \@average_precisions_for_queries;
1078							}
1079
1080							################################### Utility Routines ###################################
1081
1082							sub _check_for_illegal_params {
1083	2			2		7	my @params = @_;
1084	2					26	my @legal_params = qw / corpus_directory
1085							corpus_vocab_db
1086							doc_vectors_db
1087							normalized_doc_vecs_db
1088							use_idf_filter
1089							stop_words_file
1090							file_types
1091							case_sensitive
1092							max_number_retrievals
1093							query_file
1094							relevancy_file
1095							min_word_length
1096							want_stemming
1097							lsa_svd_threshold
1098							relevancy_threshold
1099							break_camelcased_and_underscored
1100							save_model_on_disk
1101							debug
1102							/;
1103	2					4	my $found_match_flag;
1104	2					4	foreach my $param (@params) {
1105	17					26	foreach my $legal (@legal_params) {
1106	102					116	$found_match_flag = 0;
1107	102	100				232	if ($param eq $legal) {
1108	17					23	$found_match_flag = 1;
1109	17					23	last;
1110							}
1111							}
1112	17	50				41	last if $found_match_flag == 0;
1113							}
1114	2					10	return $found_match_flag;
1115							}
1116
1117							# checks whether an element is in an array:
1118							sub contained_in {
1119	40			40	0	68	my $ele = shift;
1120	40					178	my @array = @_;
1121	40					99	my $count = 0;
1122	40	100				78	map {$count++ if $ele eq $_} @array;
	80					440
1123	40					278	return $count;
1124							}
1125
1126							# Meant only for an un-nested hash:
1127							sub deep_copy_hash {
1128	20			20	0	31	my $ref_in = shift;
1129	20					46	my $ref_out = {};
1130	20					34	foreach ( keys %{$ref_in} ) {
	20					1324
1131	2180					4245	$ref_out->{$_} = $ref_in->{$_};
1132							}
1133	20					1001	return $ref_out;
1134							}
1135
1136							sub vec_scalar_product {
1137	66			66	0	85	my $vec1 = shift;
1138	66					92	my $vec2 = shift;
1139	66	50				167	croak "Something is wrong --- the two vectors are of unequal length"
1140							unless @$vec1 == @$vec2;
1141	66					68	my $product;
1142	66					134	for my $i (0..@$vec1-1) {
1143	7194					10516	$product += $vec1->[$i] * $vec2->[$i];
1144							}
1145	66					130	return $product;
1146							}
1147
1148							sub vec_magnitude {
1149	108			108	0	144	my $vec = shift;
1150	108					129	my $mag_squared = 0;
1151	108					185	foreach my $num (@$vec) {
1152	11772					17491	$mag_squared += $num ** 2;
1153							}
1154	108					212	return sqrt $mag_squared;
1155							}
1156
1157							sub sum {
1158	2			2	0	5	my $vec = shift;
1159	2					3	my $result;
1160	2					9	for my $item (@$vec) {
1161	218					297	$result += $item;
1162							}
1163	2					6	return $result;
1164							}
1165
1166							sub simple_stemmer {
1167	3346			3346	0	5310	my $word = shift;
1168	3346					4952	my $debug = shift;
1169	3346	50				7034	print "\nStemming the word: $word\n" if $debug;
1170	3346					10255	$word =~ s/(.*[a-z][^aeious])s$/$1/i;
1171	3346					8246	$word =~ s/(.*[a-z]s)es$/$1/i;
1172	3346					7671	$word =~ s/(.*[a-z][ck])es$/$1e/i;
1173	3346					9771	$word =~ s/(.*[a-z]+)tions$/$1tion/i;
1174	3346					9045	$word =~ s/(.*[a-z]+)mming$/$1m/i;
1175	3346					13125	$word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1176	3346					9297	$word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1177	3346					8535	$word =~ s/(.*[a-z]+)tices$/$1tex/i;
1178	3346					11071	$word =~ s/(.*[a-z]+)pes$/$1pe/i;
1179	3346					10753	$word =~ s/(.*[a-z]+)sed$/$1se/i;
1180	3346					11089	$word =~ s/(.*[a-z]+)ed$/$1/i;
1181	3346					7270	$word =~ s/(.*[a-z]+)tation$/$1t/i;
1182	3346	50				6552	print "Stemmed word: $word\n\n" if $debug;
1183	3346					11969	return $word;
1184							}
1185
1186							# Assumes the array is sorted in a descending order, as would be the
1187							# case with an array of singular values produced by an SVD algorithm
1188							sub return_index_of_last_value_above_threshold {
1189	1			1	0	5	my $pdl_obj = shift;
1190	1					7	my $size = $pdl_obj->getdim(0);
1191	1					4	my $threshold = shift;
1192	1					13	my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
1193	1					72	my $i = 0;
1194	1		66			11	while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) {$i++;}
	10					324
1195	1					6	return $i-1;
1196							}
1197
1198							sub set_intersection {
1199	0			0	0		my $set1 = shift;
1200	0						my $set2 = shift;
1201	0						my %hset1 = map {$_ => 1} @$set1;
	0
1202	0						my @common_elements = grep {$hset1{$_}} @$set2;
	0
1203	0	0					return @common_elements ? \@common_elements : [];
1204							}
1205
1206							sub get_integer_suffix {
1207	0			0	0		my $label = shift;
1208	0						$label =~ /(\d*)$/;
1209	0						return $1;
1210							}
1211
1212							1;
1213
1214							=pod
1215
1216							=head1 NAME
1217
1218							Algorithm::VSM --- A Perl module for retrieving files and documents from a software
1219							library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
1220							algorithms in response to search words and phrases.
1221
1222							=head1 SYNOPSIS
1223
1224							# FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:
1225
1226							use Algorithm::VSM;
1227
1228							my $corpus_dir = "corpus";
1229							my @query = qw/ program ListIterator add ArrayList args /;
1230							my $stop_words_file = "stop_words.txt";
1231							my $vsm = Algorithm::VSM->new(
1232							break_camelcased_and_underscored => 1,
1233							case_sensitive => 0,
1234							corpus_directory => $corpus_dir,
1235							file_types => ['.txt', '.java'],
1236							max_number_retrievals => 10,
1237							min_word_length => 4,
1238							stop_words_file => $stop_words_file,
1239							use_idf_filter => 1,
1240							want_stemming => 1,
1241							);
1242							$vsm->get_corpus_vocabulary_and_word_counts();
1243							$vsm->display_corpus_vocab();
1244							$vsm->display_corpus_vocab_size();
1245							$vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
1246							$vsm->display_inverse_document_frequencies();
1247							$vsm->generate_document_vectors();
1248							$vsm->display_doc_vectors();
1249							$vsm->display_normalized_doc_vectors();
1250							my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
1251							$vsm->display_retrievals( $retrievals );
1252
1253							The purpose of each constructor option and what is accomplished by the method
1254							calls should be obvious by their names. If not, they are explained in greater
1255							detail elsewhere in this documentation page. Note that the methods
1256							display_corpus_vocab() and display_doc_vectors() are there only for testing
1257							purposes with small corpora. If you must use them for large libraries/corpora,
1258							you might wish to redirect the output to a file.
1259
1260							By default, a call to a constructor calculates normalized term-frequency vectors
1261							for the documents. Normalization consists of first calculating the term
1262							frequency tf(t) of a term t in a document as a proportion of the total numbers
1263							of words in the document and then multiplying it by idf(t), where idf(t) stands
1264							for the inverse document frequency associated with that term. Note that 'word'
1265							and 'term' mean the same thing.
1266
1267
1268
1269							# FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:
1270
1271							my $lsa = Algorithm::VSM->new(
1272							break_camelcased_and_underscored => 1,
1273							case_sensitive => 0,
1274							corpus_directory => $corpus_dir,
1275							file_types => ['.txt', '.java'],
1276							lsa_svd_threshold => 0.01,
1277							max_number_retrievals => 10,
1278							min_word_length => 4,
1279							stop_words_file => $stop_words_file,
1280							use_idf_filter => 1,
1281							want_stemming => 1,
1282							);
1283							$lsa->get_corpus_vocabulary_and_word_counts();
1284							$lsa->display_corpus_vocab();
1285							$lsa->display_corpus_vocab_size();
1286							$lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
1287							$lsa->generate_document_vectors();
1288							$lsa->construct_lsa_model();
1289							my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
1290							$lsa->display_retrievals( $retrievals );
1291
1292							The initialization code before the constructor call and the calls for displaying
1293							the vocabulary and the vectors after the call remain the same as for the VSM case
1294							shown previously in this Synopsis. In the call above, the constructor parameter
1295							'lsa_svd_threshold' determines how many of the singular values will be retained
1296							after we have carried out an SVD decomposition of the term-frequency matrix for
1297							the documents in the corpus. Singular values smaller than this threshold
1298							fraction of the largest value are rejected.
1299
1300
1301
1302							# FOR MEASURING PRECISION VERSUS RECALL FOR VSM:
1303
1304							my $corpus_dir = "corpus";
1305							my $stop_words_file = "stop_words.txt";
1306							my $query_file = "test_queries.txt";
1307							my $relevancy_file = "relevancy.txt"; # All relevancy judgments
1308							# will be stored in this file
1309							my $vsm = Algorithm::VSM->new(
1310							break_camelcased_and_underscored => 1,
1311							case_sensitive => 0,
1312							corpus_directory => $corpus_dir,
1313							file_types => ['.txt', '.java'],
1314							min_word_length => 4,
1315							query_file => $query_file,
1316							relevancy_file => $relevancy_file,
1317							relevancy_threshold => 5,
1318							stop_words_file => $stop_words_file,
1319							want_stemming => 1,
1320							);
1321							$vsm->get_corpus_vocabulary_and_word_counts();
1322							$vsm->generate_document_vectors();
1323							$vsm->estimate_doc_relevancies();
1324							$vsm->display_doc_relevancies(); # used only for testing
1325							$vsm->precision_and_recall_calculator('vsm');
1326							$vsm->display_precision_vs_recall_for_queries();
1327							$vsm->display_average_precision_for_queries_and_map();
1328
1329							Measuring precision and recall requires a set of queries. These are supplied
1330							through the constructor parameter 'query_file'. The format of the this file
1331							must be according to the sample file 'test_queries.txt' in the 'examples'
1332							directory. The module estimates the relevancies of the documents to the
1333							queries and dumps the relevancies in a file named by the 'relevancy_file'
1334							constructor parameter. The constructor parameter 'relevancy_threshold' is used
1335							to decide which of the documents are considered to be relevant to a query. A
1336							document must contain at least the 'relevancy_threshold' occurrences of query
1337							words in order to be considered relevant to a query.
1338
1339
1340
1341							# FOR MEASURING PRECISION VERSUS RECALL FOR LSA:
1342
1343							my $lsa = Algorithm::VSM->new(
1344							break_camelcased_and_underscored => 1,
1345							case_sensitive => 0,
1346							corpus_directory => $corpus_dir,
1347							file_types => ['.txt', '.java'],
1348							lsa_svd_threshold => 0.01,
1349							min_word_length => 4,
1350							query_file => $query_file,
1351							relevancy_file => $relevancy_file,
1352							relevancy_threshold => 5,
1353							stop_words_file => $stop_words_file,
1354							want_stemming => 1,
1355							);
1356							$lsa->get_corpus_vocabulary_and_word_counts();
1357							$lsa->generate_document_vectors();
1358							$lsa->construct_lsa_model();
1359							$lsa->estimate_doc_relevancies();
1360							$lsa->display_doc_relevancies();
1361							$lsa->precision_and_recall_calculator('lsa');
1362							$lsa->display_precision_vs_recall_for_queries();
1363							$lsa->display_average_precision_for_queries_and_map();
1364
1365							We have already explained the purpose of the constructor parameter 'query_file'
1366							and about the constraints on the format of queries in the file named through
1367							this parameter. As mentioned earlier, the module estimates the relevancies of
1368							the documents to the queries and dumps the relevancies in a file named by the
1369							'relevancy_file' constructor parameter. The constructor parameter
1370							'relevancy_threshold' is used in deciding which of the documents are considered
1371							to be relevant to a query. A document must contain at least the
1372							'relevancy_threshold' occurrences of query words in order to be considered
1373							relevant to a query. We have previously explained the role of the constructor
1374							parameter 'lsa_svd_threshold'.
1375
1376
1377
1378							# FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:
1379
1380							my $corpus_dir = "corpus";
1381							my $stop_words_file = "stop_words.txt";
1382							my $query_file = "test_queries.txt";
1383							my $relevancy_file = "relevancy.txt";
1384							my $vsm = Algorithm::VSM->new(
1385							break_camelcased_and_underscored => 1,
1386							case_sensitive => 0,
1387							corpus_directory => $corpus_dir,
1388							file_types => ['.txt', '.java'],
1389							min_word_length => 4,
1390							query_file => $query_file,
1391							relevancy_file => $relevancy_file,
1392							stop_words_file => $stop_words_file,
1393							want_stemming => 1,
1394							);
1395							$vsm->get_corpus_vocabulary_and_word_counts();
1396							$vsm->generate_document_vectors();
1397							$vsm->upload_document_relevancies_from_file();
1398							$vsm->display_doc_relevancies();
1399							$vsm->precision_and_recall_calculator('vsm');
1400							$vsm->display_precision_vs_recall_for_queries();
1401							$vsm->display_average_precision_for_queries_and_map();
1402
1403							Now the filename supplied through the constructor parameter 'relevancy_file' must
1404							contain relevance judgments for the queries that are named in the file supplied
1405							through the parameter 'query_file'. The format of these two files must be
1406							according to what is shown in the sample files 'test_queries.txt' and
1407							'relevancy.txt' in the 'examples' directory.
1408
1409
1410
1411							# FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:
1412
1413							my $corpus_dir = "corpus";
1414							my $stop_words_file = "stop_words.txt";
1415							my $query_file = "test_queries.txt";
1416							my $relevancy_file = "relevancy.txt";
1417							my $lsa = Algorithm::VSM->new(
1418							break_camelcased_and_underscored => 1,
1419							case_sensitive => 0,
1420							corpus_directory => $corpus_dir,
1421							file_types => ['.txt', '.java'],
1422							lsa_svd_threshold => 0.01,
1423							min_word_length => 4,
1424							query_file => $query_file,
1425							relevancy_file => $relevancy_file,
1426							stop_words_file => $stop_words_file,
1427							want_stemming => 1,
1428							);
1429							$lsa->get_corpus_vocabulary_and_word_counts();
1430							$lsa->generate_document_vectors();
1431							$lsa->upload_document_relevancies_from_file();
1432							$lsa->display_doc_relevancies();
1433							$lsa->precision_and_recall_calculator('vsm');
1434							$lsa->display_precision_vs_recall_for_queries();
1435							$lsa->display_average_precision_for_queries_and_map();
1436
1437							As mentioned for the previous code block, the filename supplied through the
1438							constructor parameter 'relevancy_file' must contain relevance judgments for the
1439							queries that are named in the file supplied through the parameter 'query_file'.
1440							The format of this file must be according to what is shown in the sample file
1441							'relevancy.txt' in the 'examples' directory. We have already explained the roles
1442							played by the constructor parameters such as 'lsa_svd_threshold'.
1443
1444
1445
1446							# FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:
1447
1448							my $corpus_dir = "corpus";
1449							my $stop_words_file = "stop_words.txt";
1450							my $vsm = Algorithm::VSM->new(
1451							break_camelcased_and_underscored => 1,
1452							case_sensitive => 0,
1453							corpus_directory => $corpus_dir,
1454							file_types => ['.txt', '.java'],
1455							min_word_length => 4,
1456							stop_words_file => $stop_words_file,
1457							want_stemming => 1,
1458							);
1459							$vsm->get_corpus_vocabulary_and_word_counts();
1460							$vsm->generate_document_vectors();
1461							# code for calculating pairwise similarities as shown in the
1462							# script calculate_similarity_matrix_for_all_docs.pl in the
1463							# examples directory. This script makes calls to
1464							#
1465							# $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
1466							#
1467							# for every pair of documents.
1468
1469							=head1 CHANGES
1470
1471							Version 1.70: All of the changes made in this version affect only that part of the
1472							module that is used for calculating precision-vs.-recall curve for the estimation of
1473							MAP (Mean Average Precision). The new formulas that go into estimating MAP are
1474							presented in the author's tutorial on significance testing. Additionally, when
1475							estimating the average retrieval precision for a query, this version explicitly
1476							disregards all documents that have zero similarity with the query.
1477
1478							Version 1.62 removes the Perl version restriction on the module. This version also
1479							fixes two bugs, one in the file scanner code and the other in the
1480							precision-and-recall calculator. The file scanner bug was related to the new
1481							constructor parameter C that was introduced in Version 1.61. And the
1482							precision-and-recall calculator bug was triggered if a query consisted solely of
1483							non-vocabulary words.
1484
1485							Version 1.61 improves the implementation of the directory scanner to make it more
1486							platform independent. Additionally, you are now required to specify in the
1487							constructor call the file types to be considered for computing the database model.
1488							If, say, you have a large software library and you want only Java and text files to
1489							be scanned for creating the VSM (or the LSA) model, you must supply that information
1490							to the module by setting the constructor parameter C to the anonymous
1491							list C<['.java', '.txt']>. An additional constructor parameter introduced in this
1492							version is C. If you set it to 1, that will force the database model
1493							and query matching to become case sensitive.
1494
1495							Version 1.60 reflects the fact that people are now more likely to use this module by
1496							keeping the model constructed for a corpus in the fast memory (as opposed to storing
1497							the models in disk-based hash tables) for its repeated invocation for different
1498							queries. As a result, the default value for the constructor option
1499							C was changed from 1 to 0. For those who still wish to store on
1500							a disk the model that is constructed, the script
1501							C shows how you can do that.
1502							Other changes in 1.60 include a slight reorganization of the scripts in the
1503							C directory. Most scripts now do not by default store their models in
1504							disk-based hash tables. This reorganization is reflected in the description of the
1505							C directory in this documentation. The basic logic of constructing VSM and
1506							LSA models and how these are used for retrievals remains unchanged.
1507
1508							Version 1.50 incorporates a couple of new features: (1) You now have the option to
1509							split camel-cased and underscored words for constructing your vocabulary set; and (2)
1510							Storing the VSM and LSA models in database files on the disk is now optional. The
1511							second feature, in particular, should prove useful to those who are using this module
1512							for large collections of documents.
1513
1514							Version 1.42 includes two new methods, C and
1515							C, for those folks who deal with very large datasets.
1516							You can get a better sense of the overall vocabulary being used by the module for
1517							file retrieval by examining the contents of a dump file whose name is supplied as an
1518							argument to C.
1519
1520							Version 1.41 downshifts the required version of the PDL module. Also cleaned up are
1521							the dependencies between this module and the submodules of PDL.
1522
1523							Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
1524							documents in the corpus. The elements of such a matrix express pairwise similarities
1525							between the documents. The pairwise similarities are based on the dot product of two
1526							document vectors divided by the product of the vector magnitudes. The 'examples'
1527							directory contains two scripts to illustrate how such matrices can be calculated by
1528							the user. The similarity matrix is output as a CSV file.
1529
1530							Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
1531							document file. What that means is that the words that appear in most of the documents
1532							get reduced weighting since such words are non-discriminatory with respect to the
1533							retrieval of the documents. A typical formula that is used to calculate the IDF
1534							weight for a word is the logarithm of the ratio of the total number of documents to
1535							the number of documents in which the word appears. So if a word were to appear in
1536							all the documents, its IDF multiplier would be zero in the vector representation of a
1537							document. If so desired, you can turn off the IDF weighting of the words by
1538							explicitly setting the constructor parameter C to zero.
1539
1540							Version 1.2 includes a code correction and some general code and documentation
1541							cleanup.
1542
1543							With Version 1.1, you can access the retrieval precision results so that you can
1544							compare two different retrieval algorithms (VSM or LSA with different choices for
1545							some of the constructor parameters) with significance testing. (Version 1.0 merely
1546							sent those results to standard output, typically your terminal window.) In Version
1547							1.1, the new script B in the 'examples' directory
1548							illustrates significance testing with Randomization and with Student's Paired t-Test.
1549
1550							=head1 DESCRIPTION
1551
1552							B is a I module for constructing a Vector Space Model (VSM) or
1553							a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
1554							to as a corpus, and then retrieving the documents in response to search words in a
1555							query.
1556
1557							VSM and LSA models have been around for a long time in the Information Retrieval (IR)
1558							community. More recently such models have been shown to be effective in retrieving
1559							files/documents from software libraries. For an account of this research that was
1560							presented by Shivani Rao and the author of this module at the 2011 Mining Software
1561							Repositories conference, see L.
1562
1563							VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2)
1564							Stemming the words so extracted and eliminating the designated stop words from the
1565							vocabulary. Stemming means that closely related words like 'programming' and
1566							'programs' are reduced to the common root word 'program' and the stop words are the
1567							non-discriminating words that can be expected to exist in virtually all the
1568							documents. (3) Constructing document vectors for the individual files in the corpus
1569							--- the document vectors taken together constitute what is usually referred to as a
1570							'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
1571							factor out the effect of document size and, if desired, multiplying the term
1572							frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
1573							the weight of the words that appear in a large number of documents. (5) Constructing
1574							a query vector for the search query after the query is subject to the same stemming
1575							and stop-word elimination rules that were applied to the corpus. And, lastly, (6)
1576							Using a similarity metric to return the set of documents that are most similar to the
1577							query vector. The commonly used similarity metric is one based on the cosine
1578							distance between two vectors. Also note that all the vectors mentioned here are of
1579							the same size, the size of the vocabulary. An element of a vector is the frequency
1580							of occurrence of the word corresponding to that position in the vector.
1581
1582							LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one
1583							step further by subjecting the term-frequency matrix for the corpus to singular value
1584							decomposition (SVD). By retaining only a subset of the singular values (usually the
1585							N largest for some value of N), you can construct reduced-dimensionality vectors for
1586							the documents and the queries. In VSM, as mentioned above, the size of the document
1587							and the query vectors is equal to the size of the vocabulary. For large corpora,
1588							this size may involve tens of thousands of words --- this can slow down the VSM
1589							modeling and retrieval process. So you are very likely to get faster performance
1590							with retrieval based on LSA modeling, especially if you store the model once
1591							constructed in a database file on the disk and carry out retrievals using the
1592							disk-based model.
1593
1594
1595							=head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?
1596
1597							This module has only been tested for software retrieval. For more general text
1598							retrieval, you would need to replace the simple stemmer used in the module by one
1599							based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the
1600							list of stop words appropriate to the text corpora of interest to you. As previously
1601							mentioned, the stop words are the commonly occurring words that do not carry much
1602							discriminatory power from the standpoint of distinguishing between the documents.
1603							See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
1604							formatted.
1605
1606
1607							=head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?
1608
1609							It is not uncommon for large software libraries to consist of tens of thousands of
1610							documents that include source-code files, documentation files, README files,
1611							configuration files, etc. The bug-localization work presented recently by Shivani
1612							Rao and this author at the 2011 Mining Software Repository conference (MSR11) was
1613							based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
1614							size of 7553 unique words. (Here is a link to this work:
1615							L. Also note that the iBUGS dataset
1616							was originally put together by V. Dallmeier and T. Zimmermann for the evaluation of
1617							automated bug detection and localization tools.) If C is the size of the
1618							vocabulary and C the number of the documents in the corpus, the size of each
1619							vector will be C and size of the term-frequency matrix for the entire corpus will
1620							be CxC. So if you were to duplicate the bug localization experiments in
1621							L you would be dealing with vectors of
1622							size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers
1623							to really large libraries/corpora, we are obviously talking about very large matrices
1624							for SVD decomposition. For large libraries/corpora, it would be best to store away
1625							the model in a disk file and to base all subsequent retrievals on the disk-stored
1626							models. The 'examples' directory contains scripts that carry out retrievals on the
1627							basis of disk-based models. Further speedup in retrieval can be achieved by using
1628							LSA to create reduced-dimensionality representations for the documents and by basing
1629							retrievals on the stored versions of such reduced-dimensionality representations.
1630
1631
1632							=head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS
1633
1634							The performance of a retrieval algorithm is typically measured by two properties:
1635							C and C. As mentioned in my tutorial
1636							L, at a given
1637							rank C, C is the ratio of the number of retrieved documents that are
1638							relevant to the total number of retrieved documents up to that rank. And, along the
1639							same lines, C at a given rank C is the ratio of the number of retrieved
1640							documents that are relevant to the total number of relevant documents. The Average
1641							Precision associated with a query is the average of all the Precision-at-rank values
1642							for all the documents relevant to that query. When query specific Average Precision
1643							is averaged over all the queries, you get Mean Average Precision (MAP) as a
1644							single-number characterizer of the retrieval power of an algorithm for a given
1645							corpus. For an oracle, the value of MAP should be 1.0. On the other hand, for
1646							purely random retrieval from a corpus, the value of MAP will be inversely
1647							proportional to the size of the corpus. (See the discussion in
1648							L for further
1649							explanation on these retrieval precision evaluators.)
1650
1651							This module includes methods that allow you to carry out these retrieval accuracy
1652							measurements using the relevancy judgments supplied through a disk file. If
1653							human-supplied relevancy judgments are not available, the module will be happy to
1654							estimate relevancies for you just by determining the number of query words that exist
1655							in a document. Note, however, that relevancy judgments estimated in this manner
1656							cannot be trusted. That is because ultimately it is the humans who are the best
1657							judges of the relevancies of documents to queries. The humans bring to bear semantic
1658							considerations on the relevancy determination problem that are beyond the scope of
1659							this module.
1660
1661
1662							=head1 METHODS
1663
1664							The module provides the following methods for constructing VSM and LSA models of a
1665							corpus, for using the models thus constructed for retrieval, and for carrying out
1666							precision versus recall calculations for the determination of retrieval accuracy on
1667							the corpora of interest to you.
1668
1669							=over
1670
1671							=item B
1672
1673							A call to C constructs a new instance of the C class:
1674
1675							my $vsm = Algorithm::VSM->new(
1676							break_camelcased_and_underscored => 1,
1677							case_sensitive => 0,
1678							corpus_directory => "",
1679							corpus_vocab_db => "corpus_vocab_db",
1680							doc_vectors_db => "doc_vectors_db",
1681							file_types => $my_file_types,
1682							lsa_svd_threshold => 0.01,
1683							max_number_retrievals => 10,
1684							min_word_length => 4,
1685							normalized_doc_vecs_db => "normalized_doc_vecs_db",
1686							query_file => "",
1687							relevancy_file => $relevancy_file,
1688							relevancy_threshold => 5,
1689							save_model_on_disk => 0,
1690							stop_words_file => "",
1691							use_idf_filter => 1,
1692							want_stemming => 1,
1693							);
1694
1695							The values shown on the right side of the big arrows are the B
1696							parameters>. The value supplied through the variable C<$my_file_types> would be
1697							something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
1698							be included in creating the database model. The following nested list will now
1699							describe each of the constructor parameters shown above:
1700
1701							=over 16
1702
1703							=item I
1704
1705							The parameter B when set causes the
1706							underscored and camel-cased words to be split. By default the parameter is
1707							set. So if you don't want such words to be split, you must set it
1708							explicitly to 0.
1709
1710							=item I
1711
1712							When set to 1, this parameter forces the module to maintain the case of the terms in
1713							the corpus files when creating the vocabulary and the document vectors. Setting
1714							C to 1 also causes the query matching to become case sensitive.
1715							(This constructor parameter was introduced in Version 1.61.)
1716
1717							=item I
1718
1719							The parameter B points to the root of the directory of documents
1720							for which you want to create a VSM or LSA model.
1721
1722							=item I
1723
1724							The parameter B is for naming the DBM in which the corpus vocabulary
1725							will be stored after it is subject to stemming and the elimination of stop words.
1726							Once a disk-based VSM model is created and stored away in the file named by this
1727							parameter and the parameter to be described next, it can subsequently be used
1728							directly for speedier retrieval.
1729
1730							=item I
1731
1732							The database named by B stores the document vector representation for
1733							each document in the corpus. Each document vector has the same size as the
1734							corpus-wide vocabulary; each element of such a vector is the number of occurrences of
1735							the word that corresponds to that position in the vocabulary vector.
1736
1737							=item I
1738
1739							This parameter tells the module what types of files in the corpus directory you want
1740							scanned for creating the database model. The value supplied for this parameter is an
1741							anonymous list of the file suffixes for the file types. For example, if you wanted
1742							only Java and text files to be scanned, you will set this parameter to C<['.java',
1743							'.txt']>. The module throws an exception if this parameter is left unspecified.
1744							(This constructor parameter was introduced in Version 1.61.)
1745
1746							=item I
1747
1748							The parameter B is used for rejecting singular values that are
1749							smaller than this threshold fraction of the largest singular value. This plays a
1750							critical role in creating reduced-dimensionality document vectors in LSA modeling of
1751							a corpus.
1752
1753							=item I
1754
1755							The constructor parameter B stands for what it means.
1756
1757							=item I
1758
1759							The parameter B sets the minimum number of characters in a
1760							word in order for it to be included in the corpus vocabulary.
1761
1762							=item I
1763
1764							The database named by B stores the normalized document
1765							vectors. Normalization consists of factoring out the size of the documents by
1766							dividing the term frequency for each word in a document by the number of words in the
1767							document, and then multiplying the result by the idf (Inverse Document Frequency)
1768							value for the word.
1769
1770							=item I
1771
1772							The parameter B points to a file that contains the queries to be used for
1773							calculating retrieval performance with C and C numbers. The format
1774							of the query file must be as shown in the sample file C in the
1775							'examples' directory.
1776
1777							=item I
1778
1779							This option names the disk file for storing the relevancy judgments.
1780
1781							=item I
1782
1783							The constructor parameter B is used for automatic determination
1784							of document relevancies to queries on the basis of the number of occurrences of query
1785							words in a document. You can exercise control over the process of determining
1786							relevancy of a document to a query by giving a suitable value to the constructor
1787							parameter B. A document is considered relevant to a query only
1788							when the document contains at least B number of query words.
1789
1790							=item I
1791
1792							The constructor parameter B will cause the basic
1793							information about the VSM and the LSA models to be stored on the disk.
1794							Subsequently, any retrievals can be carried out from the disk-based model.
1795
1796							=item I
1797
1798							The parameter B is for naming the file that contains the stop words
1799							that you do not wish to include in the corpus vocabulary. The format of this file
1800							must be as shown in the sample file C in the 'examples' directory.
1801
1802							=item I
1803
1804							The constructor parameter B is set by default. If you want
1805							to turn off the normalization of the document vectors, including turning
1806							off the weighting of the term frequencies of the words by their idf values,
1807							you must set this parameter explicitly to 0.
1808
1809							=item I
1810
1811							The boolean parameter B determines whether or not the words extracted
1812							from the documents would be subject to stemming. As mentioned elsewhere, stemming
1813							means that related words like 'programming' and 'programs' would both be reduced to
1814							the root word 'program'.
1815
1816							=back
1817
1818							=begin html
1819
1820
1821
1822							=end html
1823
1824							=item B
1825
1826							You call this subroutine for constructing an LSA model for your corpus
1827							after you have extracted the corpus vocabulary and constructed document
1828							vectors:
1829
1830							$vsm->construct_lsa_model();
1831
1832							The SVD decomposition that is carried out in LSA model construction uses the
1833							constructor parameter C to decide how many of the singular values
1834							to retain for the LSA model. A singular is retained only if it is larger than the
1835							C fraction of the largest singular value.
1836
1837
1838							=item B
1839
1840							The Average Precision for a query is the average of the Precision-at-rank values
1841							associated with each of the corpus documents relevant to the query. The mean of the
1842							Average Precision values for all the queries is the Mean Average Precision (MAP).
1843							The C values for the queries and the overall C can be printed
1844							out by calling
1845
1846							$vsm->display_average_precision_for_queries_and_map();
1847
1848
1849							=item B
1850
1851							If you would like to see corpus vocabulary as constructed by the previous call, make
1852							the call
1853
1854							$vsm->display_corpus_vocab();
1855
1856							Note that this is a useful thing to do only on small test corpora. If you need
1857							to examine the vocabulary for a large corpus, call the two methods listed below.
1858
1859
1860							=item B
1861
1862							If you would like for the module to print out in your terminal window the size of the
1863							vocabulary, make the call
1864
1865							$vsm->display_corpus_vocab_size();
1866
1867
1868							=item B
1869
1870							If you would like to see the document relevancies generated by the previous method,
1871							you can call
1872
1873							$vsm->display_doc_relevancies()
1874
1875
1876							=item B
1877
1878							If you would like to see the document vectors constructed by the previous call, make
1879							the call:
1880
1881							$vsm->display_doc_vectors();
1882
1883							Note that this is a useful thing to do only on small test corpora. If you must call
1884							this method on a large corpus, you might wish to direct the output to a file.
1885
1886
1887							=item B
1888
1889							You can display the idf value associated with each word in the corpus by
1890
1891							$vsm->display_inverse_document_frequencies();
1892
1893							The idf of a word in the corpus is calculated typically as the logarithm of the ratio
1894							of the total number of documents in the corpus to the number of documents in which
1895							the word appears (with protection built in to prevent division by zero). Ideally, if
1896							a word appears in all the documents, its idf would be small, close to zero. Words
1897							with small idf values are non-discriminatory and should get reduced weighting in
1898							document retrieval.
1899
1900
1901							=item B
1902
1903							If you would like to see the normalized document vectors, make the call:
1904
1905							$vsm->display_normalized_doc_vectors();
1906
1907							See the comment made previously as to what is meant by the normalization of a
1908							document vector.
1909
1910
1911							=item B
1912
1913							A call to C will normally be followed by the
1914							following call
1915
1916							$vsm->display_precision_vs_recall_for_queries();
1917
1918							for displaying the C and C values.
1919
1920
1921							=item B
1922
1923							You can display the retrieved document names by calling this method using the syntax:
1924
1925							$vsm->display_retrievals( $retrievals );
1926
1927							where C<$retrievals> is a reference to the hash returned by a call to one of the
1928							C methods. The display method shown here respects the retrieval size
1929							constraints expressed by the constructor parameter C.
1930
1931
1932							=item B
1933
1934							Before you can carry out precision and recall calculations to test the accuracy of
1935							VSM and LSA based retrievals from a corpus, you need to have available the relevancy
1936							judgments for the queries. (A relevancy judgment for a query is simply the list of
1937							documents relevant to that query.) Relevancy judgments are commonly supplied by the
1938							humans who are familiar with the corpus. But if such human-supplied relevance
1939							judgments are not available, you can invoke the following method to estimate them:
1940
1941							$vsm->estimate_doc_relevancies();
1942
1943							For the above method call, a document is considered to be relevant to a query if it
1944							contains several of the query words. As to the minimum number of query words that
1945							must exist in a document in order for the latter to be considered relevant, that is
1946							determined by the C parameter in the VSM constructor.
1947
1948							But note that this estimation of document relevancies to queries is NOT for serious
1949							work. The reason for that is because ultimately it is the humans who are the best
1950							judges of the relevancies of documents to queries. The humans bring to bear semantic
1951							considerations on the relevancy determination problem that are beyond the scope of
1952							this module.
1953
1954							The generated relevancies are deposited in a file named by the constructor parameter
1955							C.
1956
1957
1958							=item B
1959
1960							If you want to get hold of all the filenames in the corpus in your own script, you
1961							can call
1962
1963							my @docs = @{$vsm->get_all_document_names()};
1964
1965							The array on the left will contain an alphabetized list of the files.
1966
1967
1968							=item B
1969
1970							This is a necessary step after the vocabulary used by a corpus is constructed. (Of
1971							course, if you will be doing document retrieval through a disk-stored VSM or LSA
1972							model, then you do not need to call this method. You construct document vectors
1973							through the following call:
1974
1975							$vsm->generate_document_vectors();
1976
1977
1978							=item B
1979
1980							After you have constructed a new instance of the C class, you must
1981							now scan the corpus documents for constructing the corpus vocabulary. This you do by:
1982
1983							$vsm->get_corpus_vocabulary_and_word_counts();
1984
1985							The only time you do NOT need to call this method is when you are using a previously
1986							constructed disk-stored VSM model for retrieval.
1987
1988
1989							=item B
1990
1991							If you want to run significance tests on the retrieval accuracies you obtain on a
1992							given corpus and with different algorithms (VSM or LSA with different choices for the
1993							constructor parameters), your own script would need access to the average precision
1994							data for a set of queries. You can get hold of this data by calling
1995
1996							$vsm->get_query_sorted_average_precision_for_queries();
1997
1998							The script C in the 'examples' directory shows how you can
1999							use this method for significance testing.
2000
2001
2002							=item B
2003
2004							=item B
2005
2006							If you would like to compare in your own script any two documents in the corpus, you
2007							can call
2008
2009							my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");
2010							or
2011							my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");
2012
2013							Both these calls return a number that is the dot product of the two document vectors
2014							normalized by the product of their magnitudes. The first call uses the regular
2015							document vectors and the second the normalized document vectors.
2016
2017
2018							=item B
2019
2020							After you have created or obtained the relevancy judgments for your test queries, you
2021							can make the following call to calculate C and C:
2022
2023							$vsm->precision_and_recall_calculator('vsm');
2024							or
2025							$vsm->precision_and_recall_calculator('lsa');
2026
2027							depending on whether you are testing VSM-based retrieval or LSA-based retrieval.
2028
2029							=item B
2030
2031							After you have built an LSA model through the call to C, you
2032							can retrieve the document names most similar to the query by:
2033
2034							my $retrievals = $vsm->retrieve_with_lsa( \@query );
2035
2036							Subsequently, you can display the retrievals by calling the
2037							C method described previously.
2038
2039
2040							=item B
2041
2042							After you have constructed a VSM model, you call this method for document retrieval
2043							for a given query C<@query>. The call syntax is:
2044
2045							my $retrievals = $vsm->retrieve_with_vsm( \@query );
2046
2047							The argument, C<@query>, is simply a list of words that you wish to use for
2048							retrieval. The method returns a hash whose keys are the document names and whose
2049							values the similarity distance between the document and the query. As is commonly
2050							the case with VSM, this module uses the cosine similarity distance when comparing a
2051							document vector with the query vector.
2052
2053
2054							=item B
2055
2056							When human-supplied relevancies are available, you can upload them into the program
2057							by calling
2058
2059							$vsm->upload_document_relevancies_from_file();
2060
2061							These relevance judgments will be read from a file that is named with the
2062							C constructor parameter.
2063
2064
2065							=item B
2066
2067							When you invoke the methods C and
2068							C, that automatically deposits the VSM model in the
2069							database files named with the constructor parameters C,
2070							C and C. Subsequently, you can carry out
2071							retrieval by directly using this disk-based VSM model for speedier performance. In
2072							order to do so, you must upload the disk-based model by
2073
2074							$vsm->upload_normalized_vsm_model_from_disk();
2075
2076							Subsequently you call
2077
2078							my $retrievals = $vsm->retrieve_with_vsm( \@query );
2079							$vsm->display_retrievals( $retrievals );
2080
2081							for retrieval and for displaying the results.
2082
2083
2084							=item B
2085
2086							This is the method to call for large text corpora if you would like to examine the
2087							vocabulary created. The call syntax is
2088
2089							$vsm->write_corpus_vocab_to_file($filename);
2090
2091							where C<$filename> is the name of the file that you want the vocabulary to be written
2092							out to. This call will also show the frequency of each vocabulary word in your
2093							corpus.
2094
2095
2096							=back
2097
2098
2099							=head1 REQUIRED
2100
2101							This module requires the following modules:
2102
2103							SDBM_File
2104							Storable
2105							PDL
2106							File::Basename
2107							File::Spec::Functions
2108
2109							The first two of these are needed for creating disk-based database records for the
2110							VSM and LSA models. The third is needed for calculating the SVD of the
2111							term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
2112							by the directory scanner to make pathnames platform independent.
2113
2114							=head1 EXAMPLES
2115
2116							See the 'examples' directory in the distribution for the scripts listed below:
2117
2118							=over
2119
2120							=item B
2121
2122							For basic VSM-based model construction and retrieval, run the script:
2123
2124							retrieve_with_VSM.pl
2125
2126							Starting with version 1.60, this script does not store away the VSM model in
2127							disk-based hash tables. If you want your model to be stored on the disk, you must
2128							run the script C for that.
2129
2130							=item B
2131
2132							If you want to run an infinite loop for repeated retrievals from a VSM model, run the
2133							script
2134
2135							continuously_running_VSM_retrieval_engine.pl
2136
2137							You can create a script similar to this for doing the same with LSA models.
2138
2139							=item B
2140
2141							For storing the model information in disk-based DBM files that can subsequently be
2142							used for both VSM and LSA retrieval, run the script:
2143
2144							retrieve_with_VSM_and_also_create_disk_based_model.pl
2145
2146							=item B
2147
2148							For basic LSA-based model construction and retrieval, run the script:
2149
2150							retrieve_with_LSA.pl
2151
2152							Starting with version 1.60, this script does not store away the model information in
2153							disk-based hash tables. If you want your model to be stored on the disk, you must
2154							run the script C for that.
2155
2156							=item B
2157
2158							If you have previously run a script like
2159							C, you can run the script
2160
2161							retrieve_with_disk_based_VSM.pl
2162
2163							for repeated VSM-based retrievals from a disk-based model.
2164
2165							=item B
2166
2167							If you have previously run a script like
2168							C, you can run the script
2169
2170							retrieve_with_disk_based_LSA.pl
2171
2172							for repeated LSA-based retrievals from a disk-based model.
2173
2174							=item B
2175
2176							To experiment with precision and recall calculations for VSM retrieval, run the
2177							script:
2178
2179							calculate_precision_and_recall_for_VSM.pl
2180
2181							Note that this script will carry out its own estimation of relevancy judgments ---
2182							which in most cases would not be a safe thing to do.
2183
2184							=item B
2185
2186							To experiment with precision and recall calculations for LSA retrieval, run the
2187							script:
2188
2189							calculate_precision_and_recall_for_LSA.pl
2190
2191							Note that this script will carry out its own estimation of relevancy judgments ---
2192							which in most cases would not be a safe thing to do.
2193
2194							=item B
2195							Human-Supplied Relevancies:>
2196
2197							Precision and recall calculations for retrieval accuracy determination are best
2198							carried out with human-supplied judgments of relevancies of the documents to queries.
2199							If such judgments are available, run the script:
2200
2201							calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl
2202
2203							This script will print out the average precisions for the different test queries and
2204							calculate the MAP metric of retrieval accuracy.
2205
2206							=item B
2207							Human-Supplied Relevancies:>
2208
2209							If human-supplied relevancy judgments are available and you wish to experiment with
2210							precision and recall calculations for LSA-based retrieval, run the script:
2211
2212							calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl
2213
2214							This script will print out the average precisions for the different test queries and
2215							calculate the MAP metric of retrieval accuracy.
2216
2217							=item B
2218							Randomization or with Student's Paired t-Test:>
2219
2220							significance_testing.pl randomization
2221
2222							or
2223
2224							significance_testing.pl t-test
2225
2226							Significance testing consists of forming a null hypothesis that the two retrieval
2227							algorithms you are considering are the same from a black-box perspective and then
2228							calculating what is known as a C. If the C is less than, say,
2229							0.05, you reject the null hypothesis.
2230
2231							=item B
2232
2233							calculate_similarity_matrix_for_all_docs.pl
2234
2235							or
2236
2237							calculate_similarity_matrix_for_all_normalized_docs.pl
2238
2239							The former uses regular document vectors for calculating the similarity between every
2240							pair of documents in the corpus. And the latter uses normalized document vectors for
2241							the same purpose. The document order used for row and column indexing of the matrix
2242							corresponds to the alphabetic ordering of the document names in the corpus directory.
2243
2244							=back
2245
2246
2247							=head1 EXPORT
2248
2249							None by design.
2250
2251							=head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS
2252
2253							You have to be careful when carrying out Precision verses Recall calculations if you
2254							do not wish to lose the previously created relevancy judgments. Invoking the method
2255							C in your own script will cause the file C
2256							to be overwritten. If you have created a relevancy database and stored it in a file
2257							called, say, C, you should make a backup copy of this file before
2258							executing a script that calls C.
2259
2260							=head1 BUGS
2261
2262							Please notify the author if you encounter any bugs. When sending email, please place
2263							the string 'VSM' in the subject line to get past my spam filter.
2264
2265							=head1 INSTALLATION
2266
2267							Download the archive from CPAN in any directory of your choice. Unpack the archive
2268							with a command that on a Linux machine would look like:
2269
2270							tar zxvf Algorithm-VSM-1.70.tar.gz
2271
2272							This will create an installation directory for you whose name will be
2273							C. Enter this directory and execute the following commands for a
2274							standard install of the module if you have root privileges:
2275
2276							perl Makefile.PL
2277							make
2278							make test
2279							sudo make install
2280
2281							If you do not have root privileges, you can carry out a non-standard install the
2282							module in any directory of your choice by:
2283
2284							perl Makefile.PL prefix=/some/other/directory/
2285							make
2286							make test
2287							make install
2288
2289							With a non-standard install, you may also have to set your PERL5LIB environment
2290							variable so that this module can find the required other modules. How you do that
2291							would depend on what platform you are working on. In order to install this module in
2292							a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2293							variable by
2294
2295							setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2296
2297							If I used bash, I'd need to declare:
2298
2299							export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2300
2301
2302							=head1 THANKS
2303
2304							Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
2305							deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in
2306							similarity matrix characterization of a corpus. Thanks, Zahn!
2307
2308							Several of the recent changes to the module are a result of the feedback I have
2309							received from Naveen Kulkarni of Infosys Labs. Thanks, Naveen!
2310
2311							Version 1.62 was a result of Slaven Rezic's recommendation that I remove the Perl
2312							version restriction on the module since he was able to run it with Perl version
2313							5.8.9. Another important reason for v. 1.62 was the discovery of the two bugs
2314							mentioned in Changes, one of them brought to my attention by Naveen Kulkarni.
2315
2316							=head1 AUTHOR
2317
2318							The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy" project
2319							with the publication of the book "B" by John-Wiley. If
2320							interested, check out his web page at Purdue to find out what the Objects Trilogy
2321							project was all about. You might like "B" especially if you
2322							enjoyed reading Harry Potter as a kid (or even as an adult, for that matter). The
2323							other two books in the trilogy are "B" and "B
2324							with Objects>".
2325
2326							For all issues related to this module, contact the author at C
2327
2328							If you send email, please place the string "VSM" in your subject line to get past the
2329							author's spam filter.
2330
2331							=head1 COPYRIGHT
2332
2333							This library is free software; you can redistribute it and/or modify it under the
2334							same terms as Perl itself.
2335
2336							Copyright 2015 Avinash Kak
2337
2338							=cut
2339
2340