File Coverage

blib/lib/Algorithm/VSM.pm
Criterion Covered Total %
statement 351 822 42.7
branch 92 328 28.0
condition 25 67 37.3
subroutine 31 54 57.4
pod 24 35 68.5
total 523 1306 40.0


line stmt bran cond sub pod time code
1             package Algorithm::VSM;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2015 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::VSM is a Perl module for retrieving the documents from a software
9             # library that match a list of words in a query. The matching criterion used depends
10             # on whether you ask the module to construct a full-dimensionality VSM or a
11             # reduced-dimensionality LSA model for the library.
12             # ---------------------------------------------------------------------------
13              
14             #use 5.10.0;
15 1     1   13497 use strict;
  1         2  
  1         29  
16 1     1   3 use warnings;
  1         2  
  1         27  
17 1     1   4 use Carp;
  1         4  
  1         73  
18 1     1   387 use SDBM_File;
  1         1569  
  1         39  
19 1     1   389 use PDL::Lite;
  1         139578  
  1         27  
20 1     1   7 use PDL::MatrixOps;
  1         1  
  1         8  
21 1     1   165 use File::Basename;
  1         1  
  1         76  
22 1     1   572 use File::Spec::Functions qw(rel2abs);
  1         737  
  1         58  
23 1     1   5 use Fcntl;
  1         2  
  1         199  
24 1     1   556 use Storable;
  1         2595  
  1         56  
25 1     1   7 use Cwd;
  1         1  
  1         10284  
26              
27             our $VERSION = '1.62';
28              
29             # for camelcase splits (from perlmonks):
30             my $_regex = qr/[[:lower:]0-9]+|[[:upper:]0-9](?:[[:upper:]0-9]+|[[:lower:]0-9]*)(?=$|[[:upper:]0-9])/;
31              
32             ################################### Constructor #######################################
33              
34             # Constructor for creating a VSM or LSA model of a corpus. The model instance
35             # returned by the constructor can be used for retrieving documents from the corpus
36             # in response to queries.
37             sub new {
38 2     2 1 514 my ($class, %args) = @_;
39 2         17 my @params = keys %args;
40 2 50       13 croak "\nYou have used a wrong name for a keyword argument " .
41             "--- perhaps a misspelling\n"
42             if _check_for_illegal_params(@params) == 0;
43 2 50 50     7035 bless {
    50 50        
    50 50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      100        
      50        
      50        
44             _corpus_directory => $args{corpus_directory} || "",
45             _save_model_on_disk => $args{save_model_on_disk} || 0,
46             _break_camelcased_and_underscored => exists $args{break_camelcased_and_underscored} ?
47             $args{break_camelcased_and_underscored} : 1,
48             _corpus_vocab_db => $args{corpus_vocab_db} || "corpus_vocab_db",
49             _doc_vectors_db => $args{doc_vectors_db} || "doc_vectors_db",
50             _normalized_doc_vecs_db => $args{normalized_doc_vecs_db} || "normalized_doc_vecs_db",
51             _stop_words_file => $args{stop_words_file} || "",
52             _case_sensitive => $args{case_sensitive} || 0,
53             _query_file => $args{query_file} || "",
54             _file_types => $args{file_types} || [],
55             _min_word_length => $args{min_word_length} || 4,
56             _want_stemming => $args{want_stemming} || 0,
57             _idf_filter_option => exists $args{use_idf_filter} ? $args{use_idf_filter} : 1,
58             _max_number_retrievals => $args{max_number_retrievals} || 30,
59             _lsa_svd_threshold => $args{lsa_svd_threshold} || 0.01,
60             _relevancy_threshold => exists $args{relevancy_threshold} ?
61             $args{relevancy_threshold} : 1,
62             _relevancy_file => $args{relevancy_file} || "",
63             _debug => $args{debug} || 0,
64             _working_directory => cwd,
65             _vocab_hist_on_disk => {},
66             _vocab_hist => {},
67             _doc_hist_template => {},
68             _corpus_doc_vectors => {},
69             _normalized_doc_vecs => {},
70             _query_vector => {},
71             _stop_words => [],
72             _term_document_matrix => [],
73             _corpus_vocab_done => 0,
74             _scan_dir_for_rels => 0,
75             _vocab_size => undef,
76             _doc_vecs_trunc_lsa => {},
77             _lsa_vec_truncator => undef,
78             _queries_for_relevancy => {},
79             _relevancy_estimates => {},
80             _precision_for_queries => {},
81             _recall_for_queries => {},
82             _map_vals_for_queries => {},
83             _vocab_idf_hist => {},
84             _idf_t => {},
85             _total_num_of_docs => 0,
86             }, $class;
87             }
88              
89              
90             ###################### Get corpus vocabulary and word counts #########################
91              
92             sub get_corpus_vocabulary_and_word_counts {
93 2     2 1 28 my $self = shift;
94 2 50       25 die "You must supply the name of the corpus directory to the constructor"
95             unless $self->{_corpus_directory};
96 2 50       11 print "Scanning the directory '$self->{_corpus_directory}' for\n" .
97             " model construction\n\n" if $self->{_debug};
98 2         23 $self->_scan_directory( $self->{_corpus_directory} );
99 2 50       9 $self->_drop_stop_words() if $self->{_stop_words_file};
100 2 50       8 if ($self->{_debug}) {
101 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
102 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
103             }
104             }
105 2 50       13 if ($self->{_save_model_on_disk}) {
106 0         0 unlink glob "$self->{_corpus_vocab_db}.*";
107 0         0 unlink glob "$self->{_doc_vectors_db}.*";
108 0         0 unlink glob "$self->{_normalized_doc_vecs_db}.*";
109 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
110             $self->{_corpus_vocab_db}, O_RDWR|O_CREAT, 0640
111             or die "Can't create DBM files: $!";
112 0         0 foreach (keys %{$self->{_vocab_hist}}) {
  0         0  
113 0         0 $self->{_vocab_hist_on_disk}->{$_} = $self->{_vocab_hist}->{$_};
114             }
115 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
116             }
117 2         6 $self->{_corpus_vocab_done} = 1;
118 2         5 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  2         25  
119 2 50       6 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
120             if $self->{_debug};
121             # Calculate idf(t):
122 2         5 foreach (keys %{$self->{_vocab_idf_hist}}) {
  2         32  
123 200         429 $self->{_idf_t}->{$_} = abs( (1 + log($self->{_total_num_of_docs}
124             /
125             (1 + $self->{_vocab_idf_hist}->{$_})))
126             / log(10) );
127             }
128             }
129              
130             sub display_corpus_vocab {
131 0     0 1 0 my $self = shift;
132 0         0 die "corpus vocabulary not yet constructed"
133 0 0       0 unless keys %{$self->{_vocab_hist}};
134 0         0 print "\n\nDisplaying corpus vocabulary:\n\n";
135 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
136 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
137 0         0 print "$outstring\n";
138             }
139             }
140              
141             sub display_corpus_vocab_size {
142 0     0 1 0 my $self = shift;
143 0         0 die "corpus vocabulary not yet constructed"
144 0 0       0 unless keys %{$self->{_vocab_hist}};
145 0         0 my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
146 0         0 print "\nSize of the corpus vocabulary: $vocab_size\n\n";
147             }
148              
149             sub write_corpus_vocab_to_file {
150 0     0 1 0 my $self = shift;
151 0         0 my $file = shift;
152 0 0       0 die "corpus vocabulary not yet constructed" unless keys %{$self->{_vocab_hist}};
  0         0  
153 0 0       0 open OUT, "> $file"
154             or die "unable to open for output a file with name `$file': $!";
155 0         0 foreach (sort keys %{$self->{_vocab_hist}}){
  0         0  
156 0         0 my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
157 0         0 print OUT "$outstring\n";
158             }
159 0         0 close OUT;
160             }
161              
162             sub display_inverse_document_frequencies {
163 0     0 1 0 my $self = shift;
164 0         0 die "corpus vocabulary not yet constructed"
165 0 0       0 unless keys %{$self->{_vocab_idf_hist}};
166 0 0       0 print "\n\nThe idf values and idf(t) values displayed below are not being used for retrieval since you did not set the use_idf_filter option in the constructor\n"
167             unless $self->{_idf_filter_option};
168 0         0 print "\n\nDisplaying inverse document frequencies:\n";
169 0         0 foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
  0         0  
170 0         0 my $outstring = sprintf("%30s %d",
171             $_, $self->{_vocab_idf_hist}->{$_});
172 0         0 print "$outstring\n";
173             }
174 0         0 print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of documents and d(t) the number of docs with the word t:\n";
175 0         0 foreach ( sort keys %{$self->{_idf_t}} ) {
  0         0  
176 0         0 my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
177 0         0 print "$outstring\n";
178             }
179             }
180              
181             sub get_all_document_names {
182 0     0 1 0 my $self = shift;
183 0         0 my @all_files = sort keys %{$self->{_corpus_doc_vectors}};
  0         0  
184 0         0 return \@all_files;
185             }
186              
187             ############################ Generate Document Vectors #################################
188              
189             sub generate_document_vectors {
190 2     2 1 33 my $self = shift;
191 2         23 chdir $self->{_working_directory};
192 2         4 foreach ( sort keys %{$self->{_vocab_hist}} ) {
  2         84  
193 200         352 $self->{_doc_hist_template}->{$_} = 0;
194             }
195 2         27 $self->_scan_directory( $self->{_corpus_directory} );
196 2         24 chdir $self->{_working_directory};
197 2 50       27 if ($self->{_save_model_on_disk}) {
198 0 0 0     0 die "You did not specify in the constructor call the names for the diskfiles " .
199             "for storing the disk-based hash tables consisting of document vectors " .
200             "and their normalized versions"
201             unless $self->{_doc_vectors_db} && $self->{_normalized_doc_vecs_db};
202 0         0 eval {
203 0         0 store( $self->{_corpus_doc_vectors}, $self->{_doc_vectors_db} );
204             };
205 0 0       0 if ($@) {
206 0         0 print "Something went wrong with disk storage of document vectors: $@";
207             }
208 0         0 eval {
209 0         0 store($self->{_normalized_doc_vecs}, $self->{_normalized_doc_vecs_db});
210             };
211 0 0       0 if ($@) {
212 0         0 print "Something wrong with disk storage of normalized doc vecs: $@";
213             }
214             }
215             }
216              
217             sub display_doc_vectors {
218 0     0 1 0 my $self = shift;
219 0         0 die "document vectors not yet constructed"
220 0 0       0 unless keys %{$self->{_corpus_doc_vectors}};
221 0         0 foreach my $file (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
222 0         0 print "\n\ndisplay doc vec for $file:\n";
223 0         0 foreach ( sort keys %{$self->{_corpus_doc_vectors}->{$file}} ) {
  0         0  
224 0         0 print "$_ => $self->{_corpus_doc_vectors}->{$file}->{$_}\n";
225             }
226 0         0 my $docvec_size = keys %{$self->{_corpus_doc_vectors}->{$file}};
  0         0  
227 0         0 print "\nSize of vector for $file: $docvec_size\n";
228             }
229             }
230              
231             sub display_normalized_doc_vectors {
232 0     0 1 0 my $self = shift;
233 0         0 die "normalized document vectors not yet constructed"
234 0 0       0 unless keys %{$self->{_normalized_doc_vecs}};
235 0 0       0 unless ($self->{_idf_filter_option}) {
236 0         0 print "Nothing to display for normalized doc vectors since you did not set the use_idf_filter option in the constructor\n";
237 0         0 return;
238             }
239 0         0 foreach my $file (sort keys %{$self->{_normalized_doc_vecs}}) {
  0         0  
240 0         0 print "\n\ndisplay normalized doc vec for $file:\n";
241 0         0 foreach ( sort keys %{$self->{_normalized_doc_vecs}->{$file}} ) {
  0         0  
242 0         0 print "$_ => $self->{_normalized_doc_vecs}->{$file}->{$_}\n";
243             }
244 0         0 my $docvec_size = keys %{$self->{_normalized_doc_vecs}->{$file}};
  0         0  
245 0         0 print "\nSize of normalized vector for $file: $docvec_size\n";
246             }
247             }
248              
249             ######################## Calculate Pairwise Document Similarities ######################
250              
251             # Returns the similarity score for two documents whose actual names are supplied
252             # as its two arguments.
253             sub pairwise_similarity_for_docs {
254 0     0 1 0 my $self = shift;
255 0         0 my $doc1 = shift;
256 0         0 my $doc2 = shift;
257 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
258 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
259 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
260 0         0 my $vec_hash_ref1 = $self->{_corpus_doc_vectors}->{$doc1};
261 0         0 my $vec_hash_ref2 = $self->{_corpus_doc_vectors}->{$doc2};
262 0         0 my @vec1 = ();
263 0         0 my @vec2 = ();
264 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
265 0         0 push @vec1, $vec_hash_ref1->{$word};
266 0         0 push @vec2, $vec_hash_ref2->{$word};
267             }
268 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
269 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
270 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
271 0         0 $product /= $vec_mag1 * $vec_mag2;
272 0         0 return $product;
273             }
274              
275             sub pairwise_similarity_for_normalized_docs {
276 0     0 1 0 my $self = shift;
277 0         0 my $doc1 = shift;
278 0         0 my $doc2 = shift;
279 0         0 my @all_files = keys %{$self->{_corpus_doc_vectors}};
  0         0  
280 0 0       0 croak "The file $doc1 does not exist in the corpus: " unless contained_in($doc1, @all_files);
281 0 0       0 croak "The file $doc2 does not exist in the corpus: " unless contained_in($doc2, @all_files);
282 0         0 my $vec_hash_ref1 = $self->{_normalized_doc_vecs}->{$doc1};
283 0         0 my $vec_hash_ref2 = $self->{_normalized_doc_vecs}->{$doc2};
284 0         0 my @vec1 = ();
285 0         0 my @vec2 = ();
286 0         0 foreach my $word (sort keys %$vec_hash_ref1) {
287 0         0 push @vec1, $vec_hash_ref1->{$word};
288 0         0 push @vec2, $vec_hash_ref2->{$word};
289             }
290 0         0 my $vec_mag1 = vec_magnitude(\@vec1);
291 0         0 my $vec_mag2 = vec_magnitude(\@vec2);
292 0         0 my $product = vec_scalar_product(\@vec1, \@vec2);
293 0         0 $product /= $vec_mag1 * $vec_mag2;
294 0         0 return $product;
295             }
296              
297             ############################### Retrieve with VSM Model ################################
298              
299             sub retrieve_with_vsm {
300 1     1 1 12 my $self = shift;
301 1         3 my $query = shift;
302 1         2 my @clean_words;
303 1         5 my $min = $self->{_min_word_length};
304              
305 1 50       5 if ($self->{_break_camelcased_and_underscored}) {
306 1         25 my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
307 1         3 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  8         25  
308 0         0 @clean_words = $self->{_case_sensitive} ?
309 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  8         51  
310 1 100       5 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  8 50       26  
311             } else {
312 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
313 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
314             }
315 1         4 $query = \@clean_words;
316 1 50       7 print "\nYour query words are: @$query\n" if $self->{_debug};
317 1 50       9 if ($self->{_idf_filter_option}) {
318 1         73 die "\nYou need to first generate normalized document vectors before you can call retrieve_with_vsm()"
319 1         10 unless scalar(keys %{$self->{_vocab_hist}})
320 1 50 50     1 && scalar(keys %{$self->{_normalized_doc_vecs}});
321             } else {
322 0         0 die "\nYou need to first generate document vectors before you can call retrieve_with_vsm()"
323 0         0 unless scalar(keys %{$self->{_vocab_hist}})
324 0 0 0     0 && scalar(keys %{$self->{_corpus_doc_vectors}});
325             }
326 1         2 foreach ( keys %{$self->{_vocab_hist}} ) {
  1         42  
327 100         124 $self->{_query_vector}->{$_} = 0;
328             }
329 1         6 foreach (@$query) {
330 7 50       10 if ($self->{_case_sensitive}) {
331 0 0       0 $self->{_query_vector}->{$_}++ if exists $self->{_vocab_hist}->{$_};
332             } else {
333 7 100       17 $self->{_query_vector}->{"\L$_"}++ if exists $self->{_vocab_hist}->{"\L$_"};
334             }
335             }
336 1         2 my @query_word_counts = values %{$self->{_query_vector}};
  1         14  
337 1         9 my $query_word_count_total = reduce(\@query_word_counts);
338 1 50       10 die "\nYour query does not contain corpus words. Nothing retrieved.\n"
339             unless $query_word_count_total;
340 1         3 my %retrievals;
341 1 50       5 if ($self->{_idf_filter_option}) {
342 1 50       3 print "\n\nUsing idf filter option for retrieval:\n\n"
343             if $self->{_debug};
344 1         3 foreach (sort {$self->_doc_vec_comparator}
  16         38  
  1         15  
345             keys %{$self->{_normalized_doc_vecs}}) {
346 8         17 $retrievals{$_} = $self->_similarity_to_query($_);
347             }
348             } else {
349 0 0       0 print "\n\nNOT using idf filter option for retrieval:\n\n"
350             if $self->{_debug};
351 0         0 foreach (sort {$self->_doc_vec_comparator}
  0         0  
  0         0  
352             keys %{$self->{_corpus_doc_vectors}}) {
353 0         0 $retrievals{$_} = $self->_similarity_to_query($_);
354             }
355             }
356 1 50       5 if ($self->{_debug}) {
357 0         0 print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
358 0         0 foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
  0         0  
359 0         0 print "$_ => $retrievals{$_}\n";
360             }
361             }
362 1         12 return \%retrievals;
363             }
364              
365             ######################### Upload a Previously Constructed Model #########################
366              
367             sub upload_vsm_model_from_disk {
368 0     0 0 0 my $self = shift;
369 0 0 0     0 die "\nCannot find the database files for the VSM model"
370             unless -s "$self->{_corpus_vocab_db}.pag"
371             && -s $self->{_doc_vectors_db};
372 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
373 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
374             $self->{_corpus_vocab_db}, O_RDONLY, 0640
375             or die "Can't open DBM file: $!";
376 0 0       0 if ($self->{_debug}) {
377 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
378 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
379             }
380             }
381 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
382 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
383             }
384 0         0 $self->{_corpus_vocab_done} = 1;
385 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
386 0 0       0 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
387             if $self->{_debug};
388 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
389 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
390             }
391              
392             sub upload_normalized_vsm_model_from_disk {
393 0     0 1 0 my $self = shift;
394 0 0 0     0 die "\nCannot find the database files for the VSM model"
395             unless -s "$self->{_corpus_vocab_db}.pag"
396             && -s $self->{_normalized_doc_vecs_db};
397 0         0 $self->{_normalized_doc_vecs} = retrieve($self->{_normalized_doc_vecs_db});
398 0 0       0 tie %{$self->{_vocab_hist_on_disk}}, 'SDBM_File',
  0         0  
399             $self->{_corpus_vocab_db}, O_RDONLY, 0640
400             or die "Can't open DBM file: $!";
401 0 0       0 if ($self->{_debug}) {
402 0         0 foreach ( sort keys %{$self->{_vocab_hist_on_disk}} ) {
  0         0  
403 0         0 printf( "%s\t%d\n", $_, $self->{_vocab_hist_on_disk}->{$_} );
404             }
405             }
406 0         0 foreach (keys %{$self->{_vocab_hist_on_disk}}) {
  0         0  
407 0         0 $self->{_vocab_hist}->{$_} = $self->{_vocab_hist_on_disk}->{$_};
408             }
409 0         0 $self->{_corpus_vocab_done} = 1;
410 0         0 $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
  0         0  
411 0 0       0 print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
412             if $self->{_debug};
413 0         0 untie %{$self->{_vocab_hist_on_disk}};
  0         0  
414             }
415              
416             ############################## Display Retrieval Results ################################
417              
418             sub display_retrievals {
419 0     0 1 0 my $self = shift;
420 0         0 my $retrievals = shift;
421 0         0 print "\n\nShowing the retrievals and the similarity scores:\n\n";
422 0         0 my $iter = 0;
423 0         0 foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}} keys %$retrievals){
  0         0  
424 0         0 print "$_ => $retrievals->{$_}\n";
425 0         0 $iter++;
426 0 0       0 last if $iter > $self->{_max_number_retrievals};
427             }
428 0         0 print "\n\n";
429             }
430              
431             ############################### Directory Scanner ################################
432              
433             sub _scan_directory {
434 4     4   12 my $self = shift;
435 4         43 my $dir = rel2abs( shift );
436 4         10709 my $current_dir = cwd;
437 4 50       147 chdir $dir or die "Unable to change directory to $dir: $!";
438 4         611 foreach ( glob "*" ) {
439 32 50 33     2908 if ( -d and !(-l) ) {
    50 33        
      33        
      33        
      33        
440 0         0 $self->_scan_directory( $_ );
441 0 0       0 chdir $dir
442             or die "Unable to change directory to $dir: $!";
443             } elsif (-r _ and
444             -T _ and
445             -M _ > 0.00001 and # modification age is at least 1 sec
446             !( -l $_ ) and
447             $self->ok_to_filetype($_) ) {
448 32 50       168 $self->_scan_file_for_rels($_) if $self->{_scan_dir_for_rels};
449 32 100       112 $self->_scan_file($_) unless $self->{_corpus_vocab_done};
450 32 100       148 $self->_construct_doc_vector($_) if $self->{_corpus_vocab_done};
451             }
452             }
453 4         128 chdir $current_dir;
454             }
455              
456             sub _scan_file {
457 16     16   68 my $self = shift;
458 16         19 my $file = shift;
459 16         398 open IN, $file;
460 16         48 my $min = $self->{_min_word_length};
461 16         37 my %uniques = ();
462 16         165 while (<IN>) {
463 834 100       2814 next if /^[ ]*\r?\n?$/;
464 710         2272 $_ =~ s/\r?\n?$//;
465 710         693 my @clean_words;
466 710 50       1067 if ($self->{_break_camelcased_and_underscored}) {
467 710         7567 my @brokenup = grep $_, split /\W|_|\s+/, $_;
468 710         1350 @clean_words = map {$_ =~ /$_regex/g} @brokenup;
  2574         8207  
469 0         0 @clean_words = $self->{_case_sensitive} ?
470 0 0       0 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?$1:''} @clean_words :
  2712         5482  
471 710 100       1366 grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
  2712 50       7312  
472             } else {
473 0         0 my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
474 0         0 @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
  0         0  
  0         0  
475             }
476 710 100       2184 next unless @clean_words;
477 426 50       1312 @clean_words = grep $_, map &simple_stemmer($_), @clean_words
478             if $self->{_want_stemming};
479 0         0 $self->{_case_sensitive} ?
480 1718         2858 map { $self->{_vocab_hist}->{$_}++ } grep $_, @clean_words :
481 426 50       1211 map { $self->{_vocab_hist}->{"\L$_"}++ } grep $_, @clean_words;
482 426 50       572 if ($self->{_case_sensitive}) {
483 0         0 for (@clean_words) { $uniques{$_}++ }
  0         0  
484             } else {
485 426         549 for (@clean_words) { $uniques{"\L$_"}++ }
  1718         3512  
486             }
487             }
488 16         311 close( IN );
489 16         174 map { $self->{_vocab_idf_hist}->{$_}++ } keys %uniques;
  820         881  
490 16         141 $self->{_total_num_of_docs}++;
491             }
492              
493             sub ok_to_filetype {
494 32     32 0 62 my $self = shift;
495 32         59 my $filename = shift;
496 32         1560 my ($base, $dir, $suffix) = fileparse($filename, '\..*');
497 32         121 croak "You called this module without specifying the file types in the constructor"
498 32 50       49 unless @{$self->{_file_types}} > 0;
499 32 50       41 return 1 if contained_in($suffix, @{$self->{_file_types}});
  32         179  
500 0         0 return 0;
501             }
502              
503             ############################## LSA Modeling and Retrieval ################################
504              
505             sub construct_lsa_model {
506 1     1 1 17 my $self = shift;
507 1 50       10 if ($self->{_idf_filter_option}) {
508 1 50 33     15 if (!$self->{_normalized_doc_vecs} and
509             -s $self->{_normalized_doc_vecs_db}) {
510 0         0 $self->{_normalized_doc_vecs} =
511             retrieve($self->{_normalized_doc_vecs_db});
512             }
513 1         4 foreach (sort keys %{$self->{_normalized_doc_vecs}}) {
  1         22  
514 8         10 my $term_frequency_vec;
515 8         12 foreach my $word (sort keys
  8         529  
516             %{$self->{_normalized_doc_vecs}->{$_}}){
517 800         1243 push @$term_frequency_vec,
518             $self->{_normalized_doc_vecs}->{$_}->{$word};
519             }
520 8         48 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  8         23  
521             }
522             } else {
523 0 0 0     0 if (!$self->{_corpus_doc_vectors} and -s $self->{_doc_vectors_db}) {
524 0         0 $self->{_corpus_doc_vectors} = retrieve($self->{_doc_vectors_db});
525             }
526 0         0 foreach (sort keys %{$self->{_corpus_doc_vectors}}) {
  0         0  
527 0         0 my $term_frequency_vec;
528 0         0 foreach my $word (sort keys %{$self->{_corpus_doc_vectors}->{$_}}){
  0         0  
529 0         0 push @$term_frequency_vec,
530             $self->{_corpus_doc_vectors}->{$_}->{$word};
531             }
532 0         0 push @{$self->{_term_document_matrix}}, $term_frequency_vec;
  0         0  
533             }
534             }
535 1         6 my $A = PDL::Basic::transpose( pdl(@{$self->{_term_document_matrix}}) );
  1         19  
536 1         1023 my ($U,$SIGMA,$V) = svd $A;
537 1 50       13 print "LSA: Singular Values SIGMA: " . $SIGMA . "\n" if $self->{_debug};
538 1 50       8 print "size of svd SIGMA: ", $SIGMA->dims, "\n" if $self->{_debug};
539 1         10 my $index = return_index_of_last_value_above_threshold($SIGMA,
540             $self->{_lsa_svd_threshold});
541 1         11 my $SIGMA_trunc = $SIGMA->slice("0:$index")->sever;
542 1 50       64 print "SVD's Truncated SIGMA: " . $SIGMA_trunc . "\n" if $self->{_debug};
543             # When you measure the size of a matrix in PDL, the zeroth dimension
544             # is considered to be along the horizontal and the one-th dimension
545             # along the rows. This is opposite of how we want to look at
546             # matrices. For a matrix of size MxN, we mean M rows and N columns.
547             # With this 'rows x columns' convention for matrix size, if you had
548             # to check the size of, say, U matrix, you would call
549             # my @size = ( $U->getdim(1), $U->getdim(0) );
550             # print "\nsize of U: @size\n";
551 1         9 my $U_trunc = $U->slice("0:$index,:")->sever;
552 1         59 my $V_trunc = $V->slice("0:$index,0:$index")->sever;
553 1         44 $self->{_lsa_vec_truncator} = inv(stretcher($SIGMA_trunc)) x
554             PDL::Basic::transpose($U_trunc);
555 1 50       11047 print "\n\nLSA doc truncator: " . $self->{_lsa_vec_truncator} . "\n\n"
556             if $self->{_debug};
557 1         14 my @sorted_doc_names = $self->{_idf_filter_option} ?
558 0         0 sort keys %{$self->{_normalized_doc_vecs}} :
559 1 50       4 sort keys %{$self->{_corpus_doc_vectors}};
560 1         4 my $i = 0;
561 1         2 foreach (@{$self->{_term_document_matrix}}) {
  1         3  
562 8         21 my $truncated_doc_vec = $self->{_lsa_vec_truncator} x
563             PDL::Basic::transpose(pdl($_));
564 8         897 my $doc_name = $sorted_doc_names[$i++];
565 8 50       20 print "\n\nTruncated doc vec for $doc_name: " .
566             $truncated_doc_vec . "\n" if $self->{_debug};
567 8         22 $self->{_doc_vecs_trunc_lsa}->{$doc_name}
568             = $truncated_doc_vec;
569             }
570 1         41 chdir $self->{_working_directory};
571             }
572              
##  Retrieves documents for a query using the previously constructed LSA
##  (reduced-dimensionality) model.  Takes a ref to a list of query words and
##  returns a hashref mapping each document name to its similarity score
##  (dot product in the truncated LSA space).  Dies if the vocabulary is
##  empty, if no LSA model has been built, or if the query shares no words
##  with the corpus.
sub retrieve_with_lsa {
    my $self = shift;
    my $query = shift;                          # ref to the list of raw query words
    my @clean_words;
    my $min = $self->{_min_word_length};
    # Normalize the query words with the same rules that were applied to the
    # corpus documents, so query and document vectors live in the same space.
    if ($self->{_break_camelcased_and_underscored}) {
        my @brokenup = grep $_, split /\W|_|\s+/, "@$query";
        @clean_words = map {$_ =~ /$_regex/g} @brokenup;
        @clean_words = grep $_, map {$_ =~ /([[:lower:]0-9]{$min,})/i;$1?"\L$1":''} @clean_words;
    } else {
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, "@$query";
        @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
    }
    $query = \@clean_words;
    print "\nYour processed query words are: @$query\n" if $self->{_debug};
    die "Your vocabulary histogram is empty"
        unless scalar(keys %{$self->{_vocab_hist}});
    die "You must first construct an LSA model"
        unless scalar(keys %{$self->{_doc_vecs_trunc_lsa}});
    # Build a term-frequency vector for the query over the full vocabulary,
    # zeroing it first since _query_vector persists between calls.
    foreach ( keys %{$self->{_vocab_hist}} ) {
        $self->{_query_vector}->{$_} = 0;
    }
    foreach (@$query) {
        $self->{_query_vector}->{"\L$_"}++
            if exists $self->{_vocab_hist}->{"\L$_"};
    }
    my @query_word_counts = values %{$self->{_query_vector}};
    my $query_word_count_total = reduce(\@query_word_counts);
    die "Query does not contain corpus words. Nothing retrieved."
        unless $query_word_count_total;
    # Vector components in sorted-key order, matching the document vectors.
    my $query_vec;
    foreach (sort keys %{$self->{_query_vector}}) {
        push @$query_vec, $self->{_query_vector}->{$_};
    }
    print "\n\nQuery vector: @$query_vec\n" if $self->{_debug};
    # Project the query into the reduced-dimensionality LSA space with the
    # truncator matrix computed during model construction.
    my $truncated_query_vec = $self->{_lsa_vec_truncator} x
                              PDL::Basic::transpose(pdl($query_vec));
    print "\n\nTruncated query vector: " . $truncated_query_vec . "\n"
        if $self->{_debug};
    # Score each document by the dot product of truncated query and
    # truncated document vectors.
    my %retrievals;
    foreach (sort keys %{$self->{_doc_vecs_trunc_lsa}}) {
        my $dot_product = PDL::Basic::transpose($truncated_query_vec)
                          x pdl($self->{_doc_vecs_trunc_lsa}->{$_});
        print "\n\nLSA: dot product of truncated query and\n" .
              " truncated vec for doc $_ is " . $dot_product->sclr . "\n"
            if $self->{_debug};
        $retrievals{$_} = $dot_product->sclr;
    }
    if ($self->{_debug}) {
        print "\n\nShowing LSA retrievals and similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_ => $retrievals{$_}\n";
        }
        print "\n\n";
    }
    return \%retrievals;
}
630              
##  Builds the term-frequency vector for one corpus file and, when the IDF
##  filter is on, its normalized (tf-idf / doc-length) counterpart.  Both are
##  stored on $self keyed by the file's path relative to the corpus root.
##  Returns nothing useful; silently skips (with an optional debug message)
##  files that cannot be opened.
sub _construct_doc_vector {
    my $self = shift;
    my $file = shift;
    # Start from a zeroed histogram over the full corpus vocabulary.  (The
    # original deep-copied the template and then zeroed every entry; building
    # the zeroed hash directly is equivalent and avoids the extra pass.)
    my %document_vector = map { $_ => 0 } keys %{$self->{_doc_hist_template}};
    my $min = $self->{_min_word_length};
    my $total_words_in_doc = 0;
    # Three-arg open with a lexical filehandle (original used a bareword
    # two-arg open).
    my $fh;
    unless (open $fh, '<', $file) {
        print "Unable to open file $file in the corpus: $!\n"
            if $self->{_debug};
        return;
    }
    while (<$fh>) {
        next if /^[ ]*\r?\n?$/;                 # skip blank lines
        $_ =~ s/\r?\n?$//;                      # strip DOS or Unix line ending
        my @brokenup = split /\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $_;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i; $1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_,
                       map &simple_stemmer($_, $self->{_debug}), @clean_words
            if $self->{_want_stemming};
        # Count only words that belong to the corpus vocabulary.
        if ($self->{_case_sensitive}) {
            $document_vector{$_}++
                for grep { exists $self->{_vocab_hist}->{$_} } @clean_words;
        } else {
            $document_vector{"\L$_"}++
                for grep { exists $self->{_vocab_hist}->{"\L$_"} } @clean_words;
        }
    }
    close $fh;
    die "Something went wrong. Doc vector size unequal to vocab size"
        unless $self->{_vocab_size} == scalar(keys %document_vector);
    $total_words_in_doc += $_ for values %document_vector;
    my %normalized_doc_vec;
    if ($self->{_idf_filter_option}) {
        # Guard against division by zero for a document that contains no
        # vocabulary words at all (every normalized component is then 0).
        my $doc_len = $total_words_in_doc || 1;
        foreach (keys %document_vector) {
            $normalized_doc_vec{$_} =
                $document_vector{$_} * $self->{_idf_t}->{$_} / $doc_len;
        }
    }
    # Key the stored vectors by the file path relative to the corpus root.
    # Capture into a lexical instead of relying on $1, which would hold a
    # stale value if the match failed (e.g. corpus dir with regex metachars).
    my $pwd = cwd;
    my ($subpath) = $pwd =~ m{$self->{_corpus_directory}.?(\S*)$};
    my $file_path_name =
        (defined $subpath && $subpath ne "") ? "$subpath/$file" : $file;
    $self->{_corpus_doc_vectors}->{$file_path_name} = \%document_vector;
    $self->{_normalized_doc_vecs}->{$file_path_name} = \%normalized_doc_vec;
}
686              
687             ################################### Drop Stop Words ###################################
688              
##  Removes every word listed in the user-supplied stop-words file from the
##  corpus vocabulary histogram, and remembers the stop words on
##  $self->{_stop_words}.  Lines starting with '#' and blank lines in the
##  stop-words file are ignored.
sub _drop_stop_words {
    my $self = shift;
    # Three-arg open with a lexical filehandle (original used a bareword
    # two-arg open and never closed the handle).
    open( my $fh, '<', "$self->{_working_directory}/$self->{_stop_words_file}" )
        or die "unable to open stop words file: $!";
    while (<$fh>) {
        next if /^#/;                           # comment line
        next if /^[ ]*\r?\n?$/;                 # blank line
        $_ =~ s/\r?\n?$//;                      # strip DOS or Unix line ending
        delete $self->{_vocab_hist}->{$_} if exists $self->{_vocab_hist}->{$_};
        unshift @{$self->{_stop_words}}, $_;
    }
    close $fh;
}
701              
702             ################################### Support Methods ####################################
703              
##  Sort comparator (used via the implicit $a/$b package variables) that
##  orders two document names in DESCENDING order of their cosine similarity
##  to the current query vector.  Returns -1/0/1 like any sort comparator.
sub _doc_vec_comparator {
    my $self = shift;
    my %query_vector = %{$self->{_query_vector}};
    # Compare against normalized or raw doc vectors depending on the option.
    my $doc_vecs = $self->{_idf_filter_option}
                       ? $self->{_normalized_doc_vecs}
                       : $self->{_corpus_doc_vectors};
    my $first_doc  = $doc_vecs->{$a};
    my $second_doc = $doc_vecs->{$b};
    # Assemble the three vectors in a fixed (sorted-vocabulary) order.
    my (@first, @second, @qvec);
    for my $word (sort keys %{$self->{_vocab_hist}}) {
        push @first,  $first_doc->{$word};
        push @second, $second_doc->{$word};
        push @qvec,   $query_vector{$word};
    }
    my $qvec_mag = vec_magnitude(\@qvec);
    my $sim_first  = vec_scalar_product(\@first, \@qvec)
                     / (vec_magnitude(\@first) * $qvec_mag);
    my $sim_second = vec_scalar_product(\@second, \@qvec)
                     / (vec_magnitude(\@second) * $qvec_mag);
    # Higher similarity sorts earlier.
    return $sim_first < $sim_second ?  1
         : $sim_first > $sim_second ? -1
         :                             0;
}
732              
##  Cosine similarity between one document's vector (normalized when the IDF
##  filter is on) and the current query vector stored on $self.
sub _similarity_to_query {
    my ($self, $doc_name) = @_;
    my $doc_vec = $self->{_idf_filter_option}
                      ? $self->{_normalized_doc_vecs}->{$doc_name}
                      : $self->{_corpus_doc_vectors}->{$doc_name};
    # Walk the vocabulary in a fixed order so both vectors line up.
    my (@vec, @qvec);
    foreach my $word (sort keys %$doc_vec) {
        push @vec,  $doc_vec->{$word};
        push @qvec, $self->{_query_vector}->{$word};
    }
    my $dot = vec_scalar_product(\@vec, \@qvec);
    return $dot / (vec_magnitude(\@vec) * vec_magnitude(\@qvec));
}
751              
752             ###################### Relevance Judgments for Testing Purposes #######################
753              
754             ## IMPORTANT: This estimation of document relevancies to queries is NOT for
755             ## serious work. A document is considered to be relevant to a
756             ## query if it contains several of the query words. As to the
757             ## minimum number of query words that must exist in a document
758             ## in order for the latter to be considered relevant is
759             ## determined by the relevancy_threshold parameter in the VSM
760             ## constructor. (See the relevancy and precision-recall related
761             ## scripts in the 'examples' directory.) The reason for why the
762             ## function shown below is not for serious work is because
763             ## ultimately it is the humans who are the best judges of the
764             ## relevancies of documents to queries. The humans bring to
765             ## bear semantic considerations on the relevancy determination
766             ## problem that are beyond the scope of this module.
767              
##  Estimates which corpus documents are relevant to each query in the query
##  file by counting query-word occurrences in every document (a document is
##  relevant when the count reaches the constructor's relevancy_threshold).
##  The judgments are written to the relevancy file.  NOT meant for serious
##  work -- humans are the real judges of relevancy (see the comment block
##  preceding this sub in the original source).
sub estimate_doc_relevancies {
    my $self = shift;
    die "You did not set the 'query_file' parameter in the constructor"
        unless $self->{_query_file};
    # Lexical filehandles and three-arg opens (original used bareword
    # handles, a two-arg open for output, and never closed either handle).
    open( my $qfh, '<', $self->{_query_file} )
        or die "unable to open the query file $self->{_query_file}: $!";
    croak "\n\nYou need to specify a name for the relevancy file in \n" .
          " in which the relevancy judgments will be dumped."
        unless $self->{_relevancy_file};
    # Parse lines of the form "qN: word word ..." into the query table.
    while (<$qfh>) {
        next if /^#/;                           # comment line
        next if /^[ ]*\r?\n?$/;                 # blank line
        $_ =~ s/\r?\n?$//;                      # strip line terminator
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
        /^[ ]*(q[0-9]+):[ ]*(.*)/;
        my $query_label = $1;
        my $query = $2;
        next unless $query;
        $self->{_queries_for_relevancy}->{$query_label} = $query;
    }
    close $qfh;
    if ($self->{_debug}) {
        foreach (sort keys %{$self->{_queries_for_relevancy}}) {
            print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
        }
    }
    # Scan the corpus; _scan_file_for_rels() fills _relevancy_estimates
    # while the _scan_dir_for_rels flag is up.
    $self->{_scan_dir_for_rels} = 1;
    $self->_scan_directory($self->{_corpus_directory});
    $self->{_scan_dir_for_rels} = 0;
    chdir $self->{_working_directory};
    open( my $out, '>', $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    my @relevancy_list_for_query;
    foreach (sort
             {get_integer_suffix($a) <=> get_integer_suffix($b)}
             keys %{$self->{_relevancy_estimates}}) {
        @relevancy_list_for_query =
            keys %{$self->{_relevancy_estimates}->{$_}};
        print $out "$_ => @relevancy_list_for_query\n\n";
        print "Number of relevant docs for query $_: " .
              scalar(@relevancy_list_for_query) . "\n";
    }
    close $out;
}
810              
811             # If there are available human-supplied relevancy judgments in a disk
812             # file, use this script to upload that information. One of the scripts
813             # in the 'examples' directory carries out the precision-recall analysis
814             # by using this approach. IMPORTANT: The human-supplied relevancy
815             # judgments must be in a format that is shown in the sample file
816             # relevancy.txt in the 'examples' directory.
##  Loads human-supplied relevancy judgments from the relevancy file instead
##  of estimating them.  Each line must look like "qN => doc1 doc2 ...";
##  every listed document gets the same (unit) relevancy rank.
sub upload_document_relevancies_from_file {
    my $self = shift;
    chdir $self->{_working_directory};
    # Three-arg open with a lexical filehandle (original used a bareword
    # two-arg open and never closed the handle).
    open( my $fh, '<', $self->{_relevancy_file} )
        or die "unable to open the relevancy file $self->{_relevancy_file}: $!";
    while (<$fh>) {
        next if /^#/;                           # comment line
        next if /^[ ]*\r?\n?$/;                 # blank line
        $_ =~ s/\r?\n?$//;                      # strip line terminator
        die "Format of query file is not correct" unless /^[ ]*q[0-9]+[ ]*=>/;
        /^[ ]*(q[0-9]+)[ ]*=>[ ]*(.*)/;
        my $query_label = $1;
        my $relevancy_docs_string = $2;
        next unless $relevancy_docs_string;
        my @relevancy_docs = grep $_, split / /, $relevancy_docs_string;
        my %relevancies = map {$_ => 1} @relevancy_docs;
        $self->{_relevancy_estimates}->{$query_label} = \%relevancies;
    }
    close $fh;
    if ($self->{_debug}) {
        for (sort keys %{$self->{_relevancy_estimates}}) {
            my @rels = keys %{$self->{_relevancy_estimates}->{$_}};
            print "$_ => @rels\n";
        }
    }
}
842              
##  Prints the stored relevancy judgments, one query at a time, with the
##  documents for each query listed in descending order of relevancy count.
sub display_doc_relevancies {
    my $self = shift;
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    print "\nDisplaying relevancy judgments:\n\n";
    foreach my $query (sort keys %{$self->{_relevancy_estimates}}) {
        print "Query $query\n";
        my $rels = $self->{_relevancy_estimates}->{$query};
        foreach my $file (sort { $rels->{$b} <=> $rels->{$a} } keys %$rels) {
            print " $file => $rels->{$file}\n";
        }
    }
}
860              
##  Counts, for every query, how many times its words occur in $file; when
##  the count reaches the relevancy threshold, the file is recorded as
##  relevant to that query in $self->{_relevancy_estimates}.
sub _scan_file_for_rels {
    my $self = shift;
    my $file = shift;
    # Checked three-arg open with a lexical handle (original used an
    # unchecked bareword open); an unreadable file is simply skipped,
    # matching the original's effective behavior (no text, no counts).
    my $fh;
    unless (open $fh, '<', $file) {
        print "Unable to open file $file: $!\n" if $self->{_debug};
        return;
    }
    my @all_text = <$fh>;
    close $fh;
    # Join the whole file into one string so counts span line boundaries.
    @all_text = grep $_, map {s/[\r]?\n$//; $_;} @all_text;
    my $all_text = join ' ', @all_text;
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        my $count = 0;
        my @query_words = grep $_,
                          split /\s+/, $self->{_queries_for_relevancy}->{$query};
        print "Query words for $query: @query_words\n" if $self->{_debug};
        foreach my $word (@query_words) {
            # \Q...\E so a query word containing regex metacharacters is
            # matched literally instead of being interpreted as a pattern.
            my @matches = $all_text =~ /\Q$word\E/gi;
            print "Number of occurrences for word '$word' in file $file: " .
                  scalar(@matches) . "\n" if $self->{_debug};
            $count += @matches if @matches;
        }
        print "\nRelevancy count for query $query and file $file: $count\n\n"
            if $self->{_debug};
        $self->{_relevancy_estimates}->{$query}->{$file} = $count
            if $count >= $self->{_relevancy_threshold};
    }
}
885              
886             ######################### Calculate Precision versus Recall ##########################
887              
##  Computes Precision@rank and Recall@rank arrays plus the average precision
##  (area under the precision-recall curve) for every query, using either the
##  'vsm' or 'lsa' retrieval engine as selected by $retrieval_type.  Results
##  are stored on $self (_precision_for_queries, _recall_for_queries,
##  _map_vals_for_queries).  Requires relevancy judgments to already exist.
sub precision_and_recall_calculator {
    my $self = shift;
    my $retrieval_type = shift;                 # 'vsm' or 'lsa'
    die "You must first estimate or provide the doc relevancies"
        unless scalar(keys %{$self->{_relevancy_estimates}});
    # If the queries were not already loaded (by estimate_doc_relevancies()
    # or upload_document_relevancies_from_file()), read them from the
    # query file now.  Lines look like "qN: word word ...".
    unless (scalar(keys %{$self->{_queries_for_relevancy}})) {
        open( IN, $self->{_query_file})
            or die "unable to open the query file $self->{_query_file}: $!";
        while (<IN>) {
            next if /^#/;                       # comment line
            next if /^[ ]*\r?\n?$/;             # blank line
            $_ =~ s/\r?\n?$//;                  # strip line terminator
            die "Format of query file is not correct" unless /^[ ]*q[0-9]+:/;
            /^[ ]*(q[0-9]+):[ ]*(.*)/;
            my $query_label = $1;
            my $query = $2;
            next unless $query;
            $self->{_queries_for_relevancy}->{$query_label} = $query;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying queries in the query file:\n\n";
            foreach (sort keys %{$self->{_queries_for_relevancy}}) {
                print "$_ => $self->{_queries_for_relevancy}->{$_}\n";
            }
        }
    }
    foreach my $query (sort keys %{$self->{_queries_for_relevancy}}) {
        print "\n\n\nQuery $query:\n" if $self->{_debug};
        my @query_words = grep $_,
                          split /\s+/, $self->{_queries_for_relevancy}->{$query};
        croak "\n\nYou have not specified the retrieval type for " .
              "precision-recall calculation. See code in 'examples'" .
              "directory:" if !defined $retrieval_type;
        my $retrievals;
        # The retrieval methods die when a query shares no words with the
        # corpus; trap that and skip the query rather than aborting the run.
        eval {
            if ($retrieval_type eq 'vsm') {
                $retrievals = $self->retrieve_with_vsm( \@query_words );
            } elsif ($retrieval_type eq 'lsa') {
                $retrievals = $self->retrieve_with_lsa( \@query_words );
            }
        };
        if ($@) {
            warn "\n\nNo relevant docs found for query $query.\n" .
                 "Will skip over this query for precision and\n" .
                 "recall calculations\n\n";
            next;
        }
        # Rank the retrieved documents in descending order of similarity.
        my %ranked_retrievals;
        my $i = 1;
        foreach (sort {$retrievals->{$b} <=> $retrievals->{$a}}
                 keys %$retrievals) {
            $ranked_retrievals{$i++} = $_;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked retrievals for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_retrievals) {
                print "$_ => $ranked_retrievals{$_}\n";
            }
        }
        # At this time, ranking of relevant documents based on their
        # relevancy counts serves no particular purpose since all we want
        # for the calculation of Precision and Recall are the total
        # number of relevant documents. However, I believe such a
        # ranking will play an important role in the future.
        # IMPORTANT: The relevancy judgments are ranked only when
        #            estimated by the method estimate_doc_relevancies()
        #            of the VSM class. When relevancies are supplied
        #            directly through a disk file, they all carry the
        #            same rank.
        my %ranked_relevancies;
        $i = 1;
        foreach my $file (sort {
                              $self->{_relevancy_estimates}->{$query}->{$b}
                              <=>
                              $self->{_relevancy_estimates}->{$query}->{$a}
                          }
                          keys %{$self->{_relevancy_estimates}->{$query}}) {
            $ranked_relevancies{$i++} = $file;
        }
        if ($self->{_debug}) {
            print "\n\nDisplaying ranked relevancies for query $query:\n\n";
            foreach (sort {$a <=> $b} keys %ranked_relevancies) {
                print "$_ => $ranked_relevancies{$_}\n";
            }
        }
        my @relevant_set = values %ranked_relevancies;
        warn "\n\nNo relevant docs found for query $query.\n" .
             "Will skip over this query for precision and\n" .
             "recall calculations\n\n" unless @relevant_set;
        next unless @relevant_set;
        print "\n\nRelevant set for query $query: @relevant_set\n\n"
            if $self->{_debug};
        my @retrieved;
        foreach (sort keys %ranked_retrievals) {
            push @retrieved, $ranked_retrievals{$_};
        }
        print "\n\nRetrieved set for query $query: @retrieved\n\n"
            if $self->{_debug};
        # Precision@rank = |retrieved[1..rank] ∩ relevant| / rank
        # Recall@rank    = |retrieved[1..rank] ∩ relevant| / |relevant|
        my @Precision_values = ();
        my @Recall_values = ();
        my $rank = 1;
        while ($rank < @retrieved + 1) {
            my $index = 1;
            my @retrieved_at_rank = ();
            while ($index <= $rank) {
                push @retrieved_at_rank, $ranked_retrievals{$index};
                $index++;
            }
            my $intersection = set_intersection(\@retrieved_at_rank,
                                                \@relevant_set);
            my $precision_at_rank = @retrieved_at_rank ?
                                    (@$intersection / @retrieved_at_rank) : 0;
            push @Precision_values, $precision_at_rank;
            my $recall_at_rank = @$intersection / @relevant_set;
            push @Recall_values, $recall_at_rank;
            $rank++;
        }
        print "\n\nFor query $query, precision values: @Precision_values\n"
            if $self->{_debug};
        print "\nFor query $query, recall values: @Recall_values\n"
            if $self->{_debug};
        $self->{_precision_for_queries}->{$query} = \@Precision_values;
        $self->{_recall_for_queries}->{$query} = \@Recall_values;
        my $area = 0;
        # Use trapezoidal rule to find the area under the precision-recall
        # curve; this area is the average precision for the query.
        for my $j (1..@Precision_values-1) {
            my $height = ($Precision_values[$j]+$Precision_values[$j-1])/2.0;
            my $base = ($Recall_values[$j] - $Recall_values[$j-1]);
            $area += $base * $height;
        }
        my $map_for_query = $area;
        print "\nMAP for query $query: $map_for_query\n" if $self->{_debug};
        $self->{_map_vals_for_queries}->{$query} = $map_for_query;
    }
}
1024              
##  Prints the average precision for each query (three decimal places) and
##  the overall MAP value, with queries ordered by their integer suffix.
sub display_map_values_for_queries {
    my $self = shift;
    my $map_vals = $self->{_map_vals_for_queries};
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %$map_vals);
    my $map = 0;
    print "\n\nDisplaying average precision for different queries:\n\n";
    foreach my $query (sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                       keys %$map_vals) {
        printf "Query %s => %.3f\n", $query, $map_vals->{$query};
        $map += $map_vals->{$query};
    }
    print "\n\n";
    my $avg_map_for_all_queries = $map / scalar(keys %$map_vals);
    print "MAP value: $avg_map_for_all_queries\n\n";
}
1044              
##  Prints, per query, the precision-at-rank and recall-at-rank arrays
##  rounded to three decimal places (index i corresponds to rank i+1).
sub display_precision_vs_recall_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_precision_for_queries}});
    print "\n\nDisplaying precision and recall values for different queries:\n\n";
    foreach my $query (sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
                       keys %{$self->{_map_vals_for_queries}}) {
        print "\n\nQuery $query:\n";
        print "\n (The first value is for rank 1, the second value at rank 2, and so on.)\n\n";
        my @precision_vals = map { sprintf "%.3f", $_ }
                             @{$self->{_precision_for_queries}->{$query}};
        print " Precision at rank => @precision_vals\n";
        my @recall_vals = map { sprintf "%.3f", $_ }
                          @{$self->{_recall_for_queries}->{$query}};
        print "\n Recall at rank => @recall_vals\n";
    }
    print "\n\n";
}
1064              
##  Returns a ref to the list of per-query average-precision values (as
##  "%.3f"-formatted strings), ordered by the queries' integer suffixes.
sub get_query_sorted_average_precision_for_queries {
    my $self = shift;
    die "You must first invoke precision_and_recall_calculator function"
        unless scalar(keys %{$self->{_map_vals_for_queries}});
    my @average_precisions_for_queries =
        map  { sprintf "%.3f", $self->{_map_vals_for_queries}->{$_} }
        sort { get_integer_suffix($a) <=> get_integer_suffix($b) }
        keys %{$self->{_map_vals_for_queries}};
    return \@average_precisions_for_queries;
}
1078              
1079             ################################### Utility Routines ###################################
1080              
##  Validates constructor option names.  Returns 1 when every supplied name
##  is a recognized option, 0 as soon as an unrecognized one is found (and,
##  like the original, undef when called with no names at all).
sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / corpus_directory
                            corpus_vocab_db
                            doc_vectors_db
                            normalized_doc_vecs_db
                            use_idf_filter
                            stop_words_file
                            file_types
                            case_sensitive
                            max_number_retrievals
                            query_file
                            relevancy_file
                            min_word_length
                            want_stemming
                            lsa_svd_threshold
                            relevancy_threshold
                            break_camelcased_and_underscored
                            save_model_on_disk
                            debug
                          /;
    my $found_match_flag;
    # Stop at the first option name that is not in the legal list.
    foreach my $param (@params) {
        $found_match_flag = (grep { $param eq $_ } @legal_params) ? 1 : 0;
        last unless $found_match_flag;
    }
    return $found_match_flag;
}
1115              
1116             # checks whether an element is in an array:
##  Returns the number of times $ele appears in the remaining arguments
##  (0 when absent), so the result doubles as a boolean membership test.
##  The original used map in a pseudo-void context for its side effect;
##  grep in scalar context expresses the count directly.
sub contained_in {
    my $ele = shift;
    my @array = @_;
    return scalar grep { $_ eq $ele } @array;
}
1124              
1125             # Meant only for an un-nested hash:
1126             sub deep_copy_hash {
1127 16     16 0 22 my $ref_in = shift;
1128 16         35 my $ref_out = {};
1129 16         24 foreach ( keys %{$ref_in} ) {
  16         649  
1130 1600         1934 $ref_out->{$_} = $ref_in->{$_};
1131             }
1132 16         592 return $ref_out;
1133             }
1134              
##  Dot product of two equal-length numeric vectors (array refs).  Croaks on
##  a length mismatch.  $product is initialized to 0 so empty vectors yield
##  0 rather than undef (the original left it uninitialized).
sub vec_scalar_product {
    my $vec1 = shift;
    my $vec2 = shift;
    croak "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    my $product = 0;
    for my $i (0..@$vec1-1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
1146              
##  Euclidean (L2) norm of a numeric vector passed as an array ref.
sub vec_magnitude {
    my $vec = shift;
    my $sum_of_squares = 0;
    $sum_of_squares += $_ * $_ for @$vec;
    return sqrt $sum_of_squares;
}
1155              
##  Sum of the elements of an array ref.  $result is initialized to 0 so an
##  empty array yields 0 rather than undef (the original left it
##  uninitialized).  (Local helper; unrelated to List::Util::reduce.)
sub reduce {
    my $vec = shift;
    my $result = 0;
    $result += $_ for @$vec;
    return $result;
}
1164              
1165             sub simple_stemmer {
1166 3310     3310 0 3262 my $word = shift;
1167 3310         2243 my $debug = shift;
1168 3310 50       4352 print "\nStemming the word: $word\n" if $debug;
1169 3310         12280 $word =~ s/(.*[a-z][^aeious])s$/$1/i;
1170 3310         4460 $word =~ s/(.*[a-z]s)es$/$1/i;
1171 3310         4185 $word =~ s/(.*[a-z][ck])es$/$1e/i;
1172 3310         11463 $word =~ s/(.*[a-z]+)tions$/$1tion/i;
1173 3310         9965 $word =~ s/(.*[a-z]+)mming$/$1m/i;
1174 3310         18276 $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1175 3310         10122 $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1176 3310         9483 $word =~ s/(.*[a-z]+)tices$/$1tex/i;
1177 3310         14624 $word =~ s/(.*[a-z]+)pes$/$1pe/i;
1178 3310         13976 $word =~ s/(.*[a-z]+)sed$/$1se/i;
1179 3310         16598 $word =~ s/(.*[a-z]+)ed$/$1/i;
1180 3310         7283 $word =~ s/(.*[a-z]+)tation$/$1t/i;
1181 3310 50       4433 print "Stemmed word: $word\n\n" if $debug;
1182 3310         8614 return $word;
1183             }
1184              
# Assumes the array is sorted in a descending order, as would be the
# case with an array of singular values produced by an SVD algorithm
sub return_index_of_last_value_above_threshold {
    my $pdl_obj = shift;                # 1-D piddle of (descending) singular values
    my $size = $pdl_obj->getdim(0);
    my $threshold = shift;              # fraction of the largest singular value
    # Values below (largest value * threshold) are treated as negligible.
    my $lower_bound = $pdl_obj->slice(0)->sclr * $threshold;
    # Walk forward while values stay above the bound; $i ends one past the
    # last qualifying index.
    my $i = 0;
    while ($i < $size && $pdl_obj->slice($i)->sclr > $lower_bound) {$i++;}
    # NOTE(review): for threshold < 1 the first element always qualifies, so
    # the return value is >= 0; a threshold >= 1 would yield -1 here.
    return $i-1;
}
1196              
##  Intersection of two sets given as array refs; returns a ref to the
##  elements of the second set that also occur in the first (an empty
##  array ref when the sets are disjoint).
sub set_intersection {
    my ($set1, $set2) = @_;
    my %seen;
    @seen{@$set1} = (1) x @$set1;
    my @common_elements = grep { $seen{$_} } @$set2;
    return @common_elements ? \@common_elements : [];
}
1204              
##  Trailing run of digits in a label, e.g. "q17" -> 17.  Returns the empty
##  string when the label does not end in a digit (the \d* always matches).
sub get_integer_suffix {
    my $label = shift;
    my ($suffix) = $label =~ /(\d*)$/;
    return $suffix;
}
1210              
1211             1;
1212              
1213             =pod
1214              
1215             =head1 NAME
1216              
1217             Algorithm::VSM --- A Perl module for retrieving files and documents from a software
1218             library with the VSM (Vector Space Model) and LSA (Latent Semantic Analysis)
1219             algorithms in response to search words and phrases.
1220              
1221             =head1 SYNOPSIS
1222              
1223             # FOR CONSTRUCTING A VSM MODEL FOR RETRIEVAL:
1224              
1225             use Algorithm::VSM;
1226              
1227             my $corpus_dir = "corpus";
1228             my @query = qw/ program ListIterator add ArrayList args /;
1229             my $stop_words_file = "stop_words.txt";
1230             my $vsm = Algorithm::VSM->new(
1231             break_camelcased_and_underscored => 1,
1232             case_sensitive => 0,
1233             corpus_directory => $corpus_dir,
1234             file_types => ['.txt', '.java'],
1235             max_number_retrievals => 10,
1236             min_word_length => 4,
1237             stop_words_file => $stop_words_file,
1238             use_idf_filter => 1,
1239             want_stemming => 1,
1240             );
1241             $vsm->get_corpus_vocabulary_and_word_counts();
1242             $vsm->display_corpus_vocab();
1243             $vsm->display_corpus_vocab_size();
1244             $vsm->write_corpus_vocab_to_file("vocabulary_dump.txt");
1245             $vsm->display_inverse_document_frequencies();
1246             $vsm->generate_document_vectors();
1247             $vsm->display_doc_vectors();
1248             $vsm->display_normalized_doc_vectors();
1249             my $retrievals = $vsm->retrieve_for_query_with_vsm( \@query );
1250             $vsm->display_retrievals( $retrievals );
1251              
1252             The purpose of each constructor option and what is accomplished by the method
1253             calls should be obvious by their names. If not, they are explained in greater
1254             detail elsewhere in this documentation page. Note that the methods
1255             display_corpus_vocab() and display_doc_vectors() are there only for testing
1256             purposes with small corpora. If you must use them for large libraries/corpora,
1257             you might wish to redirect the output to a file.
1258              
1259             By default, a call to a constructor calculates normalized term-frequency vectors
1260             for the documents. Normalization consists of first calculating the term
1261             frequency tf(t) of a term t in a document as a proportion of the total numbers
1262             of words in the document and then multiplying it by idf(t), where idf(t) stands
1263             for the inverse document frequency associated with that term. Note that 'word'
1264             and 'term' mean the same thing.
1265              
1266              
1267              
1268             # FOR CONSTRUCTING AN LSA MODEL FOR RETRIEVAL:
1269              
1270             my $lsa = Algorithm::VSM->new(
1271             break_camelcased_and_underscored => 1,
1272             case_sensitive => 0,
1273             corpus_directory => $corpus_dir,
1274             file_types => ['.txt', '.java'],
1275             lsa_svd_threshold => 0.01,
1276             max_number_retrievals => 10,
1277             min_word_length => 4,
1278             stop_words_file => $stop_words_file,
1279             use_idf_filter => 1,
1280             want_stemming => 1,
1281             );
1282             $lsa->get_corpus_vocabulary_and_word_counts();
1283             $lsa->display_corpus_vocab();
1284             $lsa->display_corpus_vocab_size();
1285             $lsa->write_corpus_vocab_to_file("vocabulary_dump.txt");
1286             $lsa->generate_document_vectors();
1287             $lsa->construct_lsa_model();
1288             my $retrievals = $lsa->retrieve_for_query_with_lsa( \@query );
1289             $lsa->display_retrievals( $retrievals );
1290              
1291             The initialization code before the constructor call and the calls for displaying
1292             the vocabulary and the vectors after the call remain the same as for the VSM case
1293             shown previously in this Synopsis. In the call above, the constructor parameter
1294             'lsa_svd_threshold' determines how many of the singular values will be retained
1295             after we have carried out an SVD decomposition of the term-frequency matrix for
1296             the documents in the corpus. Singular values smaller than this threshold
1297             fraction of the largest value are rejected.
1298              
1299              
1300              
1301             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM:
1302              
1303             my $corpus_dir = "corpus";
1304             my $stop_words_file = "stop_words.txt";
1305             my $query_file = "test_queries.txt";
1306             my $relevancy_file = "relevancy.txt"; # All relevancy judgments
1307             # will be stored in this file
1308             my $vsm = Algorithm::VSM->new(
1309             break_camelcased_and_underscored => 1,
1310             case_sensitive => 0,
1311             corpus_directory => $corpus_dir,
1312             file_types => ['.txt', '.java'],
1313             min_word_length => 4,
1314             query_file => $query_file,
1315             relevancy_file => $relevancy_file,
1316             relevancy_threshold => 5,
1317             stop_words_file => $stop_words_file,
1318             want_stemming => 1,
1319             );
1320             $vsm->get_corpus_vocabulary_and_word_counts();
1321             $vsm->generate_document_vectors();
1322             $vsm->estimate_doc_relevancies();
1323             $vsm->display_doc_relevancies(); # used only for testing
1324             $vsm->precision_and_recall_calculator('vsm');
1325             $vsm->display_precision_vs_recall_for_queries();
1326             $vsm->display_map_values_for_queries();
1327              
1328             Measuring precision and recall requires a set of queries. These are supplied
1329             through the constructor parameter 'query_file'. The format of the this file
1330             must be according to the sample file 'test_queries.txt' in the 'examples'
1331             directory. The module estimates the relevancies of the documents to the
1332             queries and dumps the relevancies in a file named by the 'relevancy_file'
1333             constructor parameter. The constructor parameter 'relevancy_threshold' is used
1334             to decide which of the documents are considered to be relevant to a query. A
1335             document must contain at least the 'relevancy_threshold' occurrences of query
1336             words in order to be considered relevant to a query.
1337              
1338              
1339              
1340             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA:
1341              
1342             my $lsa = Algorithm::VSM->new(
1343             break_camelcased_and_underscored => 1,
1344             case_sensitive => 0,
1345             corpus_directory => $corpus_dir,
1346             file_types => ['.txt', '.java'],
1347             lsa_svd_threshold => 0.01,
1348             min_word_length => 4,
1349             query_file => $query_file,
1350             relevancy_file => $relevancy_file,
1351             relevancy_threshold => 5,
1352             stop_words_file => $stop_words_file,
1353             want_stemming => 1,
1354             );
1355             $lsa->get_corpus_vocabulary_and_word_counts();
1356             $lsa->generate_document_vectors();
1357             $lsa->construct_lsa_model();
1358             $lsa->estimate_doc_relevancies();
1359             $lsa->display_doc_relevancies();
1360             $lsa->precision_and_recall_calculator('lsa');
1361             $lsa->display_precision_vs_recall_for_queries();
1362             $lsa->display_map_values_for_queries();
1363              
1364             We have already explained the purpose of the constructor parameter 'query_file'
1365             and about the constraints on the format of queries in the file named through
1366             this parameter. As mentioned earlier, the module estimates the relevancies of
1367             the documents to the queries and dumps the relevancies in a file named by the
1368             'relevancy_file' constructor parameter. The constructor parameter
1369             'relevancy_threshold' is used in deciding which of the documents are considered
1370             to be relevant to a query. A document must contain at least the
1371             'relevancy_threshold' occurrences of query words in order to be considered
1372             relevant to a query. We have previously explained the role of the constructor
1373             parameter 'lsa_svd_threshold'.
1374              
1375              
1376              
1377             # FOR MEASURING PRECISION VERSUS RECALL FOR VSM USING FILE-BASED RELEVANCE JUDGMENTS:
1378              
1379             my $corpus_dir = "corpus";
1380             my $stop_words_file = "stop_words.txt";
1381             my $query_file = "test_queries.txt";
1382             my $relevancy_file = "relevancy.txt";
1383             my $vsm = Algorithm::VSM->new(
1384             break_camelcased_and_underscored => 1,
1385             case_sensitive => 0,
1386             corpus_directory => $corpus_dir,
1387             file_types => ['.txt', '.java'],
1388             min_word_length => 4,
1389             query_file => $query_file,
1390             relevancy_file => $relevancy_file,
1391             stop_words_file => $stop_words_file,
1392             want_stemming => 1,
1393             );
1394             $vsm->get_corpus_vocabulary_and_word_counts();
1395             $vsm->generate_document_vectors();
1396             $vsm->upload_document_relevancies_from_file();
1397             $vsm->display_doc_relevancies();
1398             $vsm->precision_and_recall_calculator('vsm');
1399             $vsm->display_precision_vs_recall_for_queries();
1400             $vsm->display_map_values_for_queries();
1401              
1402             Now the filename supplied through the constructor parameter 'relevancy_file' must
1403             contain relevance judgments for the queries that are named in the file supplied
1404             through the parameter 'query_file'. The format of these two files must be
1405             according to what is shown in the sample files 'test_queries.txt' and
1406             'relevancy.txt' in the 'examples' directory.
1407              
1408              
1409              
1410             # FOR MEASURING PRECISION VERSUS RECALL FOR LSA USING FILE-BASED RELEVANCE JUDGMENTS:
1411              
1412             my $corpus_dir = "corpus";
1413             my $stop_words_file = "stop_words.txt";
1414             my $query_file = "test_queries.txt";
1415             my $relevancy_file = "relevancy.txt";
1416             my $lsa = Algorithm::VSM->new(
1417             break_camelcased_and_underscored => 1,
1418             case_sensitive => 0,
1419             corpus_directory => $corpus_dir,
1420             file_types => ['.txt', '.java'],
1421             lsa_svd_threshold => 0.01,
1422             min_word_length => 4,
1423             query_file => $query_file,
1424             relevancy_file => $relevancy_file,
1425             stop_words_file => $stop_words_file,
1426             want_stemming => 1,
1427             );
1428             $lsa->get_corpus_vocabulary_and_word_counts();
1429             $lsa->generate_document_vectors();
1430             $lsa->upload_document_relevancies_from_file();
1431             $lsa->display_doc_relevancies();
1432             $lsa->precision_and_recall_calculator('lsa');
1433             $lsa->display_precision_vs_recall_for_queries();
1434             $lsa->display_map_values_for_queries();
1435              
1436             As mentioned for the previous code block, the filename supplied through the
1437             constructor parameter 'relevancy_file' must contain relevance judgments for the
1438             queries that are named in the file supplied through the parameter 'query_file'.
1439             The format of this file must be according to what is shown in the sample file
1440             'relevancy.txt' in the 'examples' directory. We have already explained the roles
1441             played by the constructor parameters such as 'lsa_svd_threshold'.
1442              
1443              
1444              
1445             # FOR MEASURING THE SIMILARITY MATRIX FOR A SET OF DOCUMENTS:
1446              
1447             my $corpus_dir = "corpus";
1448             my $stop_words_file = "stop_words.txt";
1449             my $vsm = Algorithm::VSM->new(
1450             break_camelcased_and_underscored => 1,
1451             case_sensitive => 0,
1452             corpus_directory => $corpus_dir,
1453             file_types => ['.txt', '.java'],
1454             min_word_length => 4,
1455             stop_words_file => $stop_words_file,
1456             want_stemming => 1,
1457             );
1458             $vsm->get_corpus_vocabulary_and_word_counts();
1459             $vsm->generate_document_vectors();
1460             # code for calculating pairwise similarities as shown in the
1461             # script calculate_similarity_matrix_for_all_docs.pl in the
1462             # examples directory. This script makes calls to
1463             #
1464             # $vsm->pairwise_similarity_for_docs($docs[$i], $docs[$j]);
1465             #
1466             # for every pair of documents.
1467              
1468             =head1 CHANGES
1469              
1470 Version 1.62 removes the Perl version restriction on the module. This version also
1471 fixes two bugs, one in the file scanner code and the other in the
1472 precision-and-recall calculator. The file scanner bug was related to the new
1473 constructor parameter C<file_types> that was introduced in Version 1.61. And the
1474 precision-and-recall calculator bug was triggered if a query consisted solely of
1475 non-vocabulary words.
1476              
1477             Version 1.61 improves the implementation of the directory scanner to make it more
1478             platform independent. Additionally, you are now required to specify in the
1479             constructor call the file types to be considered for computing the database model.
1480             If, say, you have a large software library and you want only Java and text files to
1481 be scanned for creating the VSM (or the LSA) model, you must supply that information
1482 to the module by setting the constructor parameter C<file_types> to the anonymous
1483 list C<['.java', '.txt']>. An additional constructor parameter introduced in this
1484 version is C<case_sensitive>. If you set it to 1, that will force the database model
1485 and query matching to become case sensitive.
1486              
1487             Version 1.60 reflects the fact that people are now more likely to use this module by
1488             keeping the model constructed for a corpus in the fast memory (as opposed to storing
1489             the models in disk-based hash tables) for its repeated invocation for different
1490             queries. As a result, the default value for the constructor option
1491 C<save_model_on_disk> was changed from 1 to 0. For those who still wish to store on
1492             a disk the model that is constructed, the script
1493             C shows how you can do that.
1494             Other changes in 1.60 include a slight reorganization of the scripts in the
1495             C directory. Most scripts now do not by default store their models in
1496             disk-based hash tables. This reorganization is reflected in the description of the
1497             C directory in this documentation. The basic logic of constructing VSM and
1498             LSA models and how these are used for retrievals remains unchanged.
1499              
1500             Version 1.50 incorporates a couple of new features: (1) You now have the option to
1501             split camel-cased and underscored words for constructing your vocabulary set; and (2)
1502             Storing the VSM and LSA models in database files on the disk is now optional. The
1503             second feature, in particular, should prove useful to those who are using this module
1504             for large collections of documents.
1505              
1506 Version 1.42 includes two new methods, C<display_corpus_vocab_size()> and
1507 C<write_corpus_vocab_to_file()>, for those folks who deal with very large datasets.
1508 You can get a better sense of the overall vocabulary being used by the module for
1509 file retrieval by examining the contents of a dump file whose name is supplied as an
1510 argument to C<write_corpus_vocab_to_file()>.
1511              
1512             Version 1.41 downshifts the required version of the PDL module. Also cleaned up are
1513             the dependencies between this module and the submodules of PDL.
1514              
1515             Version 1.4 makes it easier for a user to calculate a similarity matrix over all the
1516             documents in the corpus. The elements of such a matrix express pairwise similarities
1517             between the documents. The pairwise similarities are based on the dot product of two
1518             document vectors divided by the product of the vector magnitudes. The 'examples'
1519             directory contains two scripts to illustrate how such matrices can be calculated by
1520             the user. The similarity matrix is output as a CSV file.
1521              
1522             Version 1.3 incorporates IDF (Inverse Document Frequency) weighting of the words in a
1523             document file. What that means is that the words that appear in most of the documents
1524             get reduced weighting since such words are non-discriminatory with respect to the
1525             retrieval of the documents. A typical formula that is used to calculate the IDF
1526             weight for a word is the logarithm of the ratio of the total number of documents to
1527             the number of documents in which the word appears. So if a word were to appear in
1528             all the documents, its IDF multiplier would be zero in the vector representation of a
1529             document. If so desired, you can turn off the IDF weighting of the words by
1530 explicitly setting the constructor parameter C<use_idf_filter> to zero.
1531              
1532             Version 1.2 includes a code correction and some general code and documentation
1533             cleanup.
1534              
1535             With Version 1.1, you can access the retrieval precision results so that you can
1536             compare two different retrieval algorithms (VSM or LSA with different choices for
1537             some of the constructor parameters) with significance testing. (Version 1.0 merely
1538             sent those results to standard output, typically your terminal window.) In Version
1539             1.1, the new script B in the 'examples' directory
1540             illustrates significance testing with Randomization and with Student's Paired t-Test.
1541              
1542             =head1 DESCRIPTION
1543              
1544             B is a I module for constructing a Vector Space Model (VSM) or
1545             a Latent Semantic Analysis Model (LSA) of a collection of documents, usually referred
1546             to as a corpus, and then retrieving the documents in response to search words in a
1547             query.
1548              
1549             VSM and LSA models have been around for a long time in the Information Retrieval (IR)
1550             community. More recently such models have been shown to be effective in retrieving
1551             files/documents from software libraries. For an account of this research that was
1552             presented by Shivani Rao and the author of this module at the 2011 Mining Software
1553             Repositories conference, see L.
1554              
1555             VSM modeling consists of: (1) Extracting the vocabulary used in a corpus. (2)
1556             Stemming the words so extracted and eliminating the designated stop words from the
1557             vocabulary. Stemming means that closely related words like 'programming' and
1558             'programs' are reduced to the common root word 'program' and the stop words are the
1559             non-discriminating words that can be expected to exist in virtually all the
1560             documents. (3) Constructing document vectors for the individual files in the corpus
1561             --- the document vectors taken together constitute what is usually referred to as a
1562             'term-frequency' matrix for the corpus. (4) Normalizing the document vectors to
1563             factor out the effect of document size and, if desired, multiplying the term
1564             frequencies by the IDF (Inverse Document Frequency) values for the words to reduce
1565             the weight of the words that appear in a large number of documents. (5) Constructing
1566             a query vector for the search query after the query is subject to the same stemming
1567             and stop-word elimination rules that were applied to the corpus. And, lastly, (6)
1568             Using a similarity metric to return the set of documents that are most similar to the
1569             query vector. The commonly used similarity metric is one based on the cosine
1570             distance between two vectors. Also note that all the vectors mentioned here are of
1571             the same size, the size of the vocabulary. An element of a vector is the frequency
1572             of occurrence of the word corresponding to that position in the vector.
1573              
1574             LSA modeling is a small variation on VSM modeling. Now you take VSM modeling one
1575             step further by subjecting the term-frequency matrix for the corpus to singular value
1576             decomposition (SVD). By retaining only a subset of the singular values (usually the
1577             N largest for some value of N), you can construct reduced-dimensionality vectors for
1578             the documents and the queries. In VSM, as mentioned above, the size of the document
1579             and the query vectors is equal to the size of the vocabulary. For large corpora,
1580             this size may involve tens of thousands of words --- this can slow down the VSM
1581             modeling and retrieval process. So you are very likely to get faster performance
1582             with retrieval based on LSA modeling, especially if you store the model once
1583             constructed in a database file on the disk and carry out retrievals using the
1584             disk-based model.
1585              
1586              
1587             =head1 CAN THIS MODULE BE USED FOR GENERAL TEXT RETRIEVAL?
1588              
1589             This module has only been tested for software retrieval. For more general text
1590             retrieval, you would need to replace the simple stemmer used in the module by one
1591             based on, say, Porter's Stemming Algorithm. You would also need to vastly expand the
1592             list of stop words appropriate to the text corpora of interest to you. As previously
1593             mentioned, the stop words are the commonly occurring words that do not carry much
1594             discriminatory power from the standpoint of distinguishing between the documents.
1595             See the file 'stop_words.txt' in the 'examples' directory for how such a file must be
1596             formatted.
1597              
1598              
1599             =head1 HOW DOES ONE DEAL WITH VERY LARGE LIBRARIES/CORPORA?
1600              
1601             It is not uncommon for large software libraries to consist of tens of thousands of
1602             documents that include source-code files, documentation files, README files,
1603             configuration files, etc. The bug-localization work presented recently by Shivani
1604             Rao and this author at the 2011 Mining Software Repository conference (MSR11) was
1605             based on a relatively small iBUGS dataset involving 6546 documents and a vocabulary
1606             size of 7553 unique words. (Here is a link to this work:
1607             L. Also note that the iBUGS dataset
1608             was originally put together by V. Dallmeier and T. Zimmermann for the evaluation of
1609             automated bug detection and localization tools.) If C is the size of the
1610             vocabulary and C the number of the documents in the corpus, the size of each
1611             vector will be C and size of the term-frequency matrix for the entire corpus will
1612             be CxC. So if you were to duplicate the bug localization experiments in
1613             L you would be dealing with vectors of
1614             size 7553 and a term-frequency matrix of size 7553x6546. Extrapolating these numbers
1615             to really large libraries/corpora, we are obviously talking about very large matrices
1616             for SVD decomposition. For large libraries/corpora, it would be best to store away
1617             the model in a disk file and to base all subsequent retrievals on the disk-stored
1618             models. The 'examples' directory contains scripts that carry out retrievals on the
1619             basis of disk-based models. Further speedup in retrieval can be achieved by using
1620             LSA to create reduced-dimensionality representations for the documents and by basing
1621             retrievals on the stored versions of such reduced-dimensionality representations.
1622              
1623              
1624             =head1 ESTIMATING RETRIEVAL PERFORMANCE WITH PRECISION VS. RECALL CALCULATIONS
1625              
1626             The performance of a retrieval algorithm is typically measured by two properties:
1627             C and C. As mentioned in the
1628             L publication, at a given rank C,
1629             C is the ratio of the number of retrieved documents that are relevant to
1630             the total number of retrieved documents up to that rank. And, along the same lines,
1631             C at a given rank C is the ratio of the number of retrieved documents that
1632             are relevant to the total number of relevant documents. The area under the
1633             C--C curve is called the C for a query. When
1634             the C is averaged over all the queries, we obtain what is known as
1635             C (MAP). For an oracle, the value of MAP should be 1.0. On
1636             the other hand, for purely random retrieval from a corpus, the value of MAP will be
1637             inversely proportional to the size of the corpus. (See the discussion in
1638             L for further
1639             explanation on these retrieval precision evaluators.) This module includes methods
1640             that allow you to carry out these retrieval accuracy measurements using the relevancy
1641             judgments supplied through a disk file. If human-supplied relevancy judgments are
1642             not available, the module will be happy to estimate relevancies for you just by
1643             determining the number of query words that exist in a document. Note, however, that
1644             relevancy judgments estimated in this manner cannot be trusted. That is because
1645             ultimately it is the humans who are the best judges of the relevancies of documents
1646             to queries. The humans bring to bear semantic considerations on the relevancy
1647             determination problem that are beyond the scope of this module.
1648              
1649              
1650             =head1 METHODS
1651              
1652             The module provides the following methods for constructing VSM and LSA models of a
1653             corpus, for using the models thus constructed for retrieval, and for carrying out
1654             precision versus recall calculations for the determination of retrieval accuracy on
1655             the corpora of interest to you.
1656              
1657             =over
1658              
1659             =item B
1660              
1661             A call to C constructs a new instance of the C class:
1662              
1663             my $vsm = Algorithm::VSM->new(
1664             break_camelcased_and_underscored => 1,
1665             case_sensitive => 0,
1666             corpus_directory => "",
1667             corpus_vocab_db => "corpus_vocab_db",
1668             doc_vectors_db => "doc_vectors_db",
1669             file_types => $my_file_types,
1670             lsa_svd_threshold => 0.01,
1671             max_number_retrievals => 10,
1672             min_word_length => 4,
1673             normalized_doc_vecs_db => "normalized_doc_vecs_db",
1674             query_file => "",
1675             relevancy_file => $relevancy_file,
1676             relevancy_threshold => 5,
1677             save_model_on_disk => 0,
1678             stop_words_file => "",
1679             use_idf_filter => 1,
1680             want_stemming => 1,
1681             );
1682              
1683             The values shown on the right side of the big arrows are the B
1684             parameters>. The value supplied through the variable C<$my_file_types> would be
1685             something like C<['.java', '.txt']> if, say, you wanted only Java and text files to
1686             be included in creating the database model. The following nested list will now
1687             describe each of the constructor parameters shown above:
1688              
1689             =over 16
1690              
1691             =item I<break_camelcased_and_underscored>
1692              
1693             The parameter B<break_camelcased_and_underscored> when set causes the
1694             underscored and camel-cased words to be split. By default the parameter is
1695             set. So if you don't want such words to be split, you must set it
1696             explicitly to 0.
1697              
1698             =item I
1699              
1700             When set to 1, this parameter forces the module to maintain the case of the terms in
1701             the corpus files when creating the vocabulary and the document vectors. Setting
1702             C to 1 also causes the query matching to become case sensitive.
1703             (This constructor parameter was introduced in Version 1.61.)
1704              
1705             =item I
1706              
1707             The parameter B points to the root of the directory of documents
1708             for which you want to create a VSM or LSA model.
1709              
1710             =item I
1711              
1712             The parameter B is for naming the DBM in which the corpus vocabulary
1713             will be stored after it is subject to stemming and the elimination of stop words.
1714             Once a disk-based VSM model is created and stored away in the file named by this
1715             parameter and the parameter to be described next, it can subsequently be used
1716             directly for speedier retrieval.
1717              
1718             =item I
1719              
1720             The database named by B stores the document vector representation for
1721             each document in the corpus. Each document vector has the same size as the
1722             corpus-wide vocabulary; each element of such a vector is the number of occurrences of
1723             the word that corresponds to that position in the vocabulary vector.
1724              
1725             =item I
1726              
1727             This parameter tells the module what types of files in the corpus directory you want
1728             scanned for creating the database model. The value supplied for this parameter is an
1729             anonymous list of the file suffixes for the file types. For example, if you wanted
1730             only Java and text files to be scanned, you will set this parameter to C<['.java',
1731             '.txt']>. The module throws an exception if this parameter is left unspecified.
1732             (This constructor parameter was introduced in Version 1.61.)
1733              
1734             =item I<lsa_svd_threshold:>
1735              
1736             The parameter B<lsa_svd_threshold> is used for rejecting singular values that are
1737             smaller than this threshold fraction of the largest singular value. This plays a
1738             critical role in creating reduced-dimensionality document vectors in LSA modeling of
1739             a corpus.
1740              
1741             =item I<max_number_retrievals:>
1742              
1743             The constructor parameter B<max_number_retrievals> stands for what it means.
1744              
1745             =item I<min_word_length:>
1746              
1747             The parameter B<min_word_length> sets the minimum number of characters in a
1748             word in order for it to be included in the corpus vocabulary.
1749              
1750             =item I<normalized_doc_vecs_db:>
1751              
1752             The database named by B<normalized_doc_vecs_db> stores the normalized document
1753             vectors. Normalization consists of factoring out the size of the documents by
1754             dividing the term frequency for each word in a document by the number of words in the
1755             document, and then multiplying the result by the idf (Inverse Document Frequency)
1756             value for the word.
1757              
1758             =item I<query_file:>
1759              
1760             The parameter B<query_file> points to a file that contains the queries to be used for
1761             calculating retrieval performance with C<Precision> and C<Recall> numbers. The format
1762             of the query file must be as shown in the sample file C<test_queries.txt> in the
1763             'examples' directory.
1764              
1765             =item I<relevancy_file:>
1766              
1767             This option names the disk file for storing the relevancy judgments.
1768              
1769             =item I<relevancy_threshold:>
1770              
1771             The constructor parameter B<relevancy_threshold> is used for automatic determination
1772             of document relevancies to queries on the basis of the number of occurrences of query
1773             words in a document. You can exercise control over the process of determining
1774             relevancy of a document to a query by giving a suitable value to the constructor
1775             parameter B<relevancy_threshold>. A document is considered relevant to a query only
1776             when the document contains at least B<relevancy_threshold> number of query words.
1777              
1778             =item I<save_model_on_disk:>
1779              
1780             The constructor parameter B<save_model_on_disk> will cause the basic
1781             information about the VSM and the LSA models to be stored on the disk.
1782             Subsequently, any retrievals can be carried out from the disk-based model.
1783              
1784             =item I<stop_words_file:>
1785              
1786             The parameter B<stop_words_file> is for naming the file that contains the stop words
1787             that you do not wish to include in the corpus vocabulary. The format of this file
1788             must be as shown in the sample file C<stop_words.txt> in the 'examples' directory.
1789              
1790             =item I<use_idf_filter:>
1791              
1792             The constructor parameter B<use_idf_filter> is set by default. If you want
1793             to turn off the normalization of the document vectors, including turning
1794             off the weighting of the term frequencies of the words by their idf values,
1795             you must set this parameter explicitly to 0.
1796              
1797             =item I<want_stemming:>
1798              
1799             The boolean parameter B<want_stemming> determines whether or not the words extracted
1800             from the documents would be subject to stemming. As mentioned elsewhere, stemming
1801             means that related words like 'programming' and 'programs' would both be reduced to
1802             the root word 'program'.
1803              
1804             =back
1805              
1806             =begin html
1807              
1808            
1809              
1810             =end html
1811              
1812             =item B<construct_lsa_model()>
1813              
1814             You call this subroutine for constructing an LSA model for your corpus
1815             after you have extracted the corpus vocabulary and constructed document
1816             vectors:
1817              
1818             $vsm->construct_lsa_model();
1819              
1820             The SVD decomposition that is carried out in LSA model construction uses the
1821             constructor parameter C<lsa_svd_threshold> to decide how many of the singular values
1822             to retain for the LSA model. A singular value is retained only if it is larger than the
1823             C<lsa_svd_threshold> fraction of the largest singular value.
1824              
1825              
1826             =item B<display_corpus_vocab()>
1827              
1828             If you would like to see corpus vocabulary as constructed by the previous call, make
1829             the call
1830              
1831             $vsm->display_corpus_vocab();
1832              
1833             Note that this is a useful thing to do only on small test corpora. If you need
1834             to examine the vocabulary for a large corpus, call the two methods listed below.
1835              
1836              
1837             =item B<display_corpus_vocab_size()>
1838              
1839             If you would like for the module to print out in your terminal window the size of the
1840             vocabulary, make the call
1841              
1842             $vsm->display_corpus_vocab_size();
1843              
1844              
1845             =item B<display_doc_relevancies()>
1846              
1847             If you would like to see the document relevancies generated by the previous method,
1848             you can call
1849              
1850             $vsm->display_doc_relevancies()
1851              
1852              
1853             =item B<display_doc_vectors()>
1854              
1855             If you would like to see the document vectors constructed by the previous call, make
1856             the call:
1857              
1858             $vsm->display_doc_vectors();
1859              
1860             Note that this is a useful thing to do only on small test corpora. If you must call
1861             this method on a large corpus, you might wish to direct the output to a file.
1862              
1863              
1864             =item B<display_inverse_document_frequencies()>
1865              
1866             You can display the idf value associated with each word in the corpus by
1867              
1868             $vsm->display_inverse_document_frequencies();
1869              
1870             The idf of a word in the corpus is calculated typically as the logarithm of the ratio
1871             of the total number of documents in the corpus to the number of documents in which
1872             the word appears (with protection built in to prevent division by zero). Ideally, if
1873             a word appears in all the documents, its idf would be small, close to zero. Words
1874             with small idf values are non-discriminatory and should get reduced weighting in
1875             document retrieval.
1876              
1877              
1878             =item B<display_map_values_for_queries()>
1879              
1880             The area under the precision vs. recall curve for a given query is called C<Average
1881             Precision> for that query. When this area is averaged over all the queries, you get
1882             C<MAP> (Mean Average Precision) as a measure of the accuracy of the retrieval
1883             algorithm. The C<Average Precision> values for the queries and the overall C<MAP>
1884             can be printed out by calling
1885              
1886             $vsm->display_map_values_for_queries();
1887              
1888              
1889             =item B<display_normalized_doc_vectors()>
1890              
1891             If you would like to see the normalized document vectors, make the call:
1892              
1893             $vsm->display_normalized_doc_vectors();
1894              
1895             See the comment made previously as to what is meant by the normalization of a
1896             document vector.
1897              
1898              
1899             =item B<display_precision_vs_recall_for_queries()>
1900              
1901             A call to C<precision_and_recall_calculator()> will normally be followed by the
1902             following call
1903              
1904             $vsm->display_precision_vs_recall_for_queries();
1905              
1906             for displaying the C<Precision> and C<Recall> values.
1907              
1908              
1909             =item B<display_retrievals( $retrievals )>
1910              
1911             You can display the retrieved document names by calling this method using the syntax:
1912              
1913             $vsm->display_retrievals( $retrievals );
1914              
1915             where C<$retrievals> is a reference to the hash returned by a call to one of the
1916             C<retrieve> methods. The display method shown here respects the retrieval size
1917             constraints expressed by the constructor parameter C<max_number_retrievals>.
1918              
1919              
1920             =item B<estimate_doc_relevancies()>
1921              
1922             Before you can carry out precision and recall calculations to test the accuracy of
1923             VSM and LSA based retrievals from a corpus, you need to have available the relevancy
1924             judgments for the queries. (A relevancy judgment for a query is simply the list of
1925             documents relevant to that query.) Relevancy judgments are commonly supplied by the
1926             humans who are familiar with the corpus. But if such human-supplied relevance
1927             judgments are not available, you can invoke the following method to estimate them:
1928              
1929             $vsm->estimate_doc_relevancies();
1930              
1931             For the above method call, a document is considered to be relevant to a query if it
1932             contains several of the query words. As to the minimum number of query words that
1933             must exist in a document in order for the latter to be considered relevant, that is
1934             determined by the C<relevancy_threshold> parameter in the VSM constructor.
1935              
1936             But note that this estimation of document relevancies to queries is NOT for serious
1937             work. The reason for that is because ultimately it is the humans who are the best
1938             judges of the relevancies of documents to queries. The humans bring to bear semantic
1939             considerations on the relevancy determination problem that are beyond the scope of
1940             this module.
1941              
1942             The generated relevancies are deposited in a file named by the constructor parameter
1943             C<relevancy_file>.
1944              
1945              
1946             =item B<get_all_document_names()>
1947              
1948             If you want to get hold of all the filenames in the corpus in your own script, you
1949             can call
1950              
1951             my @docs = @{$vsm->get_all_document_names()};
1952              
1953             The array on the left will contain an alphabetized list of the files.
1954              
1955              
1956             =item B<generate_document_vectors()>
1957              
1958             This is a necessary step after the vocabulary used by a corpus is constructed. (Of
1959             course, if you will be doing document retrieval through a disk-stored VSM or LSA
1960             model, then you do not need to call this method.) You construct document vectors
1961             through the following call:
1962              
1963             $vsm->generate_document_vectors();
1964              
1965              
1966             =item B<get_corpus_vocabulary_and_word_counts()>
1967              
1968             After you have constructed a new instance of the C class, you must
1969             now scan the corpus documents for constructing the corpus vocabulary. This you do by:
1970              
1971             $vsm->get_corpus_vocabulary_and_word_counts();
1972              
1973             The only time you do NOT need to call this method is when you are using a previously
1974             constructed disk-stored VSM model for retrieval.
1975              
1976              
1977             =item B<get_query_sorted_average_precision_for_queries()>
1978              
1979             If you want to run significance tests on the retrieval accuracies you obtain on a
1980             given corpus and with different algorithms (VSM or LSA with different choices for the
1981             constructor parameters), your own script would need access to the average precision
1982             data for a set of queries. You can get hold of this data by calling
1983              
1984             $vsm->get_query_sorted_average_precision_for_queries();
1985              
1986             The script C<significance_testing.pl> in the 'examples' directory shows how you can
1987             use this method for significance testing.
1988              
1989              
1990             =item B<pairwise_similarity_for_docs()>
1991              
1992             =item B<pairwise_similarity_for_normalized_docs()>
1993              
1994             If you would like to compare in your own script any two documents in the corpus, you
1995             can call
1996              
1997             my $similarity = $vsm->pairwise_similarity_for_docs("filename_1", "filename_2");
1998             or
1999             my $similarity = $vsm->pairwise_similarity_for_normalized_docs("filename_1", "filename_2");
2000              
2001             Both these calls return a number that is the dot product of the two document vectors
2002             normalized by the product of their magnitudes. The first call uses the regular
2003             document vectors and the second the normalized document vectors.
2004              
2005              
2006             =item B<precision_and_recall_calculator()>
2007              
2008             After you have created or obtained the relevancy judgments for your test queries, you
2009             can make the following call to calculate C<Precision> and C<Recall>:
2010              
2011             $vsm->precision_and_recall_calculator('vsm');
2012             or
2013             $vsm->precision_and_recall_calculator('lsa');
2014              
2015             depending on whether you are testing VSM-based retrieval or LSA-based retrieval.
2016              
2017             =item B<retrieve_with_lsa()>
2018              
2019             After you have built an LSA model through the call to C<construct_lsa_model()>, you
2020             can retrieve the document names most similar to the query by:
2021              
2022             my $retrievals = $vsm->retrieve_with_lsa( \@query );
2023              
2024             Subsequently, you can display the retrievals by calling the
2025             C<display_retrievals()> method described previously.
2026              
2027              
2028             =item B<retrieve_with_vsm()>
2029              
2030             After you have constructed a VSM model, you call this method for document retrieval
2031             for a given query C<@query>. The call syntax is:
2032              
2033             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2034              
2035             The argument, C<@query>, is simply a list of words that you wish to use for
2036             retrieval. The method returns a hash whose keys are the document names and whose
2037             values the similarity distance between the document and the query. As is commonly
2038             the case with VSM, this module uses the cosine similarity distance when comparing a
2039             document vector with the query vector.
2040              
2041              
2042             =item B<upload_document_relevancies_from_file()>
2043              
2044             When human-supplied relevancies are available, you can upload them into the program
2045             by calling
2046              
2047             $vsm->upload_document_relevancies_from_file();
2048              
2049             These relevance judgments will be read from a file that is named with the
2050             C<relevancy_file> constructor parameter.
2051              
2052              
2053             =item B<upload_normalized_vsm_model_from_disk()>
2054              
2055             When you invoke the methods C<get_corpus_vocabulary_and_word_counts()> and
2056             C<generate_document_vectors()>, that automatically deposits the VSM model in the
2057             database files named with the constructor parameters C<corpus_vocab_db>,
2058             C<doc_vectors_db> and C<normalized_doc_vecs_db>. Subsequently, you can carry out
2059             retrieval by directly using this disk-based VSM model for speedier performance. In
2060             order to do so, you must upload the disk-based model by
2061              
2062             $vsm->upload_normalized_vsm_model_from_disk();
2063              
2064             Subsequently you call
2065              
2066             my $retrievals = $vsm->retrieve_with_vsm( \@query );
2067             $vsm->display_retrievals( $retrievals );
2068              
2069             for retrieval and for displaying the results.
2070              
2071              
2072             =item B<write_corpus_vocab_to_file($filename)>
2073              
2074             This is the method to call for large text corpora if you would like to examine the
2075             vocabulary created. The call syntax is
2076              
2077             $vsm->write_corpus_vocab_to_file($filename);
2078              
2079             where C<$filename> is the name of the file that you want the vocabulary to be written
2080             out to. This call will also show the frequency of each vocabulary word in your
2081             corpus.
2082              
2083              
2084             =back
2085              
2086              
2087             =head1 REQUIRED
2088              
2089             This module requires the following modules:
2090              
2091             SDBM_File
2092             Storable
2093             PDL
2094             File::Basename
2095             File::Spec::Functions
2096              
2097             The first two of these are needed for creating disk-based database records for the
2098             VSM and LSA models. The third is needed for calculating the SVD of the
2099             term-frequency matrix. (PDL stands for Perl Data Language.) The last two are needed
2100             by the directory scanner to make pathnames platform independent.
2101              
2102             =head1 EXAMPLES
2103              
2104             See the 'examples' directory in the distribution for the scripts listed below:
2105              
2106             =over
2107              
2108             =item B
2109              
2110             For basic VSM-based model construction and retrieval, run the script:
2111              
2112             retrieve_with_VSM.pl
2113              
2114             Starting with version 1.60, this script does not store away the VSM model in
2115             disk-based hash tables. If you want your model to be stored on the disk, you must
2116             run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.
2117              
2118             =item B
2119              
2120             If you want to run an infinite loop for repeated retrievals from a VSM model, run the
2121             script
2122              
2123             continuously_running_VSM_retrieval_engine.pl
2124              
2125             You can create a script similar to this for doing the same with LSA models.
2126              
2127             =item B
2128              
2129             For storing the model information in disk-based DBM files that can subsequently be
2130             used for both VSM and LSA retrieval, run the script:
2131              
2132             retrieve_with_VSM_and_also_create_disk_based_model.pl
2133              
2134             =item B
2135              
2136             For basic LSA-based model construction and retrieval, run the script:
2137              
2138             retrieve_with_LSA.pl
2139              
2140             Starting with version 1.60, this script does not store away the model information in
2141             disk-based hash tables. If you want your model to be stored on the disk, you must
2142             run the script C<retrieve_with_VSM_and_also_create_disk_based_model.pl> for that.
2143              
2144             =item B
2145              
2146             If you have previously run a script like
2147             C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script
2148              
2149             retrieve_with_disk_based_VSM.pl
2150              
2151             for repeated VSM-based retrievals from a disk-based model.
2152              
2153             =item B
2154              
2155             If you have previously run a script like
2156             C<retrieve_with_VSM_and_also_create_disk_based_model.pl>, you can run the script
2157              
2158             retrieve_with_disk_based_LSA.pl
2159              
2160             for repeated LSA-based retrievals from a disk-based model.
2161              
2162             =item B
2163              
2164             To experiment with precision and recall calculations for VSM retrieval, run the
2165             script:
2166              
2167             calculate_precision_and_recall_for_VSM.pl
2168              
2169             Note that this script will carry out its own estimation of relevancy judgments ---
2170             which in most cases would not be a safe thing to do.
2171              
2172             =item B
2173              
2174             To experiment with precision and recall calculations for LSA retrieval, run the
2175             script:
2176              
2177             calculate_precision_and_recall_for_LSA.pl
2178              
2179             Note that this script will carry out its own estimation of relevancy judgments ---
2180             which in most cases would not be a safe thing to do.
2181              
2182             =item B<For Precision and Recall Calculations for VSM with
2183             Human-Supplied Relevancies:>
2184              
2185             Precision and recall calculations for retrieval accuracy determination are best
2186             carried out with human-supplied judgments of relevancies of the documents to queries.
2187             If such judgments are available, run the script:
2188              
2189             calculate_precision_and_recall_from_file_based_relevancies_for_VSM.pl
2190              
2191             This script will print out the average precisions for the different test queries and
2192             calculate the MAP metric of retrieval accuracy.
2193              
2194             =item B<For Precision and Recall Calculations for LSA with
2195             Human-Supplied Relevancies:>
2196              
2197             If human-supplied relevancy judgments are available and you wish to experiment with
2198             precision and recall calculations for LSA-based retrieval, run the script:
2199              
2200             calculate_precision_and_recall_from_file_based_relevancies_for_LSA.pl
2201              
2202             This script will print out the average precisions for the different test queries and
2203             calculate the MAP metric of retrieval accuracy.
2204              
2205             =item B<To carry out significance tests on the retrieval accuracies with
2206             Randomization or with Student's Paired t-Test:>
2207              
2208             significance_testing.pl randomization
2209              
2210             or
2211              
2212             significance_testing.pl t-test
2213              
2214             Significance testing consists of forming a null hypothesis that the two retrieval
2215             algorithms you are considering are the same from a black-box perspective and then
2216             calculating what is known as a C<p-value>. If the C<p-value> is less than, say,
2217             0.05, you reject the null hypothesis.
2218              
2219             =item B
2220              
2221             calculate_similarity_matrix_for_all_docs.pl
2222              
2223             or
2224              
2225             calculate_similarity_matrix_for_all_normalized_docs.pl
2226              
2227             The former uses regular document vectors for calculating the similarity between every
2228             pair of documents in the corpus. And the latter uses normalized document vectors for
2229             the same purpose. The document order used for row and column indexing of the matrix
2230             corresponds to the alphabetic ordering of the document names in the corpus directory.
2231              
2232             =back
2233              
2234              
2235             =head1 EXPORT
2236              
2237             None by design.
2238              
2239             =head1 SO THAT YOU DO NOT LOSE RELEVANCY JUDGMENTS
2240              
2241             You have to be careful when carrying out Precision versus Recall calculations if you
2242             do not wish to lose the previously created relevancy judgments. Invoking the method
2243             C<estimate_doc_relevancies()> in your own script will cause the file C<relevancy.txt>
2244             to be overwritten. If you have created a relevancy database and stored it in a file
2245             called, say, C<relevancy.txt>, you should make a backup copy of this file before
2246             executing a script that calls C<estimate_doc_relevancies()>.
2247              
2248             =head1 BUGS
2249              
2250             Please notify the author if you encounter any bugs. When sending email, please place
2251             the string 'VSM' in the subject line to get past my spam filter.
2252              
2253             =head1 INSTALLATION
2254              
2255             Download the archive from CPAN in any directory of your choice. Unpack the archive
2256             with a command that on a Linux machine would look like:
2257              
2258             tar zxvf Algorithm-VSM-1.62.tar.gz
2259              
2260             This will create an installation directory for you whose name will be
2261             C<Algorithm-VSM-1.62>. Enter this directory and execute the following commands for a
2262             standard install of the module if you have root privileges:
2263              
2264             perl Makefile.PL
2265             make
2266             make test
2267             sudo make install
2268              
2269             If you do not have root privileges, you can carry out a non-standard install the
2270             module in any directory of your choice by:
2271              
2272             perl Makefile.PL prefix=/some/other/directory/
2273             make
2274             make test
2275             make install
2276              
2277             With a non-standard install, you may also have to set your PERL5LIB environment
2278             variable so that this module can find the required other modules. How you do that
2279             would depend on what platform you are working on. In order to install this module in
2280             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
2281             variable by
2282              
2283             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2284              
2285             If I used bash, I'd need to declare:
2286              
2287             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
2288              
2289              
2290             =head1 THANKS
2291              
2292             Many thanks are owed to Shivani Rao and Bunyamin Sisman for sharing with me their
2293             deep insights in IR. Version 1.4 was prompted by Zahn Bozanic's interest in
2294             similarity matrix characterization of a corpus. Thanks, Zahn!
2295              
2296             Several of the recent changes to the module are a result of the feedback I have
2297             received from Naveen Kulkarni of Infosys Labs. Thanks, Naveen!
2298              
2299             Version 1.62 was a result of Slaven Rezic's recommendation that I remove the Perl
2300             version restriction on the module since he was able to run it with Perl version
2301             5.8.9. Another important reason for v. 1.62 was the discovery of the two bugs
2302             mentioned in Changes, one of them brought to my attention by Naveen Kulkarni.
2303              
2304             =head1 AUTHOR
2305              
2306             Avinash Kak, kak@purdue.edu
2307              
2308             If you send email, please place the string "VSM" in your subject line to get past my
2309             spam filter.
2310              
2311             =head1 COPYRIGHT
2312              
2313             This library is free software; you can redistribute it and/or modify it under the
2314             same terms as Perl itself.
2315              
2316             Copyright 2015 Avinash Kak
2317              
2318             =cut
2319              
2320