File Coverage

blib/lib/Algorithm/TicketClusterer.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Algorithm::TicketClusterer;
2
3				#---------------------------------------------------------------------------
4				# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is
5				# free software. You may modify and/or distribute it under the same terms
6				# as Perl itself. This copyright notice must remain attached to the file.
7				#
8				# Algorithm::TicketClusterer is a Perl module for retrieving Excel-stored
9				# past tickets that are most similar to a new ticket. Tickets are commonly
10				# used in software services industry and customer support businesses to
11				# record requests for service, product complaints, user feedback, and so
12				# on.
13				# ---------------------------------------------------------------------------
14
15	1	1	12990	use 5.10.0;
	1		3
	1		37
16	1	1	4	use strict;
	1		1
	1		22
17	1	1	3	use warnings;
	1		10
	1		28
18	1	1	3	use Carp;
	1		1
	1		75
19	1	1	612	use Storable;
	1		2749
	1		69
20	1	1	837	use Spreadsheet::ParseExcel;
	1		52964
	1		33
21	1	1	536	use Spreadsheet::XLSX;
	1		64852
	1		33
22	1	1	1084	use WordNet::QueryData;
	0
	0
23				use Text::Iconv;
24				use SDBM_File;
25				use Fcntl;
26
27				our $VERSION = '1.01';
28
29				############################### The Constructor #############################
30
31				sub new {
32				my ($class, %args) = @_;
33				my @params = keys %args;
34				croak "\nYou have used a wrong name for a keyword argument " .
35				"--- perhaps a misspelling\n"
36				if _check_for_illegal_params(@params) == 0;
37				bless {
38				_excel_filename => $args{excel_filename},
39				_which_worksheet => $args{which_worksheet},
40				_raw_tickets_db => $args{raw_tickets_db},
41				_processed_tickets_db => $args{processed_tickets_db},
42				_synset_cache_db => $args{synset_cache_db},
43				_stemmed_tickets_db => $args{stemmed_tickets_db},
44				_inverted_index_db => $args{inverted_index_db},
45				_tickets_vocab_db => $args{tickets_vocab_db},
46				_idf_db => $args{idf_db},
47				_tkt_doc_vecs_db => $args{tkt_doc_vecs_db},
48				_tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db},
49				_clustering_fieldname => $args{clustering_fieldname},
50				_unique_id_fieldname => $args{unique_id_fieldname},
51				_stop_words_file => $args{stop_words_file},
52				_misspelled_words_file => $args{misspelled_words_file},
53				_min_word_length => $args{min_word_length} \|\| 4,
54				_add_synsets_to_tickets => $args{add_synsets_to_tickets} \|\| 0,
55				_want_stemming => $args{want_stemming} \|\| 0,
56				_how_many_retrievals => $args{how_many_retrievals} \|\| 5,
57				_min_idf_threshold => $args{min_idf_threshold},
58				_max_num_syn_words => $args{max_num_syn_words} \|\| 3,
59				_want_synset_caching => $args{want_synset_caching} \|\| 0,
60				_stop_words => {},
61				_all_tickets => [],
62				_column_headers => [],
63				_good_columns => [],
64				_tickets_by_ids => {},
65				_processed_tkts_by_ids => {},
66				_stemmed_tkts_by_ids => {},
67				_misspelled_words => {},
68				_total_num_tickets => 0,
69				_synset_cache => {},
70				_vocab_hash => {},
71				_vocab_idf_hist => {},
72				_idf_t => {},
73				_vocab_size => undef,
74				_doc_vector_template => {},
75				_tkt_doc_vecs => {},
76				_tkt_doc_vecs_normed => {},
77				_query_ticket_id => undef,
78				_inverted_index => {},
79				_debug1 => $args{debug1} \|\| 0, # for processing Excel
80				_debug2 => $args{debug2} \|\| 0, # for modeling tickets
81				_debug3 => $args{debug3} \|\| 0, # for retrieving similar tickets
82				_wn => WordNet::QueryData->new( verbose => 0,
83				noload => 1 ),
84				}, $class;
85				}
86
87				############################# Extract info from Excel #######################
88
89				sub get_tickets_from_excel {
90				my $self = shift;
91				unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
92				unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
93				unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
94				unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
95				unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
96				unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
97				unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};
98				unlink glob "$self->{_tickets_vocab_db}.*";
99				unlink glob "$self->{_idf_db}.*";
100				my $filename = $self->{_excel_filename} \|\| die("Excel file required"),
101				my $clustering_fieldname = $self->{_clustering_fieldname}
102				\|\| die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
103				my $unique_id_fieldname = $self->{_unique_id_fieldname}
104				\|\| die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
105				my $workbook;
106				if ($filename =~ /\.xls$/) {
107				my $parser = Spreadsheet::ParseExcel->new();
108				$workbook = $parser->parse($filename);
109				die $parser->error() unless defined $workbook;
110				} elsif ($filename =~ /\.xlsx$/) {
111				# use Text::Iconv;
112				my $converter = Text::Iconv->new("utf-8", "windows-1251");
113				$workbook = Spreadsheet::XLSX->new($filename, $converter);
114				} else {
115				die "File suffix on the Excel file not recognized";
116				}
117				my @worksheets = $workbook->worksheets();
118				my $which_worksheet = $self->{_which_worksheet} \|\|
119				die "\nYou have not specified which Excel worksheet contains the tickets\n";
120				my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range();
121				my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range();
122				my @good_columns;
123				my $col_headers_row;
124				my $col_headers_found = 0;
125				my $col_index_for_unique_id;
126				my $col_index_for_clustering_field;
127				for my $row ( $row_min .. $row_max ) {
128				last if $col_headers_found;
129				@good_columns = ();
130				for my $col ( $col_min .. $col_max ) {
131				my $cell =
132				$worksheets[$which_worksheet-1]->get_cell( $row, $col );
133				next unless $cell;
134				my $cell_value = _get_rid_of_wide_chars($cell->value());
135				push @good_columns, $col if $cell_value;
136				if ($cell_value eq $unique_id_fieldname) {
137				$col_index_for_unique_id = $col;
138				$col_headers_row = $row;
139				$col_headers_found = 1;
140				}
141				if ($cell_value eq $clustering_fieldname) {
142				$col_index_for_clustering_field = $col;
143				}
144				}
145				}
146				$self->{_good_columns} = \@good_columns;
147				print "\nThe unique id is in column: $col_index_for_unique_id\n"
148				if $self->{_debug1};
149				print "The clustering field is in column: " .
150				"$col_index_for_clustering_field\n\n" if $self->{_debug1};
151				my %Column_Headers;
152				foreach my $field_index (0..@good_columns-1) {
153				my $key = "field_" . $field_index;
154				$Column_Headers{$key} = "";
155				}
156				my @col_headers = map {
157				my $cell =
158				$worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_);
159				$cell ? _get_rid_of_wide_chars($cell->value()) : '';
160				} @good_columns;
161				$self->{_column_headers} = \@col_headers;
162				$self->_display_column_headers() if $self->{_debug1};
163				my $unique_id_field_index_in_good_columns =
164				_find_index_for_given_element( $col_index_for_unique_id, \@good_columns );
165				my $clustering_field_index_in_good_columns =
166				_find_index_for_given_element( $col_index_for_clustering_field,
167				\@good_columns );
168				die "Something is wrong with the info extracted from Excel " .
169				"as the index for the column with unique IDs is not one of " .
170				"good columns\n\n"
171				unless (defined $unique_id_field_index_in_good_columns) &&
172				(defined $clustering_field_index_in_good_columns);
173				for my $row_index ( $col_headers_row+1..$row_max-1) {
174				my @values = map {
175				my $cell =
176				$worksheets[$which_worksheet-1]->get_cell($row_index, $_);
177				$cell ? _get_rid_of_wide_chars($cell->value()) : '';
178				} @good_columns;
179				next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/;
180				next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/;
181				my %onerow;
182				foreach my $field_index (0..@good_columns-1) {
183				my $key = "field_" . $field_index;
184				die "The Columns Headers hash has no field for index " .
185				"$field_index\n "
186				unless exists $col_headers[$field_index];
187				$onerow{$col_headers[$field_index]} = $values[$field_index];
188				}
189				push @{$self->{_all_tickets}}, \%onerow;
190				}
191				my @duplicates_for_id_field = @{$self->_check_unique_id_field()};
192				if (@duplicates_for_id_field > 0) {
193				print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n";
194				die "\n\nYour unique id field for tickets contains duplicate id's";
195				}
196				foreach my $ticket (@{$self->{_all_tickets}}) {
197				$self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
198				lc($ticket->{$clustering_fieldname});
199				}
200				$self->{_total_num_tickets} = scalar @{$self->{_all_tickets}};
201				$self->store_raw_tickets_on_disk();
202				}
203
204				sub _test_excel_for_tickets {
205				my $self = shift;
206				use Text::Iconv;
207				my $converter = Text::Iconv->new("utf-8", "windows-1251");
208				my $filename = $self->{_excel_filename} \|\| die("Excel sheet needed for testing is missing");
209				my $workbook = Spreadsheet::XLSX->new( $filename, $converter );
210				my @worksheets = $workbook->worksheets();
211				my ( $row_min, $row_max ) = $worksheets[0]->row_range();
212				my ( $col_min, $col_max ) = $worksheets[0]->col_range();
213				return ($row_min, $row_max, $col_min, $col_max);
214				}
215
216				sub _display_column_headers {
217				my $self = shift;
218				print "\nThe good columns are: @{$self->{_good_columns}}\n\n";
219				my $overall_header_string = join ' <> ', @{$self->{_column_headers}};
220				print "The column headers are: $overall_header_string\n\n";
221				}
222
223				sub _check_unique_id_field {
224				my $self = shift;
225				my %check_hash;
226				my @duplicates;
227				foreach my $ticket (@{$self->{_all_tickets}}) {
228				if (exists $ticket->{$self->{_unique_id_fieldname}}) {
229				push @duplicates, $ticket->{$self->{_unique_id_fieldname}}
230				if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}};
231				$check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1;
232				}
233				}
234				if ($self->{_debug1}) {
235				my $num_of_tickets = @{$self->{_all_tickets}};
236				my $num_entries_check_hash = keys %check_hash;
237				print "Number of tickets: $num_of_tickets\n";
238				print "Number of keys in check hash: $num_entries_check_hash\n";
239				}
240				return \@duplicates;
241				}
242
243				sub show_original_ticket_for_given_id {
244				my $self = shift;
245				my $id = shift;
246				print "\n\nDisplaying the fields for the ticket $id:\n\n";
247				foreach my $ticket (@{$self->{_all_tickets}}) {
248				if ( $ticket->{$self->{_unique_id_fieldname}} == $id) {
249				foreach my $key (sort keys %{$ticket}) {
250				my $value = $ticket->{$key};
251				$value =~ s/^\s+//;
252				$value =~ s/\s+$//;
253				printf("%20s ==> %s\n", $key, $value);
254				}
255				}
256				}
257				}
258
259				sub show_raw_ticket_clustering_data_for_given_id {
260				my $self = shift;
261				my $ticket_id = shift;
262				my $record = $self->{_tickets_by_ids}->{$ticket_id};
263				print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
264				"$record\n\n";
265				return $record;
266				}
267
268				# Needed by test.t
269				sub _raw_ticket_clustering_data_for_given_id {
270				my $self = shift;
271				my $ticket_id = shift;
272				my $record = $self->{_tickets_by_ids}->{$ticket_id};
273				return $record;
274				}
275
276
277				sub show_processed_ticket_clustering_data_for_given_id {
278				my $self = shift;
279				my $ticket_id = shift;
280				my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
281				print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
282				"$record\n\n";
283				}
284
285				sub show_stemmed_ticket_clustering_data_for_given_id {
286				my $self = shift;
287				my $ticket_id = shift;
288				my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
289				print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
290				"$record\n\n";
291				}
292
293				# The following function is a good diagnostic tool to look into the
294				# array stored in $self->{_all_tickets}. Each element of this array
295				# is a record that represents one row of the Excel file.
296				sub _show_row {
297				my $self = shift;
298				my $row_num = shift;
299				my $total_rows = @{$self->{_all_tickets}};
300				print "There are $total_rows items in the \$all_tickets array\n";
301				die "The row that you want to see does not exist"
302				unless $row_num < $total_rows;
303				my %record = %{$self->{_all_tickets}->[$row_num]};
304				foreach my $field (sort keys %record) {
305				my $value = $record{$field};
306				no warnings;
307				print "$field ==> $value\n";
308				}
309				}
310
311				sub store_raw_tickets_on_disk {
312				my $self = shift;
313				$self->{_raw_tickets_db} = "raw_tickets.db" unless $self->{_raw_tickets_db};
314				unlink $self->{_raw_tickets_db};
315				eval {
316				store( $self->{_all_tickets}, $self->{_raw_tickets_db} );
317				};
318				if ($@) {
319				die "Something went wrong with disk storage of ticket data: $@";
320				}
321				}
322
323				sub restore_raw_tickets_from_disk {
324				my $self = shift;
325				my $clustering_fieldname = $self->{_clustering_fieldname}
326				\|\| die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
327				my $unique_id_fieldname = $self->{_unique_id_fieldname}
328				\|\| die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
329				eval {
330				$self->{_all_tickets} = retrieve( $self->{_raw_tickets_db} );
331				};
332				if ($@) {
333				die "Unable to retrieve raw tickets from disk: $@";
334				}
335				foreach my $ticket (@{$self->{_all_tickets}}) {
336				$self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
337				lc($ticket->{$clustering_fieldname});
338				}
339				$self->{_total_num_tickets} = scalar keys %{$self->{_tickets_by_ids}};
340				}
341
342				sub delete_markup_from_all_tickets {
343				my $self = shift;
344				foreach my $ticket (@{$self->{_all_tickets}}) {
345				$self->_delete_markup_from_one_ticket($ticket->{$self->{_unique_id_fieldname}});
346				}
347				}
348
349				sub _delete_markup_from_one_ticket {
350				my $self = shift;
351				my $ticket_id = shift;
352				my $ticket_strings = $self->{_tickets_by_ids}->{$ticket_id};
353				my @strings = grep $_, split /\s+/, $ticket_strings;
354				my $cleaned_up_strings = join ' ', grep {$_ !~ /^<[^<>]+>$/} @strings;
355				$self->{_tickets_by_ids}->{$ticket_id} = $cleaned_up_strings;
356				foreach my $ticket (@{$self->{_all_tickets}}) {
357				if ( $ticket->{$self->{_unique_id_fieldname}} == $ticket_id ) {
358				$ticket->{$self->{_clustering_fieldname}} = $cleaned_up_strings;
359				last;
360				}
361				}
362				}
363
364				sub apply_filter_to_all_tickets {
365				my $self = shift;
366				my $stop_words_file = $self->{_stop_words_file}
367				\|\| die("\nYou forgot to supply the name of the stop words file in your constructor call\n");
368				my @stop_words = @{_fetch_words_from_file($stop_words_file)};
369				my $misspelled_words_file = $self->{_misspelled_words_file}
370				\|\| die("\nYou forgot to supply the name of the misspelled words file in your constructor call\n");
371				foreach my $word (@stop_words) {
372				$self->{_stop_words}->{$word} = 1;
373				}
374				if ($self->{_misspelled_words_file}) {
375				my @misspelled_word_pairs =
376				@{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
377				foreach my $wordpair (@misspelled_word_pairs) {
378				my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
379				$self->{_misspelled_words}->{$wrong_word} = $good_word;
380				}
381				}
382				my $i = 1;
383				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tickets_by_ids}}) {
384				print "\nApplying filter to ticket $ticket_id ($i out of $self->{_total_num_tickets})\n";
385				$self->_apply_filter_to_one_ticket($ticket_id);
386				$i++;
387				}
388				}
389
390				sub _apply_filter_to_one_ticket {
391				my $self = shift;
392				my $ticket_id = shift;
393
394				unless (keys %{$self->{_stop_words}} > 0) {
395				my @stop_words = @{_fetch_words_from_file($self->{_stop_words_file})};
396				foreach my $word (@stop_words) {
397				$self->{_stop_words}->{$word} = 1;
398				}
399				}
400				unless (keys %{$self->{_misspelled_words}} > 0) {
401				if ($self->{_misspelled_words_file}) {
402				my @misspelled_word_pairs =
403				@{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
404				foreach my $wordpair (@misspelled_word_pairs) {
405				my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
406				$self->{_misspelled_words}->{$wrong_word} = $good_word;
407				}
408				}
409				}
410				my $record = $self->{_tickets_by_ids}->{$ticket_id};
411				my $min = $self->{_min_word_length};
412				my @words = split /\n\|\r\|\"\|\'\|\.\|\,\|\;\|\?\|$\|$\|\[\|\]\|\\\|\/\|\s+\|\&/, $record;
413				my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words;
414				return unless @clean_words;
415				my @new_words;
416				foreach my $word (@words) {
417				$word =~ s/(.+)[.,:!-]$/$1/;
418				unless (($word eq 'no') or ($word eq 'not')) {
419				next if length($word) < $self->{_min_word_length};
420				}
421				if (exists $self->{_misspelled_words}->{lc($word)}) {
422				push @new_words, $self->{_misspelled_words}->{$word};
423				next;
424				}
425				push @new_words, $word unless exists $self->{_stop_words}->{lc($word)};
426				}
427				my $new_record = join ' ', @new_words;
428				$self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
429				}
430
431				sub _get_synonyms_for_word {
432				my $self = shift;
433				my $word = shift;
434				my $no_sense_indicators = 1;
435				my $wn = $self->{_wn};
436				my @parts_of_speech = $wn->querySense("$word");
437
438				my %noun_synonyms;
439				my %verb_synonyms;
440				my %adj_synonyms;
441				my %adv_synonyms;
442				foreach my $pos (@parts_of_speech) {
443				if ($pos =~ /n$/) {
444				my @all_noun_syn_sense_labels = $wn->querySense( $pos, "syns");
445				my $how_many = @all_noun_syn_sense_labels;
446				foreach my $noun_sense (@all_noun_syn_sense_labels) {
447				my @noun_synonyms = $wn->querySense($noun_sense, "syns");
448				my $answer = "";
449				foreach my $noun_syn (@noun_synonyms) {
450				next if $noun_syn eq $noun_sense;
451				$noun_syn =~ s/\#.+$// if $no_sense_indicators;
452				$noun_synonyms{$noun_syn} = 1;
453				$answer .= " $noun_syn ";
454				}
455				}
456				} elsif ($pos =~ /v$/) {
457				my @all_verb_syn_sense_labels = $wn->querySense( $pos, "syns");
458				my $how_many = @all_verb_syn_sense_labels;
459				foreach my $verb_sense (@all_verb_syn_sense_labels) {
460				my @verb_synonyms = $wn->querySense($verb_sense, "syns");
461				my $answer = "";
462				foreach my $verb_syn (@verb_synonyms) {
463				next if $verb_syn eq $verb_sense;
464				$verb_syn =~ s/\#.+$// if $no_sense_indicators;
465				$verb_synonyms{$verb_syn} = 1;
466				$answer .= " $verb_syn ";
467				}
468				}
469				} elsif ($pos =~ /a$/) {
470				my @all_adj_syn_sense_labels = $wn->querySense( $pos, "syns");
471				my $how_many = @all_adj_syn_sense_labels;
472				foreach my $adj_sense (@all_adj_syn_sense_labels) {
473				my @adj_synonyms = $wn->querySense($adj_sense, "syns");
474				my $answer = "";
475				foreach my $adj_syn (@adj_synonyms) {
476				next if $adj_syn eq $adj_sense;
477				$adj_syn =~ s/\#.+$// if $no_sense_indicators;
478				$adj_synonyms{$adj_syn} = 1;
479				$answer .= " $adj_syn ";
480				}
481				}
482				} elsif ($pos =~ /r$/) {
483				my @all_adv_syn_sense_labels = $wn->querySense( $pos, "syns");
484				my $how_many = @all_adv_syn_sense_labels;
485				foreach my $adv_sense (@all_adv_syn_sense_labels) {
486				my @adv_synonyms = $wn->querySense($adv_sense, "syns");
487				my $answer = "";
488				foreach my $adv_syn (@adv_synonyms) {
489				next if $adv_syn eq $adv_sense;
490				$adv_syn =~ s/\#.+$// if $no_sense_indicators;
491				$adv_synonyms{$adv_syn} = 1;
492				$answer .= " $adv_syn ";
493				}
494				}
495				} else {
496				die "\nThe Part of Speech $pos not recognized\n\n";
497				}
498				}
499				my @all_synonyms;
500				my @all_noun_synonyms = keys %noun_synonyms;
501				my @all_verb_synonyms = keys %verb_synonyms;
502				my @all_adj_synonyms = keys %adj_synonyms;
503				my @all_adv_synonyms = keys %adv_synonyms;
504				push @all_synonyms, @all_noun_synonyms if @all_noun_synonyms > 0;
505				push @all_synonyms, @all_verb_synonyms if @all_verb_synonyms > 0;
506				push @all_synonyms, @all_adj_synonyms if @all_adj_synonyms > 0;
507				push @all_synonyms, @all_adv_synonyms if @all_adv_synonyms > 0;
508				my %synonym_set;
509				foreach my $synonym (@all_synonyms) {
510				$synonym_set{$synonym} = 1;
511				}
512				my @synonym_set = sort keys %synonym_set;
513				return \@synonym_set;
514				}
515
516				sub _get_antonyms_for_word {
517				my $self = shift;
518				my $word = shift;
519				my $no_sense_indicators = 1;
520				my $wn = $self->{_wn};
521				my @parts_of_speech = $wn->querySense("$word");
522				my %noun_antonyms;
523				my %verb_antonyms;
524				my %adj_antonyms;
525				my %adv_antonyms;
526				foreach my $pos (@parts_of_speech) {
527				if ($pos =~ /n$/) {
528				my @all_noun_ant_sense_labels = $wn->queryWord( $pos, "ants");
529				my $how_many = @all_noun_ant_sense_labels;
530				foreach my $noun_sense (@all_noun_ant_sense_labels) {
531				my @noun_antonyms = $wn->queryWord($noun_sense, "ants");
532				my $answer = "";
533				foreach my $noun_ant (@noun_antonyms) {
534				next if $noun_ant eq $noun_sense;
535				$noun_ant =~ s/\#.+$// if $no_sense_indicators;
536				$noun_antonyms{$noun_ant} = 1;
537				$answer .= " $noun_ant ";
538				}
539				}
540				} elsif ($pos =~ /v$/) {
541				my @all_verb_ant_sense_labels = $wn->queryWord( $pos, "ants");
542				my $how_many = @all_verb_ant_sense_labels;
543				foreach my $verb_sense (@all_verb_ant_sense_labels) {
544				my @verb_antonyms = $wn->queryWord($verb_sense, "ants");
545				my $answer = "";
546				foreach my $verb_ant (@verb_antonyms) {
547				next if $verb_ant eq $verb_sense;
548				$verb_ant =~ s/\#.+$// if $no_sense_indicators;
549				$verb_antonyms{$verb_ant} = 1;
550				$answer .= " $verb_ant ";
551				}
552				}
553				} elsif ($pos =~ /a$/) {
554				my @all_adj_ant_sense_labels = $wn->queryWord( $pos, "ants");
555				my $how_many = @all_adj_ant_sense_labels;
556				foreach my $adj_sense (@all_adj_ant_sense_labels) {
557				my @adj_antonyms = $wn->queryWord($adj_sense, "ants");
558				my $answer = "";
559				foreach my $adj_ant (@adj_antonyms) {
560				next if $adj_ant eq $adj_sense;
561				$adj_ant =~ s/\#.+$// if $no_sense_indicators;
562				$adj_antonyms{$adj_ant} = 1;
563				$answer .= " $adj_ant ";
564				}
565				}
566				} elsif ($pos =~ /r$/) {
567				my @all_adv_ant_sense_labels = $wn->queryWord( $pos, "ants");
568				my $how_many = @all_adv_ant_sense_labels;
569				foreach my $adv_sense (@all_adv_ant_sense_labels) {
570				my @adv_antonyms = $wn->queryWord($adv_sense, "ants");
571				my $answer = "";
572				foreach my $adv_ant (@adv_antonyms) {
573				next if $adv_ant eq $adv_sense;
574				$adv_ant =~ s/\#.+$// if $no_sense_indicators;
575				$adv_antonyms{$adv_ant} = 1;
576				$answer .= " $adv_ant ";
577				}
578				}
579				} else {
580				die "\nThe Part of Speech $pos not recognized\n\n";
581				}
582				}
583				my @all_antonyms;
584				my @all_noun_antonyms = keys %noun_antonyms;
585				my @all_verb_antonyms = keys %verb_antonyms;
586				my @all_adj_antonyms = keys %adj_antonyms;
587				my @all_adv_antonyms = keys %adv_antonyms;
588				push @all_antonyms, @all_noun_antonyms if @all_noun_antonyms > 0;
589				push @all_antonyms, @all_verb_antonyms if @all_verb_antonyms > 0;
590				push @all_antonyms, @all_adj_antonyms if @all_adj_antonyms > 0;
591				push @all_antonyms, @all_adv_antonyms if @all_adv_antonyms > 0;
592				my %antonym_set;
593				foreach my $antonym (@all_antonyms) {
594				$antonym_set{$antonym} = 1;
595				}
596				my @antonym_set = sort keys %antonym_set;
597				return \@antonym_set;
598				}
599
600				sub expand_all_tickets_with_synonyms {
601				my $self = shift;
602				return unless $self->{_add_synsets_to_tickets};
603				my $num_of_tickets = $self->{_total_num_tickets};
604				if ($self->{_want_synset_caching}) {
605				eval {
606				$self->{_synset_cache} = retrieve( $self->{_synset_cache_db} );
607				} if -s $self->{_synset_cache_db};
608				if ($@) {
609				print "Something went wrong with restoration of synset cache: $@";
610				}
611				}
612				my $i = 1;
613				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
614				$self->_expand_one_ticket_with_synonyms($ticket_id);
615				print "Finished syn expansion of ticket $ticket_id ($i out of $num_of_tickets)\n";
616				$i++;
617				}
618				if ($self->{_want_synset_caching}) {
619				$self->{_synset_cache_db} = "synset_cache.db" unless $self->{_synset_cache_db};
620				eval {
621				store( $self->{_synset_cache}, $self->{_synset_cache_db} );
622				};
623				if ($@) {
624				die "Something went wrong with disk storage of synset cache: $@";
625				}
626				}
627				}
628
629				sub _expand_one_ticket_with_synonyms {
630				my $self = shift;
631				my $ticket_id = shift;
632				print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n"
633				if $self->{_debug2};
634				$self->_replace_negated_words_with_antonyms_one_ticket( $ticket_id );
635				$self->_add_to_words_their_synonyms_one_ticket( $ticket_id );
636				}
637
638				sub _replace_negated_words_with_antonyms_one_ticket {
639				my $self = shift;
640				my $ticket_id = shift;
641				my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
642				my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig;
643				foreach my $word (@words_negated_with_not) {
644				next unless (($word =~ /^\w+$/) &&
645				(length($word) > $self->{_min_word_length}));
646				my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
647				next unless @antonym_words > 0;
648				$#antonym_words = $self->{_max_num_syn_words} - 1
649				if @antonym_words > $self->{_max_num_syn_words};
650				my $antonym_replacement_string = join ' ', @antonym_words;
651				print "Antonym for $word is $antonym_replacement_string\n"
652				if $self->{_debug2};
653				$record =~ s/not\s+$word/$antonym_replacement_string/g;
654				}
655				my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig;
656				foreach my $word (@words_negated_with_no) {
657				next unless (($word =~ /^\w+$/) &&
658				(length($word) > $self->{_min_word_length}));
659				my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
660				next unless @antonym_words > 0;
661				$#antonym_words = $self->{_max_num_syn_words} - 1
662				if @antonym_words > $self->{_max_num_syn_words};
663				my $antonym_replacement_string = join ' ', @antonym_words;
664				print "Antonym for $word is $antonym_replacement_string\n"
665				if $self->{_debug2};
666				$record =~ s/no\s+$word/$antonym_replacement_string/g;
667				}
668				$self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
669				}
670
671				sub _add_to_words_their_synonyms_one_ticket {
672				my $self = shift;
673				my $ticket_id = shift;
674				my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
675				my @words = split /\s+/, $record;
676				my @synonym_bag;
677				foreach my $word (@words) {
678				next if $word eq 'no';
679				next if $word eq 'not';
680				next unless $word =~ /^\w+$/ &&
681				length($word) > $self->{_min_word_length};
682				my @synonym_words;
683				@synonym_words = @{$self->{_synset_cache}->{$word}}
684				if exists $self->{_synset_cache}->{$word};
685				unless (exists $self->{_synset_cache}->{$word}) {
686				@synonym_words = @{$self->_get_synonyms_for_word( $word )};
687				print "syn-set for $word => @synonym_words\n\n"
688				if $self->{_debug2};
689				my $word_root;
690				if (@synonym_words == 0) {
691				if ((length($word) > 4) && ($word =~ /(.+)s$/)) {
692				$word_root = $1;
693				@synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
694				if length($word_root) >= $self->{_min_word_length};
695				} elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) {
696				$word_root = $1;
697				@synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
698				if length($word_root) >= $self->{_min_word_length};
699				}
700				}
701				print "syn-set for word root $word_root => @synonym_words\n\n"
702				if ( $self->{_debug2} && defined $word_root );
703				_fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0;
704				$#synonym_words = $self->{_max_num_syn_words} - 1
705				if @synonym_words > $self->{_max_num_syn_words};
706				print "Retained syn-set for $word => @synonym_words\n\n"
707				if $self->{_debug2};
708				$self->{_synset_cache}->{$word} = \@synonym_words;
709				push @synonym_bag, @synonym_words;
710				}
711				}
712				foreach my $syn_word (@synonym_bag) {
713				push @words, lc($syn_word)
714				unless ((exists $self->{_stop_words}->{$syn_word}) \|\|
715				(length($syn_word) <= $self->{_min_word_length}));
716				}
717				my @sorted_words = sort @words;
718				my $new_record = join ' ', @sorted_words;
719				$self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
720				}
721
722				sub store_processed_tickets_on_disk {
723				my $self = shift;
724				$self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
725				unlink $self->{_processed_tickets_db};
726				eval {
727				store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} );
728				};
729				if ($@) {
730				die "Something went wrong with disk storage of processed tickets: $@";
731				}
732				}
733
734				sub store_stemmed_tickets_and_inverted_index_on_disk {
735				my $self = shift;
736				$self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db};
737				unlink $self->{_stemmed_tickets_db};
738				eval {
739				print "\n\nStoring stemmed tickets on disk\n\n";
740				store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} );
741				};
742				if ($@) {
743				die "Something went wrong with disk storage of stemmed tickets: $@";
744				}
745				$self->{_inverted_index_db} = "inverted_index.db" unless $self->{_inverted_index_db};
746				unlink $self->{_inverted_index_db};
747				eval {
748				print "\nStoring inverted index on disk\n\n";
749				store( $self->{_inverted_index}, $self->{_inverted_index_db} );
750				};
751				if ($@) {
752				die "Something went wrong with disk storage of the inverted index: $@";
753				}
754				}
755
756				sub restore_processed_tickets_from_disk {
757				my $self = shift;
758				eval {
759				$self->{_processed_tkts_by_ids} = retrieve( $self->{_processed_tickets_db} );
760				};
761				if ($@) {
762				die "Something went wrong with restoration of processed tickets: $@";
763				}
764				}
765
766				sub restore_stemmed_tickets_from_disk {
767				my $self = shift;
768				eval {
769				$self->{_stemmed_tkts_by_ids} = retrieve( $self->{_stemmed_tickets_db} );
770				};
771				if ($@) {
772				die "Something went wrong with restoration of stemmed tickets: $@";
773				}
774				}
775
776				#################### Get Ticket Vocabulary and Word Counts #################
777
778				sub get_ticket_vocabulary_and_construct_inverted_index {
779				my $self = shift;
780				my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
781				$self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
782				unlink glob "$self->{_tickets_vocab_db}.*";
783				my %vocab_hist_on_disk;
784				tie %vocab_hist_on_disk, 'SDBM_File',
785				$self->{_tickets_vocab_db}, O_RDWR\|O_CREAT, 0640
786				or die "Can't create DBM files: $!";
787				my %inverted_index;
788				my $min = $self->{_min_word_length};
789				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
790				my %uniques = ();
791				my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
792				my @brokenup = split /\n\|\r\|\"\|\'\|\.\|$\|$\|\[\|\]\|\\\|\/\|\s+/, $record;
793				my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
794				next unless @clean_words;
795				@clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}),
796				@clean_words;
797				map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
798				for (@clean_words) { $uniques{"\L$_"}++ };
799				map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques;
800				map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id }
801				keys %uniques;
802				$self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words;
803				}
804				foreach (keys %vocab_hist_on_disk) {
805				$self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
806				}
807				untie %vocab_hist_on_disk;
808				$self->{_tkt_vocab_done} = 1;
809				$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
810				print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
811				if $self->{_debug2};
812				# Calculate idf(t):
813				$self->{_idf_db} = "idf.db" unless $self->{_idf_db};
814				unlink glob "$self->{_idf_db}.*";
815				tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR\|O_CREAT, 0640
816				or die "Can't create DBM files: $!";
817				foreach (keys %{$self->{_vocab_idf_hist}}) {
818				$idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
819				/
820				(1 + $self->{_vocab_idf_hist}->{$_})))
821				/ log(10) );
822				}
823				foreach (keys %idf_t_on_disk) {
824				$self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
825				}
826				untie %idf_t_on_disk;
827				}
828
829				sub display_tickets_vocab {
830				my $self = shift;
831				die "tickets vocabulary not yet constructed"
832				unless keys %{$self->{_vocab_hist}};
833				print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n";
834				foreach (sort keys %{$self->{_vocab_hist}}){
835				my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
836				print "$outstring\n";
837				}
838				my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
839				print "\nSize of the tickets vocabulary: $vocab_size\n\n";
840				}
841
842				sub display_inverse_document_frequencies {
843				my $self = shift;
844				die "tickets vocabulary not yet constructed"
845				unless keys %{$self->{_vocab_idf_hist}};
846				print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n";
847				foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
848				my $outstring = sprintf("%30s %d",
849				$_, $self->{_vocab_idf_hist}->{$_});
850				print "$outstring\n";
851				}
852				print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n";
853				foreach ( sort keys %{$self->{_idf_t}} ) {
854				my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
855				print "$outstring\n";
856				}
857				}
858
859				# The following subroutine is useful for diagnostic purposes. It
860				# lists the number of tickets that a word appears in and also lists
861				# the tickets. But be careful in interpreting its results. Note
862				# if you invoke this subroutine after the synsets have been added
863				# to the tickets, you may find words being attributed to tickets
864				# that do not actually contain them in the original Excel sheet.
865				sub list_processed_tickets_for_a_word {
866				my $self = shift;
867				while (my $word = ) { #enter ctrl-D to exit the loop
868				chomp $word;
869				my @ticket_list;
870				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
871				my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
872				push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
873				}
874				my $num = @ticket_list;
875				print "\nThe number of processed tickets that mention the word `$word': $num\n\n";
876				print "The processed tickets: @ticket_list\n\n";
877				}
878				}
879
880				sub list_stemmed_tickets_for_a_word {
881				my $self = shift;
882				while (my $word = ) { #enter ctrl-D to exit the loop
883				chomp $word;
884				my @ticket_list;
885				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
886				my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
887				push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
888				}
889				my $num = @ticket_list;
890				print "\nThe number of stemmed tickets that mention the word `$word': $num\n\n";
891				print "The stemmed tickets: @ticket_list\n\n";
892				}
893				}
894
895				############## Generate Document Vectors for Tickets ####################
896
897				sub construct_doc_vectors_for_all_tickets {
898				my $self = shift;
899				foreach ( sort keys %{$self->{_vocab_hist}} ) {
900				$self->{_doc_vector_template}->{$_} = 0;
901				}
902				my $num_of_tickets = keys %{$self->{_stemmed_tkts_by_ids}};
903				my $i = 1;
904				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
905				my $doc_vec_ref =
906				$self->_construct_doc_vector_for_one_ticket($ticket_id);
907				print "Finished constructing two doc vecs for ticket $ticket_id ($i out of $num_of_tickets)\n";
908				$i++;
909				}
910				}
911
912				sub _construct_doc_vector_for_one_ticket {
913				my $self = shift;
914				my $ticket_id = shift;
915				unless (keys %{$self->{_doc_vector_template}}) {
916				foreach ( sort keys %{$self->{_vocab_hist}} ) {
917				$self->{_doc_vector_template}->{$_} = 0;
918				}
919				}
920				my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})};
921				foreach ( sort keys %{$self->{_doc_vector_template}} ) {
922				$doc_vector{$_} = 0;
923				}
924				my $min = $self->{_min_word_length};
925				my $total_words_in_ticket = 0;
926				my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
927				my @clean_words = split /\s+/, $record;
928				map { $doc_vector{"\L$_"}++ }
929				grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
930				die "Something went wrong. Doc vector size unequal to vocab size"
931				unless $self->{_vocab_size} == scalar(keys %doc_vector);
932				foreach (keys %doc_vector) {
933				$total_words_in_ticket += $doc_vector{$_};
934				}
935				my %normalized_doc_vector;
936				foreach (keys %doc_vector) {
937				$normalized_doc_vector{$_} = $doc_vector{$_}
938				*
939				$self->{_idf_t}->{$_}
940				/
941				$total_words_in_ticket;
942				}
943				$self->{_tkt_doc_vecs}->{$ticket_id} = \%doc_vector;
944				$self->{_tkt_doc_vecs_normed}->{$ticket_id} = \%normalized_doc_vector;
945				}
946
947				sub store_ticket_vectors {
948				my $self = shift;
949				die "You have not yet created doc vectors for tickets"
950				unless keys %{$self->{_tkt_doc_vecs}};
951				$self->{_tkt_doc_vecs_db} = "tkt_doc_vecs.db" unless $self->{_tkt_doc_vecs_db};
952				$self->{_tkt_doc_vecs_normed_db} = "tkt_doc_vecs_normed.db"
953				unless $self->{_tkt_doc_vecs_normed_db};
954				unlink $self->{_tkt_doc_vecs_db};
955				unlink $self->{_tkt_doc_vecs_normed_db};
956				print "\nStoring the ticket doc vecs on disk. This could take a while.\n\n";
957				eval {
958				store( $self->{_tkt_doc_vecs}, $self->{_tkt_doc_vecs_db} );
959				};
960				if ($@) {
961				die "Something went wrong with disk storage of ticket doc vectors: $@";
962				}
963				print "\nStoring normalized ticket doc vecs on disk. This could take a while.\n\n";
964				eval {
965				store($self->{_tkt_doc_vecs_normed}, $self->{_tkt_doc_vecs_normed_db});
966				};
967				if ($@) {
968				die "Something wrong with disk storage of normalized doc vecs: $@";
969				}
970				}
971
972				sub restore_ticket_vectors_and_inverted_index {
973				my $self = shift;
974				$self->restore_raw_tickets_from_disk();
975				$self->restore_processed_tickets_from_disk();
976				$self->restore_stemmed_tickets_from_disk();
977				tie my %vocab_hist_on_disk, 'SDBM_File', $self->{_tickets_vocab_db}, O_RDONLY, 0640
978				or die "Can't connect with DBM file: $!";
979				foreach (keys %vocab_hist_on_disk) {
980				$self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
981				}
982				untie %vocab_hist_on_disk;
983				tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDONLY, 0640
984				or die "Can't connect with DBM file: $!";
985				foreach (keys %idf_t_on_disk) {
986				$self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
987				}
988				untie %idf_t_on_disk;
989				eval {
990				$self->{_tkt_doc_vecs} = retrieve( $self->{_tkt_doc_vecs_db} );
991				};
992				if ($@) {
993				print "Something went wrong with retrieval of ticket doc vectors: $@";
994				}
995				eval {
996				$self->{_tkt_doc_vecs_normed} = retrieve( $self->{_tkt_doc_vecs_normed_db} );
997				};
998				if ($@) {
999				print "Something went wrong with retrieval of normed ticket doc vectors: $@";
1000				}
1001				eval {
1002				$self->{_inverted_index} =
1003				retrieve( $self->{_inverted_index_db} );
1004				};
1005				if ($@) {
1006				print "Something went wrong with retrieval of inverted_index: $@";
1007				}
1008				}
1009
1010				sub display_all_doc_vectors {
1011				my $self = shift;
1012				die "Ticket doc vectors not yet constructed"
1013				unless keys %{$self->{_tkt_doc_vecs}};
1014				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs}}) {
1015				$self->_display_a_single_ticket_doc_vector($ticket_id);
1016				}
1017				}
1018
1019				sub _display_a_single_ticket_doc_vector {
1020				my $self = shift;
1021				my $ticket_id = shift;
1022				die "Ticket doc vectors not yet constructed"
1023				unless keys %{$self->{_tkt_doc_vecs}};
1024				print "\n\nDISPLAYING Doc Vec FOR TICKET $ticket_id:\n\n";
1025				foreach ( sort keys %{$self->{_tkt_doc_vecs}->{$ticket_id}} ) {
1026				print "$_ => $self->{_tkt_doc_vecs}->{$ticket_id}->{$_}\n";
1027				}
1028				my $docvec_size = keys %{$self->{_tkt_doc_vecs}->{$ticket_id}};
1029				print "\nSize of vector for ticket $ticket_id: $docvec_size\n";
1030				}
1031
1032				sub display_all_normalized_doc_vectors {
1033				my $self = shift;
1034				die "normalized document vectors not yet constructed"
1035				unless keys %{$self->{_tkt_doc_vecs_normed}};
1036				foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs_normed}}) {
1037				$self->_display_a_single_normalized_doc_vector($ticket_id);
1038				}
1039				}
1040
1041				sub _display_a_single_normalized_doc_vector {
1042				my $self = shift;
1043				my $ticket_id = shift;
1044				die "Normalized ticket doc vectors not yet constructed"
1045				unless keys %{$self->{_tkt_doc_vecs_normed}};
1046				print "\n\nDISPLAYING Normalized Doc Vec FOR TICKET $ticket_id:\n\n";
1047				foreach ( sort keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}} ) {
1048				print "$_ => $self->{_tkt_doc_vecs_normed}->{$ticket_id}->{$_}\n";
1049				}
1050				my $docvec_size = keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}};
1051				print "\nSize of normalized vector for ticket $ticket_id: $docvec_size\n";
1052				}
1053
1054				########################## Display Inverted Index ###########################
1055
1056				sub display_inverted_index {
1057				my $self = shift;
1058				print "\n\nDisplaying inverted index:\n\n";
1059				foreach my $word (sort keys %{$self->{_vocab_hist}}) {
1060				$self->display_inverted_index_for_given_word($word);
1061				}
1062				}
1063
1064				sub display_inverted_index_for_given_word {
1065				my $self = shift;
1066				my $word = shift;
1067				defined $self->{_inverted_index}->{$word} ?
1068				print "$word => @{$self->{_inverted_index}->{$word}}\n" :
1069				die "Something is wrong with your inverted index\n";
1070				}
1071
1072				sub display_inverted_index_for_given_query {
1073				my $self = shift;
1074				my $query_ticket_id = shift;
1075				my $query_record = $self->{_stemmed_tkts_by_ids}->{$query_ticket_id};
1076				my @query_words = grep $_, split /\s+/, $query_record;
1077				foreach my $qword (@query_words) {
1078				my $idf_t = $self->{_idf_t}->{$qword};
1079				my @relevant_tickets = @{$self->{_inverted_index}->{$qword}};
1080				print "\n$qword ($idf_t) ===> @relevant_tickets\n\n";
1081				}
1082				}
1083
1084				############# Retrieve Most Similar Tickets with VSM Model ###################
1085
1086				sub retrieve_similar_tickets_with_vsm {
1087				my $self = shift;
1088				$self->{_query_ticket_id} = shift;
1089				die "\nFirst generate normalized doc vectors for tickets before you can call retrieve with vsm function()\n"
1090				unless scalar(keys %{$self->{_vocab_hist}})
1091				&& scalar(keys %{$self->{_tkt_doc_vecs_normed}});
1092				print "\nCalculating the similarity set for query ticket $self->{_query_ticket_id}\n\n";
1093				my $query_record = $self->{_stemmed_tkts_by_ids}->{$self->{_query_ticket_id}};
1094				my @query_words = grep $_, split /\s+/, $query_record;
1095				my %relevant_tickets_set;
1096				die "\n\nYou did not set a value for the constructor parameter min_idf_threshold -- "
1097				unless $self->{_min_idf_threshold};
1098				foreach my $qword (@query_words) {
1099				map {$relevant_tickets_set{$_} = 1} @{$self->{_inverted_index}->{$qword}}
1100				if $self->{_idf_t}->{$qword} > $self->{_min_idf_threshold};
1101				}
1102				my @relevant_tickets = sort {$a <=> $b} keys %relevant_tickets_set;
1103				print "The relevant tickets for query: @relevant_tickets"
1104				if $self->{_debug3};
1105				my $num_relevant_tkts = @relevant_tickets;
1106				print "\nThe number of tickets relevant to the query: $num_relevant_tkts\n\n";
1107				my %retrievals;
1108				my $rank = 0;
1109				foreach (sort {$self->_doc_vec_comparator} @relevant_tickets ) {
1110				$retrievals{$_} = $self->_similarity_to_query_ticket($_);
1111				$rank++;
1112				last if $rank == $self->{_how_many_retrievals};
1113				}
1114				if ($self->{_debug3}) {
1115				print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
1116				foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
1117				print "$_ => $retrievals{$_}\n";
1118				}
1119				}
1120				return \%retrievals;
1121				}
1122
1123				sub _doc_vec_comparator {
1124				my $self = shift;
1125				my %query_ticket_data_normed =
1126				%{$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}};
1127				my $vec1_hash_ref = $self->{_tkt_doc_vecs_normed}->{$a};
1128				my $vec2_hash_ref = $self->{_tkt_doc_vecs_normed}->{$b};
1129				my @vec1 = ();
1130				my @vec2 = ();
1131				my @qvec = ();
1132				foreach my $word (sort keys %{$self->{_vocab_hist}}) {
1133				push @vec1, $vec1_hash_ref->{$word};
1134				push @vec2, $vec2_hash_ref->{$word};
1135				push @qvec, $query_ticket_data_normed{$word};
1136				}
1137				my $vec1_mag = _vec_magnitude(\@vec1);
1138				my $vec2_mag = _vec_magnitude(\@vec2);
1139				my $qvec_mag = _vec_magnitude(\@qvec);
1140				my $product1 = _vec_scalar_product(\@vec1, \@qvec);
1141				$product1 /= $vec1_mag * $qvec_mag;
1142				my $product2 = _vec_scalar_product(\@vec2, \@qvec);
1143				$product2 /= $vec2_mag * $qvec_mag;
1144				return 1 if $product1 < $product2;
1145				return 0 if $product1 == $product2;
1146				return -1 if $product1 > $product2;
1147				}
1148
1149				sub _similarity_to_query_ticket {
1150				my $self = shift;
1151				my $ticket_id = shift;
1152				my $ticket_data_normed = $self->{_tkt_doc_vecs_normed}->{$ticket_id};
1153				my @vec = ();
1154				my @qvec = ();
1155				foreach my $word (sort keys %$ticket_data_normed) {
1156				push @vec, $ticket_data_normed->{$word};
1157				push @qvec,
1158				$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}->{$word};
1159				}
1160				my $vec_mag = _vec_magnitude(\@vec);
1161				my $qvec_mag = _vec_magnitude(\@qvec);
1162				die "\nThe query ticket appears to be empty\n" if $qvec_mag == 0;
1163				my $product = _vec_scalar_product(\@vec, \@qvec);
1164				$product /= $vec_mag * $qvec_mag;
1165				return $product;
1166				}
1167
1168
1169				######################## Utility Subroutines ##########################
1170
1171				sub _simple_stemmer {
1172				my $word = shift;
1173				my $debug = shift;
1174				print "\nStemming the word: $word\n" if $debug;
1175				$word =~ s/(.*[a-z]t)ted$/$1/i;
1176				$word =~ s/(.*[a-z]t)ting$/$1/i;
1177				$word =~ s/(.*[a-z]l)ling$/$1/i;
1178				$word =~ s/(.*[a-z]g)ging$/$1/i;
1179				$word =~ s/(.*[a-z]ll)ed$/$1/i;
1180				$word =~ s/(.*[a-z][^aeious])s$/$1/i;
1181				$word =~ s/(.*[a-z])ies$/$1y/i;
1182				$word =~ s/(.*[a-z]s)es$/$1/i;
1183				$word =~ s/(.*[a-z][ck])es$/$1e/i;
1184				$word =~ s/(.*[a-z]+)tions$/$1tion/i;
1185				$word =~ s/(.*[a-z]+)mming$/$1m/i;
1186				$word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1187				$word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1188				$word =~ s/(.*[a-z]+)tices$/$1tex/i;
1189				$word =~ s/(.*[a-z]+)pes$/$1pe/i;
1190				$word =~ s/(.*[a-z]+)sed$/$1se/i;
1191				$word =~ s/(.*[a-z]+)ed$/$1/i;
1192				$word =~ s/(.*[a-z]+)tation$/$1t/i;
1193				print "Stemmed word: $word\n\n" if $debug;
1194				return $word;
1195				}
1196
1197				sub _exists {
1198				my $element = shift;
1199				my $array = shift;
1200				my %hash;
1201				for my $item (@$array) {
1202				$hash{$item} = 1;
1203				}
1204				return exists $hash{$element};
1205				}
1206
1207				sub _fetch_words_from_file {
1208				my $file = shift;
1209				my @words;
1210				open( IN, "$file" ) or die "unable to open the file $file: $!";
1211				while () {
1212				next if /^#/;
1213				next if /^[ ]*\r?\n?$/;
1214				$_ =~ s/\r?\n?$//;
1215				my @how_many_in_line = grep $_, split /\s+/, $_;
1216				die "File $file: Exactly one word allowed in each line -- "
1217				unless @how_many_in_line == 1;
1218				push @words, $_;
1219				}
1220				close IN;
1221				return \@words;
1222				}
1223
1224				sub _fetch_word_pairs_from_file {
1225				my $file = shift;
1226				my @word_pairs;
1227				open( IN, "$file" ) or die "unable to open the file $file: $!";
1228				while () {
1229				next if /^#/;
1230				next if /^[ ]*$/;
1231				chomp;
1232				my @how_many_in_line = grep $_, split /\s+/, $_;
1233				die "File: $file --- Exactly two words must be in each non-comment or not-empty line -- "
1234				unless @how_many_in_line == 2;
1235				push @word_pairs, $_;
1236				}
1237				close IN;
1238				return \@word_pairs;
1239				}
1240
1241				sub _get_rid_of_wide_chars {
1242				my $string = shift;
1243				$string =~ s/[^[:ascii:]]+//g;
1244				# $string =~ s/\x{FEFF}//g; to get rid of wide characters
1245				return $string;
1246				}
1247
1248				sub _find_index_for_given_element {
1249				my $ele = shift;
1250				my $array_ref = shift;
1251				foreach my $i (0..@{$array_ref}-1) {
1252				return $i if $ele == $array_ref->[$i];
1253				}
1254				}
1255
1256				sub _check_for_illegal_params {
1257				my @params = @_;
1258				my @legal_params = qw / excel_filename
1259				which_worksheet
1260				raw_tickets_db
1261				processed_tickets_db
1262				stemmed_tickets_db
1263				inverted_index_db
1264				tickets_vocab_db
1265				idf_db
1266				tkt_doc_vecs_db
1267				tkt_doc_vecs_normed_db
1268				synset_cache_db
1269				want_synset_caching
1270				add_synsets_to_tickets
1271				clustering_fieldname
1272				min_word_length
1273				min_idf_threshold
1274				max_num_syn_words
1275				stop_words_file
1276				misspelled_words_file
1277				unique_id_fieldname
1278				want_stemming
1279				how_many_retrievals
1280				debug1
1281				debug2
1282				debug3
1283				/;
1284				my $found_match_flag;
1285				foreach my $param (@params) {
1286
1287				foreach my $legal (@legal_params) {
1288				$found_match_flag = 0;
1289				if ($param eq $legal) {
1290				$found_match_flag = 1;
1291				last;
1292				}
1293				}
1294				last if $found_match_flag == 0;
1295				}
1296				return $found_match_flag;
1297				}
1298
1299				# Meant only for an un-nested hash:
1300				sub _deep_copy_hash {
1301				my $ref_in = shift;
1302				my $ref_out = {};
1303				foreach ( keys %{$ref_in} ) {
1304				$ref_out->{$_} = $ref_in->{$_};
1305				}
1306				return $ref_out;
1307				}
1308
1309				# from perl docs:
1310				sub _fisher_yates_shuffle {
1311				my $arr = shift;
1312				my $i = @$arr;
1313				while (--$i) {
1314				my $j = int rand( $i + 1 );
1315				@$arr[$i, $j] = @$arr[$j, $i];
1316				}
1317				}
1318
1319				sub _vec_scalar_product {
1320				my $vec1 = shift;
1321				my $vec2 = shift;
1322				die "Something is wrong --- the two vectors are of unequal length"
1323				unless @$vec1 == @$vec2;
1324				my $product;
1325				for my $i (0..@$vec1-1) {
1326				$product += $vec1->[$i] * $vec2->[$i];
1327				}
1328				return $product;
1329				}
1330
1331				sub _vec_magnitude {
1332				my $vec = shift;
1333				my $mag_squared = 0;
1334				foreach my $num (@$vec) {
1335				$mag_squared += $num ** 2;
1336				}
1337				return sqrt $mag_squared;
1338				}
1339
1340				1;
1341
1342				__END__