File Coverage

blib/lib/Algorithm/TicketClusterer.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::TicketClusterer;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2012 Avinash Kak. All rights reserved. This program is
5             # free software. You may modify and/or distribute it under the same terms
6             # as Perl itself. This copyright notice must remain attached to the file.
7             #
8             # Algorithm::TicketClusterer is a Perl module for retrieving Excel-stored
9             # past tickets that are most similar to a new ticket. Tickets are commonly
10             # used in software services industry and customer support businesses to
11             # record requests for service, product complaints, user feedback, and so
12             # on.
13             # ---------------------------------------------------------------------------
14              
15 1     1   20042 use 5.10.0;
  1         3  
  1         39  
16 1     1   5 use strict;
  1         2  
  1         26  
17 1     1   4 use warnings;
  1         12  
  1         35  
18 1     1   5 use Carp;
  1         2  
  1         93  
19 1     1   2336 use Storable;
  1         4670  
  1         68  
20 1     1   1441 use Spreadsheet::ParseExcel;
  1         104056  
  1         33  
21 1     1   813 use Spreadsheet::XLSX;
  1         110430  
  1         36  
22 1     1   1991 use WordNet::QueryData;
  0            
  0            
23             use SDBM_File;
24             use Fcntl;
25              
26             our $VERSION = '1.0';
27              
28             ############################### The Constructor #############################
29              
30             sub new {
31             my ($class, %args) = @_;
32             my @params = keys %args;
33             croak "\nYou have used a wrong name for a keyword argument " .
34             "--- perhaps a misspelling\n"
35             if _check_for_illegal_params(@params) == 0;
36             bless {
37             _excel_filename => $args{excel_filename},
38             _which_worksheet => $args{which_worksheet},
39             _raw_tickets_db => $args{raw_tickets_db},
40             _processed_tickets_db => $args{processed_tickets_db},
41             _synset_cache_db => $args{synset_cache_db},
42             _stemmed_tickets_db => $args{stemmed_tickets_db},
43             _inverted_index_db => $args{inverted_index_db},
44             _tickets_vocab_db => $args{tickets_vocab_db},
45             _idf_db => $args{idf_db},
46             _tkt_doc_vecs_db => $args{tkt_doc_vecs_db},
47             _tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db},
48             _clustering_fieldname => $args{clustering_fieldname},
49             _unique_id_fieldname => $args{unique_id_fieldname},
50             _stop_words_file => $args{stop_words_file},
51             _misspelled_words_file => $args{misspelled_words_file},
52             _min_word_length => $args{min_word_length} || 4,
53             _add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0,
54             _want_stemming => $args{want_stemming} || 0,
55             _how_many_retrievals => $args{how_many_retrievals} || 5,
56             _min_idf_threshold => $args{min_idf_threshold},
57             _max_num_syn_words => $args{max_num_syn_words} || 3,
58             _want_synset_caching => $args{want_synset_caching} || 0,
59             _stop_words => {},
60             _all_tickets => [],
61             _column_headers => [],
62             _good_columns => [],
63             _tickets_by_ids => {},
64             _processed_tkts_by_ids => {},
65             _stemmed_tkts_by_ids => {},
66             _misspelled_words => {},
67             _total_num_tickets => 0,
68             _synset_cache => {},
69             _vocab_hash => {},
70             _vocab_idf_hist => {},
71             _idf_t => {},
72             _vocab_size => undef,
73             _doc_vector_template => {},
74             _tkt_doc_vecs => {},
75             _tkt_doc_vecs_normed => {},
76             _query_ticket_id => undef,
77             _inverted_index => {},
78             _debug1 => $args{debug1} || 0, # for processing Excel
79             _debug2 => $args{debug2} || 0, # for modeling tickets
80             _debug3 => $args{debug3} || 0, # for similarity retrieval
81             _wn => WordNet::QueryData->new( verbose => 0,
82             noload => 1 ),
83             }, $class;
84             }
85              
86             ############################# Extract info from Excel #######################
87              
88             sub get_tickets_from_excel {
89             my $self = shift;
90             unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
91             unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
92             unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
93             unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
94             unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
95             unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
96             unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};
97             unlink glob "$self->{_tickets_vocab_db}.*";
98             unlink glob "$self->{_idf_db}.*";
99             my $filename = $self->{_excel_filename} || croak("Excel file required"),
100             my $clustering_fieldname = $self->{_clustering_fieldname}
101             || croak("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
102             my $unique_id_fieldname = $self->{_unique_id_fieldname}
103             || croak("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifer for the rows of your Excel sheet -- ");
104             my $workbook;
105             if ($filename =~ /\.xls$/) {
106             my $parser = Spreadsheet::ParseExcel->new();
107             $workbook = $parser->parse($filename);
108             die $parser->error() unless defined $workbook;
109             } elsif ($filename =~ /\.xlsx$/) {
110             use Text::Iconv;
111             my $converter = Text::Iconv->new("utf-8", "windows-1251");
112             $workbook = Spreadsheet::XLSX->new($filename, $converter);
113             } else {
114             die "File suffix on the Excel file not recognized";
115             }
116             my @worksheets = $workbook->worksheets();
117             my $which_worksheet = $self->{_which_worksheet} ||
118             croak "\nYou have not specified which Excel worksheet contains the tickets\n";
119             my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range();
120             my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range();
121             my @good_columns;
122             my $col_headers_row;
123             my $col_headers_found = 0;
124             my $col_index_for_unique_id;
125             my $col_index_for_clustering_field;
126             for my $row ( $row_min .. $row_max ) {
127             last if $col_headers_found;
128             @good_columns = ();
129             for my $col ( $col_min .. $col_max ) {
130             my $cell =
131             $worksheets[$which_worksheet-1]->get_cell( $row, $col );
132             next unless $cell;
133             my $cell_value = _get_rid_of_wide_chars($cell->value());
134             push @good_columns, $col if $cell_value;
135             if ($cell_value eq $unique_id_fieldname) {
136             $col_index_for_unique_id = $col;
137             $col_headers_row = $row;
138             $col_headers_found = 1;
139             }
140             if ($cell_value eq $clustering_fieldname) {
141             $col_index_for_clustering_field = $col;
142             }
143             }
144             }
145             $self->{_good_columns} = \@good_columns;
146             print "\nThe unique id is in column: $col_index_for_unique_id\n"
147             if $self->{_debug1};
148             print "The clustering field is in column: " .
149             "$col_index_for_clustering_field\n\n" if $self->{_debug1};
150             my %Column_Headers;
151             foreach my $field_index (0..@good_columns-1) {
152             my $key = "field_" . $field_index;
153             $Column_Headers{$key} = "";
154             }
155             my @col_headers = map {
156             my $cell =
157             $worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_);
158             $cell ? _get_rid_of_wide_chars($cell->value()) : '';
159             } @good_columns;
160             $self->{_column_headers} = \@col_headers;
161             $self->_display_column_headers() if $self->{_debug1};
162             my $unique_id_field_index_in_good_columns =
163             _find_index_for_given_element( $col_index_for_unique_id, \@good_columns );
164             my $clustering_field_index_in_good_columns =
165             _find_index_for_given_element( $col_index_for_clustering_field,
166             \@good_columns );
167             die "Something is wrong with the info extracted from Excel " .
168             "as the index for the column with unique IDs is not one of " .
169             "good columns\n\n"
170             unless (defined $unique_id_field_index_in_good_columns) &&
171             (defined $clustering_field_index_in_good_columns);
172             for my $row_index ( $col_headers_row+1..$row_max-1) {
173             my @values = map {
174             my $cell =
175             $worksheets[$which_worksheet-1]->get_cell($row_index, $_);
176             $cell ? _get_rid_of_wide_chars($cell->value()) : '';
177             } @good_columns;
178             next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/;
179             next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/;
180             my %onerow;
181             foreach my $field_index (0..@good_columns-1) {
182             my $key = "field_" . $field_index;
183             die "The Columns Headers hash has no field for index " .
184             "$field_index\n "
185             unless exists $col_headers[$field_index];
186             $onerow{$col_headers[$field_index]} = $values[$field_index];
187             }
188             push @{$self->{_all_tickets}}, \%onerow;
189             }
190             my @duplicates_for_id_field = @{$self->_check_unique_id_field()};
191             if (@duplicates_for_id_field > 0) {
192             print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n";
193             die "\n\nYour unique id field for tickets contains duplicate id's";
194             }
195             foreach my $ticket (@{$self->{_all_tickets}}) {
196             $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
197             lc($ticket->{$clustering_fieldname});
198             }
199             $self->{_total_num_tickets} = scalar @{$self->{_all_tickets}};
200             $self->store_raw_tickets_on_disk();
201             }
202              
203             sub _test_excel_for_tickets {
204             my $self = shift;
205             use Text::Iconv;
206             my $converter = Text::Iconv->new("utf-8", "windows-1251");
207             my $filename = $self->{_excel_filename} || croak("Excel sheed need for testing is missing");
208             my $workbook = Spreadsheet::XLSX->new( $filename, $converter );
209             my @worksheets = $workbook->worksheets();
210             my ( $row_min, $row_max ) = $worksheets[0]->row_range();
211             my ( $col_min, $col_max ) = $worksheets[0]->col_range();
212             return ($row_min, $row_max, $col_min, $col_max);
213             }
214              
215             sub _display_column_headers {
216             my $self = shift;
217             print "\nThe good columns are: @{$self->{_good_columns}}\n\n";
218             my $overall_header_string = join ' <> ', @{$self->{_column_headers}};
219             print "The column headers are: $overall_header_string\n\n";
220             }
221              
222             sub _check_unique_id_field {
223             my $self = shift;
224             my %check_hash;
225             my @duplicates;
226             foreach my $ticket (@{$self->{_all_tickets}}) {
227             if (exists $ticket->{$self->{_unique_id_fieldname}}) {
228             push @duplicates, $ticket->{$self->{_unique_id_fieldname}}
229             if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}};
230             $check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1;
231             }
232             }
233             if ($self->{_debug1}) {
234             my $num_of_tickets = @{$self->{_all_tickets}};
235             my $num_entries_check_hash = keys %check_hash;
236             print "Number of tickets: $num_of_tickets\n";
237             print "Number of keys in check hash: $num_entries_check_hash\n";
238             }
239             return \@duplicates;
240             }
241              
242             sub show_original_ticket_for_given_id {
243             my $self = shift;
244             my $id = shift;
245             print "\n\nDisplaying the fields for the ticket $id:\n\n";
246             foreach my $ticket (@{$self->{_all_tickets}}) {
247             if ( $ticket->{$self->{_unique_id_fieldname}} == $id) {
248             foreach my $key (sort keys %{$ticket}) {
249             my $value = $ticket->{$key};
250             $value =~ s/^\s+//;
251             $value =~ s/\s+$//;
252             printf("%20s ==> %s\n", $key, $value);
253             }
254             }
255             }
256             }
257              
258             sub show_raw_ticket_clustering_data_for_given_id {
259             my $self = shift;
260             my $ticket_id = shift;
261             my $record = $self->{_tickets_by_ids}->{$ticket_id};
262             print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
263             "$record\n\n";
264             return $record;
265             }
266              
267             # Needed by test.t
268             sub _raw_ticket_clustering_data_for_given_id {
269             my $self = shift;
270             my $ticket_id = shift;
271             my $record = $self->{_tickets_by_ids}->{$ticket_id};
272             return $record;
273             }
274              
275              
276             sub show_processed_ticket_clustering_data_for_given_id {
277             my $self = shift;
278             my $ticket_id = shift;
279             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
280             print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
281             "$record\n\n";
282             }
283              
284             sub show_stemmed_ticket_clustering_data_for_given_id {
285             my $self = shift;
286             my $ticket_id = shift;
287             my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
288             print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
289             "$record\n\n";
290             }
291              
292             # The following function is a good diagnostic tool to look into the
293             # array stored in $self->{_all_tickets}. Each element of this array
294             # is a record that represents one row of the Excel file.
295             sub _show_row {
296             my $self = shift;
297             my $row_num = shift;
298             my $total_rows = @{$self->{_all_tickets}};
299             print "There are $total_rows items in the \$all_tickets array\n";
300             die "The row that you want to see does not exist"
301             unless $row_num < $total_rows;
302             my %record = %{$self->{_all_tickets}->[$row_num]};
303             foreach my $field (sort keys %record) {
304             my $value = $record{$field};
305             no warnings;
306             print "$field ==> $value\n";
307             }
308             }
309              
310             sub store_raw_tickets_on_disk {
311             my $self = shift;
312             $self->{_raw_tickets_db} = "raw_tickets.db" unless $self->{_raw_tickets_db};
313             unlink $self->{_raw_tickets_db};
314             eval {
315             store( $self->{_all_tickets}, $self->{_raw_tickets_db} );
316             };
317             if ($@) {
318             die "Something went wrong with disk storage of ticket data: $@";
319             }
320             }
321              
322             sub restore_raw_tickets_from_disk {
323             my $self = shift;
324             my $clustering_fieldname = $self->{_clustering_fieldname}
325             || croak("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
326             my $unique_id_fieldname = $self->{_unique_id_fieldname}
327             || croak("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifer for the rows of your Excel sheet -- ");
328             eval {
329             $self->{_all_tickets} = retrieve( $self->{_raw_tickets_db} );
330             };
331             if ($@) {
332             die "Unable to retrieve raw tickets from disk: $@";
333             }
334             foreach my $ticket (@{$self->{_all_tickets}}) {
335             $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
336             lc($ticket->{$clustering_fieldname});
337             }
338             $self->{_total_num_tickets} = scalar keys %{$self->{_tickets_by_ids}};
339             }
340              
341             sub delete_markup_from_all_tickets {
342             my $self = shift;
343             foreach my $ticket (@{$self->{_all_tickets}}) {
344             $self->_delete_markup_from_one_ticket($ticket->{$self->{_unique_id_fieldname}});
345             }
346             }
347              
348             sub _delete_markup_from_one_ticket {
349             my $self = shift;
350             my $ticket_id = shift;
351             my $ticket_strings = $self->{_tickets_by_ids}->{$ticket_id};
352             my @strings = grep $_, split /\s+/, $ticket_strings;
353             my $cleaned_up_strings = join ' ', grep {$_ !~ /^<[^<>]+>$/} @strings;
354             $self->{_tickets_by_ids}->{$ticket_id} = $cleaned_up_strings;
355             foreach my $ticket (@{$self->{_all_tickets}}) {
356             if ( $ticket->{$self->{_unique_id_fieldname}} == $ticket_id ) {
357             $ticket->{$self->{_clustering_fieldname}} = $cleaned_up_strings;
358             last;
359             }
360             }
361             }
362              
363             sub apply_filter_to_all_tickets {
364             my $self = shift;
365             my $stop_words_file = $self->{_stop_words_file}
366             || croak("\nYou forgot to supply the name of the stop words file in your constructor call\n");
367             my @stop_words = @{_fetch_words_from_file($stop_words_file)};
368             my $misspelled_words_file = $self->{_misspelled_words_file}
369             || croak("\nYou forgot to supply the name of the misspelled words file in your constructor call\n");
370             foreach my $word (@stop_words) {
371             $self->{_stop_words}->{$word} = 1;
372             }
373             if ($self->{_misspelled_words_file}) {
374             my @misspelled_word_pairs =
375             @{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
376             foreach my $wordpair (@misspelled_word_pairs) {
377             my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
378             $self->{_misspelled_words}->{$wrong_word} = $good_word;
379             }
380             }
381             my $i = 1;
382             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tickets_by_ids}}) {
383             print "\nApplying filter to ticket $ticket_id ($i out of $self->{_total_num_tickets})\n";
384             $self->_apply_filter_to_one_ticket($ticket_id);
385             $i++;
386             }
387             }
388              
389             sub _apply_filter_to_one_ticket {
390             my $self = shift;
391             my $ticket_id = shift;
392              
393             unless (keys %{$self->{_stop_words}} > 0) {
394             my @stop_words = @{_fetch_words_from_file($self->{_stop_words_file})};
395             foreach my $word (@stop_words) {
396             $self->{_stop_words}->{$word} = 1;
397             }
398             }
399             unless (keys %{$self->{_misspelled_words}} > 0) {
400             if ($self->{_misspelled_words_file}) {
401             my @misspelled_word_pairs =
402             @{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
403             foreach my $wordpair (@misspelled_word_pairs) {
404             my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
405             $self->{_misspelled_words}->{$wrong_word} = $good_word;
406             }
407             }
408             }
409             my $record = $self->{_tickets_by_ids}->{$ticket_id};
410             my $min = $self->{_min_word_length};
411             my @words = split /\n|\r|\"|\'|\.|\,|\;|\?|\(|\)|\[|\]|\\|\/|\s+|\&/, $record;
412             my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words;
413             return unless @clean_words;
414             my @new_words;
415             foreach my $word (@words) {
416             $word =~ s/(.+)[.,:!-]$/$1/;
417             unless (($word eq 'no') or ($word eq 'not')) {
418             next if length($word) < $self->{_min_word_length};
419             }
420             if (exists $self->{_misspelled_words}->{lc($word)}) {
421             push @new_words, $self->{_misspelled_words}->{$word};
422             next;
423             }
424             push @new_words, $word unless exists $self->{_stop_words}->{lc($word)};
425             }
426             my $new_record = join ' ', @new_words;
427             $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
428             }
429              
430             sub _get_synonyms_for_word {
431             my $self = shift;
432             my $word = shift;
433             my $no_sense_indicators = 1;
434             my $wn = $self->{_wn};
435             my @parts_of_speech = $wn->querySense("$word");
436              
437             my %noun_synonyms;
438             my %verb_synonyms;
439             my %adj_synonyms;
440             my %adv_synonyms;
441             foreach my $pos (@parts_of_speech) {
442             if ($pos =~ /n$/) {
443             my @all_noun_syn_sense_labels = $wn->querySense( $pos, "syns");
444             my $how_many = @all_noun_syn_sense_labels;
445             foreach my $noun_sense (@all_noun_syn_sense_labels) {
446             my @noun_synonyms = $wn->querySense($noun_sense, "syns");
447             my $answer = "";
448             foreach my $noun_syn (@noun_synonyms) {
449             next if $noun_syn eq $noun_sense;
450             $noun_syn =~ s/\#.+$// if $no_sense_indicators;
451             $noun_synonyms{$noun_syn} = 1;
452             $answer .= " $noun_syn ";
453             }
454             }
455             } elsif ($pos =~ /v$/) {
456             my @all_verb_syn_sense_labels = $wn->querySense( $pos, "syns");
457             my $how_many = @all_verb_syn_sense_labels;
458             foreach my $verb_sense (@all_verb_syn_sense_labels) {
459             my @verb_synonyms = $wn->querySense($verb_sense, "syns");
460             my $answer = "";
461             foreach my $verb_syn (@verb_synonyms) {
462             next if $verb_syn eq $verb_sense;
463             $verb_syn =~ s/\#.+$// if $no_sense_indicators;
464             $verb_synonyms{$verb_syn} = 1;
465             $answer .= " $verb_syn ";
466             }
467             }
468             } elsif ($pos =~ /a$/) {
469             my @all_adj_syn_sense_labels = $wn->querySense( $pos, "syns");
470             my $how_many = @all_adj_syn_sense_labels;
471             foreach my $adj_sense (@all_adj_syn_sense_labels) {
472             my @adj_synonyms = $wn->querySense($adj_sense, "syns");
473             my $answer = "";
474             foreach my $adj_syn (@adj_synonyms) {
475             next if $adj_syn eq $adj_sense;
476             $adj_syn =~ s/\#.+$// if $no_sense_indicators;
477             $adj_synonyms{$adj_syn} = 1;
478             $answer .= " $adj_syn ";
479             }
480             }
481             } elsif ($pos =~ /r$/) {
482             my @all_adv_syn_sense_labels = $wn->querySense( $pos, "syns");
483             my $how_many = @all_adv_syn_sense_labels;
484             foreach my $adv_sense (@all_adv_syn_sense_labels) {
485             my @adv_synonyms = $wn->querySense($adv_sense, "syns");
486             my $answer = "";
487             foreach my $adv_syn (@adv_synonyms) {
488             next if $adv_syn eq $adv_sense;
489             $adv_syn =~ s/\#.+$// if $no_sense_indicators;
490             $adv_synonyms{$adv_syn} = 1;
491             $answer .= " $adv_syn ";
492             }
493             }
494             } else {
495             die "\nThe Part of Speech $pos not recognized\n\n";
496             }
497             }
498             my @all_synonyms;
499             my @all_noun_synonyms = keys %noun_synonyms;
500             my @all_verb_synonyms = keys %verb_synonyms;
501             my @all_adj_synonyms = keys %adj_synonyms;
502             my @all_adv_synonyms = keys %adv_synonyms;
503             push @all_synonyms, @all_noun_synonyms if @all_noun_synonyms > 0;
504             push @all_synonyms, @all_verb_synonyms if @all_verb_synonyms > 0;
505             push @all_synonyms, @all_adj_synonyms if @all_adj_synonyms > 0;
506             push @all_synonyms, @all_adv_synonyms if @all_adv_synonyms > 0;
507             my %synonym_set;
508             foreach my $synonym (@all_synonyms) {
509             $synonym_set{$synonym} = 1;
510             }
511             my @synonym_set = sort keys %synonym_set;
512             return \@synonym_set;
513             }
514              
515             sub _get_antonyms_for_word {
516             my $self = shift;
517             my $word = shift;
518             my $no_sense_indicators = 1;
519             my $wn = $self->{_wn};
520             my @parts_of_speech = $wn->querySense("$word");
521             my %noun_antonyms;
522             my %verb_antonyms;
523             my %adj_antonyms;
524             my %adv_antonyms;
525             foreach my $pos (@parts_of_speech) {
526             if ($pos =~ /n$/) {
527             my @all_noun_ant_sense_labels = $wn->queryWord( $pos, "ants");
528             my $how_many = @all_noun_ant_sense_labels;
529             foreach my $noun_sense (@all_noun_ant_sense_labels) {
530             my @noun_antonyms = $wn->queryWord($noun_sense, "ants");
531             my $answer = "";
532             foreach my $noun_ant (@noun_antonyms) {
533             next if $noun_ant eq $noun_sense;
534             $noun_ant =~ s/\#.+$// if $no_sense_indicators;
535             $noun_antonyms{$noun_ant} = 1;
536             $answer .= " $noun_ant ";
537             }
538             }
539             } elsif ($pos =~ /v$/) {
540             my @all_verb_ant_sense_labels = $wn->queryWord( $pos, "ants");
541             my $how_many = @all_verb_ant_sense_labels;
542             foreach my $verb_sense (@all_verb_ant_sense_labels) {
543             my @verb_antonyms = $wn->queryWord($verb_sense, "ants");
544             my $answer = "";
545             foreach my $verb_ant (@verb_antonyms) {
546             next if $verb_ant eq $verb_sense;
547             $verb_ant =~ s/\#.+$// if $no_sense_indicators;
548             $verb_antonyms{$verb_ant} = 1;
549             $answer .= " $verb_ant ";
550             }
551             }
552             } elsif ($pos =~ /a$/) {
553             my @all_adj_ant_sense_labels = $wn->queryWord( $pos, "ants");
554             my $how_many = @all_adj_ant_sense_labels;
555             foreach my $adj_sense (@all_adj_ant_sense_labels) {
556             my @adj_antonyms = $wn->queryWord($adj_sense, "ants");
557             my $answer = "";
558             foreach my $adj_ant (@adj_antonyms) {
559             next if $adj_ant eq $adj_sense;
560             $adj_ant =~ s/\#.+$// if $no_sense_indicators;
561             $adj_antonyms{$adj_ant} = 1;
562             $answer .= " $adj_ant ";
563             }
564             }
565             } elsif ($pos =~ /r$/) {
566             my @all_adv_ant_sense_labels = $wn->queryWord( $pos, "ants");
567             my $how_many = @all_adv_ant_sense_labels;
568             foreach my $adv_sense (@all_adv_ant_sense_labels) {
569             my @adv_antonyms = $wn->queryWord($adv_sense, "ants");
570             my $answer = "";
571             foreach my $adv_ant (@adv_antonyms) {
572             next if $adv_ant eq $adv_sense;
573             $adv_ant =~ s/\#.+$// if $no_sense_indicators;
574             $adv_antonyms{$adv_ant} = 1;
575             $answer .= " $adv_ant ";
576             }
577             }
578             } else {
579             die "\nThe Part of Speech $pos not recognized\n\n";
580             }
581             }
582             my @all_antonyms;
583             my @all_noun_antonyms = keys %noun_antonyms;
584             my @all_verb_antonyms = keys %verb_antonyms;
585             my @all_adj_antonyms = keys %adj_antonyms;
586             my @all_adv_antonyms = keys %adv_antonyms;
587             push @all_antonyms, @all_noun_antonyms if @all_noun_antonyms > 0;
588             push @all_antonyms, @all_verb_antonyms if @all_verb_antonyms > 0;
589             push @all_antonyms, @all_adj_antonyms if @all_adj_antonyms > 0;
590             push @all_antonyms, @all_adv_antonyms if @all_adv_antonyms > 0;
591             my %antonym_set;
592             foreach my $antonym (@all_antonyms) {
593             $antonym_set{$antonym} = 1;
594             }
595             my @antonym_set = sort keys %antonym_set;
596             return \@antonym_set;
597             }
598              
599             sub expand_all_tickets_with_synonyms {
600             my $self = shift;
601             return unless $self->{_add_synsets_to_tickets};
602             my $num_of_tickets = $self->{_total_num_tickets};
603             if ($self->{_want_synset_caching}) {
604             eval {
605             $self->{_synset_cache} = retrieve( $self->{_synset_cache_db} );
606             } if -s $self->{_synset_cache_db};
607             if ($@) {
608             print "Something went wrong with restoration of synset cache: $@";
609             }
610             }
611             my $i = 1;
612             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
613             $self->_expand_one_ticket_with_synonyms($ticket_id);
614             print "Finished syn expansion of ticket $ticket_id ($i out of $num_of_tickets)\n";
615             $i++;
616             }
617             if ($self->{_want_synset_caching}) {
618             $self->{_synset_cache_db} = "synset_cache.db" unless $self->{_synset_cache_db};
619             eval {
620             store( $self->{_synset_cache}, $self->{_synset_cache_db} );
621             };
622             if ($@) {
623             die "Something went wrong with disk storage of synset cache: $@";
624             }
625             }
626             }
627              
628             sub _expand_one_ticket_with_synonyms {
629             my $self = shift;
630             my $ticket_id = shift;
631             print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n"
632             if $self->{_debug2};
633             $self->_replace_negated_words_with_antonyms_one_ticket( $ticket_id );
634             $self->_add_to_words_their_synonyms_one_ticket( $ticket_id );
635             }
636              
637             sub _replace_negated_words_with_antonyms_one_ticket {
638             my $self = shift;
639             my $ticket_id = shift;
640             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
641             my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig;
642             foreach my $word (@words_negated_with_not) {
643             next unless (($word =~ /^\w+$/) &&
644             (length($word) > $self->{_min_word_length}));
645             my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
646             next unless @antonym_words > 0;
647             $#antonym_words = $self->{_max_num_syn_words} - 1
648             if @antonym_words > $self->{_max_num_syn_words};
649             my $antonym_replacement_string = join ' ', @antonym_words;
650             print "Antonym for $word is $antonym_replacement_string\n"
651             if $self->{_debug2};
652             $record =~ s/not\s+$word/$antonym_replacement_string/g;
653             }
654             my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig;
655             foreach my $word (@words_negated_with_no) {
656             next unless (($word =~ /^\w+$/) &&
657             (length($word) > $self->{_min_word_length}));
658             my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
659             next unless @antonym_words > 0;
660             $#antonym_words = $self->{_max_num_syn_words} - 1
661             if @antonym_words > $self->{_max_num_syn_words};
662             my $antonym_replacement_string = join ' ', @antonym_words;
663             print "Antonym for $word is $antonym_replacement_string\n"
664             if $self->{_debug2};
665             $record =~ s/no\s+$word/$antonym_replacement_string/g;
666             }
667             $self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
668             }
669              
670             sub _add_to_words_their_synonyms_one_ticket {
671             my $self = shift;
672             my $ticket_id = shift;
673             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
674             my @words = split /\s+/, $record;
675             my @synonym_bag;
676             foreach my $word (@words) {
677             next if $word eq 'no';
678             next if $word eq 'not';
679             next unless $word =~ /^\w+$/ &&
680             length($word) > $self->{_min_word_length};
681             my @synonym_words;
682             @synonym_words = @{$self->{_synset_cache}->{$word}}
683             if exists $self->{_synset_cache}->{$word};
684             unless (exists $self->{_synset_cache}->{$word}) {
685             @synonym_words = @{$self->_get_synonyms_for_word( $word )};
686             print "syn-set for $word => @synonym_words\n\n"
687             if $self->{_debug2};
688             my $word_root;
689             if (@synonym_words == 0) {
690             if ((length($word) > 4) && ($word =~ /(.+)s$/)) {
691             $word_root = $1;
692             @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
693             if length($word_root) >= $self->{_min_word_length};
694             } elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) {
695             $word_root = $1;
696             @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
697             if length($word_root) >= $self->{_min_word_length};
698             }
699             }
700             print "syn-set for word root $word_root => @synonym_words\n\n"
701             if ( $self->{_debug2} && defined $word_root );
702             _fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0;
703             $#synonym_words = $self->{_max_num_syn_words} - 1
704             if @synonym_words > $self->{_max_num_syn_words};
705             print "Retained syn-set for $word => @synonym_words\n\n"
706             if $self->{_debug2};
707             $self->{_synset_cache}->{$word} = \@synonym_words;
708             push @synonym_bag, @synonym_words;
709             }
710             }
711             foreach my $syn_word (@synonym_bag) {
712             push @words, lc($syn_word)
713             unless ((exists $self->{_stop_words}->{$syn_word}) ||
714             (length($syn_word) <= $self->{_min_word_length}));
715             }
716             my @sorted_words = sort @words;
717             my $new_record = join ' ', @sorted_words;
718             $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
719             }
720              
721             sub store_processed_tickets_on_disk {
722             my $self = shift;
723             $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
724             unlink $self->{_processed_tickets_db};
725             eval {
726             store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} );
727             };
728             if ($@) {
729             die "Something went wrong with disk storage of processed tickets: $@";
730             }
731             }
732              
733             sub store_stemmed_tickets_and_inverted_index_on_disk {
734             my $self = shift;
735             $self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db};
736             unlink $self->{_stemmed_tickets_db};
737             eval {
738             print "\n\nStoring stemmed tickets on disk\n\n";
739             store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} );
740             };
741             if ($@) {
742             die "Something went wrong with disk storage of stemmed tickets: $@";
743             }
744             $self->{_inverted_index_db} = "inverted_index.db" unless $self->{_inverted_index_db};
745             unlink $self->{_inverted_index_db};
746             eval {
747             print "\nStoring inverted index on disk\n\n";
748             store( $self->{_inverted_index}, $self->{_inverted_index_db} );
749             };
750             if ($@) {
751             die "Something went wrong with disk storage of the inverted index: $@";
752             }
753             }
754              
755             sub restore_processed_tickets_from_disk {
756             my $self = shift;
757             eval {
758             $self->{_processed_tkts_by_ids} = retrieve( $self->{_processed_tickets_db} );
759             };
760             if ($@) {
761             die "Something went wrong with restoration of processed tickets: $@";
762             }
763             }
764              
765             sub restore_stemmed_tickets_from_disk {
766             my $self = shift;
767             eval {
768             $self->{_stemmed_tkts_by_ids} = retrieve( $self->{_stemmed_tickets_db} );
769             };
770             if ($@) {
771             die "Something went wrong with restoration of stemmed tickets: $@";
772             }
773             }
774              
775             #################### Get Ticket Vocabulary and Word Counts #################
776              
777             sub get_ticket_vocabulary_and_construct_inverted_index {
778             my $self = shift;
779             my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
780             $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
781             unlink glob "$self->{_tickets_vocab_db}.*";
782             my %vocab_hist_on_disk;
783             tie %vocab_hist_on_disk, 'SDBM_File',
784             $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
785             or die "Can't create DBM files: $!";
786             my %inverted_index;
787             my $min = $self->{_min_word_length};
788             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
789             my %uniques = ();
790             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
791             my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
792             my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
793             next unless @clean_words;
794             @clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}),
795             @clean_words;
796             map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
797             for (@clean_words) { $uniques{"\L$_"}++ };
798             map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques;
799             map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id }
800             keys %uniques;
801             $self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words;
802             }
803             foreach (keys %vocab_hist_on_disk) {
804             $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
805             }
806             untie %vocab_hist_on_disk;
807             $self->{_tkt_vocab_done} = 1;
808             $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
809             print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
810             if $self->{_debug2};
811             # Calculate idf(t):
812             $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
813             unlink glob "$self->{_idf_db}.*";
814             tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
815             or die "Can't create DBM files: $!";
816             foreach (keys %{$self->{_vocab_idf_hist}}) {
817             $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
818             /
819             (1 + $self->{_vocab_idf_hist}->{$_})))
820             / log(10) );
821             }
822             foreach (keys %idf_t_on_disk) {
823             $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
824             }
825             untie %idf_t_on_disk;
826             }
827              
828             sub display_tickets_vocab {
829             my $self = shift;
830             die "tickets vocabulary not yet constructed"
831             unless keys %{$self->{_vocab_hist}};
832             print "\n\nDisplaying tickets vocabulary (the number shown agaist each word is the number of times each word appears in ALL the tickets):\n\n";
833             foreach (sort keys %{$self->{_vocab_hist}}){
834             my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
835             print "$outstring\n";
836             }
837             my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
838             print "\nSize of the tickets vocabulary: $vocab_size\n\n";
839             }
840              
841             sub display_inverse_document_frequencies {
842             my $self = shift;
843             die "tickets vocabulary not yet constructed"
844             unless keys %{$self->{_vocab_idf_hist}};
845             print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n";
846             foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
847             my $outstring = sprintf("%30s %d",
848             $_, $self->{_vocab_idf_hist}->{$_});
849             print "$outstring\n";
850             }
851             print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n";
852             foreach ( sort keys %{$self->{_idf_t}} ) {
853             my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
854             print "$outstring\n";
855             }
856             }
857              
858             # The following subroutine is useful for diagnostic purposes. It
859             # lists the number of tickets that a word appears in and also lists
860             # the tickets. But be careful in interpreting its results. Note
861             # if you invoke this subroutine after the synsets have been added
862             # to the tickets, you may find words being attributed to tickets
863             # that do not actually contain them in the original Excel sheet.
864             sub list_processed_tickets_for_a_word {
865             my $self = shift;
866             while (my $word = ) { #enter ctrl-D to exit the loop
867             chomp $word;
868             my @ticket_list;
869             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
870             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
871             push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
872             }
873             my $num = @ticket_list;
874             print "\nThe number of processed tickets that mention the word `$word': $num\n\n";
875             print "The processed tickets: @ticket_list\n\n";
876             }
877             }
878              
879             sub list_stemmed_tickets_for_a_word {
880             my $self = shift;
881             while (my $word = ) { #enter ctrl-D to exit the loop
882             chomp $word;
883             my @ticket_list;
884             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
885             my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
886             push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
887             }
888             my $num = @ticket_list;
889             print "\nThe number of stemmed tickets that mention the word `$word': $num\n\n";
890             print "The stemmed tickets: @ticket_list\n\n";
891             }
892             }
893              
894             ############## Generate Document Vectors for Tickets ####################
895              
896             sub construct_doc_vectors_for_all_tickets {
897             my $self = shift;
898             foreach ( sort keys %{$self->{_vocab_hist}} ) {
899             $self->{_doc_vector_template}->{$_} = 0;
900             }
901             my $num_of_tickets = keys %{$self->{_stemmed_tkts_by_ids}};
902             my $i = 1;
903             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
904             my $doc_vec_ref =
905             $self->_construct_doc_vector_for_one_ticket($ticket_id);
906             print "Finished constructing two doc vecs for ticket $ticket_id ($i out of $num_of_tickets)\n";
907             $i++;
908             }
909             }
910              
911             sub _construct_doc_vector_for_one_ticket {
912             my $self = shift;
913             my $ticket_id = shift;
914             unless (keys %{$self->{_doc_vector_template}}) {
915             foreach ( sort keys %{$self->{_vocab_hist}} ) {
916             $self->{_doc_vector_template}->{$_} = 0;
917             }
918             }
919             my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})};
920             foreach ( sort keys %{$self->{_doc_vector_template}} ) {
921             $doc_vector{$_} = 0;
922             }
923             my $min = $self->{_min_word_length};
924             my $total_words_in_ticket = 0;
925             my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
926             my @clean_words = split /\s+/, $record;
927             map { $doc_vector{"\L$_"}++ }
928             grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
929             die "Something went wrong. Doc vector size unequal to vocab size"
930             unless $self->{_vocab_size} == scalar(keys %doc_vector);
931             foreach (keys %doc_vector) {
932             $total_words_in_ticket += $doc_vector{$_};
933             }
934             my %normalized_doc_vector;
935             foreach (keys %doc_vector) {
936             $normalized_doc_vector{$_} = $doc_vector{$_}
937             *
938             $self->{_idf_t}->{$_}
939             /
940             $total_words_in_ticket;
941             }
942             $self->{_tkt_doc_vecs}->{$ticket_id} = \%doc_vector;
943             $self->{_tkt_doc_vecs_normed}->{$ticket_id} = \%normalized_doc_vector;
944             }
945              
946             sub store_ticket_vectors {
947             my $self = shift;
948             die "You have not yet created doc vectors for tickets"
949             unless keys %{$self->{_tkt_doc_vecs}};
950             $self->{_tkt_doc_vecs_db} = "tkt_doc_vecs.db" unless $self->{_tkt_doc_vecs_db};
951             $self->{_tkt_doc_vecs_normed_db} = "tkt_doc_vecs_normed.db"
952             unless $self->{_tkt_doc_vecs_normed_db};
953             unlink $self->{_tkt_doc_vecs_db};
954             unlink $self->{_tkt_doc_vecs_normed_db};
955             print "\nStoring the ticket doc vecs on disk. This could take a while.\n\n";
956             eval {
957             store( $self->{_tkt_doc_vecs}, $self->{_tkt_doc_vecs_db} );
958             };
959             if ($@) {
960             die "Something went wrong with disk storage of ticket doc vectors: $@";
961             }
962             print "\nStoring normalized ticket doc vecs on disk. This could take a while.\n\n";
963             eval {
964             store($self->{_tkt_doc_vecs_normed}, $self->{_tkt_doc_vecs_normed_db});
965             };
966             if ($@) {
967             die "Something wrong with disk storage of normalized doc vecs: $@";
968             }
969             }
970              
971             sub restore_ticket_vectors_and_inverted_index {
972             my $self = shift;
973             $self->restore_raw_tickets_from_disk();
974             $self->restore_processed_tickets_from_disk();
975             $self->restore_stemmed_tickets_from_disk();
976             tie my %vocab_hist_on_disk, 'SDBM_File', $self->{_tickets_vocab_db}, O_RDONLY, 0640
977             or die "Can't connect with DBM file: $!";
978             foreach (keys %vocab_hist_on_disk) {
979             $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
980             }
981             untie %vocab_hist_on_disk;
982             tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDONLY, 0640
983             or die "Can't connect with DBM file: $!";
984             foreach (keys %idf_t_on_disk) {
985             $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
986             }
987             untie %idf_t_on_disk;
988             eval {
989             $self->{_tkt_doc_vecs} = retrieve( $self->{_tkt_doc_vecs_db} );
990             };
991             if ($@) {
992             print "Something went wrong with retrieval of ticket doc vectors: $@";
993             }
994             eval {
995             $self->{_tkt_doc_vecs_normed} = retrieve( $self->{_tkt_doc_vecs_normed_db} );
996             };
997             if ($@) {
998             print "Something went wrong with retrieval of normed ticket doc vectors: $@";
999             }
1000             eval {
1001             $self->{_inverted_index} =
1002             retrieve( $self->{_inverted_index_db} );
1003             };
1004             if ($@) {
1005             print "Something went wrong with retrieval of inverted_index: $@";
1006             }
1007             }
1008              
1009             sub display_all_doc_vectors {
1010             my $self = shift;
1011             die "Ticket doc vectors not yet constructed"
1012             unless keys %{$self->{_tkt_doc_vecs}};
1013             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs}}) {
1014             $self->_display_a_single_ticket_doc_vector($ticket_id);
1015             }
1016             }
1017              
1018             sub _display_a_single_ticket_doc_vector {
1019             my $self = shift;
1020             my $ticket_id = shift;
1021             die "Ticket doc vectors not yet constructed"
1022             unless keys %{$self->{_tkt_doc_vecs}};
1023             print "\n\nDISPLAYING Doc Vec FOR TICKET $ticket_id:\n\n";
1024             foreach ( sort keys %{$self->{_tkt_doc_vecs}->{$ticket_id}} ) {
1025             print "$_ => $self->{_tkt_doc_vecs}->{$ticket_id}->{$_}\n";
1026             }
1027             my $docvec_size = keys %{$self->{_tkt_doc_vecs}->{$ticket_id}};
1028             print "\nSize of vector for ticket $ticket_id: $docvec_size\n";
1029             }
1030              
1031             sub display_all_normalized_doc_vectors {
1032             my $self = shift;
1033             die "normalized document vectors not yet constructed"
1034             unless keys %{$self->{_tkt_doc_vecs_normed}};
1035             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs_normed}}) {
1036             $self->_display_a_single_normalized_doc_vector($ticket_id);
1037             }
1038             }
1039              
1040             sub _display_a_single_normalized_doc_vector {
1041             my $self = shift;
1042             my $ticket_id = shift;
1043             die "Normalized ticket doc vectors not yet constructed"
1044             unless keys %{$self->{_tkt_doc_vecs_normed}};
1045             print "\n\nDISPLAYING Normalized Doc Vec FOR TICKET $ticket_id:\n\n";
1046             foreach ( sort keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}} ) {
1047             print "$_ => $self->{_tkt_doc_vecs_normed}->{$ticket_id}->{$_}\n";
1048             }
1049             my $docvec_size = keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}};
1050             print "\nSize of normalized vector for ticket $ticket_id: $docvec_size\n";
1051             }
1052              
1053             ########################## Display Inverted Index ###########################
1054              
1055             sub display_inverted_index {
1056             my $self = shift;
1057             print "\n\nDisplaying inverted index:\n\n";
1058             foreach my $word (sort keys %{$self->{_vocab_hist}}) {
1059             $self->display_inverted_index_for_given_word($word);
1060             }
1061             }
1062              
1063             sub display_inverted_index_for_given_word {
1064             my $self = shift;
1065             my $word = shift;
1066             defined $self->{_inverted_index}->{$word} ?
1067             print "$word => @{$self->{_inverted_index}->{$word}}\n" :
1068             die "Something is wrong with your inverted index\n";
1069             }
1070              
1071             sub display_inverted_index_for_given_query {
1072             my $self = shift;
1073             my $query_ticket_id = shift;
1074             my $query_record = $self->{_stemmed_tkts_by_ids}->{$query_ticket_id};
1075             my @query_words = grep $_, split /\s+/, $query_record;
1076             foreach my $qword (@query_words) {
1077             my $idf_t = $self->{_idf_t}->{$qword};
1078             my @relevant_tickets = @{$self->{_inverted_index}->{$qword}};
1079             print "\n$qword ($idf_t) ===> @relevant_tickets\n\n";
1080             }
1081             }
1082              
1083             ############# Retrieve Most Similar Tickets with VSM Model ###################
1084              
1085             sub retrieve_similar_tickets_with_vsm {
1086             my $self = shift;
1087             $self->{_query_ticket_id} = shift;
1088             die "\nFirst generate normalized doc vectors for ticketsbefore you can call retrieve with vsm function()\n"
1089             unless scalar(keys %{$self->{_vocab_hist}})
1090             && scalar(keys %{$self->{_tkt_doc_vecs_normed}});
1091             print "\nCalculating the similarity set for query ticket $self->{_query_ticket_id}\n\n";
1092             my $query_record = $self->{_stemmed_tkts_by_ids}->{$self->{_query_ticket_id}};
1093             my @query_words = grep $_, split /\s+/, $query_record;
1094             my %relevant_tickets_set;
1095             croak "\n\nYou did not set a value for the constructor parameter min_idf_threshold -- "
1096             unless $self->{_min_idf_threshold};
1097             foreach my $qword (@query_words) {
1098             map {$relevant_tickets_set{$_} = 1} @{$self->{_inverted_index}->{$qword}}
1099             if $self->{_idf_t}->{$qword} > $self->{_min_idf_threshold};
1100             }
1101             my @relevant_tickets = sort {$a <=> $b} keys %relevant_tickets_set;
1102             print "The relevant tickets for query: @relevant_tickets"
1103             if $self->{_debug3};
1104             my $num_relevant_tkts = @relevant_tickets;
1105             print "\nThe number of tickets relevant to the query: $num_relevant_tkts\n\n";
1106             my %retrievals;
1107             my $rank = 0;
1108             foreach (sort {$self->_doc_vec_comparator} @relevant_tickets ) {
1109             $retrievals{$_} = $self->_similarity_to_query_ticket($_);
1110             $rank++;
1111             last if $rank == $self->{_how_many_retrievals};
1112             }
1113             if ($self->{_debug3}) {
1114             print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
1115             foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
1116             print "$_ => $retrievals{$_}\n";
1117             }
1118             }
1119             return \%retrievals;
1120             }
1121              
1122             sub _doc_vec_comparator {
1123             my $self = shift;
1124             my %query_ticket_data_normed =
1125             %{$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}};
1126             my $vec1_hash_ref = $self->{_tkt_doc_vecs_normed}->{$a};
1127             my $vec2_hash_ref = $self->{_tkt_doc_vecs_normed}->{$b};
1128             my @vec1 = ();
1129             my @vec2 = ();
1130             my @qvec = ();
1131             foreach my $word (sort keys %{$self->{_vocab_hist}}) {
1132             push @vec1, $vec1_hash_ref->{$word};
1133             push @vec2, $vec2_hash_ref->{$word};
1134             push @qvec, $query_ticket_data_normed{$word};
1135             }
1136             my $vec1_mag = _vec_magnitude(\@vec1);
1137             my $vec2_mag = _vec_magnitude(\@vec2);
1138             my $qvec_mag = _vec_magnitude(\@qvec);
1139             my $product1 = _vec_scalar_product(\@vec1, \@qvec);
1140             $product1 /= $vec1_mag * $qvec_mag;
1141             my $product2 = _vec_scalar_product(\@vec2, \@qvec);
1142             $product2 /= $vec2_mag * $qvec_mag;
1143             return 1 if $product1 < $product2;
1144             return 0 if $product1 == $product2;
1145             return -1 if $product1 > $product2;
1146             }
1147              
1148             sub _similarity_to_query_ticket {
1149             my $self = shift;
1150             my $ticket_id = shift;
1151             my $ticket_data_normed = $self->{_tkt_doc_vecs_normed}->{$ticket_id};
1152             my @vec = ();
1153             my @qvec = ();
1154             foreach my $word (sort keys %$ticket_data_normed) {
1155             push @vec, $ticket_data_normed->{$word};
1156             push @qvec,
1157             $self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}->{$word};
1158             }
1159             my $vec_mag = _vec_magnitude(\@vec);
1160             my $qvec_mag = _vec_magnitude(\@qvec);
1161             die "\nThe query ticket appears to be empty\n" if $qvec_mag == 0;
1162             my $product = _vec_scalar_product(\@vec, \@qvec);
1163             $product /= $vec_mag * $qvec_mag;
1164             return $product;
1165             }
1166              
1167              
1168             ######################## Utility Subroutimes ##########################
1169              
1170             sub _simple_stemmer {
1171             my $word = shift;
1172             my $debug = shift;
1173             print "\nStemming the word: $word\n" if $debug;
1174             $word =~ s/(.*[a-z]t)ted$/$1/i;
1175             $word =~ s/(.*[a-z]t)ting$/$1/i;
1176             $word =~ s/(.*[a-z]l)ling$/$1/i;
1177             $word =~ s/(.*[a-z]g)ging$/$1/i;
1178             $word =~ s/(.*[a-z]ll)ed$/$1/i;
1179             $word =~ s/(.*[a-z][^aeious])s$/$1/i;
1180             $word =~ s/(.*[a-z])ies$/$1y/i;
1181             $word =~ s/(.*[a-z]s)es$/$1/i;
1182             $word =~ s/(.*[a-z][ck])es$/$1e/i;
1183             $word =~ s/(.*[a-z]+)tions$/$1tion/i;
1184             $word =~ s/(.*[a-z]+)mming$/$1m/i;
1185             $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1186             $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1187             $word =~ s/(.*[a-z]+)tices$/$1tex/i;
1188             $word =~ s/(.*[a-z]+)pes$/$1pe/i;
1189             $word =~ s/(.*[a-z]+)sed$/$1se/i;
1190             $word =~ s/(.*[a-z]+)ed$/$1/i;
1191             $word =~ s/(.*[a-z]+)tation$/$1t/i;
1192             print "Stemmed word: $word\n\n" if $debug;
1193             return $word;
1194             }
1195              
1196             sub _exists {
1197             my $element = shift;
1198             my $array = shift;
1199             my %hash;
1200             for my $item (@$array) {
1201             $hash{$item} = 1;
1202             }
1203             return exists $hash{$element};
1204             }
1205              
1206             sub _fetch_words_from_file {
1207             my $file = shift;
1208             my @words;
1209             open( IN, "$file" ) or die "unable to open the file $file: $!";
1210             while () {
1211             next if /^#/;
1212             next if /^[ ]*$/;
1213             chomp;
1214             my @how_many_in_line = grep $_, split /\s+/, $_;
1215             croak "File $file: Only one word allowed in each line -- "
1216             unless @how_many_in_line == 1;
1217             push @words, $_;
1218             }
1219             close IN;
1220             return \@words;
1221             }
1222              
1223             sub _fetch_word_pairs_from_file {
1224             my $file = shift;
1225             my @word_pairs;
1226             open( IN, "$file" ) or die "unable to open the file $file: $!";
1227             while () {
1228             next if /^#/;
1229             next if /^[ ]*$/;
1230             chomp;
1231             my @how_many_in_line = grep $_, split /\s+/, $_;
1232             croak "File: $file --- Exactly two words must be in each non-comment or not-empty line -- "
1233             unless @how_many_in_line == 2;
1234             push @word_pairs, $_;
1235             }
1236             close IN;
1237             return \@word_pairs;
1238             }
1239              
1240             sub _get_rid_of_wide_chars {
1241             my $string = shift;
1242             $string =~ s/[^[:ascii:]]+//g;
1243             # $string =~ s/\x{FEFF}//g; to get rid of wide characters
1244             return $string;
1245             }
1246              
1247             sub _find_index_for_given_element {
1248             my $ele = shift;
1249             my $array_ref = shift;
1250             foreach my $i (0..@{$array_ref}-1) {
1251             return $i if $ele == $array_ref->[$i];
1252             }
1253             }
1254              
1255             sub _check_for_illegal_params {
1256             my @params = @_;
1257             my @legal_params = qw / excel_filename
1258             which_worksheet
1259             raw_tickets_db
1260             processed_tickets_db
1261             stemmed_tickets_db
1262             inverted_index_db
1263             tickets_vocab_db
1264             idf_db
1265             tkt_doc_vecs_db
1266             tkt_doc_vecs_normed_db
1267             synset_cache_db
1268             want_synset_caching
1269             add_synsets_to_tickets
1270             clustering_fieldname
1271             min_word_length
1272             min_idf_threshold
1273             max_num_syn_words
1274             stop_words_file
1275             misspelled_words_file
1276             unique_id_fieldname
1277             want_stemming
1278             how_many_retrievals
1279             debug1
1280             debug2
1281             debug3
1282             /;
1283             my $found_match_flag;
1284             foreach my $param (@params) {
1285              
1286             foreach my $legal (@legal_params) {
1287             $found_match_flag = 0;
1288             if ($param eq $legal) {
1289             $found_match_flag = 1;
1290             last;
1291             }
1292             }
1293             last if $found_match_flag == 0;
1294             }
1295             return $found_match_flag;
1296             }
1297              
1298             # Meant only for an un-nested hash:
1299             sub _deep_copy_hash {
1300             my $ref_in = shift;
1301             my $ref_out = {};
1302             foreach ( keys %{$ref_in} ) {
1303             $ref_out->{$_} = $ref_in->{$_};
1304             }
1305             return $ref_out;
1306             }
1307              
1308             # from perl docs:
1309             sub _fisher_yates_shuffle {
1310             my $arr = shift;
1311             my $i = @$arr;
1312             while (--$i) {
1313             my $j = int rand( $i + 1 );
1314             @$arr[$i, $j] = @$arr[$j, $i];
1315             }
1316             }
1317              
1318             sub _vec_scalar_product {
1319             my $vec1 = shift;
1320             my $vec2 = shift;
1321             croak "Something is wrong --- the two vectors are of unequal length"
1322             unless @$vec1 == @$vec2;
1323             my $product;
1324             for my $i (0..@$vec1-1) {
1325             $product += $vec1->[$i] * $vec2->[$i];
1326             }
1327             return $product;
1328             }
1329              
1330             sub _vec_magnitude {
1331             my $vec = shift;
1332             my $mag_squared = 0;
1333             foreach my $num (@$vec) {
1334             $mag_squared += $num ** 2;
1335             }
1336             return sqrt $mag_squared;
1337             }
1338              
1339             1;
1340              
1341             __END__