File Coverage

blib/lib/Algorithm/TicketClusterer.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Algorithm::TicketClusterer;
2              
3             #---------------------------------------------------------------------------
4             # Copyright (c) 2014 Avinash Kak. All rights reserved. This program is
5             # free software. You may modify and/or distribute it under the same terms
6             # as Perl itself. This copyright notice must remain attached to the file.
7             #
8             # Algorithm::TicketClusterer is a Perl module for retrieving Excel-stored
9             # past tickets that are most similar to a new ticket. Tickets are commonly
10             # used in software services industry and customer support businesses to
11             # record requests for service, product complaints, user feedback, and so
12             # on.
13             # ---------------------------------------------------------------------------
14              
15 1     1   12990 use 5.10.0;
  1         3  
  1         37  
16 1     1   4 use strict;
  1         1  
  1         22  
17 1     1   3 use warnings;
  1         10  
  1         28  
18 1     1   3 use Carp;
  1         1  
  1         75  
19 1     1   612 use Storable;
  1         2749  
  1         69  
20 1     1   837 use Spreadsheet::ParseExcel;
  1         52964  
  1         33  
21 1     1   536 use Spreadsheet::XLSX;
  1         64852  
  1         33  
22 1     1   1084 use WordNet::QueryData;
  0            
  0            
23             use Text::Iconv;
24             use SDBM_File;
25             use Fcntl;
26              
27             our $VERSION = '1.01';
28              
29             ############################### The Constructor #############################
30              
31             sub new {
32             my ($class, %args) = @_;
33             my @params = keys %args;
34             croak "\nYou have used a wrong name for a keyword argument " .
35             "--- perhaps a misspelling\n"
36             if _check_for_illegal_params(@params) == 0;
37             bless {
38             _excel_filename => $args{excel_filename},
39             _which_worksheet => $args{which_worksheet},
40             _raw_tickets_db => $args{raw_tickets_db},
41             _processed_tickets_db => $args{processed_tickets_db},
42             _synset_cache_db => $args{synset_cache_db},
43             _stemmed_tickets_db => $args{stemmed_tickets_db},
44             _inverted_index_db => $args{inverted_index_db},
45             _tickets_vocab_db => $args{tickets_vocab_db},
46             _idf_db => $args{idf_db},
47             _tkt_doc_vecs_db => $args{tkt_doc_vecs_db},
48             _tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db},
49             _clustering_fieldname => $args{clustering_fieldname},
50             _unique_id_fieldname => $args{unique_id_fieldname},
51             _stop_words_file => $args{stop_words_file},
52             _misspelled_words_file => $args{misspelled_words_file},
53             _min_word_length => $args{min_word_length} || 4,
54             _add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0,
55             _want_stemming => $args{want_stemming} || 0,
56             _how_many_retrievals => $args{how_many_retrievals} || 5,
57             _min_idf_threshold => $args{min_idf_threshold},
58             _max_num_syn_words => $args{max_num_syn_words} || 3,
59             _want_synset_caching => $args{want_synset_caching} || 0,
60             _stop_words => {},
61             _all_tickets => [],
62             _column_headers => [],
63             _good_columns => [],
64             _tickets_by_ids => {},
65             _processed_tkts_by_ids => {},
66             _stemmed_tkts_by_ids => {},
67             _misspelled_words => {},
68             _total_num_tickets => 0,
69             _synset_cache => {},
70             _vocab_hash => {},
71             _vocab_idf_hist => {},
72             _idf_t => {},
73             _vocab_size => undef,
74             _doc_vector_template => {},
75             _tkt_doc_vecs => {},
76             _tkt_doc_vecs_normed => {},
77             _query_ticket_id => undef,
78             _inverted_index => {},
79             _debug1 => $args{debug1} || 0, # for processing Excel
80             _debug2 => $args{debug2} || 0, # for modeling tickets
81             _debug3 => $args{debug3} || 0, # for retrieving similar tickets
82             _wn => WordNet::QueryData->new( verbose => 0,
83             noload => 1 ),
84             }, $class;
85             }
86              
87             ############################# Extract info from Excel #######################
88              
89             sub get_tickets_from_excel {
90             my $self = shift;
91             unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
92             unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
93             unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
94             unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
95             unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
96             unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
97             unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};
98             unlink glob "$self->{_tickets_vocab_db}.*";
99             unlink glob "$self->{_idf_db}.*";
100             my $filename = $self->{_excel_filename} || die("Excel file required"),
101             my $clustering_fieldname = $self->{_clustering_fieldname}
102             || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
103             my $unique_id_fieldname = $self->{_unique_id_fieldname}
104             || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
105             my $workbook;
106             if ($filename =~ /\.xls$/) {
107             my $parser = Spreadsheet::ParseExcel->new();
108             $workbook = $parser->parse($filename);
109             die $parser->error() unless defined $workbook;
110             } elsif ($filename =~ /\.xlsx$/) {
111             # use Text::Iconv;
112             my $converter = Text::Iconv->new("utf-8", "windows-1251");
113             $workbook = Spreadsheet::XLSX->new($filename, $converter);
114             } else {
115             die "File suffix on the Excel file not recognized";
116             }
117             my @worksheets = $workbook->worksheets();
118             my $which_worksheet = $self->{_which_worksheet} ||
119             die "\nYou have not specified which Excel worksheet contains the tickets\n";
120             my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range();
121             my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range();
122             my @good_columns;
123             my $col_headers_row;
124             my $col_headers_found = 0;
125             my $col_index_for_unique_id;
126             my $col_index_for_clustering_field;
127             for my $row ( $row_min .. $row_max ) {
128             last if $col_headers_found;
129             @good_columns = ();
130             for my $col ( $col_min .. $col_max ) {
131             my $cell =
132             $worksheets[$which_worksheet-1]->get_cell( $row, $col );
133             next unless $cell;
134             my $cell_value = _get_rid_of_wide_chars($cell->value());
135             push @good_columns, $col if $cell_value;
136             if ($cell_value eq $unique_id_fieldname) {
137             $col_index_for_unique_id = $col;
138             $col_headers_row = $row;
139             $col_headers_found = 1;
140             }
141             if ($cell_value eq $clustering_fieldname) {
142             $col_index_for_clustering_field = $col;
143             }
144             }
145             }
146             $self->{_good_columns} = \@good_columns;
147             print "\nThe unique id is in column: $col_index_for_unique_id\n"
148             if $self->{_debug1};
149             print "The clustering field is in column: " .
150             "$col_index_for_clustering_field\n\n" if $self->{_debug1};
151             my %Column_Headers;
152             foreach my $field_index (0..@good_columns-1) {
153             my $key = "field_" . $field_index;
154             $Column_Headers{$key} = "";
155             }
156             my @col_headers = map {
157             my $cell =
158             $worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_);
159             $cell ? _get_rid_of_wide_chars($cell->value()) : '';
160             } @good_columns;
161             $self->{_column_headers} = \@col_headers;
162             $self->_display_column_headers() if $self->{_debug1};
163             my $unique_id_field_index_in_good_columns =
164             _find_index_for_given_element( $col_index_for_unique_id, \@good_columns );
165             my $clustering_field_index_in_good_columns =
166             _find_index_for_given_element( $col_index_for_clustering_field,
167             \@good_columns );
168             die "Something is wrong with the info extracted from Excel " .
169             "as the index for the column with unique IDs is not one of " .
170             "good columns\n\n"
171             unless (defined $unique_id_field_index_in_good_columns) &&
172             (defined $clustering_field_index_in_good_columns);
173             for my $row_index ( $col_headers_row+1..$row_max-1) {
174             my @values = map {
175             my $cell =
176             $worksheets[$which_worksheet-1]->get_cell($row_index, $_);
177             $cell ? _get_rid_of_wide_chars($cell->value()) : '';
178             } @good_columns;
179             next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/;
180             next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/;
181             my %onerow;
182             foreach my $field_index (0..@good_columns-1) {
183             my $key = "field_" . $field_index;
184             die "The Columns Headers hash has no field for index " .
185             "$field_index\n "
186             unless exists $col_headers[$field_index];
187             $onerow{$col_headers[$field_index]} = $values[$field_index];
188             }
189             push @{$self->{_all_tickets}}, \%onerow;
190             }
191             my @duplicates_for_id_field = @{$self->_check_unique_id_field()};
192             if (@duplicates_for_id_field > 0) {
193             print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n";
194             die "\n\nYour unique id field for tickets contains duplicate id's";
195             }
196             foreach my $ticket (@{$self->{_all_tickets}}) {
197             $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
198             lc($ticket->{$clustering_fieldname});
199             }
200             $self->{_total_num_tickets} = scalar @{$self->{_all_tickets}};
201             $self->store_raw_tickets_on_disk();
202             }
203              
204             sub _test_excel_for_tickets {
205             my $self = shift;
206             use Text::Iconv;
207             my $converter = Text::Iconv->new("utf-8", "windows-1251");
208             my $filename = $self->{_excel_filename} || die("Excel sheet needed for testing is missing");
209             my $workbook = Spreadsheet::XLSX->new( $filename, $converter );
210             my @worksheets = $workbook->worksheets();
211             my ( $row_min, $row_max ) = $worksheets[0]->row_range();
212             my ( $col_min, $col_max ) = $worksheets[0]->col_range();
213             return ($row_min, $row_max, $col_min, $col_max);
214             }
215              
216             sub _display_column_headers {
217             my $self = shift;
218             print "\nThe good columns are: @{$self->{_good_columns}}\n\n";
219             my $overall_header_string = join ' <> ', @{$self->{_column_headers}};
220             print "The column headers are: $overall_header_string\n\n";
221             }
222              
223             sub _check_unique_id_field {
224             my $self = shift;
225             my %check_hash;
226             my @duplicates;
227             foreach my $ticket (@{$self->{_all_tickets}}) {
228             if (exists $ticket->{$self->{_unique_id_fieldname}}) {
229             push @duplicates, $ticket->{$self->{_unique_id_fieldname}}
230             if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}};
231             $check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1;
232             }
233             }
234             if ($self->{_debug1}) {
235             my $num_of_tickets = @{$self->{_all_tickets}};
236             my $num_entries_check_hash = keys %check_hash;
237             print "Number of tickets: $num_of_tickets\n";
238             print "Number of keys in check hash: $num_entries_check_hash\n";
239             }
240             return \@duplicates;
241             }
242              
243             sub show_original_ticket_for_given_id {
244             my $self = shift;
245             my $id = shift;
246             print "\n\nDisplaying the fields for the ticket $id:\n\n";
247             foreach my $ticket (@{$self->{_all_tickets}}) {
248             if ( $ticket->{$self->{_unique_id_fieldname}} == $id) {
249             foreach my $key (sort keys %{$ticket}) {
250             my $value = $ticket->{$key};
251             $value =~ s/^\s+//;
252             $value =~ s/\s+$//;
253             printf("%20s ==> %s\n", $key, $value);
254             }
255             }
256             }
257             }
258              
259             sub show_raw_ticket_clustering_data_for_given_id {
260             my $self = shift;
261             my $ticket_id = shift;
262             my $record = $self->{_tickets_by_ids}->{$ticket_id};
263             print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
264             "$record\n\n";
265             return $record;
266             }
267              
268             # Needed by test.t
269             sub _raw_ticket_clustering_data_for_given_id {
270             my $self = shift;
271             my $ticket_id = shift;
272             my $record = $self->{_tickets_by_ids}->{$ticket_id};
273             return $record;
274             }
275              
276              
277             sub show_processed_ticket_clustering_data_for_given_id {
278             my $self = shift;
279             my $ticket_id = shift;
280             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
281             print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
282             "$record\n\n";
283             }
284              
285             sub show_stemmed_ticket_clustering_data_for_given_id {
286             my $self = shift;
287             my $ticket_id = shift;
288             my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
289             print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
290             "$record\n\n";
291             }
292              
293             # The following function is a good diagnostic tool to look into the
294             # array stored in $self->{_all_tickets}. Each element of this array
295             # is a record that represents one row of the Excel file.
296             sub _show_row {
297             my $self = shift;
298             my $row_num = shift;
299             my $total_rows = @{$self->{_all_tickets}};
300             print "There are $total_rows items in the \$all_tickets array\n";
301             die "The row that you want to see does not exist"
302             unless $row_num < $total_rows;
303             my %record = %{$self->{_all_tickets}->[$row_num]};
304             foreach my $field (sort keys %record) {
305             my $value = $record{$field};
306             no warnings;
307             print "$field ==> $value\n";
308             }
309             }
310              
311             sub store_raw_tickets_on_disk {
312             my $self = shift;
313             $self->{_raw_tickets_db} = "raw_tickets.db" unless $self->{_raw_tickets_db};
314             unlink $self->{_raw_tickets_db};
315             eval {
316             store( $self->{_all_tickets}, $self->{_raw_tickets_db} );
317             };
318             if ($@) {
319             die "Something went wrong with disk storage of ticket data: $@";
320             }
321             }
322              
323             sub restore_raw_tickets_from_disk {
324             my $self = shift;
325             my $clustering_fieldname = $self->{_clustering_fieldname}
326             || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
327             my $unique_id_fieldname = $self->{_unique_id_fieldname}
328             || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
329             eval {
330             $self->{_all_tickets} = retrieve( $self->{_raw_tickets_db} );
331             };
332             if ($@) {
333             die "Unable to retrieve raw tickets from disk: $@";
334             }
335             foreach my $ticket (@{$self->{_all_tickets}}) {
336             $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
337             lc($ticket->{$clustering_fieldname});
338             }
339             $self->{_total_num_tickets} = scalar keys %{$self->{_tickets_by_ids}};
340             }
341              
342             sub delete_markup_from_all_tickets {
343             my $self = shift;
344             foreach my $ticket (@{$self->{_all_tickets}}) {
345             $self->_delete_markup_from_one_ticket($ticket->{$self->{_unique_id_fieldname}});
346             }
347             }
348              
349             sub _delete_markup_from_one_ticket {
350             my $self = shift;
351             my $ticket_id = shift;
352             my $ticket_strings = $self->{_tickets_by_ids}->{$ticket_id};
353             my @strings = grep $_, split /\s+/, $ticket_strings;
354             my $cleaned_up_strings = join ' ', grep {$_ !~ /^<[^<>]+>$/} @strings;
355             $self->{_tickets_by_ids}->{$ticket_id} = $cleaned_up_strings;
356             foreach my $ticket (@{$self->{_all_tickets}}) {
357             if ( $ticket->{$self->{_unique_id_fieldname}} == $ticket_id ) {
358             $ticket->{$self->{_clustering_fieldname}} = $cleaned_up_strings;
359             last;
360             }
361             }
362             }
363              
364             sub apply_filter_to_all_tickets {
365             my $self = shift;
366             my $stop_words_file = $self->{_stop_words_file}
367             || die("\nYou forgot to supply the name of the stop words file in your constructor call\n");
368             my @stop_words = @{_fetch_words_from_file($stop_words_file)};
369             my $misspelled_words_file = $self->{_misspelled_words_file}
370             || die("\nYou forgot to supply the name of the misspelled words file in your constructor call\n");
371             foreach my $word (@stop_words) {
372             $self->{_stop_words}->{$word} = 1;
373             }
374             if ($self->{_misspelled_words_file}) {
375             my @misspelled_word_pairs =
376             @{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
377             foreach my $wordpair (@misspelled_word_pairs) {
378             my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
379             $self->{_misspelled_words}->{$wrong_word} = $good_word;
380             }
381             }
382             my $i = 1;
383             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tickets_by_ids}}) {
384             print "\nApplying filter to ticket $ticket_id ($i out of $self->{_total_num_tickets})\n";
385             $self->_apply_filter_to_one_ticket($ticket_id);
386             $i++;
387             }
388             }
389              
390             sub _apply_filter_to_one_ticket {
391             my $self = shift;
392             my $ticket_id = shift;
393              
394             unless (keys %{$self->{_stop_words}} > 0) {
395             my @stop_words = @{_fetch_words_from_file($self->{_stop_words_file})};
396             foreach my $word (@stop_words) {
397             $self->{_stop_words}->{$word} = 1;
398             }
399             }
400             unless (keys %{$self->{_misspelled_words}} > 0) {
401             if ($self->{_misspelled_words_file}) {
402             my @misspelled_word_pairs =
403             @{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
404             foreach my $wordpair (@misspelled_word_pairs) {
405             my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
406             $self->{_misspelled_words}->{$wrong_word} = $good_word;
407             }
408             }
409             }
410             my $record = $self->{_tickets_by_ids}->{$ticket_id};
411             my $min = $self->{_min_word_length};
412             my @words = split /\n|\r|\"|\'|\.|\,|\;|\?|\(|\)|\[|\]|\\|\/|\s+|\&/, $record;
413             my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words;
414             return unless @clean_words;
415             my @new_words;
416             foreach my $word (@words) {
417             $word =~ s/(.+)[.,:!-]$/$1/;
418             unless (($word eq 'no') or ($word eq 'not')) {
419             next if length($word) < $self->{_min_word_length};
420             }
421             if (exists $self->{_misspelled_words}->{lc($word)}) {
422             push @new_words, $self->{_misspelled_words}->{$word};
423             next;
424             }
425             push @new_words, $word unless exists $self->{_stop_words}->{lc($word)};
426             }
427             my $new_record = join ' ', @new_words;
428             $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
429             }
430              
431             sub _get_synonyms_for_word {
432             my $self = shift;
433             my $word = shift;
434             my $no_sense_indicators = 1;
435             my $wn = $self->{_wn};
436             my @parts_of_speech = $wn->querySense("$word");
437              
438             my %noun_synonyms;
439             my %verb_synonyms;
440             my %adj_synonyms;
441             my %adv_synonyms;
442             foreach my $pos (@parts_of_speech) {
443             if ($pos =~ /n$/) {
444             my @all_noun_syn_sense_labels = $wn->querySense( $pos, "syns");
445             my $how_many = @all_noun_syn_sense_labels;
446             foreach my $noun_sense (@all_noun_syn_sense_labels) {
447             my @noun_synonyms = $wn->querySense($noun_sense, "syns");
448             my $answer = "";
449             foreach my $noun_syn (@noun_synonyms) {
450             next if $noun_syn eq $noun_sense;
451             $noun_syn =~ s/\#.+$// if $no_sense_indicators;
452             $noun_synonyms{$noun_syn} = 1;
453             $answer .= " $noun_syn ";
454             }
455             }
456             } elsif ($pos =~ /v$/) {
457             my @all_verb_syn_sense_labels = $wn->querySense( $pos, "syns");
458             my $how_many = @all_verb_syn_sense_labels;
459             foreach my $verb_sense (@all_verb_syn_sense_labels) {
460             my @verb_synonyms = $wn->querySense($verb_sense, "syns");
461             my $answer = "";
462             foreach my $verb_syn (@verb_synonyms) {
463             next if $verb_syn eq $verb_sense;
464             $verb_syn =~ s/\#.+$// if $no_sense_indicators;
465             $verb_synonyms{$verb_syn} = 1;
466             $answer .= " $verb_syn ";
467             }
468             }
469             } elsif ($pos =~ /a$/) {
470             my @all_adj_syn_sense_labels = $wn->querySense( $pos, "syns");
471             my $how_many = @all_adj_syn_sense_labels;
472             foreach my $adj_sense (@all_adj_syn_sense_labels) {
473             my @adj_synonyms = $wn->querySense($adj_sense, "syns");
474             my $answer = "";
475             foreach my $adj_syn (@adj_synonyms) {
476             next if $adj_syn eq $adj_sense;
477             $adj_syn =~ s/\#.+$// if $no_sense_indicators;
478             $adj_synonyms{$adj_syn} = 1;
479             $answer .= " $adj_syn ";
480             }
481             }
482             } elsif ($pos =~ /r$/) {
483             my @all_adv_syn_sense_labels = $wn->querySense( $pos, "syns");
484             my $how_many = @all_adv_syn_sense_labels;
485             foreach my $adv_sense (@all_adv_syn_sense_labels) {
486             my @adv_synonyms = $wn->querySense($adv_sense, "syns");
487             my $answer = "";
488             foreach my $adv_syn (@adv_synonyms) {
489             next if $adv_syn eq $adv_sense;
490             $adv_syn =~ s/\#.+$// if $no_sense_indicators;
491             $adv_synonyms{$adv_syn} = 1;
492             $answer .= " $adv_syn ";
493             }
494             }
495             } else {
496             die "\nThe Part of Speech $pos not recognized\n\n";
497             }
498             }
499             my @all_synonyms;
500             my @all_noun_synonyms = keys %noun_synonyms;
501             my @all_verb_synonyms = keys %verb_synonyms;
502             my @all_adj_synonyms = keys %adj_synonyms;
503             my @all_adv_synonyms = keys %adv_synonyms;
504             push @all_synonyms, @all_noun_synonyms if @all_noun_synonyms > 0;
505             push @all_synonyms, @all_verb_synonyms if @all_verb_synonyms > 0;
506             push @all_synonyms, @all_adj_synonyms if @all_adj_synonyms > 0;
507             push @all_synonyms, @all_adv_synonyms if @all_adv_synonyms > 0;
508             my %synonym_set;
509             foreach my $synonym (@all_synonyms) {
510             $synonym_set{$synonym} = 1;
511             }
512             my @synonym_set = sort keys %synonym_set;
513             return \@synonym_set;
514             }
515              
516             sub _get_antonyms_for_word {
517             my $self = shift;
518             my $word = shift;
519             my $no_sense_indicators = 1;
520             my $wn = $self->{_wn};
521             my @parts_of_speech = $wn->querySense("$word");
522             my %noun_antonyms;
523             my %verb_antonyms;
524             my %adj_antonyms;
525             my %adv_antonyms;
526             foreach my $pos (@parts_of_speech) {
527             if ($pos =~ /n$/) {
528             my @all_noun_ant_sense_labels = $wn->queryWord( $pos, "ants");
529             my $how_many = @all_noun_ant_sense_labels;
530             foreach my $noun_sense (@all_noun_ant_sense_labels) {
531             my @noun_antonyms = $wn->queryWord($noun_sense, "ants");
532             my $answer = "";
533             foreach my $noun_ant (@noun_antonyms) {
534             next if $noun_ant eq $noun_sense;
535             $noun_ant =~ s/\#.+$// if $no_sense_indicators;
536             $noun_antonyms{$noun_ant} = 1;
537             $answer .= " $noun_ant ";
538             }
539             }
540             } elsif ($pos =~ /v$/) {
541             my @all_verb_ant_sense_labels = $wn->queryWord( $pos, "ants");
542             my $how_many = @all_verb_ant_sense_labels;
543             foreach my $verb_sense (@all_verb_ant_sense_labels) {
544             my @verb_antonyms = $wn->queryWord($verb_sense, "ants");
545             my $answer = "";
546             foreach my $verb_ant (@verb_antonyms) {
547             next if $verb_ant eq $verb_sense;
548             $verb_ant =~ s/\#.+$// if $no_sense_indicators;
549             $verb_antonyms{$verb_ant} = 1;
550             $answer .= " $verb_ant ";
551             }
552             }
553             } elsif ($pos =~ /a$/) {
554             my @all_adj_ant_sense_labels = $wn->queryWord( $pos, "ants");
555             my $how_many = @all_adj_ant_sense_labels;
556             foreach my $adj_sense (@all_adj_ant_sense_labels) {
557             my @adj_antonyms = $wn->queryWord($adj_sense, "ants");
558             my $answer = "";
559             foreach my $adj_ant (@adj_antonyms) {
560             next if $adj_ant eq $adj_sense;
561             $adj_ant =~ s/\#.+$// if $no_sense_indicators;
562             $adj_antonyms{$adj_ant} = 1;
563             $answer .= " $adj_ant ";
564             }
565             }
566             } elsif ($pos =~ /r$/) {
567             my @all_adv_ant_sense_labels = $wn->queryWord( $pos, "ants");
568             my $how_many = @all_adv_ant_sense_labels;
569             foreach my $adv_sense (@all_adv_ant_sense_labels) {
570             my @adv_antonyms = $wn->queryWord($adv_sense, "ants");
571             my $answer = "";
572             foreach my $adv_ant (@adv_antonyms) {
573             next if $adv_ant eq $adv_sense;
574             $adv_ant =~ s/\#.+$// if $no_sense_indicators;
575             $adv_antonyms{$adv_ant} = 1;
576             $answer .= " $adv_ant ";
577             }
578             }
579             } else {
580             die "\nThe Part of Speech $pos not recognized\n\n";
581             }
582             }
583             my @all_antonyms;
584             my @all_noun_antonyms = keys %noun_antonyms;
585             my @all_verb_antonyms = keys %verb_antonyms;
586             my @all_adj_antonyms = keys %adj_antonyms;
587             my @all_adv_antonyms = keys %adv_antonyms;
588             push @all_antonyms, @all_noun_antonyms if @all_noun_antonyms > 0;
589             push @all_antonyms, @all_verb_antonyms if @all_verb_antonyms > 0;
590             push @all_antonyms, @all_adj_antonyms if @all_adj_antonyms > 0;
591             push @all_antonyms, @all_adv_antonyms if @all_adv_antonyms > 0;
592             my %antonym_set;
593             foreach my $antonym (@all_antonyms) {
594             $antonym_set{$antonym} = 1;
595             }
596             my @antonym_set = sort keys %antonym_set;
597             return \@antonym_set;
598             }
599              
600             sub expand_all_tickets_with_synonyms {
601             my $self = shift;
602             return unless $self->{_add_synsets_to_tickets};
603             my $num_of_tickets = $self->{_total_num_tickets};
604             if ($self->{_want_synset_caching}) {
605             eval {
606             $self->{_synset_cache} = retrieve( $self->{_synset_cache_db} );
607             } if -s $self->{_synset_cache_db};
608             if ($@) {
609             print "Something went wrong with restoration of synset cache: $@";
610             }
611             }
612             my $i = 1;
613             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
614             $self->_expand_one_ticket_with_synonyms($ticket_id);
615             print "Finished syn expansion of ticket $ticket_id ($i out of $num_of_tickets)\n";
616             $i++;
617             }
618             if ($self->{_want_synset_caching}) {
619             $self->{_synset_cache_db} = "synset_cache.db" unless $self->{_synset_cache_db};
620             eval {
621             store( $self->{_synset_cache}, $self->{_synset_cache_db} );
622             };
623             if ($@) {
624             die "Something went wrong with disk storage of synset cache: $@";
625             }
626             }
627             }
628              
629             sub _expand_one_ticket_with_synonyms {
630             my $self = shift;
631             my $ticket_id = shift;
632             print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n"
633             if $self->{_debug2};
634             $self->_replace_negated_words_with_antonyms_one_ticket( $ticket_id );
635             $self->_add_to_words_their_synonyms_one_ticket( $ticket_id );
636             }
637              
638             sub _replace_negated_words_with_antonyms_one_ticket {
639             my $self = shift;
640             my $ticket_id = shift;
641             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
642             my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig;
643             foreach my $word (@words_negated_with_not) {
644             next unless (($word =~ /^\w+$/) &&
645             (length($word) > $self->{_min_word_length}));
646             my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
647             next unless @antonym_words > 0;
648             $#antonym_words = $self->{_max_num_syn_words} - 1
649             if @antonym_words > $self->{_max_num_syn_words};
650             my $antonym_replacement_string = join ' ', @antonym_words;
651             print "Antonym for $word is $antonym_replacement_string\n"
652             if $self->{_debug2};
653             $record =~ s/not\s+$word/$antonym_replacement_string/g;
654             }
655             my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig;
656             foreach my $word (@words_negated_with_no) {
657             next unless (($word =~ /^\w+$/) &&
658             (length($word) > $self->{_min_word_length}));
659             my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
660             next unless @antonym_words > 0;
661             $#antonym_words = $self->{_max_num_syn_words} - 1
662             if @antonym_words > $self->{_max_num_syn_words};
663             my $antonym_replacement_string = join ' ', @antonym_words;
664             print "Antonym for $word is $antonym_replacement_string\n"
665             if $self->{_debug2};
666             $record =~ s/no\s+$word/$antonym_replacement_string/g;
667             }
668             $self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
669             }
670              
671             sub _add_to_words_their_synonyms_one_ticket {
672             my $self = shift;
673             my $ticket_id = shift;
674             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
675             my @words = split /\s+/, $record;
676             my @synonym_bag;
677             foreach my $word (@words) {
678             next if $word eq 'no';
679             next if $word eq 'not';
680             next unless $word =~ /^\w+$/ &&
681             length($word) > $self->{_min_word_length};
682             my @synonym_words;
683             @synonym_words = @{$self->{_synset_cache}->{$word}}
684             if exists $self->{_synset_cache}->{$word};
685             unless (exists $self->{_synset_cache}->{$word}) {
686             @synonym_words = @{$self->_get_synonyms_for_word( $word )};
687             print "syn-set for $word => @synonym_words\n\n"
688             if $self->{_debug2};
689             my $word_root;
690             if (@synonym_words == 0) {
691             if ((length($word) > 4) && ($word =~ /(.+)s$/)) {
692             $word_root = $1;
693             @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
694             if length($word_root) >= $self->{_min_word_length};
695             } elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) {
696             $word_root = $1;
697             @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
698             if length($word_root) >= $self->{_min_word_length};
699             }
700             }
701             print "syn-set for word root $word_root => @synonym_words\n\n"
702             if ( $self->{_debug2} && defined $word_root );
703             _fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0;
704             $#synonym_words = $self->{_max_num_syn_words} - 1
705             if @synonym_words > $self->{_max_num_syn_words};
706             print "Retained syn-set for $word => @synonym_words\n\n"
707             if $self->{_debug2};
708             $self->{_synset_cache}->{$word} = \@synonym_words;
709             push @synonym_bag, @synonym_words;
710             }
711             }
712             foreach my $syn_word (@synonym_bag) {
713             push @words, lc($syn_word)
714             unless ((exists $self->{_stop_words}->{$syn_word}) ||
715             (length($syn_word) <= $self->{_min_word_length}));
716             }
717             my @sorted_words = sort @words;
718             my $new_record = join ' ', @sorted_words;
719             $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
720             }
721              
722             sub store_processed_tickets_on_disk {
723             my $self = shift;
724             $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
725             unlink $self->{_processed_tickets_db};
726             eval {
727             store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} );
728             };
729             if ($@) {
730             die "Something went wrong with disk storage of processed tickets: $@";
731             }
732             }
733              
734             sub store_stemmed_tickets_and_inverted_index_on_disk {
735             my $self = shift;
736             $self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db};
737             unlink $self->{_stemmed_tickets_db};
738             eval {
739             print "\n\nStoring stemmed tickets on disk\n\n";
740             store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} );
741             };
742             if ($@) {
743             die "Something went wrong with disk storage of stemmed tickets: $@";
744             }
745             $self->{_inverted_index_db} = "inverted_index.db" unless $self->{_inverted_index_db};
746             unlink $self->{_inverted_index_db};
747             eval {
748             print "\nStoring inverted index on disk\n\n";
749             store( $self->{_inverted_index}, $self->{_inverted_index_db} );
750             };
751             if ($@) {
752             die "Something went wrong with disk storage of the inverted index: $@";
753             }
754             }
755              
756             sub restore_processed_tickets_from_disk {
757             my $self = shift;
758             eval {
759             $self->{_processed_tkts_by_ids} = retrieve( $self->{_processed_tickets_db} );
760             };
761             if ($@) {
762             die "Something went wrong with restoration of processed tickets: $@";
763             }
764             }
765              
766             sub restore_stemmed_tickets_from_disk {
767             my $self = shift;
768             eval {
769             $self->{_stemmed_tkts_by_ids} = retrieve( $self->{_stemmed_tickets_db} );
770             };
771             if ($@) {
772             die "Something went wrong with restoration of stemmed tickets: $@";
773             }
774             }
775              
776             #################### Get Ticket Vocabulary and Word Counts #################
777              
778             sub get_ticket_vocabulary_and_construct_inverted_index {
779             my $self = shift;
780             my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
781             $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
782             unlink glob "$self->{_tickets_vocab_db}.*";
783             my %vocab_hist_on_disk;
784             tie %vocab_hist_on_disk, 'SDBM_File',
785             $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
786             or die "Can't create DBM files: $!";
787             my %inverted_index;
788             my $min = $self->{_min_word_length};
789             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
790             my %uniques = ();
791             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
792             my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
793             my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
794             next unless @clean_words;
795             @clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}),
796             @clean_words;
797             map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
798             for (@clean_words) { $uniques{"\L$_"}++ };
799             map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques;
800             map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id }
801             keys %uniques;
802             $self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words;
803             }
804             foreach (keys %vocab_hist_on_disk) {
805             $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
806             }
807             untie %vocab_hist_on_disk;
808             $self->{_tkt_vocab_done} = 1;
809             $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
810             print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
811             if $self->{_debug2};
812             # Calculate idf(t):
813             $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
814             unlink glob "$self->{_idf_db}.*";
815             tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
816             or die "Can't create DBM files: $!";
817             foreach (keys %{$self->{_vocab_idf_hist}}) {
818             $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
819             /
820             (1 + $self->{_vocab_idf_hist}->{$_})))
821             / log(10) );
822             }
823             foreach (keys %idf_t_on_disk) {
824             $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
825             }
826             untie %idf_t_on_disk;
827             }
828              
829             sub display_tickets_vocab {
830             my $self = shift;
831             die "tickets vocabulary not yet constructed"
832             unless keys %{$self->{_vocab_hist}};
833             print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n";
834             foreach (sort keys %{$self->{_vocab_hist}}){
835             my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_});
836             print "$outstring\n";
837             }
838             my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
839             print "\nSize of the tickets vocabulary: $vocab_size\n\n";
840             }
841              
842             sub display_inverse_document_frequencies {
843             my $self = shift;
844             die "tickets vocabulary not yet constructed"
845             unless keys %{$self->{_vocab_idf_hist}};
846             print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n";
847             foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {
848             my $outstring = sprintf("%30s %d",
849             $_, $self->{_vocab_idf_hist}->{$_});
850             print "$outstring\n";
851             }
852             print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n";
853             foreach ( sort keys %{$self->{_idf_t}} ) {
854             my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_});
855             print "$outstring\n";
856             }
857             }
858              
859             # The following subroutine is useful for diagnostic purposes. It
860             # lists the number of tickets that a word appears in and also lists
861             # the tickets. But be careful in interpreting its results. Note
862             # if you invoke this subroutine after the synsets have been added
863             # to the tickets, you may find words being attributed to tickets
864             # that do not actually contain them in the original Excel sheet.
865             sub list_processed_tickets_for_a_word {
866             my $self = shift;
867             while (my $word = ) { #enter ctrl-D to exit the loop
868             chomp $word;
869             my @ticket_list;
870             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
871             my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
872             push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
873             }
874             my $num = @ticket_list;
875             print "\nThe number of processed tickets that mention the word `$word': $num\n\n";
876             print "The processed tickets: @ticket_list\n\n";
877             }
878             }
879              
880             sub list_stemmed_tickets_for_a_word {
881             my $self = shift;
882             while (my $word = ) { #enter ctrl-D to exit the loop
883             chomp $word;
884             my @ticket_list;
885             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
886             my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
887             push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
888             }
889             my $num = @ticket_list;
890             print "\nThe number of stemmed tickets that mention the word `$word': $num\n\n";
891             print "The stemmed tickets: @ticket_list\n\n";
892             }
893             }
894              
895             ############## Generate Document Vectors for Tickets ####################
896              
897             sub construct_doc_vectors_for_all_tickets {
898             my $self = shift;
899             foreach ( sort keys %{$self->{_vocab_hist}} ) {
900             $self->{_doc_vector_template}->{$_} = 0;
901             }
902             my $num_of_tickets = keys %{$self->{_stemmed_tkts_by_ids}};
903             my $i = 1;
904             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
905             my $doc_vec_ref =
906             $self->_construct_doc_vector_for_one_ticket($ticket_id);
907             print "Finished constructing two doc vecs for ticket $ticket_id ($i out of $num_of_tickets)\n";
908             $i++;
909             }
910             }
911              
912             sub _construct_doc_vector_for_one_ticket {
913             my $self = shift;
914             my $ticket_id = shift;
915             unless (keys %{$self->{_doc_vector_template}}) {
916             foreach ( sort keys %{$self->{_vocab_hist}} ) {
917             $self->{_doc_vector_template}->{$_} = 0;
918             }
919             }
920             my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})};
921             foreach ( sort keys %{$self->{_doc_vector_template}} ) {
922             $doc_vector{$_} = 0;
923             }
924             my $min = $self->{_min_word_length};
925             my $total_words_in_ticket = 0;
926             my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
927             my @clean_words = split /\s+/, $record;
928             map { $doc_vector{"\L$_"}++ }
929             grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
930             die "Something went wrong. Doc vector size unequal to vocab size"
931             unless $self->{_vocab_size} == scalar(keys %doc_vector);
932             foreach (keys %doc_vector) {
933             $total_words_in_ticket += $doc_vector{$_};
934             }
935             my %normalized_doc_vector;
936             foreach (keys %doc_vector) {
937             $normalized_doc_vector{$_} = $doc_vector{$_}
938             *
939             $self->{_idf_t}->{$_}
940             /
941             $total_words_in_ticket;
942             }
943             $self->{_tkt_doc_vecs}->{$ticket_id} = \%doc_vector;
944             $self->{_tkt_doc_vecs_normed}->{$ticket_id} = \%normalized_doc_vector;
945             }
946              
947             sub store_ticket_vectors {
948             my $self = shift;
949             die "You have not yet created doc vectors for tickets"
950             unless keys %{$self->{_tkt_doc_vecs}};
951             $self->{_tkt_doc_vecs_db} = "tkt_doc_vecs.db" unless $self->{_tkt_doc_vecs_db};
952             $self->{_tkt_doc_vecs_normed_db} = "tkt_doc_vecs_normed.db"
953             unless $self->{_tkt_doc_vecs_normed_db};
954             unlink $self->{_tkt_doc_vecs_db};
955             unlink $self->{_tkt_doc_vecs_normed_db};
956             print "\nStoring the ticket doc vecs on disk. This could take a while.\n\n";
957             eval {
958             store( $self->{_tkt_doc_vecs}, $self->{_tkt_doc_vecs_db} );
959             };
960             if ($@) {
961             die "Something went wrong with disk storage of ticket doc vectors: $@";
962             }
963             print "\nStoring normalized ticket doc vecs on disk. This could take a while.\n\n";
964             eval {
965             store($self->{_tkt_doc_vecs_normed}, $self->{_tkt_doc_vecs_normed_db});
966             };
967             if ($@) {
968             die "Something wrong with disk storage of normalized doc vecs: $@";
969             }
970             }
971              
972             sub restore_ticket_vectors_and_inverted_index {
973             my $self = shift;
974             $self->restore_raw_tickets_from_disk();
975             $self->restore_processed_tickets_from_disk();
976             $self->restore_stemmed_tickets_from_disk();
977             tie my %vocab_hist_on_disk, 'SDBM_File', $self->{_tickets_vocab_db}, O_RDONLY, 0640
978             or die "Can't connect with DBM file: $!";
979             foreach (keys %vocab_hist_on_disk) {
980             $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
981             }
982             untie %vocab_hist_on_disk;
983             tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDONLY, 0640
984             or die "Can't connect with DBM file: $!";
985             foreach (keys %idf_t_on_disk) {
986             $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
987             }
988             untie %idf_t_on_disk;
989             eval {
990             $self->{_tkt_doc_vecs} = retrieve( $self->{_tkt_doc_vecs_db} );
991             };
992             if ($@) {
993             print "Something went wrong with retrieval of ticket doc vectors: $@";
994             }
995             eval {
996             $self->{_tkt_doc_vecs_normed} = retrieve( $self->{_tkt_doc_vecs_normed_db} );
997             };
998             if ($@) {
999             print "Something went wrong with retrieval of normed ticket doc vectors: $@";
1000             }
1001             eval {
1002             $self->{_inverted_index} =
1003             retrieve( $self->{_inverted_index_db} );
1004             };
1005             if ($@) {
1006             print "Something went wrong with retrieval of inverted_index: $@";
1007             }
1008             }
1009              
1010             sub display_all_doc_vectors {
1011             my $self = shift;
1012             die "Ticket doc vectors not yet constructed"
1013             unless keys %{$self->{_tkt_doc_vecs}};
1014             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs}}) {
1015             $self->_display_a_single_ticket_doc_vector($ticket_id);
1016             }
1017             }
1018              
1019             sub _display_a_single_ticket_doc_vector {
1020             my $self = shift;
1021             my $ticket_id = shift;
1022             die "Ticket doc vectors not yet constructed"
1023             unless keys %{$self->{_tkt_doc_vecs}};
1024             print "\n\nDISPLAYING Doc Vec FOR TICKET $ticket_id:\n\n";
1025             foreach ( sort keys %{$self->{_tkt_doc_vecs}->{$ticket_id}} ) {
1026             print "$_ => $self->{_tkt_doc_vecs}->{$ticket_id}->{$_}\n";
1027             }
1028             my $docvec_size = keys %{$self->{_tkt_doc_vecs}->{$ticket_id}};
1029             print "\nSize of vector for ticket $ticket_id: $docvec_size\n";
1030             }
1031              
1032             sub display_all_normalized_doc_vectors {
1033             my $self = shift;
1034             die "normalized document vectors not yet constructed"
1035             unless keys %{$self->{_tkt_doc_vecs_normed}};
1036             foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs_normed}}) {
1037             $self->_display_a_single_normalized_doc_vector($ticket_id);
1038             }
1039             }
1040              
1041             sub _display_a_single_normalized_doc_vector {
1042             my $self = shift;
1043             my $ticket_id = shift;
1044             die "Normalized ticket doc vectors not yet constructed"
1045             unless keys %{$self->{_tkt_doc_vecs_normed}};
1046             print "\n\nDISPLAYING Normalized Doc Vec FOR TICKET $ticket_id:\n\n";
1047             foreach ( sort keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}} ) {
1048             print "$_ => $self->{_tkt_doc_vecs_normed}->{$ticket_id}->{$_}\n";
1049             }
1050             my $docvec_size = keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}};
1051             print "\nSize of normalized vector for ticket $ticket_id: $docvec_size\n";
1052             }
1053              
1054             ########################## Display Inverted Index ###########################
1055              
1056             sub display_inverted_index {
1057             my $self = shift;
1058             print "\n\nDisplaying inverted index:\n\n";
1059             foreach my $word (sort keys %{$self->{_vocab_hist}}) {
1060             $self->display_inverted_index_for_given_word($word);
1061             }
1062             }
1063              
1064             sub display_inverted_index_for_given_word {
1065             my $self = shift;
1066             my $word = shift;
1067             defined $self->{_inverted_index}->{$word} ?
1068             print "$word => @{$self->{_inverted_index}->{$word}}\n" :
1069             die "Something is wrong with your inverted index\n";
1070             }
1071              
1072             sub display_inverted_index_for_given_query {
1073             my $self = shift;
1074             my $query_ticket_id = shift;
1075             my $query_record = $self->{_stemmed_tkts_by_ids}->{$query_ticket_id};
1076             my @query_words = grep $_, split /\s+/, $query_record;
1077             foreach my $qword (@query_words) {
1078             my $idf_t = $self->{_idf_t}->{$qword};
1079             my @relevant_tickets = @{$self->{_inverted_index}->{$qword}};
1080             print "\n$qword ($idf_t) ===> @relevant_tickets\n\n";
1081             }
1082             }
1083              
1084             ############# Retrieve Most Similar Tickets with VSM Model ###################
1085              
1086             sub retrieve_similar_tickets_with_vsm {
1087             my $self = shift;
1088             $self->{_query_ticket_id} = shift;
1089             die "\nFirst generate normalized doc vectors for tickets before you can call retrieve with vsm function()\n"
1090             unless scalar(keys %{$self->{_vocab_hist}})
1091             && scalar(keys %{$self->{_tkt_doc_vecs_normed}});
1092             print "\nCalculating the similarity set for query ticket $self->{_query_ticket_id}\n\n";
1093             my $query_record = $self->{_stemmed_tkts_by_ids}->{$self->{_query_ticket_id}};
1094             my @query_words = grep $_, split /\s+/, $query_record;
1095             my %relevant_tickets_set;
1096             die "\n\nYou did not set a value for the constructor parameter min_idf_threshold -- "
1097             unless $self->{_min_idf_threshold};
1098             foreach my $qword (@query_words) {
1099             map {$relevant_tickets_set{$_} = 1} @{$self->{_inverted_index}->{$qword}}
1100             if $self->{_idf_t}->{$qword} > $self->{_min_idf_threshold};
1101             }
1102             my @relevant_tickets = sort {$a <=> $b} keys %relevant_tickets_set;
1103             print "The relevant tickets for query: @relevant_tickets"
1104             if $self->{_debug3};
1105             my $num_relevant_tkts = @relevant_tickets;
1106             print "\nThe number of tickets relevant to the query: $num_relevant_tkts\n\n";
1107             my %retrievals;
1108             my $rank = 0;
1109             foreach (sort {$self->_doc_vec_comparator} @relevant_tickets ) {
1110             $retrievals{$_} = $self->_similarity_to_query_ticket($_);
1111             $rank++;
1112             last if $rank == $self->{_how_many_retrievals};
1113             }
1114             if ($self->{_debug3}) {
1115             print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
1116             foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
1117             print "$_ => $retrievals{$_}\n";
1118             }
1119             }
1120             return \%retrievals;
1121             }
1122              
1123             sub _doc_vec_comparator {
1124             my $self = shift;
1125             my %query_ticket_data_normed =
1126             %{$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}};
1127             my $vec1_hash_ref = $self->{_tkt_doc_vecs_normed}->{$a};
1128             my $vec2_hash_ref = $self->{_tkt_doc_vecs_normed}->{$b};
1129             my @vec1 = ();
1130             my @vec2 = ();
1131             my @qvec = ();
1132             foreach my $word (sort keys %{$self->{_vocab_hist}}) {
1133             push @vec1, $vec1_hash_ref->{$word};
1134             push @vec2, $vec2_hash_ref->{$word};
1135             push @qvec, $query_ticket_data_normed{$word};
1136             }
1137             my $vec1_mag = _vec_magnitude(\@vec1);
1138             my $vec2_mag = _vec_magnitude(\@vec2);
1139             my $qvec_mag = _vec_magnitude(\@qvec);
1140             my $product1 = _vec_scalar_product(\@vec1, \@qvec);
1141             $product1 /= $vec1_mag * $qvec_mag;
1142             my $product2 = _vec_scalar_product(\@vec2, \@qvec);
1143             $product2 /= $vec2_mag * $qvec_mag;
1144             return 1 if $product1 < $product2;
1145             return 0 if $product1 == $product2;
1146             return -1 if $product1 > $product2;
1147             }
1148              
1149             sub _similarity_to_query_ticket {
1150             my $self = shift;
1151             my $ticket_id = shift;
1152             my $ticket_data_normed = $self->{_tkt_doc_vecs_normed}->{$ticket_id};
1153             my @vec = ();
1154             my @qvec = ();
1155             foreach my $word (sort keys %$ticket_data_normed) {
1156             push @vec, $ticket_data_normed->{$word};
1157             push @qvec,
1158             $self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}->{$word};
1159             }
1160             my $vec_mag = _vec_magnitude(\@vec);
1161             my $qvec_mag = _vec_magnitude(\@qvec);
1162             die "\nThe query ticket appears to be empty\n" if $qvec_mag == 0;
1163             my $product = _vec_scalar_product(\@vec, \@qvec);
1164             $product /= $vec_mag * $qvec_mag;
1165             return $product;
1166             }
1167              
1168              
1169             ######################## Utility Subroutines ##########################
1170              
1171             sub _simple_stemmer {
1172             my $word = shift;
1173             my $debug = shift;
1174             print "\nStemming the word: $word\n" if $debug;
1175             $word =~ s/(.*[a-z]t)ted$/$1/i;
1176             $word =~ s/(.*[a-z]t)ting$/$1/i;
1177             $word =~ s/(.*[a-z]l)ling$/$1/i;
1178             $word =~ s/(.*[a-z]g)ging$/$1/i;
1179             $word =~ s/(.*[a-z]ll)ed$/$1/i;
1180             $word =~ s/(.*[a-z][^aeious])s$/$1/i;
1181             $word =~ s/(.*[a-z])ies$/$1y/i;
1182             $word =~ s/(.*[a-z]s)es$/$1/i;
1183             $word =~ s/(.*[a-z][ck])es$/$1e/i;
1184             $word =~ s/(.*[a-z]+)tions$/$1tion/i;
1185             $word =~ s/(.*[a-z]+)mming$/$1m/i;
1186             $word =~ s/(.*[a-z]+[^rl])ing$/$1/i;
1187             $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i;
1188             $word =~ s/(.*[a-z]+)tices$/$1tex/i;
1189             $word =~ s/(.*[a-z]+)pes$/$1pe/i;
1190             $word =~ s/(.*[a-z]+)sed$/$1se/i;
1191             $word =~ s/(.*[a-z]+)ed$/$1/i;
1192             $word =~ s/(.*[a-z]+)tation$/$1t/i;
1193             print "Stemmed word: $word\n\n" if $debug;
1194             return $word;
1195             }
1196              
1197             sub _exists {
1198             my $element = shift;
1199             my $array = shift;
1200             my %hash;
1201             for my $item (@$array) {
1202             $hash{$item} = 1;
1203             }
1204             return exists $hash{$element};
1205             }
1206              
1207             sub _fetch_words_from_file {
1208             my $file = shift;
1209             my @words;
1210             open( IN, "$file" ) or die "unable to open the file $file: $!";
1211             while () {
1212             next if /^#/;
1213             next if /^[ ]*\r?\n?$/;
1214             $_ =~ s/\r?\n?$//;
1215             my @how_many_in_line = grep $_, split /\s+/, $_;
1216             die "File $file: Exactly one word allowed in each line -- "
1217             unless @how_many_in_line == 1;
1218             push @words, $_;
1219             }
1220             close IN;
1221             return \@words;
1222             }
1223              
1224             sub _fetch_word_pairs_from_file {
1225             my $file = shift;
1226             my @word_pairs;
1227             open( IN, "$file" ) or die "unable to open the file $file: $!";
1228             while () {
1229             next if /^#/;
1230             next if /^[ ]*$/;
1231             chomp;
1232             my @how_many_in_line = grep $_, split /\s+/, $_;
1233             die "File: $file --- Exactly two words must be in each non-comment or not-empty line -- "
1234             unless @how_many_in_line == 2;
1235             push @word_pairs, $_;
1236             }
1237             close IN;
1238             return \@word_pairs;
1239             }
1240              
1241             sub _get_rid_of_wide_chars {
1242             my $string = shift;
1243             $string =~ s/[^[:ascii:]]+//g;
1244             # $string =~ s/\x{FEFF}//g; to get rid of wide characters
1245             return $string;
1246             }
1247              
1248             sub _find_index_for_given_element {
1249             my $ele = shift;
1250             my $array_ref = shift;
1251             foreach my $i (0..@{$array_ref}-1) {
1252             return $i if $ele == $array_ref->[$i];
1253             }
1254             }
1255              
1256             sub _check_for_illegal_params {
1257             my @params = @_;
1258             my @legal_params = qw / excel_filename
1259             which_worksheet
1260             raw_tickets_db
1261             processed_tickets_db
1262             stemmed_tickets_db
1263             inverted_index_db
1264             tickets_vocab_db
1265             idf_db
1266             tkt_doc_vecs_db
1267             tkt_doc_vecs_normed_db
1268             synset_cache_db
1269             want_synset_caching
1270             add_synsets_to_tickets
1271             clustering_fieldname
1272             min_word_length
1273             min_idf_threshold
1274             max_num_syn_words
1275             stop_words_file
1276             misspelled_words_file
1277             unique_id_fieldname
1278             want_stemming
1279             how_many_retrievals
1280             debug1
1281             debug2
1282             debug3
1283             /;
1284             my $found_match_flag;
1285             foreach my $param (@params) {
1286              
1287             foreach my $legal (@legal_params) {
1288             $found_match_flag = 0;
1289             if ($param eq $legal) {
1290             $found_match_flag = 1;
1291             last;
1292             }
1293             }
1294             last if $found_match_flag == 0;
1295             }
1296             return $found_match_flag;
1297             }
1298              
1299             # Meant only for an un-nested hash:
1300             sub _deep_copy_hash {
1301             my $ref_in = shift;
1302             my $ref_out = {};
1303             foreach ( keys %{$ref_in} ) {
1304             $ref_out->{$_} = $ref_in->{$_};
1305             }
1306             return $ref_out;
1307             }
1308              
1309             # from perl docs:
1310             sub _fisher_yates_shuffle {
1311             my $arr = shift;
1312             my $i = @$arr;
1313             while (--$i) {
1314             my $j = int rand( $i + 1 );
1315             @$arr[$i, $j] = @$arr[$j, $i];
1316             }
1317             }
1318              
1319             sub _vec_scalar_product {
1320             my $vec1 = shift;
1321             my $vec2 = shift;
1322             die "Something is wrong --- the two vectors are of unequal length"
1323             unless @$vec1 == @$vec2;
1324             my $product;
1325             for my $i (0..@$vec1-1) {
1326             $product += $vec1->[$i] * $vec2->[$i];
1327             }
1328             return $product;
1329             }
1330              
1331             sub _vec_magnitude {
1332             my $vec = shift;
1333             my $mag_squared = 0;
1334             foreach my $num (@$vec) {
1335             $mag_squared += $num ** 2;
1336             }
1337             return sqrt $mag_squared;
1338             }
1339              
1340             1;
1341              
1342             __END__