| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Algorithm::TicketClusterer; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | #--------------------------------------------------------------------------- | 
| 4 |  |  |  |  |  |  | # Copyright (c) 2014 Avinash Kak. All rights reserved.  This program is | 
| 5 |  |  |  |  |  |  | # free software.  You may modify and/or distribute it under the same terms | 
| 6 |  |  |  |  |  |  | # as Perl itself.  This copyright notice must remain attached to the file. | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # Algorithm::TicketClusterer is a Perl module for retrieving Excel-stored | 
| 9 |  |  |  |  |  |  | # past tickets that are most similar to a new ticket.  Tickets are commonly | 
| 10 |  |  |  |  |  |  | # used in software services industry and customer support businesses to | 
| 11 |  |  |  |  |  |  | # record requests for service, product complaints, user feedback, and so | 
| 12 |  |  |  |  |  |  | # on. | 
| 13 |  |  |  |  |  |  | # --------------------------------------------------------------------------- | 
| 14 |  |  |  |  |  |  |  | 
| 15 | 1 |  |  | 1 |  | 12990 | use 5.10.0; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 16 | 1 |  |  | 1 |  | 4 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 22 |  | 
| 17 | 1 |  |  | 1 |  | 3 | use warnings; | 
|  | 1 |  |  |  |  | 10 |  | 
|  | 1 |  |  |  |  | 28 |  | 
| 18 | 1 |  |  | 1 |  | 3 | use Carp; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 75 |  | 
| 19 | 1 |  |  | 1 |  | 612 | use Storable; | 
|  | 1 |  |  |  |  | 2749 |  | 
|  | 1 |  |  |  |  | 69 |  | 
| 20 | 1 |  |  | 1 |  | 837 | use Spreadsheet::ParseExcel; | 
|  | 1 |  |  |  |  | 52964 |  | 
|  | 1 |  |  |  |  | 33 |  | 
| 21 | 1 |  |  | 1 |  | 536 | use Spreadsheet::XLSX; | 
|  | 1 |  |  |  |  | 64852 |  | 
|  | 1 |  |  |  |  | 33 |  | 
| 22 | 1 |  |  | 1 |  | 1084 | use WordNet::QueryData; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | use Text::Iconv; | 
| 24 |  |  |  |  |  |  | use SDBM_File; | 
| 25 |  |  |  |  |  |  | use Fcntl; | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | our $VERSION = '1.01'; | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | ############################### The Constructor ############################# | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | sub new { | 
| 32 |  |  |  |  |  |  | my ($class, %args) = @_; | 
| 33 |  |  |  |  |  |  | my @params = keys %args; | 
| 34 |  |  |  |  |  |  | croak "\nYou have used a wrong name for a keyword argument " . | 
| 35 |  |  |  |  |  |  | "--- perhaps a misspelling\n" | 
| 36 |  |  |  |  |  |  | if _check_for_illegal_params(@params) == 0; | 
| 37 |  |  |  |  |  |  | bless { | 
| 38 |  |  |  |  |  |  | _excel_filename         =>   $args{excel_filename}, | 
| 39 |  |  |  |  |  |  | _which_worksheet        =>   $args{which_worksheet}, | 
| 40 |  |  |  |  |  |  | _raw_tickets_db         =>   $args{raw_tickets_db}, | 
| 41 |  |  |  |  |  |  | _processed_tickets_db   =>   $args{processed_tickets_db}, | 
| 42 |  |  |  |  |  |  | _synset_cache_db        =>   $args{synset_cache_db}, | 
| 43 |  |  |  |  |  |  | _stemmed_tickets_db     =>   $args{stemmed_tickets_db}, | 
| 44 |  |  |  |  |  |  | _inverted_index_db      =>   $args{inverted_index_db}, | 
| 45 |  |  |  |  |  |  | _tickets_vocab_db       =>   $args{tickets_vocab_db}, | 
| 46 |  |  |  |  |  |  | _idf_db                 =>   $args{idf_db}, | 
| 47 |  |  |  |  |  |  | _tkt_doc_vecs_db        =>   $args{tkt_doc_vecs_db}, | 
| 48 |  |  |  |  |  |  | _tkt_doc_vecs_normed_db =>   $args{tkt_doc_vecs_normed_db}, | 
| 49 |  |  |  |  |  |  | _clustering_fieldname   =>   $args{clustering_fieldname}, | 
| 50 |  |  |  |  |  |  | _unique_id_fieldname    =>   $args{unique_id_fieldname}, | 
| 51 |  |  |  |  |  |  | _stop_words_file        =>   $args{stop_words_file}, | 
| 52 |  |  |  |  |  |  | _misspelled_words_file  =>   $args{misspelled_words_file}, | 
| 53 |  |  |  |  |  |  | _min_word_length        =>   $args{min_word_length} || 4, | 
| 54 |  |  |  |  |  |  | _add_synsets_to_tickets =>   $args{add_synsets_to_tickets} || 0, | 
| 55 |  |  |  |  |  |  | _want_stemming          =>   $args{want_stemming} || 0, | 
| 56 |  |  |  |  |  |  | _how_many_retrievals    =>   $args{how_many_retrievals} || 5, | 
| 57 |  |  |  |  |  |  | _min_idf_threshold      =>   $args{min_idf_threshold}, | 
| 58 |  |  |  |  |  |  | _max_num_syn_words      =>   $args{max_num_syn_words} || 3, | 
| 59 |  |  |  |  |  |  | _want_synset_caching    =>   $args{want_synset_caching} || 0, | 
| 60 |  |  |  |  |  |  | _stop_words             =>   {}, | 
| 61 |  |  |  |  |  |  | _all_tickets            =>   [], | 
| 62 |  |  |  |  |  |  | _column_headers         =>   [], | 
| 63 |  |  |  |  |  |  | _good_columns           =>   [], | 
| 64 |  |  |  |  |  |  | _tickets_by_ids         =>   {}, | 
| 65 |  |  |  |  |  |  | _processed_tkts_by_ids  =>   {}, | 
| 66 |  |  |  |  |  |  | _stemmed_tkts_by_ids    =>   {}, | 
| 67 |  |  |  |  |  |  | _misspelled_words       =>   {}, | 
| 68 |  |  |  |  |  |  | _total_num_tickets      =>   0, | 
| 69 |  |  |  |  |  |  | _synset_cache           =>   {}, | 
| 70 |  |  |  |  |  |  | _vocab_hash             =>   {}, | 
| 71 |  |  |  |  |  |  | _vocab_idf_hist         =>   {}, | 
| 72 |  |  |  |  |  |  | _idf_t                  =>   {}, | 
| 73 |  |  |  |  |  |  | _vocab_size             =>   undef, | 
| 74 |  |  |  |  |  |  | _doc_vector_template    =>   {}, | 
| 75 |  |  |  |  |  |  | _tkt_doc_vecs           =>   {}, | 
| 76 |  |  |  |  |  |  | _tkt_doc_vecs_normed    =>   {}, | 
| 77 |  |  |  |  |  |  | _query_ticket_id        =>   undef, | 
| 78 |  |  |  |  |  |  | _inverted_index         =>   {}, | 
| 79 |  |  |  |  |  |  | _debug1                 =>   $args{debug1} || 0, # for processing Excel | 
| 80 |  |  |  |  |  |  | _debug2                 =>   $args{debug2} || 0, # for modeling tickets | 
| 81 |  |  |  |  |  |  | _debug3                 =>   $args{debug3} || 0, # for retrieving similar tickets | 
| 82 |  |  |  |  |  |  | _wn                     =>   WordNet::QueryData->new( verbose => 0, | 
| 83 |  |  |  |  |  |  | noload => 1 ), | 
| 84 |  |  |  |  |  |  | }, $class; | 
| 85 |  |  |  |  |  |  | } | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | #############################  Extract info from Excel  ####################### | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | sub get_tickets_from_excel { | 
| 90 |  |  |  |  |  |  | my $self = shift; | 
| 91 |  |  |  |  |  |  | unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db}; | 
| 92 |  |  |  |  |  |  | unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db}; | 
| 93 |  |  |  |  |  |  | unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db}; | 
| 94 |  |  |  |  |  |  | unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db}; | 
| 95 |  |  |  |  |  |  | unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db}; | 
| 96 |  |  |  |  |  |  | unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db}; | 
| 97 |  |  |  |  |  |  | unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db}; | 
| 98 |  |  |  |  |  |  | unlink glob "$self->{_tickets_vocab_db}.*"; | 
| 99 |  |  |  |  |  |  | unlink glob "$self->{_idf_db}.*"; | 
| 100 |  |  |  |  |  |  | my $filename = $self->{_excel_filename} || die("Excel file required"), | 
| 101 |  |  |  |  |  |  | my $clustering_fieldname = $self->{_clustering_fieldname} | 
| 102 |  |  |  |  |  |  | || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- "); | 
| 103 |  |  |  |  |  |  | my $unique_id_fieldname = $self->{_unique_id_fieldname} | 
| 104 |  |  |  |  |  |  | || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- "); | 
| 105 |  |  |  |  |  |  | my $workbook; | 
| 106 |  |  |  |  |  |  | if ($filename =~ /\.xls$/) { | 
| 107 |  |  |  |  |  |  | my $parser = Spreadsheet::ParseExcel->new(); | 
| 108 |  |  |  |  |  |  | $workbook = $parser->parse($filename); | 
| 109 |  |  |  |  |  |  | die $parser->error() unless defined $workbook; | 
| 110 |  |  |  |  |  |  | } elsif ($filename =~ /\.xlsx$/) { | 
| 111 |  |  |  |  |  |  | #        use Text::Iconv; | 
| 112 |  |  |  |  |  |  | my $converter = Text::Iconv->new("utf-8", "windows-1251"); | 
| 113 |  |  |  |  |  |  | $workbook = Spreadsheet::XLSX->new($filename, $converter); | 
| 114 |  |  |  |  |  |  | } else { | 
| 115 |  |  |  |  |  |  | die "File suffix on the Excel file not recognized"; | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  | my @worksheets = $workbook->worksheets(); | 
| 118 |  |  |  |  |  |  | my $which_worksheet = $self->{_which_worksheet} || | 
| 119 |  |  |  |  |  |  | die "\nYou have not specified which Excel worksheet contains the tickets\n"; | 
| 120 |  |  |  |  |  |  | my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range(); | 
| 121 |  |  |  |  |  |  | my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range(); | 
| 122 |  |  |  |  |  |  | my @good_columns; | 
| 123 |  |  |  |  |  |  | my $col_headers_row; | 
| 124 |  |  |  |  |  |  | my $col_headers_found = 0; | 
| 125 |  |  |  |  |  |  | my $col_index_for_unique_id; | 
| 126 |  |  |  |  |  |  | my $col_index_for_clustering_field; | 
| 127 |  |  |  |  |  |  | for my $row ( $row_min .. $row_max ) { | 
| 128 |  |  |  |  |  |  | last if $col_headers_found; | 
| 129 |  |  |  |  |  |  | @good_columns = (); | 
| 130 |  |  |  |  |  |  | for my $col ( $col_min .. $col_max ) { | 
| 131 |  |  |  |  |  |  | my $cell = | 
| 132 |  |  |  |  |  |  | $worksheets[$which_worksheet-1]->get_cell( $row, $col ); | 
| 133 |  |  |  |  |  |  | next unless $cell; | 
| 134 |  |  |  |  |  |  | my $cell_value = _get_rid_of_wide_chars($cell->value()); | 
| 135 |  |  |  |  |  |  | push @good_columns, $col if $cell_value; | 
| 136 |  |  |  |  |  |  | if ($cell_value eq $unique_id_fieldname) { | 
| 137 |  |  |  |  |  |  | $col_index_for_unique_id = $col; | 
| 138 |  |  |  |  |  |  | $col_headers_row = $row; | 
| 139 |  |  |  |  |  |  | $col_headers_found = 1; | 
| 140 |  |  |  |  |  |  | } | 
| 141 |  |  |  |  |  |  | if ($cell_value eq $clustering_fieldname) { | 
| 142 |  |  |  |  |  |  | $col_index_for_clustering_field = $col; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  | } | 
| 146 |  |  |  |  |  |  | $self->{_good_columns} = \@good_columns; | 
| 147 |  |  |  |  |  |  | print "\nThe unique id is in column: $col_index_for_unique_id\n" | 
| 148 |  |  |  |  |  |  | if $self->{_debug1}; | 
| 149 |  |  |  |  |  |  | print "The clustering field is in column: " . | 
| 150 |  |  |  |  |  |  | "$col_index_for_clustering_field\n\n" if $self->{_debug1}; | 
| 151 |  |  |  |  |  |  | my %Column_Headers; | 
| 152 |  |  |  |  |  |  | foreach my $field_index (0..@good_columns-1) { | 
| 153 |  |  |  |  |  |  | my $key = "field_" . $field_index; | 
| 154 |  |  |  |  |  |  | $Column_Headers{$key} = ""; | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  | my @col_headers = map { | 
| 157 |  |  |  |  |  |  | my $cell = | 
| 158 |  |  |  |  |  |  | $worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_); | 
| 159 |  |  |  |  |  |  | $cell ? _get_rid_of_wide_chars($cell->value()) : ''; | 
| 160 |  |  |  |  |  |  | } @good_columns; | 
| 161 |  |  |  |  |  |  | $self->{_column_headers} = \@col_headers; | 
| 162 |  |  |  |  |  |  | $self->_display_column_headers() if $self->{_debug1}; | 
| 163 |  |  |  |  |  |  | my $unique_id_field_index_in_good_columns = | 
| 164 |  |  |  |  |  |  | _find_index_for_given_element( $col_index_for_unique_id, \@good_columns ); | 
| 165 |  |  |  |  |  |  | my $clustering_field_index_in_good_columns = | 
| 166 |  |  |  |  |  |  | _find_index_for_given_element( $col_index_for_clustering_field, | 
| 167 |  |  |  |  |  |  | \@good_columns ); | 
| 168 |  |  |  |  |  |  | die "Something is wrong with the info extracted from Excel " . | 
| 169 |  |  |  |  |  |  | "as the index for the column with unique IDs is not one of " . | 
| 170 |  |  |  |  |  |  | "good columns\n\n" | 
| 171 |  |  |  |  |  |  | unless (defined $unique_id_field_index_in_good_columns) && | 
| 172 |  |  |  |  |  |  | (defined $clustering_field_index_in_good_columns); | 
| 173 |  |  |  |  |  |  | for my $row_index ( $col_headers_row+1..$row_max-1) { | 
| 174 |  |  |  |  |  |  | my @values = map { | 
| 175 |  |  |  |  |  |  | my $cell = | 
| 176 |  |  |  |  |  |  | $worksheets[$which_worksheet-1]->get_cell($row_index, $_); | 
| 177 |  |  |  |  |  |  | $cell ? _get_rid_of_wide_chars($cell->value()) : ''; | 
| 178 |  |  |  |  |  |  | } @good_columns; | 
| 179 |  |  |  |  |  |  | next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/; | 
| 180 |  |  |  |  |  |  | next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/; | 
| 181 |  |  |  |  |  |  | my %onerow; | 
| 182 |  |  |  |  |  |  | foreach my $field_index (0..@good_columns-1) { | 
| 183 |  |  |  |  |  |  | my $key = "field_" . $field_index; | 
| 184 |  |  |  |  |  |  | die "The Columns Headers hash has no field for index " . | 
| 185 |  |  |  |  |  |  | "$field_index\n    " | 
| 186 |  |  |  |  |  |  | unless exists $col_headers[$field_index]; | 
| 187 |  |  |  |  |  |  | $onerow{$col_headers[$field_index]} = $values[$field_index]; | 
| 188 |  |  |  |  |  |  | } | 
| 189 |  |  |  |  |  |  | push @{$self->{_all_tickets}}, \%onerow; | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  | my @duplicates_for_id_field = @{$self->_check_unique_id_field()}; | 
| 192 |  |  |  |  |  |  | if (@duplicates_for_id_field > 0) { | 
| 193 |  |  |  |  |  |  | print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n"; | 
| 194 |  |  |  |  |  |  | die "\n\nYour unique id field for tickets contains duplicate id's"; | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  | foreach my $ticket (@{$self->{_all_tickets}}) { | 
| 197 |  |  |  |  |  |  | $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} = | 
| 198 |  |  |  |  |  |  | lc($ticket->{$clustering_fieldname}); | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  | $self->{_total_num_tickets} = scalar @{$self->{_all_tickets}}; | 
| 201 |  |  |  |  |  |  | $self->store_raw_tickets_on_disk(); | 
| 202 |  |  |  |  |  |  | } | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | sub _test_excel_for_tickets { | 
| 205 |  |  |  |  |  |  | my $self = shift; | 
| 206 |  |  |  |  |  |  | use Text::Iconv; | 
| 207 |  |  |  |  |  |  | my $converter = Text::Iconv->new("utf-8", "windows-1251"); | 
| 208 |  |  |  |  |  |  | my $filename = $self->{_excel_filename} || die("Excel sheet needed for testing is missing"); | 
| 209 |  |  |  |  |  |  | my $workbook = Spreadsheet::XLSX->new( $filename, $converter ); | 
| 210 |  |  |  |  |  |  | my @worksheets = $workbook->worksheets(); | 
| 211 |  |  |  |  |  |  | my ( $row_min, $row_max ) = $worksheets[0]->row_range(); | 
| 212 |  |  |  |  |  |  | my ( $col_min, $col_max ) = $worksheets[0]->col_range(); | 
| 213 |  |  |  |  |  |  | return ($row_min, $row_max, $col_min, $col_max); | 
| 214 |  |  |  |  |  |  | } | 
| 215 |  |  |  |  |  |  |  | 
| 216 |  |  |  |  |  |  | sub _display_column_headers { | 
| 217 |  |  |  |  |  |  | my $self = shift; | 
| 218 |  |  |  |  |  |  | print "\nThe good columns are: @{$self->{_good_columns}}\n\n"; | 
| 219 |  |  |  |  |  |  | my $overall_header_string = join '  <>  ', @{$self->{_column_headers}}; | 
| 220 |  |  |  |  |  |  | print "The column headers are: $overall_header_string\n\n"; | 
| 221 |  |  |  |  |  |  | } | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | sub _check_unique_id_field { | 
| 224 |  |  |  |  |  |  | my $self = shift; | 
| 225 |  |  |  |  |  |  | my %check_hash; | 
| 226 |  |  |  |  |  |  | my @duplicates; | 
| 227 |  |  |  |  |  |  | foreach my $ticket (@{$self->{_all_tickets}}) { | 
| 228 |  |  |  |  |  |  | if (exists $ticket->{$self->{_unique_id_fieldname}}) { | 
| 229 |  |  |  |  |  |  | push @duplicates, $ticket->{$self->{_unique_id_fieldname}} | 
| 230 |  |  |  |  |  |  | if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}}; | 
| 231 |  |  |  |  |  |  | $check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1; | 
| 232 |  |  |  |  |  |  | } | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  | if ($self->{_debug1}) { | 
| 235 |  |  |  |  |  |  | my $num_of_tickets = @{$self->{_all_tickets}}; | 
| 236 |  |  |  |  |  |  | my $num_entries_check_hash = keys %check_hash; | 
| 237 |  |  |  |  |  |  | print "Number of tickets: $num_of_tickets\n"; | 
| 238 |  |  |  |  |  |  | print "Number of keys in check hash: $num_entries_check_hash\n"; | 
| 239 |  |  |  |  |  |  | } | 
| 240 |  |  |  |  |  |  | return \@duplicates; | 
| 241 |  |  |  |  |  |  | } | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | sub show_original_ticket_for_given_id { | 
| 244 |  |  |  |  |  |  | my $self = shift; | 
| 245 |  |  |  |  |  |  | my $id = shift; | 
| 246 |  |  |  |  |  |  | print "\n\nDisplaying the fields for the ticket $id:\n\n"; | 
| 247 |  |  |  |  |  |  | foreach my $ticket (@{$self->{_all_tickets}}) { | 
| 248 |  |  |  |  |  |  | if ( $ticket->{$self->{_unique_id_fieldname}} == $id) { | 
| 249 |  |  |  |  |  |  | foreach my $key (sort keys %{$ticket}) { | 
| 250 |  |  |  |  |  |  | my $value = $ticket->{$key}; | 
| 251 |  |  |  |  |  |  | $value =~ s/^\s+//; | 
| 252 |  |  |  |  |  |  | $value =~ s/\s+$//; | 
| 253 |  |  |  |  |  |  | printf("%20s  ==>  %s\n", $key, $value); | 
| 254 |  |  |  |  |  |  | } | 
| 255 |  |  |  |  |  |  | } | 
| 256 |  |  |  |  |  |  | } | 
| 257 |  |  |  |  |  |  | } | 
| 258 |  |  |  |  |  |  |  | 
| 259 |  |  |  |  |  |  | sub show_raw_ticket_clustering_data_for_given_id { | 
| 260 |  |  |  |  |  |  | my $self = shift; | 
| 261 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 262 |  |  |  |  |  |  | my $record = $self->{_tickets_by_ids}->{$ticket_id}; | 
| 263 |  |  |  |  |  |  | print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n" . | 
| 264 |  |  |  |  |  |  | "$record\n\n"; | 
| 265 |  |  |  |  |  |  | return $record; | 
| 266 |  |  |  |  |  |  | } | 
| 267 |  |  |  |  |  |  |  | 
| 268 |  |  |  |  |  |  | # Needed by test.t | 
| 269 |  |  |  |  |  |  | sub _raw_ticket_clustering_data_for_given_id { | 
| 270 |  |  |  |  |  |  | my $self = shift; | 
| 271 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 272 |  |  |  |  |  |  | my $record = $self->{_tickets_by_ids}->{$ticket_id}; | 
| 273 |  |  |  |  |  |  | return $record; | 
| 274 |  |  |  |  |  |  | } | 
| 275 |  |  |  |  |  |  |  | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | sub show_processed_ticket_clustering_data_for_given_id { | 
| 278 |  |  |  |  |  |  | my $self = shift; | 
| 279 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 280 |  |  |  |  |  |  | my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; | 
| 281 |  |  |  |  |  |  | print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" . | 
| 282 |  |  |  |  |  |  | "$record\n\n"; | 
| 283 |  |  |  |  |  |  | } | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | sub show_stemmed_ticket_clustering_data_for_given_id { | 
| 286 |  |  |  |  |  |  | my $self = shift; | 
| 287 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 288 |  |  |  |  |  |  | my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id}; | 
| 289 |  |  |  |  |  |  | print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" . | 
| 290 |  |  |  |  |  |  | "$record\n\n"; | 
| 291 |  |  |  |  |  |  | } | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | # The following function is a good diagnostic tool to look into the | 
| 294 |  |  |  |  |  |  | # array stored in $self->{_all_tickets}.  Each element of this array | 
| 295 |  |  |  |  |  |  | # is a record that represents one row of the Excel file. | 
| 296 |  |  |  |  |  |  | sub _show_row { | 
| 297 |  |  |  |  |  |  | my $self = shift; | 
| 298 |  |  |  |  |  |  | my $row_num = shift; | 
| 299 |  |  |  |  |  |  | my $total_rows = @{$self->{_all_tickets}}; | 
| 300 |  |  |  |  |  |  | print "There are $total_rows items in the \$all_tickets array\n"; | 
| 301 |  |  |  |  |  |  | die "The row that you want to see does not exist" | 
| 302 |  |  |  |  |  |  | unless $row_num < $total_rows; | 
| 303 |  |  |  |  |  |  | my %record = %{$self->{_all_tickets}->[$row_num]}; | 
| 304 |  |  |  |  |  |  | foreach my $field (sort keys %record) { | 
| 305 |  |  |  |  |  |  | my $value = $record{$field}; | 
| 306 |  |  |  |  |  |  | no warnings; | 
| 307 |  |  |  |  |  |  | print "$field  ==>  $value\n"; | 
| 308 |  |  |  |  |  |  | } | 
| 309 |  |  |  |  |  |  | } | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | sub store_raw_tickets_on_disk { | 
| 312 |  |  |  |  |  |  | my $self = shift; | 
| 313 |  |  |  |  |  |  | $self->{_raw_tickets_db} = "raw_tickets.db" unless $self->{_raw_tickets_db}; | 
| 314 |  |  |  |  |  |  | unlink $self->{_raw_tickets_db}; | 
| 315 |  |  |  |  |  |  | eval { | 
| 316 |  |  |  |  |  |  | store( $self->{_all_tickets}, $self->{_raw_tickets_db} ); | 
| 317 |  |  |  |  |  |  | }; | 
| 318 |  |  |  |  |  |  | if ($@) { | 
| 319 |  |  |  |  |  |  | die "Something went wrong with disk storage of ticket data: $@"; | 
| 320 |  |  |  |  |  |  | } | 
| 321 |  |  |  |  |  |  | } | 
| 322 |  |  |  |  |  |  |  | 
| 323 |  |  |  |  |  |  | sub restore_raw_tickets_from_disk { | 
| 324 |  |  |  |  |  |  | my $self = shift; | 
| 325 |  |  |  |  |  |  | my $clustering_fieldname = $self->{_clustering_fieldname} | 
| 326 |  |  |  |  |  |  | || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- "); | 
| 327 |  |  |  |  |  |  | my $unique_id_fieldname = $self->{_unique_id_fieldname} | 
| 328 |  |  |  |  |  |  | || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- "); | 
| 329 |  |  |  |  |  |  | eval { | 
| 330 |  |  |  |  |  |  | $self->{_all_tickets} = retrieve( $self->{_raw_tickets_db} ); | 
| 331 |  |  |  |  |  |  | }; | 
| 332 |  |  |  |  |  |  | if ($@) { | 
| 333 |  |  |  |  |  |  | die "Unable to retrieve raw tickets from disk: $@"; | 
| 334 |  |  |  |  |  |  | } | 
| 335 |  |  |  |  |  |  | foreach my $ticket (@{$self->{_all_tickets}}) { | 
| 336 |  |  |  |  |  |  | $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} = | 
| 337 |  |  |  |  |  |  | lc($ticket->{$clustering_fieldname}); | 
| 338 |  |  |  |  |  |  | } | 
| 339 |  |  |  |  |  |  | $self->{_total_num_tickets} = scalar keys %{$self->{_tickets_by_ids}}; | 
| 340 |  |  |  |  |  |  | } | 
| 341 |  |  |  |  |  |  |  | 
| 342 |  |  |  |  |  |  | sub delete_markup_from_all_tickets { | 
| 343 |  |  |  |  |  |  | my $self = shift; | 
| 344 |  |  |  |  |  |  | foreach my $ticket (@{$self->{_all_tickets}}) { | 
| 345 |  |  |  |  |  |  | $self->_delete_markup_from_one_ticket($ticket->{$self->{_unique_id_fieldname}}); | 
| 346 |  |  |  |  |  |  | } | 
| 347 |  |  |  |  |  |  | } | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | sub _delete_markup_from_one_ticket { | 
| 350 |  |  |  |  |  |  | my $self = shift; | 
| 351 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 352 |  |  |  |  |  |  | my $ticket_strings = $self->{_tickets_by_ids}->{$ticket_id}; | 
| 353 |  |  |  |  |  |  | my @strings = grep $_, split /\s+/, $ticket_strings; | 
| 354 |  |  |  |  |  |  | my $cleaned_up_strings = join ' ', grep {$_ !~ /^<[^<>]+>$/} @strings; | 
| 355 |  |  |  |  |  |  | $self->{_tickets_by_ids}->{$ticket_id} = $cleaned_up_strings; | 
| 356 |  |  |  |  |  |  | foreach my $ticket (@{$self->{_all_tickets}}) { | 
| 357 |  |  |  |  |  |  | if ( $ticket->{$self->{_unique_id_fieldname}} == $ticket_id ) { | 
| 358 |  |  |  |  |  |  | $ticket->{$self->{_clustering_fieldname}} = $cleaned_up_strings; | 
| 359 |  |  |  |  |  |  | last; | 
| 360 |  |  |  |  |  |  | } | 
| 361 |  |  |  |  |  |  | } | 
| 362 |  |  |  |  |  |  | } | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | sub apply_filter_to_all_tickets { | 
| 365 |  |  |  |  |  |  | my $self = shift; | 
| 366 |  |  |  |  |  |  | my $stop_words_file = $self->{_stop_words_file} | 
| 367 |  |  |  |  |  |  | || die("\nYou forgot to supply the name of the stop words file in your constructor call\n"); | 
| 368 |  |  |  |  |  |  | my @stop_words = @{_fetch_words_from_file($stop_words_file)}; | 
| 369 |  |  |  |  |  |  | my $misspelled_words_file = $self->{_misspelled_words_file} | 
| 370 |  |  |  |  |  |  | || die("\nYou forgot to supply the name of the misspelled words file in your constructor call\n"); | 
| 371 |  |  |  |  |  |  | foreach my $word (@stop_words) { | 
| 372 |  |  |  |  |  |  | $self->{_stop_words}->{$word} = 1; | 
| 373 |  |  |  |  |  |  | } | 
| 374 |  |  |  |  |  |  | if ($self->{_misspelled_words_file}) { | 
| 375 |  |  |  |  |  |  | my @misspelled_word_pairs = | 
| 376 |  |  |  |  |  |  | @{_fetch_word_pairs_from_file($self->{_misspelled_words_file})}; | 
| 377 |  |  |  |  |  |  | foreach my $wordpair (@misspelled_word_pairs) { | 
| 378 |  |  |  |  |  |  | my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair; | 
| 379 |  |  |  |  |  |  | $self->{_misspelled_words}->{$wrong_word} = $good_word; | 
| 380 |  |  |  |  |  |  | } | 
| 381 |  |  |  |  |  |  | } | 
| 382 |  |  |  |  |  |  | my $i = 1; | 
| 383 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tickets_by_ids}}) { | 
| 384 |  |  |  |  |  |  | print "\nApplying filter to ticket $ticket_id ($i out of $self->{_total_num_tickets})\n"; | 
| 385 |  |  |  |  |  |  | $self->_apply_filter_to_one_ticket($ticket_id); | 
| 386 |  |  |  |  |  |  | $i++; | 
| 387 |  |  |  |  |  |  | } | 
| 388 |  |  |  |  |  |  | } | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | sub _apply_filter_to_one_ticket { | 
| 391 |  |  |  |  |  |  | my $self = shift; | 
| 392 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | unless (keys %{$self->{_stop_words}} > 0) { | 
| 395 |  |  |  |  |  |  | my @stop_words = @{_fetch_words_from_file($self->{_stop_words_file})}; | 
| 396 |  |  |  |  |  |  | foreach my $word (@stop_words) { | 
| 397 |  |  |  |  |  |  | $self->{_stop_words}->{$word} = 1; | 
| 398 |  |  |  |  |  |  | } | 
| 399 |  |  |  |  |  |  | } | 
| 400 |  |  |  |  |  |  | unless (keys %{$self->{_misspelled_words}} > 0) { | 
| 401 |  |  |  |  |  |  | if ($self->{_misspelled_words_file}) { | 
| 402 |  |  |  |  |  |  | my @misspelled_word_pairs = | 
| 403 |  |  |  |  |  |  | @{_fetch_word_pairs_from_file($self->{_misspelled_words_file})}; | 
| 404 |  |  |  |  |  |  | foreach my $wordpair (@misspelled_word_pairs) { | 
| 405 |  |  |  |  |  |  | my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair; | 
| 406 |  |  |  |  |  |  | $self->{_misspelled_words}->{$wrong_word} = $good_word; | 
| 407 |  |  |  |  |  |  | } | 
| 408 |  |  |  |  |  |  | } | 
| 409 |  |  |  |  |  |  | } | 
| 410 |  |  |  |  |  |  | my $record = $self->{_tickets_by_ids}->{$ticket_id}; | 
| 411 |  |  |  |  |  |  | my $min = $self->{_min_word_length}; | 
| 412 |  |  |  |  |  |  | my @words = split /\n|\r|\"|\'|\.|\,|\;|\?|\(|\)|\[|\]|\\|\/|\s+|\&/, $record; | 
| 413 |  |  |  |  |  |  | my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words; | 
| 414 |  |  |  |  |  |  | return unless @clean_words; | 
| 415 |  |  |  |  |  |  | my @new_words; | 
| 416 |  |  |  |  |  |  | foreach my $word (@words) { | 
| 417 |  |  |  |  |  |  | $word =~ s/(.+)[.,:!-]$/$1/; | 
| 418 |  |  |  |  |  |  | unless (($word eq 'no') or ($word eq 'not')) { | 
| 419 |  |  |  |  |  |  | next if length($word) < $self->{_min_word_length}; | 
| 420 |  |  |  |  |  |  | } | 
| 421 |  |  |  |  |  |  | if (exists $self->{_misspelled_words}->{lc($word)}) { | 
| 422 |  |  |  |  |  |  | push @new_words, $self->{_misspelled_words}->{$word}; | 
| 423 |  |  |  |  |  |  | next; | 
| 424 |  |  |  |  |  |  | } | 
| 425 |  |  |  |  |  |  | push @new_words, $word unless exists $self->{_stop_words}->{lc($word)}; | 
| 426 |  |  |  |  |  |  | } | 
| 427 |  |  |  |  |  |  | my $new_record = join ' ', @new_words; | 
| 428 |  |  |  |  |  |  | $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record; | 
| 429 |  |  |  |  |  |  | } | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | sub _get_synonyms_for_word { | 
| 432 |  |  |  |  |  |  | my $self = shift; | 
| 433 |  |  |  |  |  |  | my $word = shift; | 
| 434 |  |  |  |  |  |  | my $no_sense_indicators = 1; | 
| 435 |  |  |  |  |  |  | my $wn = $self->{_wn}; | 
| 436 |  |  |  |  |  |  | my @parts_of_speech = $wn->querySense("$word"); | 
| 437 |  |  |  |  |  |  |  | 
| 438 |  |  |  |  |  |  | my %noun_synonyms; | 
| 439 |  |  |  |  |  |  | my %verb_synonyms; | 
| 440 |  |  |  |  |  |  | my %adj_synonyms; | 
| 441 |  |  |  |  |  |  | my %adv_synonyms; | 
| 442 |  |  |  |  |  |  | foreach my $pos (@parts_of_speech) { | 
| 443 |  |  |  |  |  |  | if ($pos =~ /n$/) { | 
| 444 |  |  |  |  |  |  | my @all_noun_syn_sense_labels = $wn->querySense( $pos, "syns"); | 
| 445 |  |  |  |  |  |  | my $how_many = @all_noun_syn_sense_labels; | 
| 446 |  |  |  |  |  |  | foreach my $noun_sense (@all_noun_syn_sense_labels) { | 
| 447 |  |  |  |  |  |  | my @noun_synonyms = $wn->querySense($noun_sense, "syns"); | 
| 448 |  |  |  |  |  |  | my $answer = ""; | 
| 449 |  |  |  |  |  |  | foreach my $noun_syn (@noun_synonyms) { | 
| 450 |  |  |  |  |  |  | next if $noun_syn eq $noun_sense; | 
| 451 |  |  |  |  |  |  | $noun_syn =~ s/\#.+$// if $no_sense_indicators; | 
| 452 |  |  |  |  |  |  | $noun_synonyms{$noun_syn} = 1; | 
| 453 |  |  |  |  |  |  | $answer .= " $noun_syn "; | 
| 454 |  |  |  |  |  |  | } | 
| 455 |  |  |  |  |  |  | } | 
| 456 |  |  |  |  |  |  | } elsif ($pos =~ /v$/) { | 
| 457 |  |  |  |  |  |  | my @all_verb_syn_sense_labels = $wn->querySense( $pos, "syns"); | 
| 458 |  |  |  |  |  |  | my $how_many = @all_verb_syn_sense_labels; | 
| 459 |  |  |  |  |  |  | foreach my $verb_sense (@all_verb_syn_sense_labels) { | 
| 460 |  |  |  |  |  |  | my @verb_synonyms = $wn->querySense($verb_sense, "syns"); | 
| 461 |  |  |  |  |  |  | my $answer = ""; | 
| 462 |  |  |  |  |  |  | foreach my $verb_syn (@verb_synonyms) { | 
| 463 |  |  |  |  |  |  | next if $verb_syn eq $verb_sense; | 
| 464 |  |  |  |  |  |  | $verb_syn =~ s/\#.+$// if $no_sense_indicators; | 
| 465 |  |  |  |  |  |  | $verb_synonyms{$verb_syn} = 1; | 
| 466 |  |  |  |  |  |  | $answer .= " $verb_syn "; | 
| 467 |  |  |  |  |  |  | } | 
| 468 |  |  |  |  |  |  | } | 
| 469 |  |  |  |  |  |  | } elsif ($pos =~ /a$/) { | 
| 470 |  |  |  |  |  |  | my @all_adj_syn_sense_labels = $wn->querySense( $pos, "syns"); | 
| 471 |  |  |  |  |  |  | my $how_many = @all_adj_syn_sense_labels; | 
| 472 |  |  |  |  |  |  | foreach my $adj_sense (@all_adj_syn_sense_labels) { | 
| 473 |  |  |  |  |  |  | my @adj_synonyms = $wn->querySense($adj_sense, "syns"); | 
| 474 |  |  |  |  |  |  | my $answer = ""; | 
| 475 |  |  |  |  |  |  | foreach my $adj_syn (@adj_synonyms) { | 
| 476 |  |  |  |  |  |  | next if $adj_syn eq $adj_sense; | 
| 477 |  |  |  |  |  |  | $adj_syn =~ s/\#.+$// if $no_sense_indicators; | 
| 478 |  |  |  |  |  |  | $adj_synonyms{$adj_syn} = 1; | 
| 479 |  |  |  |  |  |  | $answer .= " $adj_syn "; | 
| 480 |  |  |  |  |  |  | } | 
| 481 |  |  |  |  |  |  | } | 
| 482 |  |  |  |  |  |  | } elsif ($pos =~ /r$/) { | 
| 483 |  |  |  |  |  |  | my @all_adv_syn_sense_labels = $wn->querySense( $pos, "syns"); | 
| 484 |  |  |  |  |  |  | my $how_many = @all_adv_syn_sense_labels; | 
| 485 |  |  |  |  |  |  | foreach my $adv_sense (@all_adv_syn_sense_labels) { | 
| 486 |  |  |  |  |  |  | my @adv_synonyms = $wn->querySense($adv_sense, "syns"); | 
| 487 |  |  |  |  |  |  | my $answer = ""; | 
| 488 |  |  |  |  |  |  | foreach my $adv_syn (@adv_synonyms) { | 
| 489 |  |  |  |  |  |  | next if $adv_syn eq $adv_sense; | 
| 490 |  |  |  |  |  |  | $adv_syn =~ s/\#.+$// if $no_sense_indicators; | 
| 491 |  |  |  |  |  |  | $adv_synonyms{$adv_syn} = 1; | 
| 492 |  |  |  |  |  |  | $answer .= " $adv_syn "; | 
| 493 |  |  |  |  |  |  | } | 
| 494 |  |  |  |  |  |  | } | 
| 495 |  |  |  |  |  |  | } else { | 
| 496 |  |  |  |  |  |  | die "\nThe Part of Speech $pos not recognized\n\n"; | 
| 497 |  |  |  |  |  |  | } | 
| 498 |  |  |  |  |  |  | } | 
| 499 |  |  |  |  |  |  | my @all_synonyms; | 
| 500 |  |  |  |  |  |  | my @all_noun_synonyms = keys %noun_synonyms; | 
| 501 |  |  |  |  |  |  | my @all_verb_synonyms = keys %verb_synonyms; | 
| 502 |  |  |  |  |  |  | my @all_adj_synonyms =  keys %adj_synonyms; | 
| 503 |  |  |  |  |  |  | my @all_adv_synonyms =  keys %adv_synonyms; | 
| 504 |  |  |  |  |  |  | push @all_synonyms, @all_noun_synonyms if @all_noun_synonyms > 0; | 
| 505 |  |  |  |  |  |  | push @all_synonyms, @all_verb_synonyms if @all_verb_synonyms > 0; | 
| 506 |  |  |  |  |  |  | push @all_synonyms, @all_adj_synonyms  if @all_adj_synonyms > 0; | 
| 507 |  |  |  |  |  |  | push @all_synonyms, @all_adv_synonyms  if @all_adv_synonyms > 0; | 
| 508 |  |  |  |  |  |  | my %synonym_set; | 
| 509 |  |  |  |  |  |  | foreach my $synonym (@all_synonyms) { | 
| 510 |  |  |  |  |  |  | $synonym_set{$synonym} = 1; | 
| 511 |  |  |  |  |  |  | } | 
| 512 |  |  |  |  |  |  | my @synonym_set = sort keys %synonym_set; | 
| 513 |  |  |  |  |  |  | return \@synonym_set; | 
| 514 |  |  |  |  |  |  | } | 
| 515 |  |  |  |  |  |  |  | 
| 516 |  |  |  |  |  |  | sub _get_antonyms_for_word { | 
| 517 |  |  |  |  |  |  | my $self = shift; | 
| 518 |  |  |  |  |  |  | my $word = shift; | 
| 519 |  |  |  |  |  |  | my $no_sense_indicators = 1; | 
| 520 |  |  |  |  |  |  | my $wn = $self->{_wn}; | 
| 521 |  |  |  |  |  |  | my @parts_of_speech = $wn->querySense("$word"); | 
| 522 |  |  |  |  |  |  | my %noun_antonyms; | 
| 523 |  |  |  |  |  |  | my %verb_antonyms; | 
| 524 |  |  |  |  |  |  | my %adj_antonyms; | 
| 525 |  |  |  |  |  |  | my %adv_antonyms; | 
| 526 |  |  |  |  |  |  | foreach my $pos (@parts_of_speech) { | 
| 527 |  |  |  |  |  |  | if ($pos =~ /n$/) { | 
| 528 |  |  |  |  |  |  | my @all_noun_ant_sense_labels = $wn->queryWord( $pos, "ants"); | 
| 529 |  |  |  |  |  |  | my $how_many = @all_noun_ant_sense_labels; | 
| 530 |  |  |  |  |  |  | foreach my $noun_sense (@all_noun_ant_sense_labels) { | 
| 531 |  |  |  |  |  |  | my @noun_antonyms = $wn->queryWord($noun_sense, "ants"); | 
| 532 |  |  |  |  |  |  | my $answer = ""; | 
| 533 |  |  |  |  |  |  | foreach my $noun_ant (@noun_antonyms) { | 
| 534 |  |  |  |  |  |  | next if $noun_ant eq $noun_sense; | 
| 535 |  |  |  |  |  |  | $noun_ant =~ s/\#.+$// if $no_sense_indicators; | 
| 536 |  |  |  |  |  |  | $noun_antonyms{$noun_ant} = 1; | 
| 537 |  |  |  |  |  |  | $answer .= " $noun_ant "; | 
| 538 |  |  |  |  |  |  | } | 
| 539 |  |  |  |  |  |  | } | 
| 540 |  |  |  |  |  |  | } elsif ($pos =~ /v$/) { | 
| 541 |  |  |  |  |  |  | my @all_verb_ant_sense_labels = $wn->queryWord( $pos, "ants"); | 
| 542 |  |  |  |  |  |  | my $how_many = @all_verb_ant_sense_labels; | 
| 543 |  |  |  |  |  |  | foreach my $verb_sense (@all_verb_ant_sense_labels) { | 
| 544 |  |  |  |  |  |  | my @verb_antonyms = $wn->queryWord($verb_sense, "ants"); | 
| 545 |  |  |  |  |  |  | my $answer = ""; | 
| 546 |  |  |  |  |  |  | foreach my $verb_ant (@verb_antonyms) { | 
| 547 |  |  |  |  |  |  | next if $verb_ant eq $verb_sense; | 
| 548 |  |  |  |  |  |  | $verb_ant =~ s/\#.+$// if $no_sense_indicators; | 
| 549 |  |  |  |  |  |  | $verb_antonyms{$verb_ant} = 1; | 
| 550 |  |  |  |  |  |  | $answer .= " $verb_ant "; | 
| 551 |  |  |  |  |  |  | } | 
| 552 |  |  |  |  |  |  | } | 
| 553 |  |  |  |  |  |  | } elsif ($pos =~ /a$/) { | 
| 554 |  |  |  |  |  |  | my @all_adj_ant_sense_labels = $wn->queryWord( $pos, "ants"); | 
| 555 |  |  |  |  |  |  | my $how_many = @all_adj_ant_sense_labels; | 
| 556 |  |  |  |  |  |  | foreach my $adj_sense (@all_adj_ant_sense_labels) { | 
| 557 |  |  |  |  |  |  | my @adj_antonyms = $wn->queryWord($adj_sense, "ants"); | 
| 558 |  |  |  |  |  |  | my $answer = ""; | 
| 559 |  |  |  |  |  |  | foreach my $adj_ant (@adj_antonyms) { | 
| 560 |  |  |  |  |  |  | next if $adj_ant eq $adj_sense; | 
| 561 |  |  |  |  |  |  | $adj_ant =~ s/\#.+$// if $no_sense_indicators; | 
| 562 |  |  |  |  |  |  | $adj_antonyms{$adj_ant} = 1; | 
| 563 |  |  |  |  |  |  | $answer .= " $adj_ant "; | 
| 564 |  |  |  |  |  |  | } | 
| 565 |  |  |  |  |  |  | } | 
| 566 |  |  |  |  |  |  | } elsif ($pos =~ /r$/) { | 
| 567 |  |  |  |  |  |  | my @all_adv_ant_sense_labels = $wn->queryWord( $pos, "ants"); | 
| 568 |  |  |  |  |  |  | my $how_many = @all_adv_ant_sense_labels; | 
| 569 |  |  |  |  |  |  | foreach my $adv_sense (@all_adv_ant_sense_labels) { | 
| 570 |  |  |  |  |  |  | my @adv_antonyms = $wn->queryWord($adv_sense, "ants"); | 
| 571 |  |  |  |  |  |  | my $answer = ""; | 
| 572 |  |  |  |  |  |  | foreach my $adv_ant (@adv_antonyms) { | 
| 573 |  |  |  |  |  |  | next if $adv_ant eq $adv_sense; | 
| 574 |  |  |  |  |  |  | $adv_ant =~ s/\#.+$// if $no_sense_indicators; | 
| 575 |  |  |  |  |  |  | $adv_antonyms{$adv_ant} = 1; | 
| 576 |  |  |  |  |  |  | $answer .= " $adv_ant "; | 
| 577 |  |  |  |  |  |  | } | 
| 578 |  |  |  |  |  |  | } | 
| 579 |  |  |  |  |  |  | } else { | 
| 580 |  |  |  |  |  |  | die "\nThe Part of Speech $pos not recognized\n\n"; | 
| 581 |  |  |  |  |  |  | } | 
| 582 |  |  |  |  |  |  | } | 
| 583 |  |  |  |  |  |  | my @all_antonyms; | 
| 584 |  |  |  |  |  |  | my @all_noun_antonyms = keys %noun_antonyms; | 
| 585 |  |  |  |  |  |  | my @all_verb_antonyms = keys %verb_antonyms; | 
| 586 |  |  |  |  |  |  | my @all_adj_antonyms =  keys %adj_antonyms; | 
| 587 |  |  |  |  |  |  | my @all_adv_antonyms =  keys %adv_antonyms; | 
| 588 |  |  |  |  |  |  | push @all_antonyms, @all_noun_antonyms if @all_noun_antonyms > 0; | 
| 589 |  |  |  |  |  |  | push @all_antonyms, @all_verb_antonyms if @all_verb_antonyms > 0; | 
| 590 |  |  |  |  |  |  | push @all_antonyms, @all_adj_antonyms  if @all_adj_antonyms > 0; | 
| 591 |  |  |  |  |  |  | push @all_antonyms, @all_adv_antonyms  if @all_adv_antonyms > 0; | 
| 592 |  |  |  |  |  |  | my %antonym_set; | 
| 593 |  |  |  |  |  |  | foreach my $antonym (@all_antonyms) { | 
| 594 |  |  |  |  |  |  | $antonym_set{$antonym} = 1; | 
| 595 |  |  |  |  |  |  | } | 
| 596 |  |  |  |  |  |  | my @antonym_set = sort keys %antonym_set; | 
| 597 |  |  |  |  |  |  | return \@antonym_set; | 
| 598 |  |  |  |  |  |  | } | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | sub expand_all_tickets_with_synonyms { | 
| 601 |  |  |  |  |  |  | my $self = shift; | 
| 602 |  |  |  |  |  |  | return unless $self->{_add_synsets_to_tickets}; | 
| 603 |  |  |  |  |  |  | my $num_of_tickets = $self->{_total_num_tickets}; | 
| 604 |  |  |  |  |  |  | if ($self->{_want_synset_caching}) { | 
| 605 |  |  |  |  |  |  | eval { | 
| 606 |  |  |  |  |  |  | $self->{_synset_cache} = retrieve( $self->{_synset_cache_db} ); | 
| 607 |  |  |  |  |  |  | } if -s $self->{_synset_cache_db}; | 
| 608 |  |  |  |  |  |  | if ($@) { | 
| 609 |  |  |  |  |  |  | print "Something went wrong with restoration of synset cache: $@"; | 
| 610 |  |  |  |  |  |  | } | 
| 611 |  |  |  |  |  |  | } | 
| 612 |  |  |  |  |  |  | my $i = 1; | 
| 613 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) { | 
| 614 |  |  |  |  |  |  | $self->_expand_one_ticket_with_synonyms($ticket_id); | 
| 615 |  |  |  |  |  |  | print "Finished syn expansion of ticket $ticket_id ($i out of $num_of_tickets)\n"; | 
| 616 |  |  |  |  |  |  | $i++; | 
| 617 |  |  |  |  |  |  | } | 
| 618 |  |  |  |  |  |  | if ($self->{_want_synset_caching}) { | 
| 619 |  |  |  |  |  |  | $self->{_synset_cache_db} = "synset_cache.db" unless $self->{_synset_cache_db}; | 
| 620 |  |  |  |  |  |  | eval { | 
| 621 |  |  |  |  |  |  | store( $self->{_synset_cache}, $self->{_synset_cache_db} ); | 
| 622 |  |  |  |  |  |  | }; | 
| 623 |  |  |  |  |  |  | if ($@) { | 
| 624 |  |  |  |  |  |  | die "Something went wrong with disk storage of synset cache: $@"; | 
| 625 |  |  |  |  |  |  | } | 
| 626 |  |  |  |  |  |  | } | 
| 627 |  |  |  |  |  |  | } | 
| 628 |  |  |  |  |  |  |  | 
| 629 |  |  |  |  |  |  | sub _expand_one_ticket_with_synonyms { | 
| 630 |  |  |  |  |  |  | my $self = shift; | 
| 631 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 632 |  |  |  |  |  |  | print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n" | 
| 633 |  |  |  |  |  |  | if $self->{_debug2}; | 
| 634 |  |  |  |  |  |  | $self->_replace_negated_words_with_antonyms_one_ticket( $ticket_id ); | 
| 635 |  |  |  |  |  |  | $self->_add_to_words_their_synonyms_one_ticket( $ticket_id ); | 
| 636 |  |  |  |  |  |  | } | 
| 637 |  |  |  |  |  |  |  | 
| 638 |  |  |  |  |  |  | sub _replace_negated_words_with_antonyms_one_ticket { | 
| 639 |  |  |  |  |  |  | my $self = shift; | 
| 640 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 641 |  |  |  |  |  |  | my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; | 
| 642 |  |  |  |  |  |  | my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig; | 
| 643 |  |  |  |  |  |  | foreach my $word (@words_negated_with_not) { | 
| 644 |  |  |  |  |  |  | next unless (($word =~ /^\w+$/) && | 
| 645 |  |  |  |  |  |  | (length($word) > $self->{_min_word_length})); | 
| 646 |  |  |  |  |  |  | my @antonym_words = @{$self->_get_antonyms_for_word( $word )}; | 
| 647 |  |  |  |  |  |  | next unless @antonym_words > 0; | 
| 648 |  |  |  |  |  |  | $#antonym_words = $self->{_max_num_syn_words} - 1 | 
| 649 |  |  |  |  |  |  | if @antonym_words > $self->{_max_num_syn_words}; | 
| 650 |  |  |  |  |  |  | my $antonym_replacement_string = join ' ', @antonym_words; | 
| 651 |  |  |  |  |  |  | print "Antonym for $word is $antonym_replacement_string\n" | 
| 652 |  |  |  |  |  |  | if $self->{_debug2}; | 
| 653 |  |  |  |  |  |  | $record =~ s/not\s+$word/$antonym_replacement_string/g; | 
| 654 |  |  |  |  |  |  | } | 
| 655 |  |  |  |  |  |  | my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig; | 
| 656 |  |  |  |  |  |  | foreach my $word (@words_negated_with_no) { | 
| 657 |  |  |  |  |  |  | next unless (($word =~ /^\w+$/) && | 
| 658 |  |  |  |  |  |  | (length($word) > $self->{_min_word_length})); | 
| 659 |  |  |  |  |  |  | my @antonym_words = @{$self->_get_antonyms_for_word( $word )}; | 
| 660 |  |  |  |  |  |  | next unless @antonym_words > 0; | 
| 661 |  |  |  |  |  |  | $#antonym_words = $self->{_max_num_syn_words} - 1 | 
| 662 |  |  |  |  |  |  | if @antonym_words > $self->{_max_num_syn_words}; | 
| 663 |  |  |  |  |  |  | my $antonym_replacement_string = join ' ', @antonym_words; | 
| 664 |  |  |  |  |  |  | print "Antonym for $word is $antonym_replacement_string\n" | 
| 665 |  |  |  |  |  |  | if $self->{_debug2}; | 
| 666 |  |  |  |  |  |  | $record =~ s/no\s+$word/$antonym_replacement_string/g; | 
| 667 |  |  |  |  |  |  | } | 
| 668 |  |  |  |  |  |  | $self->{_processed_tkts_by_ids}->{$ticket_id} = $record; | 
| 669 |  |  |  |  |  |  | } | 
| 670 |  |  |  |  |  |  |  | 
| 671 |  |  |  |  |  |  | sub _add_to_words_their_synonyms_one_ticket { | 
| 672 |  |  |  |  |  |  | my $self = shift; | 
| 673 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 674 |  |  |  |  |  |  | my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; | 
| 675 |  |  |  |  |  |  | my @words = split /\s+/, $record; | 
| 676 |  |  |  |  |  |  | my @synonym_bag; | 
| 677 |  |  |  |  |  |  | foreach my $word (@words) { | 
| 678 |  |  |  |  |  |  | next if $word eq 'no'; | 
| 679 |  |  |  |  |  |  | next if $word eq 'not'; | 
| 680 |  |  |  |  |  |  | next unless $word =~ /^\w+$/ && | 
| 681 |  |  |  |  |  |  | length($word) > $self->{_min_word_length}; | 
| 682 |  |  |  |  |  |  | my @synonym_words; | 
| 683 |  |  |  |  |  |  | @synonym_words = @{$self->{_synset_cache}->{$word}} | 
| 684 |  |  |  |  |  |  | if exists $self->{_synset_cache}->{$word}; | 
| 685 |  |  |  |  |  |  | unless (exists $self->{_synset_cache}->{$word}) { | 
| 686 |  |  |  |  |  |  | @synonym_words = @{$self->_get_synonyms_for_word( $word )}; | 
| 687 |  |  |  |  |  |  | print "syn-set for $word  =>   @synonym_words\n\n" | 
| 688 |  |  |  |  |  |  | if $self->{_debug2}; | 
| 689 |  |  |  |  |  |  | my $word_root; | 
| 690 |  |  |  |  |  |  | if (@synonym_words == 0) { | 
| 691 |  |  |  |  |  |  | if ((length($word) > 4) && ($word =~ /(.+)s$/)) { | 
| 692 |  |  |  |  |  |  | $word_root = $1; | 
| 693 |  |  |  |  |  |  | @synonym_words = @{$self->_get_synonyms_for_word( $word_root )} | 
| 694 |  |  |  |  |  |  | if length($word_root) >= $self->{_min_word_length}; | 
| 695 |  |  |  |  |  |  | } elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) { | 
| 696 |  |  |  |  |  |  | $word_root = $1; | 
| 697 |  |  |  |  |  |  | @synonym_words = @{$self->_get_synonyms_for_word( $word_root )} | 
| 698 |  |  |  |  |  |  | if length($word_root) >= $self->{_min_word_length}; | 
| 699 |  |  |  |  |  |  | } | 
| 700 |  |  |  |  |  |  | } | 
| 701 |  |  |  |  |  |  | print "syn-set for word root $word_root  =>   @synonym_words\n\n" | 
| 702 |  |  |  |  |  |  | if ( $self->{_debug2} && defined $word_root ); | 
| 703 |  |  |  |  |  |  | _fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0; | 
| 704 |  |  |  |  |  |  | $#synonym_words = $self->{_max_num_syn_words} - 1 | 
| 705 |  |  |  |  |  |  | if @synonym_words > $self->{_max_num_syn_words}; | 
| 706 |  |  |  |  |  |  | print "Retained syn-set for $word  =>   @synonym_words\n\n" | 
| 707 |  |  |  |  |  |  | if $self->{_debug2}; | 
| 708 |  |  |  |  |  |  | $self->{_synset_cache}->{$word} = \@synonym_words; | 
| 709 |  |  |  |  |  |  | push @synonym_bag, @synonym_words; | 
| 710 |  |  |  |  |  |  | } | 
| 711 |  |  |  |  |  |  | } | 
| 712 |  |  |  |  |  |  | foreach my $syn_word (@synonym_bag) { | 
| 713 |  |  |  |  |  |  | push @words, lc($syn_word) | 
| 714 |  |  |  |  |  |  | unless ((exists $self->{_stop_words}->{$syn_word}) || | 
| 715 |  |  |  |  |  |  | (length($syn_word) <= $self->{_min_word_length})); | 
| 716 |  |  |  |  |  |  | } | 
| 717 |  |  |  |  |  |  | my @sorted_words = sort @words; | 
| 718 |  |  |  |  |  |  | my $new_record = join ' ', @sorted_words; | 
| 719 |  |  |  |  |  |  | $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record; | 
| 720 |  |  |  |  |  |  | } | 
| 721 |  |  |  |  |  |  |  | 
| 722 |  |  |  |  |  |  | sub store_processed_tickets_on_disk { | 
| 723 |  |  |  |  |  |  | my $self = shift; | 
| 724 |  |  |  |  |  |  | $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db}; | 
| 725 |  |  |  |  |  |  | unlink $self->{_processed_tickets_db}; | 
| 726 |  |  |  |  |  |  | eval { | 
| 727 |  |  |  |  |  |  | store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} ); | 
| 728 |  |  |  |  |  |  | }; | 
| 729 |  |  |  |  |  |  | if ($@) { | 
| 730 |  |  |  |  |  |  | die "Something went wrong with disk storage of processed tickets: $@"; | 
| 731 |  |  |  |  |  |  | } | 
| 732 |  |  |  |  |  |  | } | 
| 733 |  |  |  |  |  |  |  | 
| 734 |  |  |  |  |  |  | sub store_stemmed_tickets_and_inverted_index_on_disk { | 
| 735 |  |  |  |  |  |  | my $self = shift; | 
| 736 |  |  |  |  |  |  | $self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db}; | 
| 737 |  |  |  |  |  |  | unlink $self->{_stemmed_tickets_db}; | 
| 738 |  |  |  |  |  |  | eval { | 
| 739 |  |  |  |  |  |  | print "\n\nStoring stemmed tickets on disk\n\n"; | 
| 740 |  |  |  |  |  |  | store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} ); | 
| 741 |  |  |  |  |  |  | }; | 
| 742 |  |  |  |  |  |  | if ($@) { | 
| 743 |  |  |  |  |  |  | die "Something went wrong with disk storage of stemmed tickets: $@"; | 
| 744 |  |  |  |  |  |  | } | 
| 745 |  |  |  |  |  |  | $self->{_inverted_index_db} = "inverted_index.db" unless $self->{_inverted_index_db}; | 
| 746 |  |  |  |  |  |  | unlink $self->{_inverted_index_db}; | 
| 747 |  |  |  |  |  |  | eval { | 
| 748 |  |  |  |  |  |  | print "\nStoring inverted index on disk\n\n"; | 
| 749 |  |  |  |  |  |  | store( $self->{_inverted_index}, $self->{_inverted_index_db} ); | 
| 750 |  |  |  |  |  |  | }; | 
| 751 |  |  |  |  |  |  | if ($@) { | 
| 752 |  |  |  |  |  |  | die "Something went wrong with disk storage of the inverted index: $@"; | 
| 753 |  |  |  |  |  |  | } | 
| 754 |  |  |  |  |  |  | } | 
| 755 |  |  |  |  |  |  |  | 
| 756 |  |  |  |  |  |  | sub restore_processed_tickets_from_disk { | 
| 757 |  |  |  |  |  |  | my $self = shift; | 
| 758 |  |  |  |  |  |  | eval { | 
| 759 |  |  |  |  |  |  | $self->{_processed_tkts_by_ids} = retrieve( $self->{_processed_tickets_db} ); | 
| 760 |  |  |  |  |  |  | }; | 
| 761 |  |  |  |  |  |  | if ($@) { | 
| 762 |  |  |  |  |  |  | die "Something went wrong with restoration of processed tickets: $@"; | 
| 763 |  |  |  |  |  |  | } | 
| 764 |  |  |  |  |  |  | } | 
| 765 |  |  |  |  |  |  |  | 
| 766 |  |  |  |  |  |  | sub restore_stemmed_tickets_from_disk { | 
| 767 |  |  |  |  |  |  | my $self = shift; | 
| 768 |  |  |  |  |  |  | eval { | 
| 769 |  |  |  |  |  |  | $self->{_stemmed_tkts_by_ids} = retrieve( $self->{_stemmed_tickets_db} ); | 
| 770 |  |  |  |  |  |  | }; | 
| 771 |  |  |  |  |  |  | if ($@) { | 
| 772 |  |  |  |  |  |  | die "Something went wrong with restoration of stemmed tickets: $@"; | 
| 773 |  |  |  |  |  |  | } | 
| 774 |  |  |  |  |  |  | } | 
| 775 |  |  |  |  |  |  |  | 
| 776 |  |  |  |  |  |  | ####################  Get Ticket Vocabulary and Word Counts ################# | 
| 777 |  |  |  |  |  |  |  | 
| 778 |  |  |  |  |  |  | sub get_ticket_vocabulary_and_construct_inverted_index { | 
| 779 |  |  |  |  |  |  | my $self = shift; | 
| 780 |  |  |  |  |  |  | my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}}; | 
| 781 |  |  |  |  |  |  | $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db}; | 
| 782 |  |  |  |  |  |  | unlink glob "$self->{_tickets_vocab_db}.*"; | 
| 783 |  |  |  |  |  |  | my %vocab_hist_on_disk; | 
| 784 |  |  |  |  |  |  | tie %vocab_hist_on_disk, 'SDBM_File', | 
| 785 |  |  |  |  |  |  | $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640 | 
| 786 |  |  |  |  |  |  | or die "Can't create DBM files: $!"; | 
| 787 |  |  |  |  |  |  | my %inverted_index; | 
| 788 |  |  |  |  |  |  | my $min = $self->{_min_word_length}; | 
| 789 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) { | 
| 790 |  |  |  |  |  |  | my %uniques = (); | 
| 791 |  |  |  |  |  |  | my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; | 
| 792 |  |  |  |  |  |  | my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record; | 
| 793 |  |  |  |  |  |  | my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup; | 
| 794 |  |  |  |  |  |  | next unless @clean_words; | 
| 795 |  |  |  |  |  |  | @clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}), | 
| 796 |  |  |  |  |  |  | @clean_words; | 
| 797 |  |  |  |  |  |  | map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words; | 
| 798 |  |  |  |  |  |  | for (@clean_words) { $uniques{"\L$_"}++ }; | 
| 799 |  |  |  |  |  |  | map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques; | 
| 800 |  |  |  |  |  |  | map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id } | 
| 801 |  |  |  |  |  |  | keys %uniques; | 
| 802 |  |  |  |  |  |  | $self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words; | 
| 803 |  |  |  |  |  |  | } | 
| 804 |  |  |  |  |  |  | foreach (keys %vocab_hist_on_disk) { | 
| 805 |  |  |  |  |  |  | $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_}; | 
| 806 |  |  |  |  |  |  | } | 
| 807 |  |  |  |  |  |  | untie %vocab_hist_on_disk; | 
| 808 |  |  |  |  |  |  | $self->{_tkt_vocab_done} = 1; | 
| 809 |  |  |  |  |  |  | $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} ); | 
| 810 |  |  |  |  |  |  | print "\n\nVocabulary size:  $self->{_vocab_size}\n\n" | 
| 811 |  |  |  |  |  |  | if $self->{_debug2}; | 
| 812 |  |  |  |  |  |  | # Calculate idf(t): | 
| 813 |  |  |  |  |  |  | $self->{_idf_db} = "idf.db" unless $self->{_idf_db}; | 
| 814 |  |  |  |  |  |  | unlink glob "$self->{_idf_db}.*"; | 
| 815 |  |  |  |  |  |  | tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640 | 
| 816 |  |  |  |  |  |  | or die "Can't create DBM files: $!"; | 
| 817 |  |  |  |  |  |  | foreach (keys %{$self->{_vocab_idf_hist}}) { | 
| 818 |  |  |  |  |  |  | $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets | 
| 819 |  |  |  |  |  |  | / | 
| 820 |  |  |  |  |  |  | (1 + $self->{_vocab_idf_hist}->{$_}))) | 
| 821 |  |  |  |  |  |  | / log(10) ); | 
| 822 |  |  |  |  |  |  | } | 
| 823 |  |  |  |  |  |  | foreach (keys %idf_t_on_disk) { | 
| 824 |  |  |  |  |  |  | $self->{_idf_t}->{$_} = $idf_t_on_disk{$_}; | 
| 825 |  |  |  |  |  |  | } | 
| 826 |  |  |  |  |  |  | untie %idf_t_on_disk; | 
| 827 |  |  |  |  |  |  | } | 
| 828 |  |  |  |  |  |  |  | 
| 829 |  |  |  |  |  |  | sub display_tickets_vocab { | 
| 830 |  |  |  |  |  |  | my $self = shift; | 
| 831 |  |  |  |  |  |  | die "tickets vocabulary not yet constructed" | 
| 832 |  |  |  |  |  |  | unless keys %{$self->{_vocab_hist}}; | 
| 833 |  |  |  |  |  |  | print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n"; | 
| 834 |  |  |  |  |  |  | foreach (sort keys %{$self->{_vocab_hist}}){ | 
| 835 |  |  |  |  |  |  | my $outstring = sprintf("%30s     %d", $_,$self->{_vocab_hist}->{$_}); | 
| 836 |  |  |  |  |  |  | print "$outstring\n"; | 
| 837 |  |  |  |  |  |  | } | 
| 838 |  |  |  |  |  |  | my $vocab_size = scalar( keys %{$self->{_vocab_hist}} ); | 
| 839 |  |  |  |  |  |  | print "\nSize of the tickets vocabulary: $vocab_size\n\n"; | 
| 840 |  |  |  |  |  |  | } | 
| 841 |  |  |  |  |  |  |  | 
| 842 |  |  |  |  |  |  | sub display_inverse_document_frequencies { | 
| 843 |  |  |  |  |  |  | my $self = shift; | 
| 844 |  |  |  |  |  |  | die "tickets vocabulary not yet constructed" | 
| 845 |  |  |  |  |  |  | unless keys %{$self->{_vocab_idf_hist}}; | 
| 846 |  |  |  |  |  |  | print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n"; | 
| 847 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_vocab_idf_hist}} ) { | 
| 848 |  |  |  |  |  |  | my $outstring = sprintf("%30s     %d", | 
| 849 |  |  |  |  |  |  | $_, $self->{_vocab_idf_hist}->{$_}); | 
| 850 |  |  |  |  |  |  | print "$outstring\n"; | 
| 851 |  |  |  |  |  |  | } | 
| 852 |  |  |  |  |  |  | print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n"; | 
| 853 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_idf_t}} ) { | 
| 854 |  |  |  |  |  |  | my $outstring = sprintf("%30s     %f", $_,$self->{_idf_t}->{$_}); | 
| 855 |  |  |  |  |  |  | print "$outstring\n"; | 
| 856 |  |  |  |  |  |  | } | 
| 857 |  |  |  |  |  |  | } | 
| 858 |  |  |  |  |  |  |  | 
| 859 |  |  |  |  |  |  | # The following subroutine is useful for diagnostic purposes.  It | 
| 860 |  |  |  |  |  |  | # lists the number of tickets that a word appears in and also lists | 
| 861 |  |  |  |  |  |  | # the tickets.  But be careful in interpreting its results.  Note | 
| 862 |  |  |  |  |  |  | # if you invoke this subroutine after the synsets have been added | 
| 863 |  |  |  |  |  |  | # to the tickets, you may find words being attributed to tickets | 
| 864 |  |  |  |  |  |  | # that do not actually contain them in the original Excel sheet. | 
| 865 |  |  |  |  |  |  | sub list_processed_tickets_for_a_word { | 
| 866 |  |  |  |  |  |  | my $self = shift; | 
| 867 |  |  |  |  |  |  | while (my $word = ) {    #enter ctrl-D to exit the loop | 
| 868 |  |  |  |  |  |  | chomp $word; | 
| 869 |  |  |  |  |  |  | my @ticket_list; | 
| 870 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) { | 
| 871 |  |  |  |  |  |  | my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; | 
| 872 |  |  |  |  |  |  | push @ticket_list, $ticket_id if $record =~ /\b$word\b/i; | 
| 873 |  |  |  |  |  |  | } | 
| 874 |  |  |  |  |  |  | my $num = @ticket_list; | 
| 875 |  |  |  |  |  |  | print "\nThe number of processed tickets that mention the word `$word': $num\n\n"; | 
| 876 |  |  |  |  |  |  | print "The processed tickets: @ticket_list\n\n"; | 
| 877 |  |  |  |  |  |  | } | 
| 878 |  |  |  |  |  |  | } | 
| 879 |  |  |  |  |  |  |  | 
| 880 |  |  |  |  |  |  | sub list_stemmed_tickets_for_a_word { | 
| 881 |  |  |  |  |  |  | my $self = shift; | 
| 882 |  |  |  |  |  |  | while (my $word = ) {    #enter ctrl-D to exit the loop | 
| 883 |  |  |  |  |  |  | chomp $word; | 
| 884 |  |  |  |  |  |  | my @ticket_list; | 
| 885 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) { | 
| 886 |  |  |  |  |  |  | my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id}; | 
| 887 |  |  |  |  |  |  | push @ticket_list, $ticket_id if $record =~ /\b$word\b/i; | 
| 888 |  |  |  |  |  |  | } | 
| 889 |  |  |  |  |  |  | my $num = @ticket_list; | 
| 890 |  |  |  |  |  |  | print "\nThe number of stemmed tickets that mention the word `$word': $num\n\n"; | 
| 891 |  |  |  |  |  |  | print "The stemmed tickets: @ticket_list\n\n"; | 
| 892 |  |  |  |  |  |  | } | 
| 893 |  |  |  |  |  |  | } | 
| 894 |  |  |  |  |  |  |  | 
| 895 |  |  |  |  |  |  | ##############  Generate Document Vectors for Tickets  #################### | 
| 896 |  |  |  |  |  |  |  | 
| 897 |  |  |  |  |  |  | sub construct_doc_vectors_for_all_tickets { | 
| 898 |  |  |  |  |  |  | my $self = shift; | 
| 899 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_vocab_hist}} ) { | 
| 900 |  |  |  |  |  |  | $self->{_doc_vector_template}->{$_} = 0; | 
| 901 |  |  |  |  |  |  | } | 
| 902 |  |  |  |  |  |  | my $num_of_tickets = keys %{$self->{_stemmed_tkts_by_ids}}; | 
| 903 |  |  |  |  |  |  | my $i = 1; | 
| 904 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) { | 
| 905 |  |  |  |  |  |  | my $doc_vec_ref = | 
| 906 |  |  |  |  |  |  | $self->_construct_doc_vector_for_one_ticket($ticket_id); | 
| 907 |  |  |  |  |  |  | print "Finished constructing two doc vecs for ticket $ticket_id ($i out of $num_of_tickets)\n"; | 
| 908 |  |  |  |  |  |  | $i++; | 
| 909 |  |  |  |  |  |  | } | 
| 910 |  |  |  |  |  |  | } | 
| 911 |  |  |  |  |  |  |  | 
| 912 |  |  |  |  |  |  | sub _construct_doc_vector_for_one_ticket { | 
| 913 |  |  |  |  |  |  | my $self = shift; | 
| 914 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 915 |  |  |  |  |  |  | unless (keys %{$self->{_doc_vector_template}}) { | 
| 916 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_vocab_hist}} ) { | 
| 917 |  |  |  |  |  |  | $self->{_doc_vector_template}->{$_} = 0; | 
| 918 |  |  |  |  |  |  | } | 
| 919 |  |  |  |  |  |  | } | 
| 920 |  |  |  |  |  |  | my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})}; | 
| 921 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_doc_vector_template}} ) { | 
| 922 |  |  |  |  |  |  | $doc_vector{$_} = 0; | 
| 923 |  |  |  |  |  |  | } | 
| 924 |  |  |  |  |  |  | my $min = $self->{_min_word_length}; | 
| 925 |  |  |  |  |  |  | my $total_words_in_ticket = 0; | 
| 926 |  |  |  |  |  |  | my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id}; | 
| 927 |  |  |  |  |  |  | my @clean_words = split /\s+/, $record; | 
| 928 |  |  |  |  |  |  | map { $doc_vector{"\L$_"}++ } | 
| 929 |  |  |  |  |  |  | grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words; | 
| 930 |  |  |  |  |  |  | die "Something went wrong. Doc vector size unequal to vocab size" | 
| 931 |  |  |  |  |  |  | unless $self->{_vocab_size} == scalar(keys %doc_vector); | 
| 932 |  |  |  |  |  |  | foreach (keys %doc_vector) { | 
| 933 |  |  |  |  |  |  | $total_words_in_ticket += $doc_vector{$_}; | 
| 934 |  |  |  |  |  |  | } | 
| 935 |  |  |  |  |  |  | my %normalized_doc_vector; | 
| 936 |  |  |  |  |  |  | foreach (keys %doc_vector) { | 
| 937 |  |  |  |  |  |  | $normalized_doc_vector{$_} = $doc_vector{$_} | 
| 938 |  |  |  |  |  |  | * | 
| 939 |  |  |  |  |  |  | $self->{_idf_t}->{$_} | 
| 940 |  |  |  |  |  |  | / | 
| 941 |  |  |  |  |  |  | $total_words_in_ticket; | 
| 942 |  |  |  |  |  |  | } | 
| 943 |  |  |  |  |  |  | $self->{_tkt_doc_vecs}->{$ticket_id} = \%doc_vector; | 
| 944 |  |  |  |  |  |  | $self->{_tkt_doc_vecs_normed}->{$ticket_id} = \%normalized_doc_vector; | 
| 945 |  |  |  |  |  |  | } | 
| 946 |  |  |  |  |  |  |  | 
| 947 |  |  |  |  |  |  | sub store_ticket_vectors { | 
| 948 |  |  |  |  |  |  | my $self = shift; | 
| 949 |  |  |  |  |  |  | die "You have not yet created doc vectors for tickets" | 
| 950 |  |  |  |  |  |  | unless keys %{$self->{_tkt_doc_vecs}}; | 
| 951 |  |  |  |  |  |  | $self->{_tkt_doc_vecs_db} = "tkt_doc_vecs.db" unless $self->{_tkt_doc_vecs_db}; | 
| 952 |  |  |  |  |  |  | $self->{_tkt_doc_vecs_normed_db} = "tkt_doc_vecs_normed.db" | 
| 953 |  |  |  |  |  |  | unless $self->{_tkt_doc_vecs_normed_db}; | 
| 954 |  |  |  |  |  |  | unlink $self->{_tkt_doc_vecs_db}; | 
| 955 |  |  |  |  |  |  | unlink $self->{_tkt_doc_vecs_normed_db}; | 
| 956 |  |  |  |  |  |  | print "\nStoring the ticket doc vecs on disk. This could take a while.\n\n"; | 
| 957 |  |  |  |  |  |  | eval { | 
| 958 |  |  |  |  |  |  | store( $self->{_tkt_doc_vecs}, $self->{_tkt_doc_vecs_db} ); | 
| 959 |  |  |  |  |  |  | }; | 
| 960 |  |  |  |  |  |  | if ($@) { | 
| 961 |  |  |  |  |  |  | die "Something went wrong with disk storage of ticket doc vectors: $@"; | 
| 962 |  |  |  |  |  |  | } | 
| 963 |  |  |  |  |  |  | print "\nStoring normalized ticket doc vecs on disk. This could take a while.\n\n"; | 
| 964 |  |  |  |  |  |  | eval { | 
| 965 |  |  |  |  |  |  | store($self->{_tkt_doc_vecs_normed}, $self->{_tkt_doc_vecs_normed_db}); | 
| 966 |  |  |  |  |  |  | }; | 
| 967 |  |  |  |  |  |  | if ($@) { | 
| 968 |  |  |  |  |  |  | die "Something wrong with disk storage of normalized doc vecs: $@"; | 
| 969 |  |  |  |  |  |  | } | 
| 970 |  |  |  |  |  |  | } | 
| 971 |  |  |  |  |  |  |  | 
| 972 |  |  |  |  |  |  | sub restore_ticket_vectors_and_inverted_index { | 
| 973 |  |  |  |  |  |  | my $self = shift; | 
| 974 |  |  |  |  |  |  | $self->restore_raw_tickets_from_disk(); | 
| 975 |  |  |  |  |  |  | $self->restore_processed_tickets_from_disk(); | 
| 976 |  |  |  |  |  |  | $self->restore_stemmed_tickets_from_disk(); | 
| 977 |  |  |  |  |  |  | tie my %vocab_hist_on_disk, 'SDBM_File', $self->{_tickets_vocab_db}, O_RDONLY, 0640 | 
| 978 |  |  |  |  |  |  | or die "Can't connect with DBM file: $!"; | 
| 979 |  |  |  |  |  |  | foreach (keys %vocab_hist_on_disk) { | 
| 980 |  |  |  |  |  |  | $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_}; | 
| 981 |  |  |  |  |  |  | } | 
| 982 |  |  |  |  |  |  | untie %vocab_hist_on_disk; | 
| 983 |  |  |  |  |  |  | tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDONLY, 0640 | 
| 984 |  |  |  |  |  |  | or die "Can't connect with DBM file: $!"; | 
| 985 |  |  |  |  |  |  | foreach (keys %idf_t_on_disk) { | 
| 986 |  |  |  |  |  |  | $self->{_idf_t}->{$_} = $idf_t_on_disk{$_}; | 
| 987 |  |  |  |  |  |  | } | 
| 988 |  |  |  |  |  |  | untie %idf_t_on_disk; | 
| 989 |  |  |  |  |  |  | eval { | 
| 990 |  |  |  |  |  |  | $self->{_tkt_doc_vecs} = retrieve( $self->{_tkt_doc_vecs_db} ); | 
| 991 |  |  |  |  |  |  | }; | 
| 992 |  |  |  |  |  |  | if ($@) { | 
| 993 |  |  |  |  |  |  | print "Something went wrong with retrieval of ticket doc vectors: $@"; | 
| 994 |  |  |  |  |  |  | } | 
| 995 |  |  |  |  |  |  | eval { | 
| 996 |  |  |  |  |  |  | $self->{_tkt_doc_vecs_normed} = retrieve( $self->{_tkt_doc_vecs_normed_db} ); | 
| 997 |  |  |  |  |  |  | }; | 
| 998 |  |  |  |  |  |  | if ($@) { | 
| 999 |  |  |  |  |  |  | print "Something went wrong with retrieval of normed ticket doc vectors: $@"; | 
| 1000 |  |  |  |  |  |  | } | 
| 1001 |  |  |  |  |  |  | eval { | 
| 1002 |  |  |  |  |  |  | $self->{_inverted_index} = | 
| 1003 |  |  |  |  |  |  | retrieve( $self->{_inverted_index_db} ); | 
| 1004 |  |  |  |  |  |  | }; | 
| 1005 |  |  |  |  |  |  | if ($@) { | 
| 1006 |  |  |  |  |  |  | print "Something went wrong with retrieval of inverted_index: $@"; | 
| 1007 |  |  |  |  |  |  | } | 
| 1008 |  |  |  |  |  |  | } | 
| 1009 |  |  |  |  |  |  |  | 
| 1010 |  |  |  |  |  |  | sub display_all_doc_vectors { | 
| 1011 |  |  |  |  |  |  | my $self = shift; | 
| 1012 |  |  |  |  |  |  | die "Ticket doc vectors not yet constructed" | 
| 1013 |  |  |  |  |  |  | unless keys %{$self->{_tkt_doc_vecs}}; | 
| 1014 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs}}) { | 
| 1015 |  |  |  |  |  |  | $self->_display_a_single_ticket_doc_vector($ticket_id); | 
| 1016 |  |  |  |  |  |  | } | 
| 1017 |  |  |  |  |  |  | } | 
| 1018 |  |  |  |  |  |  |  | 
| 1019 |  |  |  |  |  |  | sub _display_a_single_ticket_doc_vector { | 
| 1020 |  |  |  |  |  |  | my $self = shift; | 
| 1021 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 1022 |  |  |  |  |  |  | die "Ticket doc vectors not yet constructed" | 
| 1023 |  |  |  |  |  |  | unless keys %{$self->{_tkt_doc_vecs}}; | 
| 1024 |  |  |  |  |  |  | print "\n\nDISPLAYING Doc Vec FOR TICKET $ticket_id:\n\n"; | 
| 1025 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_tkt_doc_vecs}->{$ticket_id}} ) { | 
| 1026 |  |  |  |  |  |  | print "$_  =>   $self->{_tkt_doc_vecs}->{$ticket_id}->{$_}\n"; | 
| 1027 |  |  |  |  |  |  | } | 
| 1028 |  |  |  |  |  |  | my $docvec_size = keys %{$self->{_tkt_doc_vecs}->{$ticket_id}}; | 
| 1029 |  |  |  |  |  |  | print "\nSize of vector for ticket $ticket_id: $docvec_size\n"; | 
| 1030 |  |  |  |  |  |  | } | 
| 1031 |  |  |  |  |  |  |  | 
| 1032 |  |  |  |  |  |  | sub display_all_normalized_doc_vectors { | 
| 1033 |  |  |  |  |  |  | my $self = shift; | 
| 1034 |  |  |  |  |  |  | die "normalized document vectors not yet constructed" | 
| 1035 |  |  |  |  |  |  | unless keys %{$self->{_tkt_doc_vecs_normed}}; | 
| 1036 |  |  |  |  |  |  | foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs_normed}}) { | 
| 1037 |  |  |  |  |  |  | $self->_display_a_single_normalized_doc_vector($ticket_id); | 
| 1038 |  |  |  |  |  |  | } | 
| 1039 |  |  |  |  |  |  | } | 
| 1040 |  |  |  |  |  |  |  | 
| 1041 |  |  |  |  |  |  | sub _display_a_single_normalized_doc_vector { | 
| 1042 |  |  |  |  |  |  | my $self = shift; | 
| 1043 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 1044 |  |  |  |  |  |  | die "Normalized ticket doc vectors not yet constructed" | 
| 1045 |  |  |  |  |  |  | unless keys %{$self->{_tkt_doc_vecs_normed}}; | 
| 1046 |  |  |  |  |  |  | print "\n\nDISPLAYING Normalized Doc Vec FOR TICKET $ticket_id:\n\n"; | 
| 1047 |  |  |  |  |  |  | foreach ( sort keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}} ) { | 
| 1048 |  |  |  |  |  |  | print "$_  =>   $self->{_tkt_doc_vecs_normed}->{$ticket_id}->{$_}\n"; | 
| 1049 |  |  |  |  |  |  | } | 
| 1050 |  |  |  |  |  |  | my $docvec_size = keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}}; | 
| 1051 |  |  |  |  |  |  | print "\nSize of normalized vector for ticket $ticket_id: $docvec_size\n"; | 
| 1052 |  |  |  |  |  |  | } | 
| 1053 |  |  |  |  |  |  |  | 
| 1054 |  |  |  |  |  |  | ##########################  Display Inverted Index  ########################### | 
| 1055 |  |  |  |  |  |  |  | 
| 1056 |  |  |  |  |  |  | sub display_inverted_index { | 
| 1057 |  |  |  |  |  |  | my $self = shift; | 
| 1058 |  |  |  |  |  |  | print "\n\nDisplaying inverted index:\n\n"; | 
| 1059 |  |  |  |  |  |  | foreach my $word (sort keys %{$self->{_vocab_hist}}) { | 
| 1060 |  |  |  |  |  |  | $self->display_inverted_index_for_given_word($word); | 
| 1061 |  |  |  |  |  |  | } | 
| 1062 |  |  |  |  |  |  | } | 
| 1063 |  |  |  |  |  |  |  | 
| 1064 |  |  |  |  |  |  | sub display_inverted_index_for_given_word { | 
| 1065 |  |  |  |  |  |  | my $self = shift; | 
| 1066 |  |  |  |  |  |  | my $word = shift; | 
| 1067 |  |  |  |  |  |  | defined $self->{_inverted_index}->{$word} ? | 
| 1068 |  |  |  |  |  |  | print "$word =>  @{$self->{_inverted_index}->{$word}}\n"  : | 
| 1069 |  |  |  |  |  |  | die "Something is wrong with your inverted index\n"; | 
| 1070 |  |  |  |  |  |  | } | 
| 1071 |  |  |  |  |  |  |  | 
| 1072 |  |  |  |  |  |  | sub display_inverted_index_for_given_query { | 
| 1073 |  |  |  |  |  |  | my $self = shift; | 
| 1074 |  |  |  |  |  |  | my $query_ticket_id = shift; | 
| 1075 |  |  |  |  |  |  | my $query_record = $self->{_stemmed_tkts_by_ids}->{$query_ticket_id}; | 
| 1076 |  |  |  |  |  |  | my @query_words = grep $_, split /\s+/, $query_record; | 
| 1077 |  |  |  |  |  |  | foreach my $qword (@query_words) { | 
| 1078 |  |  |  |  |  |  | my $idf_t = $self->{_idf_t}->{$qword}; | 
| 1079 |  |  |  |  |  |  | my @relevant_tickets = @{$self->{_inverted_index}->{$qword}}; | 
| 1080 |  |  |  |  |  |  | print "\n$qword ($idf_t)  ===>  @relevant_tickets\n\n"; | 
| 1081 |  |  |  |  |  |  | } | 
| 1082 |  |  |  |  |  |  | } | 
| 1083 |  |  |  |  |  |  |  | 
| 1084 |  |  |  |  |  |  | #############  Retrieve Most Similar Tickets with VSM Model  ################### | 
| 1085 |  |  |  |  |  |  |  | 
| 1086 |  |  |  |  |  |  | sub retrieve_similar_tickets_with_vsm { | 
| 1087 |  |  |  |  |  |  | my $self = shift; | 
| 1088 |  |  |  |  |  |  | $self->{_query_ticket_id} = shift; | 
| 1089 |  |  |  |  |  |  | die "\nFirst generate normalized doc vectors for tickets before you can call retrieve with vsm function()\n" | 
| 1090 |  |  |  |  |  |  | unless scalar(keys %{$self->{_vocab_hist}}) | 
| 1091 |  |  |  |  |  |  | && scalar(keys %{$self->{_tkt_doc_vecs_normed}}); | 
| 1092 |  |  |  |  |  |  | print "\nCalculating the similarity set for query ticket $self->{_query_ticket_id}\n\n"; | 
| 1093 |  |  |  |  |  |  | my $query_record = $self->{_stemmed_tkts_by_ids}->{$self->{_query_ticket_id}}; | 
| 1094 |  |  |  |  |  |  | my @query_words = grep $_, split /\s+/, $query_record; | 
| 1095 |  |  |  |  |  |  | my %relevant_tickets_set; | 
| 1096 |  |  |  |  |  |  | die "\n\nYou did not set a value for the constructor parameter min_idf_threshold -- " | 
| 1097 |  |  |  |  |  |  | unless $self->{_min_idf_threshold}; | 
| 1098 |  |  |  |  |  |  | foreach my $qword (@query_words) { | 
| 1099 |  |  |  |  |  |  | map {$relevant_tickets_set{$_} = 1} @{$self->{_inverted_index}->{$qword}} | 
| 1100 |  |  |  |  |  |  | if $self->{_idf_t}->{$qword} > $self->{_min_idf_threshold}; | 
| 1101 |  |  |  |  |  |  | } | 
| 1102 |  |  |  |  |  |  | my @relevant_tickets = sort {$a <=> $b} keys %relevant_tickets_set; | 
| 1103 |  |  |  |  |  |  | print "The relevant tickets for query: @relevant_tickets" | 
| 1104 |  |  |  |  |  |  | if $self->{_debug3}; | 
| 1105 |  |  |  |  |  |  | my $num_relevant_tkts = @relevant_tickets; | 
| 1106 |  |  |  |  |  |  | print "\nThe number of tickets relevant to the query: $num_relevant_tkts\n\n"; | 
| 1107 |  |  |  |  |  |  | my %retrievals; | 
| 1108 |  |  |  |  |  |  | my $rank = 0; | 
| 1109 |  |  |  |  |  |  | foreach (sort {$self->_doc_vec_comparator} @relevant_tickets ) { | 
| 1110 |  |  |  |  |  |  | $retrievals{$_} = $self->_similarity_to_query_ticket($_); | 
| 1111 |  |  |  |  |  |  | $rank++; | 
| 1112 |  |  |  |  |  |  | last if $rank == $self->{_how_many_retrievals}; | 
| 1113 |  |  |  |  |  |  | } | 
| 1114 |  |  |  |  |  |  | if ($self->{_debug3}) { | 
| 1115 |  |  |  |  |  |  | print "\n\nShowing the VSM retrievals and the similarity scores:\n\n"; | 
| 1116 |  |  |  |  |  |  | foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) { | 
| 1117 |  |  |  |  |  |  | print "$_   =>   $retrievals{$_}\n"; | 
| 1118 |  |  |  |  |  |  | } | 
| 1119 |  |  |  |  |  |  | } | 
| 1120 |  |  |  |  |  |  | return \%retrievals; | 
| 1121 |  |  |  |  |  |  | } | 
| 1122 |  |  |  |  |  |  |  | 
| 1123 |  |  |  |  |  |  | sub _doc_vec_comparator { | 
| 1124 |  |  |  |  |  |  | my $self = shift; | 
| 1125 |  |  |  |  |  |  | my %query_ticket_data_normed = | 
| 1126 |  |  |  |  |  |  | %{$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}}; | 
| 1127 |  |  |  |  |  |  | my $vec1_hash_ref = $self->{_tkt_doc_vecs_normed}->{$a}; | 
| 1128 |  |  |  |  |  |  | my $vec2_hash_ref = $self->{_tkt_doc_vecs_normed}->{$b}; | 
| 1129 |  |  |  |  |  |  | my @vec1 = (); | 
| 1130 |  |  |  |  |  |  | my @vec2 = (); | 
| 1131 |  |  |  |  |  |  | my @qvec = (); | 
| 1132 |  |  |  |  |  |  | foreach my $word (sort keys %{$self->{_vocab_hist}}) { | 
| 1133 |  |  |  |  |  |  | push @vec1, $vec1_hash_ref->{$word}; | 
| 1134 |  |  |  |  |  |  | push @vec2, $vec2_hash_ref->{$word}; | 
| 1135 |  |  |  |  |  |  | push @qvec, $query_ticket_data_normed{$word}; | 
| 1136 |  |  |  |  |  |  | } | 
| 1137 |  |  |  |  |  |  | my $vec1_mag = _vec_magnitude(\@vec1); | 
| 1138 |  |  |  |  |  |  | my $vec2_mag = _vec_magnitude(\@vec2); | 
| 1139 |  |  |  |  |  |  | my $qvec_mag = _vec_magnitude(\@qvec); | 
| 1140 |  |  |  |  |  |  | my $product1 = _vec_scalar_product(\@vec1, \@qvec); | 
| 1141 |  |  |  |  |  |  | $product1 /= $vec1_mag * $qvec_mag; | 
| 1142 |  |  |  |  |  |  | my $product2 = _vec_scalar_product(\@vec2, \@qvec); | 
| 1143 |  |  |  |  |  |  | $product2 /= $vec2_mag * $qvec_mag; | 
| 1144 |  |  |  |  |  |  | return 1 if $product1 < $product2; | 
| 1145 |  |  |  |  |  |  | return 0  if $product1 == $product2; | 
| 1146 |  |  |  |  |  |  | return -1  if $product1 > $product2; | 
| 1147 |  |  |  |  |  |  | } | 
| 1148 |  |  |  |  |  |  |  | 
| 1149 |  |  |  |  |  |  | sub _similarity_to_query_ticket { | 
| 1150 |  |  |  |  |  |  | my $self = shift; | 
| 1151 |  |  |  |  |  |  | my $ticket_id = shift; | 
| 1152 |  |  |  |  |  |  | my $ticket_data_normed = $self->{_tkt_doc_vecs_normed}->{$ticket_id}; | 
| 1153 |  |  |  |  |  |  | my @vec = (); | 
| 1154 |  |  |  |  |  |  | my @qvec = (); | 
| 1155 |  |  |  |  |  |  | foreach my $word (sort keys %$ticket_data_normed) { | 
| 1156 |  |  |  |  |  |  | push @vec, $ticket_data_normed->{$word}; | 
| 1157 |  |  |  |  |  |  | push @qvec, | 
| 1158 |  |  |  |  |  |  | $self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}->{$word}; | 
| 1159 |  |  |  |  |  |  | } | 
| 1160 |  |  |  |  |  |  | my $vec_mag = _vec_magnitude(\@vec); | 
| 1161 |  |  |  |  |  |  | my $qvec_mag = _vec_magnitude(\@qvec); | 
| 1162 |  |  |  |  |  |  | die "\nThe query ticket appears to be empty\n" if $qvec_mag == 0; | 
| 1163 |  |  |  |  |  |  | my $product = _vec_scalar_product(\@vec, \@qvec); | 
| 1164 |  |  |  |  |  |  | $product /= $vec_mag * $qvec_mag; | 
| 1165 |  |  |  |  |  |  | return $product; | 
| 1166 |  |  |  |  |  |  | } | 
| 1167 |  |  |  |  |  |  |  | 
| 1168 |  |  |  |  |  |  |  | 
| 1169 |  |  |  |  |  |  | ########################  Utility Subroutines  ########################## | 
| 1170 |  |  |  |  |  |  |  | 
| 1171 |  |  |  |  |  |  | sub _simple_stemmer { | 
| 1172 |  |  |  |  |  |  | my $word = shift; | 
| 1173 |  |  |  |  |  |  | my $debug = shift; | 
| 1174 |  |  |  |  |  |  | print "\nStemming the word:        $word\n" if $debug; | 
| 1175 |  |  |  |  |  |  | $word =~ s/(.*[a-z]t)ted$/$1/i; | 
| 1176 |  |  |  |  |  |  | $word =~ s/(.*[a-z]t)ting$/$1/i; | 
| 1177 |  |  |  |  |  |  | $word =~ s/(.*[a-z]l)ling$/$1/i; | 
| 1178 |  |  |  |  |  |  | $word =~ s/(.*[a-z]g)ging$/$1/i; | 
| 1179 |  |  |  |  |  |  | $word =~ s/(.*[a-z]ll)ed$/$1/i; | 
| 1180 |  |  |  |  |  |  | $word =~ s/(.*[a-z][^aeious])s$/$1/i; | 
| 1181 |  |  |  |  |  |  | $word =~ s/(.*[a-z])ies$/$1y/i; | 
| 1182 |  |  |  |  |  |  | $word =~ s/(.*[a-z]s)es$/$1/i; | 
| 1183 |  |  |  |  |  |  | $word =~ s/(.*[a-z][ck])es$/$1e/i; | 
| 1184 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)tions$/$1tion/i; | 
| 1185 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)mming$/$1m/i; | 
| 1186 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+[^rl])ing$/$1/i; | 
| 1187 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+o[sn])ing$/$1e/i; | 
| 1188 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)tices$/$1tex/i; | 
| 1189 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)pes$/$1pe/i; | 
| 1190 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)sed$/$1se/i; | 
| 1191 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)ed$/$1/i; | 
| 1192 |  |  |  |  |  |  | $word =~ s/(.*[a-z]+)tation$/$1t/i; | 
| 1193 |  |  |  |  |  |  | print "Stemmed word:                           $word\n\n" if $debug; | 
| 1194 |  |  |  |  |  |  | return $word; | 
| 1195 |  |  |  |  |  |  | } | 
| 1196 |  |  |  |  |  |  |  | 
| 1197 |  |  |  |  |  |  | sub _exists { | 
| 1198 |  |  |  |  |  |  | my $element = shift; | 
| 1199 |  |  |  |  |  |  | my $array   = shift; | 
| 1200 |  |  |  |  |  |  | my %hash; | 
| 1201 |  |  |  |  |  |  | for my $item (@$array) { | 
| 1202 |  |  |  |  |  |  | $hash{$item} = 1; | 
| 1203 |  |  |  |  |  |  | } | 
| 1204 |  |  |  |  |  |  | return exists $hash{$element}; | 
| 1205 |  |  |  |  |  |  | } | 
| 1206 |  |  |  |  |  |  |  | 
| 1207 |  |  |  |  |  |  | sub _fetch_words_from_file { | 
| 1208 |  |  |  |  |  |  | my $file = shift; | 
| 1209 |  |  |  |  |  |  | my @words; | 
| 1210 |  |  |  |  |  |  | open( IN, "$file" ) or die "unable to open the file $file: $!"; | 
| 1211 |  |  |  |  |  |  | while () { | 
| 1212 |  |  |  |  |  |  | next if /^#/; | 
| 1213 |  |  |  |  |  |  | next if /^[ ]*\r?\n?$/; | 
| 1214 |  |  |  |  |  |  | $_ =~ s/\r?\n?$//; | 
| 1215 |  |  |  |  |  |  | my @how_many_in_line = grep $_, split /\s+/, $_; | 
| 1216 |  |  |  |  |  |  | die "File $file: Exactly one word allowed in each line  -- " | 
| 1217 |  |  |  |  |  |  | unless @how_many_in_line == 1; | 
| 1218 |  |  |  |  |  |  | push @words, $_; | 
| 1219 |  |  |  |  |  |  | } | 
| 1220 |  |  |  |  |  |  | close IN; | 
| 1221 |  |  |  |  |  |  | return \@words; | 
| 1222 |  |  |  |  |  |  | } | 
| 1223 |  |  |  |  |  |  |  | 
| 1224 |  |  |  |  |  |  | sub _fetch_word_pairs_from_file { | 
| 1225 |  |  |  |  |  |  | my $file = shift; | 
| 1226 |  |  |  |  |  |  | my @word_pairs; | 
| 1227 |  |  |  |  |  |  | open( IN, "$file" ) or die "unable to open the file $file: $!"; | 
| 1228 |  |  |  |  |  |  | while () { | 
| 1229 |  |  |  |  |  |  | next if /^#/; | 
| 1230 |  |  |  |  |  |  | next if /^[ ]*$/; | 
| 1231 |  |  |  |  |  |  | chomp; | 
| 1232 |  |  |  |  |  |  | my @how_many_in_line = grep $_, split /\s+/, $_; | 
| 1233 |  |  |  |  |  |  | die "File: $file --- Exactly two words must be in each non-comment or not-empty line -- " | 
| 1234 |  |  |  |  |  |  | unless @how_many_in_line == 2; | 
| 1235 |  |  |  |  |  |  | push @word_pairs, $_; | 
| 1236 |  |  |  |  |  |  | } | 
| 1237 |  |  |  |  |  |  | close IN; | 
| 1238 |  |  |  |  |  |  | return \@word_pairs; | 
| 1239 |  |  |  |  |  |  | } | 
| 1240 |  |  |  |  |  |  |  | 
| 1241 |  |  |  |  |  |  | sub _get_rid_of_wide_chars { | 
| 1242 |  |  |  |  |  |  | my $string = shift; | 
| 1243 |  |  |  |  |  |  | $string =~ s/[^[:ascii:]]+//g; | 
| 1244 |  |  |  |  |  |  | #    $string =~ s/\x{FEFF}//g;           to get rid of wide characters | 
| 1245 |  |  |  |  |  |  | return $string; | 
| 1246 |  |  |  |  |  |  | } | 
| 1247 |  |  |  |  |  |  |  | 
| 1248 |  |  |  |  |  |  | sub _find_index_for_given_element { | 
| 1249 |  |  |  |  |  |  | my $ele = shift; | 
| 1250 |  |  |  |  |  |  | my $array_ref = shift; | 
| 1251 |  |  |  |  |  |  | foreach my $i (0..@{$array_ref}-1) { | 
| 1252 |  |  |  |  |  |  | return $i if $ele == $array_ref->[$i]; | 
| 1253 |  |  |  |  |  |  | } | 
| 1254 |  |  |  |  |  |  | } | 
| 1255 |  |  |  |  |  |  |  | 
| 1256 |  |  |  |  |  |  | sub _check_for_illegal_params { | 
| 1257 |  |  |  |  |  |  | my @params = @_; | 
| 1258 |  |  |  |  |  |  | my @legal_params = qw / excel_filename | 
| 1259 |  |  |  |  |  |  | which_worksheet | 
| 1260 |  |  |  |  |  |  | raw_tickets_db | 
| 1261 |  |  |  |  |  |  | processed_tickets_db | 
| 1262 |  |  |  |  |  |  | stemmed_tickets_db | 
| 1263 |  |  |  |  |  |  | inverted_index_db | 
| 1264 |  |  |  |  |  |  | tickets_vocab_db | 
| 1265 |  |  |  |  |  |  | idf_db | 
| 1266 |  |  |  |  |  |  | tkt_doc_vecs_db | 
| 1267 |  |  |  |  |  |  | tkt_doc_vecs_normed_db | 
| 1268 |  |  |  |  |  |  | synset_cache_db | 
| 1269 |  |  |  |  |  |  | want_synset_caching | 
| 1270 |  |  |  |  |  |  | add_synsets_to_tickets | 
| 1271 |  |  |  |  |  |  | clustering_fieldname | 
| 1272 |  |  |  |  |  |  | min_word_length | 
| 1273 |  |  |  |  |  |  | min_idf_threshold | 
| 1274 |  |  |  |  |  |  | max_num_syn_words | 
| 1275 |  |  |  |  |  |  | stop_words_file | 
| 1276 |  |  |  |  |  |  | misspelled_words_file | 
| 1277 |  |  |  |  |  |  | unique_id_fieldname | 
| 1278 |  |  |  |  |  |  | want_stemming | 
| 1279 |  |  |  |  |  |  | how_many_retrievals | 
| 1280 |  |  |  |  |  |  | debug1 | 
| 1281 |  |  |  |  |  |  | debug2 | 
| 1282 |  |  |  |  |  |  | debug3 | 
| 1283 |  |  |  |  |  |  | /; | 
| 1284 |  |  |  |  |  |  | my $found_match_flag; | 
| 1285 |  |  |  |  |  |  | foreach my $param (@params) { | 
| 1286 |  |  |  |  |  |  |  | 
| 1287 |  |  |  |  |  |  | foreach my $legal (@legal_params) { | 
| 1288 |  |  |  |  |  |  | $found_match_flag = 0; | 
| 1289 |  |  |  |  |  |  | if ($param eq $legal) { | 
| 1290 |  |  |  |  |  |  | $found_match_flag = 1; | 
| 1291 |  |  |  |  |  |  | last; | 
| 1292 |  |  |  |  |  |  | } | 
| 1293 |  |  |  |  |  |  | } | 
| 1294 |  |  |  |  |  |  | last if $found_match_flag == 0; | 
| 1295 |  |  |  |  |  |  | } | 
| 1296 |  |  |  |  |  |  | return $found_match_flag; | 
| 1297 |  |  |  |  |  |  | } | 
| 1298 |  |  |  |  |  |  |  | 
| 1299 |  |  |  |  |  |  | # Meant only for an un-nested hash: | 
| 1300 |  |  |  |  |  |  | sub _deep_copy_hash { | 
| 1301 |  |  |  |  |  |  | my $ref_in = shift; | 
| 1302 |  |  |  |  |  |  | my $ref_out = {}; | 
| 1303 |  |  |  |  |  |  | foreach ( keys %{$ref_in} ) { | 
| 1304 |  |  |  |  |  |  | $ref_out->{$_} = $ref_in->{$_}; | 
| 1305 |  |  |  |  |  |  | } | 
| 1306 |  |  |  |  |  |  | return $ref_out; | 
| 1307 |  |  |  |  |  |  | } | 
| 1308 |  |  |  |  |  |  |  | 
| 1309 |  |  |  |  |  |  | # from perl docs: | 
| 1310 |  |  |  |  |  |  | sub _fisher_yates_shuffle { | 
| 1311 |  |  |  |  |  |  | my $arr =  shift; | 
| 1312 |  |  |  |  |  |  | my $i = @$arr; | 
| 1313 |  |  |  |  |  |  | while (--$i) { | 
| 1314 |  |  |  |  |  |  | my $j = int rand( $i + 1 ); | 
| 1315 |  |  |  |  |  |  | @$arr[$i, $j] = @$arr[$j, $i]; | 
| 1316 |  |  |  |  |  |  | } | 
| 1317 |  |  |  |  |  |  | } | 
| 1318 |  |  |  |  |  |  |  | 
| 1319 |  |  |  |  |  |  | sub _vec_scalar_product { | 
| 1320 |  |  |  |  |  |  | my $vec1 = shift; | 
| 1321 |  |  |  |  |  |  | my $vec2 = shift; | 
| 1322 |  |  |  |  |  |  | die "Something is wrong --- the two vectors are of unequal length" | 
| 1323 |  |  |  |  |  |  | unless @$vec1 == @$vec2; | 
| 1324 |  |  |  |  |  |  | my $product; | 
| 1325 |  |  |  |  |  |  | for my $i (0..@$vec1-1) { | 
| 1326 |  |  |  |  |  |  | $product += $vec1->[$i] * $vec2->[$i]; | 
| 1327 |  |  |  |  |  |  | } | 
| 1328 |  |  |  |  |  |  | return $product; | 
| 1329 |  |  |  |  |  |  | } | 
| 1330 |  |  |  |  |  |  |  | 
| 1331 |  |  |  |  |  |  | sub _vec_magnitude { | 
| 1332 |  |  |  |  |  |  | my $vec = shift; | 
| 1333 |  |  |  |  |  |  | my $mag_squared = 0; | 
| 1334 |  |  |  |  |  |  | foreach my $num (@$vec) { | 
| 1335 |  |  |  |  |  |  | $mag_squared += $num ** 2; | 
| 1336 |  |  |  |  |  |  | } | 
| 1337 |  |  |  |  |  |  | return sqrt $mag_squared; | 
| 1338 |  |  |  |  |  |  | } | 
| 1339 |  |  |  |  |  |  |  | 
| 1340 |  |  |  |  |  |  | 1; | 
| 1341 |  |  |  |  |  |  |  | 
| 1342 |  |  |  |  |  |  | __END__ |