line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Algorithm::TicketClusterer; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
4
|
|
|
|
|
|
|
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is |
5
|
|
|
|
|
|
|
# free software. You may modify and/or distribute it under the same terms |
6
|
|
|
|
|
|
|
# as Perl itself. This copyright notice must remain attached to the file. |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Algorithm::TicketClusterer is a Perl module for retrieving Excel-stored |
9
|
|
|
|
|
|
|
# past tickets that are most similar to a new ticket. Tickets are commonly |
10
|
|
|
|
|
|
|
# used in software services industry and customer support businesses to |
11
|
|
|
|
|
|
|
# record requests for service, product complaints, user feedback, and so |
12
|
|
|
|
|
|
|
# on. |
13
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
14
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
12990
|
use 5.10.0; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
37
|
|
16
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
17
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
10
|
|
|
1
|
|
|
|
|
28
|
|
18
|
1
|
|
|
1
|
|
3
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
75
|
|
19
|
1
|
|
|
1
|
|
612
|
use Storable; |
|
1
|
|
|
|
|
2749
|
|
|
1
|
|
|
|
|
69
|
|
20
|
1
|
|
|
1
|
|
837
|
use Spreadsheet::ParseExcel; |
|
1
|
|
|
|
|
52964
|
|
|
1
|
|
|
|
|
33
|
|
21
|
1
|
|
|
1
|
|
536
|
use Spreadsheet::XLSX; |
|
1
|
|
|
|
|
64852
|
|
|
1
|
|
|
|
|
33
|
|
22
|
1
|
|
|
1
|
|
1084
|
use WordNet::QueryData; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Text::Iconv; |
24
|
|
|
|
|
|
|
use SDBM_File; |
25
|
|
|
|
|
|
|
use Fcntl; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
our $VERSION = '1.01'; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
############################### The Constructor ############################# |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub new { |
32
|
|
|
|
|
|
|
my ($class, %args) = @_; |
33
|
|
|
|
|
|
|
my @params = keys %args; |
34
|
|
|
|
|
|
|
croak "\nYou have used a wrong name for a keyword argument " . |
35
|
|
|
|
|
|
|
"--- perhaps a misspelling\n" |
36
|
|
|
|
|
|
|
if _check_for_illegal_params(@params) == 0; |
37
|
|
|
|
|
|
|
bless { |
38
|
|
|
|
|
|
|
_excel_filename => $args{excel_filename}, |
39
|
|
|
|
|
|
|
_which_worksheet => $args{which_worksheet}, |
40
|
|
|
|
|
|
|
_raw_tickets_db => $args{raw_tickets_db}, |
41
|
|
|
|
|
|
|
_processed_tickets_db => $args{processed_tickets_db}, |
42
|
|
|
|
|
|
|
_synset_cache_db => $args{synset_cache_db}, |
43
|
|
|
|
|
|
|
_stemmed_tickets_db => $args{stemmed_tickets_db}, |
44
|
|
|
|
|
|
|
_inverted_index_db => $args{inverted_index_db}, |
45
|
|
|
|
|
|
|
_tickets_vocab_db => $args{tickets_vocab_db}, |
46
|
|
|
|
|
|
|
_idf_db => $args{idf_db}, |
47
|
|
|
|
|
|
|
_tkt_doc_vecs_db => $args{tkt_doc_vecs_db}, |
48
|
|
|
|
|
|
|
_tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db}, |
49
|
|
|
|
|
|
|
_clustering_fieldname => $args{clustering_fieldname}, |
50
|
|
|
|
|
|
|
_unique_id_fieldname => $args{unique_id_fieldname}, |
51
|
|
|
|
|
|
|
_stop_words_file => $args{stop_words_file}, |
52
|
|
|
|
|
|
|
_misspelled_words_file => $args{misspelled_words_file}, |
53
|
|
|
|
|
|
|
_min_word_length => $args{min_word_length} || 4, |
54
|
|
|
|
|
|
|
_add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0, |
55
|
|
|
|
|
|
|
_want_stemming => $args{want_stemming} || 0, |
56
|
|
|
|
|
|
|
_how_many_retrievals => $args{how_many_retrievals} || 5, |
57
|
|
|
|
|
|
|
_min_idf_threshold => $args{min_idf_threshold}, |
58
|
|
|
|
|
|
|
_max_num_syn_words => $args{max_num_syn_words} || 3, |
59
|
|
|
|
|
|
|
_want_synset_caching => $args{want_synset_caching} || 0, |
60
|
|
|
|
|
|
|
_stop_words => {}, |
61
|
|
|
|
|
|
|
_all_tickets => [], |
62
|
|
|
|
|
|
|
_column_headers => [], |
63
|
|
|
|
|
|
|
_good_columns => [], |
64
|
|
|
|
|
|
|
_tickets_by_ids => {}, |
65
|
|
|
|
|
|
|
_processed_tkts_by_ids => {}, |
66
|
|
|
|
|
|
|
_stemmed_tkts_by_ids => {}, |
67
|
|
|
|
|
|
|
_misspelled_words => {}, |
68
|
|
|
|
|
|
|
_total_num_tickets => 0, |
69
|
|
|
|
|
|
|
_synset_cache => {}, |
70
|
|
|
|
|
|
|
_vocab_hash => {}, |
71
|
|
|
|
|
|
|
_vocab_idf_hist => {}, |
72
|
|
|
|
|
|
|
_idf_t => {}, |
73
|
|
|
|
|
|
|
_vocab_size => undef, |
74
|
|
|
|
|
|
|
_doc_vector_template => {}, |
75
|
|
|
|
|
|
|
_tkt_doc_vecs => {}, |
76
|
|
|
|
|
|
|
_tkt_doc_vecs_normed => {}, |
77
|
|
|
|
|
|
|
_query_ticket_id => undef, |
78
|
|
|
|
|
|
|
_inverted_index => {}, |
79
|
|
|
|
|
|
|
_debug1 => $args{debug1} || 0, # for processing Excel |
80
|
|
|
|
|
|
|
_debug2 => $args{debug2} || 0, # for modeling tickets |
81
|
|
|
|
|
|
|
_debug3 => $args{debug3} || 0, # for retrieving similar tickets |
82
|
|
|
|
|
|
|
_wn => WordNet::QueryData->new( verbose => 0, |
83
|
|
|
|
|
|
|
noload => 1 ), |
84
|
|
|
|
|
|
|
}, $class; |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
############################# Extract info from Excel ####################### |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub get_tickets_from_excel { |
90
|
|
|
|
|
|
|
my $self = shift; |
91
|
|
|
|
|
|
|
unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db}; |
92
|
|
|
|
|
|
|
unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db}; |
93
|
|
|
|
|
|
|
unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db}; |
94
|
|
|
|
|
|
|
unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db}; |
95
|
|
|
|
|
|
|
unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db}; |
96
|
|
|
|
|
|
|
unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db}; |
97
|
|
|
|
|
|
|
unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db}; |
98
|
|
|
|
|
|
|
unlink glob "$self->{_tickets_vocab_db}.*"; |
99
|
|
|
|
|
|
|
unlink glob "$self->{_idf_db}.*"; |
100
|
|
|
|
|
|
|
my $filename = $self->{_excel_filename} || die("Excel file required"), |
101
|
|
|
|
|
|
|
my $clustering_fieldname = $self->{_clustering_fieldname} |
102
|
|
|
|
|
|
|
|| die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- "); |
103
|
|
|
|
|
|
|
my $unique_id_fieldname = $self->{_unique_id_fieldname} |
104
|
|
|
|
|
|
|
|| die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- "); |
105
|
|
|
|
|
|
|
my $workbook; |
106
|
|
|
|
|
|
|
if ($filename =~ /\.xls$/) { |
107
|
|
|
|
|
|
|
my $parser = Spreadsheet::ParseExcel->new(); |
108
|
|
|
|
|
|
|
$workbook = $parser->parse($filename); |
109
|
|
|
|
|
|
|
die $parser->error() unless defined $workbook; |
110
|
|
|
|
|
|
|
} elsif ($filename =~ /\.xlsx$/) { |
111
|
|
|
|
|
|
|
# use Text::Iconv; |
112
|
|
|
|
|
|
|
my $converter = Text::Iconv->new("utf-8", "windows-1251"); |
113
|
|
|
|
|
|
|
$workbook = Spreadsheet::XLSX->new($filename, $converter); |
114
|
|
|
|
|
|
|
} else { |
115
|
|
|
|
|
|
|
die "File suffix on the Excel file not recognized"; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
my @worksheets = $workbook->worksheets(); |
118
|
|
|
|
|
|
|
my $which_worksheet = $self->{_which_worksheet} || |
119
|
|
|
|
|
|
|
die "\nYou have not specified which Excel worksheet contains the tickets\n"; |
120
|
|
|
|
|
|
|
my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range(); |
121
|
|
|
|
|
|
|
my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range(); |
122
|
|
|
|
|
|
|
my @good_columns; |
123
|
|
|
|
|
|
|
my $col_headers_row; |
124
|
|
|
|
|
|
|
my $col_headers_found = 0; |
125
|
|
|
|
|
|
|
my $col_index_for_unique_id; |
126
|
|
|
|
|
|
|
my $col_index_for_clustering_field; |
127
|
|
|
|
|
|
|
for my $row ( $row_min .. $row_max ) { |
128
|
|
|
|
|
|
|
last if $col_headers_found; |
129
|
|
|
|
|
|
|
@good_columns = (); |
130
|
|
|
|
|
|
|
for my $col ( $col_min .. $col_max ) { |
131
|
|
|
|
|
|
|
my $cell = |
132
|
|
|
|
|
|
|
$worksheets[$which_worksheet-1]->get_cell( $row, $col ); |
133
|
|
|
|
|
|
|
next unless $cell; |
134
|
|
|
|
|
|
|
my $cell_value = _get_rid_of_wide_chars($cell->value()); |
135
|
|
|
|
|
|
|
push @good_columns, $col if $cell_value; |
136
|
|
|
|
|
|
|
if ($cell_value eq $unique_id_fieldname) { |
137
|
|
|
|
|
|
|
$col_index_for_unique_id = $col; |
138
|
|
|
|
|
|
|
$col_headers_row = $row; |
139
|
|
|
|
|
|
|
$col_headers_found = 1; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
if ($cell_value eq $clustering_fieldname) { |
142
|
|
|
|
|
|
|
$col_index_for_clustering_field = $col; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
$self->{_good_columns} = \@good_columns; |
147
|
|
|
|
|
|
|
print "\nThe unique id is in column: $col_index_for_unique_id\n" |
148
|
|
|
|
|
|
|
if $self->{_debug1}; |
149
|
|
|
|
|
|
|
print "The clustering field is in column: " . |
150
|
|
|
|
|
|
|
"$col_index_for_clustering_field\n\n" if $self->{_debug1}; |
151
|
|
|
|
|
|
|
my %Column_Headers; |
152
|
|
|
|
|
|
|
foreach my $field_index (0..@good_columns-1) { |
153
|
|
|
|
|
|
|
my $key = "field_" . $field_index; |
154
|
|
|
|
|
|
|
$Column_Headers{$key} = ""; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
my @col_headers = map { |
157
|
|
|
|
|
|
|
my $cell = |
158
|
|
|
|
|
|
|
$worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_); |
159
|
|
|
|
|
|
|
$cell ? _get_rid_of_wide_chars($cell->value()) : ''; |
160
|
|
|
|
|
|
|
} @good_columns; |
161
|
|
|
|
|
|
|
$self->{_column_headers} = \@col_headers; |
162
|
|
|
|
|
|
|
$self->_display_column_headers() if $self->{_debug1}; |
163
|
|
|
|
|
|
|
my $unique_id_field_index_in_good_columns = |
164
|
|
|
|
|
|
|
_find_index_for_given_element( $col_index_for_unique_id, \@good_columns ); |
165
|
|
|
|
|
|
|
my $clustering_field_index_in_good_columns = |
166
|
|
|
|
|
|
|
_find_index_for_given_element( $col_index_for_clustering_field, |
167
|
|
|
|
|
|
|
\@good_columns ); |
168
|
|
|
|
|
|
|
die "Something is wrong with the info extracted from Excel " . |
169
|
|
|
|
|
|
|
"as the index for the column with unique IDs is not one of " . |
170
|
|
|
|
|
|
|
"good columns\n\n" |
171
|
|
|
|
|
|
|
unless (defined $unique_id_field_index_in_good_columns) && |
172
|
|
|
|
|
|
|
(defined $clustering_field_index_in_good_columns); |
173
|
|
|
|
|
|
|
for my $row_index ( $col_headers_row+1..$row_max-1) { |
174
|
|
|
|
|
|
|
my @values = map { |
175
|
|
|
|
|
|
|
my $cell = |
176
|
|
|
|
|
|
|
$worksheets[$which_worksheet-1]->get_cell($row_index, $_); |
177
|
|
|
|
|
|
|
$cell ? _get_rid_of_wide_chars($cell->value()) : ''; |
178
|
|
|
|
|
|
|
} @good_columns; |
179
|
|
|
|
|
|
|
next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/; |
180
|
|
|
|
|
|
|
next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/; |
181
|
|
|
|
|
|
|
my %onerow; |
182
|
|
|
|
|
|
|
foreach my $field_index (0..@good_columns-1) { |
183
|
|
|
|
|
|
|
my $key = "field_" . $field_index; |
184
|
|
|
|
|
|
|
die "The Columns Headers hash has no field for index " . |
185
|
|
|
|
|
|
|
"$field_index\n " |
186
|
|
|
|
|
|
|
unless exists $col_headers[$field_index]; |
187
|
|
|
|
|
|
|
$onerow{$col_headers[$field_index]} = $values[$field_index]; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
push @{$self->{_all_tickets}}, \%onerow; |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
my @duplicates_for_id_field = @{$self->_check_unique_id_field()}; |
192
|
|
|
|
|
|
|
if (@duplicates_for_id_field > 0) { |
193
|
|
|
|
|
|
|
print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n"; |
194
|
|
|
|
|
|
|
die "\n\nYour unique id field for tickets contains duplicate id's"; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
foreach my $ticket (@{$self->{_all_tickets}}) { |
197
|
|
|
|
|
|
|
$self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} = |
198
|
|
|
|
|
|
|
lc($ticket->{$clustering_fieldname}); |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
$self->{_total_num_tickets} = scalar @{$self->{_all_tickets}}; |
201
|
|
|
|
|
|
|
$self->store_raw_tickets_on_disk(); |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub _test_excel_for_tickets { |
205
|
|
|
|
|
|
|
my $self = shift; |
206
|
|
|
|
|
|
|
use Text::Iconv; |
207
|
|
|
|
|
|
|
my $converter = Text::Iconv->new("utf-8", "windows-1251"); |
208
|
|
|
|
|
|
|
my $filename = $self->{_excel_filename} || die("Excel sheet needed for testing is missing"); |
209
|
|
|
|
|
|
|
my $workbook = Spreadsheet::XLSX->new( $filename, $converter ); |
210
|
|
|
|
|
|
|
my @worksheets = $workbook->worksheets(); |
211
|
|
|
|
|
|
|
my ( $row_min, $row_max ) = $worksheets[0]->row_range(); |
212
|
|
|
|
|
|
|
my ( $col_min, $col_max ) = $worksheets[0]->col_range(); |
213
|
|
|
|
|
|
|
return ($row_min, $row_max, $col_min, $col_max); |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub _display_column_headers { |
217
|
|
|
|
|
|
|
my $self = shift; |
218
|
|
|
|
|
|
|
print "\nThe good columns are: @{$self->{_good_columns}}\n\n"; |
219
|
|
|
|
|
|
|
my $overall_header_string = join ' <> ', @{$self->{_column_headers}}; |
220
|
|
|
|
|
|
|
print "The column headers are: $overall_header_string\n\n"; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
sub _check_unique_id_field { |
224
|
|
|
|
|
|
|
my $self = shift; |
225
|
|
|
|
|
|
|
my %check_hash; |
226
|
|
|
|
|
|
|
my @duplicates; |
227
|
|
|
|
|
|
|
foreach my $ticket (@{$self->{_all_tickets}}) { |
228
|
|
|
|
|
|
|
if (exists $ticket->{$self->{_unique_id_fieldname}}) { |
229
|
|
|
|
|
|
|
push @duplicates, $ticket->{$self->{_unique_id_fieldname}} |
230
|
|
|
|
|
|
|
if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}}; |
231
|
|
|
|
|
|
|
$check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
if ($self->{_debug1}) { |
235
|
|
|
|
|
|
|
my $num_of_tickets = @{$self->{_all_tickets}}; |
236
|
|
|
|
|
|
|
my $num_entries_check_hash = keys %check_hash; |
237
|
|
|
|
|
|
|
print "Number of tickets: $num_of_tickets\n"; |
238
|
|
|
|
|
|
|
print "Number of keys in check hash: $num_entries_check_hash\n"; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
return \@duplicates; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
sub show_original_ticket_for_given_id { |
244
|
|
|
|
|
|
|
my $self = shift; |
245
|
|
|
|
|
|
|
my $id = shift; |
246
|
|
|
|
|
|
|
print "\n\nDisplaying the fields for the ticket $id:\n\n"; |
247
|
|
|
|
|
|
|
foreach my $ticket (@{$self->{_all_tickets}}) { |
248
|
|
|
|
|
|
|
if ( $ticket->{$self->{_unique_id_fieldname}} == $id) { |
249
|
|
|
|
|
|
|
foreach my $key (sort keys %{$ticket}) { |
250
|
|
|
|
|
|
|
my $value = $ticket->{$key}; |
251
|
|
|
|
|
|
|
$value =~ s/^\s+//; |
252
|
|
|
|
|
|
|
$value =~ s/\s+$//; |
253
|
|
|
|
|
|
|
printf("%20s ==> %s\n", $key, $value); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
sub show_raw_ticket_clustering_data_for_given_id { |
260
|
|
|
|
|
|
|
my $self = shift; |
261
|
|
|
|
|
|
|
my $ticket_id = shift; |
262
|
|
|
|
|
|
|
my $record = $self->{_tickets_by_ids}->{$ticket_id}; |
263
|
|
|
|
|
|
|
print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n" . |
264
|
|
|
|
|
|
|
"$record\n\n"; |
265
|
|
|
|
|
|
|
return $record; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# Needed by test.t |
269
|
|
|
|
|
|
|
sub _raw_ticket_clustering_data_for_given_id { |
270
|
|
|
|
|
|
|
my $self = shift; |
271
|
|
|
|
|
|
|
my $ticket_id = shift; |
272
|
|
|
|
|
|
|
my $record = $self->{_tickets_by_ids}->{$ticket_id}; |
273
|
|
|
|
|
|
|
return $record; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub show_processed_ticket_clustering_data_for_given_id { |
278
|
|
|
|
|
|
|
my $self = shift; |
279
|
|
|
|
|
|
|
my $ticket_id = shift; |
280
|
|
|
|
|
|
|
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; |
281
|
|
|
|
|
|
|
print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" . |
282
|
|
|
|
|
|
|
"$record\n\n"; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub show_stemmed_ticket_clustering_data_for_given_id { |
286
|
|
|
|
|
|
|
my $self = shift; |
287
|
|
|
|
|
|
|
my $ticket_id = shift; |
288
|
|
|
|
|
|
|
my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id}; |
289
|
|
|
|
|
|
|
print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" . |
290
|
|
|
|
|
|
|
"$record\n\n"; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# The following function is a good diagnostic tool to look into the |
294
|
|
|
|
|
|
|
# array stored in $self->{_all_tickets}. Each element of this array |
295
|
|
|
|
|
|
|
# is a record that represents one row of the Excel file. |
296
|
|
|
|
|
|
|
sub _show_row { |
297
|
|
|
|
|
|
|
my $self = shift; |
298
|
|
|
|
|
|
|
my $row_num = shift; |
299
|
|
|
|
|
|
|
my $total_rows = @{$self->{_all_tickets}}; |
300
|
|
|
|
|
|
|
print "There are $total_rows items in the \$all_tickets array\n"; |
301
|
|
|
|
|
|
|
die "The row that you want to see does not exist" |
302
|
|
|
|
|
|
|
unless $row_num < $total_rows; |
303
|
|
|
|
|
|
|
my %record = %{$self->{_all_tickets}->[$row_num]}; |
304
|
|
|
|
|
|
|
foreach my $field (sort keys %record) { |
305
|
|
|
|
|
|
|
my $value = $record{$field}; |
306
|
|
|
|
|
|
|
no warnings; |
307
|
|
|
|
|
|
|
print "$field ==> $value\n"; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
sub store_raw_tickets_on_disk { |
312
|
|
|
|
|
|
|
my $self = shift; |
313
|
|
|
|
|
|
|
$self->{_raw_tickets_db} = "raw_tickets.db" unless $self->{_raw_tickets_db}; |
314
|
|
|
|
|
|
|
unlink $self->{_raw_tickets_db}; |
315
|
|
|
|
|
|
|
eval { |
316
|
|
|
|
|
|
|
store( $self->{_all_tickets}, $self->{_raw_tickets_db} ); |
317
|
|
|
|
|
|
|
}; |
318
|
|
|
|
|
|
|
if ($@) { |
319
|
|
|
|
|
|
|
die "Something went wrong with disk storage of ticket data: $@"; |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
sub restore_raw_tickets_from_disk { |
324
|
|
|
|
|
|
|
my $self = shift; |
325
|
|
|
|
|
|
|
my $clustering_fieldname = $self->{_clustering_fieldname} |
326
|
|
|
|
|
|
|
|| die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- "); |
327
|
|
|
|
|
|
|
my $unique_id_fieldname = $self->{_unique_id_fieldname} |
328
|
|
|
|
|
|
|
|| die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- "); |
329
|
|
|
|
|
|
|
eval { |
330
|
|
|
|
|
|
|
$self->{_all_tickets} = retrieve( $self->{_raw_tickets_db} ); |
331
|
|
|
|
|
|
|
}; |
332
|
|
|
|
|
|
|
if ($@) { |
333
|
|
|
|
|
|
|
die "Unable to retrieve raw tickets from disk: $@"; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
foreach my $ticket (@{$self->{_all_tickets}}) { |
336
|
|
|
|
|
|
|
$self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} = |
337
|
|
|
|
|
|
|
lc($ticket->{$clustering_fieldname}); |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
$self->{_total_num_tickets} = scalar keys %{$self->{_tickets_by_ids}}; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
sub delete_markup_from_all_tickets { |
343
|
|
|
|
|
|
|
my $self = shift; |
344
|
|
|
|
|
|
|
foreach my $ticket (@{$self->{_all_tickets}}) { |
345
|
|
|
|
|
|
|
$self->_delete_markup_from_one_ticket($ticket->{$self->{_unique_id_fieldname}}); |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
sub _delete_markup_from_one_ticket { |
350
|
|
|
|
|
|
|
my $self = shift; |
351
|
|
|
|
|
|
|
my $ticket_id = shift; |
352
|
|
|
|
|
|
|
my $ticket_strings = $self->{_tickets_by_ids}->{$ticket_id}; |
353
|
|
|
|
|
|
|
my @strings = grep $_, split /\s+/, $ticket_strings; |
354
|
|
|
|
|
|
|
my $cleaned_up_strings = join ' ', grep {$_ !~ /^<[^<>]+>$/} @strings; |
355
|
|
|
|
|
|
|
$self->{_tickets_by_ids}->{$ticket_id} = $cleaned_up_strings; |
356
|
|
|
|
|
|
|
foreach my $ticket (@{$self->{_all_tickets}}) { |
357
|
|
|
|
|
|
|
if ( $ticket->{$self->{_unique_id_fieldname}} == $ticket_id ) { |
358
|
|
|
|
|
|
|
$ticket->{$self->{_clustering_fieldname}} = $cleaned_up_strings; |
359
|
|
|
|
|
|
|
last; |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
sub apply_filter_to_all_tickets { |
365
|
|
|
|
|
|
|
my $self = shift; |
366
|
|
|
|
|
|
|
my $stop_words_file = $self->{_stop_words_file} |
367
|
|
|
|
|
|
|
|| die("\nYou forgot to supply the name of the stop words file in your constructor call\n"); |
368
|
|
|
|
|
|
|
my @stop_words = @{_fetch_words_from_file($stop_words_file)}; |
369
|
|
|
|
|
|
|
my $misspelled_words_file = $self->{_misspelled_words_file} |
370
|
|
|
|
|
|
|
|| die("\nYou forgot to supply the name of the misspelled words file in your constructor call\n"); |
371
|
|
|
|
|
|
|
foreach my $word (@stop_words) { |
372
|
|
|
|
|
|
|
$self->{_stop_words}->{$word} = 1; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
if ($self->{_misspelled_words_file}) { |
375
|
|
|
|
|
|
|
my @misspelled_word_pairs = |
376
|
|
|
|
|
|
|
@{_fetch_word_pairs_from_file($self->{_misspelled_words_file})}; |
377
|
|
|
|
|
|
|
foreach my $wordpair (@misspelled_word_pairs) { |
378
|
|
|
|
|
|
|
my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair; |
379
|
|
|
|
|
|
|
$self->{_misspelled_words}->{$wrong_word} = $good_word; |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
my $i = 1; |
383
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tickets_by_ids}}) { |
384
|
|
|
|
|
|
|
print "\nApplying filter to ticket $ticket_id ($i out of $self->{_total_num_tickets})\n"; |
385
|
|
|
|
|
|
|
$self->_apply_filter_to_one_ticket($ticket_id); |
386
|
|
|
|
|
|
|
$i++; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub _apply_filter_to_one_ticket { |
391
|
|
|
|
|
|
|
my $self = shift; |
392
|
|
|
|
|
|
|
my $ticket_id = shift; |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
unless (keys %{$self->{_stop_words}} > 0) { |
395
|
|
|
|
|
|
|
my @stop_words = @{_fetch_words_from_file($self->{_stop_words_file})}; |
396
|
|
|
|
|
|
|
foreach my $word (@stop_words) { |
397
|
|
|
|
|
|
|
$self->{_stop_words}->{$word} = 1; |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
unless (keys %{$self->{_misspelled_words}} > 0) { |
401
|
|
|
|
|
|
|
if ($self->{_misspelled_words_file}) { |
402
|
|
|
|
|
|
|
my @misspelled_word_pairs = |
403
|
|
|
|
|
|
|
@{_fetch_word_pairs_from_file($self->{_misspelled_words_file})}; |
404
|
|
|
|
|
|
|
foreach my $wordpair (@misspelled_word_pairs) { |
405
|
|
|
|
|
|
|
my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair; |
406
|
|
|
|
|
|
|
$self->{_misspelled_words}->{$wrong_word} = $good_word; |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
my $record = $self->{_tickets_by_ids}->{$ticket_id}; |
411
|
|
|
|
|
|
|
my $min = $self->{_min_word_length}; |
412
|
|
|
|
|
|
|
my @words = split /\n|\r|\"|\'|\.|\,|\;|\?|\(|\)|\[|\]|\\|\/|\s+|\&/, $record; |
413
|
|
|
|
|
|
|
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words; |
414
|
|
|
|
|
|
|
return unless @clean_words; |
415
|
|
|
|
|
|
|
my @new_words; |
416
|
|
|
|
|
|
|
foreach my $word (@words) { |
417
|
|
|
|
|
|
|
$word =~ s/(.+)[.,:!-]$/$1/; |
418
|
|
|
|
|
|
|
unless (($word eq 'no') or ($word eq 'not')) { |
419
|
|
|
|
|
|
|
next if length($word) < $self->{_min_word_length}; |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
if (exists $self->{_misspelled_words}->{lc($word)}) { |
422
|
|
|
|
|
|
|
push @new_words, $self->{_misspelled_words}->{$word}; |
423
|
|
|
|
|
|
|
next; |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
push @new_words, $word unless exists $self->{_stop_words}->{lc($word)}; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
my $new_record = join ' ', @new_words; |
428
|
|
|
|
|
|
|
$self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
sub _get_synonyms_for_word { |
432
|
|
|
|
|
|
|
my $self = shift; |
433
|
|
|
|
|
|
|
my $word = shift; |
434
|
|
|
|
|
|
|
my $no_sense_indicators = 1; |
435
|
|
|
|
|
|
|
my $wn = $self->{_wn}; |
436
|
|
|
|
|
|
|
my @parts_of_speech = $wn->querySense("$word"); |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
my %noun_synonyms; |
439
|
|
|
|
|
|
|
my %verb_synonyms; |
440
|
|
|
|
|
|
|
my %adj_synonyms; |
441
|
|
|
|
|
|
|
my %adv_synonyms; |
442
|
|
|
|
|
|
|
foreach my $pos (@parts_of_speech) { |
443
|
|
|
|
|
|
|
if ($pos =~ /n$/) { |
444
|
|
|
|
|
|
|
my @all_noun_syn_sense_labels = $wn->querySense( $pos, "syns"); |
445
|
|
|
|
|
|
|
my $how_many = @all_noun_syn_sense_labels; |
446
|
|
|
|
|
|
|
foreach my $noun_sense (@all_noun_syn_sense_labels) { |
447
|
|
|
|
|
|
|
my @noun_synonyms = $wn->querySense($noun_sense, "syns"); |
448
|
|
|
|
|
|
|
my $answer = ""; |
449
|
|
|
|
|
|
|
foreach my $noun_syn (@noun_synonyms) { |
450
|
|
|
|
|
|
|
next if $noun_syn eq $noun_sense; |
451
|
|
|
|
|
|
|
$noun_syn =~ s/\#.+$// if $no_sense_indicators; |
452
|
|
|
|
|
|
|
$noun_synonyms{$noun_syn} = 1; |
453
|
|
|
|
|
|
|
$answer .= " $noun_syn "; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
} elsif ($pos =~ /v$/) { |
457
|
|
|
|
|
|
|
my @all_verb_syn_sense_labels = $wn->querySense( $pos, "syns"); |
458
|
|
|
|
|
|
|
my $how_many = @all_verb_syn_sense_labels; |
459
|
|
|
|
|
|
|
foreach my $verb_sense (@all_verb_syn_sense_labels) { |
460
|
|
|
|
|
|
|
my @verb_synonyms = $wn->querySense($verb_sense, "syns"); |
461
|
|
|
|
|
|
|
my $answer = ""; |
462
|
|
|
|
|
|
|
foreach my $verb_syn (@verb_synonyms) { |
463
|
|
|
|
|
|
|
next if $verb_syn eq $verb_sense; |
464
|
|
|
|
|
|
|
$verb_syn =~ s/\#.+$// if $no_sense_indicators; |
465
|
|
|
|
|
|
|
$verb_synonyms{$verb_syn} = 1; |
466
|
|
|
|
|
|
|
$answer .= " $verb_syn "; |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
} elsif ($pos =~ /a$/) { |
470
|
|
|
|
|
|
|
my @all_adj_syn_sense_labels = $wn->querySense( $pos, "syns"); |
471
|
|
|
|
|
|
|
my $how_many = @all_adj_syn_sense_labels; |
472
|
|
|
|
|
|
|
foreach my $adj_sense (@all_adj_syn_sense_labels) { |
473
|
|
|
|
|
|
|
my @adj_synonyms = $wn->querySense($adj_sense, "syns"); |
474
|
|
|
|
|
|
|
my $answer = ""; |
475
|
|
|
|
|
|
|
foreach my $adj_syn (@adj_synonyms) { |
476
|
|
|
|
|
|
|
next if $adj_syn eq $adj_sense; |
477
|
|
|
|
|
|
|
$adj_syn =~ s/\#.+$// if $no_sense_indicators; |
478
|
|
|
|
|
|
|
$adj_synonyms{$adj_syn} = 1; |
479
|
|
|
|
|
|
|
$answer .= " $adj_syn "; |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
} elsif ($pos =~ /r$/) { |
483
|
|
|
|
|
|
|
my @all_adv_syn_sense_labels = $wn->querySense( $pos, "syns"); |
484
|
|
|
|
|
|
|
my $how_many = @all_adv_syn_sense_labels; |
485
|
|
|
|
|
|
|
foreach my $adv_sense (@all_adv_syn_sense_labels) { |
486
|
|
|
|
|
|
|
my @adv_synonyms = $wn->querySense($adv_sense, "syns"); |
487
|
|
|
|
|
|
|
my $answer = ""; |
488
|
|
|
|
|
|
|
foreach my $adv_syn (@adv_synonyms) { |
489
|
|
|
|
|
|
|
next if $adv_syn eq $adv_sense; |
490
|
|
|
|
|
|
|
$adv_syn =~ s/\#.+$// if $no_sense_indicators; |
491
|
|
|
|
|
|
|
$adv_synonyms{$adv_syn} = 1; |
492
|
|
|
|
|
|
|
$answer .= " $adv_syn "; |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
} |
495
|
|
|
|
|
|
|
} else { |
496
|
|
|
|
|
|
|
die "\nThe Part of Speech $pos not recognized\n\n"; |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
my @all_synonyms; |
500
|
|
|
|
|
|
|
my @all_noun_synonyms = keys %noun_synonyms; |
501
|
|
|
|
|
|
|
my @all_verb_synonyms = keys %verb_synonyms; |
502
|
|
|
|
|
|
|
my @all_adj_synonyms = keys %adj_synonyms; |
503
|
|
|
|
|
|
|
my @all_adv_synonyms = keys %adv_synonyms; |
504
|
|
|
|
|
|
|
push @all_synonyms, @all_noun_synonyms if @all_noun_synonyms > 0; |
505
|
|
|
|
|
|
|
push @all_synonyms, @all_verb_synonyms if @all_verb_synonyms > 0; |
506
|
|
|
|
|
|
|
push @all_synonyms, @all_adj_synonyms if @all_adj_synonyms > 0; |
507
|
|
|
|
|
|
|
push @all_synonyms, @all_adv_synonyms if @all_adv_synonyms > 0; |
508
|
|
|
|
|
|
|
my %synonym_set; |
509
|
|
|
|
|
|
|
foreach my $synonym (@all_synonyms) { |
510
|
|
|
|
|
|
|
$synonym_set{$synonym} = 1; |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
my @synonym_set = sort keys %synonym_set; |
513
|
|
|
|
|
|
|
return \@synonym_set; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub _get_antonyms_for_word { |
517
|
|
|
|
|
|
|
my $self = shift; |
518
|
|
|
|
|
|
|
my $word = shift; |
519
|
|
|
|
|
|
|
my $no_sense_indicators = 1; |
520
|
|
|
|
|
|
|
my $wn = $self->{_wn}; |
521
|
|
|
|
|
|
|
my @parts_of_speech = $wn->querySense("$word"); |
522
|
|
|
|
|
|
|
my %noun_antonyms; |
523
|
|
|
|
|
|
|
my %verb_antonyms; |
524
|
|
|
|
|
|
|
my %adj_antonyms; |
525
|
|
|
|
|
|
|
my %adv_antonyms; |
526
|
|
|
|
|
|
|
foreach my $pos (@parts_of_speech) { |
527
|
|
|
|
|
|
|
if ($pos =~ /n$/) { |
528
|
|
|
|
|
|
|
my @all_noun_ant_sense_labels = $wn->queryWord( $pos, "ants"); |
529
|
|
|
|
|
|
|
my $how_many = @all_noun_ant_sense_labels; |
530
|
|
|
|
|
|
|
foreach my $noun_sense (@all_noun_ant_sense_labels) { |
531
|
|
|
|
|
|
|
my @noun_antonyms = $wn->queryWord($noun_sense, "ants"); |
532
|
|
|
|
|
|
|
my $answer = ""; |
533
|
|
|
|
|
|
|
foreach my $noun_ant (@noun_antonyms) { |
534
|
|
|
|
|
|
|
next if $noun_ant eq $noun_sense; |
535
|
|
|
|
|
|
|
$noun_ant =~ s/\#.+$// if $no_sense_indicators; |
536
|
|
|
|
|
|
|
$noun_antonyms{$noun_ant} = 1; |
537
|
|
|
|
|
|
|
$answer .= " $noun_ant "; |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
} elsif ($pos =~ /v$/) { |
541
|
|
|
|
|
|
|
my @all_verb_ant_sense_labels = $wn->queryWord( $pos, "ants"); |
542
|
|
|
|
|
|
|
my $how_many = @all_verb_ant_sense_labels; |
543
|
|
|
|
|
|
|
foreach my $verb_sense (@all_verb_ant_sense_labels) { |
544
|
|
|
|
|
|
|
my @verb_antonyms = $wn->queryWord($verb_sense, "ants"); |
545
|
|
|
|
|
|
|
my $answer = ""; |
546
|
|
|
|
|
|
|
foreach my $verb_ant (@verb_antonyms) { |
547
|
|
|
|
|
|
|
next if $verb_ant eq $verb_sense; |
548
|
|
|
|
|
|
|
$verb_ant =~ s/\#.+$// if $no_sense_indicators; |
549
|
|
|
|
|
|
|
$verb_antonyms{$verb_ant} = 1; |
550
|
|
|
|
|
|
|
$answer .= " $verb_ant "; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
} elsif ($pos =~ /a$/) { |
554
|
|
|
|
|
|
|
my @all_adj_ant_sense_labels = $wn->queryWord( $pos, "ants"); |
555
|
|
|
|
|
|
|
my $how_many = @all_adj_ant_sense_labels; |
556
|
|
|
|
|
|
|
foreach my $adj_sense (@all_adj_ant_sense_labels) { |
557
|
|
|
|
|
|
|
my @adj_antonyms = $wn->queryWord($adj_sense, "ants"); |
558
|
|
|
|
|
|
|
my $answer = ""; |
559
|
|
|
|
|
|
|
foreach my $adj_ant (@adj_antonyms) { |
560
|
|
|
|
|
|
|
next if $adj_ant eq $adj_sense; |
561
|
|
|
|
|
|
|
$adj_ant =~ s/\#.+$// if $no_sense_indicators; |
562
|
|
|
|
|
|
|
$adj_antonyms{$adj_ant} = 1; |
563
|
|
|
|
|
|
|
$answer .= " $adj_ant "; |
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
} |
566
|
|
|
|
|
|
|
} elsif ($pos =~ /r$/) { |
567
|
|
|
|
|
|
|
my @all_adv_ant_sense_labels = $wn->queryWord( $pos, "ants"); |
568
|
|
|
|
|
|
|
my $how_many = @all_adv_ant_sense_labels; |
569
|
|
|
|
|
|
|
foreach my $adv_sense (@all_adv_ant_sense_labels) { |
570
|
|
|
|
|
|
|
my @adv_antonyms = $wn->queryWord($adv_sense, "ants"); |
571
|
|
|
|
|
|
|
my $answer = ""; |
572
|
|
|
|
|
|
|
foreach my $adv_ant (@adv_antonyms) { |
573
|
|
|
|
|
|
|
next if $adv_ant eq $adv_sense; |
574
|
|
|
|
|
|
|
$adv_ant =~ s/\#.+$// if $no_sense_indicators; |
575
|
|
|
|
|
|
|
$adv_antonyms{$adv_ant} = 1; |
576
|
|
|
|
|
|
|
$answer .= " $adv_ant "; |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
} else { |
580
|
|
|
|
|
|
|
die "\nThe Part of Speech $pos not recognized\n\n"; |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
my @all_antonyms; |
584
|
|
|
|
|
|
|
my @all_noun_antonyms = keys %noun_antonyms; |
585
|
|
|
|
|
|
|
my @all_verb_antonyms = keys %verb_antonyms; |
586
|
|
|
|
|
|
|
my @all_adj_antonyms = keys %adj_antonyms; |
587
|
|
|
|
|
|
|
my @all_adv_antonyms = keys %adv_antonyms; |
588
|
|
|
|
|
|
|
push @all_antonyms, @all_noun_antonyms if @all_noun_antonyms > 0; |
589
|
|
|
|
|
|
|
push @all_antonyms, @all_verb_antonyms if @all_verb_antonyms > 0; |
590
|
|
|
|
|
|
|
push @all_antonyms, @all_adj_antonyms if @all_adj_antonyms > 0; |
591
|
|
|
|
|
|
|
push @all_antonyms, @all_adv_antonyms if @all_adv_antonyms > 0; |
592
|
|
|
|
|
|
|
my %antonym_set; |
593
|
|
|
|
|
|
|
foreach my $antonym (@all_antonyms) { |
594
|
|
|
|
|
|
|
$antonym_set{$antonym} = 1; |
595
|
|
|
|
|
|
|
} |
596
|
|
|
|
|
|
|
my @antonym_set = sort keys %antonym_set; |
597
|
|
|
|
|
|
|
return \@antonym_set; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
sub expand_all_tickets_with_synonyms { |
601
|
|
|
|
|
|
|
my $self = shift; |
602
|
|
|
|
|
|
|
return unless $self->{_add_synsets_to_tickets}; |
603
|
|
|
|
|
|
|
my $num_of_tickets = $self->{_total_num_tickets}; |
604
|
|
|
|
|
|
|
if ($self->{_want_synset_caching}) { |
605
|
|
|
|
|
|
|
eval { |
606
|
|
|
|
|
|
|
$self->{_synset_cache} = retrieve( $self->{_synset_cache_db} ); |
607
|
|
|
|
|
|
|
} if -s $self->{_synset_cache_db}; |
608
|
|
|
|
|
|
|
if ($@) { |
609
|
|
|
|
|
|
|
print "Something went wrong with restoration of synset cache: $@"; |
610
|
|
|
|
|
|
|
} |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
my $i = 1; |
613
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) { |
614
|
|
|
|
|
|
|
$self->_expand_one_ticket_with_synonyms($ticket_id); |
615
|
|
|
|
|
|
|
print "Finished syn expansion of ticket $ticket_id ($i out of $num_of_tickets)\n"; |
616
|
|
|
|
|
|
|
$i++; |
617
|
|
|
|
|
|
|
} |
618
|
|
|
|
|
|
|
if ($self->{_want_synset_caching}) { |
619
|
|
|
|
|
|
|
$self->{_synset_cache_db} = "synset_cache.db" unless $self->{_synset_cache_db}; |
620
|
|
|
|
|
|
|
eval { |
621
|
|
|
|
|
|
|
store( $self->{_synset_cache}, $self->{_synset_cache_db} ); |
622
|
|
|
|
|
|
|
}; |
623
|
|
|
|
|
|
|
if ($@) { |
624
|
|
|
|
|
|
|
die "Something went wrong with disk storage of synset cache: $@"; |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
} |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
sub _expand_one_ticket_with_synonyms { |
630
|
|
|
|
|
|
|
my $self = shift; |
631
|
|
|
|
|
|
|
my $ticket_id = shift; |
632
|
|
|
|
|
|
|
print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n" |
633
|
|
|
|
|
|
|
if $self->{_debug2}; |
634
|
|
|
|
|
|
|
$self->_replace_negated_words_with_antonyms_one_ticket( $ticket_id ); |
635
|
|
|
|
|
|
|
$self->_add_to_words_their_synonyms_one_ticket( $ticket_id ); |
636
|
|
|
|
|
|
|
} |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
sub _replace_negated_words_with_antonyms_one_ticket { |
639
|
|
|
|
|
|
|
my $self = shift; |
640
|
|
|
|
|
|
|
my $ticket_id = shift; |
641
|
|
|
|
|
|
|
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; |
642
|
|
|
|
|
|
|
my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig; |
643
|
|
|
|
|
|
|
foreach my $word (@words_negated_with_not) { |
644
|
|
|
|
|
|
|
next unless (($word =~ /^\w+$/) && |
645
|
|
|
|
|
|
|
(length($word) > $self->{_min_word_length})); |
646
|
|
|
|
|
|
|
my @antonym_words = @{$self->_get_antonyms_for_word( $word )}; |
647
|
|
|
|
|
|
|
next unless @antonym_words > 0; |
648
|
|
|
|
|
|
|
$#antonym_words = $self->{_max_num_syn_words} - 1 |
649
|
|
|
|
|
|
|
if @antonym_words > $self->{_max_num_syn_words}; |
650
|
|
|
|
|
|
|
my $antonym_replacement_string = join ' ', @antonym_words; |
651
|
|
|
|
|
|
|
print "Antonym for $word is $antonym_replacement_string\n" |
652
|
|
|
|
|
|
|
if $self->{_debug2}; |
653
|
|
|
|
|
|
|
$record =~ s/not\s+$word/$antonym_replacement_string/g; |
654
|
|
|
|
|
|
|
} |
655
|
|
|
|
|
|
|
my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig; |
656
|
|
|
|
|
|
|
foreach my $word (@words_negated_with_no) { |
657
|
|
|
|
|
|
|
next unless (($word =~ /^\w+$/) && |
658
|
|
|
|
|
|
|
(length($word) > $self->{_min_word_length})); |
659
|
|
|
|
|
|
|
my @antonym_words = @{$self->_get_antonyms_for_word( $word )}; |
660
|
|
|
|
|
|
|
next unless @antonym_words > 0; |
661
|
|
|
|
|
|
|
$#antonym_words = $self->{_max_num_syn_words} - 1 |
662
|
|
|
|
|
|
|
if @antonym_words > $self->{_max_num_syn_words}; |
663
|
|
|
|
|
|
|
my $antonym_replacement_string = join ' ', @antonym_words; |
664
|
|
|
|
|
|
|
print "Antonym for $word is $antonym_replacement_string\n" |
665
|
|
|
|
|
|
|
if $self->{_debug2}; |
666
|
|
|
|
|
|
|
$record =~ s/no\s+$word/$antonym_replacement_string/g; |
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
$self->{_processed_tkts_by_ids}->{$ticket_id} = $record; |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
sub _add_to_words_their_synonyms_one_ticket { |
672
|
|
|
|
|
|
|
my $self = shift; |
673
|
|
|
|
|
|
|
my $ticket_id = shift; |
674
|
|
|
|
|
|
|
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; |
675
|
|
|
|
|
|
|
my @words = split /\s+/, $record; |
676
|
|
|
|
|
|
|
my @synonym_bag; |
677
|
|
|
|
|
|
|
foreach my $word (@words) { |
678
|
|
|
|
|
|
|
next if $word eq 'no'; |
679
|
|
|
|
|
|
|
next if $word eq 'not'; |
680
|
|
|
|
|
|
|
next unless $word =~ /^\w+$/ && |
681
|
|
|
|
|
|
|
length($word) > $self->{_min_word_length}; |
682
|
|
|
|
|
|
|
my @synonym_words; |
683
|
|
|
|
|
|
|
@synonym_words = @{$self->{_synset_cache}->{$word}} |
684
|
|
|
|
|
|
|
if exists $self->{_synset_cache}->{$word}; |
685
|
|
|
|
|
|
|
unless (exists $self->{_synset_cache}->{$word}) { |
686
|
|
|
|
|
|
|
@synonym_words = @{$self->_get_synonyms_for_word( $word )}; |
687
|
|
|
|
|
|
|
print "syn-set for $word => @synonym_words\n\n" |
688
|
|
|
|
|
|
|
if $self->{_debug2}; |
689
|
|
|
|
|
|
|
my $word_root; |
690
|
|
|
|
|
|
|
if (@synonym_words == 0) { |
691
|
|
|
|
|
|
|
if ((length($word) > 4) && ($word =~ /(.+)s$/)) { |
692
|
|
|
|
|
|
|
$word_root = $1; |
693
|
|
|
|
|
|
|
@synonym_words = @{$self->_get_synonyms_for_word( $word_root )} |
694
|
|
|
|
|
|
|
if length($word_root) >= $self->{_min_word_length}; |
695
|
|
|
|
|
|
|
} elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) { |
696
|
|
|
|
|
|
|
$word_root = $1; |
697
|
|
|
|
|
|
|
@synonym_words = @{$self->_get_synonyms_for_word( $word_root )} |
698
|
|
|
|
|
|
|
if length($word_root) >= $self->{_min_word_length}; |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
} |
701
|
|
|
|
|
|
|
print "syn-set for word root $word_root => @synonym_words\n\n" |
702
|
|
|
|
|
|
|
if ( $self->{_debug2} && defined $word_root ); |
703
|
|
|
|
|
|
|
_fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0; |
704
|
|
|
|
|
|
|
$#synonym_words = $self->{_max_num_syn_words} - 1 |
705
|
|
|
|
|
|
|
if @synonym_words > $self->{_max_num_syn_words}; |
706
|
|
|
|
|
|
|
print "Retained syn-set for $word => @synonym_words\n\n" |
707
|
|
|
|
|
|
|
if $self->{_debug2}; |
708
|
|
|
|
|
|
|
$self->{_synset_cache}->{$word} = \@synonym_words; |
709
|
|
|
|
|
|
|
push @synonym_bag, @synonym_words; |
710
|
|
|
|
|
|
|
} |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
foreach my $syn_word (@synonym_bag) { |
713
|
|
|
|
|
|
|
push @words, lc($syn_word) |
714
|
|
|
|
|
|
|
unless ((exists $self->{_stop_words}->{$syn_word}) || |
715
|
|
|
|
|
|
|
(length($syn_word) <= $self->{_min_word_length})); |
716
|
|
|
|
|
|
|
} |
717
|
|
|
|
|
|
|
my @sorted_words = sort @words; |
718
|
|
|
|
|
|
|
my $new_record = join ' ', @sorted_words; |
719
|
|
|
|
|
|
|
$self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record; |
720
|
|
|
|
|
|
|
} |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
sub store_processed_tickets_on_disk { |
723
|
|
|
|
|
|
|
my $self = shift; |
724
|
|
|
|
|
|
|
$self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db}; |
725
|
|
|
|
|
|
|
unlink $self->{_processed_tickets_db}; |
726
|
|
|
|
|
|
|
eval { |
727
|
|
|
|
|
|
|
store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} ); |
728
|
|
|
|
|
|
|
}; |
729
|
|
|
|
|
|
|
if ($@) { |
730
|
|
|
|
|
|
|
die "Something went wrong with disk storage of processed tickets: $@"; |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
sub store_stemmed_tickets_and_inverted_index_on_disk { |
735
|
|
|
|
|
|
|
my $self = shift; |
736
|
|
|
|
|
|
|
$self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db}; |
737
|
|
|
|
|
|
|
unlink $self->{_stemmed_tickets_db}; |
738
|
|
|
|
|
|
|
eval { |
739
|
|
|
|
|
|
|
print "\n\nStoring stemmed tickets on disk\n\n"; |
740
|
|
|
|
|
|
|
store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} ); |
741
|
|
|
|
|
|
|
}; |
742
|
|
|
|
|
|
|
if ($@) { |
743
|
|
|
|
|
|
|
die "Something went wrong with disk storage of stemmed tickets: $@"; |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
$self->{_inverted_index_db} = "inverted_index.db" unless $self->{_inverted_index_db}; |
746
|
|
|
|
|
|
|
unlink $self->{_inverted_index_db}; |
747
|
|
|
|
|
|
|
eval { |
748
|
|
|
|
|
|
|
print "\nStoring inverted index on disk\n\n"; |
749
|
|
|
|
|
|
|
store( $self->{_inverted_index}, $self->{_inverted_index_db} ); |
750
|
|
|
|
|
|
|
}; |
751
|
|
|
|
|
|
|
if ($@) { |
752
|
|
|
|
|
|
|
die "Something went wrong with disk storage of the inverted index: $@"; |
753
|
|
|
|
|
|
|
} |
754
|
|
|
|
|
|
|
} |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
sub restore_processed_tickets_from_disk { |
757
|
|
|
|
|
|
|
my $self = shift; |
758
|
|
|
|
|
|
|
eval { |
759
|
|
|
|
|
|
|
$self->{_processed_tkts_by_ids} = retrieve( $self->{_processed_tickets_db} ); |
760
|
|
|
|
|
|
|
}; |
761
|
|
|
|
|
|
|
if ($@) { |
762
|
|
|
|
|
|
|
die "Something went wrong with restoration of processed tickets: $@"; |
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
} |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
sub restore_stemmed_tickets_from_disk { |
767
|
|
|
|
|
|
|
my $self = shift; |
768
|
|
|
|
|
|
|
eval { |
769
|
|
|
|
|
|
|
$self->{_stemmed_tkts_by_ids} = retrieve( $self->{_stemmed_tickets_db} ); |
770
|
|
|
|
|
|
|
}; |
771
|
|
|
|
|
|
|
if ($@) { |
772
|
|
|
|
|
|
|
die "Something went wrong with restoration of stemmed tickets: $@"; |
773
|
|
|
|
|
|
|
} |
774
|
|
|
|
|
|
|
} |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
#################### Get Ticket Vocabulary and Word Counts ################# |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
sub get_ticket_vocabulary_and_construct_inverted_index { |
779
|
|
|
|
|
|
|
my $self = shift; |
780
|
|
|
|
|
|
|
my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}}; |
781
|
|
|
|
|
|
|
$self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db}; |
782
|
|
|
|
|
|
|
unlink glob "$self->{_tickets_vocab_db}.*"; |
783
|
|
|
|
|
|
|
my %vocab_hist_on_disk; |
784
|
|
|
|
|
|
|
tie %vocab_hist_on_disk, 'SDBM_File', |
785
|
|
|
|
|
|
|
$self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640 |
786
|
|
|
|
|
|
|
or die "Can't create DBM files: $!"; |
787
|
|
|
|
|
|
|
my %inverted_index; |
788
|
|
|
|
|
|
|
my $min = $self->{_min_word_length}; |
789
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) { |
790
|
|
|
|
|
|
|
my %uniques = (); |
791
|
|
|
|
|
|
|
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; |
792
|
|
|
|
|
|
|
my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record; |
793
|
|
|
|
|
|
|
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup; |
794
|
|
|
|
|
|
|
next unless @clean_words; |
795
|
|
|
|
|
|
|
@clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}), |
796
|
|
|
|
|
|
|
@clean_words; |
797
|
|
|
|
|
|
|
map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words; |
798
|
|
|
|
|
|
|
for (@clean_words) { $uniques{"\L$_"}++ }; |
799
|
|
|
|
|
|
|
map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques; |
800
|
|
|
|
|
|
|
map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id } |
801
|
|
|
|
|
|
|
keys %uniques; |
802
|
|
|
|
|
|
|
$self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words; |
803
|
|
|
|
|
|
|
} |
804
|
|
|
|
|
|
|
foreach (keys %vocab_hist_on_disk) { |
805
|
|
|
|
|
|
|
$self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_}; |
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
untie %vocab_hist_on_disk; |
808
|
|
|
|
|
|
|
$self->{_tkt_vocab_done} = 1; |
809
|
|
|
|
|
|
|
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} ); |
810
|
|
|
|
|
|
|
print "\n\nVocabulary size: $self->{_vocab_size}\n\n" |
811
|
|
|
|
|
|
|
if $self->{_debug2}; |
812
|
|
|
|
|
|
|
# Calculate idf(t): |
813
|
|
|
|
|
|
|
$self->{_idf_db} = "idf.db" unless $self->{_idf_db}; |
814
|
|
|
|
|
|
|
unlink glob "$self->{_idf_db}.*"; |
815
|
|
|
|
|
|
|
tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640 |
816
|
|
|
|
|
|
|
or die "Can't create DBM files: $!"; |
817
|
|
|
|
|
|
|
foreach (keys %{$self->{_vocab_idf_hist}}) { |
818
|
|
|
|
|
|
|
$idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets |
819
|
|
|
|
|
|
|
/ |
820
|
|
|
|
|
|
|
(1 + $self->{_vocab_idf_hist}->{$_}))) |
821
|
|
|
|
|
|
|
/ log(10) ); |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
foreach (keys %idf_t_on_disk) { |
824
|
|
|
|
|
|
|
$self->{_idf_t}->{$_} = $idf_t_on_disk{$_}; |
825
|
|
|
|
|
|
|
} |
826
|
|
|
|
|
|
|
untie %idf_t_on_disk; |
827
|
|
|
|
|
|
|
} |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
sub display_tickets_vocab { |
830
|
|
|
|
|
|
|
my $self = shift; |
831
|
|
|
|
|
|
|
die "tickets vocabulary not yet constructed" |
832
|
|
|
|
|
|
|
unless keys %{$self->{_vocab_hist}}; |
833
|
|
|
|
|
|
|
print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n"; |
834
|
|
|
|
|
|
|
foreach (sort keys %{$self->{_vocab_hist}}){ |
835
|
|
|
|
|
|
|
my $outstring = sprintf("%30s %d", $_,$self->{_vocab_hist}->{$_}); |
836
|
|
|
|
|
|
|
print "$outstring\n"; |
837
|
|
|
|
|
|
|
} |
838
|
|
|
|
|
|
|
my $vocab_size = scalar( keys %{$self->{_vocab_hist}} ); |
839
|
|
|
|
|
|
|
print "\nSize of the tickets vocabulary: $vocab_size\n\n"; |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
sub display_inverse_document_frequencies { |
843
|
|
|
|
|
|
|
my $self = shift; |
844
|
|
|
|
|
|
|
die "tickets vocabulary not yet constructed" |
845
|
|
|
|
|
|
|
unless keys %{$self->{_vocab_idf_hist}}; |
846
|
|
|
|
|
|
|
print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n"; |
847
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_vocab_idf_hist}} ) { |
848
|
|
|
|
|
|
|
my $outstring = sprintf("%30s %d", |
849
|
|
|
|
|
|
|
$_, $self->{_vocab_idf_hist}->{$_}); |
850
|
|
|
|
|
|
|
print "$outstring\n"; |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n"; |
853
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_idf_t}} ) { |
854
|
|
|
|
|
|
|
my $outstring = sprintf("%30s %f", $_,$self->{_idf_t}->{$_}); |
855
|
|
|
|
|
|
|
print "$outstring\n"; |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
} |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
# The following subroutine is useful for diagnostic purposes. It |
860
|
|
|
|
|
|
|
# lists the number of tickets that a word appears in and also lists |
861
|
|
|
|
|
|
|
# the tickets. But be careful in interpreting its results. Note |
862
|
|
|
|
|
|
|
# if you invoke this subroutine after the synsets have been added |
863
|
|
|
|
|
|
|
# to the tickets, you may find words being attributed to tickets |
864
|
|
|
|
|
|
|
# that do not actually contain them in the original Excel sheet. |
865
|
|
|
|
|
|
|
sub list_processed_tickets_for_a_word { |
866
|
|
|
|
|
|
|
my $self = shift; |
867
|
|
|
|
|
|
|
while (my $word = ) { #enter ctrl-D to exit the loop |
868
|
|
|
|
|
|
|
chomp $word; |
869
|
|
|
|
|
|
|
my @ticket_list; |
870
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) { |
871
|
|
|
|
|
|
|
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id}; |
872
|
|
|
|
|
|
|
push @ticket_list, $ticket_id if $record =~ /\b$word\b/i; |
873
|
|
|
|
|
|
|
} |
874
|
|
|
|
|
|
|
my $num = @ticket_list; |
875
|
|
|
|
|
|
|
print "\nThe number of processed tickets that mention the word `$word': $num\n\n"; |
876
|
|
|
|
|
|
|
print "The processed tickets: @ticket_list\n\n"; |
877
|
|
|
|
|
|
|
} |
878
|
|
|
|
|
|
|
} |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
sub list_stemmed_tickets_for_a_word { |
881
|
|
|
|
|
|
|
my $self = shift; |
882
|
|
|
|
|
|
|
while (my $word = ) { #enter ctrl-D to exit the loop |
883
|
|
|
|
|
|
|
chomp $word; |
884
|
|
|
|
|
|
|
my @ticket_list; |
885
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) { |
886
|
|
|
|
|
|
|
my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id}; |
887
|
|
|
|
|
|
|
push @ticket_list, $ticket_id if $record =~ /\b$word\b/i; |
888
|
|
|
|
|
|
|
} |
889
|
|
|
|
|
|
|
my $num = @ticket_list; |
890
|
|
|
|
|
|
|
print "\nThe number of stemmed tickets that mention the word `$word': $num\n\n"; |
891
|
|
|
|
|
|
|
print "The stemmed tickets: @ticket_list\n\n"; |
892
|
|
|
|
|
|
|
} |
893
|
|
|
|
|
|
|
} |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
############## Generate Document Vectors for Tickets #################### |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
sub construct_doc_vectors_for_all_tickets { |
898
|
|
|
|
|
|
|
my $self = shift; |
899
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_vocab_hist}} ) { |
900
|
|
|
|
|
|
|
$self->{_doc_vector_template}->{$_} = 0; |
901
|
|
|
|
|
|
|
} |
902
|
|
|
|
|
|
|
my $num_of_tickets = keys %{$self->{_stemmed_tkts_by_ids}}; |
903
|
|
|
|
|
|
|
my $i = 1; |
904
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) { |
905
|
|
|
|
|
|
|
my $doc_vec_ref = |
906
|
|
|
|
|
|
|
$self->_construct_doc_vector_for_one_ticket($ticket_id); |
907
|
|
|
|
|
|
|
print "Finished constructing two doc vecs for ticket $ticket_id ($i out of $num_of_tickets)\n"; |
908
|
|
|
|
|
|
|
$i++; |
909
|
|
|
|
|
|
|
} |
910
|
|
|
|
|
|
|
} |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
sub _construct_doc_vector_for_one_ticket { |
913
|
|
|
|
|
|
|
my $self = shift; |
914
|
|
|
|
|
|
|
my $ticket_id = shift; |
915
|
|
|
|
|
|
|
unless (keys %{$self->{_doc_vector_template}}) { |
916
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_vocab_hist}} ) { |
917
|
|
|
|
|
|
|
$self->{_doc_vector_template}->{$_} = 0; |
918
|
|
|
|
|
|
|
} |
919
|
|
|
|
|
|
|
} |
920
|
|
|
|
|
|
|
my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})}; |
921
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_doc_vector_template}} ) { |
922
|
|
|
|
|
|
|
$doc_vector{$_} = 0; |
923
|
|
|
|
|
|
|
} |
924
|
|
|
|
|
|
|
my $min = $self->{_min_word_length}; |
925
|
|
|
|
|
|
|
my $total_words_in_ticket = 0; |
926
|
|
|
|
|
|
|
my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id}; |
927
|
|
|
|
|
|
|
my @clean_words = split /\s+/, $record; |
928
|
|
|
|
|
|
|
map { $doc_vector{"\L$_"}++ } |
929
|
|
|
|
|
|
|
grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words; |
930
|
|
|
|
|
|
|
die "Something went wrong. Doc vector size unequal to vocab size" |
931
|
|
|
|
|
|
|
unless $self->{_vocab_size} == scalar(keys %doc_vector); |
932
|
|
|
|
|
|
|
foreach (keys %doc_vector) { |
933
|
|
|
|
|
|
|
$total_words_in_ticket += $doc_vector{$_}; |
934
|
|
|
|
|
|
|
} |
935
|
|
|
|
|
|
|
my %normalized_doc_vector; |
936
|
|
|
|
|
|
|
foreach (keys %doc_vector) { |
937
|
|
|
|
|
|
|
$normalized_doc_vector{$_} = $doc_vector{$_} |
938
|
|
|
|
|
|
|
* |
939
|
|
|
|
|
|
|
$self->{_idf_t}->{$_} |
940
|
|
|
|
|
|
|
/ |
941
|
|
|
|
|
|
|
$total_words_in_ticket; |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs}->{$ticket_id} = \%doc_vector; |
944
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs_normed}->{$ticket_id} = \%normalized_doc_vector; |
945
|
|
|
|
|
|
|
} |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
sub store_ticket_vectors { |
948
|
|
|
|
|
|
|
my $self = shift; |
949
|
|
|
|
|
|
|
die "You have not yet created doc vectors for tickets" |
950
|
|
|
|
|
|
|
unless keys %{$self->{_tkt_doc_vecs}}; |
951
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs_db} = "tkt_doc_vecs.db" unless $self->{_tkt_doc_vecs_db}; |
952
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs_normed_db} = "tkt_doc_vecs_normed.db" |
953
|
|
|
|
|
|
|
unless $self->{_tkt_doc_vecs_normed_db}; |
954
|
|
|
|
|
|
|
unlink $self->{_tkt_doc_vecs_db}; |
955
|
|
|
|
|
|
|
unlink $self->{_tkt_doc_vecs_normed_db}; |
956
|
|
|
|
|
|
|
print "\nStoring the ticket doc vecs on disk. This could take a while.\n\n"; |
957
|
|
|
|
|
|
|
eval { |
958
|
|
|
|
|
|
|
store( $self->{_tkt_doc_vecs}, $self->{_tkt_doc_vecs_db} ); |
959
|
|
|
|
|
|
|
}; |
960
|
|
|
|
|
|
|
if ($@) { |
961
|
|
|
|
|
|
|
die "Something went wrong with disk storage of ticket doc vectors: $@"; |
962
|
|
|
|
|
|
|
} |
963
|
|
|
|
|
|
|
print "\nStoring normalized ticket doc vecs on disk. This could take a while.\n\n"; |
964
|
|
|
|
|
|
|
eval { |
965
|
|
|
|
|
|
|
store($self->{_tkt_doc_vecs_normed}, $self->{_tkt_doc_vecs_normed_db}); |
966
|
|
|
|
|
|
|
}; |
967
|
|
|
|
|
|
|
if ($@) { |
968
|
|
|
|
|
|
|
die "Something wrong with disk storage of normalized doc vecs: $@"; |
969
|
|
|
|
|
|
|
} |
970
|
|
|
|
|
|
|
} |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
sub restore_ticket_vectors_and_inverted_index { |
973
|
|
|
|
|
|
|
my $self = shift; |
974
|
|
|
|
|
|
|
$self->restore_raw_tickets_from_disk(); |
975
|
|
|
|
|
|
|
$self->restore_processed_tickets_from_disk(); |
976
|
|
|
|
|
|
|
$self->restore_stemmed_tickets_from_disk(); |
977
|
|
|
|
|
|
|
tie my %vocab_hist_on_disk, 'SDBM_File', $self->{_tickets_vocab_db}, O_RDONLY, 0640 |
978
|
|
|
|
|
|
|
or die "Can't connect with DBM file: $!"; |
979
|
|
|
|
|
|
|
foreach (keys %vocab_hist_on_disk) { |
980
|
|
|
|
|
|
|
$self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_}; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
untie %vocab_hist_on_disk; |
983
|
|
|
|
|
|
|
tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDONLY, 0640 |
984
|
|
|
|
|
|
|
or die "Can't connect with DBM file: $!"; |
985
|
|
|
|
|
|
|
foreach (keys %idf_t_on_disk) { |
986
|
|
|
|
|
|
|
$self->{_idf_t}->{$_} = $idf_t_on_disk{$_}; |
987
|
|
|
|
|
|
|
} |
988
|
|
|
|
|
|
|
untie %idf_t_on_disk; |
989
|
|
|
|
|
|
|
eval { |
990
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs} = retrieve( $self->{_tkt_doc_vecs_db} ); |
991
|
|
|
|
|
|
|
}; |
992
|
|
|
|
|
|
|
if ($@) { |
993
|
|
|
|
|
|
|
print "Something went wrong with retrieval of ticket doc vectors: $@"; |
994
|
|
|
|
|
|
|
} |
995
|
|
|
|
|
|
|
eval { |
996
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs_normed} = retrieve( $self->{_tkt_doc_vecs_normed_db} ); |
997
|
|
|
|
|
|
|
}; |
998
|
|
|
|
|
|
|
if ($@) { |
999
|
|
|
|
|
|
|
print "Something went wrong with retrieval of normed ticket doc vectors: $@"; |
1000
|
|
|
|
|
|
|
} |
1001
|
|
|
|
|
|
|
eval { |
1002
|
|
|
|
|
|
|
$self->{_inverted_index} = |
1003
|
|
|
|
|
|
|
retrieve( $self->{_inverted_index_db} ); |
1004
|
|
|
|
|
|
|
}; |
1005
|
|
|
|
|
|
|
if ($@) { |
1006
|
|
|
|
|
|
|
print "Something went wrong with retrieval of inverted_index: $@"; |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
} |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
sub display_all_doc_vectors { |
1011
|
|
|
|
|
|
|
my $self = shift; |
1012
|
|
|
|
|
|
|
die "Ticket doc vectors not yet constructed" |
1013
|
|
|
|
|
|
|
unless keys %{$self->{_tkt_doc_vecs}}; |
1014
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs}}) { |
1015
|
|
|
|
|
|
|
$self->_display_a_single_ticket_doc_vector($ticket_id); |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
} |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
sub _display_a_single_ticket_doc_vector { |
1020
|
|
|
|
|
|
|
my $self = shift; |
1021
|
|
|
|
|
|
|
my $ticket_id = shift; |
1022
|
|
|
|
|
|
|
die "Ticket doc vectors not yet constructed" |
1023
|
|
|
|
|
|
|
unless keys %{$self->{_tkt_doc_vecs}}; |
1024
|
|
|
|
|
|
|
print "\n\nDISPLAYING Doc Vec FOR TICKET $ticket_id:\n\n"; |
1025
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_tkt_doc_vecs}->{$ticket_id}} ) { |
1026
|
|
|
|
|
|
|
print "$_ => $self->{_tkt_doc_vecs}->{$ticket_id}->{$_}\n"; |
1027
|
|
|
|
|
|
|
} |
1028
|
|
|
|
|
|
|
my $docvec_size = keys %{$self->{_tkt_doc_vecs}->{$ticket_id}}; |
1029
|
|
|
|
|
|
|
print "\nSize of vector for ticket $ticket_id: $docvec_size\n"; |
1030
|
|
|
|
|
|
|
} |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
sub display_all_normalized_doc_vectors { |
1033
|
|
|
|
|
|
|
my $self = shift; |
1034
|
|
|
|
|
|
|
die "normalized document vectors not yet constructed" |
1035
|
|
|
|
|
|
|
unless keys %{$self->{_tkt_doc_vecs_normed}}; |
1036
|
|
|
|
|
|
|
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_tkt_doc_vecs_normed}}) { |
1037
|
|
|
|
|
|
|
$self->_display_a_single_normalized_doc_vector($ticket_id); |
1038
|
|
|
|
|
|
|
} |
1039
|
|
|
|
|
|
|
} |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
sub _display_a_single_normalized_doc_vector { |
1042
|
|
|
|
|
|
|
my $self = shift; |
1043
|
|
|
|
|
|
|
my $ticket_id = shift; |
1044
|
|
|
|
|
|
|
die "Normalized ticket doc vectors not yet constructed" |
1045
|
|
|
|
|
|
|
unless keys %{$self->{_tkt_doc_vecs_normed}}; |
1046
|
|
|
|
|
|
|
print "\n\nDISPLAYING Normalized Doc Vec FOR TICKET $ticket_id:\n\n"; |
1047
|
|
|
|
|
|
|
foreach ( sort keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}} ) { |
1048
|
|
|
|
|
|
|
print "$_ => $self->{_tkt_doc_vecs_normed}->{$ticket_id}->{$_}\n"; |
1049
|
|
|
|
|
|
|
} |
1050
|
|
|
|
|
|
|
my $docvec_size = keys %{$self->{_tkt_doc_vecs_normed}->{$ticket_id}}; |
1051
|
|
|
|
|
|
|
print "\nSize of normalized vector for ticket $ticket_id: $docvec_size\n"; |
1052
|
|
|
|
|
|
|
} |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
########################## Display Inverted Index ########################### |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
sub display_inverted_index { |
1057
|
|
|
|
|
|
|
my $self = shift; |
1058
|
|
|
|
|
|
|
print "\n\nDisplaying inverted index:\n\n"; |
1059
|
|
|
|
|
|
|
foreach my $word (sort keys %{$self->{_vocab_hist}}) { |
1060
|
|
|
|
|
|
|
$self->display_inverted_index_for_given_word($word); |
1061
|
|
|
|
|
|
|
} |
1062
|
|
|
|
|
|
|
} |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
sub display_inverted_index_for_given_word { |
1065
|
|
|
|
|
|
|
my $self = shift; |
1066
|
|
|
|
|
|
|
my $word = shift; |
1067
|
|
|
|
|
|
|
defined $self->{_inverted_index}->{$word} ? |
1068
|
|
|
|
|
|
|
print "$word => @{$self->{_inverted_index}->{$word}}\n" : |
1069
|
|
|
|
|
|
|
die "Something is wrong with your inverted index\n"; |
1070
|
|
|
|
|
|
|
} |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
sub display_inverted_index_for_given_query { |
1073
|
|
|
|
|
|
|
my $self = shift; |
1074
|
|
|
|
|
|
|
my $query_ticket_id = shift; |
1075
|
|
|
|
|
|
|
my $query_record = $self->{_stemmed_tkts_by_ids}->{$query_ticket_id}; |
1076
|
|
|
|
|
|
|
my @query_words = grep $_, split /\s+/, $query_record; |
1077
|
|
|
|
|
|
|
foreach my $qword (@query_words) { |
1078
|
|
|
|
|
|
|
my $idf_t = $self->{_idf_t}->{$qword}; |
1079
|
|
|
|
|
|
|
my @relevant_tickets = @{$self->{_inverted_index}->{$qword}}; |
1080
|
|
|
|
|
|
|
print "\n$qword ($idf_t) ===> @relevant_tickets\n\n"; |
1081
|
|
|
|
|
|
|
} |
1082
|
|
|
|
|
|
|
} |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
############# Retrieve Most Similar Tickets with VSM Model ################### |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
sub retrieve_similar_tickets_with_vsm { |
1087
|
|
|
|
|
|
|
my $self = shift; |
1088
|
|
|
|
|
|
|
$self->{_query_ticket_id} = shift; |
1089
|
|
|
|
|
|
|
die "\nFirst generate normalized doc vectors for tickets before you can call retrieve with vsm function()\n" |
1090
|
|
|
|
|
|
|
unless scalar(keys %{$self->{_vocab_hist}}) |
1091
|
|
|
|
|
|
|
&& scalar(keys %{$self->{_tkt_doc_vecs_normed}}); |
1092
|
|
|
|
|
|
|
print "\nCalculating the similarity set for query ticket $self->{_query_ticket_id}\n\n"; |
1093
|
|
|
|
|
|
|
my $query_record = $self->{_stemmed_tkts_by_ids}->{$self->{_query_ticket_id}}; |
1094
|
|
|
|
|
|
|
my @query_words = grep $_, split /\s+/, $query_record; |
1095
|
|
|
|
|
|
|
my %relevant_tickets_set; |
1096
|
|
|
|
|
|
|
die "\n\nYou did not set a value for the constructor parameter min_idf_threshold -- " |
1097
|
|
|
|
|
|
|
unless $self->{_min_idf_threshold}; |
1098
|
|
|
|
|
|
|
foreach my $qword (@query_words) { |
1099
|
|
|
|
|
|
|
map {$relevant_tickets_set{$_} = 1} @{$self->{_inverted_index}->{$qword}} |
1100
|
|
|
|
|
|
|
if $self->{_idf_t}->{$qword} > $self->{_min_idf_threshold}; |
1101
|
|
|
|
|
|
|
} |
1102
|
|
|
|
|
|
|
my @relevant_tickets = sort {$a <=> $b} keys %relevant_tickets_set; |
1103
|
|
|
|
|
|
|
print "The relevant tickets for query: @relevant_tickets" |
1104
|
|
|
|
|
|
|
if $self->{_debug3}; |
1105
|
|
|
|
|
|
|
my $num_relevant_tkts = @relevant_tickets; |
1106
|
|
|
|
|
|
|
print "\nThe number of tickets relevant to the query: $num_relevant_tkts\n\n"; |
1107
|
|
|
|
|
|
|
my %retrievals; |
1108
|
|
|
|
|
|
|
my $rank = 0; |
1109
|
|
|
|
|
|
|
foreach (sort {$self->_doc_vec_comparator} @relevant_tickets ) { |
1110
|
|
|
|
|
|
|
$retrievals{$_} = $self->_similarity_to_query_ticket($_); |
1111
|
|
|
|
|
|
|
$rank++; |
1112
|
|
|
|
|
|
|
last if $rank == $self->{_how_many_retrievals}; |
1113
|
|
|
|
|
|
|
} |
1114
|
|
|
|
|
|
|
if ($self->{_debug3}) { |
1115
|
|
|
|
|
|
|
print "\n\nShowing the VSM retrievals and the similarity scores:\n\n"; |
1116
|
|
|
|
|
|
|
foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) { |
1117
|
|
|
|
|
|
|
print "$_ => $retrievals{$_}\n"; |
1118
|
|
|
|
|
|
|
} |
1119
|
|
|
|
|
|
|
} |
1120
|
|
|
|
|
|
|
return \%retrievals; |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
sub _doc_vec_comparator { |
1124
|
|
|
|
|
|
|
my $self = shift; |
1125
|
|
|
|
|
|
|
my %query_ticket_data_normed = |
1126
|
|
|
|
|
|
|
%{$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}}; |
1127
|
|
|
|
|
|
|
my $vec1_hash_ref = $self->{_tkt_doc_vecs_normed}->{$a}; |
1128
|
|
|
|
|
|
|
my $vec2_hash_ref = $self->{_tkt_doc_vecs_normed}->{$b}; |
1129
|
|
|
|
|
|
|
my @vec1 = (); |
1130
|
|
|
|
|
|
|
my @vec2 = (); |
1131
|
|
|
|
|
|
|
my @qvec = (); |
1132
|
|
|
|
|
|
|
foreach my $word (sort keys %{$self->{_vocab_hist}}) { |
1133
|
|
|
|
|
|
|
push @vec1, $vec1_hash_ref->{$word}; |
1134
|
|
|
|
|
|
|
push @vec2, $vec2_hash_ref->{$word}; |
1135
|
|
|
|
|
|
|
push @qvec, $query_ticket_data_normed{$word}; |
1136
|
|
|
|
|
|
|
} |
1137
|
|
|
|
|
|
|
my $vec1_mag = _vec_magnitude(\@vec1); |
1138
|
|
|
|
|
|
|
my $vec2_mag = _vec_magnitude(\@vec2); |
1139
|
|
|
|
|
|
|
my $qvec_mag = _vec_magnitude(\@qvec); |
1140
|
|
|
|
|
|
|
my $product1 = _vec_scalar_product(\@vec1, \@qvec); |
1141
|
|
|
|
|
|
|
$product1 /= $vec1_mag * $qvec_mag; |
1142
|
|
|
|
|
|
|
my $product2 = _vec_scalar_product(\@vec2, \@qvec); |
1143
|
|
|
|
|
|
|
$product2 /= $vec2_mag * $qvec_mag; |
1144
|
|
|
|
|
|
|
return 1 if $product1 < $product2; |
1145
|
|
|
|
|
|
|
return 0 if $product1 == $product2; |
1146
|
|
|
|
|
|
|
return -1 if $product1 > $product2; |
1147
|
|
|
|
|
|
|
} |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
sub _similarity_to_query_ticket { |
1150
|
|
|
|
|
|
|
my $self = shift; |
1151
|
|
|
|
|
|
|
my $ticket_id = shift; |
1152
|
|
|
|
|
|
|
my $ticket_data_normed = $self->{_tkt_doc_vecs_normed}->{$ticket_id}; |
1153
|
|
|
|
|
|
|
my @vec = (); |
1154
|
|
|
|
|
|
|
my @qvec = (); |
1155
|
|
|
|
|
|
|
foreach my $word (sort keys %$ticket_data_normed) { |
1156
|
|
|
|
|
|
|
push @vec, $ticket_data_normed->{$word}; |
1157
|
|
|
|
|
|
|
push @qvec, |
1158
|
|
|
|
|
|
|
$self->{_tkt_doc_vecs_normed}->{$self->{_query_ticket_id}}->{$word}; |
1159
|
|
|
|
|
|
|
} |
1160
|
|
|
|
|
|
|
my $vec_mag = _vec_magnitude(\@vec); |
1161
|
|
|
|
|
|
|
my $qvec_mag = _vec_magnitude(\@qvec); |
1162
|
|
|
|
|
|
|
die "\nThe query ticket appears to be empty\n" if $qvec_mag == 0; |
1163
|
|
|
|
|
|
|
my $product = _vec_scalar_product(\@vec, \@qvec); |
1164
|
|
|
|
|
|
|
$product /= $vec_mag * $qvec_mag; |
1165
|
|
|
|
|
|
|
return $product; |
1166
|
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
######################## Utility Subroutines ########################## |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
sub _simple_stemmer { |
1172
|
|
|
|
|
|
|
my $word = shift; |
1173
|
|
|
|
|
|
|
my $debug = shift; |
1174
|
|
|
|
|
|
|
print "\nStemming the word: $word\n" if $debug; |
1175
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]t)ted$/$1/i; |
1176
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]t)ting$/$1/i; |
1177
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]l)ling$/$1/i; |
1178
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]g)ging$/$1/i; |
1179
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]ll)ed$/$1/i; |
1180
|
|
|
|
|
|
|
$word =~ s/(.*[a-z][^aeious])s$/$1/i; |
1181
|
|
|
|
|
|
|
$word =~ s/(.*[a-z])ies$/$1y/i; |
1182
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]s)es$/$1/i; |
1183
|
|
|
|
|
|
|
$word =~ s/(.*[a-z][ck])es$/$1e/i; |
1184
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)tions$/$1tion/i; |
1185
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)mming$/$1m/i; |
1186
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+[^rl])ing$/$1/i; |
1187
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+o[sn])ing$/$1e/i; |
1188
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)tices$/$1tex/i; |
1189
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)pes$/$1pe/i; |
1190
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)sed$/$1se/i; |
1191
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)ed$/$1/i; |
1192
|
|
|
|
|
|
|
$word =~ s/(.*[a-z]+)tation$/$1t/i; |
1193
|
|
|
|
|
|
|
print "Stemmed word: $word\n\n" if $debug; |
1194
|
|
|
|
|
|
|
return $word; |
1195
|
|
|
|
|
|
|
} |
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
sub _exists { |
1198
|
|
|
|
|
|
|
my $element = shift; |
1199
|
|
|
|
|
|
|
my $array = shift; |
1200
|
|
|
|
|
|
|
my %hash; |
1201
|
|
|
|
|
|
|
for my $item (@$array) { |
1202
|
|
|
|
|
|
|
$hash{$item} = 1; |
1203
|
|
|
|
|
|
|
} |
1204
|
|
|
|
|
|
|
return exists $hash{$element}; |
1205
|
|
|
|
|
|
|
} |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
sub _fetch_words_from_file { |
1208
|
|
|
|
|
|
|
my $file = shift; |
1209
|
|
|
|
|
|
|
my @words; |
1210
|
|
|
|
|
|
|
open( IN, "$file" ) or die "unable to open the file $file: $!"; |
1211
|
|
|
|
|
|
|
while () { |
1212
|
|
|
|
|
|
|
next if /^#/; |
1213
|
|
|
|
|
|
|
next if /^[ ]*\r?\n?$/; |
1214
|
|
|
|
|
|
|
$_ =~ s/\r?\n?$//; |
1215
|
|
|
|
|
|
|
my @how_many_in_line = grep $_, split /\s+/, $_; |
1216
|
|
|
|
|
|
|
die "File $file: Exactly one word allowed in each line -- " |
1217
|
|
|
|
|
|
|
unless @how_many_in_line == 1; |
1218
|
|
|
|
|
|
|
push @words, $_; |
1219
|
|
|
|
|
|
|
} |
1220
|
|
|
|
|
|
|
close IN; |
1221
|
|
|
|
|
|
|
return \@words; |
1222
|
|
|
|
|
|
|
} |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
sub _fetch_word_pairs_from_file { |
1225
|
|
|
|
|
|
|
my $file = shift; |
1226
|
|
|
|
|
|
|
my @word_pairs; |
1227
|
|
|
|
|
|
|
open( IN, "$file" ) or die "unable to open the file $file: $!"; |
1228
|
|
|
|
|
|
|
while () { |
1229
|
|
|
|
|
|
|
next if /^#/; |
1230
|
|
|
|
|
|
|
next if /^[ ]*$/; |
1231
|
|
|
|
|
|
|
chomp; |
1232
|
|
|
|
|
|
|
my @how_many_in_line = grep $_, split /\s+/, $_; |
1233
|
|
|
|
|
|
|
die "File: $file --- Exactly two words must be in each non-comment or not-empty line -- " |
1234
|
|
|
|
|
|
|
unless @how_many_in_line == 2; |
1235
|
|
|
|
|
|
|
push @word_pairs, $_; |
1236
|
|
|
|
|
|
|
} |
1237
|
|
|
|
|
|
|
close IN; |
1238
|
|
|
|
|
|
|
return \@word_pairs; |
1239
|
|
|
|
|
|
|
} |
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
sub _get_rid_of_wide_chars { |
1242
|
|
|
|
|
|
|
my $string = shift; |
1243
|
|
|
|
|
|
|
$string =~ s/[^[:ascii:]]+//g; |
1244
|
|
|
|
|
|
|
# $string =~ s/\x{FEFF}//g; to get rid of wide characters |
1245
|
|
|
|
|
|
|
return $string; |
1246
|
|
|
|
|
|
|
} |
1247
|
|
|
|
|
|
|
|
1248
|
|
|
|
|
|
|
sub _find_index_for_given_element { |
1249
|
|
|
|
|
|
|
my $ele = shift; |
1250
|
|
|
|
|
|
|
my $array_ref = shift; |
1251
|
|
|
|
|
|
|
foreach my $i (0..@{$array_ref}-1) { |
1252
|
|
|
|
|
|
|
return $i if $ele == $array_ref->[$i]; |
1253
|
|
|
|
|
|
|
} |
1254
|
|
|
|
|
|
|
} |
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
sub _check_for_illegal_params { |
1257
|
|
|
|
|
|
|
my @params = @_; |
1258
|
|
|
|
|
|
|
my @legal_params = qw / excel_filename |
1259
|
|
|
|
|
|
|
which_worksheet |
1260
|
|
|
|
|
|
|
raw_tickets_db |
1261
|
|
|
|
|
|
|
processed_tickets_db |
1262
|
|
|
|
|
|
|
stemmed_tickets_db |
1263
|
|
|
|
|
|
|
inverted_index_db |
1264
|
|
|
|
|
|
|
tickets_vocab_db |
1265
|
|
|
|
|
|
|
idf_db |
1266
|
|
|
|
|
|
|
tkt_doc_vecs_db |
1267
|
|
|
|
|
|
|
tkt_doc_vecs_normed_db |
1268
|
|
|
|
|
|
|
synset_cache_db |
1269
|
|
|
|
|
|
|
want_synset_caching |
1270
|
|
|
|
|
|
|
add_synsets_to_tickets |
1271
|
|
|
|
|
|
|
clustering_fieldname |
1272
|
|
|
|
|
|
|
min_word_length |
1273
|
|
|
|
|
|
|
min_idf_threshold |
1274
|
|
|
|
|
|
|
max_num_syn_words |
1275
|
|
|
|
|
|
|
stop_words_file |
1276
|
|
|
|
|
|
|
misspelled_words_file |
1277
|
|
|
|
|
|
|
unique_id_fieldname |
1278
|
|
|
|
|
|
|
want_stemming |
1279
|
|
|
|
|
|
|
how_many_retrievals |
1280
|
|
|
|
|
|
|
debug1 |
1281
|
|
|
|
|
|
|
debug2 |
1282
|
|
|
|
|
|
|
debug3 |
1283
|
|
|
|
|
|
|
/; |
1284
|
|
|
|
|
|
|
my $found_match_flag; |
1285
|
|
|
|
|
|
|
foreach my $param (@params) { |
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
foreach my $legal (@legal_params) { |
1288
|
|
|
|
|
|
|
$found_match_flag = 0; |
1289
|
|
|
|
|
|
|
if ($param eq $legal) { |
1290
|
|
|
|
|
|
|
$found_match_flag = 1; |
1291
|
|
|
|
|
|
|
last; |
1292
|
|
|
|
|
|
|
} |
1293
|
|
|
|
|
|
|
} |
1294
|
|
|
|
|
|
|
last if $found_match_flag == 0; |
1295
|
|
|
|
|
|
|
} |
1296
|
|
|
|
|
|
|
return $found_match_flag; |
1297
|
|
|
|
|
|
|
} |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
# Meant only for an un-nested hash: |
1300
|
|
|
|
|
|
|
sub _deep_copy_hash { |
1301
|
|
|
|
|
|
|
my $ref_in = shift; |
1302
|
|
|
|
|
|
|
my $ref_out = {}; |
1303
|
|
|
|
|
|
|
foreach ( keys %{$ref_in} ) { |
1304
|
|
|
|
|
|
|
$ref_out->{$_} = $ref_in->{$_}; |
1305
|
|
|
|
|
|
|
} |
1306
|
|
|
|
|
|
|
return $ref_out; |
1307
|
|
|
|
|
|
|
} |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
# from perl docs: |
1310
|
|
|
|
|
|
|
sub _fisher_yates_shuffle { |
1311
|
|
|
|
|
|
|
my $arr = shift; |
1312
|
|
|
|
|
|
|
my $i = @$arr; |
1313
|
|
|
|
|
|
|
while (--$i) { |
1314
|
|
|
|
|
|
|
my $j = int rand( $i + 1 ); |
1315
|
|
|
|
|
|
|
@$arr[$i, $j] = @$arr[$j, $i]; |
1316
|
|
|
|
|
|
|
} |
1317
|
|
|
|
|
|
|
} |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
sub _vec_scalar_product { |
1320
|
|
|
|
|
|
|
my $vec1 = shift; |
1321
|
|
|
|
|
|
|
my $vec2 = shift; |
1322
|
|
|
|
|
|
|
die "Something is wrong --- the two vectors are of unequal length" |
1323
|
|
|
|
|
|
|
unless @$vec1 == @$vec2; |
1324
|
|
|
|
|
|
|
my $product; |
1325
|
|
|
|
|
|
|
for my $i (0..@$vec1-1) { |
1326
|
|
|
|
|
|
|
$product += $vec1->[$i] * $vec2->[$i]; |
1327
|
|
|
|
|
|
|
} |
1328
|
|
|
|
|
|
|
return $product; |
1329
|
|
|
|
|
|
|
} |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
sub _vec_magnitude { |
1332
|
|
|
|
|
|
|
my $vec = shift; |
1333
|
|
|
|
|
|
|
my $mag_squared = 0; |
1334
|
|
|
|
|
|
|
foreach my $num (@$vec) { |
1335
|
|
|
|
|
|
|
$mag_squared += $num ** 2; |
1336
|
|
|
|
|
|
|
} |
1337
|
|
|
|
|
|
|
return sqrt $mag_squared; |
1338
|
|
|
|
|
|
|
} |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
1; |
1341
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
__END__ |