line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl -w |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# Declaring the Package for the module. |
4
|
|
|
|
|
|
|
package Text::SenseClusters::LabelEvaluation::ConfusionMatrixTotalCalc; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1811
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
47
|
|
7
|
1
|
|
|
1
|
|
6
|
use encoding "utf-8"; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# The following two lines will make this module inherit from the Exporter Class. |
10
|
|
|
|
|
|
|
require Exporter; |
11
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
####################################################################################################################### |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 Name |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Text::SenseClusters::LabelEvaluation::ConfusionMatrixTotalCalc - Module responsible for processing of decision matrix. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 DESCRIPTION |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
This module provides two functions. The first function calculates the probability |
24
|
|
|
|
|
|
|
decision matrix from the scores of the original decision matrix. The second |
25
|
|
|
|
|
|
|
function will then use the new decision matrix to decide whether labels are |
26
|
|
|
|
|
|
|
appropriately assigned or not. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=cut |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
########################################################################################## |
32
|
|
|
|
|
|
|
=pod |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 function: printCalculatedScoreMatrix |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
The following function is responsible for printing the calculated score |
37
|
|
|
|
|
|
|
matrix from the decision matrix. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
@argument1 : outputFileHandle: DataType(File Handler) |
40
|
|
|
|
|
|
|
This is the file handle used for defining where to print |
41
|
|
|
|
|
|
|
the output message/statements of this module. |
42
|
|
|
|
|
|
|
Its default value is: STDERR. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
@argument2 : clusterNameArrayRef: DataType(Reference_Of_Array) |
45
|
|
|
|
|
|
|
Reference to Array containing Cluster Name. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
@argument3 : standardTermsArrayRef: DataType(Reference_Of_Array) |
48
|
|
|
|
|
|
|
Reference to Array containing Standard terms. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
@argument4 : hashForClusterTopicScoreRef: DataType(Reference_Of_Hash) |
51
|
|
|
|
|
|
|
Reference to hash containing Cluster Name, corresponding |
52
|
|
|
|
|
|
|
StandardTopic and its score. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
@argument5 : topicTotalSumHashRef: DataType(Reference_Of_Hash) |
55
|
|
|
|
|
|
|
Hash which will contain the total score for a topic |
56
|
|
|
|
|
|
|
against each clusters. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
@argument6 : clusterTotalSumHashRef: DataType(Reference_Of_Hash) |
59
|
|
|
|
|
|
|
Hash which will contain the total score for a cluster |
60
|
|
|
|
|
|
|
against each topics. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
@argument7 : $isDecisionMatrixDebugOn: DataType(number 0 or 1) |
63
|
|
|
|
|
|
|
Verbose:: This decides whether to print detailed output or not. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
@return : SimilarityScore |
67
|
|
|
|
|
|
|
This indicates the similarity score of labels and actual |
68
|
|
|
|
|
|
|
topics which are correctly identified by SenseClusters |
69
|
|
|
|
|
|
|
or similar application. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
@description : |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
This module is responsible for the decision matrix, which is identified as: |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
Calculated Decision MATRIX: |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
========================================================= |
78
|
|
|
|
|
|
|
| Cluster0 | Cluster1 | |
79
|
|
|
|
|
|
|
--------------------------------------------------------- |
80
|
|
|
|
|
|
|
Bill Clinton: | 0.478 | 0.522 | |
81
|
|
|
|
|
|
|
--------------------------------------------------------- |
82
|
|
|
|
|
|
|
--------------------------------------------------------- |
83
|
|
|
|
|
|
|
Tony Blair: | 0.625 | 0.375 | |
84
|
|
|
|
|
|
|
--------------------------------------------------------- |
85
|
|
|
|
|
|
|
========================================================= |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Where, 1) Cluster0, Cluster1 are Cluster Names, (Column Header). |
89
|
|
|
|
|
|
|
2) Bill Clinton, Tony Blair are Standard Topics, (Row Header). |
90
|
|
|
|
|
|
|
3) Cell content is the probability measure which indicates |
91
|
|
|
|
|
|
|
likelihood of a cluster's label against a Topic. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Steps: |
95
|
|
|
|
|
|
|
1. First, it will iterate through hash, '%hashForClusterTopicScore'. |
96
|
|
|
|
|
|
|
2. It will divide the cluster-topic overlapping score with the total |
97
|
|
|
|
|
|
|
count value of the decision matrix. |
98
|
|
|
|
|
|
|
3. This will give the normalized score. |
99
|
|
|
|
|
|
|
4. Based on user input on Verbose, it will display the normalized |
100
|
|
|
|
|
|
|
decision matrix. |
101
|
|
|
|
|
|
|
5. It will then call the function 'concludingFromDecisionMatrix' |
102
|
|
|
|
|
|
|
which will use the normalized decision matrix to conclude |
103
|
|
|
|
|
|
|
a) which cluster's labels is matching with which Gold-Standard |
104
|
|
|
|
|
|
|
-topic's data. |
105
|
|
|
|
|
|
|
b) which Gold-Standard-topic's data label is matching with |
106
|
|
|
|
|
|
|
which cluster's labels. |
107
|
|
|
|
|
|
|
6. Finally, it will compare the Clusterwise results with Topicwise |
108
|
|
|
|
|
|
|
results to conclude final cluster-topic match results along with |
109
|
|
|
|
|
|
|
their matching score. |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
112
|
|
|
|
|
|
|
########################################################################################## |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub printCalculatedScoreMatrix{

    # Purpose:
    #   Normalizes every cell of the cluster/topic decision matrix by the grand
    #   total of the matrix, picks the best cluster for every topic (row-wise),
    #   asks concludingFromDecisionMatrix() for the best topic of every cluster
    #   (column-wise) and reports the cluster/topic pairs on which both views
    #   agree.
    #
    # Arguments (positional):
    #   1. $outputFileHandle            - handle every message is printed to.
    #                                     NOTE: it is closed before returning.
    #   2. $clusterNameArrayRef         - ref to array of cluster names (only
    #                                     used for the verbose column header).
    #   3. $standardTermsArrayRef       - ref to array of gold-standard topics.
    #   4. $hashForClusterTopicScoreRef - ref to hash:
    #                                     cluster key => { topic key => score }.
    #   5. $topicTotalSumHashRef        - ref to hash: topic key => total score.
    #   6. $clusterTotalSumHashRef      - ref to hash: cluster key => total score.
    #   7. $isDecisionMatrixDebugOn     - 1 prints the detailed matrix.
    #
    # Topic keys are looked up as "temp_<Topic_Name>.txt" (built below from the
    # topic name). Cluster keys presumably follow the same "temp_<name>.txt"
    # file-name pattern -- TODO confirm against the caller.
    #
    # Returns:
    #   The product of the normalized scores of all matched cluster/topic
    #   pairs, or 0 when no pair matched.
    my ($outputFileHandle, $clusterNameArrayRef, $standardTermsArrayRef,
        $hashForClusterTopicScoreRef, $topicTotalSumHashRef,
        $clusterTotalSumHashRef, $isDecisionMatrixDebugOn) = @_;

    # Shallow copies, so that nothing done here adds or removes keys in the
    # caller's top-level data.
    my @clusterNameArray         = @$clusterNameArrayRef;
    my @standardTermsArray       = @$standardTermsArrayRef;
    my %hashForClusterTopicScore = %$hashForClusterTopicScoreRef;
    my %topicTotalSumHash        = %$topicTotalSumHashRef;
    my %clusterTotalSumHash      = %$clusterTotalSumHashRef;

    # An undefined flag simply means "not verbose".
    my $debugOn = (defined $isDecisionMatrixDebugOn
                    && $isDecisionMatrixDebugOn == 1) ? 1 : 0;

    # Grand total of the decision matrix = sum of all the cluster totals.
    my $totalValueOfDecisionScore = 0;
    foreach my $clusterTotal (values %clusterTotalSumHash){
        $totalValueOfDecisionScore += $clusterTotal if defined $clusterTotal;
    }

    # Verbose only: title and column header (one column per cluster).
    if($debugOn){
        print $outputFileHandle "\n\n\n\nDecision MATRIX (Probability)::";
        print $outputFileHandle "\n===========================================================".
            "=================================================================\n\t\t";
        foreach my $clusterName (@clusterNameArray){
            print $outputFileHandle "\t|\t$clusterName ";
        }
    }

    # Row-wise conclusion: topic key => "<best cluster> \t,\t <best score>".
    my %directTopicClusterHash = ();

    foreach my $topicName (@standardTermsArray){

        # Work on a copy so the caller's array element is never modified.
        my $topicNameLabel = $topicName;

        # Collapse runs of white space and trim both ends.
        $topicNameLabel =~ s/\s+/ /g;
        $topicNameLabel =~ s/^\s+|\s+$//g;

        if($debugOn){
            print $outputFileHandle "\n----------------------------------------------------------".
                "---------------------------------------------------------------";
            print $outputFileHandle "\n\t$topicNameLabel: ";
        }

        # The scores are stored under the file-name form of the topic.
        $topicNameLabel =~ s/\s+/_/g;
        $topicNameLabel = "temp_$topicNameLabel.txt";

        # A row can only be normalized when the topic has a non-zero total and
        # the matrix total itself is non-zero (otherwise we would divide by 0).
        my $topicTotal   = $topicTotalSumHash{$topicNameLabel};
        my $canNormalize = (defined $topicTotal && $topicTotal != 0
                            && $totalValueOfDecisionScore != 0) ? 1 : 0;

        # Best score of this row and the cluster it belongs to.
        my $rowMaxScore       = 0;
        my $clusterNameDirect = "";

        foreach my $sortedOuterKey (sort keys %hashForClusterTopicScore){

            my $tempRowScore = 0;
            if($canNormalize){
                my $topicScoresRef = $hashForClusterTopicScore{$sortedOuterKey};
                my $rawScore = $topicScoresRef ? $topicScoresRef->{$topicNameLabel} : undef;
                $rawScore = 0 unless defined $rawScore;
                $tempRowScore = $rawScore / $totalValueOfDecisionScore;
            }

            # Round to three decimal places.
            $tempRowScore = sprintf("%.3f", $tempRowScore);

            if($debugOn){
                print $outputFileHandle "\t|\t$tempRowScore";
            }

            # Strictly greater: on a tie the first cluster (sorted order) wins.
            if($rowMaxScore < $tempRowScore){
                $rowMaxScore       = $tempRowScore;
                $clusterNameDirect = $sortedOuterKey;

                # Strip the file-name decoration. The prefix and the extension
                # are anchored and the dot is escaped, so a name that merely
                # contains "txt" or "temp_" is left intact.
                $clusterNameDirect =~ s/^temp_//;
                $clusterNameDirect =~ s/\.txt\z//;
            }
        }

        $directTopicClusterHash{$topicNameLabel} = "$clusterNameDirect \t,\t $rowMaxScore";

        if($debugOn){
            print $outputFileHandle "\n-------------------------------------------------------".
                "-------------------------------------------------------------------";
        }
    }

    # Column-wise conclusion (best topic per cluster) comes from the helper;
    # it also hands back the row-wise hash built above.
    my %directClusterTopicHash = ();
    my ($directClusterTopicHashRef, $directTopicClusterHashRef)
        = concludingFromDecisionMatrix($outputFileHandle,
            \%hashForClusterTopicScore, \%topicTotalSumHash,
            \%clusterTotalSumHash, \%directClusterTopicHash,
            \%directTopicClusterHash, $totalValueOfDecisionScore,
            $isDecisionMatrixDebugOn);
    %directClusterTopicHash = %$directClusterTopicHashRef;
    %directTopicClusterHash = %$directTopicClusterHashRef;

    # A cluster/topic pair is a clear winner when the cluster's best topic and
    # the topic's best cluster point at each other.
    print $outputFileHandle "\n\n\n\n Matched:: \t";

    my $totalTopicsMatched = 0;
    my $totalTopicCount    = keys(%directClusterTopicHash);
    my $matchedScore       = 1;

    # Parse the row-wise conclusions once instead of once per cluster.
    # Each entry: [ topic name, winning cluster name, winning score ].
    my @rowWinners = ();
    foreach my $topicKey (sort keys %directTopicClusterHash){
        my ($winningCluster, $winningScore)
            = split(/[\,]/, $directTopicClusterHash{$topicKey});
        $winningCluster = "" unless defined $winningCluster;
        $winningScore   = 0  unless defined $winningScore;
        $winningCluster =~ s/\s+//g;
        $winningScore   =~ s/\s+//g;

        my $plainTopicName = $topicKey;
        $plainTopicName =~ s/^temp_//;
        $plainTopicName =~ s/\.txt\z//;

        push @rowWinners, [$plainTopicName, $winningCluster, $winningScore];
    }

    foreach my $clusterKey (sort keys %directClusterTopicHash){

        # Value format: "<topic file name> \t,\t <score>"; only the name is needed.
        my ($winningTopic) = split(/[\,]/, $directClusterTopicHash{$clusterKey});
        $winningTopic = "" unless defined $winningTopic;
        $winningTopic =~ s/\s+//g;
        $winningTopic =~ s/^temp_//;
        $winningTopic =~ s/\.txt\z//;

        my $plainClusterName = $clusterKey;
        $plainClusterName =~ s/^temp_//;
        $plainClusterName =~ s/\.txt\z//;

        foreach my $rowWinner (@rowWinners){
            my ($plainTopicName, $winningCluster, $winningScore) = @$rowWinner;

            next unless $plainClusterName eq $winningCluster
                     && $plainTopicName   eq $winningTopic;

            print $outputFileHandle "\n \t$plainClusterName \t:\t$plainTopicName";
            $totalTopicsMatched++;
            $matchedScore *= $winningScore;
        }
    }

    # Without any match the (empty) product would stay at 1, i.e. the best
    # possible score; report 0 instead.
    $matchedScore = 0 if $totalTopicsMatched == 0;

    print $outputFileHandle "\n\n\nSuccessful labels verified $totalTopicsMatched out of $totalTopicCount";
    print $outputFileHandle "\nScore = $matchedScore";

    # The handle is closed here, as before; buffered write errors surface at
    # close time, so report them instead of ignoring them.
    close ($outputFileHandle)
        or warn "printCalculatedScoreMatrix: could not close the output handle: $!\n";

    return $matchedScore;
}
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
######################################################################################################### |
366
|
|
|
|
|
|
|
=pod |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head1 function: concludingFromDecisionMatrix |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
The following function is responsible for drawing conclusions (the best |
371
|
|
|
|
|
|
|
matching topic for each cluster) from the decision matrix. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
@argument1 : hashForClusterTopicScoreRef: DataType(Reference_Of_Hash) |
374
|
|
|
|
|
|
|
Reference to hash containing Cluster Name, corresponding |
375
|
|
|
|
|
|
|
StandardTopic and its score. |
376
|
|
|
|
|
|
|
@argument2 : topicTotalSumHashRef: DataType(Reference_Of_Hash) |
377
|
|
|
|
|
|
|
Hash which will contain the total score for a topic |
378
|
|
|
|
|
|
|
against each clusters. |
379
|
|
|
|
|
|
|
@argument3 : clusterTotalSumHashRef: DataType(Reference_Of_Hash) |
380
|
|
|
|
|
|
|
Hash which will contain the total score for a cluster |
381
|
|
|
|
|
|
|
against each topics. |
382
|
|
|
|
|
|
|
@argument4 : directClusterTopicHashRef: DataType(Reference_Of_Hash) |
383
|
|
|
|
|
|
|
HashOfHash to store conclusion of Direct calculation, |
384
|
|
|
|
|
|
|
row-wise i.e a topic (OuterKey) score against each |
385
|
|
|
|
|
|
|
cluster(InnerKey). |
386
|
|
|
|
|
|
|
@argument5 : directTopicClusterHashRef: DataType(Reference_Of_Hash) |
387
|
|
|
|
|
|
|
HashOfHash to store conclusion of Direct calculation, |
388
|
|
|
|
|
|
|
columnwise i.e a Cluster (OuterKey) scores against |
389
|
|
|
|
|
|
|
each topics(InnerKey). |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
@return1 : directClusterTopicHashRef: DataType(Reference_Of_Hash) |
393
|
|
|
|
|
|
|
HashOfHash which store conclusion of calculation, |
394
|
|
|
|
|
|
|
row-wise i.e a topic (OuterKey) score against each |
395
|
|
|
|
|
|
|
cluster(InnerKey). |
396
|
|
|
|
|
|
|
@return2 : directTopicClusterHashRef: DataType(Reference_Of_Hash) |
397
|
|
|
|
|
|
|
HashOfHash to store conclusion of calculation, |
398
|
|
|
|
|
|
|
columnwise i.e a Cluster (OuterKey) scores against |
399
|
|
|
|
|
|
|
each topics(InnerKey). |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
@description : |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
The following block of code is responsible for |
404
|
|
|
|
|
|
|
1. Calculating the probabilities (normalized value) of all the |
405
|
|
|
|
|
|
|
topic against a cluster. |
406
|
|
|
|
|
|
|
2. Choosing a topic which has the maximum probability (normali |
407
|
|
|
|
|
|
|
-zed value) value for the given cluster. |
408
|
|
|
|
|
|
|
3. In current approach, for calculating the probability (norm |
409
|
|
|
|
|
|
|
-alized value) we will divide the similarity score of a |
410
|
|
|
|
|
|
|
topic against a cluster with total similarity score of all |
411
|
|
|
|
|
|
|
the topics against all the cluster. |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
Future enhancement:: |
415
|
|
|
|
|
|
|
4. The above approach can be done in two way i.e. using the |
416
|
|
|
|
|
|
|
direct way as well as inverse way. |
417
|
|
|
|
|
|
|
5. In direct approach, for calculating the probability we |
418
|
|
|
|
|
|
|
will divide the similarity score of a topic against a |
419
|
|
|
|
|
|
|
cluster with total similarity score of all the topics |
420
|
|
|
|
|
|
|
against that cluster. |
421
|
|
|
|
|
|
|
6. In inverse approach, for calculating the probability we |
422
|
|
|
|
|
|
|
will divide the similarity score of a topic against a |
423
|
|
|
|
|
|
|
cluster with total similarity score of all the clusters |
424
|
|
|
|
|
|
|
against that topic. |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=cut |
427
|
|
|
|
|
|
|
######################################################################################################### |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
sub concludingFromDecisionMatrix{

    # Purpose:
    #   For every cluster, finds the topic with the highest normalized score
    #   (cell score divided by the grand total of the decision matrix) and
    #   records it as the column-wise conclusion for that cluster.
    #
    # Arguments (positional):
    #   1. $outputFileHandle            - handle used for the verbose output.
    #   2. $hashForClusterTopicScoreRef - ref to hash:
    #                                     cluster key => { topic key => score }.
    #   3. $topicTotalSumHashRef        - ref to hash: topic key => total score.
    #                                     Accepted for call compatibility; it
    #                                     does not influence the result.
    #   4. $clusterTotalSumHashRef      - ref to hash: cluster key => total score.
    #   5. $directClusterTopicHashRef   - ref to hash that receives, per cluster
    #                                     key, "temp_<topic>.txt \t,\t <score>".
    #   6. $directTopicClusterHashRef   - ref to the row-wise conclusions; it is
    #                                     only printed (verbose) and returned.
    #   7. $totalValueOfDecisionScore   - grand total of the decision matrix.
    #   8. $isDecisionMatrixDebugOn     - 1 prints both conclusion hashes.
    #
    # Returns:
    #   (\%directClusterTopicHash, \%directTopicClusterHash) -- references to
    #   copies made here, not to the hashes the caller passed in.
    my ($outputFileHandle, $hashForClusterTopicScoreRef, $topicTotalSumHashRef,
        $clusterTotalSumHashRef, $directClusterTopicHashRef,
        $directTopicClusterHashRef, $totalValueOfDecisionScore,
        $isDecisionMatrixDebugOn) = @_;

    # Shallow copies, so the caller's top-level hashes are never modified.
    my %hashForClusterTopicScore = %$hashForClusterTopicScoreRef;
    my %clusterTotalSumHash      = %$clusterTotalSumHashRef;
    my %directClusterTopicHash   = %$directClusterTopicHashRef;
    my %directTopicClusterHash   = %$directTopicClusterHashRef;

    # An undefined grand total is treated like 0 (nothing can be normalized).
    my $overallTotal = defined $totalValueOfDecisionScore
                        ? $totalValueOfDecisionScore : 0;

    foreach my $sortedOuterKey (sort keys %hashForClusterTopicScore){

        my $topicScoresRef = $hashForClusterTopicScore{$sortedOuterKey} || {};

        # Scores of a cluster can only be normalized when the cluster has a
        # non-zero total and the grand total is non-zero as well (otherwise
        # the division below would be a division by zero).
        my $clusterTotal = $clusterTotalSumHash{$sortedOuterKey};
        my $canNormalize = (defined $clusterTotal && $clusterTotal != 0
                            && $overallTotal != 0) ? 1 : 0;

        # Best score of this column and the topic it belongs to.
        my $colBasedMaxScore = 0;
        my $topicNameDirect  = "";

        foreach my $sortedInnerKey (sort keys %$topicScoresRef){

            my $tempColScore = 0;
            if($canNormalize){
                my $rawScore = $topicScoresRef->{$sortedInnerKey};
                $rawScore = 0 unless defined $rawScore;
                $tempColScore = $rawScore / $overallTotal;
            }

            # Round to three decimal places.
            $tempColScore = sprintf("%.3f", $tempColScore);

            # Strictly greater: on a tie the first topic (sorted order) wins.
            if($colBasedMaxScore < $tempColScore){
                $colBasedMaxScore = $tempColScore;
                $topicNameDirect  = $sortedInnerKey;

                # Strip the file-name decoration. The prefix and the extension
                # are anchored and the dot is escaped, so a name that merely
                # contains "txt" or "temp_" is left intact.
                $topicNameDirect =~ s/^temp_//;
                $topicNameDirect =~ s/\.txt\z//;
            }
        }

        # The winner is stored in file-name form again, together with its score.
        $directClusterTopicHash{$sortedOuterKey} =
            "temp_$topicNameDirect.txt \t,\t $colBasedMaxScore";
    }

    # Verbose only: dump both conclusion hashes.
    if(defined $isDecisionMatrixDebugOn && $isDecisionMatrixDebugOn == 1){
        print $outputFileHandle "\n=====================================================".
            "======================================================================\n";

        # NOTE(review): this module never loads
        # Text::SenseClusters::Wikipedia::PrintingHashData itself; the calls
        # below only work if the caller has already loaded it -- confirm the
        # package name is still current.
        print $outputFileHandle "\n Column-wise Conclusion::\t";
        Text::SenseClusters::Wikipedia::PrintingHashData::prinHashOfScore(
            \%directClusterTopicHash, $outputFileHandle);

        print $outputFileHandle "\n\n\n Row-wise Conclusion::\t";
        Text::SenseClusters::Wikipedia::PrintingHashData::prinHashOfScore(
            \%directTopicClusterHash, $outputFileHandle);
    }

    return(\%directClusterTopicHash,\%directTopicClusterHash);
}
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
####################################################################################################### |
575
|
|
|
|
|
|
|
=pod |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=head1 SEE ALSO |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/ |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
@Last modified by : Anand Jha |
584
|
|
|
|
|
|
|
@Last_Modified_Date : 24th Dec. 2012 |
585
|
|
|
|
|
|
|
@Modified Version : 1.6 |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=head1 AUTHORS |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
Ted Pedersen, University of Minnesota, Duluth |
591
|
|
|
|
|
|
|
tpederse at d.umn.edu |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
Anand Jha, University of Minnesota, Duluth |
594
|
|
|
|
|
|
|
jhaxx030 at d.umn.edu |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
Copyright (C) 2012 Ted Pedersen, Anand Jha |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
605
|
|
|
|
|
|
|
it under the terms of the GNU General Public License as published by |
606
|
|
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or |
607
|
|
|
|
|
|
|
(at your option) any later version. |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, |
610
|
|
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
611
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
612
|
|
|
|
|
|
|
GNU General Public License for more details. |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License |
615
|
|
|
|
|
|
|
along with this program; if not, write to: |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
The Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
619
|
|
|
|
|
|
|
Boston, MA 02111-1307 USA |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=cut |
623
|
|
|
|
|
|
|
####################################################################################################### |
624
|
|
|
|
|
|
|
# Making the default return statement as 1; |
625
|
|
|
|
|
|
|
# Reference : http://lists.netisland.net/archives/phlpm/phlpm-2001/msg00426.html |
626
|
|
|
|
|
|
|
1; |