line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl -w |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
package Text::SenseClusters::LabelEvaluation::ReadingFilesData; |
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
65546
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
81
|
|
6
|
2
|
|
|
2
|
|
2086
|
use encoding "utf-8"; |
|
2
|
|
|
|
|
48041
|
|
|
2
|
|
|
|
|
15
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
# The following two lines will make this module inherit from the Exporter Class. |
9
|
|
|
|
|
|
|
require Exporter; |
10
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
####################################################################################################################### |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Text::SenseClusters::LabelEvaluation::ReadingFilesData - Module for reading the data from a file as a single string object. |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
The following code snippet will show how to use this module. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
package Text::SenseClusters::LabelEvaluation::Test_ReadingFilesData; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Including the LabelEvaluation Module. |
25
|
|
|
|
|
|
|
use Text::SenseClusters::LabelEvaluation::ReadingFilesData; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Including the FileHandle module. |
28
|
|
|
|
|
|
|
use FileHandle; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# The following block-of-code, create a file and write the data into it. |
32
|
|
|
|
|
|
|
# At the end of this test program, we will delete that file. |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# File that will contain the topic information. |
35
|
|
|
|
|
|
|
my $topicFileName = "temp_TopicData.txt"; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Defining the file handle for the topic file. |
38
|
|
|
|
|
|
|
our $topicFileHandle = FileHandle->new(">$topicFileName"); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Writing into the Topic file. |
41
|
|
|
|
|
|
|
# Bill Clinton , Tony Blair |
42
|
|
|
|
|
|
|
print $topicFileHandle "Bill Clinton is an American politician who served as the 42nd President of". |
43
|
|
|
|
|
|
|
"the United States from 1993 to 2001. Inaugurated at age 46, he was the third-youngest president.". |
44
|
|
|
|
|
|
|
"He took office at the end of the Cold War, and was the first president of the baby boomer generation.". |
45
|
|
|
|
|
|
|
"Clinton has been described as a New Democrat. Many of his policies have been attributed to a centrist". |
46
|
|
|
|
|
|
|
"Third Way philosophy of governance. He is married to Hillary Rodham Clinton, who has served as the". |
47
|
|
|
|
|
|
|
"United States Secretary of State since 2009 and was a Senator from New York from 2001 to 2009.". |
48
|
|
|
|
|
|
|
"As Governor of Arkansas, Clinton overhauled the state's education system, and served as Chair ". |
49
|
|
|
|
|
|
|
"of the National Governors Association.Clinton was elected president in 1992, defeating incumbent". |
50
|
|
|
|
|
|
|
"president George H. W. Bush. The Congressional Budget Office reported a budget surplus between ". |
51
|
|
|
|
|
|
|
"the years 1998 and 2000, the last three years of Clinton's presidency. Since leaving office,". |
52
|
|
|
|
|
|
|
"Clinton has been rated highly in public opinion polls of U.S. presidents. \n"; |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Closing file handle. |
55
|
|
|
|
|
|
|
close($topicFileHandle); |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# END OF file creation block. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# The following code will call the readLinesFromTopicFile() function from the |
61
|
|
|
|
|
|
|
# ReadingFilesData modules. It will return the content of the file in a string. |
62
|
|
|
|
|
|
|
my $fileData = Text::SenseClusters::LabelEvaluation::ReadingFilesData::readLinesFromTopicFile( |
63
|
|
|
|
|
|
|
$topicFileName); |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Printing the content of the file. |
66
|
|
|
|
|
|
|
print "\n Data of the input file is $fileData \n"; |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# Deleting the temporary topic file. |
70
|
|
|
|
|
|
|
unlink $topicFileName or warn "Could not unlink $topicFileName: $!"; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=head1 DESCRIPTION |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
This module provides two functions. The first function reads the labelled |
76
|
|
|
|
|
|
|
data generated by SenseClusters and creates a hash from it. The data of the |
77
|
|
|
|
|
|
|
input file must match the format of label-file generated by SenseClusters. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
The second function reads a file into a string variable by removing all the |
81
|
|
|
|
|
|
|
newline characters from it. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
########################################################################################### |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head1 Function: readLinesFromClusterFile |
88
|
|
|
|
|
|
|
------------------------------------------------ |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
This function will read lines from the file containing the Labels of the |
91
|
|
|
|
|
|
|
Clusters and build a hash from them. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
@argument1 : Name of the cluster file. |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
@argument2 : Reference of Hash ($labelSenseClustersHash) which will hold |
96
|
|
|
|
|
|
|
the information in the following format: |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
For e.g.: Cluster0{ |
99
|
|
|
|
|
|
|
Descriptive => George Bush, Al Gore, White |
100
|
|
|
|
|
|
|
House, New York |
101
|
|
|
|
|
|
|
Discriminating => George Bush, York Times |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
Cluster1{ |
104
|
|
|
|
|
|
|
Descriptive => George Bush, BRITAIN London, |
105
|
|
|
|
|
|
|
Prime Minister |
106
|
|
|
|
|
|
|
Discriminating => BRITAIN London, Prime Minister |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
@return : It will return the reference of the Hash mentioned above: |
111
|
|
|
|
|
|
|
$labelSenseClustersHashRef. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
@description : |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
1. Read the file line by line. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
2. Ignore the lines which do not follow one of the following formats: |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Cluster 0 (Discriminating): George Bush, BRITAIN London |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
3. Create Key from the "Cluster # (Descriptive)" or "Cluster # (Discrim |
124
|
|
|
|
|
|
|
- inating)" as "OuterKey: Cluster#" "InnerKey: Descriptive". |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
4. Store the value of hash as the keywords similar to above example: |
127
|
|
|
|
|
|
|
for e.g: |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
$labelSenseClustersGlobalRef{Cluster1}{Discriminating} |
130
|
|
|
|
|
|
|
= "BRITAIN London, Prime Minister"; |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=cut |
134
|
|
|
|
|
|
|
########################################################################################### |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# readLinesFromClusterFile
#
# Reads a label file and stores every well-formed label line in a two-level
# hash keyed by cluster and label type.
#
# Arguments:
#   1. Name of the cluster (label) file.
#   2. Reference to a hash. Its top-level entries are copied into the result;
#      the copy is shallow, so nested hashes are shared with the caller's
#      hash. May be omitted, in which case the result starts empty.
#
# Returns:
#   Reference to a new hash of the form
#       { Cluster0 => { Descriptive => "...", Discriminating => "..." }, ... }
#
# Dies when the file cannot be opened.
#
# Accepted line format (all other lines are ignored):
#   Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
#   Cluster 0 (Discriminating): George Bush, BRITAIN London
sub readLinesFromClusterFile {

    # Name of the cluster file and the hash reference supplied by the caller.
    my $clusterFileName           = shift;
    my $labelSenseClustersHashRef = shift;

    # Work on a copy of the caller's hash; tolerate a missing reference.
    my %labelSenseClustersHash =
        defined $labelSenseClustersHashRef ? %$labelSenseClustersHashRef : ();

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a command, and the handle is private to this
    # call.
    open(my $clusterFileHandle, '<', $clusterFileName)
        or die "Cannot open cluster file '$clusterFileName': $!";

    # Read the file line by line until end of file.
    while (my $line = <$clusterFileHandle>) {

        # Remove the newline, then leading and trailing white space.
        chomp $line;
        $line =~ s/^\s+|\s+$//g;

        # Skip empty lines.
        next if $line eq '';

        # Split "Cluster 0 (Descriptive): keywords" at the colon. A line that
        # does not yield exactly two parts (no keywords after the colon, no
        # colon at all, or more than one colon) is ignored.
        my @lineArray = split(/:/, $line);
        next if scalar(@lineArray) != 2;

        # The key part must consist of exactly three white-space separated
        # words: "Cluster", the cluster number and the parenthesised type.
        my @keyArray = split(/\s+/, $lineArray[0]);
        next if scalar(@keyArray) != 3;

        # Outer key: "Cluster" joined with the number, e.g. "Cluster0".
        my $outerKey = $keyArray[0] . $keyArray[1];

        # Inner key: the label type with parentheses (and commas) removed,
        # e.g. "Descriptive" or "Discriminating".
        my $innerKey = $keyArray[2];
        $innerKey =~ s/[(,)]+//g;

        # The keywords are stored exactly as they follow the colon, so the
        # value keeps the white space that separated it from the colon.
        $labelSenseClustersHash{$outerKey}{$innerKey} = $lineArray[1];
    }

    # Close the file handle.
    close($clusterFileHandle);

    # Return the reference of the hash containing the label information.
    return \%labelSenseClustersHash;
}
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
########################################################################################## |
217
|
|
|
|
|
|
|
=head1 Function: readLinesFromTopicFile |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
------------------------------------------------ |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
This function will read lines from the topic file and return the list of all the |
222
|
|
|
|
|
|
|
topics. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
@argument1 : Name of the topicFile. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
@return : String containing the list of all the topics(labels) for |
227
|
|
|
|
|
|
|
the clusters. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
@description : |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
1. Read the file line by line. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
2. Remove the new line characters and making string variable which |
234
|
|
|
|
|
|
|
contains the list of all the topics. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |
237
|
|
|
|
|
|
|
########################################################################################## |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# readLinesFromTopicFile
#
# Reads a file and returns its content as one string with the line
# terminators removed; the lines are joined without any separator.
#
# Arguments:
#   1. Name of the topic file.
#
# Returns:
#   String holding the concatenation of all lines (empty string for an empty
#   file).
#
# Dies when the file cannot be opened.
sub readLinesFromTopicFile {

    # Name of the topic file supplied by the caller.
    my $topicFileName = shift;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a command, and the handle is private to this
    # call.
    open(my $topicFileHandle, '<', $topicFileName)
        or die "Cannot open topic file '$topicFileName': $!";

    # Accumulates the content of the file.
    my $topicData = "";

    # Read the file line by line until end of file.
    while (my $line = <$topicFileHandle>) {

        # Drop the newline and append the line to what was read so far.
        chomp $line;
        $topicData .= $line;
    }

    # Close the file handle.
    close($topicFileHandle);

    # Return the collected topic data.
    return $topicData;
}
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
####################################################################################################### |
272
|
|
|
|
|
|
|
=pod |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=head1 SEE ALSO |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/ |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
@Last modified by : Anand Jha |
281
|
|
|
|
|
|
|
@Last_Modified_Date : 24th Dec. 2012 |
282
|
|
|
|
|
|
|
@Modified Version : 1.6 |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=head1 AUTHORS |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
Ted Pedersen, University of Minnesota, Duluth |
287
|
|
|
|
|
|
|
tpederse at d.umn.edu |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
Anand Jha, University of Minnesota, Duluth |
290
|
|
|
|
|
|
|
jhaxx030 at d.umn.edu |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Copyright (C) 2012 Ted Pedersen, Anand Jha |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
301
|
|
|
|
|
|
|
it under the terms of the GNU General Public License as published by |
302
|
|
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or |
303
|
|
|
|
|
|
|
(at your option) any later version. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, |
306
|
|
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
307
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
308
|
|
|
|
|
|
|
GNU General Public License for more details. |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License |
311
|
|
|
|
|
|
|
along with this program; if not, write to: |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
The Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
315
|
|
|
|
|
|
|
Boston, MA 02111-1307 USA |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
319
|
|
|
|
|
|
|
####################################################################################################### |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
# Making the default return statement as 1; |
322
|
|
|
|
|
|
|
# Reference : http://lists.netisland.net/archives/phlpm/phlpm-2001/msg00426.html |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
1; |