line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::NSP::Measures; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# moved this version information to the top of the file to avoid |
4
|
|
|
|
|
|
|
# confusion with the documentation below, that includes code examples |
5
|
|
|
|
|
|
|
# that set versions, etc. |
6
|
|
|
|
|
|
|
|
7
|
28
|
|
|
28
|
|
50839
|
use Text::NSP; |
|
28
|
|
|
|
|
81
|
|
|
28
|
|
|
|
|
7187
|
|
8
|
28
|
|
|
28
|
|
165
|
use strict; |
|
28
|
|
|
|
|
64
|
|
|
28
|
|
|
|
|
1599
|
|
9
|
28
|
|
|
28
|
|
146
|
use Carp; |
|
28
|
|
|
|
|
205
|
|
|
28
|
|
|
|
|
1569
|
|
10
|
28
|
|
|
28
|
|
193
|
use warnings; |
|
28
|
|
|
|
|
51
|
|
|
28
|
|
|
|
|
19419
|
|
11
|
|
|
|
|
|
|
require Exporter; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
our ($VERSION, @ISA, @EXPORT, $errorCodeNumber, $errorMessage); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
@EXPORT = qw(initializeStatistic calculateStatistic |
18
|
|
|
|
|
|
|
getErrorCode getErrorMessage getStatisticName |
19
|
|
|
|
|
|
|
$errorCodeNumber $errorMessage); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
$VERSION = '0.97'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=cut |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 NAME |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
Text::NSP::Measures - Perl modules for computing association scores of |
28
|
|
|
|
|
|
|
Ngrams. This module provides the basic framework |
29
|
|
|
|
|
|
|
for these measures. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 SYNOPSIS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 Basic Usage |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
use Text::NSP::Measures::2D::MI::ll; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$ll_value = calculateStatistic( n11=>$n11, |
40
|
|
|
|
|
|
|
n1p=>$n1p, |
41
|
|
|
|
|
|
|
np1=>$np1, |
42
|
|
|
|
|
|
|
npp=>$npp); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
if( ($errorCode = getErrorCode())) |
45
|
|
|
|
|
|
|
{ |
46
|
|
|
|
|
|
|
print STDERR $errorCode." - ".getErrorMessage()."\n"; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
else |
49
|
|
|
|
|
|
|
{ |
50
|
|
|
|
|
|
|
print getStatisticName()." value for bigram is ".$ll_value."\n"; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 DESCRIPTION |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head2 Introduction |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
These modules provide perl implementations of mathematical functions |
60
|
|
|
|
|
|
|
(association measures) that can be used to interpret the co-occurrence |
61
|
|
|
|
|
|
|
frequency data for Ngrams. We define an Ngram as a sequence of 'n' |
62
|
|
|
|
|
|
|
tokens that occur within a window of at least 'n' tokens in the text; |
63
|
|
|
|
|
|
|
what constitutes a "token" can be defined by the user. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
The measures that have been implemented in this distribution are: |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=over |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=item 1) MI (Mutual Information based Measures) |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=over |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=item a) Loglikelihood (for Bigrams and Trigrams) |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=item b) True Mutual Information (for Bigrams and Trigrams) |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=item c) Pointwise Mutual Information (for Bigrams and Trigrams) |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=item d) Poisson Stirling Measure (for Bigrams and Trigrams) |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=back |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=item 2) CHI (Measures belonging to the CHI family) |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=over |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=item a) Chi-squared Measure |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item b) Phi Coefficient |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=item c) T-Score |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=back |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=item 3) Dice (Measures belonging to the Dice family) |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=over |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=item a) Dice Coefficient |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=item b) Jaccard Measure |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=back |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=item 4) Fishers Exact Tests |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=over |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=item a) Left Fishers Exact Test |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=item b) Right Fishers Exact Test |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=item c) Two-Tailed Fishers Exact Test |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=back |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=item 5) Odds Ratio |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=back |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Further discussion about these measures is in their respective |
122
|
|
|
|
|
|
|
documentation. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head2 Writing your own association measures |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
This module also provides a basic framework for building new measures |
127
|
|
|
|
|
|
|
of association for Ngrams. The new Measure should either inherit from |
128
|
|
|
|
|
|
|
Text::NSP::Measures::2D or Text::NSP::Measures::3D modules, depending |
129
|
|
|
|
|
|
|
on whether it is a bigram or a trigram measure. Both these modules |
130
|
|
|
|
|
|
|
implement methods that retrieve observed frequency counts, marginal |
131
|
|
|
|
|
|
|
totals, and also compute expected values. They also provide error |
132
|
|
|
|
|
|
|
checks for these counts. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
You can either write your new measure as a new module, or you can |
135
|
|
|
|
|
|
|
simply write a perl program. Here we will describe how to write a |
136
|
|
|
|
|
|
|
new measure as a Perl module. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=over 4 |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=item 1 |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
To create a new Perl module for the measure issue the following |
144
|
|
|
|
|
|
|
command (replace 'NewMeasure' with the name of your measure): |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=over |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
h2xs -AXc -n Text::NSP::Measures::2D::NewMeasure |
149
|
|
|
|
|
|
|
(for bigram measures) |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
or |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
h2xs -AXc -n Text::NSP::Measures::3D::NewMeasure |
154
|
|
|
|
|
|
|
(for trigram measures) |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=back |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
This will create a new folder namely... |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=over |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Text-NSP-Measures-2D-NewMeasure (for bigram) |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
or |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Text-NSP-Measures-3D-NewMeasure (for trigram) |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=back |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
This will create an empty framework for the new association measure. |
171
|
|
|
|
|
|
|
Once you are done completing the changes you will have to install the |
172
|
|
|
|
|
|
|
module before you can use it. |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
To make changes to the module open: |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=over |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Text-NSP-Measures-2D-NewMeasure/lib/Text/NSP/Measures/2D/NewMeasure/ |
179
|
|
|
|
|
|
|
NewMeasure.pm |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
or |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
Text-NSP-Measures-3D-NewMeasure/lib/Text/NSP/Measures/3D/NewMeasure/ |
184
|
|
|
|
|
|
|
NewMeasure.pm |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=back |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
in your favorite text editor, and do as follows. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=item 2 |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
Let us say you have named your module NewMeasure. The first line of |
193
|
|
|
|
|
|
|
the file should declare that it is a package. Thus the first line of |
194
|
|
|
|
|
|
|
the file NewMeasure.pm should be... |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=over |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
package Text::NSP::Measures::2D::NewMeasure; (for bigram measures) |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
or |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
package Text::NSP::Measures::3D::NewMeasure; (for trigram measures) |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=back |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
To inherit the functionality from the 2D or 3D module you need to |
207
|
|
|
|
|
|
|
include it in your NewMeasure.pm module. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
A small code snippet to ensure that it is included is as follows: |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=over |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=item 1 For Bigrams |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
use Text::NSP::Measures::2D::MI; |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=item 2 For Trigrams |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
use Text::NSP::Measures::3D::MI; |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=back |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
You also need to insert the following lines to make sure that the required |
224
|
|
|
|
|
|
|
functions are visible to the programs using your module. These lines are |
225
|
|
|
|
|
|
|
same for bigrams and trigrams. The "no warnings 'redefine';" statement is |
226
|
|
|
|
|
|
|
used to suppress perl warnings about method overriding. |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=over |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
use strict; |
231
|
|
|
|
|
|
|
use Carp; |
232
|
|
|
|
|
|
|
use warnings; |
233
|
|
|
|
|
|
|
no warnings 'redefine'; |
234
|
|
|
|
|
|
|
require Exporter; |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
our ($VERSION, @EXPORT, @ISA); |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
@EXPORT = qw(initializeStatistic calculateStatistic |
241
|
|
|
|
|
|
|
getErrorCode getErrorMessage getStatisticName); |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=back |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=item 3 |
246
|
|
|
|
|
|
|
You need to implement at least one method in your package |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=over |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item i) calculateStatistic() |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=back |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
This method is passed reference to a hash containing the |
255
|
|
|
|
|
|
|
frequency values for a Ngram as found in the input Ngram file. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
method calculateStatistic() is expected to return a (possibly |
258
|
|
|
|
|
|
|
floating point) value as the value of the statistical measure calculated |
259
|
|
|
|
|
|
|
using the frequency values passed to it. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
There exist three methods in the modules Text::NSP::Measures::2D and |
262
|
|
|
|
|
|
|
Text::NSP::Measures::3D in order to help calculate the ngram |
263
|
|
|
|
|
|
|
statistic. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=over |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=item 1. computeMarginalTotals($frequencies); |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=item 2. computeObservedValues($frequencies); |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=item 3. computeExpectedValues($frequencies); |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=back |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
These methods return the observed and expected values of the cells in |
276
|
|
|
|
|
|
|
the contingency table. A 2D contingency table looks like: |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|word2 | not-word2| |
279
|
|
|
|
|
|
|
-------------------- |
280
|
|
|
|
|
|
|
word1 | n11 | n12 | n1p |
281
|
|
|
|
|
|
|
not-word1 | n21 | n22 | n2p |
282
|
|
|
|
|
|
|
-------------------- |
283
|
|
|
|
|
|
|
np1 np2 npp |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
Here the marginal totals are np1, n1p, np2, n2p, the Observed values |
286
|
|
|
|
|
|
|
are n11, n12, n21, n22 and the expected values for the corresponding |
287
|
|
|
|
|
|
|
observed values are represented using m11, m12, m21, m22, here m11 |
288
|
|
|
|
|
|
|
represents the expected value for the cell (1,1), m12 for the cell |
289
|
|
|
|
|
|
|
(1,2) and so on. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
Before calling either computeObservedValues() or computeExpectedValues() |
292
|
|
|
|
|
|
|
you MUST call computeMarginalTotals(), since these methods require the |
293
|
|
|
|
|
|
|
marginal to be set. The computeMarginalTotals method computes the marginal |
294
|
|
|
|
|
|
|
totals in the contingency table based on the observed frequencies. It |
295
|
|
|
|
|
|
|
returns an undefined value in case of some error. In case of success it |
296
|
|
|
|
|
|
|
returns '1'. An example of usage for the computeMarginalTotals() method is |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=over |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
my %values = @_; |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
if(!(Text::NSP::Measures::2D::computeMarginalTotals(\%values)) ){ |
303
|
|
|
|
|
|
|
return; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=back |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
@_ is the parameters passed to calculateStatistic. After this call the |
309
|
|
|
|
|
|
|
marginal totals will be available in the following variables |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=over |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=item 1. For bigrams |
314
|
|
|
|
|
|
|
$npp , $n1p, $np1, $n2p, $np2 |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
=item 2. For trigrams |
317
|
|
|
|
|
|
|
$nppp, $n1pp, $np1p, $npp1, $n11p, $n1p1, $np11, $n2pp, |
318
|
|
|
|
|
|
|
$np2p, $npp2 |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=back |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
computeObservedValues() computes the observed values of an ngram. It can be |
323
|
|
|
|
|
|
|
called using the following code snippet. Please remember that you should call |
324
|
|
|
|
|
|
|
computeMarginalTotals() before calling computeObservedValues(). |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=over |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
if( !(Text::NSP::Measures::2D::computeObservedValues(\%values)) ) { |
329
|
|
|
|
|
|
|
return; |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=back |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
%values is the same hash that was initialized earlier for computeMarginalTotals. |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
If successful it returns 1 otherwise an undefined value is returned. The |
337
|
|
|
|
|
|
|
computed observed values will be available in the following variables: |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
=over |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=item 1. For bigrams |
342
|
|
|
|
|
|
|
$n11 , $n12, $n21, $n22 |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=item 2. For trigrams |
345
|
|
|
|
|
|
|
$n111, $n112, $n121, $n122, $n211, $n212, $n221, $n222, |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=back |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Similarly, computeExpectedValues() computes the expected values for each of |
350
|
|
|
|
|
|
|
the cells in the contingency table. You should call computeMarginalTotals() |
351
|
|
|
|
|
|
|
before calling computeExpectedValues(). The following code snippet |
352
|
|
|
|
|
|
|
demonstrates its usage. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=over |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
if( !(Text::NSP::Measures::2D::computeExpectedValues()) ) { |
357
|
|
|
|
|
|
|
return; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=back |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
If successful it returns 1 otherwise an undefined value is returned. The |
363
|
|
|
|
|
|
|
computed expected values will be available in the following variables: |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=over |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=item 1. For bigrams |
368
|
|
|
|
|
|
|
$m11 , $m12, $m21, $m22 |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
=item 2. For trigrams |
371
|
|
|
|
|
|
|
$m111, $m112, $m121, $m122, $m211, $m212, $m221, $m222, |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=back |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=item 4 |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
The last lines of a module should always return true, to achieve this |
378
|
|
|
|
|
|
|
make sure that the last two lines of the module are: |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
1; |
381
|
|
|
|
|
|
|
__END__ |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
Please see, that you can put in documentation after these lines. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=item 5 |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
There are four other methods that are not mandatory, but may be |
388
|
|
|
|
|
|
|
implemented. These are: |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
i) initializeStatistic() |
391
|
|
|
|
|
|
|
ii) getErrorCode |
392
|
|
|
|
|
|
|
iii) getErrorMessage |
393
|
|
|
|
|
|
|
iv) getStatisticName() |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
statistic.pl calls initializeStatistic before calling any |
396
|
|
|
|
|
|
|
other method, if there is no need for any specific initialization |
397
|
|
|
|
|
|
|
in the measure you need not define this method, and the |
398
|
|
|
|
|
|
|
initialization will be handled by the Text::NSP::Measures modules |
399
|
|
|
|
|
|
|
initializeStatistic() method. |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
The getErrorCode method is called immediately after every call to |
402
|
|
|
|
|
|
|
method calculateStatistic(). This method is used to return the |
403
|
|
|
|
|
|
|
errorCode, if any, in the previous operations. To view all the |
404
|
|
|
|
|
|
|
possible error codes and the corresponding error message please refer |
405
|
|
|
|
|
|
|
to the Text::NSP documentation (perldoc Text::NSP). You can create new |
406
|
|
|
|
|
|
|
error codes in your measure, if the existing error codes are not |
407
|
|
|
|
|
|
|
sufficient. |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
The Text::NSP::Measures module implements both getErrorCode() |
410
|
|
|
|
|
|
|
and getErrorMessage() methods and these implementations of the method |
411
|
|
|
|
|
|
|
will be invoked if the user does not define these methods. But if you |
412
|
|
|
|
|
|
|
want to add some other actions that need to be performed in case |
413
|
|
|
|
|
|
|
of an error you must override these methods by implementing them in |
414
|
|
|
|
|
|
|
your module. You can invoke the Text::NSP::Measures getErrorCode() |
415
|
|
|
|
|
|
|
methods from your measures getErrorCode() method. |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
An example of this is below: |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub getErrorCode |
420
|
|
|
|
|
|
|
{ |
421
|
|
|
|
|
|
|
my $code = Text::NSP::Measures::getErrorCode(); |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
#your code here |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
return $code; #(or any other value) |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
sub getErrorMessage |
429
|
|
|
|
|
|
|
{ |
430
|
|
|
|
|
|
|
my $message = Text::NSP::Measures::getErrorMessage(); |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
#your code here |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
return $message; #(or any other value) |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
The fourth method that may be implemented is getStatisticName(). |
438
|
|
|
|
|
|
|
If this method is implemented, it is expected to return a string |
439
|
|
|
|
|
|
|
containing the name of the statistic being implemented. This string |
440
|
|
|
|
|
|
|
is used in the formatted output of statistic.pl. If this method |
441
|
|
|
|
|
|
|
is not implemented, then the statistic name entered on the |
442
|
|
|
|
|
|
|
commandline is used in the formatted output. |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Note that all the methods described in this section are optional. |
445
|
|
|
|
|
|
|
So, if the user elects to not implement these methods, no harm will |
446
|
|
|
|
|
|
|
be done. |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
The user may implement other methods too, but since statistic.pl is |
449
|
|
|
|
|
|
|
not expecting anything besides the five methods above, doing so would |
450
|
|
|
|
|
|
|
have no effect on statistic.pl. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=item 6 |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
You will need to install your module before you can use it. You can do |
455
|
|
|
|
|
|
|
this by |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
Change to the base directory for the module, i.e. |
458
|
|
|
|
|
|
|
NewMeasure |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
Then issue the following commands: |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
perl Makefile.PL |
463
|
|
|
|
|
|
|
make |
464
|
|
|
|
|
|
|
make test |
465
|
|
|
|
|
|
|
make install |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
or |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
perl Makefile.PL PREFIX=<destination directory> |
470
|
|
|
|
|
|
|
make |
471
|
|
|
|
|
|
|
make test |
472
|
|
|
|
|
|
|
make install |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
If you get any errors in the installation process, please make sure |
476
|
|
|
|
|
|
|
that you have not made any syntactical error in your code and also |
477
|
|
|
|
|
|
|
make sure that you have already installed the Text-NSP package. |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=back |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=head2 An Example |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
To tie it all together here is an example of a measure that computes |
484
|
|
|
|
|
|
|
the sum of ngram frequency counts. |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=over |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
package Text::NSP::Measures::2D::sum; |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
use Text::NSP::Measures::2D; |
492
|
|
|
|
|
|
|
use strict; |
493
|
|
|
|
|
|
|
use Carp; |
494
|
|
|
|
|
|
|
use warnings; |
495
|
|
|
|
|
|
|
no warnings 'redefine'; |
496
|
|
|
|
|
|
|
require Exporter; |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
our ($VERSION, @EXPORT, @ISA); |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
@EXPORT = qw(initializeStatistic calculateStatistic |
503
|
|
|
|
|
|
|
getErrorCode getErrorMessage getStatisticName); |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
$VERSION = '0.01'; |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
sub calculateStatistic |
508
|
|
|
|
|
|
|
{ |
509
|
|
|
|
|
|
|
my %values = @_; |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# computes and returns the marginal totals from the frequency |
512
|
|
|
|
|
|
|
# combination values. returns undef if there is an error in |
513
|
|
|
|
|
|
|
# the computation or the values are inconsistent. |
514
|
|
|
|
|
|
|
if(!(Text::NSP::Measures::2D::computeMarginalTotals(\%values)) ){ |
515
|
|
|
|
|
|
|
return; |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# computes and returns the observed and marginal values from |
519
|
|
|
|
|
|
|
# the frequency combination values. returns 0 if there is an |
520
|
|
|
|
|
|
|
# error in the computation or the values are inconsistent. |
521
|
|
|
|
|
|
|
if( !(Text::NSP::Measures::2D::computeObservedValues(\%values)) ) { |
522
|
|
|
|
|
|
|
return; |
523
|
|
|
|
|
|
|
} |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# Now for the actual calculation of the association measure |
527
|
|
|
|
|
|
|
my $NewMeasure = 0; |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
$NewMeasure += $n11; |
530
|
|
|
|
|
|
|
$NewMeasure += $n12; |
531
|
|
|
|
|
|
|
$NewMeasure += $n21; |
532
|
|
|
|
|
|
|
$NewMeasure += $n22; |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
return ( $NewMeasure ); |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
sub getStatisticName |
538
|
|
|
|
|
|
|
{ |
539
|
|
|
|
|
|
|
return "Sum"; |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
1; |
543
|
|
|
|
|
|
|
__END__ |