line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Text::NSP::Measures::2D - Perl module that provides basic framework |
4
|
|
|
|
|
|
|
for building measure of association for |
5
|
|
|
|
|
|
|
bigrams. |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head3 Basic Usage |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Text::NSP::Measures::2D::MI::ll; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
$ll_value = calculateStatistic( n11=>$n11, |
16
|
|
|
|
|
|
|
n1p=>$n1p, |
17
|
|
|
|
|
|
|
np1=>$np1, |
18
|
|
|
|
|
|
|
npp=>$npp); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
if( ($errorCode = getErrorCode())) |
21
|
|
|
|
|
|
|
{ |
22
|
|
|
|
|
|
|
print STDERR $errorCode." - ".getErrorMessage()."\n"; |
23
|
|
|
|
|
|
|
} |
24
|
|
|
|
|
|
|
else |
25
|
|
|
|
|
|
|
{ |
26
|
|
|
|
|
|
|
print getStatisticName()." value for bigram is ".$ll_value."\n"; |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 DESCRIPTION |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
This module is to be used as a foundation for building 2-dimensional |
33
|
|
|
|
|
|
|
measures of association. The methods in this module retrieve observed |
34
|
|
|
|
|
|
|
bigram frequency counts, marginal totals, and also compute expected |
35
|
|
|
|
|
|
|
values. They also provide error checks for these counts. |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
With bigram or 2d measures we use variables with corresponding names |
38
|
|
|
|
|
|
|
to store the 2x2 contingency table, which holds the frequency counts |
39
|
|
|
|
|
|
|
associated with each word in the bigram, as well as the number of |
40
|
|
|
|
|
|
|
times the bigram occurs. A contingency table looks like |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|word2 | not-word2| |
43
|
|
|
|
|
|
|
-------------------- |
44
|
|
|
|
|
|
|
word1 | n11 | n12 | n1p |
45
|
|
|
|
|
|
|
not-word1 | n21 | n22 | n2p |
46
|
|
|
|
|
|
|
-------------------- |
47
|
|
|
|
|
|
|
np1 np2 npp |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
Marginal Frequencies: |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
n1p = the number of bigrams where the first word is word1. |
52
|
|
|
|
|
|
|
np1 = the number of bigrams where the second word is word2. |
53
|
|
|
|
|
|
|
n2p = the number of bigrams where the first word is not word1. |
54
|
|
|
|
|
|
|
np2 = the number of bigrams where the second word is not word2. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
These marginal totals are stored in variables which have names |
57
|
|
|
|
|
|
|
corresponding to the cell they represent. These values may then be |
58
|
|
|
|
|
|
|
referred to as follows: |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
$n1p, |
61
|
|
|
|
|
|
|
$np1, |
62
|
|
|
|
|
|
|
$n2p, |
63
|
|
|
|
|
|
|
$np2, |
64
|
|
|
|
|
|
|
$npp |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Observed Frequencies: |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
n11 = number of times the bigram occurs, joint frequency |
69
|
|
|
|
|
|
|
n12 = number of times word1 occurs in the first position of a bigram |
70
|
|
|
|
|
|
|
when word2 does not occur in the second position. |
71
|
|
|
|
|
|
|
n21 = number of times word2 occurs in the second position of a |
72
|
|
|
|
|
|
|
bigram when word1 does not occur in the first position. |
73
|
|
|
|
|
|
|
n22 = number of bigrams where word1 is not in the first position and |
74
|
|
|
|
|
|
|
word2 is not in the second position. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
The observed frequencies are also stored in variables with corresponding names. |
77
|
|
|
|
|
|
|
These values may then be referred to as follows: |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
$n11, |
81
|
|
|
|
|
|
|
$n12, |
82
|
|
|
|
|
|
|
$n21, |
83
|
|
|
|
|
|
|
$n22 |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
Expected Frequencies: |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
m11 = expected number of times both words in the bigram occur |
88
|
|
|
|
|
|
|
together if they are independent. (n1p*np1/npp) |
89
|
|
|
|
|
|
|
m12 = expected number of times word1 in the bigram will occur in |
90
|
|
|
|
|
|
|
the first position when word2 does not occur in the second |
91
|
|
|
|
|
|
|
position given that the words are independent. (n1p*np2/npp) |
92
|
|
|
|
|
|
|
m21 = expected number of times word2 in the bigram will occur |
93
|
|
|
|
|
|
|
in the second position when word1 does not occur in the first |
94
|
|
|
|
|
|
|
position given that the words are independent. (np1*n2p/npp) |
95
|
|
|
|
|
|
|
m22 = expected number of times word1 will not occur in the first |
96
|
|
|
|
|
|
|
position and word2 will not occur in the second position |
97
|
|
|
|
|
|
|
given that the words are independent. (n2p*np2/npp) |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Similarly the expected values are stored as |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
$m11, |
102
|
|
|
|
|
|
|
$m12, |
103
|
|
|
|
|
|
|
$m21, |
104
|
|
|
|
|
|
|
$m22 |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head2 Methods |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=over |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=cut |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
package Text::NSP::Measures::2D; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
|
116
|
19
|
|
|
19
|
|
12075
|
use Text::NSP::Measures; |
|
19
|
|
|
|
|
44
|
|
|
19
|
|
|
|
|
2427
|
|
117
|
19
|
|
|
19
|
|
84
|
use strict; |
|
19
|
|
|
|
|
39
|
|
|
19
|
|
|
|
|
335
|
|
118
|
19
|
|
|
19
|
|
86
|
use Carp; |
|
19
|
|
|
|
|
35
|
|
|
19
|
|
|
|
|
976
|
|
119
|
19
|
|
|
19
|
|
85
|
use warnings; |
|
19
|
|
|
|
|
33
|
|
|
19
|
|
|
|
|
17063
|
|
120
|
|
|
|
|
|
|
require Exporter; |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# Module version and inheritance from Exporter (loaded by the
# 'require Exporter' statement earlier in the file).
our $VERSION = '0.97';
our @ISA     = ('Exporter');

# Observed frequencies of the 2x2 contingency table.
our ($n11, $n12, $n21, $n22);

# Expected frequencies of the 2x2 contingency table.
our ($m11, $m12, $m21, $m22);

# Total bigram count ($npp) and the marginal totals.
our ($npp, $n1p, $np1, $n2p, $np2);

# Everything below is exported by default.  $errorCodeNumber and
# $errorMessage are not declared in this file; presumably they are
# provided by Text::NSP::Measures -- verify against that module.
our @EXPORT = (
  qw(initializeStatistic calculateStatistic),
  qw(getErrorCode getErrorMessage getStatisticName),
  qw($errorCodeNumber $errorMessage),
  qw($n11 $n12 $n21 $n22),
  qw($m11 $m12 $m21 $m22),
  qw($npp $np1 $np2 $n2p $n1p),
);
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=item computeObservedValues() - A method to compute observed values, |
143
|
|
|
|
|
|
|
and also to verify that the computed Observed values are correct, |
144
|
|
|
|
|
|
|
that is, they are non-negative and do not exceed the marginal totals or the |
145
|
|
|
|
|
|
|
total bigram count. |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
INPUT PARAMS : $count_values .. Reference to an hash consisting |
149
|
|
|
|
|
|
|
of the count values passed to |
150
|
|
|
|
|
|
|
the calculateStatistic() method. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
RETURN VALUES : 1/undef ..returns '1' to indicate success |
153
|
|
|
|
|
|
|
and an undefined(NULL) value to indicate |
154
|
|
|
|
|
|
|
failure. |
155
|
|
|
|
|
|
|
=cut |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub computeObservedValues
{
  # Takes a hash reference holding the joint frequency under key 'n11',
  # stores it in $n11 and derives $n12, $n21 and $n22 from the package
  # variables $n1p, $np1 and $np2.  Also compares $n11 against $npp.
  #
  # Returns 1 on success.  On failure sets $errorMessage and
  # $errorCodeNumber and returns nothing:
  #   200 - n11 was not passed
  #   201 - n11, n12, n21 or n22 is negative
  #   202 - n11 exceeds the total bigram count or a marginal total
  my ($counts) = @_;

  # Records the error details and yields the same empty return value the
  # caller receives on failure.
  my $fail = sub {
    my ($code, $message) = @_;
    $errorMessage    = $message;
    $errorCodeNumber = $code;
    return;
  };

  return $fail->(200, "Required frequency count (1,1) not passed")
    unless defined $counts->{n11};

  $n11 = $counts->{n11};

  # The joint frequency may not be negative ...
  return $fail->(201, "Frequency value 'n11' must not be negative.")
    if $n11 < 0;

  # ... nor larger than the total number of bigrams ...
  return $fail->(202, "Frequency value 'n11' must not exceed total number of bigrams.")
    if $n11 > $npp;

  # ... nor larger than either marginal total.
  for my $marginal ($np1, $n1p)
  {
    return $fail->(202, "Frequency value of ngram 'n11' must not exceed the marginal totals.")
      if $n11 > $marginal;
  }

  # Fill in the remaining cells of the contingency table.
  $n12 = $n1p - $n11;
  $n21 = $np1 - $n11;
  $n22 = $np2 - $n12;

  # Each derived cell must be non-negative; they are checked in the
  # order n12, n21, n22 and the first offender is reported.
  for my $cell ([ n12 => $n12 ], [ n21 => $n21 ], [ n22 => $n22 ])
  {
    my ($label, $count) = @$cell;
    return $fail->(201, "Frequency value '$label' must not be negative.")
      if $count < 0;
  }

  return 1;
}
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=item computeExpectedValues() - A method to compute expected values. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
INPUT PARAMS :none |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
RETURN VALUES : 1/undef ..returns '1' to indicate success |
234
|
|
|
|
|
|
|
and an undefined(NULL) value to indicate |
235
|
|
|
|
|
|
|
failure. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=cut |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
sub computeExpectedValues
{
  # Fills $m11, $m12, $m21 and $m22 with (row total * column total) / $npp,
  # using the package variables $n1p, $n2p (rows) and $np1, $np2 (columns).
  # Takes no arguments and always returns 1.

  # First row of the table: word1 in the first position.
  ($m11, $m12) = map { $n1p * $_ / $npp } $np1, $np2;

  # Second row of the table: word1 not in the first position.
  ($m21, $m22) = map { $n2p * $_ / $npp } $np1, $np2;

  return 1;
}
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item computeMarginalTotals() - This method computes the marginal totals from the count values as |
253
|
|
|
|
|
|
|
passed to it. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
INPUT PARAMS : $count_values .. Reference to an hash consisting |
257
|
|
|
|
|
|
|
of the frequency combination |
258
|
|
|
|
|
|
|
output. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
RETURN VALUES : 1/undef ..returns '1' to indicate success |
261
|
|
|
|
|
|
|
and an undefined(NULL) value to indicate |
262
|
|
|
|
|
|
|
failure. |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=cut |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub computeMarginalTotals
{
  # Validates the total bigram count (key 'npp') and the two marginal
  # totals (keys 'n1p' and 'np1') in the hash referenced by the first
  # argument, stores them in $npp, $n1p and $np1, and derives
  # $np2 = $npp - $np1 and $n2p = $npp - $n1p.
  #
  # Returns 1 on success.  On failure sets $errorMessage and
  # $errorCodeNumber and returns nothing:
  #   200 - npp, n1p or np1 was not passed
  #   203 - n1p or np1 exceeds npp
  #   204 - npp is zero or negative, or n1p / np1 is negative
  my ($values) = @_;

  if (!defined $values->{npp})
  {
    $errorMessage = "Total bigram count not passed";
    $errorCodeNumber = 200;
    return;
  }
  elsif ($values->{npp} <= 0)
  {
    # Zero is rejected as well as negative counts, so the message says so.
    $errorMessage = "Total bigram count cannot be less than or equal to zero";
    $errorCodeNumber = 204;
    return;
  }
  else
  {
    $npp = $values->{npp};
  }

  # $n1p is reset to -1 first, so it holds -1 if 'n1p' was not passed.
  $n1p = -1;
  if (!defined $values->{n1p})
  {
    $errorMessage = "Required Marginal total (1,p) count not passed";
    $errorCodeNumber = 200;
    return;
  }
  else
  {
    $n1p = $values->{n1p};
  }

  # n1p (bigrams whose first word is word1) must not be negative
  if ($n1p < 0)
  {
    $errorMessage = "Marginal total value 'n1p' must not be negative.";
    $errorCodeNumber = 204;
    return;
  }

  # n1p must not exceed the total number of bigrams (npp)
  if ($n1p > $npp)
  {
    $errorMessage = "Marginal total value 'n1p' must not exceed total number of bigrams.";
    $errorCodeNumber = 203;
    return;
  }

  # $np1 is reset to -1 first, so it holds -1 if 'np1' was not passed.
  $np1 = -1;
  if (!defined $values->{np1})
  {
    $errorMessage = "Required Marginal total (p,1) count not passed";
    $errorCodeNumber = 200;
    return;
  }
  else
  {
    $np1 = $values->{np1};
  }

  # np1 (bigrams whose second word is word2) must not be negative
  if ($np1 < 0)
  {
    $errorMessage = "Marginal total value 'np1' must not be negative.";
    $errorCodeNumber = 204;
    return;
  }

  # np1 must not exceed the total number of bigrams (npp)
  if ($np1 > $npp)
  {
    $errorMessage = "Marginal total value 'np1' must not exceed total number of bigrams.";
    $errorCodeNumber = 203;
    return;
  }

  # The complementary marginals follow from the total.
  $np2 = $npp - $np1;
  $n2p = $npp - $n1p;

  return 1;
}
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
1; |
352
|
|
|
|
|
|
|
__END__ |