line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Text::NSP::Measures::4D::MI::ll - Perl module that implements Loglikelihood |
4
|
|
|
|
|
|
|
measure of association for 4-grams. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 SYNOPSIS |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head3 Basic Usage |
9
|
|
|
|
|
|
|
use Text::NSP::Measures::4D::MI::ll; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
$ll_value = calculateStatistic( |
12
|
|
|
|
|
|
|
n1111=>8, |
13
|
|
|
|
|
|
|
n1ppp=>306, |
14
|
|
|
|
|
|
|
np1pp=>83, |
15
|
|
|
|
|
|
|
npp1p=>83, |
16
|
|
|
|
|
|
|
nppp1=>57, |
17
|
|
|
|
|
|
|
n11pp=>8, |
18
|
|
|
|
|
|
|
n1p1p=>8, |
19
|
|
|
|
|
|
|
n1pp1=>8, |
20
|
|
|
|
|
|
|
np11p=>83, |
21
|
|
|
|
|
|
|
np1p1=>56, |
22
|
|
|
|
|
|
|
npp11=>56, |
23
|
|
|
|
|
|
|
n111p=>8, |
24
|
|
|
|
|
|
|
n11p1=>8, |
25
|
|
|
|
|
|
|
n1p11=>8, |
26
|
|
|
|
|
|
|
np111=>56, |
27
|
|
|
|
|
|
|
npppp=>15180); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
if( ($errorCode = getErrorCode())) |
30
|
|
|
|
|
|
|
{ |
31
|
|
|
|
|
|
|
print STDERR $errorCode." - ".getErrorMessage()."\n"; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
else |
34
|
|
|
|
|
|
|
{ |
35
|
|
|
|
|
|
|
print getStatisticName()." value for 4-gram is ".$ll_value."\n"; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 DESCRIPTION |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
The log-likelihood ratio measures the deviation between the observed data |
41
|
|
|
|
|
|
|
and what would be expected if word1, word2, word3 and word4 were |
42
|
|
|
|
|
|
|
independent. The higher the score, the less evidence there is in favor of |
43
|
|
|
|
|
|
|
concluding that the words are independent. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
The expected values for the internal cells are calculated by taking the |
46
|
|
|
|
|
|
|
product of their associated marginals and dividing by the sample size, |
47
|
|
|
|
|
|
|
for example: |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
n1ppp * np1pp * npp1p * nppp1 |
50
|
|
|
|
|
|
|
m1111 = ------------------------------- |
51
|
|
|
|
|
|
|
npppp ^ 3 |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Then the deviation between observed and expected values for each internal |
54
|
|
|
|
|
|
|
cell is computed to arrive at the log-likelihood value. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Log-Likelihood = 2 * [n1111 * log ( n1111 / m1111 ) + n1112 * log ( n1112 / m1112 ) + |
57
|
|
|
|
|
|
|
n1121 * log ( n1121 / m1121 ) + n1122 * log ( n1122 / m1122 ) + |
58
|
|
|
|
|
|
|
n1211 * log ( n1211 / m1211 ) + n1212 * log ( n1212 / m1212 ) + |
59
|
|
|
|
|
|
|
n1221 * log ( n1221 / m1221 ) + n1222 * log ( n1222 / m1222 ) + |
60
|
|
|
|
|
|
|
n2111 * log ( n2111 / m2111 ) + n2112 * log ( n2112 / m2112 ) + |
61
|
|
|
|
|
|
|
n2121 * log ( n2121 / m2121 ) + n2122 * log ( n2122 / m2122 ) + |
62
|
|
|
|
|
|
|
n2211 * log ( n2211 / m2211 ) + n2212 * log ( n2212 / m2212 ) + |
63
|
|
|
|
|
|
|
n2221 * log ( n2221 / m2221 ) + n2222 * log ( n2222 / m2222 )]; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head2 Methods |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=over |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=cut |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
package Text::NSP::Measures::4D::MI::ll; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
1
|
|
|
1
|
|
3030
|
use Text::NSP::Measures::4D::MI; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
591
|
|
76
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
77
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
56
|
|
78
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
25
|
|
|
1
|
|
|
|
|
30
|
|
79
|
1
|
|
|
1
|
|
4
|
no warnings 'redefine'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2652
|
|
80
|
|
|
|
|
|
|
require Exporter; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
our ($VERSION, @EXPORT, @ISA); |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
@EXPORT = qw(initializeStatistic calculateStatistic |
87
|
|
|
|
|
|
|
getErrorCode getErrorMessage getStatisticName); |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
$VERSION = '0.97'; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=item calculateStatistic($count_values) - This method calculates |
92
|
|
|
|
|
|
|
the ll value |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
INPUT PARAMS : $count_values .. Hash (list of key => value pairs) containing |
95
|
|
|
|
|
|
|
the count values computed by the |
96
|
|
|
|
|
|
|
count.pl program. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
RETURN VALUES : $loglikelihood .. Loglikelihood value for this 4-gram. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=cut |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub calculateStatistic |
103
|
|
|
|
|
|
|
{ |
104
|
16
|
|
|
16
|
|
11150
|
my %values = @_; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# computes and sets the observed and expected values from |
107
|
|
|
|
|
|
|
# the frequency combination values. returns 0 if there is an |
108
|
|
|
|
|
|
|
# error in the computation or the values are inconsistent. |
109
|
16
|
100
|
|
|
|
64
|
if( !(Text::NSP::Measures::4D::MI::getValues(\%values)) ) { |
110
|
15
|
|
|
|
|
116
|
return; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# Now for the actual calculation of Loglikelihood! |
114
|
1
|
|
|
|
|
3
|
my $logLikelihood = 0; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# don't want ($nxy / $mxy) to be 0 or less! flag error if so! |
118
|
1
|
|
|
|
|
11
|
$logLikelihood += $n1111 * Text::NSP::Measures::4D::MI::computePMI ( $n1111, $m1111 ); |
119
|
1
|
|
|
|
|
4
|
$logLikelihood += $n1112 * Text::NSP::Measures::4D::MI::computePMI ( $n1112, $m1112 ); |
120
|
1
|
|
|
|
|
5
|
$logLikelihood += $n1121 * Text::NSP::Measures::4D::MI::computePMI ( $n1121, $m1121 ); |
121
|
1
|
|
|
|
|
4
|
$logLikelihood += $n1122 * Text::NSP::Measures::4D::MI::computePMI ( $n1122, $m1122 ); |
122
|
1
|
|
|
|
|
5
|
$logLikelihood += $n1211 * Text::NSP::Measures::4D::MI::computePMI ( $n1211, $m1211 ); |
123
|
1
|
|
|
|
|
4
|
$logLikelihood += $n1212 * Text::NSP::Measures::4D::MI::computePMI ( $n1212, $m1212 ); |
124
|
1
|
|
|
|
|
4
|
$logLikelihood += $n1221 * Text::NSP::Measures::4D::MI::computePMI ( $n1221, $m1221 ); |
125
|
1
|
|
|
|
|
4
|
$logLikelihood += $n1222 * Text::NSP::Measures::4D::MI::computePMI ( $n1222, $m1222 ); |
126
|
1
|
|
|
|
|
4
|
$logLikelihood += $n2111 * Text::NSP::Measures::4D::MI::computePMI ( $n2111, $m2111 ); |
127
|
1
|
|
|
|
|
16
|
$logLikelihood += $n2112 * Text::NSP::Measures::4D::MI::computePMI ( $n2112, $m2112 ); |
128
|
1
|
|
|
|
|
4
|
$logLikelihood += $n2121 * Text::NSP::Measures::4D::MI::computePMI ( $n2121, $m2121 ); |
129
|
1
|
|
|
|
|
4
|
$logLikelihood += $n2122 * Text::NSP::Measures::4D::MI::computePMI ( $n2122, $m2122 ); |
130
|
1
|
|
|
|
|
4
|
$logLikelihood += $n2211 * Text::NSP::Measures::4D::MI::computePMI ( $n2211, $m2211 ); |
131
|
1
|
|
|
|
|
4
|
$logLikelihood += $n2212 * Text::NSP::Measures::4D::MI::computePMI ( $n2212, $m2212 ); |
132
|
1
|
|
|
|
|
3
|
$logLikelihood += $n2221 * Text::NSP::Measures::4D::MI::computePMI ( $n2221, $m2221 ); |
133
|
1
|
|
|
|
|
5
|
$logLikelihood += $n2222 * Text::NSP::Measures::4D::MI::computePMI ( $n2222, $m2222 ); |
134
|
1
|
|
|
|
|
6
|
return ( 2 * $logLikelihood ); |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item getStatisticName() - Returns the name of this statistic |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
INPUT PARAMS : none |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
RETURN VALUES : $name .. Name of the measure. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=cut |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub getStatisticName |
147
|
|
|
|
|
|
|
{ |
148
|
0
|
|
|
0
|
|
|
return "Loglikelihood"; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
1; |
154
|
|
|
|
|
|
|
__END__ |