File Coverage

blib/lib/Text/NSP/Measures/2D/MI/ll.pm

Criterion	Covered	Total	%
statement	24	25	96.0
branch	2	2	100.0
condition			n/a
subroutine	6	7	85.7
pod			n/a
total	32	34	94.1

line	stmt	bran	sub	time	code
1					=head1 NAME
2
3					Text::NSP::Measures::2D::MI::ll - Perl module that implements the log-likelihood
4					measure of association for bigrams.
5
6					=head1 SYNOPSIS
7
8					=head3 Basic Usage
9
10					use Text::NSP::Measures::2D::MI::ll;
11
12					my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
13
14					$ll_value = calculateStatistic( n11=>$n11,
15					n1p=>$n1p,
16					np1=>$np1,
17					npp=>$npp);
18
19					if( ($errorCode = getErrorCode()))
20					{
21					print STDERR $errorCode." - ".getErrorMessage();
22					}
23					else
24					{
25					print getStatisticName()." value for bigram is ".$ll_value;
26					}
27
28					=head1 DESCRIPTION
29
30					The log-likelihood ratio measures the deviation between the observed data
31					and what would be expected if word1 and word2 were independent. The
32					higher the score, the less evidence there is in favor of concluding that
33					the words are independent.
34
35					Assume that the frequency count data associated with a bigram
36					(word1 word2) is as shown by the following 2x2 contingency table:
37
38					          word2   ~word2
39					 word1    n11      n12  | n1p
40					~word1    n21      n22  | n2p
41					          --------------
42					          np1      np2    npp
43
44					where n11 is the number of times word1 and word2 occur together, and
45					n12 is the number of times word1 occurs with some word other than
46					word2, and n1p is the number of times in total that word1 occurs as
47					the first word in a bigram.
48
49					The expected values for the internal cells are calculated by taking the
50					product of their associated marginals and dividing by the sample size,
51					for example:
52
53					       np1 * n1p
54					m11 = -----------
55					          npp
56
57					Then the deviation between observed and expected values for each internal
58					cell is computed to arrive at the log-likelihood value.
59
60					Log-Likelihood = 2 * [n11 * log(n11/m11) + n12 * log(n12/m12) +
61					n21 * log(n21/m21) + n22 * log(n22/m22)]
62
63					=head2 Methods
64
65					=over
66
67					=cut
68
69
70					package Text::NSP::Measures::2D::MI::ll;
71
72
73	2		2	5507	use Text::NSP::Measures::2D::MI;
	2			5
	2			679
74	2		2	13	use strict;
	2			4
	2			56
75	2		2	11	use Carp;
	2			6
	2			118
76	2		2	12	use warnings;
	2			4
	2			63
77	2		2	9	no warnings 'redefine';
	2			4
	2			610
78					require Exporter;
79
80					our ($VERSION, @EXPORT, @ISA);
81
82					@ISA = qw(Exporter);
83
84					@EXPORT = qw(initializeStatistic calculateStatistic
85					getErrorCode getErrorMessage getStatisticName);
86
87					$VERSION = '0.97';
88
89					=item calculateStatistic() - This method calculates the ll value
90
91					INPUT PARAMS : %values .. A hash (key => value list, e.g. n11, n1p,
92					np1, npp) containing the count values
93					computed by the count.pl program.
94
95					RETURN VALUES : $loglikelihood .. Loglikelihood value for this bigram.
96
97					=cut
98
99					sub calculateStatistic
100					{
101	31		31	6409	my %values = @_;  # named counts passed as a key/value list (the SYNOPSIS uses n11, n1p, np1, npp)
102
103					# getValues() computes and sets the observed and expected values from
104					# the frequency combination values. It returns a false value if there is
105					# an error in the computation or the values are inconsistent.
106	31	100		181	if( !Text::NSP::Measures::2D::MI::getValues(\%values) )
107					{
108	10			28	return;  # bare return: undef in scalar context, empty list in list context
109					}
110
111					# Now for the actual calculation of the log-likelihood: sum one term per contingency-table cell.
112	21			33	my $logLikelihood = 0;
113
114					# Don't want ($nxy / $mxy) to be 0 or less; computePMI() is presumably where that is checked and flagged -- TODO confirm.
115	21			59	$logLikelihood += $n11 * Text::NSP::Measures::2D::MI::computePMI( $n11, $m11 );  # NOTE(review): $nXY/$mXY are not declared in this sub; presumably set by getValues() -- confirm
116	21			123	$logLikelihood += $n12 * Text::NSP::Measures::2D::MI::computePMI( $n12, $m12 );
117	21			87	$logLikelihood += $n21 * Text::NSP::Measures::2D::MI::computePMI( $n21, $m21 );
118	21			53	$logLikelihood += $n22 * Text::NSP::Measures::2D::MI::computePMI( $n22, $m22 );
119
120	21			157	return ( 2 * $logLikelihood );  # twice the sum of the four cell terms
121					}
122
123
124					=item getStatisticName() - Returns the name of this statistic
125
126					INPUT PARAMS : none
127
128					RETURN VALUES : $name .. Name of the measure.
129
130					=cut
131
132					sub getStatisticName  # reads no arguments
133					{
134	0		0		return "Log-likelihood";  # constant display name of this measure
135					}
136
137
138
139					1;
140					__END__