File Coverage

blib/lib/Text/NSP/Measures/2D/Fisher.pm

Criterion	Covered	Total	%
statement	92	96	95.8
branch	13	14	92.8
condition			n/a
subroutine	7	7	100.0
pod	2	3	66.6
total	114	120	95.0

line	stmt	bran	sub	pod	time	code
1						=head1 NAME
2
3						Text::NSP::Measures::2D::Fisher - Perl module that provides methods
4						to compute the Fishers exact tests.
5
6						=head1 SYNOPSIS
7
8						=head3 Basic Usage
9
10						use Text::NSP::Measures::2D::Fisher::left;
11
12						my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
13
14						$left_value = calculateStatistic( n11=>$n11,
15						n1p=>$n1p,
16						np1=>$np1,
17						npp=>$npp);
18
19						if( ($errorCode = getErrorCode()))
20						{
21						print STDERR $errorCode." - ".getErrorMessage();
22						}
23						else
24						{
25						print getStatisticName."value for bigram is ".$left_value;
26						}
27
28
29						=head1 DESCRIPTION
30
31						Assume that the frequency count data associated with a bigram
32						is stored in a 2x2 contingency table:
33
34						word2 ~word2
35						word1 n11 n12 \| n1p
36						~word1 n21 n22 \| n2p
37						--------------
38						np1 np2 npp
39
40						where n11 is the number of times occur together, and
41						n12 is the number of times occurs with some word other than
42						word2, and n1p is the number of times in total that word1 occurs as
43						the first word in a bigram.
44
45						The fishers exact tests are calculated by fixing the marginal totals
46						and computing the hypergeometric probabilities for all the possible
47						contingency tables,
48
49						A left sided test is calculated by adding the probabilities of all
50						the possible two by two contingency tables formed by fixing the
51						marginal totals and changing the value of n11 to less than the given
52						value. A left sided Fisher's Exact Test tells us how likely it is to
53						randomly sample a table where n11 is less than observed. In other words,
54						it tells us how likely it is to sample an observation where the two words
55						are less dependent than currently observed.
56
57						A right sided test is calculated by adding the probabilities of all
58						the possible two by two contingency tables formed by fixing the
59						marginal totals and changing the value of n11 to greater than or
60						equal to the given value. A right sided Fisher's Exact Test tells us
61						how likely it is to randomly sample a table where n11 is greater
62						than observed. In other words, it tells us how likely it is to sample
63						an observation where the two words are more dependent than currently
64						observed.
65
66						A two-tailed fishers test is calculated by adding the probabilities of
67						all the contingency tables with probabilities less than the probability
68						of the observed table. The two-tailed fishers test tells us how likely
69						it would be to observe an contingency table which is less probable than
70						the current table.
71
72						=head2 Methods
73
74						=over
75
76						=cut
77
78
79						package Text::NSP::Measures::2D::Fisher;
80
81
82	4		4		2981	use Text::NSP::Measures::2D;
	4				10
	4				775
83	4		4		18	use strict;
	4				6
	4				76
84	4		4		15	use Carp;
	4				7
	4				172
85	4		4		25	use warnings;
	4				14
	4				3133
86						# use subs(calculateStatistic);
87						require Exporter;
88
89						our ($VERSION, @EXPORT, @ISA);
90
91						@ISA = qw(Exporter);
92
93						@EXPORT = qw(initializeStatistic calculateStatistic
94						getErrorCode getErrorMessage getStatisticName
95						$n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22
96						$npp $np1 $np2 $n2p $n1p $errorCodeNumber
97						$errorMessage);
98
99						$VERSION = '0.97';
100
101
102						=item getValues() -This method calls the
103						computeObservedValues() and the computeExpectedValues() methods to
104						compute the observed and marginal total values. It checks these values
105						for any errors that might cause the Fishers Exact test measures to
106						fail.
107
108						INPUT PARAMS : $count_values .. Reference of an array containing
109						the count values computed by the
110						count.pl program.
111
112						RETURN VALUES : 1/undef ..returns '1' to indicate success
113						and an undefined(NULL) value to indicate
114						failure.
115
116						=cut
117
118						sub getValues
119						{
120	36		36	1	42	my $values = shift;
121
122						# computes and returns the marginal totals from the frequency
123						# combination values. returns undef if there is an error in
124						# the computation or the values are inconsistent.
125	36	100			88	if(!(Text::NSP::Measures::2D::computeMarginalTotals($values)) ){
126	15				43	return;
127						}
128
129						# computes and returns the observed and marginal values from
130						# the frequency combination values. returns 0 if there is an
131						# error in the computation or the values are inconsistent.
132	21	100			53	if( !(Text::NSP::Measures::2D::computeObservedValues($values)) ) {
133	15				41	return;
134						}
135
136	6				20	return 1;
137						}
138
139
140						=item computeDistribution() - This method calculates the probabilities
141						for all the possible tables
142
143						INPUT PARAMS : $n11_start .. the value for the cell 1,1 in the first contingency
144						table
145						$final_limit .. the value of cell 1,1 in the last contingency table
146						for which we have to compute the probability.
147
148						RETURN VALUES : $probability .. Reference to a hash containing hypergeometric
149						probabilities for all the possible contingency
150						tables
151
152						=cut
153
154						sub computeDistribution
155						{
156	6		6	1	8	my $n11_start = shift @_;
157	6				12	my $final_limit = shift @_;
158
159						# first sort the numerator array in the descending order.
160	6				70	my @numerator = sort { $b <=> $a } ($n1p, $np1, $n2p, $np2);
	24				45
161
162						# initialize the hash to store the probability distribution values.
163	6				9	my %probability = ();
164
165						# declare some temporary variables for use in loops and computing the values.
166	6				13	my $i;
167	6				10	my $j=0;
168
169						# initialize the product variable to be used in the probability computation.
170	6				7	my $product = 0;
171
172						# set the values for the first contingency table.
173	6				10	$n11 = $n11_start;
174	6				11	$n12 = $n1p-$n11;
175	6				9	$n21 = $np1-$n11;
176	6				10	$n22 = $n2p - $n21;
177
178	6				17	while($n22 < 0)
179						{
180	0				0	$n11++;
181	0				0	$n12 = $n1p - $n11;
182	0				0	$n21 = $np1 - $n11;
183	0				0	$n22 = $n2p - $n21;
184						}
185
186						# declare the denominator array.
187	6				11	my @denominator = ();
188
189	6				9	$product = 0;
190
191	6				8	my $prob = 0;
192
193	6				8	$i = $n11;
194	6				10	$n12 = $n1p - $i;
195	6				8	$n21 = $np1 - $i;
196	6				8	$n22 = $n2p - $n21;
197
198						# initialize the denominator array with values sorted in the descending order.
199	6				17	@denominator = sort { $b <=> $a } ($npp, $n22, $n12, $n21, $i);
	51				72
200
201						#decalare other variables for use in computation.
202	6				17	my @dLimits = ();
203	6				8	my @nLimits = ();
204	6				9	my $dIndex = 0;
205	6				8	my $nIndex = 0;
206
207						# set the dLimits and nLimits arrays to be used in the cancellation of factorials
208						# and to be used in the computation of factorial.
209						# the dLimits and the nLimits allow us to cancel out factorials in the numerator
210						# and the denominator. for example:
211						# 6! 12345*6
212						# --- = --------------- = 5*6
213						# 4! 123*4
214						#
215						# we achieve this by defining a range within which all the
216						# nos must be multiplied. So every pair of entries in the nLimits array defines a range
217						# so for the above case the entries would be:
218						# 5,6
219						#
220	6				19	for ( $j = 0; $j < 4; $j++ )
221						{
222	24	100			77	if ( $numerator[$j] > $denominator[$j] )
		100
223						{
224	6				11	$nLimits[$nIndex] = $denominator[$j] + 1;
225	6				31	$nLimits[$nIndex+1] = $numerator[$j];
226	6				17	$nIndex += 2;
227						}
228						elsif ( $denominator[$j] > $numerator[$j] )
229						{
230	6				13	$dLimits[$dIndex] = $numerator[$j] + 1;
231	6				12	$dLimits[$dIndex+1] = $denominator[$j];
232	6				16	$dIndex += 2;
233						}
234						}
235	6				13	$dLimits[$dIndex] = 1;
236	6				12	$dLimits[$dIndex+1] = $denominator[4];
237
238						# since, all the variables have been initialized, we start the computations.
239	6				27	$product = computeHyperGeometric(\@dLimits, \@nLimits);
240	6				18	$probability{$i} = $product;
241	6				10	$prob = $probability{$i};
242
243						# to reduce the no. of computations and the make the measure more efficient
244						# we use the previous tables probabilities to compute the new tables probabilities
245						# we can do this because the counts in the table will change by only a factor of 1
246						# thus instead of repeating all those multiplications we have to perform only
247						# 4 multiplications.
248	6				8	my $subproduct = 0;
249
250	6				17	for ($i = $n11+1; $i <= $final_limit; $i++ )
251						{
252	49				61	$subproduct += log $n12;
253	49				55	$n22++;
254	49				55	$subproduct -= log $n22;
255	49				59	$subproduct += log $n21;
256	49				44	$n12--;
257	49				46	$n21--;
258	49				58	$subproduct -= log $i;
259	49				98	$probability{$i} = $product+$subproduct;
260	49	50			101	if($probability{$i} != 0)
261						{
262	49				48	$product = $product+$subproduct;
263	49				102	$subproduct=0;
264						}
265						}
266
267
268	6				30	return (\%probability);
269						}
270
271
272
273						sub computeHyperGeometric
274						{
275	6		6	0	10	my $dLimits = shift @_;
276	6				10	my $nLimits = shift @_;
277	6				9	my $product = 0;
278
279						# compute the probability now, since all the variables have been initialized.
280	6				21	while ( defined ( $nLimits->[0] ) )
281						{
282	6				23	while ( defined ( $nLimits->[0] ) )
283						{
284	90				128	$product += log $nLimits->[0];
285	90				87	$nLimits->[0]++;
286	90	100			224	if ( $nLimits->[0] > $nLimits->[1] )
287						{
288	6				7	shift @{$nLimits};
	6				49
289	6				9	shift @{$nLimits};
	6				19
290						}
291						}
292	6				14	while ( defined ( $dLimits->[0] ) )
293						{
294	96				138	$product -= log $dLimits->[0];
295	96				96	$dLimits->[0]++;
296	96	100			223	if ( $dLimits->[0] > $dLimits->[1] )
297						{
298	12				43	shift @{$dLimits};
	12				49
299	12				18	shift @{$dLimits};
	12				36
300						}
301						}
302						}
303	6				13	return $product;
304						}
305
306
307						1;
308						__END__