File Coverage

blib/lib/Text/NSP/Measures/2D.pm
Criterion Covered Total %
statement 76 91 83.5
branch 25 30 83.3
condition 2 3 66.6
subroutine 7 7 100.0
pod 3 3 100.0
total 113 134 84.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::2D - Perl module that provides a basic framework
4             for building measures of association for
5             bigrams.
6              
7             =head1 SYNOPSIS
8              
9             =head3 Basic Usage
10              
11             use Text::NSP::Measures::2D::MI::ll;
12              
13             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
14              
15             $ll_value = calculateStatistic( n11=>$n11,
16             n1p=>$n1p,
17             np1=>$np1,
18             npp=>$npp);
19              
20             if( ($errorCode = getErrorCode()))
21             {
22             print STDERR $errorCode." - ".getErrorMessage()."\n";
23             }
24             else
25             {
26             print getStatisticName()." value for bigram is ".$ll_value."\n";
27             }
28              
29              
30             =head1 DESCRIPTION
31              
32             This module is to be used as a foundation for building 2-dimensional
33             measures of association. The methods in this module retrieve observed
34             bigram frequency counts, marginal totals, and also compute expected
35             values. They also provide error checks for these counts.
36              
37             With bigram or 2d measures we use variables with corresponding names
38             to store the cells of the 2x2 contingency table, i.e. the frequency counts
39             associated with each word in the bigram, as well as the number of
40             times the bigram occurs. A contingency table looks like
41              
42                         |word2  | not-word2|
43                         --------------------
44                 word1   | n11   |   n12    |  n1p
45               not-word1 | n21   |   n22    |  n2p
46                         --------------------
47                           np1       np2       npp
48              
49             Marginal Frequencies:
50              
51             n1p = the number of bigrams where the first word is word1.
52             np1 = the number of bigrams where the second word is word2.
53             n2p = the number of bigrams where the first word is not word1.
54             np2 = the number of bigrams where the second word is not word2.
55              
56             These marginal totals are stored in variables which have names
57             corresponding to the cell they represent. These values may then be
58             referred to as follows:
59              
60             $n1p,
61             $np1,
62             $n2p,
63             $np2,
64             $npp
65              
66             Observed Frequencies:
67              
68             n11 = number of times the bigram occurs, joint frequency
69             n12 = number of times word1 occurs in the first position of a bigram
70             when word2 does not occur in the second position.
71             n21 = number of times word2 occurs in the second position of a
72             bigram when word1 does not occur in the first position.
73             n22 = number of bigrams where word1 is not in the first position and
74             word2 is not in the second position.
75              
76             The observed frequencies are also stored in variables with corresponding names.
77             These values may then be referred to as follows:
78              
79              
80             $n11,
81             $n12,
82             $n21,
83             $n22
84              
85             Expected Frequencies:
86              
87             m11 = expected number of times both words in the bigram occur
88             together if they are independent. (n1p*np1/npp)
89             m12 = expected number of times word1 in the bigram will occur in
90             the first position when word2 does not occur in the second
91             position given that the words are independent. (n1p*np2/npp)
92             m21 = expected number of times word2 in the bigram will occur
93             in the second position when word1 does not occur in the first
94             position given that the words are independent. (np1*n2p/npp)
95             m22 = expected number of times word1 will not occur in the first
96             position and word2 will not occur in the second position
97             given that the words are independent. (n2p*np2/npp)
98              
99             Similarly the expected values are stored as
100              
101             $m11,
102             $m12,
103             $m21,
104             $m22
105              
106             =head2 Methods
107              
108             =over
109              
110             =cut
111              
112              
113             package Text::NSP::Measures::2D;
114              
115              
116 19     19   12075 use Text::NSP::Measures;
  19         44  
  19         2427  
117 19     19   84 use strict;
  19         39  
  19         335  
118 19     19   86 use Carp;
  19         35  
  19         976  
119 19     19   85 use warnings;
  19         33  
  19         17063  
120             require Exporter;
121              
122             our ($VERSION, @ISA, @EXPORT);
123              
124             @ISA = qw(Exporter);
125              
126             our ($n11, $n12, $n21, $n22);   # observed cell frequencies; assigned in computeObservedValues
127             our ($m11, $m12, $m21, $m22);   # expected cell frequencies; assigned in computeExpectedValues
128             our ($npp, $n1p, $np1, $n2p, $np2);   # total and marginal counts; assigned in computeMarginalTotals
129             # $npp = -1; $n1p = -1; $np1 = -1;
130             # $n2p = -1; $np2 = -1;
131              
132              
133             @EXPORT = qw(initializeStatistic calculateStatistic
134             getErrorCode getErrorMessage getStatisticName
135             $errorCodeNumber $errorMessage
136             $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22
137             $npp $np1 $np2 $n2p $n1p);
138              
139             $VERSION = '0.97';
140              
141              
142             =item computeObservedValues() - A method to compute observed values,
143             and also to verify that the computed observed values are correct,
144             that is, they are non-negative and do not exceed the marginal totals
145             or the total bigram count.
146              
147              
148             INPUT PARAMS : $count_values .. Reference to a hash consisting
149             of the count values passed to
150             the calculateStatistic() method.
151              
152             RETURN VALUES : 1/undef ..returns '1' to indicate success
153             and an undefined(NULL) value to indicate
154             failure.
155             =cut
156              
157             sub computeObservedValues   # validates n11, then derives n12, n21 and n22; returns 1 on success, nothing on failure
158             {
159 197     197 1 321 my ($values) = @_;   # hash reference; only its n11 entry is read in this sub
160              
161 197 100       435 if(!defined $values->{n11})
162             {
163 14         38 $errorMessage = "Required frequency count (1,1) not passed";
164 14         39 $errorCodeNumber = 200;
165 14         76 return;
166             }
167             else
168             {
169 183         283 $n11 = $values->{n11};
170             }
171             # joint frequency (n11) must be greater than or equal to zero
172 183 100       421 if ($n11 < 0)
173             {
174 14         44 $errorMessage = "Frequency value 'n11' must not be negative.";
175 14         33 $errorCodeNumber = 201;
176 14         54 return;
177             }
178              
179             # joint frequency (n11) should be less than or equal to the
180             # total number of bigrams ($npp is read from the package variable, not from $values)
181 169 100       350 if($n11 > $npp)
182             {
183 14         42 $errorMessage = "Frequency value 'n11' must not exceed total number of bigrams.";
184 14         30 $errorCodeNumber = 202;
185 14         55 return;
186             }
187              
188             # joint frequency should be less than or equal to both marginal totals ($np1, $n1p are package variables); reuses error code 202
189 155 100 66     725 if ($n11 > $np1 || $n11 > $n1p)
190             {
191 14         46 $errorMessage = "Frequency value of ngram 'n11' must not exceed the marginal totals.";
192 14         32 $errorCodeNumber = 202;
193 14         70 return;
194             }
195              
196             # n11 is consistent with the marginal totals, so derive the remaining
197             # observed frequencies: n12 and n21 from n11, then n22 from np2 and n12
198 141         187 $n12 = $n1p - $n11;
199 141         213 $n21 = $np1 - $n11;
200 141         189 $n22 = $np2 - $n12;
201              
202 141 50       296 if ($n12 < 0)   # 0 hits in this run; the n11 > n1p case has already returned above
203             {
204 0         0 $errorMessage = "Frequency value 'n12' must not be negative.";
205 0         0 $errorCodeNumber = 201;
206 0         0 return;
207             }
208              
209 141 50       305 if ($n21 < 0)   # 0 hits in this run; the n11 > np1 case has already returned above
210             {
211 0         0 $errorMessage = "Frequency value 'n21' must not be negative.";
212 0         0 $errorCodeNumber = 201;
213 0         0 return;
214             }
215              
216 141 100       290 if ($n22 < 0)   # reachable: happens when n12 exceeds np2
217             {
218 14         36 $errorMessage = "Frequency value 'n22' must not be negative.";
219 14         100 $errorCodeNumber = 201;
220 14         56 return;
221             }
222              
223 127         411 return 1;
224             }
225              
226              
227              
228             =item computeExpectedValues() - A method to compute expected values.
229              
230              
231             INPUT PARAMS :none
232              
233             RETURN VALUES : 1/undef ..returns '1' to indicate success
234             and an undefined(NULL) value to indicate
235             failure.
236              
237             =cut
238              
239             sub computeExpectedValues
240             {
241             # expected cell value = (first-word marginal * second-word marginal) / npp, all read from package variables
242 83     83 1 170 $m11 = $n1p * $np1 / $npp;   # NOTE(review): $npp is not zero-checked here; computeMarginalTotals rejects npp <= 0 - confirm it always runs first
243 83         201 $m12 = $n1p * $np2 / $npp;
244 83         120 $m21 = $n2p * $np1 / $npp;
245 83         128 $m22 = $n2p * $np2 / $npp;
246              
247 83         281 return 1;   # there is no failure path in this sub
248             }
249              
250              
251              
252             =item computeMarginalTotals() - This method computes the marginal totals from the count values as
253             passed to it.
254              
255              
256             INPUT PARAMS : $count_values .. Reference to a hash consisting
257             of the frequency combination
258             output.
259              
260             RETURN VALUES : 1/undef ..returns '1' to indicate success
261             and an undefined(NULL) value to indicate
262             failure.
263              
264             =cut
265              
266             sub computeMarginalTotals   # validates npp, n1p and np1 from the hash, then derives np2 and n2p; returns 1 on success, nothing on failure
267             {
268              
269 267     267 1 1968 my ($values)=@_;   # hash reference; the npp, n1p and np1 entries are read
270              
271 267 100       891 if(!defined $values->{npp})
    100          
272             {
273 14         33 $errorMessage = "Total bigram count not passed";
274 14         32 $errorCodeNumber = 200;
275 14         58 return;
276             }
277             elsif($values->{npp}<=0)   # zero is rejected as well, although the message below only says "less than"
278             {
279 14         35 $errorMessage = "Total bigram count cannot be less than to zero";
280 14         65 $errorCodeNumber = 204;
281 14         57 return;
282             }
283             else
284             {
285 239         385 $npp = $values->{npp};
286             }
287              
288 239         378 $n1p=-1;   # reset first: $n1p is left at -1 if the "not passed" return below is taken
289 239 100       461 if(!defined $values->{n1p})
290             {
291 14         34 $errorMessage = "Required Marginal total (1,p) count not passed";
292 14         29 $errorCodeNumber = 200;
293 14         61 return;
294             }
295             else
296             {
297 225         327 $n1p=$values->{n1p};
298             }
299             # first-word marginal total (n1p) should be greater than or equal to zero
300 225 100       601 if ($n1p < 0)
301             {
302 14         41 $errorMessage = "Marginal total value 'n1p' must not be negative.";
303 14         35 $errorCodeNumber = 204;
304 14         62 return;
305             }
306             # first-word marginal total (n1p) should be less than or equal to the total
307             # number of bigrams (npp)
308 211 100       484 if ($n1p > $npp)
309             {
310 14         32 $errorMessage = "Marginal total value 'n1p' must not exceed total number of bigrams.";
311 14         125 $errorCodeNumber = 203;
312 14         54 return;
313             }
314              
315              
316 197         241 $np1 = -1;   # reset first: $np1 is left at -1 if the "not passed" return below is taken
317 197 50       378 if(!defined $values->{np1})   # none of the three np1 failure branches below was hit in this run
318             {
319 0         0 $errorMessage = "Required Marginal total (p,1) count not passed";
320 0         0 $errorCodeNumber = 200;
321 0         0 return;
322             }
323             else
324             {
325 197         303 $np1=$values->{np1};
326             }
327             # second-word marginal total (np1) should be greater than or equal to zero
328 197 50       447 if ($np1 < 0)
329             {
330 0         0 $errorMessage = "Marginal total value 'np1' must not be negative.";
331 0         0 $errorCodeNumber = 204;
332 0         0 return;
333             }
334             # second-word marginal total (np1) should be less than or equal to the total
335             # number of bigrams (npp)
336 197 50       442 if ($np1 > $npp)
337             {
338 0         0 $errorMessage = "Marginal total value 'np1' must not exceed total number of bigrams.";
339 0         0 $errorCodeNumber = 203;
340 0         0 return;
341             }
342              
343 197         275 $np2 = $npp - $np1;   # bigrams whose second word is not word2
344 197         245 $n2p = $npp - $n1p;   # bigrams whose first word is not word1
345              
346 197         659 return 1;
347             }
348              
349              
350              
351             1;
352             __END__