File Coverage

blib/lib/Text/NSP/Measures.pm
Criterion Covered Total %
statement 19 25 76.0
branch n/a
condition n/a
subroutine 6 9 66.6
pod 5 5 100.0
total 30 39 76.9


line stmt bran cond sub pod time code
1             package Text::NSP::Measures;
2              
3             # moved this version information to the top of the file to avoid
4             # confusion with the documentation below, that includes code examples
5             # that set versions, etc.
6              
7 28     28   15035 use Text::NSP;
  28         69  
  28         1171  
8 28     28   151 use strict;
  28         53  
  28         490  
9 28     28   129 use Carp;
  28         47  
  28         1248  
10 28     28   128 use warnings;
  28         48  
  28         10924  
11             require Exporter;
12              
13             our ($VERSION, @ISA, @EXPORT, $errorCodeNumber, $errorMessage);
14              
15             @ISA = qw(Exporter);
16              
17             @EXPORT = qw(initializeStatistic calculateStatistic
18             getErrorCode getErrorMessage getStatisticName
19             $errorCodeNumber $errorMessage);
20              
21             $VERSION = '0.97';
22              
23             =cut
24              
25             =head1 NAME
26              
27             Text::NSP::Measures - Perl modules for computing association scores of
28             Ngrams. This module provides the basic framework
29             for these measures.
30              
31             =head1 SYNOPSIS
32              
33             =head2 Basic Usage
34              
35             use Text::NSP::Measures::2D::MI::ll;
36              
37             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
38              
39             $ll_value = calculateStatistic( n11=>$n11,
40             n1p=>$n1p,
41             np1=>$np1,
42             npp=>$npp);
43              
44             if( ($errorCode = getErrorCode()))
45             {
46             print STDERR $errorCode." - ".getErrorMessage()."\n";
47             }
48             else
49             {
50             print getStatisticName()." value for bigram is ".$ll_value."\n";
51             }
52              
53             =head1 DESCRIPTION
54              
55              
56              
57             =head2 Introduction
58              
59             These modules provide perl implementations of mathematical functions
60             (association measures) that can be used to interpret the co-occurrence
61             frequency data for Ngrams. We define an Ngram as a sequence of 'n'
62             tokens that occur within a window of at least 'n' tokens in the text;
63             what constitutes a "token" can be defined by the user.
64              
65             The measures that have been implemented in this distribution are:
66              
67             =over
68              
69             =item 1) MI (Mutual Information based Measures)
70              
71             =over
72              
73             =item a) Loglikelihood (for Bigrams and Trigrams)
74              
75             =item b) True Mutual Information (for Bigrams and Trigrams)
76              
77             =item c) Pointwise Mutual Information (for Bigrams and Trigrams)
78              
79             =item d) Poisson Stirling Measure (for Bigrams and Trigrams)
80              
81             =back
82              
83             =item 2) CHI (Measures belonging to the CHI family)
84              
85             =over
86              
87             =item a) Chi-squared Measure
88              
89             =item b) Phi Coefficient
90              
91             =item c) T-Score
92              
93             =back
94              
95             =item 3) Dice (Measures belonging to the Dice family)
96              
97             =over
98              
99             =item a) Dice Coefficient
100              
101             =item b) Jaccard Measure
102              
103             =back
104              
105             =item 4) Fishers Exact Tests
106              
107             =over
108              
109             =item a) Left Fishers Exact Test
110              
111             =item b) Right Fishers Exact Test
112              
113             =item c) Two-Tailed Fishers Exact Test
114              
115             =back
116              
117             =item 5) Odds Ratio
118              
119             =back
120              
121             Further discussion about these measures is in their respective
122             documentations.
123              
124             =head2 Writing your own association measures
125              
126             This module also provides a basic framework for building new measures
127             of association for Ngrams. The new Measure should either inherit from
128             Text::NSP::Measures::2D or Text::NSP::Measures::3D modules, depending
129             on whether it is a bigram or a trigram measure. Both these modules
130             implement methods that retrieve observed frequency counts, marginal
131             totals, and also compute expected values. They also provide error
132             checks for these counts.
133              
134             You can either write your new measure as a new module, or you can
135             simply write a perl program. Here we will describe how to write a
136             new measure as a Perl module.
137              
138             =over 4
139              
140             =item 1
141              
142              
143             To create a new Perl module for the measure issue the following
144             command (replace 'NewMeasure' with the name of your measure):
145              
146             =over
147              
148             h2xs -AXc -n Text::NSP::Measures::2D::NewMeasure
149             (for bigram measures)
150              
151             or
152              
153             h2xs -AXc -n Text::NSP::Measures::3D::NewMeasure
154             (for trigram measures)
155              
156             =back
157              
158             This will create a new folder namely...
159              
160             =over
161              
162             Text-NSP-Measures-2D-NewMeasure (for bigram)
163              
164             or
165              
166             Text-NSP-Measures-3D-NewMeasure (for trigram)
167              
168             =back
169              
170             This will create an empty framework for the new association measure.
171             Once you are done completing the changes you will have to install the
172             module before you can use it.
173              
174             To make changes to the module open:
175              
176             =over
177              
178             Text-NSP-Measures-2D-NewMeasure/lib/Text/NSP/Measures/2D/NewMeasure/
179             NewMeasure.pm
180              
181             or
182              
183             Text-NSP-Measures-3D-NewMeasure/lib/Text/NSP/Measures/3D/NewMeasure/
184             NewMeasure.pm
185              
186             =back
187              
188             in your favorite text editor, and do as follows.
189              
190             =item 2
191              
192             Let us say you have named your module NewMeasure. The first line of
193             the file should declare that it is a package. Thus the first line of
194             the file NewMeasure.pm should be...
195              
196             =over
197              
198             package Text::NSP::Measures::2D::NewMeasure; (for bigram measures)
199              
200             or
201              
202             package Text::NSP::Measures::3D::NewMeasure; (for trigram measures)
203              
204             =back
205              
206             To inherit the functionality from the 2D or 3D module you need to
207             include it in your NewMeasure.pm module.
208              
209             A small code snippet to ensure that it is included is as follows:
210              
211             =over
212              
213             =item 1 For Bigrams
214              
215             use Text::NSP::Measures::2D::MI;
216              
217             =item 2 For Trigrams
218              
219             use Text::NSP::Measures::3D::MI;
220              
221             =back
222              
223             You also need to insert the following lines to make sure that the required
224             functions are visible to the programs using your module. These lines are
225             the same for bigrams and trigrams. The "no warnings 'redefine';" statement is
226             used to suppress perl warnings about method overriding.
227              
228             =over
229              
230             use strict;
231             use Carp;
232             use warnings;
233             no warnings 'redefine';
234             require Exporter;
235              
236             our ($VERSION, @EXPORT, @ISA);
237              
238             @ISA = qw(Exporter);
239              
240             @EXPORT = qw(initializeStatistic calculateStatistic
241             getErrorCode getErrorMessage getStatisticName);
242              
243             =back
244              
245             =item 3
246             You need to implement at least one method in your package
247              
248             =over
249              
250             =item i) calculateStatistic()
251              
252             =back
253              
254             This method is passed reference to a hash containing the
255             frequency values for a Ngram as found in the input Ngram file.
256              
257             method calculateStatistic() is expected to return a (possibly
258             floating point) value as the value of the statistical measure calculated
259             using the frequency values passed to it.
260              
261             There exist three methods in the modules Text::NSP::Measures::2D and
262             Text::NSP::Measures::3D in order to help calculate the ngram
263             statistic.
264              
265             =over
266              
267             =item 1. computeMarginalTotals($frequencies);
268              
269             =item 2. computeObservedValues($frequencies);
270              
271             =item 3. computeExpectedValues($frequencies);
272              
273             =back
274              
275             These methods return the observed and expected values of the cells in
276             the contingency table. A 2D contingency table looks like:
277              
278             |word2 | not-word2|
279             --------------------
280             word1 | n11 | n12 | n1p
281             not-word1 | n21 | n22 | n2p
282             --------------------
283             np1 np2 npp
284              
285             Here the marginal totals are np1, n1p, np2, n2p, the Observed values
286             are n11, n12, n21, n22 and the expected values for the corresponding
287             observed values are represented using m11, m12, m21, m22, here m11
288             represents the expected value for the cell (1,1), m12 for the cell
289             (1,2) and so on.
290              
291             Before calling either computeObservedValues() or computeExpectedValues()
292             you MUST call computeMarginalTotals(), since these methods require the
293             marginal to be set. The computeMarginalTotals method computes the marginal
294             totals in the contingency table based on the observed frequencies. It
295             returns an undefined value in case of some error. In case of success it
296             returns '1'. An example of usage for the computeMarginalTotals() method is
297              
298             =over
299              
300             my %values = @_;
301              
302             if(!(Text::NSP::Measures::2D::computeMarginalTotals(\%values)) ){
303             return;
304             }
305              
306             =back
307              
308             @_ is the parameters passed to calculateStatistic. After this call the
309             marginal totals will be available in the following variables
310              
311             =over
312              
313             =item 1. For bigrams
314             $npp , $n1p, $np1, $n2p, $np2
315              
316             =item 2. For trigrams
317             $nppp, $n1pp, $np1p, $npp1, $n11p, $n1p1, $np11, $n2pp,
318             $np2p, $npp2
319              
320             =back
321              
322             computeObservedValues() computes the observed values of an ngram. It can be
323             called using the following code snippet. Please remember that you should call
324             computeMarginalTotals() before calling computeObservedValues().
325              
326             =over
327              
328             if( !(Text::NSP::Measures::2D::computeObservedValues(\%values)) ) {
329             return;
330             }
331              
332             =back
333              
334             %values is the same hash that was initialized earlier for computeMarginalTotals.
335              
336             If successful it returns 1 otherwise an undefined value is returned. The
337             computed observed values will be available in the following variables:
338              
339             =over
340              
341             =item 1. For bigrams
342             $n11 , $n12, $n21, $n22
343              
344             =item 2. For trigrams
345             $n111, $n112, $n121, $n122, $n211, $n212, $n221, $n222
346              
347             =back
348              
349             Similarly, computeExpectedValues() computes the expected values for each of
350             the cells in the contingency table. You should call computeMarginalTotals()
351             before calling computeExpectedValues(). The following code snippet
352             demonstrates its usage.
353              
354             =over
355              
356             if( !(Text::NSP::Measures::2D::computeExpectedValues()) ) {
357             return;
358             }
359              
360             =back
361              
362             If successful it returns 1 otherwise an undefined value is returned. The
363             computed expected values will be available in the following variables:
364              
365             =over
366              
367             =item 1. For bigrams
368             $m11 , $m12, $m21, $m22
369              
370             =item 2. For trigrams
371             $m111, $m112, $m121, $m122, $m211, $m212, $m221, $m222
372              
373             =back
374              
375             =item 4
376              
377             The last lines of a module should always return true, to achieve this
378             make sure that the last two lines of the file are:
379              
380             1;
381             __END__
382              
383             Please note that you can put documentation after these lines.
384              
385             =item 5
386              
387             There are four other methods that are not mandatory, but may be
388             implemented. These are:
389              
390             i) initializeStatistic()
391             ii) getErrorCode
392             iii) getErrorMessage
393             iv) getStatisticName()
394              
395             statistic.pl calls initializeStatistic before calling any
396             other method, if there is no need for any specific initialization
397             in the measure you need not define this method, and the
398             initialization will be handled by the Text::NSP::Measures modules
399             initializeStatistic() method.
400              
401             The getErrorCode method is called immediately after every call to
402             method calculateStatistic(). This method is used to return the
403             errorCode, if any, in the previous operations. To view all the
404             possible error codes and the corresponding error message please refer
405             to the Text::NSP documentation (perldoc Text::NSP). You can create new
406             error codes in your measure, if the existing error codes are not
407             sufficient.
408              
409             The Text::NSP::Measures module implements both getErrorCode()
410             and getErrorMessage() methods and these implementations of the method
411             will be invoked if the user does not define these methods. But if you
412             want to add some other actions that need to be performed in case
413             of an error you must override these methods by implementing them in
414             your module. You can invoke the Text::NSP::Measures getErrorCode()
415             methods from your measures getErrorCode() method.
416              
417             An example of this is below:
418              
419             sub getErrorCode
420             {
421             my $code = Text::NSP::Measures::getErrorCode();
422              
423             #your code here
424              
425             return $code; #(or any other value)
426             }
427              
428             sub getErrorMessage
429             {
430             my $message = Text::NSP::Measures::getErrorMessage();
431              
432             #your code here
433              
434             return $message; #(or any other value)
435             }
436              
437             The fourth method that may be implemented is getStatisticName().
438             If this method is implemented, it is expected to return a string
439             containing the name of the statistic being implemented. This string
440             is used in the formatted output of statistic.pl. If this method
441             is not implemented, then the statistic name entered on the
442             commandline is used in the formatted output.
443              
444             Note that all the methods described in this section are optional.
445             So, if the user elects to not implement these methods, no harm will
446             be done.
447              
448             The user may implement other methods too, but since statistic.pl is
449             not expecting anything besides the five methods above, doing so would
450             have no effect on statistic.pl.
451              
452             =item 6
453              
454             You will need to install your module before you can use it. You can do
455             this by
456              
457             Change to the base directory for the module, i.e.
458             NewMeasure
459              
460             Then issue the following commands:
461              
462             perl Makefile.PL
463             make
464             make test
465             make install
466              
467             or
468              
469             perl Makefile.PL PREFIX=<destination directory>
470             make
471             make test
472             make install
473              
474              
475             If you get any errors in the installation process, please make sure
476             that you have not made any syntactical error in your code and also
477             make sure that you have already installed the Text-NSP package.
478              
479             =back
480              
481             =head2 An Example
482              
483             To tie it all together here is an example of a measure that computes
484             the sum of ngram frequency counts.
485              
486             =over
487              
488             package Text::NSP::Measures::2D::sum;
489              
490              
491             use Text::NSP::Measures::2D;
492             use strict;
493             use Carp;
494             use warnings;
495             no warnings 'redefine';
496             require Exporter;
497              
498             our ($VERSION, @EXPORT, @ISA);
499              
500             @ISA = qw(Exporter);
501              
502             @EXPORT = qw(initializeStatistic calculateStatistic
503             getErrorCode getErrorMessage getStatisticName);
504              
505             $VERSION = '0.01';
506              
507             sub calculateStatistic
508             {
509             my %values = @_;
510              
511             # computes and returns the marginal totals from the frequency
512             # combination values. returns undef if there is an error in
513             # the computation or the values are inconsistent.
514             if(!(Text::NSP::Measures::2D::computeMarginalTotals(\%values)) ){
515             return;
516             }
517              
518             # computes and returns the observed and marginal values from
519             # the frequency combination values. returns 0 if there is an
520             # error in the computation or the values are inconsistent.
521             if( !(Text::NSP::Measures::2D::computeObservedValues(\%values)) ) {
522             return;
523             }
524              
525              
526             # Now for the actual calculation of the association measure
527             my $NewMeasure = 0;
528              
529             $NewMeasure += $n11;
530             $NewMeasure += $n12;
531             $NewMeasure += $n21;
532             $NewMeasure += $n22;
533              
534             return ( $NewMeasure );
535             }
536              
537             sub getStatisticName
538             {
539             return "Sum";
540             }
541              
542             1;
543             __END__