File Coverage

blib/lib/Algorithm/WordLevelStatistics.pm
Criterion Covered Total %
statement 43 43 100.0
branch 2 2 100.0
condition n/a
subroutine 6 6 100.0
pod 3 3 100.0
total 54 54 100.0


line stmt bran cond sub pod time code
1             # Copyright 2009 Francesco Nidito. All rights reserved.
2             #
3             # This library is free software; you can redistribute it and/or
4             # modify it under the same terms as Perl itself.
5              
6             package Algorithm::WordLevelStatistics;
7              
8 1     1   6306 use strict;
  1         2  
  1         38  
9              
10 1     1   6 use vars qw($VERSION);
  1         2  
  1         497  
11             $VERSION = '0.03';
12              
13             sub new {
14 1     1 1 79 my $class = shift;
15 1         6 return bless {
16             version => $VERSION,
17             }, $class;
18             }
19              
20             # computes the statistical level of a single word (given its spectrum and the total number of words in the text)
21             sub compute_spectrum {
22 2907     2907 1 3735 my ($self, $N, $s) = @_;
23              
24 2907         2653 my $n = @{$s};
  2907         3894  
25              
26 2907         8157 my $ls = { count => $n, C => 0, sigma_nor => 0 };
27 2907 100       5396 if( $n > 3 ) {
28             # position -> distance from preceding element in text
29 973         1153 my @tmp = ();
30 973         1951 for( my $i = 0; $i < ($n-1); ++$i ){ push @tmp, ($s->[$i+1] - $s->[$i]); }
  28079         64612  
31              
32 973         1976 my ($avg, $sigma) = $self->_mean_and_variance( \@tmp );
33 973         1273 $sigma = sqrt($sigma)/$avg;
34              
35             # normalize sigma using an hypothetical uniform distribution
36 973         1075 my $p = $n/$N;
37 973         1639 $ls->{sigma_nor} = $sigma/sqrt(1.0-$p);
38              
39             # this is not simple:
40 973         4267 $ls->{C} = ($ls->{sigma_nor} - (2.0*$n-1.0)/(2.0*$n+2.0)) * ( sqrt($n)*(1.0+2.8*$n**-0.865) );
41             }
42              
43 2907         7374 return $ls;
44             }
45              
46             # computes the statistical level of a group of words (given their spectra)
47             sub compute_spectra {
48 1     1 1 153570 my ($self, $s) = @_;
49              
50             # count the total number of "words" in text
51 1         3 my $N = 0;
52 1         2 foreach my $i (keys(%{$s})){ $N += @{ $s->{$i} }; }
  1         849  
  2907         2571  
  2907         4579  
53              
54             # computes the level statistic for all terms
55 1         161 my %r = ();
56 1         3 foreach my $i (keys(%{$s})){
  1         313  
57 2907         6820 $r{$i} = $self->compute_spectrum( $N, $s->{$i} );
58             }
59              
60 1         421 return \%r;
61             }
62              
63             # fast, on-line algorithm to compute mean and variance:
64             # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
65             sub _mean_and_variance {
66 973     973   1049 my ($self, $v) = @_;
67 973         1232 my ($n, $mean, $M2) = (0, 0, 0);
68              
69 973         812 foreach my $x (@{$v}) {
  973         1520  
70 28079         33327 $n++;
71 28079         31278 my $delta = $x - $mean;
72 28079         31202 $mean += $delta/$n;
73 28079         37409 $M2 += $delta*($x - $mean);
74             }
75              
76 973         1517 my $variance = $M2/$n;
77              
78 973         1823 return ($mean, $variance);
79             }
80             1;
81              
82             __END__