line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Copyright 2009 Francesco Nidito. All rights reserved. |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# This library is free software; you can redistribute it and/or |
4
|
|
|
|
|
|
|
# modify it under the same terms as Perl itself. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Algorithm::WordLevelStatistics; |
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
6306
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
6
|
use vars qw($VERSION); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
497
|
|
11
|
|
|
|
|
|
|
$VERSION = '0.03'; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new { |
14
|
1
|
|
|
1
|
1
|
79
|
my $class = shift; |
15
|
1
|
|
|
|
|
6
|
return bless { |
16
|
|
|
|
|
|
|
version => $VERSION, |
17
|
|
|
|
|
|
|
}, $class; |
18
|
|
|
|
|
|
|
} |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# computes the statistical level of a single word (given its spectrum and the total number of words in the text) |
21
|
|
|
|
|
|
|
sub compute_spectrum { |
22
|
2907
|
|
|
2907
|
1
|
3735
|
my ($self, $N, $s) = @_; |
23
|
|
|
|
|
|
|
|
24
|
2907
|
|
|
|
|
2653
|
my $n = @{$s}; |
|
2907
|
|
|
|
|
3894
|
|
25
|
|
|
|
|
|
|
|
26
|
2907
|
|
|
|
|
8157
|
my $ls = { count => $n, C => 0, sigma_nor => 0 }; |
27
|
2907
|
100
|
|
|
|
5396
|
if( $n > 3 ) { |
28
|
|
|
|
|
|
|
# position -> distance from preceding element in text |
29
|
973
|
|
|
|
|
1153
|
my @tmp = (); |
30
|
973
|
|
|
|
|
1951
|
for( my $i = 0; $i < ($n-1); ++$i ){ push @tmp, ($s->[$i+1] - $s->[$i]); } |
|
28079
|
|
|
|
|
64612
|
|
31
|
|
|
|
|
|
|
|
32
|
973
|
|
|
|
|
1976
|
my ($avg, $sigma) = $self->_mean_and_variance( \@tmp ); |
33
|
973
|
|
|
|
|
1273
|
$sigma = sqrt($sigma)/$avg; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# normalize sigma using an hypothetical uniform distribution |
36
|
973
|
|
|
|
|
1075
|
my $p = $n/$N; |
37
|
973
|
|
|
|
|
1639
|
$ls->{sigma_nor} = $sigma/sqrt(1.0-$p); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# this is not simple: |
40
|
973
|
|
|
|
|
4267
|
$ls->{C} = ($ls->{sigma_nor} - (2.0*$n-1.0)/(2.0*$n+2.0)) * ( sqrt($n)*(1.0+2.8*$n**-0.865) ); |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
2907
|
|
|
|
|
7374
|
return $ls; |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# computes the statistical level of a group of words (given their spectra) |
47
|
|
|
|
|
|
|
sub compute_spectra { |
48
|
1
|
|
|
1
|
1
|
153570
|
my ($self, $s) = @_; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# count the total number of "words" in text |
51
|
1
|
|
|
|
|
3
|
my $N = 0; |
52
|
1
|
|
|
|
|
2
|
foreach my $i (keys(%{$s})){ $N += @{ $s->{$i} }; } |
|
1
|
|
|
|
|
849
|
|
|
2907
|
|
|
|
|
2571
|
|
|
2907
|
|
|
|
|
4579
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# computes the level statistic for all terms |
55
|
1
|
|
|
|
|
161
|
my %r = (); |
56
|
1
|
|
|
|
|
3
|
foreach my $i (keys(%{$s})){ |
|
1
|
|
|
|
|
313
|
|
57
|
2907
|
|
|
|
|
6820
|
$r{$i} = $self->compute_spectrum( $N, $s->{$i} ); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
1
|
|
|
|
|
421
|
return \%r; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# fast, on-line algorithm to compute mean and variance: |
64
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm |
65
|
|
|
|
|
|
|
sub _mean_and_variance { |
66
|
973
|
|
|
973
|
|
1049
|
my ($self, $v) = @_; |
67
|
973
|
|
|
|
|
1232
|
my ($n, $mean, $M2) = (0, 0, 0); |
68
|
|
|
|
|
|
|
|
69
|
973
|
|
|
|
|
812
|
foreach my $x (@{$v}) { |
|
973
|
|
|
|
|
1520
|
|
70
|
28079
|
|
|
|
|
33327
|
$n++; |
71
|
28079
|
|
|
|
|
31278
|
my $delta = $x - $mean; |
72
|
28079
|
|
|
|
|
31202
|
$mean += $delta/$n; |
73
|
28079
|
|
|
|
|
37409
|
$M2 += $delta*($x - $mean); |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
973
|
|
|
|
|
1517
|
my $variance = $M2/$n; |
77
|
|
|
|
|
|
|
|
78
|
973
|
|
|
|
|
1823
|
return ($mean, $variance); |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
1; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
__END__ |