File Coverage

blib/lib/Algorithm/WordLevelStatistics.pm

Criterion	Covered	Total	%
statement	43	43	100.0
branch	2	2	100.0
condition			n/a
subroutine	6	6	100.0
pod	3	3	100.0
total	54	54	100.0

line	stmt	bran	sub	pod	time	code
1						# Copyright 2009 Francesco Nidito. All rights reserved.
2						#
3						# This library is free software; you can redistribute it and/or
4						# modify it under the same terms as Perl itself.
5
6						package Algorithm::WordLevelStatistics;
7
8	1		1		6306	use strict;
	1				2
	1				38
9
10	1		1		6	use vars qw($VERSION);
	1				2
	1				497
11						$VERSION = '0.03';
12
13						sub new {
14	1		1	1	79	my $class = shift;
15	1				6	return bless {
16						version => $VERSION,
17						}, $class;
18						}
19
20						# computes the statistical level of a single word (given its spectrum and the total number of words in the text)
21						sub compute_spectrum {
22	2907		2907	1	3735	my ($self, $N, $s) = @_;
23
24	2907				2653	my $n = @{$s};
	2907				3894
25
26	2907				8157	my $ls = { count => $n, C => 0, sigma_nor => 0 };
27	2907	100			5396	if( $n > 3 ) {
28						# position -> distance from preceding element in text
29	973				1153	my @tmp = ();
30	973				1951	for( my $i = 0; $i < ($n-1); ++$i ){ push @tmp, ($s->[$i+1] - $s->[$i]); }
	28079				64612
31
32	973				1976	my ($avg, $sigma) = $self->_mean_and_variance( \@tmp );
33	973				1273	$sigma = sqrt($sigma)/$avg;
34
35						# normalize sigma using an hypothetical uniform distribution
36	973				1075	my $p = $n/$N;
37	973				1639	$ls->{sigma_nor} = $sigma/sqrt(1.0-$p);
38
39						# this is not simple:
40	973				4267	$ls->{C} = ($ls->{sigma_nor} - (2.0$n-1.0)/(2.0$n+2.0)) * ( sqrt($n)(1.0+2.8$n**-0.865) );
41						}
42
43	2907				7374	return $ls;
44						}
45
46						# computes the statistical level of a group of words (given their spectra)
47						sub compute_spectra {
48	1		1	1	153570	my ($self, $s) = @_;
49
50						# count the total number of "words" in text
51	1				3	my $N = 0;
52	1				2	foreach my $i (keys(%{$s})){ $N += @{ $s->{$i} }; }
	1				849
	2907				2571
	2907				4579
53
54						# computes the level statistic for all terms
55	1				161	my %r = ();
56	1				3	foreach my $i (keys(%{$s})){
	1				313
57	2907				6820	$r{$i} = $self->compute_spectrum( $N, $s->{$i} );
58						}
59
60	1				421	return \%r;
61						}
62
63						# fast, on-line algorithm to compute mean and variance:
64						# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
65						sub _mean_and_variance {
66	973		973		1049	my ($self, $v) = @_;
67	973				1232	my ($n, $mean, $M2) = (0, 0, 0);
68
69	973				812	foreach my $x (@{$v}) {
	973				1520
70	28079				33327	$n++;
71	28079				31278	my $delta = $x - $mean;
72	28079				31202	$mean += $delta/$n;
73	28079				37409	$M2 += $delta*($x - $mean);
74						}
75
76	973				1517	my $variance = $M2/$n;
77
78	973				1823	return ($mean, $variance);
79						}
80						1;
81
82						__END__