File Coverage

blib/lib/Lingua/EN/Segmenter/Evaluator.pm

Criterion	Covered	Total	%
statement	11	63	17.4
branch	0	24	0.0
condition	0	44	0.0
subroutine	4	9	44.4
pod	0	6	0.0
total	15	146	10.2

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::Segmenter::Evaluator;
2
3							=head1 NAME
4
5							Lingua::EN::Segmenter::Evaluator - Evaluate a segmenting method
6
7							=head1 SYNOPSIS
8
9							my $tiling_segmenter = Lingua::EN::Segmenter::TextTiling->new();
10
11							foreach (@ARGV) {
12							my $input = read_file($_);
13
14							print "\nFile name: $_\n";
15
16							printf "Results from TextTiling algorithm:
17							Strict scoring: %2d%% recall, %2d%% precision
18							Relaxed scoring: %2d%% recall, %2d%% precision
19							V. relaxed scoring: %2d%% recall, %2d%% precision
20							", calc_stats(evaluate_segmenter($tiling_segmenter,20,$input));
21							}
22
23							=head1 DESCRIPTION
24
25							See synopsis.
26
27							Also check out segmenter.pl in the eg directory.
28
29							=head1 BUGS
30
31							This module only works correctly when the segmenter has a MIN_SEGMENT_SIZE >= 2.
32
33							=head1 AUTHORS
34
35							David James
36
37							=head1 SEE ALSO
38
39							L, L,
40							L
41
42							=cut
43
44
45							$VERSION = 0.10;
46							@EXPORT_OK = qw(evaluate_segmenter calc_stats);
47	1			1		2262	use strict;
	1					2
	1					37
48	1			1		6	use base 'Class::Exporter';
	1					3
	1					84
49	1			1		6	use Math::HashSum qw(hashsum);
	1					2
	1					1011
50
51							# Create a new Evaluator object
52							sub new {
53	1			1	0	60	my $self = shift;
54	1					7	bless {
55							@_
56							}, $self
57							}
58
59
60							# Evaluate the segmenter on a particular input
61							sub evaluate_segmenter {
62	0			0	0		my ($self, $segmenter, $input, $num_segments) = @_;
63
64	0						$self->{taken} = {};
65
66	0						my $num_paragraphs = @{$segmenter->{splitter}->paragraph_breaks($input)};
	0
67
68	0						my $break = $self->{break} = $segmenter->{splitter}->segment_breaks($input);
69	0		0				$num_segments \|\|= scalar keys %{$break};
	0
70	0						my $assigned = $self->{assigned} = $segmenter->segment($num_segments, $input);
71
72	0		0				my @description = map { {
	0
73							para=>$_,
74							true=>exists $break->{$_},
75							label=>$assigned->{$_},
76							strict=>exists $break->{$_} && exists $assigned->{$_},
77							relaxed=>$self->relaxed_weight($_),
78							very_relaxed=>$self->very_relaxed_weight($_),
79							} } (0..$num_paragraphs-1);
80
81	0						return @description;
82							}
83
84							# Get the weight of a particular index based on a relaxed scheme
85							# NOTE: Assumes that MIN_SEGMENT_SIZE >= 2
86							sub relaxed_weight {
87	0			0	0		my ($self, $i) = @_;
88	0						my $assigned = $self->{assigned}{$i};
89	0						my $break = $self->{break}{$i};
90	0	0	0				if ($assigned and $break) {
91	0						$self->take(1,"break",$i);
92	0						$self->take(1,"assigned",$i);
93	0						return 1;
94							}
95	0	0					if (defined $assigned) {
		0
96	0	0	0				if ($assigned =~ /L/ and $self->take(1,"break",$i-1) or
		0	0
			0
			0
97							$assigned =~ /R/ and $self->take(1,"break",$i+1)) {
98	0						return 0.8;
99							} elsif ($self->take(1,"break",$i-1) or $self->take(1,"break",$i+1)) {
100	0						return 0.4;
101							}
102							} elsif (exists $self->{break}{$i}) {
103	0	0	0				if ($self->take(1,"assigned",$i-1,"R") or
		0	0
104							$self->take(1,"assigned",$i+1,"L")) {
105	0						return 0.8;
106							} elsif ($self->take(1,"assigned",$i-1) or
107							$self->take(1,"assigned",$i+1)) {
108
109	0						return 0.4;
110							}
111							}
112	0						return 0;
113							}
114
115							# Get the weight of a particular index based on a very relaxed scheme
116							# NOTE: Assumes that MIN_SEGMENT_SIZE >= 2
117							sub very_relaxed_weight {
118	0			0	0		my ($self, $i) = @_;
119	0						my $assigned = $self->{assigned}{$i};
120	0						my $break = $self->{break}{$i};
121
122	0	0	0				if ($assigned or $break) {
123	0						foreach (-2..2) {
124	0		0				$assigned \|\|= $self->take(2,"assigned",$i+$_);
125	0		0				$break \|\|= $self->take(2,"break",$i+$_);
126							}
127							}
128	0		0				return ($assigned and $break);
129							}
130
131							# Mark a particular index as used if it's not already used
132							sub take {
133	0			0	0		my ($self,$count,$which,$i,$req) = @_;
134	0	0	0				if (!$self->{taken}{$count}{$which}{$i} and $self->{$which}{$i}) {
135	0	0	0				if (!$req or $self->{$which}{$i} =~ /$req/) {
136	0						$self->{taken}{$count}{$which}{$i}++;
137	0						return 1;
138
139							}
140							}
141	0						return;
142							}
143
144							# Calculate precision and recall for strict, relaxed, very_relaxed
145							sub calc_stats {
146	0			0	0		my $self = shift;
147	0						my %sum = hashsum map { %$_ } @_;
	0
148
149							# Ensure "R" and "L" count as categories
150	0						$sum{label} = grep { $_->{label} } @_;
	0
151
152							# Ensure relaxed counts don't double-count
153	0						$sum{relaxed} -= ($sum{relaxed} - $sum{strict})/2;
154	0						$sum{very_relaxed} -= ($sum{very_relaxed} - $sum{strict})/2;
155
156							# Sanity checks
157	0	0					if ($sum{true} == 0) {
		0
158	0						die "No segment_breaks found. Please label the true segments in the original text so that we can evaluate the performance of the Segmenting algorithm";
159							} elsif ($sum{label} == 0) {
160	0						die "No segments labelled by Segmenting algorithm";
161							}
162
163							# Return results
164	0						return map { 100$sum{$_}/$sum{true}, 100$sum{$_} / $sum{label} }
	0
165							qw(strict relaxed very_relaxed);
166							}
167
168							1;
169