File Coverage

blib/lib/Text/Similarity/Overlaps.pm

Criterion	Covered	Total	%
statement	112	137	81.7
branch	21	32	65.6
condition			n/a
subroutine	14	15	93.3
pod	0	5	0.0
total	147	189	77.7

line	stmt	bran	sub	pod	time	code
1						package Text::Similarity::Overlaps;
2
3	3		3		12540	use 5.006;
	3				10
	3				109
4	3		3		14	use strict;
	3				4
	3				84
5	3		3		13	use warnings;
	3				4
	3				69
6
7	3		3		18	use Text::Similarity;
	3				17
	3				128
8	3		3		1502	use Text::OverlapFinder;
	3				7
	3				332
9
10						our @ISA = qw(Text::Similarity);
11
12						# Items to export into callers namespace by default. Note: do not export
13						# names by default without a very good reason. Use EXPORT_OK instead.
14						# Do not simply export all your public functions/methods/constants.
15
16						# This allows declaration use Text::Similarity::Overlaps ':all';
17						# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
18						# will save memory.
19
20						our %EXPORT_TAGS = ( 'all' => [ qw() ] );
21						our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
22
23						our @EXPORT = qw();
24
25						our $VERSION = '0.05';
26
27						# information about granularity is not used now
28						# the would be that you could figure out similarity
29						# unit by unit, that is sentence by sentence,
30						# paragraph by paragraph, etc. however, at this
31						# point similarity is only computed document by
32						# document
33
34	3		3		20	use constant WORD => 0;
	3				4
	3				144
35	3		3		17	use constant SENTENCE => 1;
	3				5
	3				137
36	3		3		16	use constant PARAGRAPH => 2;
	3				5
	3				126
37	3		3		15	use constant DOCUMENT => 3;
	3				5
	3				4084
38
39						my %finder;
40
41						sub new
42						{
43	4		4	0	472	my $class = shift;
44	4				44	my $self = $class->SUPER::new (@_);
45	4	50			29	if ($self->stoplist) {
46	0				0	$finder{$self} = Text::OverlapFinder->new(stoplist => $self->stoplist);
47						}
48						else {
49	4				31	$finder{$self} = Text::OverlapFinder->new;
50						}
51
52						#string_compare_initialize (0, undef);
53	4				24	return $self;
54						}
55
56
57						sub finder : lvalue
58						{
59	42		42	0	52	my $self = shift;
60	42				252	$finder{$self};
61						}
62
63						sub DESTROY
64						{
65	4		4		2156	my $self = shift;
66	4				31	delete $finder{$self};
67						}
68
69						# this method requires that the input be provided in files.
70						# this is now just a front end to getSimilarityStrings that
71						# does file handling. Actual similarity measurements are
72						# performed between strings in getSimilarityStrings.
73
74						# this seems like it might want to move up into Similarity.pm
75
76						sub getSimilarity
77						{
78	28		28	0	29462	my $self = shift;
79
80						# we created a separate method for getSimilarityStrings since overloading
81						# to accept both strings and file names as input parameters to
82						# getSimilarity would have required that we treat any file name that does
83						# not have a corresponding file to be treated as a string, thus making it
84						# impossible to really deal with missing file errors, and probably resulting
85						# in quite a bit of user annoyance as 'textt1.txt' is measured for similarity
86						# with the contents of 'text2.txt'
87
88	28				59	my $file1 = shift;
89	28				35	my $file2 = shift;
90
91						# granularity is not currently supported
92
93	28				35	my $granularity = shift;
94	28	50			93	$granularity = DOCUMENT unless defined $granularity;
95
96	28	50			563	unless (-e $file1) {
97	0				0	$self->error ("The file '$file1' does not exist");
98	0				0	return undef;
99						}
100	28	50			310	unless (-e $file2) {
101	0				0	$self->error ("The file '$file2' does not exist");
102	0				0	return undef;
103						}
104
105	28	50			916	unless (open (FH1, '<', $file1)) {
106	0				0	$self->error ("Cannot open $file1: $!");
107	0				0	return undef;
108						}
109	28	50			1141	unless (open (FH2, '<', $file2)) {
110	0				0	$self->error ("Cannot open $file2: $!");
111	0				0	return undef;
112						}
113
114	28				38	my $str1;
115						my $str2;
116	28				471	while () {
117	100				278	$str1 .= $self->sanitizeString ($_);
118						}
119	28				538	$str1 =~ s/\s+/ /g;
120	28				519	while () {
121	66				159	$str2 .= $self->sanitizeString ($_);
122						}
123	28				597	$str2 =~ s/\s+/ /g;
124
125						# find compounds, not working right now
126
127	28				102	$str1 = $self->compoundify ($str1);
128	28				72	$str2 = $self->compoundify ($str2);
129
130	28				403	close FH1;
131	28				236	close FH2;
132
133						# add this call, to expose this method to users too (in case they want to
134						# just measure the similarity of two strings. So we have our files converted
135						# into strings, and now measure their similarity.
136
137	28				91	my ($score, %allScores) = $self -> getSimilarityStrings ($str1,$str2);
138
139						# end getSimilarity here, making sure to return similarity value from
140						# get SimilarityStrings
141
142	28	50			215	return wantarray ? ($score, %allScores) : $score;
143						}
144
145						# this method measures the similarity between two strings. If a string is empty
146						# or missing then we generate an error and return undefined
147
148						sub getSimilarityStrings {
149
150	44		44	0	10345	my $self = shift;
151
152	44				66	my $input1 = shift;
153	44				51	my $input2 = shift;
154
155						# check to make sure you have a string! empty file or string should be rejected
156
157	44	100			114	if (!defined($input1)) {
158	2				19	$self->error ("first input string is undefined: $!");
159	2				6	return undef;
160						}
161	42	50			79	if (!defined($input2)) {
162	0				0	$self->error ("second input string is undefined: $!");
163	0				0	return undef;
164						}
165
166						# clean the strings, maybe make this optional
167
168	42				122	my $str1 .= $self->sanitizeString ($input1);
169	42				127	my $str2 .= $self->sanitizeString ($input2);
170
171						# find compounds, not working right now
172
173	42				139	$str1 = $self->compoundify ($str1);
174	42				110	$str2 = $self->compoundify ($str2);
175
176						# this is where we find our overlaps....
177						# then we score them using both the standard overlaps measure
178						# as well as based on the lesk.pm module from WordNet-Similarity
179
180	42				114	my ($overlaps, $wc1, $wc2) = $self->finder->getOverlaps ($str1, $str2);
181
182	42				71	my $score = 0;
183	42				84	my $raw_lesk = 0;
184	42				163	my %allScores = ('wc1' => $wc1, 'wc2' => $wc2);
185
186	42	50			201	if ($self->verbose) {
187	0				0	print STDERR "keys: ", scalar keys %$overlaps, "\n";
188						}
189
190	42				199	foreach my $key (sort keys %$overlaps) {
191
192	40				390	my @words = split /\s+/, $key;
193
194	40	50			142	if ($self->verbose) {
195	0				0	print STDERR "-->'$key' len(", scalar @words, ") cnt(", $overlaps->{$key}, ")\n";
196						}
197
198						# find out how many words match, add 1 for each match
199
200						#$score += scalar @words * scalar @words * ${$overlaps}{$key};
201	40				141	$score += scalar @words * $overlaps->{$key};
202	40				81	$allScores{'raw'} = $score;
203						# find the length of the key, square it, multiply with its
204						# value to get the lesk score for this particular match
205
206	40				81	my $value = ($#words + 1) * ($#words + 1) * ${$overlaps}{$key};
	40				74
207	40				176	$raw_lesk += $value;
208
209						}
210
211						# fix for divide by zero error, will short circuit when score is 0
212						# provided by cernst at esoft.com
213						# who reported via rt.cpan.org ticket 29902
214
215	42	100			107	if ($score == 0){
216	10				18	$allScores{'raw'} = 0;
217	10				16	$allScores{'precision'} = 0;
218	10				19	$allScores{'recall'} = 0;
219	10				16	$allScores{'F'} = 0;
220	10				14	$allScores{'dice'} = 0;
221	10				16	$allScores{'E'} = 0;
222	10				15	$allScores{'cosine'} = 0;
223	10				20	$allScores{'raw_lesk'} = 0;
224	10				17	$allScores{'lesk'} = 0;
225
226	10	100			92	return wantarray ? ($score, %allScores) : $score;
227						}
228
229						# end of fix
230
231	32				58	my $prec = $score / $wc2;
232	32				43	my $recall = $score / $wc1;
233	32				78	my $f = 2 * $prec * $recall / ($prec + $recall);
234
235	32				56	my $dice = 2 * $score / ($wc1 + $wc2) ;
236	32				48	my $e = 1 - $f;
237	32				86	my $cos = $score / sqrt ($wc1 * $wc2);
238	32				45	my $lesk = $raw_lesk/ ($wc1 * $wc2);
239
240	32				72	$allScores{'precision'} = $prec;
241	32				48	$allScores{'recall'} = $recall;
242	32				61	$allScores{'F'} = $f;
243	32				50	$allScores{'dice'} = $dice;
244	32				59	$allScores{'E'} = $e;
245	32				54	$allScores{'cosine'} = $cos;
246	32				49	$allScores{'raw_lesk'} = $raw_lesk;
247	32				59	$allScores{'lesk'} = $lesk;
248
249						# display them, if requested
250	32	50			96	if ($self->verbose) {
251	0				0	print STDERR "wc 1: $wc1\n";
252	0				0	print STDERR "wc 2: $wc2\n";
253	0				0	print STDERR " Raw score: $score\n";
254	0				0	print STDERR " Precision: $prec\n";
255	0				0	print STDERR " Recall : $recall\n";
256	0				0	print STDERR " F-measure: $f\n";
257	0				0	print STDERR " Dice : $dice\n";
258	0				0	print STDERR " E-measure: $e\n";
259	0				0	print STDERR " Cosine : $cos\n";
260	0				0	print STDERR " Raw lesk : $raw_lesk\n";
261	0				0	print STDERR " Lesk : $lesk\n";
262
263						#
264						# include might be the jaccard coefficient ....
265						# jaccard is 2*raw_score / union of string1 and string2
266						#
267						}
268
269	32	100			106	if ($self->normalize) {
270	16				24	$score = $f;
271						}
272
273	32	100			426	return wantarray ? ($score, %allScores) : $score;
274						}
275
276	0		0	0		sub doStop {0}
277
278						1;
279
280						__END__