File Coverage

blib/lib/String/Compare.pm

Criterion	Covered	Total	%
statement	77	78	98.7
branch	18	26	69.2
condition	2	2	100.0
subroutine	8	8	100.0
pod	6	6	100.0
total	111	120	92.5

line	stmt	bran	cond	sub	pod	time	code
1							package String::Compare;
2
3	1			1		16878	use base qw(Exporter);
	1					3
	1					608
4							@EXPORT = qw(compare);
5	1			1		8	use strict;
	1					3
	1					2919
6
7							=head1 NAME
8
9							String::Compare - Compare two strings and return how much they are alike
10
11							=head1 SYNOPSIS
12
13							use String::Compare;
14							my $str1 = "J R Company";
15							my $str2 = "J. R. Company";
16							my $str3 = "J R Associates";
17							my $points12 = compare($str1,$str2);
18							my $points13 = compare($str1,$str3);
19							if ($points12 > $points13) {
20							print $str1." refers to ".$str2;
21							} else {
22							print $str1." refers to ".$str3;
23							}
24
25							=head1 DESCRIPTION
26
27							This module was created when I needed to merge the information
28							between two databases, and I had to find who were who in each database,
29							but the names weren't always equals, sometimes there were differences.
30
31							The problem was that I need to choose the right person, so I must see how
32							much the different names are alike. I've tried testing char by char, but situations
33							like the described in the synopsis showed me that wasn't enough. So I created a
34							set of tests to give a more accurate pontuation of how much the names are alike.
35
36							The result is in percentage. If the strings are exactly equal, it would return 1,
37							if they have nothing in common, it would return 0.
38
39							=head1 METHODS
40
41							=over
42
43							=item compare($str1,$str2,%tests)
44
45							This method receives the two strings and optionally the names and weights
46							of each test. The default behavior is to use all the tests with the weigth 1.
47							This method lowercases both strings, since case doesn't change the meaning
48							of the content. But each test is case sensitive, so if you like you must lc the strings.
49
50							The current tests are (you can use the tests individually if you like:
51
52							P.S.: You can use custom tests, because the tests are executed using eval,
53							so if you want a custom test, just use the full name of a method.
54
55							P.S.2: If you created a test, please share it, sending me by email and I will be
56							glad to include it into the default set.
57
58							=back
59
60							=cut
61
62							my %default_options =
63							(
64							char_by_char => 1,
65							consonants => 1,
66							vowels => 1,
67							word_by_word => 1,
68							chars_only => 1
69							);
70
71							sub compare {
72	2			2	1	77	my $str1 = shift;
73	2					4	my $str2 = shift;
74
75	2					5	$str1 = lc($str1);
76	2					5	$str2 = lc($str2);
77							# skip any tests if they are the same
78	2	50				6	return 1 if $str1 eq $str2;
79
80	2					5	my %user_opt = @_;
81	2					13	my %opt = (%default_options, %user_opt);
82	2					4	my %results;
83	2					3	my $totalPoints = 0;
84	2					3	my $score = 0;
85	2					19	foreach my $test (keys %opt) {
86	10					15	$totalPoints += $opt{$test};
87							}
88	2					7	foreach my $test (keys %opt) {
89	10	50				26	next if $opt{$test} == 0;
90	10		100			58	my $result = __PACKAGE__->can($test)->($str1,$str2) \|\| 0;
91	10					27	$score += $result * $opt{$test}/$totalPoints;
92							}
93	2					10	return $score;
94							}
95
96							=over
97
98							=item char_by_char($str1,$str2)
99
100							Tests character by character
101
102							=back
103
104							=cut
105
106							sub char_by_char {
107	26			26	1	31	my $str1 = shift;
108	26					27	my $str2 = shift;
109	26					34	my $size1 = length $str1;
110	26					35	my $size2 = length $str2;
111	26					24	my $score = 0;
112	26	100				45	my $size = $size1>$size2?$size1:$size2;
113	26					124	for (my $i = 0;$i < $size; $i++) {
114	121	100				285	if (length $str1 < $i) {
115	9					12	last;
116							}
117	112	100				171	if (length $str2 < $i) {
118	4					6	last;
119							}
120	108					126	my $c1 = substr $str1, $i, 1;
121	108					120	my $c2 = substr $str2, $i, 1;
122	108	100				326	if ($c1 eq $c2) {
123	38					90	$score += 1/$size;
124							}
125							}
126	26					68	return $score;
127							}
128
129							=over
130
131							=item consonants($str1,$str2)
132
133							Test char_by_char only in the consonants.
134
135							=back
136
137							=cut
138
139							consoants = consonants;
140							sub consonants {
141	2			2	1	3	my $str1 = shift;
142	2					26	my $str2 = shift;
143	2					12	$str1 =~ s/[^bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ]//g;
144	2					12	$str2 =~ s/[^bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ]//g;
145	2					6	return char_by_char($str1,$str2);
146							}
147
148							=over
149
150							=item vowels($str1,$str2)
151
152							Test char_by_char only in the vowels.
153
154							=back
155
156							=cut
157
158							sub vowels {
159	2			2	1	4	my $str1 = shift;
160	2					3	my $str2 = shift;
161	2					20	$str1 =~ s/[^aeiouyAEIOUY]//g;
162	2					60	$str2 =~ s/[^aeiouyAEIOUY]//g;
163	2					6	return char_by_char($str1,$str2);
164							}
165
166							=over
167
168							=item word_by_word($str1, $str2)
169
170							Test char_by_char each word, giving points according to the
171							size of the word.
172
173							=back
174
175							=cut
176
177							sub word_by_word {
178	2			2	1	4	my $str1 = shift;
179	2					3	my $str2 = shift;
180	2					7	my @words1 = split(/\s+/,$str1);
181	2					7	my @words2 = split(/\s+/,$str2);
182	2					3	my $size1 = scalar @words1;
183	2					4	my $size2 = scalar @words2;
184	2	50				5	my $size = $size1>$size2?$size1:$size2;
185	2					3	my $score;
186							my $totalChars;
187	0					0	my @totalCharsPerWord;
188	2					7	for (my $i = 0; $i < $size; $i++) {
189	6	50				13	my $subsize1 = $i < $size1 ? length($words1[$i]) : 0;
190	6	50				14	my $subsize2 = $i < $size2 ? length($words2[$i]) : 0;
191	6	50				10	my $subsize = $subsize1 > $subsize2?$subsize1:$subsize2;
192	6					7	$totalChars += $subsize;
193	6					16	push @totalCharsPerWord, $subsize;
194							}
195	2					6	for (my $i = 0; $i < $size; $i++) {
196	6	50				12	last if $i >= $size1;
197	6					7	my $bestScore = 0;
198	6					14	for (my $j = 0; $j < $size; $j++) {
199	18	50				120	last if $j >= $size2;
200	18					37	my $result = char_by_char($words1[$i],$words2[$j]);
201	18	100				56	$bestScore = $result if $result > $bestScore;
202							}
203	6					19	$score += $bestScore * $totalCharsPerWord[$i]/$totalChars;
204							}
205	2					10	return $score;
206							}
207
208							=over
209
210							=item chars_only($str1,$str2)
211
212							Test char_by_char only with the characters matched by \w.
213
214							=back
215
216							=cut
217
218							sub chars_only {
219	2			2	1	15	my $str1 = shift;
220	2					3	my $str2 = shift;
221	2					10	$str1 =~ s/\W//g;
222	2					9	$str2 =~ s/\W//g;
223	2					5	return char_by_char($str1,$str2);
224							}
225
226
227							=head1 COPYRIGHT
228
229							This module was created by "Daniel Ruoso" . It is licensed under both
230							the GNU GPL and the Artistic License.
231
232							=cut
233