line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::EN::SimilarNames::Levenshtein; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
2029
|
use MooseX::Declare; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
use Text::LevenshteinXS qw(distance); |
5
|
|
|
|
|
|
|
use Math::Combinatorics; |
6
|
|
|
|
|
|
|
use strict; |
7
|
|
|
|
|
|
|
use warnings; |
8
|
|
|
|
|
|
|
use 5.010; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.10'; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 Name |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Lingua::EN::SimilarNames::Levenshtein - Compare people's first and last names. |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 Synopsis |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $people = [ |
19
|
|
|
|
|
|
|
[ 'John', 'Wayne' ], |
20
|
|
|
|
|
|
|
[ 'Sundance', 'Kid' ], |
21
|
|
|
|
|
|
|
[ 'Jose', 'Wales' ], |
22
|
|
|
|
|
|
|
[ 'John', 'Wall' ], |
23
|
|
|
|
|
|
|
]; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my @people_objects = map { |
26
|
|
|
|
|
|
|
Person->new( |
27
|
|
|
|
|
|
|
first_name => $_->[0], |
28
|
|
|
|
|
|
|
last_name => $_->[1], |
29
|
|
|
|
|
|
|
) |
30
|
|
|
|
|
|
|
} @{$people}; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Build list of name pairs within 5 character edits of each other |
33
|
|
|
|
|
|
|
my $similar_people = SimilarNames->new( |
34
|
|
|
|
|
|
|
list_of_people => \@people_objects, |
35
|
|
|
|
|
|
|
maximum_distance => 5 |
36
|
|
|
|
|
|
|
); |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Get the people name pairs as an ArrayRef[ArrayRef[ArrayRef[Str]]] |
39
|
|
|
|
|
|
|
print Dumper $similar_people->list_of_similar_name_pairs; |
40
|
|
|
|
|
|
|
# which results in: |
41
|
|
|
|
|
|
|
[ |
42
|
|
|
|
|
|
|
[ [ "Jose", "Wales" ], [ "John", "Wall" ] ], |
43
|
|
|
|
|
|
|
[ [ "Jose", "Wales" ], [ "John", "Wayne" ] ], |
44
|
|
|
|
|
|
|
[ [ "John", "Wall" ], [ "John", "Wayne" ] ] |
45
|
|
|
|
|
|
|
] |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 Description |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
Given a list of people objects, find the people whose names are within a |
50
|
|
|
|
|
|
|
specified edit distance. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 Classes |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head2 Person |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This class defines people objects with first and last name attributes. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
class Person {
    # The two name parts. Both are read-only strings that fall back to the
    # empty string when not supplied to the constructor.
    has [qw(first_name last_name)] => (
        is      => 'ro',
        isa     => 'Str',
        default => '',
    );

    # First and last name joined by a single space; computed on first access
    # by _build_full_name.
    has 'full_name' => (
        is         => 'ro',
        isa        => 'Str',
        lazy_build => 1,
    );

    # Print the person's full name, followed by a newline.
    method say_name() {
        my $name = $self->full_name;
        say $name;
    }

    # Builder for full_name.
    method _build_full_name {
        return join ' ', $self->first_name, $self->last_name;
    }
}
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head2 CompareTwoNames |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
This class defines comparator objects. Given two Person objects, |
83
|
|
|
|
|
|
|
it computes the edit distance between their names. |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=cut |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
class CompareTwoNames {
    # The two people being compared. Both accessors are writable, so the
    # cached distance has to be dropped whenever one of them is replaced;
    # otherwise distance_between would keep reporting the value computed for
    # the previous pair. The trigger only clears the cache when an earlier
    # value is being replaced (Moose hands the old value to the trigger as an
    # extra argument in that case), so object construction behaves as before.
    has 'one_person' => (
        isa     => 'Person',
        is      => 'rw',
        trigger => sub {
            my ($self, $new_person, @previous) = @_;
            $self->clear_distance_between if @previous;
        },
    );
    has 'another_person' => (
        isa     => 'Person',
        is      => 'rw',
        trigger => sub {
            my ($self, $new_person, @previous) = @_;
            $self->clear_distance_between if @previous;
        },
    );

    # Edit distance between the two people's names. Computed on first access
    # by _build_distance_between and cached until one of the people changes.
    has 'distance_between' => (
        isa        => 'Int',
        is         => 'ro',
        lazy_build => 1,
    );

    # Builder for distance_between: the Levenshtein distance between the two
    # first names plus the Levenshtein distance between the two last names.
    # Both one_person and another_person must be set before this runs, since
    # name accessors are called on each of them.
    method _build_distance_between() {
        my $one     = $self->one_person;
        my $another = $self->another_person;

        return Text::LevenshteinXS::distance($one->first_name, $another->first_name)
             + Text::LevenshteinXS::distance($one->last_name,  $another->last_name);
    }
}
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 SimilarNames |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
This class takes a list of Person objects and uses CompareTwoNames to |
107
|
|
|
|
|
|
|
generate a list of people with similar names based on an edit distance range. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
One can get at the list of Person object pairs with similar names via the |
110
|
|
|
|
|
|
|
C<list_of_people_with_similar_names> attribute. Alternatively, one can |
111
|
|
|
|
|
|
|
get at the list of name pairs themselves (no Person objects) via the |
112
|
|
|
|
|
|
|
C<list_of_similar_name_pairs> attribute. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
class SimilarNames {
    # People whose names are compared pairwise. Optional: when it is not
    # passed to the constructor, _build_list_of_people supplies an empty list.
    has 'list_of_people' => (
        isa        => 'ArrayRef[Person]',
        is         => 'ro',
        lazy_build => 1
    );

    # Inclusive bounds on the edit distance for two names to count as
    # "similar". Both are writable, so changing either one after the result
    # lists have been built must discard those cached lists. The trigger only
    # does so when an earlier value is being replaced (Moose passes the old
    # value as an extra argument then), leaving construction unaffected.
    has 'minimum_distance' => (
        isa     => 'Int',
        is      => 'rw',
        default => 1,
        trigger => sub {
            my ($self, $new_value, @previous) = @_;
            $self->_clear_cached_results if @previous;
        },
    );
    has 'maximum_distance' => (
        isa     => 'Int',
        is      => 'rw',
        default => 3,
        trigger => sub {
            my ($self, $new_value, @previous) = @_;
            $self->_clear_cached_results if @previous;
        },
    );

    # Pairs of Person objects whose name distance lies within the bounds.
    has 'list_of_people_with_similar_names' => (
        isa        => 'ArrayRef[ArrayRef[Person]]',
        is         => 'ro',
        lazy_build => 1
    );

    # The same pairs, as [ [first, last], [first, last] ] name strings.
    has 'list_of_similar_name_pairs' => (
        isa        => 'ArrayRef[ArrayRef[ArrayRef[Str]]]',
        is         => 'ro',
        lazy_build => 1
    );

    # Builder for list_of_people. The attribute is declared with lazy_build,
    # which expects this method to exist; without it, reading the attribute
    # on an object constructed without a list fails at run time.
    method _build_list_of_people() {
        return [];
    }

    # Drop both cached result lists so they are rebuilt on next access.
    method _clear_cached_results() {
        $self->clear_list_of_people_with_similar_names;
        $self->clear_list_of_similar_name_pairs;
        return;
    }

    # Builder for list_of_people_with_similar_names: compare every
    # two-person combination and keep those whose distance is between
    # minimum_distance and maximum_distance (inclusive).
    method _build_list_of_people_with_similar_names() {
        my $people = $self->list_of_people;

        # No pair can be formed from fewer than two people.
        return [] if @{$people} < 2;

        my $people_tuples = Math::Combinatorics->new(
            count => 2,    # This could be abstracted
            data  => $people,
        );
        my $minimum = $self->minimum_distance;
        my $maximum = $self->maximum_distance;

        my @list_of_people_with_similar_names;
        # The loop ends when next_combination() returns an empty list.
        while (my ($first_person, $second_person) = $people_tuples->next_combination()) {
            my $distance_between_names = CompareTwoNames->new(
                one_person     => $first_person,
                another_person => $second_person,
            )->distance_between();

            next if $distance_between_names < $minimum;
            next if $distance_between_names > $maximum;

            push @list_of_people_with_similar_names, [ $first_person, $second_person ];
        }

        return \@list_of_people_with_similar_names;
    }

    # Builder for list_of_similar_name_pairs: turn each pair of Person
    # objects into a pair of [first_name, last_name] array references.
    method _build_list_of_similar_name_pairs() {
        my @list_of_similar_name_pairs = map {
            [ map { [ $_->first_name, $_->last_name ] } @{$_}[ 0, 1 ] ]
        } @{ $self->list_of_people_with_similar_names };

        return \@list_of_similar_name_pairs;
    }
}
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
__END__ |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 Accessors |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head2 list_of_similar_name_pairs |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
This is called on a SimilarNames object to return a list of similar |
177
|
|
|
|
|
|
|
name pairs for the list of Person objects passed in. It uses the Levenshtein |
178
|
|
|
|
|
|
|
edit distance. This means the names are close to one another in spelling. |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head2 list_of_people_with_similar_names |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
This accessor is similar to the C<list_of_similar_name_pairs> but returns a |
183
|
|
|
|
|
|
|
list of Person object pairs instead of the names. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head1 Authors |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Mateu X. Hunter C<hunter@missoula.org> |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 Copyright |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Copyright 2010, Mateu X. Hunter |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 License |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
You may distribute this code under the same terms as Perl itself. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 Code Repository |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
http://github.com/mateu/Lingua-EN-SimilarNames-Levenshtein |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=cut |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
1 |