line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
=head1 NAME
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
Lingua::EN::MatchNames - Smart matching for human names.
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use Lingua::EN::MatchNames;
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
$score= name_eq( $firstn_0, $lastn_0, $firstn_1, $lastn_1 );
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
You have two databases of person records that need to be synchronized or matched up,
|
15
|
|
|
|
|
|
|
but they use different keys--maybe one uses SSN and the other uses employee id.
|
16
|
|
|
|
|
|
|
The only fields you have to match on are first and last name.
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
That's what this module is for.
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Just feed the first and last names to the C<name_eq> function, and it returns
|
21
|
|
|
|
|
|
|
C<undef> for no possible match, and a percentage of certainty (rank) otherwise.
|
22
|
|
|
|
|
|
|
The ranking system isn't very scientific, and gender isn't considered, though
|
23
|
|
|
|
|
|
|
it probably should be.
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
The C<name_eq> function checks for:
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=over 4
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=item * inconsistent case (MacHenry = Machenry = MACHENRY)
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=item * inconsistent symbols (O'Brien = Obrien = O BRIEN)
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=item * misspellings (Grene = Green)
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=item * last name hyphenation (Smith-Curry = Curry)
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=item * similar phonetics (Hanson = Hansen)
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=item * nicknames (Midge = Peggy = Margaret)
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=item * extraneous initials (H. Ross = Ross)
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=item * extraneous suffixes (Reed, Jr. = Reed II = Reed)
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=item * and more...
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=back
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head2 Preliminary Tests:
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Homer Simpson HOMER SIMPOSN: 77
|
52
|
|
|
|
|
|
|
Marge Simpson MIDGE SIMPSON: 81
|
53
|
|
|
|
|
|
|
Brian Lalonde BRYAN LA LONDE: 82
|
54
|
|
|
|
|
|
|
Brian Lalonde RYAN LALAND: 72
|
55
|
|
|
|
|
|
|
Peggy MacHenry Midge Machenry: 81
|
56
|
|
|
|
|
|
|
Liz Grene Elizabeth Green: 72
|
57
|
|
|
|
|
|
|
Chuck Reed, Jr. Charles Reed II: 82
|
58
|
|
|
|
|
|
|
Kathy O'Brien Catherine Obrien: 81
|
59
|
|
|
|
|
|
|
Lizzie Hanson Lisa Hanson: 91
|
60
|
|
|
|
|
|
|
H. Ross Perot Ross PEROT: 88
|
61
|
|
|
|
|
|
|
Kathy Smith-Curry KATIE CURRY: 81
|
62
|
|
|
|
|
|
|
Dina Johnson-Warner Dinah J-Warner: 80
|
63
|
|
|
|
|
|
|
Leela Miles-Conrad Leela MilesConrad: 86
|
64
|
|
|
|
|
|
|
C. Renee Smythe Cathy Smythe: 71
|
65
|
|
|
|
|
|
|
Victoria (Honey) Rider HONEY RIDER: 88
|
66
|
|
|
|
|
|
|
Bart Simpson El Barto Simpson: 80
|
67
|
|
|
|
|
|
|
Bart Simpson Lisa Simpson: (no match)
|
68
|
|
|
|
|
|
|
Arthur Dent Zaphod Beeblebrox: (no match)
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 WARNING
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
The scoring in this version is utterly arbitrary.
|
73
|
|
|
|
|
|
|
I made all of the numbers up.
|
74
|
|
|
|
|
|
|
The certainty percentages should be OK relative to each other, but
|
75
|
|
|
|
|
|
|
would be better if someone could give me some statistical data.
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
Be sure and B<test> this against your data first!
|
78
|
|
|
|
|
|
|
Your data may not look like my test data.
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
And although I hope this is useful to many, I do not provide any
|
81
|
|
|
|
|
|
|
kind of warranty (expressed or implied), and do not suggest the
|
82
|
|
|
|
|
|
|
suitability of this module to any particular purpose.
|
83
|
|
|
|
|
|
|
This module probably should not be used for life support or military
|
84
|
|
|
|
|
|
|
purposes, and it B<must> not be used for unsolicited commercial email
|
85
|
|
|
|
|
|
|
or other bulk advertising.
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head1 REPOSITORY
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
L
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 AUTHOR
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Brian Lalonde, E<lt>brian@webcoder.infoE<gt>
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=head1 REQUIREMENTS
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Lingua::EN::NameParse,
|
98
|
|
|
|
|
|
|
Lingua::EN::Nickname,
|
99
|
|
|
|
|
|
|
Parse::RecDescent,
|
100
|
|
|
|
|
|
|
String::Approx,
|
101
|
|
|
|
|
|
|
Text::Metaphone,
|
102
|
|
|
|
|
|
|
Text::Soundex
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head1 SEE ALSO
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
perl(1),
|
107
|
|
|
|
|
|
|
L<Lingua::EN::NameParse>,
|
108
|
|
|
|
|
|
|
L<Lingua::EN::Nickname>,
|
109
|
|
|
|
|
|
|
L<String::Approx>,
|
110
|
|
|
|
|
|
|
L<Text::Metaphone>,
|
111
|
|
|
|
|
|
|
L<Text::Soundex>
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=cut
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
package Lingua::EN::MatchNames;
|
116
|
|
|
|
|
|
|
require Exporter;
|
117
|
1
|
|
|
1
|
|
692
|
use Carp;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
95
|
|
118
|
1
|
|
|
1
|
|
970
|
use Lingua::EN::NameParse;
|
|
1
|
|
|
|
|
62962
|
|
|
1
|
|
|
|
|
61
|
|
119
|
1
|
|
|
1
|
|
826
|
use Lingua::EN::Nickname;
|
|
1
|
|
|
|
|
24907
|
|
|
1
|
|
|
|
|
264
|
|
120
|
1
|
|
|
1
|
|
761
|
use String::Approx 'amatch';
|
|
1
|
|
|
|
|
4569
|
|
|
1
|
|
|
|
|
82
|
|
121
|
1
|
|
|
1
|
|
4323
|
use Text::Metaphone;
|
|
1
|
|
|
|
|
603
|
|
|
1
|
|
|
|
|
68
|
|
122
|
1
|
|
|
1
|
|
476
|
use Text::Soundex;
|
|
1
|
|
|
|
|
2518
|
|
|
1
|
|
|
|
|
156
|
|
123
|
1
|
|
|
1
|
|
6
|
use strict;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
28
|
|
124
|
1
|
|
|
1
|
|
4
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
151
|
|
125
|
1
|
|
|
1
|
|
6
|
use vars qw($debug);
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1286
|
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# Module version and Exporter configuration.
# name_eq is exported by default; fname_eq and lname_eq only on request.
# The :ALL tag bundles the default and the optional exports together.
$VERSION     = '1.35';
@ISA         = ('Exporter');
@EXPORT      = ('name_eq');
@EXPORT_OK   = ('fname_eq', 'lname_eq');
%EXPORT_TAGS = ( ALL => [ @EXPORT, @EXPORT_OK ] );
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# _nparse( $full_name )
# Splits one free-form full name into the two pieces name_eq() compares.
#
# The string is handed to a Lingua::EN::NameParse object (created with
# auto_clean and force_case enabled) and a two-element list is returned:
#   1. the parser's given_name_1 component
#   2. the parser's surname_1 component, with "-" and surname_2 appended
#      when the parser reports a true surname_2
# Croaks if the parser object cannot be constructed.
# The ($) prototype is kept so existing call sites parse exactly as before.
sub _nparse($)
{
  my($full_name)= @_;
  # Direct class-method call instead of the ambiguous indirect-object form.
  # Construction failure is fatal right here: merely warning would only
  # postpone the crash to the method call on an undefined value below.
  my $parser= Lingua::EN::NameParse->new( auto_clean => 1, force_case => 1 )
    or croak "Unable to set up name parser.\n$!\n";
  $parser->parse($full_name);
  my %part= $parser->components;
  # a second surname, when present, is joined on with a hyphen
  my $second= $part{surname_2} ? '-'.$part{surname_2} : '';
  return( $part{given_name_1}, $part{surname_1}.$second );
}
|
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# fname_eq( $name0, $name1 )
# Scores how likely two first (given) names belong to the same person.
#
# Returns nothing (undef in scalar context, the empty list in list context)
# when either name is false or when no rule matches.  Otherwise returns the
# score of the first rule that fires, tried in this order:
#   100  the strings are identical
#    --  a nickname embedded as (Nick) or "Nick" in either name is compared
#        with the other name by recursion; a true score is returned as is
#    98  identical once single-letter words (initials) and the non-word
#        characters around them are removed
#    --  if name0 (else name1) still holds a non-word character it is split
#        into parts; the best recursive score of any part is returned
#    95  identical after upper-casing and deleting everything except A-Z
#    --  int(0.8 * score) when nickname_eq() reports a true score
#    35  amatch() succeeds in both directions
#    10  one cleaned name begins or ends with the other
#     5  a single-letter word in one name, as originally passed in, is the
#        first letter of the other original name (case-insensitive)
# Any argument after the second is ignored.
sub fname_eq
{
  # Take private copies straight away.  @_ aliases the caller's scalars, and
  # recursive callers hand over regex captures; re-reading @_ after further
  # pattern matches (needed for the last rule) could see clobbered values.
  my($orig0,$orig1)= @_;
  my($name0,$name1)= ($orig0,$orig1);
  return unless $name0 and $name1;
  return 100 if $name0 eq $name1;

  # recurse offset nicknames: name0 (..), name0 "..", name1 (..), name1 ".."
  for my $pair ( [$name0,$name1], [$name1,$name0] )
  {
    my($marked,$other)= @$pair;
    for my $nick_re ( qr/\((\w+)\)/, qr/"(\w+)"/ )
    {
      next unless $marked=~ $nick_re;
      my $nick= $1; # copy first: passing $1 itself would pass it by alias
      my $match= fname_eq($other,$nick);
      return $match if $match;
    }
  }

  # strip leading/trailing initial(s) (98%)
  s/\W*\b\w\b\W*//g for $name0, $name1;
  return 98 if $name0 eq $name1;

  # recurse separate parts: split the compound name, keep the best score
  my($compound,$single)= $name0=~ /\W/ ? ($name0,$name1)
                       : $name1=~ /\W/ ? ($name1,$name0)
                       :                 ();
  if(defined $compound)
  {
    my($best)= sort { $b <=> $a } map { fname_eq($single,$_) } split /\W+/, $compound;
    return $best if $best;
  }

  # all caps, no symbols (95%)
  for my $name ($name0,$name1) { ($name= uc $name)=~ tr/A-Z//cd; }
  return 95 if $name0 eq $name1;

  # nickname (80% of whatever nickname_eq reports)
  my $nick_score= nickname_eq($name0,$name1);
  return int 0.8*$nick_score if $nick_score;

  # fuzzy approx, required in both directions (35%)
  return 35 if amatch($name0,$name1) and amatch($name1,$name0);

  # simple truncation (10%); both names hold only A-Z here, so they are
  # safe to interpolate into the patterns
  # NOTE(review): a name that cleaned down to '' makes the other side's
  # pattern /^|$/, which matches anything -- confirm that is intended
  return 10 if $name0=~ /^$name1|$name1$/ or $name1=~ /^$name0|$name0$/;

  # a single initial (5%), judged on the names as originally passed in
  for my $pair ( [$orig0,$orig1], [$orig1,$orig0] )
  {
    my($with_initial,$full)= @$pair;
    my($initial)= $with_initial=~ m/\b(\w)\b/ or next;
    return 5 if $full=~ /^$initial/i;
  }
  return;
}
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# lname_eq( $name0, $name1 )
# Scores how likely two last names (surnames) belong to the same person.
#
# Returns nothing (undef in scalar context, the empty list in list context)
# when either name is false or when no rule matches.  Otherwise returns the
# score of the first rule that fires, tried in this order:
#   100  the strings are identical
#    95  identical once a trailing whitespace-led suffix is removed
#        (roman numerals, ", anything", Jr/Sr with optional period)
#    --  if name0 (else name1) contains a hyphen it is split on hyphens
#        only; the best recursive score of any component is returned
#    85  identical after upper-casing and deleting everything except A-Z
#    70  equal Metaphone() codes
#    40  equal soundex() codes
#    25  amatch() succeeds in both directions
#    --  if an interior capitalised run (e.g. "MilesConrad") can be
#        hyphenated in name0, else in name1: int(0.8 * recursive score),
#        which is 0 rather than undef when that recursion finds no match
#    10  one name begins or ends with the other (case-insensitive)
sub lname_eq
{
  # Take private copies straight away.  @_ aliases the caller's scalars, so
  # a caller passing capture variables ($1, $2) would see them clobbered by
  # the pattern matches below before the late rules read the originals.
  my($orig0,$orig1)= @_;
  my($name0,$name1)= ($orig0,$orig1);
  return unless $name0 and $name1;
  return 100 if $name0 eq $name1;

  # strip trailing suffixes (95%)
  my $suffix_re= qr/\s+([IVX]+|,.*|[JS]r\.?)\s*$/;
  s/$suffix_re// for $name0, $name1;
  return 95 if $name0 eq $name1;

  # recurse hyphenated components (split on the hyphen ONLY)
  my($hyphenated,$plain)= $name0=~ /-/ ? ($name0,$name1)
                        : $name1=~ /-/ ? ($name1,$name0)
                        :                ();
  if(defined $hyphenated)
  {
    my($best)= sort { $b <=> $a } map { lname_eq($plain,$_) } split /-/, $hyphenated;
    return $best if $best;
  }

  # all caps, no symbols (85%)
  for my $name ($name0,$name1) { ($name= uc $name)=~ tr/A-Z//cd; }
  return 85 if $name0 eq $name1;

  # metaphone (70%)
  return 70 if Metaphone($name0) eq Metaphone($name1);

  # soundex (40%)
  return 40 if soundex($name0) eq soundex($name1);

  # fuzzy approx, required in both directions (25%)
  return 25 if amatch($name0,$name1) and amatch($name1,$name0);

  # nonstandard 'hyphenation'/simple truncation: start over from the
  # original arguments, minus suffix, keeping letter case and hyphens
  ($name0,$name1)=
    map { (my $n= $_)=~ s/$suffix_re//; $n=~ tr/A-Za-z\-//cd; $n } $orig0, $orig1;
  if( $name0=~ s/(\B[A-Z][a-z]+)/-$1/g or $name1=~ s/(\B[A-Z][a-z]+)/-$1/g )
  {
    # hyphenate at the interior capitals, then score as a hyphenated name
    return int 0.8*lname_eq($name0,$name1);
  }
  # only letters and hyphens remain, so the names are safe to interpolate
  return 10 if $name0=~ /^$name1|$name1$/i or $name1=~ /^$name0|$name0$/i;
  return;
}
|
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# name_eq( $first0, $last0, $first1, $last1 )
# name_eq( $full_name0, $full_name1 )
# Scores how likely two people's names refer to the same person.
#
# With fewer than four arguments the first two are treated as full names
# and each is split into (first, last) by _nparse().  The last names are
# compared first with lname_eq(), then the first names with fname_eq();
# if either comparison yields a false score, nothing is returned (undef in
# scalar context).  Otherwise the result is the integer part of
# 0.7 * last-name score + 0.3 * first-name score.
sub name_eq
{
  my @parts= @_;
  # two full names instead of four separate parts: parse them first
  @parts= ( _nparse($_[0]), _nparse($_[1]) ) if @_ < 4;
  my($first0,$last0,$first1,$last1)= @parts;

  my $last_score= lname_eq($last0,$last1) or return;
  my $first_score= fname_eq($first0,$first1) or return;

  # the 70/30 weighting is an arbitrary ratio chosen by the author
  return int( $last_score*0.7 + $first_score*0.3 );
}
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
1
|