File Coverage

blib/lib/Lingua/EN/MatchNames.pm
Criterion Covered Total %
statement 81 99 81.8
branch 46 78 58.9
condition 10 21 47.6
subroutine 12 13 92.3
pod 0 3 0.0
total 149 214 69.6


line stmt bran cond sub pod time code
1            
2             =head1 NAME
3            
4             Lingua::EN::MatchNames - Smart matching for human names.
5            
6             =head1 SYNOPSIS
7            
8             use Lingua::EN::MatchNames;
9            
10             $score= name_eq( $firstn_0, $lastn_0, $firstn_1, $lastn_1 );
11            
12             =head1 DESCRIPTION
13            
14             You have two databases of person records that need to be synchronized or matched up,
15             but they use different keys--maybe one uses SSN and the other uses employee id.
16             The only fields you have to match on are first and last name.
17            
18             That's what this module is for.
19            
20             Just feed the first and last names to the C function, and it returns
21             C for no possible match, and a percentage of certainty (rank) otherwise.
22             The ranking system isn't very scientific, and gender isn't considered, though
23             it probably should be.
24            
25             The C function, checks for:
26            
27             =over 4
28            
29             =item * inconsistent case (MacHenry = Machenry = MACHENRY)
30            
31             =item * inconsistent symbols (O'Brien = Obrien = O BRIEN)
32            
33             =item * misspellings (Grene = Green)
34            
35             =item * last name hyphenation (Smith-Curry = Curry)
36            
37             =item * similar phonetics (Hanson = Hansen)
38            
39             =item * nicknames (Midge = Peggy = Margaret)
40            
41             =item * extraneous initials (H. Ross = Ross)
42            
43             =item * extraneous suffixes (Reed, Jr. = Reed II = Reed)
44            
45             =item * and more...
46            
47             =back
48            
49             =head2 Preliminary Tests:
50            
51             Homer Simpson HOMER SIMPOSN: 77
52             Marge Simpson MIDGE SIMPSON: 81
53             Brian Lalonde BRYAN LA LONDE: 82
54             Brian Lalonde RYAN LALAND: 72
55             Peggy MacHenry Midge Machenry: 81
56             Liz Grene Elizabeth Green: 72
57             Chuck Reed, Jr. Charles Reed II: 82
58             Kathy O'Brien Catherine Obrien: 81
59             Lizzie Hanson Lisa Hanson: 91
60             H. Ross Perot Ross PEROT: 88
61             Kathy Smith-Curry KATIE CURRY: 81
62             Dina Johnson-Warner Dinah J-Warner: 80
63             Leela Miles-Conrad Leela MilesConrad: 86
64             C. Renee Smythe Cathy Smythe: 71
65             Victoria (Honey) Rider HONEY RIDER: 88
66             Bart Simpson El Barto Simpson: 80
67             Bart Simpson Lisa Simpson: (no match)
68             Arthur Dent Zaphod Beeblebrox: (no match)
69            
70             =head1 WARNING
71            
72             The scoring in this version is utterly arbitrary.
73             I made all of the numbers up.
74             The certainty percentages should be OK relative to each other, but
75             would be better if someone could give me some statistical data.
76            
77             Be sure and B this against your data first!
78             Your data may not look like my test data.
79            
80             And although I hope this is useful to many, I do not provide any
81             kind of warranty (expressed or implied), and do not suggest the
82             suitability of this module to any particular purpose.
83             This module probably should not be used for life support or military
84             purposes, and it B not be used for unsolicited commercial email
85             or other bulk advertising.
86            
87             =head1 REPOSITORY
88            
89             L
90            
91             =head1 AUTHOR
92            
93             Brian Lalonde, Ebrian@webcoder.infoE
94            
95             =head1 REQUIREMENTS
96            
97             Lingua::EN::NameParse,
98             Lingua::EN::Nickname,
99             Parse::RecDescent,
100             String::Approx,
101             Text::Metaphone,
102             Text::Soundex
103            
104             =head1 SEE ALSO
105            
106             perl(1),
107             L,
108             L,
109             L,
110             L,
111             L
112            
113             =cut
114            
115             package Lingua::EN::MatchNames;
116             require Exporter;
117 1     1   692 use Carp;
  1         2  
  1         95  
118 1     1   970 use Lingua::EN::NameParse;
  1         62962  
  1         61  
119 1     1   826 use Lingua::EN::Nickname;
  1         24907  
  1         264  
120 1     1   761 use String::Approx 'amatch';
  1         4569  
  1         82  
121 1     1   4323 use Text::Metaphone;
  1         603  
  1         68  
122 1     1   476 use Text::Soundex;
  1         2518  
  1         156  
123 1     1   6 use strict;
  1         1  
  1         28  
124 1     1   4 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         1  
  1         151  
125 1     1   6 use vars qw($debug);
  1         1  
  1         1286  
126            
127             $VERSION= '1.35';
128             @ISA= qw(Exporter);
129             @EXPORT= qw(name_eq);
130             @EXPORT_OK= qw(fname_eq lname_eq);
131             %EXPORT_TAGS=
132             (
133             ALL => [ @EXPORT, @EXPORT_OK ],
134             );
135            
136             sub _nparse($)
137             {
138 0     0   0 local $_= shift;
139 0 0       0 my $nparse= new Lingua::EN::NameParse( auto_clean => 1, force_case => 1 )
140             or carp "Unable to set up name parser.\n$!\n";
141 0         0 $nparse->parse($_);
142 0         0 my %name= $nparse->components;
143 0 0       0 return($name{given_name_1},$name{surname_1}.
144             ( $name{surname_2} ? '-'.$name{surname_2} : '' ));
145             }
146            
147             sub fname_eq
148             {
149 20     20 0 21 my($name0,$name1,$match)= @_;
150 20 50 33     57 return unless $name0 and $name1;
151 20 100       28 return 100 if $name0 eq $name1;
152             # recurse offset nicknames
153 19 50       33 if($name0=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name1,$1); }
  1 100       5  
154 18 0       25 if($name0=~ m/"(\w+)"/) { return $match if $match= fname_eq($name1,$1); }
  0 50       0  
155 18 0       26 if($name1=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name0,$1); }
  0 50       0  
156 18 0       25 if($name1=~ m/"(\w+)"/) { return $match if $match= fname_eq($name0,$1); }
  0 50       0  
157             # strip leading/trailing initial(s) (98%)
158 18         40 $name0=~ s/\W*\b\w\b\W*//g;
159 18         27 $name1=~ s/\W*\b\w\b\W*//g;
160 18 100       24 return 98 if $name0 eq $name1;
161             # recurse separate parts
162 17 50       40 if($name0=~ /\W/)
    100          
163             { # split parts, find best match
164 0         0 my($match)= sort { $b <=> $a } map {fname_eq($name1,$_)} split /\W+/, $name0;
  0         0  
  0         0  
165 0 0       0 return $match if $match;
166             }
167             elsif($name1=~ /\W/)
168             { # split parts, find best match
169 1         4 my($match)= sort { $b <=> $a } map {fname_eq($name0,$_)} split /\W+/, $name1;
  0         0  
  2         9  
170 1 50       45 return $match if $match;
171             }
172             # all caps, no symbols (95%)
173 16         19 ($name0= uc $name0)=~ y/A-Z//cd;
174 16         15 ($name1= uc $name1)=~ y/A-Z//cd;
175 16 100       40 return 95 if $name0 eq $name1;
176             # nickname (80%)
177 14 100       32 return int 0.8*$match if $match= nickname_eq($name0,$name1);
178             # fuzzy approx (15%)
179 5 100 66     585 return 35 if amatch($name0,$name1) and amatch($name1,$name0);
180             # simple trucation
181 3 50 33     133 return 10 if $name0=~ /^$name1|$name1$/ or $name1=~ /^$name0|$name0$/;
182             # a single initial
183 3         9 ($name0,$name1)= @_;
184 3 50       10 for($name0=~ m/\b(\w)\b/) { return 5 if $name1=~ /^$_/i; }
  1         10  
185 2 0       6 for($name1=~ m/\b(\w)\b/) { return 5 if $name0=~ /^$_/i; }
  0         0  
186 2         7 return;
187             }
188            
189             sub lname_eq
190             {
191 34     34 0 34 my($name0,$name1)= @_;
192 34 50 33     104 return unless $name0 and $name1;
193 34 100       56 return 100 if $name0 eq $name1;
194             # strip trailing suffixes (95%)
195 31         44 $name0=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;
196 31         58 $name1=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;
197 31 50       44 return 95 if $name0 eq $name1;
198             # recurse hyphenated components
199 31 100       72 if($name0=~ /-/)
    50          
200             { # split hyphenation on hyphen ONLY
201 7         14 my($match)= sort { $b <=> $a } map {lname_eq($name1,$_)} split /-/, $name0;
  2         4  
  14         22  
202 7 50       28 return $match if $match;
203             }
204             elsif($name1=~ /-/)
205             { # split hyphenation on hyphen ONLY
206 0         0 my($match)= sort { $b <=> $a } map {lname_eq($name0,$_)} split /-/, $name1;
  0         0  
  0         0  
207 0 0       0 return $match if $match;
208             }
209             # all caps, no symbols (85%)
210 24         39 ($name0= uc $name0)=~ y/A-Z//cd;
211 24         25 ($name1= uc $name1)=~ y/A-Z//cd;
212 24 100       56 return 85 if $name0 eq $name1;
213             # metaphone (70%)
214 12 100       57 return 70 if Metaphone($name0) eq Metaphone($name1);
215             # soundex (40%)
216 9 50       31 return 40 if soundex($name0) eq soundex($name1);
217             # fuzzy approx (15%)
218 9 50 33     19 return 25 if amatch($name0,$name1) and amatch($name1,$name0);
219             # nonstandard 'hyphenation'/simple truncation
220 9         243 ($name0,$name1)= map {(my$n=$_)=~s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;$n=~y/A-Za-z\-//cd;$n} @_;
  18         34  
  18         34  
  18         29  
221 9 100 66     61 return int 0.8*lname_eq($name0,$name1) if $name0=~ s/(\B[A-Z][a-z]+)/-$1/g
222             or $name1=~ s/(\B[A-Z][a-z]+)/-$1/g;
223 7 100 66     167 return 10 if $name0=~ /^$name1|$name1$/i or $name1=~ /^$name0|$name0$/i;
224 6         22 return;
225             }
226            
227             sub name_eq
228             {
229 18 50   18 0 594 my($nomF0,$nomL0,$nomF1,$nomL1,$Frank,$Lrank)=
230             ( @_ < 4 ? (_nparse($_[0]),_nparse($_[1])) : @_ );
231 18 100       30 return unless $Lrank= lname_eq $nomL0, $nomL1;
232 17 100       26 return unless $Frank= fname_eq $nomF0, $nomF1;
233 16         582 return int $Lrank*0.7 + $Frank*0.3; # another ratio I just made up
234             }
235            
236             1