line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ============================================================================ |
2
|
|
|
|
|
|
|
package Text::Phonetic::Phonix; |
3
|
|
|
|
|
|
|
# ============================================================================ |
4
|
4
|
|
|
4
|
|
56921
|
use utf8; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
14
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
539
|
use Moo; |
|
4
|
|
|
|
|
9048
|
|
|
4
|
|
|
|
|
13
|
|
7
|
|
|
|
|
|
|
extends qw(Text::Phonetic); |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = $Text::Phonetic::VERSION; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
our $VOVEL = '[AEIOU]'; |
14
|
|
|
|
|
|
|
our $VOVEL_WITHY = '[AEIOUY]'; |
15
|
|
|
|
|
|
|
our $CONSONANT = '[BCDFGHJLMNPQRSTVXZXY]'; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @VALUES = ( |
18
|
|
|
|
|
|
|
[qr/[AEIOUHWY]/,0], |
19
|
|
|
|
|
|
|
[qr/[BP]/,1], |
20
|
|
|
|
|
|
|
[qr/[CGJKQ]/,2], |
21
|
|
|
|
|
|
|
[qr/[DT]/,3], |
22
|
|
|
|
|
|
|
[qr/L/,4], |
23
|
|
|
|
|
|
|
[qr/[MN]/,5], |
24
|
|
|
|
|
|
|
[qr/R/,6], |
25
|
|
|
|
|
|
|
[qr/[FV]/,7], |
26
|
|
|
|
|
|
|
[qr/[SXZ]/,8], |
27
|
|
|
|
|
|
|
); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
our @RULES = ( |
30
|
|
|
|
|
|
|
[qr/DG/,'G'], |
31
|
|
|
|
|
|
|
[qr/C([OAU])/,'K1'], |
32
|
|
|
|
|
|
|
[qr/C[YI]/,'SI'], |
33
|
|
|
|
|
|
|
[qr/CE/,'SE'], |
34
|
|
|
|
|
|
|
[qr/^CL($VOVEL)/,'KL1'], |
35
|
|
|
|
|
|
|
[qr/CK/,'K'], |
36
|
|
|
|
|
|
|
[qr/[GJ]C$/,'K'], |
37
|
|
|
|
|
|
|
[qr/^CH?R($VOVEL)/,'KR1'], |
38
|
|
|
|
|
|
|
[qr/^WR/,'R'], |
39
|
|
|
|
|
|
|
[qr/NC/,'NK'], |
40
|
|
|
|
|
|
|
[qr/CT/,'KT'], |
41
|
|
|
|
|
|
|
[qr/PH/,'F'], |
42
|
|
|
|
|
|
|
[qr/AA/,'AR'], #neu |
43
|
|
|
|
|
|
|
[qr/SCH/,'SH'], |
44
|
|
|
|
|
|
|
[qr/BTL/,'TL'], |
45
|
|
|
|
|
|
|
[qr/GHT/,'T'], |
46
|
|
|
|
|
|
|
[qr/AUGH/,'ARF'], |
47
|
|
|
|
|
|
|
[qr/($VOVEL)LJ($VOVEL)/,'1LD2'], |
48
|
|
|
|
|
|
|
[qr/LOUGH/,'LOW'], |
49
|
|
|
|
|
|
|
[qr/^Q/,'KW'], |
50
|
|
|
|
|
|
|
[qr/^KN/,'N'], |
51
|
|
|
|
|
|
|
[qr/GN$/,'N'], |
52
|
|
|
|
|
|
|
[qr/GHN/,'N'], |
53
|
|
|
|
|
|
|
[qr/GNE$/,'N'], |
54
|
|
|
|
|
|
|
[qr/GHNE/,'NE'], |
55
|
|
|
|
|
|
|
[qr/GNES$/,'NS'], |
56
|
|
|
|
|
|
|
[qr/^GN/,'N'], |
57
|
|
|
|
|
|
|
[qr/(\w)GN($CONSONANT)/,'1N2'], |
58
|
|
|
|
|
|
|
[qr/^PS/,'S'], |
59
|
|
|
|
|
|
|
[qr/^PT/,'T'], |
60
|
|
|
|
|
|
|
[qr/^CZ/,'C'], |
61
|
|
|
|
|
|
|
[qr/($VOVEL)WZ(\w)/,'1Z2'], |
62
|
|
|
|
|
|
|
[qr/(\w)CZ(\w)/,'1CH2'], |
63
|
|
|
|
|
|
|
[qr/LZ/,'LSH'], |
64
|
|
|
|
|
|
|
[qr/RZ/,'RSH'], |
65
|
|
|
|
|
|
|
[qr/(\w)Z($VOVEL)/,'1S2'], |
66
|
|
|
|
|
|
|
[qr/ZZ/,'TS'], |
67
|
|
|
|
|
|
|
[qr/($CONSONANT)Z(\w)/,'1TS2'], |
68
|
|
|
|
|
|
|
[qr/HROUGH/,'REW'], |
69
|
|
|
|
|
|
|
[qr/OUGH/,'OF'], |
70
|
|
|
|
|
|
|
[qr/($VOVEL)Q($VOVEL)/,'1KW2'], |
71
|
|
|
|
|
|
|
[qr/($VOVEL)J($VOVEL)/,'1Y2'], |
72
|
|
|
|
|
|
|
[qr/^YJ($VOVEL)/,'Y1'], |
73
|
|
|
|
|
|
|
[qr/^GH/,'G'], |
74
|
|
|
|
|
|
|
[qr/($VOVEL)E$/,'1GH'], |
75
|
|
|
|
|
|
|
[qr/^CY/,'S'], |
76
|
|
|
|
|
|
|
[qr/NX/,'NKS'], |
77
|
|
|
|
|
|
|
[qr/^PF/,'F'], |
78
|
|
|
|
|
|
|
[qr/DT$/,'T'], |
79
|
|
|
|
|
|
|
[qr/(T|D)L$/,'1IL'], |
80
|
|
|
|
|
|
|
[qr/YTH/,'ITH'], |
81
|
|
|
|
|
|
|
[qr/^TS?J($VOVEL)/,'CH1'], |
82
|
|
|
|
|
|
|
[qr/^TS($VOVEL)/,'T1'], |
83
|
|
|
|
|
|
|
[qr/TCH/,'CH'], # old che |
84
|
|
|
|
|
|
|
[qr/($VOVEL)WSK/,'1VSIKE'], |
85
|
|
|
|
|
|
|
[qr/^[PM]N($VOVEL)/,'N1'], |
86
|
|
|
|
|
|
|
[qr/($VOVEL)STL/,'1SL'], |
87
|
|
|
|
|
|
|
[qr/TNT$/,'ENT'], |
88
|
|
|
|
|
|
|
[qr/EAUX$/,'OH'], |
89
|
|
|
|
|
|
|
[qr/EXCI/,'ECS'], |
90
|
|
|
|
|
|
|
[qr/X/,'ECS'], |
91
|
|
|
|
|
|
|
[qr/NED$/,'ND'], |
92
|
|
|
|
|
|
|
[qr/JR/,'DR'], |
93
|
|
|
|
|
|
|
[qr/EE$/,'EA'], |
94
|
|
|
|
|
|
|
[qr/ZS/,'S'], |
95
|
|
|
|
|
|
|
[qr/($VOVEL)H?R($CONSONANT)/,'1AH2'], |
96
|
|
|
|
|
|
|
[qr/($VOVEL)HR$/,'1AH'], |
97
|
|
|
|
|
|
|
[qr/RE$/,'AR'], |
98
|
|
|
|
|
|
|
[qr/($VOVEL)R$/,'1AH'], |
99
|
|
|
|
|
|
|
[qr/LLE/,'LE'], |
100
|
|
|
|
|
|
|
[qr/($CONSONANT)LE(S?)$/,'1ILE2'], |
101
|
|
|
|
|
|
|
[qr/E$/,''], |
102
|
|
|
|
|
|
|
[qr/ES$/,'S'], |
103
|
|
|
|
|
|
|
[qr/($VOVEL)SS/,'1AS'], |
104
|
|
|
|
|
|
|
[qr/($VOVEL)MB$/,'1M'], |
105
|
|
|
|
|
|
|
[qr/MPTS/,'MPS'], |
106
|
|
|
|
|
|
|
[qr/MPS/,'MS'], |
107
|
|
|
|
|
|
|
[qr/MPT/,'MT'], |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
); |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
#sub _do_compare { |
112
|
|
|
|
|
|
|
# my $obj = shift; |
113
|
|
|
|
|
|
|
# my $result1 = shift; |
114
|
|
|
|
|
|
|
# my $result2 = shift; |
115
|
|
|
|
|
|
|
# |
116
|
|
|
|
|
|
|
# # Main values are different |
117
|
|
|
|
|
|
|
# return 0 unless ($result1->[0] eq $result2->[0]); |
118
|
|
|
|
|
|
|
# |
119
|
|
|
|
|
|
|
# # Ending values the same |
120
|
|
|
|
|
|
|
# return 75 if ($result1->[1] eq $result2->[1]); |
121
|
|
|
|
|
|
|
# |
122
|
|
|
|
|
|
|
# # Ending values differ in length, and are same for the shorter |
123
|
|
|
|
|
|
|
# my $length1 = length $result1->[1]; |
124
|
|
|
|
|
|
|
# my $length2 = length $result2->[1]; |
125
|
|
|
|
|
|
|
# if ($length1 > $length2 |
126
|
|
|
|
|
|
|
# && $length1 - $length2 == 1) { |
127
|
|
|
|
|
|
|
# return 50 if (substr($result1->[1],0,$length2) eq $result2->[1]); |
128
|
|
|
|
|
|
|
# }elsif ($length2 > $length1 |
129
|
|
|
|
|
|
|
# && $length2 - $length1 == 1) { |
130
|
|
|
|
|
|
|
# return 50 if (substr($result2->[1],0,$length1) eq $result1->[1]); |
131
|
|
|
|
|
|
|
# } |
132
|
|
|
|
|
|
|
# |
133
|
|
|
|
|
|
|
# return 25; |
134
|
|
|
|
|
|
|
#} |
135
|
|
|
|
|
|
|
#The algorithm always returns either a scalar value or an array reference with |
136
|
|
|
|
|
|
|
#two elements. The fist element represents the sound of the name without the |
137
|
|
|
|
|
|
|
#ending sound, and the second element represents the ending sound. To get a |
138
|
|
|
|
|
|
|
#full representation of the name you need to concat the two elements. |
139
|
|
|
|
|
|
|
# |
140
|
|
|
|
|
|
|
#If you want to compare two names the following rules apply: |
141
|
|
|
|
|
|
|
# |
142
|
|
|
|
|
|
|
#=over |
143
|
|
|
|
|
|
|
# |
144
|
|
|
|
|
|
|
#=item * If the ending sound values of an entered name and a retrieved name are |
145
|
|
|
|
|
|
|
#the same, the retrieved name is a LIKELY candidate. |
146
|
|
|
|
|
|
|
# |
147
|
|
|
|
|
|
|
#=item * If an entered name has an ending-sound value, and the retrieved name |
148
|
|
|
|
|
|
|
#does not, then the retrieved name is a LEAST-LIKELY candidate. |
149
|
|
|
|
|
|
|
# |
150
|
|
|
|
|
|
|
#=item * If the two ending-sound values are the same for the length of the |
151
|
|
|
|
|
|
|
#shorter, and the difference in length between the two ending-sound is one |
152
|
|
|
|
|
|
|
#digit only, then the retrieved name isa LESS-LIKELY candidate. |
153
|
|
|
|
|
|
|
# |
154
|
|
|
|
|
|
|
#=item * All other cases result in LEAST-LIKELY candidates. |
155
|
|
|
|
|
|
|
# |
156
|
|
|
|
|
|
|
#=back |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub _do_encode { |
159
|
26
|
|
|
26
|
|
28
|
my ($self,$string) = @_; |
160
|
|
|
|
|
|
|
|
161
|
26
|
|
|
|
|
35
|
my ($original_string, $first_char); |
162
|
26
|
|
|
|
|
20
|
$original_string = $string; |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# To uppercase and remove other characters |
165
|
26
|
|
|
|
|
40
|
$string = uc($string); |
166
|
26
|
|
|
|
|
33
|
$string =~ tr/A-Z//cd; |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# RULE 1: Replcace rule |
169
|
26
|
|
|
|
|
28
|
foreach my $rule (@RULES) { |
170
|
2028
|
|
|
|
|
1615
|
my $regexp = $rule->[0]; |
171
|
2028
|
|
|
|
|
1201
|
my $replace = $rule->[1]; |
172
|
2028
|
|
|
|
|
2604
|
$string =~ s/$regexp/_replace($replace,$1,$2)/ge; |
|
40
|
|
|
|
|
51
|
|
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# RULE 2: Fetch first character |
176
|
26
|
|
|
|
|
43
|
$first_char = substr($string,0,1,''); |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# RULE 3: Exceptions for first character rule |
179
|
26
|
100
|
100
|
|
|
29
|
if (grep { $first_char eq $_ } qw(A E I O U Y)) { |
|
156
|
100
|
|
|
|
224
|
|
180
|
1
|
|
|
|
|
1
|
$first_char = 'v'; |
181
|
1
|
|
|
|
|
12
|
$string =~ s/^$VOVEL_WITHY//; |
182
|
|
|
|
|
|
|
} elsif ($first_char eq 'W' || $first_char eq 'H') { |
183
|
|
|
|
|
|
|
#$string =~ s/^[WH]//; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# RULE 4 |
187
|
26
|
|
|
|
|
29
|
$string =~ s/ES$/S/; |
188
|
|
|
|
|
|
|
# RULE 5 |
189
|
26
|
|
|
|
|
90
|
$string =~ s/($VOVEL_WITHY)$/$1E/; |
190
|
|
|
|
|
|
|
# RULE 6 |
191
|
|
|
|
|
|
|
#$string =~ s/\w$//; # This rule seems kind of strict |
192
|
|
|
|
|
|
|
# RULE 7-8 |
193
|
|
|
|
|
|
|
# if ($string =~ s/($VOVEL_WITHY)([A-Z]+)$/$2/) { |
194
|
|
|
|
|
|
|
# # RULE 13 |
195
|
|
|
|
|
|
|
# $last_string = _transform($2); |
196
|
|
|
|
|
|
|
# } |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# RULE 9-11 |
199
|
26
|
|
|
|
|
38
|
$string = _transform($string); |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# RULE 12 |
202
|
26
|
|
|
|
|
29
|
$string = $first_char.$string; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
#$string .= $last_string if (defined $last_string); |
205
|
26
|
|
|
|
|
44
|
$string .= '0' x (8-length $string); |
206
|
26
|
|
|
|
|
29
|
$string = substr($string,0,8); |
207
|
|
|
|
|
|
|
|
208
|
26
|
|
|
|
|
62
|
return $string; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
sub _transform { |
212
|
26
|
|
|
26
|
|
25
|
my $string = shift; |
213
|
26
|
50
|
|
|
|
36
|
return unless defined $string; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# RULE 9 |
216
|
26
|
|
|
|
|
74
|
$string =~ s/([AEIOUYHW])//g; |
217
|
|
|
|
|
|
|
# RULE 10 |
218
|
26
|
|
|
|
|
96
|
$string =~ s/($CONSONANT+)\1/$1/g; |
219
|
|
|
|
|
|
|
# RULE 11 |
220
|
26
|
|
|
|
|
34
|
foreach my $value (@VALUES) { |
221
|
234
|
|
|
|
|
142
|
my $regexp = $value->[0]; |
222
|
234
|
|
|
|
|
442
|
$string =~ s/$regexp/$value->[1]/g; |
223
|
|
|
|
|
|
|
} |
224
|
26
|
|
|
|
|
52
|
return $string; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub _replace { |
228
|
40
|
|
|
40
|
|
37
|
my $replace = shift; |
229
|
40
|
|
|
|
|
43
|
my $pos1 = shift; |
230
|
40
|
|
|
|
|
31
|
my $pos2 = shift; |
231
|
|
|
|
|
|
|
|
232
|
40
|
100
|
|
|
|
72
|
$replace =~ s/1/$pos1/ if (defined $pos1); |
233
|
40
|
100
|
|
|
|
52
|
$replace =~ s/2/$pos2/ if (defined $pos2); |
234
|
|
|
|
|
|
|
|
235
|
40
|
|
|
|
|
84
|
return $replace; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
1; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=encoding utf8 |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=pod |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=head1 NAME |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Text::Phonetic::Phonix - Phonix algorithm |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=head1 DESCRIPTION |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
Phonix is an improved version of Soundex, developed by T.N. Gadd. Phonix |
251
|
|
|
|
|
|
|
has been incorporated into a number of WAIS implementations, including |
252
|
|
|
|
|
|
|
freeWAIS. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
There seem to be two variants of the Phonix algorithm. One which also includes |
255
|
|
|
|
|
|
|
the first letter in the numeric code, and one that doesn't. This module is |
256
|
|
|
|
|
|
|
using the later variant. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head1 AUTHOR |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
Maroš Kollár |
261
|
|
|
|
|
|
|
CPAN ID: MAROS |
262
|
|
|
|
|
|
|
maros [at] k-1.com |
263
|
|
|
|
|
|
|
http://www.k-1.com |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head1 COPYRIGHT |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Text::Phonetic::Phonix is Copyright (c) 2006,2007 Maroš. Kollár. |
268
|
|
|
|
|
|
|
All rights reserved. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
This program is free software; you can redistribute |
271
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
The full text of the license can be found in the |
274
|
|
|
|
|
|
|
LICENSE file included with this module. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head1 SEE ALSO |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |