File Coverage

blib/lib/Text/Phonetic/Koeln.pm

Criterion	Covered	Total	%
statement	74	78	94.8
branch	33	38	86.8
condition	4	9	44.4
subroutine	3	3	100.0
pod			n/a
total	114	128	89.0

line	stmt	bran	cond	sub	time	code
1						# ============================================================================
2						package Text::Phonetic::Koeln;
3						# ============================================================================
4	3			3	96221	use utf8;
	3				8
	3				19
5
6	3			3	490	use Moo;
	3				8822
	3				15
7						extends qw(Text::Phonetic);
8
9						__PACKAGE__->meta->make_immutable;
10
11						our $VERSION = $Text::Phonetic::VERSION;
12
13						sub _do_encode {
14	16			16	33	my ($self,$string) = @_;
15
16	16				27	my (@character_list,$result,$last_match);
17
18	16				35	$string = uc($string);
19
20						# Replace umlaut
21	16				34	$string =~ s/ß/S/g;
22	16				29	$string =~ s/[äÄ]/AE/g;
23	16				25	$string =~ s/[öÖ]/OE/g;
24	16				30	$string =~ s/[üÜ]/UE/g;
25
26						# Replace double consonants
27						#$string =~ s/([BCDFGHJKLMNPQRSTVWXZ])\1+/$1/g;
28
29						# Convert string to array
30	16				52	@character_list = split //,$string;
31	16				27	$result = '';
32
33						# Handle initial sounds
34						# A,E,I,J,O,U,Y => 0
35	16	100			45	if (Text::Phonetic::_is_inlist($character_list[0],qw(A E I J Y O U))) {
		100
36	1				3	$result .= 0;
37	1				2	$last_match = shift @character_list;
38						} elsif ($character_list[0] eq 'C') {
39	1	50			5	if (Text::Phonetic::_is_inlist($character_list[1],qw(A H K L O Q R U X))) {
40	1				3	$result .= 4;
41						} else {
42	0				0	$result .= 8;
43						}
44	1				3	$last_match = shift @character_list;
45						}
46
47						# Loop all characters
48	16				37	while (scalar(@character_list)) {
49						# A,E,I,J,O,U,Y => 0
50						#if (Text::Phonetic::_is_inlist($character_list[0],qw(A E I J O U Y))) {
51						# $result .= 0;
52						# $last_match = shift @character_list;
53						# next;
54						#}
55						# B => 1
56	116	100			236	if ($character_list[0] eq 'B') {
57	3				6	$result .= 1;
58	3				5	$last_match = shift @character_list;
59	3				8	next;
60						}
61						# P in front of H => 1
62						# P => 3
63	113	100			218	if ($character_list[0] eq 'P') {
64	1	50	33		8	if (defined($character_list[1])
65						&& $character_list[1] eq 'H') {
66	0				0	$result .= 3;
67						} else {
68	1				3	$result .= 1;
69						}
70	1				2	$last_match = shift @character_list;
71	1				2	next;
72						}
73						# D,T in front of C,S,Z => 8
74						# D,T => 2
75	112	100			233	if (Text::Phonetic::_is_inlist($character_list[0],qw(D T))) {
76	8	50	66		33	if (defined($character_list[1]) && $character_list[1] =~ m/[CSZ]/) {
77	0				0	$result .= 8;
78						} else {
79	8				14	$result .= 2;
80						}
81	8				12	$last_match = shift @character_list;
82	8				19	next;
83						}
84						# F,V,W => 3
85	104	100			234	if (Text::Phonetic::_is_inlist($character_list[0],qw(F V W))) {
86	8				13	$result .= 3;
87	8				12	$last_match = shift @character_list;
88	8				19	next;
89						}
90						# C in front of A,H,K,O,Q,U,X => 4
91						# C after S,Z => 8
92	96	100			197	if ($character_list[0] eq 'C') {
93	8	100	33		17	if (Text::Phonetic::_is_inlist($last_match,qw(S Z))) {
		50
94	7				11	$result .= 8;
95						} elsif (defined($character_list[1])
96						&& Text::Phonetic::_is_inlist($character_list[1],qw(A H K O Q U X))) {
97	1				3	$result .= 4;
98						}
99	8				15	$last_match = shift @character_list;
100	8				18	next;
101						}
102
103						# G,K,Q => 4
104	88	100			178	if (Text::Phonetic::_is_inlist($character_list[0],qw(G Q K))) {
105	3				4	$result .= 4;
106	3				7	$last_match = shift @character_list;
107	3				6	next;
108						}
109						# X not after C,K,Q => 48
110						# X after C,K,Q => 8
111	85	100			170	if ($character_list[0] eq 'X') {
112	1	50			2	if (Text::Phonetic::_is_inlist($last_match,qw(C K Q))) {
113	0				0	$result .= 8;
114						} else {
115	1				5	$result .= 48;
116						}
117	1				3	$last_match = shift @character_list;
118	1				2	next;
119						}
120						# L => 5
121	84	100			160	if ($character_list[0] eq 'L') {
122	7				10	$result .= 5;
123	7				11	$last_match = shift @character_list;
124	7				15	next;
125						}
126						# M,N => 6
127	77	100			169	if (Text::Phonetic::_is_inlist($character_list[0],qw(M N))) {
128	11				19	$result .= 6;
129	11				15	$last_match = shift @character_list;
130	11				26	next;
131						}
132						# R => 7
133	66	100			130	if ($character_list[0] eq 'R') {
134	11				14	$result .= 7;
135	11				19	$last_match = shift @character_list;
136	11				25	next;
137						}
138						# S,Z => 8
139	55	100			119	if (Text::Phonetic::_is_inlist($character_list[0],qw(S Z))) {
140	8				13	$result .= 8;
141	8				291	$last_match = shift @character_list;
142	8				22	next;
143						}
144
145						# No rule matched
146	47				103	$last_match = shift @character_list;
147						}
148
149						# Replace consecutive codes
150	16				84	$result =~ s/(\d)\1+/$1/g;
151
152						# Replace zero code (except for first position)
153						#$result =~ s/(\d+)0/$1/g;
154
155	16				53	return $result
156						}
157
158
159						1;
160
161						=encoding utf8
162
163						=pod
164
165						=head1 NAME
166
167						Text::Phonetic::Koeln - Kölner Phonetik algorithm
168
169						=head1 DESCRIPTION
170
171						The "Kölner Phonetik" is a phonetic algorithm for indexing names by sound, as
172						pronounced in German. The goal is for names with the same pronunciation to be
173						encoded to the same representation so that they can be matched despite minor
174						differences in spelling.
175
176						In contrast to Soundex this algorithm is suitable for long names since the
177						length of the encoded result is not limited. This algorithm is able to find
178						allmost all ortographic variations in names, but also produces many false
179						positives.
180
181						The result is always a sequence of numbers. Special characters and whitespaces
182						are ignored. If your text might contain non-latin characters (except for
183						German umlaute and 'ß') you should unaccent it prior to creating a phonetic
184						code.
185
186						=head1 AUTHOR
187
188						Maroš Kollár
189						CPAN ID: MAROS
190						maros [at] k-1.com
191						http://www.k-1.com
192
193						=head1 COPYRIGHT
194
195						Text::Phonetic::Koeln is Copyright (c) 2006,2007 Maroš. Kollár.
196						All rights reserved.
197
198						This program is free software; you can redistribute
199						it and/or modify it under the same terms as Perl itself.
200
201						The full text of the license can be found in the
202						LICENSE file included with this module.
203
204						=head1 SEE ALSO
205
206						Description of the algorithm can be found at
207						L
208
209						Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
210						von Personennamen auf der Grundlage der Gestaltanalyse. in: IBM-Nachrichten,
211						19. Jahrgang, 1969, S. 925-931
212
213						=cut