File Coverage

blib/lib/Text/Phonetic/Koeln.pm

Criterion	Covered	Total	%
statement	74	78	94.8
branch	33	38	86.8
condition	4	9	44.4
subroutine	3	3	100.0
pod			n/a
total	114	128	89.0

line	stmt	bran	cond	sub	time	code
1						# ============================================================================
2						package Text::Phonetic::Koeln;
3						# ============================================================================
4	3			3	54108	use utf8;
	3				3
	3				13
5
6	3			3	480	use Moo;
	3				8840
	3				11
7						extends qw(Text::Phonetic);
8
9						__PACKAGE__->meta->make_immutable;
10
11						our $VERSION = $Text::Phonetic::VERSION;
12
13						sub _do_encode {
14	16			16	15	my ($self,$string) = @_;
15
16	16				14	my (@character_list,$result,$last_match);
17
18	16				23	$string = uc($string);
19
20						# Replace umlaut
21	16				17	$string =~ s/ß/S/g;
22	16				18	$string =~ s/[äÄ]/AE/g;
23	16				15	$string =~ s/[öÖ]/OE/g;
24	16				19	$string =~ s/[üÜ]/UE/g;
25
26						# Replace double consonants
27						#$string =~ s/([BCDFGHJKLMNPQRSTVWXZ])\1+/$1/g;
28
29						# Convert string to array
30	16				38	@character_list = split //,$string;
31	16				16	$result = '';
32
33						# Handle initial sounds
34						# A,E,I,J,O,U,Y => 0
35	16	100			41	if (Text::Phonetic::_is_inlist($character_list[0],qw(A E I J Y O U))) {
		100
36	1				2	$result .= 0;
37	1				2	$last_match = shift @character_list;
38						} elsif ($character_list[0] eq 'C') {
39	1	50			3	if (Text::Phonetic::_is_inlist($character_list[1],qw(A H K L O Q R U X))) {
40	1				2	$result .= 4;
41						} else {
42	0				0	$result .= 8;
43						}
44	1				2	$last_match = shift @character_list;
45						}
46
47						# Loop all characters
48	16				27	while (scalar(@character_list)) {
49						# A,E,I,J,O,U,Y => 0
50						#if (Text::Phonetic::_is_inlist($character_list[0],qw(A E I J O U Y))) {
51						# $result .= 0;
52						# $last_match = shift @character_list;
53						# next;
54						#}
55						# B => 1
56	116	100			133	if ($character_list[0] eq 'B') {
57	3				2	$result .= 1;
58	3				4	$last_match = shift @character_list;
59	3				4	next;
60						}
61						# P in front of H => 1
62						# P => 3
63	113	100			112	if ($character_list[0] eq 'P') {
64	1	50	33		5	if (defined($character_list[1])
65						&& $character_list[1] eq 'H') {
66	0				0	$result .= 3;
67						} else {
68	1				2	$result .= 1;
69						}
70	1				1	$last_match = shift @character_list;
71	1				2	next;
72						}
73						# D,T in front of C,S,Z => 8
74						# D,T => 2
75	112	100			145	if (Text::Phonetic::_is_inlist($character_list[0],qw(D T))) {
76	8	50	66		23	if (defined($character_list[1]) && $character_list[1] =~ m/[CSZ]/) {
77	0				0	$result .= 8;
78						} else {
79	8				9	$result .= 2;
80						}
81	8				7	$last_match = shift @character_list;
82	8				10	next;
83						}
84						# F,V,W => 3
85	104	100			132	if (Text::Phonetic::_is_inlist($character_list[0],qw(F V W))) {
86	8				7	$result .= 3;
87	8				9	$last_match = shift @character_list;
88	8				13	next;
89						}
90						# C in front of A,H,K,O,Q,U,X => 4
91						# C after S,Z => 8
92	96	100			111	if ($character_list[0] eq 'C') {
93	8	100	33		9	if (Text::Phonetic::_is_inlist($last_match,qw(S Z))) {
		50
94	7				8	$result .= 8;
95						} elsif (defined($character_list[1])
96						&& Text::Phonetic::_is_inlist($character_list[1],qw(A H K O Q U X))) {
97	1				2	$result .= 4;
98						}
99	8				6	$last_match = shift @character_list;
100	8				10	next;
101						}
102
103						# G,K,Q => 4
104	88	100			124	if (Text::Phonetic::_is_inlist($character_list[0],qw(G Q K))) {
105	3				3	$result .= 4;
106	3				3	$last_match = shift @character_list;
107	3				4	next;
108						}
109						# X not after C,K,Q => 48
110						# X after C,K,Q => 8
111	85	100			91	if ($character_list[0] eq 'X') {
112	1	50			2	if (Text::Phonetic::_is_inlist($last_match,qw(C K Q))) {
113	0				0	$result .= 8;
114						} else {
115	1				1	$result .= 48;
116						}
117	1				2	$last_match = shift @character_list;
118	1				1	next;
119						}
120						# L => 5
121	84	100			95	if ($character_list[0] eq 'L') {
122	7				6	$result .= 5;
123	7				6	$last_match = shift @character_list;
124	7				9	next;
125						}
126						# M,N => 6
127	77	100			98	if (Text::Phonetic::_is_inlist($character_list[0],qw(M N))) {
128	11				8	$result .= 6;
129	11				9	$last_match = shift @character_list;
130	11				16	next;
131						}
132						# R => 7
133	66	100			73	if ($character_list[0] eq 'R') {
134	11				9	$result .= 7;
135	11				10	$last_match = shift @character_list;
136	11				14	next;
137						}
138						# S,Z => 8
139	55	100			74	if (Text::Phonetic::_is_inlist($character_list[0],qw(S Z))) {
140	8				7	$result .= 8;
141	8				5	$last_match = shift @character_list;
142	8				13	next;
143						}
144
145						# No rule matched
146	47				63	$last_match = shift @character_list;
147						}
148
149						# Replace consecutive codes
150	16				66	$result =~ s/(\d)\1+/$1/g;
151
152						# Replace zero code (except for first position)
153						#$result =~ s/(\d+)0/$1/g;
154
155	16				34	return $result
156						}
157
158
159						1;
160
161						=encoding utf8
162
163						=pod
164
165						=head1 NAME
166
167						Text::Phonetic::Koeln - Kölner Phonetik algorithm
168
169						=head1 DESCRIPTION
170
171						The "Kölner Phonetik" is a phonetic algorithm for indexing names by sound, as
172						pronounced in German. The goal is for names with the same pronunciation to be
173						encoded to the same representation so that they can be matched despite minor
174						differences in spelling.
175
176						In contrast to Soundex this algorithm is suitable for long names since the
177						length of the encoded result is not limited. This algorithm is able to find
178						allmost all ortographic variations in names, but also produces many false
179						positives.
180
181						The result is always a sequence of numbers. Special characters and whitespaces
182						are ignored. If your text might contain non-latin characters (except for
183						German umlaute and 'ß') you should unaccent it prior to creating a phonetic
184						code.
185
186						=head1 AUTHOR
187
188						Maroš Kollár
189						CPAN ID: MAROS
190						maros [at] k-1.com
191						http://www.k-1.com
192
193						=head1 COPYRIGHT
194
195						Text::Phonetic::Koeln is Copyright (c) 2006,2007 Maroš. Kollár.
196						All rights reserved.
197
198						This program is free software; you can redistribute
199						it and/or modify it under the same terms as Perl itself.
200
201						The full text of the license can be found in the
202						LICENSE file included with this module.
203
204						=head1 SEE ALSO
205
206						Description of the algorithm can be found at
207						L
208
209						Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
210						von Personennamen auf der Grundlage der Gestaltanalyse. in: IBM-Nachrichten,
211						19. Jahrgang, 1969, S. 925-931
212
213						=cut