File Coverage

blib/lib/Text/Phonetic/Koeln.pm

Criterion	Covered	Total	%
statement	58	59	98.3
branch	37	38	97.3
condition	8	9	88.8
subroutine	3	3	100.0
pod			n/a
total	106	109	97.2

line	stmt	bran	cond	sub	time	code
1						# ============================================================================
2						package Text::Phonetic::Koeln;
3						# ============================================================================
4	3			3	102473	use utf8;
	3				20
	3				16
5
6	3			3	445	use Moo;
	3				11426
	3				21
7						extends qw(Text::Phonetic);
8
9						our $VERSION = $Text::Phonetic::VERSION;
10
11						sub _do_encode {
12	81			81	146	my ($self,$string) = @_;
13
14	81				119	my (@character_list,$result,$last_match);
15
16	81				144	$string = uc($string);
17
18						# Replace umlaut
19	81				155	$string =~ s/ß/S/g;
20	81				98	$string =~ s/Ä/AE/g;
21	81				119	$string =~ s/Ö/OE/g;
22	81				125	$string =~ s/Ü/UE/g;
23
24						# Replace double consonants
25						#$string =~ s/([BCDFGHJKLMNPQRSTVWXZ])\1+/$1/g;
26
27						# Convert string to array
28	81				207	@character_list = split //,$string;
29	81				109	$result = '';
30
31						# Handle initial sounds
32	81	100			156	if ($character_list[0] eq 'C') {
33	28	100			72	if (Text::Phonetic::_is_inlist($character_list[1],qw(A H K L O Q R U X))) {
34	10				23	$result .= 4;
35						} else {
36	18				29	$result .= 8;
37						}
38	28				44	$last_match = shift @character_list;
39						}
40
41						# Loop all characters
42	81				161	while (scalar(@character_list)) {
43						# A,E,I,J,O,U,Y => 0
44	293	100			551	if (Text::Phonetic::_is_inlist($character_list[0],qw(A E I J Y O U))) {
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
45	80				109	$result .= 0;
46	80				142	$last_match = shift @character_list;
47						# B => 1
48						} elsif ($character_list[0] eq 'B') {
49	34				49	$result .= 1;
50	34				64	$last_match = shift @character_list;
51						# P in front of H => 1
52						# P => 3
53						} elsif ($character_list[0] eq 'P') {
54	3	50	66		10	if (defined($character_list[1])
55						&& $character_list[1] eq 'H') {
56	0				0	$result .= 3;
57						} else {
58	3				7	$result .= 1;
59						}
60	3				8	$last_match = shift @character_list;
61						# D,T in front of C,S,Z => 8
62						# D,T => 2
63						} elsif (Text::Phonetic::_is_inlist($character_list[0],qw(D T))) {
64	15	100	100		61	if (defined($character_list[1]) && $character_list[1] =~ m/[CSZ]/) {
65	3				9	$result .= 8;
66						} else {
67	12				17	$result .= 2;
68						}
69	15				32	$last_match = shift @character_list;
70						# F,V,W => 3
71						} elsif (Text::Phonetic::_is_inlist($character_list[0],qw(F V W))) {
72	18				26	$result .= 3;
73	18				36	$last_match = shift @character_list;
74						# C in front of A,H,K,O,Q,U,X => 4
75						# C after S,Z => 8
76						} elsif ($character_list[0] eq 'C') {
77	43	100	100		71	if (Text::Phonetic::_is_inlist($last_match,qw(S Z))) {
		100
78	12				15	$result .= 8;
79						} elsif (defined($character_list[1])
80						&& Text::Phonetic::_is_inlist($character_list[1],qw(A H K O Q U X))) {
81	10				13	$result .= 4;
82						} else {
83	21				28	$result .= 8;
84						}
85	43				90	$last_match = shift @character_list;
86						# G,K,Q => 4
87						} elsif (Text::Phonetic::_is_inlist($character_list[0],qw(G Q K))) {
88	11				17	$result .= 4;
89	11				25	$last_match = shift @character_list;
90						# X not after C,K,Q => 48
91						# X after C,K,Q => 8
92						} elsif ($character_list[0] eq 'X') {
93	5	100			10	if (Text::Phonetic::_is_inlist($last_match,qw(C K Q))) {
94	4				6	$result .= 8;
95						} else {
96	1				3	$result .= 48;
97						}
98	5				10	$last_match = shift @character_list;
99						# L => 5
100						} elsif ($character_list[0] eq 'L') {
101	9				16	$result .= 5;
102	9				26	$last_match = shift @character_list;
103						# M,N => 6
104						} elsif (Text::Phonetic::_is_inlist($character_list[0],qw(M N))) {
105	21				32	$result .= 6;
106	21				44	$last_match = shift @character_list;
107						# R => 7
108						} elsif ($character_list[0] eq 'R') {
109	18				24	$result .= 7;
110	18				41	$last_match = shift @character_list;
111						# S,Z => 8
112						} elsif (Text::Phonetic::_is_inlist($character_list[0],qw(S Z))) {
113	21				33	$result .= 8;
114	21				42	$last_match = shift @character_list;
115						# No rule matched
116						} else {
117	15				31	$last_match = shift @character_list;
118						}
119
120						}
121
122						# Replace consecutive codes
123	81				371	$result =~ s/(\d)\1+/$1/g;
124
125						# Replace zero code (except for first position)
126	81				253	$result =~ s/([1-9])0+/$1/g;
127
128	81				249	return $result
129						}
130
131
132						1;
133
134						=encoding utf8
135
136						=pod
137
138						=head1 NAME
139
140						Text::Phonetic::Koeln - Kölner Phonetik algorithm
141
142						=head1 DESCRIPTION
143
144						The "Kölner Phonetik" is a phonetic algorithm for indexing names by sound, as
145						pronounced in German. The goal is for names with the same pronunciation to be
146						encoded to the same representation so that they can be matched despite minor
147						differences in spelling.
148
149						In contrast to Soundex this algorithm is suitable for long names since the
150						length of the encoded result is not limited. This algorithm is able to find
151						allmost all ortographic variations in names, but also produces many false
152						positives.
153
154						The result is always a sequence of numbers. Special characters and whitespaces
155						are ignored. If your text might contain non-latin characters (except for
156						German umlaute and 'ß') you should unaccent it prior to creating a phonetic
157						code.
158
159						=head1 AUTHOR
160
161						Maroš Kollár
162						CPAN ID: MAROS
163						maros [at] k-1.com
164						http://www.k-1.com
165
166						=head1 COPYRIGHT
167
168						Text::Phonetic::Koeln is Copyright (c) 2006,2007 Maroš. Kollár.
169						All rights reserved.
170
171						This program is free software; you can redistribute
172						it and/or modify it under the same terms as Perl itself.
173
174						The full text of the license can be found in the
175						LICENSE file included with this module.
176
177						=head1 SEE ALSO
178
179						Description of the algorithm can be found at
180						L
181
182						Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
183						von Personennamen auf der Grundlage der Gestaltanalyse. in: IBM-Nachrichten,
184						19. Jahrgang, 1969, S. 925-931
185
186						=cut