File Coverage

blib/lib/Text/Names/Canonicalize.pm

Criterion	Covered	Total	%
statement	111	111	100.0
branch	20	26	76.9
condition	13	19	68.4
subroutine	14	14	100.0
pod	2	2	100.0
total	160	172	93.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Names::Canonicalize;
2
3	11			11		3638410	use strict;
	11					24
	11					434
4	11			11		57	use warnings;
	11					22
	11					742
5	11			11		66	use Exporter qw(import);
	11					34
	11					543
6	11			11		6938	use Unicode::Normalize qw(NFKC NFD NFC);
	11					37231
	11					1383
7	11			11		88	use feature 'unicode_strings';
	11					36
	11					1756
8	11			11		6368	use charnames qw(:full);
	11					111427
	11					81
9
10	11			11		278804	use Text::Names::Canonicalize::Rules;
	11					38
	11					6144
11
12							our @EXPORT_OK = qw(
13							canonicalize_name
14							canonicalize_name_struct
15							);
16
17							# Default suffixes used when no rules are provided
18							my %DEFAULT_SUFFIX = map { $_ => 1 } qw(jr sr ii iii iv);
19
20							=head1 NAME
21
22							Text::Names::Canonicalize - Locale-aware personal name canonicalization with YAML rules, inheritance, and user overrides
23
24
25							=head1 VERSION
26
27							Version 0.01
28
29							=cut
30
31							our $VERSION = '0.01';
32
33							=head1 SYNOPSIS
34
35							use Text::Names::Canonicalize qw(
36							canonicalize_name
37							canonicalize_name_struct
38							);
39
40							my $canon = canonicalize_name(
41							"Jean d'Ormesson",
42							locale => 'fr_FR',
43							);
44
45							# jean d'ormesson
46
47							my $struct = canonicalize_name_struct(
48							"Karl von der Heide",
49							locale => 'de_DE',
50							);
51
52							# {
53							# original => "Karl von der Heide",
54							# locale => "de_DE",
55							# parts => {
56							# given => ["karl"],
57							# surname => ["von der", "heide"],
58							# },
59							# canonical => "karl von der heide",
60							# }
61
62							=head1 DESCRIPTION
63
64							Text::Names::Canonicalize provides a robust, data-driven engine for
65							canonicalizing personal names across multiple languages and cultural
66							conventions. It is designed for data cleaning, indexing, matching,
67							and normalization tasks where consistent, locale-aware handling of
68							names is essential.
69
70							The module uses declarative YAML rules for each locale, supports
71							inheritance between locale files, detects circular includes, and
72							allows users to override or extend rules via configuration files.
73
74							A command-line tool C is included for
75							interactive use.
76
77							=head1 FEATURES
78
79							=over 4
80
81							=item * Locale-aware name canonicalization
82
83							=item * YAML-based rule definitions
84
85							=item * Inheritance between locale files (C)
86
87							=item * Circular-include detection
88
89							=item * User override rules via C<$CONFIG_DIR> or C<~/.config>
90
91							=item * Multi-word particle handling (e.g. C, C, C)
92
93							=item * Tokenization and surname-strategy engine
94
95							=item * CLI tool with C<--explain> and C<--rules>
96
97							=back
98
99							=head1 FUNCTIONS
100
101							=head2 canonicalize_name( $name, %opts )
102
103							Returns a canonicalized string form of the name.
104
105							my $canon = canonicalize_name("John Mc Donald", locale => 'en_GB');
106
107							Options:
108
109							=over 4
110
111							=item * C
112
113							Locale code (e.g. C, C, C).
114							Defaults to C.
115
116							=back
117
118							=head2 canonicalize_name_struct( $name, %opts )
119
120							Returns a structured hashref describing the canonicalization process:
121
122							{
123							original => "...",
124							locale => "...",
125							parts => {
126							given => [...],
127							surname => [...],
128							},
129							canonical => "...",
130							}
131
132							Useful for debugging, testing, and downstream processing.
133
134							=head1 LOCALE SYSTEM
135
136							Locale rules are stored as YAML files under:
137
138							Text/Names/Canonicalize/Rules/*.yaml
139
140							Each file contains one or more rulesets (typically C).
141
142							=head2 Inheritance
143
144							A ruleset may include one or more parent locales:
145
146							default:
147							include: en_GB
148							particles:
149							- de
150							- du
151
152							Parents are merged in order, and child keys override parent keys.
153
154							=head2 Circular include detection
155
156							Circular include chains (direct or indirect) are detected and reported
157							with a clear error message.
158
159							=head1 USER OVERRIDES
160
161							Users may override or extend locale rules by placing YAML files in:
162
163							$CONFIG_DIR/text-names-canonicalize/rules/*.yaml
164
165							or, if C<$CONFIG_DIR> is not set:
166
167							~/.config/text-names-canonicalize/rules/*.yaml
168
169							User rules override built-in rules at the per-ruleset level.
170
171							=head1 CLI TOOL
172
173							The distribution includes a command-line utility:
174
175							text-names-canonicalize [options] "Full Name"
176
177							Options:
178
179							--locale LOCALE Select locale (default: en_GB)
180							--explain Dump structured canonicalization
181							--rules Show resolved ruleset for the locale
182
183							Examples:
184
185							text-names-canonicalize "Jean d'Ormesson" --locale fr_FR
186							text-names-canonicalize "Karl von der Heide" --locale de_DE --explain
187							text-names-canonicalize --rules --locale fr_FR
188
189							=head1 YAML RULE FORMAT
190
191							Each ruleset contains:
192
193							=over 4
194
195							=item * C - list of surname particles
196
197							=item * C - generational/professional suffixes
198
199							=item * C - titles to remove
200
201							=item * C - currently C
202
203							=item * C - e.g. C
204
205							=back
206
207							=head1 SUPPORTED LOCALES
208
209							=over 4
210
211							=item * C - shared Western defaults
212
213							=item * C - British English
214
215							=item * C - American English
216
217							=item * C - French
218
219							=item * C - German
220
221							=back
222
223							Additional locales can be added easily by creating new YAML files.
224
225							=head1 EXTENDING
226
227							To add a new locale:
228
229							1. Create a YAML file in Rules/
230							2. Optionally inherit from base or another locale
231							3. Add locale-specific particles, titles, or suffixes
232							4. Write tests under t/
233
234							To override rules locally:
235
236							mkdir -p ~/.config/text-names-canonicalize/rules
237							cp my_rules.yaml ~/.config/text-names-canonicalize/rules/
238
239							=cut
240
241
242							# Returns a plain canonical string.
243							sub canonicalize_name {
244	19			19	1	202735	my ($name, %opts) = @_;
245	19					77	return _normalize_string($name, %opts);
246							}
247
248							sub canonicalize_name_struct {
249	25			25	1	248699	my ($name, %opts) = @_;
250
251	25		50			106	my $locale = $opts{locale} \|\| 'en_GB';
252	25		50			136	my $ruleset = $opts{ruleset} \|\| 'default';
253
254	25					187	my $rules = Text::Names::Canonicalize::Rules->get($locale, $ruleset);
255
256							# 1. Strip titles (using raw input)
257	25	50				75	if (my $titles = $rules->{strip_titles}) {
258	25					69	my $re = join '\|', map { quotemeta } @$titles;
	197					424
259	25	50				658	$name =~ s/\b(?:$re)\b\.?//ig if defined $name;
260							}
261
262							# 2. Normalize
263	25					114	my $norm = _normalize_string($name, %opts);
264
265							# 3. Tokenize
266	25					92	my $tokens = _tokenize($norm);
267
268							# 4. Classify
269	25					60	my $classified = _classify_tokens($tokens, $rules);
270
271							# 5. Extract parts
272	25					55	my $parts = _extract_parts($classified, $rules);
273
274							return {
275	25	50				367	original => (defined $name ? $name : ''),
276							locale => $locale,
277							ruleset => $ruleset,
278							canonical => join(' ', @$tokens),
279							parts => $parts,
280							};
281							}
282
283							sub _tokenize {
284	36			36		102	my ($norm) = @_;
285
286							# Normalize apostrophes
287	36					78	$norm =~ s/[\N{LEFT SINGLE QUOTATION MARK}\N{RIGHT SINGLE QUOTATION MARK}]/'/g;
288
289							# Normalize dash-like characters
290	36					104	$norm =~ s/\p{Dash}/-/g;
291
292							# Split French prefix particles BEFORE splitting on spaces.
293							# d'Ormesson → d' Ormesson
294							# l'Enfant → l' Enfant
295	36					181	$norm =~ s/\b(d'\|l')(\p{Letter}+)/$1 $2/gi;
296
297	36					218	my @t = split / /, $norm;
298
299							# Join multi-word particles (e.g., "von der")
300	36					92	@t = _join_multiword_particles(@t);
301
302	36					122	for (@t) {
303	106					231	s/^\pP+//; # leading punctuation
304	106					193	s/[\pP&&[^']]+$//; # trailing punctuation except apostrophe
305	106					196	s/\.$//; # trailing period (initials)
306							}
307
308	36					73	return [ grep { length } @t ];
	106					387
309							}
310
311							sub _classify_tokens {
312	32			32		86	my ($tokens, $rules) = @_;
313
314	32					132	my %suffix = %DEFAULT_SUFFIX;
315
316							# If rules are provided, override suffix list from ruleset
317	32	50	66			164	if ($rules && $rules->{suffixes}) {
318	25					45	%suffix = map { $_ => 1 } @{ $rules->{suffixes} };
	143					299
	25					55
319							}
320
321	32					65	my @types;
322
323	32					92	for my $t (@$tokens) {
324	97	100				321	if ($t =~ /^[a-z]$/) {
		100
325	13					24	push @types, "initial";
326							}
327							elsif ($suffix{$t}) {
328	6					12	push @types, "suffix";
329							}
330							else {
331	78					201	push @types, "word";
332							}
333							}
334
335							return {
336	32					160	tokens => $tokens,
337							types => \@types,
338							};
339							}
340
341							sub _extract_parts {
342	29			29		59	my ($classified, $rules) = @_;
343
344	29					49	my @tokens = @{ $classified->{tokens} };
	29					115
345	29					45	my @types = @{ $classified->{types} };
	29					77
346
347	29	100				47	my %particle = map { $_ => 1 } @{ $rules->{particles} \|\| [] };
	85					200
	29					114
348
349	29					58	my (@given, @middle, @surname, @suffix);
350
351							# 1. Peel off suffixes
352	29		66			134	while (@types && $types[-1] eq 'suffix') {
353	5					22	unshift @suffix, pop @tokens;
354	5					19	pop @types;
355							}
356
357							# If nothing left, return empty structure
358							return {
359	29	50				66	given => [],
360							middle => [],
361							surname => [],
362							suffix => \@suffix,
363							} unless @tokens;
364
365							# 2. Locale-aware surname extraction
366	29	100	66			120	if ($rules->{surname_strategy} && $rules->{surname_strategy} eq 'last_token_with_particles') {
367
368							# Always take the last token as surname root
369	25					48	my $root = pop @tokens;
370	25					39	pop @types;
371	25					50	unshift @surname, $root;
372
373							# Pull in particles from the end backwards
374	25		66			136	while (@tokens && $particle{$tokens[-1]}) {
375	13					49	unshift @surname, pop @tokens;
376	13					59	pop @types;
377							}
378
379							} else {
380							# Fallback: simple last token
381	4					8	my $root = pop @tokens;
382	4					8	pop @types;
383	4					10	unshift @surname, $root;
384							}
385
386							# 3. Given = first token (if any)
387	29	50				82	if (@tokens) {
388	29					66	push @given, shift @tokens;
389	29					60	shift @types;
390							}
391
392							# 4. Middle = everything else
393	29					85	@middle = @tokens;
394
395							return {
396	29					232	given => \@given,
397							middle => \@middle,
398							surname => \@surname,
399							suffix => \@suffix,
400							};
401							}
402
403							sub _normalize_string {
404	44			44		155	my ($name, %opts) = @_;
405
406	44	100				133	$name = '' unless defined $name;
407
408	44					577	my $norm = NFKC($name);
409
410							# whitespace
411	44					347	$norm =~ s/\s+/ /g;
412	44					140	$norm =~ s/^\s+//;
413	44					179	$norm =~ s/\s+$//;
414
415							# punctuation (basic)
416	44					108	$norm =~ s/[.,]+$//; # strip trailing comma/period
417	44					105	$norm =~ s/^[.,]+//; # strip leading comma/period
418
419							# lowercase
420	44					150	$norm = lc $norm;
421
422							# diacritics
423	44	100				124	if ($opts{strip_diacritics}) {
424	30					150	my $d = NFD($norm);
425	30					91	$d =~ s/\pM//g;
426	30					169	$norm = NFC($d);
427							}
428
429	44					174	return $norm;
430							}
431
432							sub _join_multiword_particles {
433	36			36		123	my @t = @_;
434	36					83	my @out;
435
436	36					100	while (@t) {
437	106					170	my $w = shift @t;
438
439							# Try 2-word particles
440	106	100	100			419	if (@t && "$w $t[0]" =~ /^(von der\|von dem)$/) {
441	1					14	$w = "$w " . shift @t;
442							}
443
444	106					273	push @out, $w;
445							}
446
447	36					147	return @out;
448							}
449
450							=head1 AUTHOR
451
452							Nigel Horne, C<< >>
453
454							=head1 SEE ALSO
455
456							=head1 REPOSITORY
457
458							L
459
460							=head1 SUPPORT
461
462							This module is provided as-is without any warranty.
463
464							Please report any bugs or feature requests to C,
465							or through the web interface at
466							L.
467							I will be notified, and then you'll
468							automatically be notified of progress on your bug as I make changes.
469
470							You can find documentation for this module with the perldoc command.
471
472							perldoc Text::Names::Canonicalize
473
474							You can also look for information at:
475
476							=over 4
477
478							=item * MetaCPAN
479
480							L
481
482							=item * RT: CPAN's request tracker
483
484							L
485
486							=item * CPAN Testers' Matrix
487
488							L
489
490							=item * CPAN Testers Dependencies
491
492							L
493
494							=back
495
496							=head1 LICENCE AND COPYRIGHT
497
498							Copyright 2026 Nigel Horne.
499
500							Usage is subject to licence terms.
501
502							The licence terms of this software are as follows:
503
504							=over 4
505
506							=item * Personal single user, single computer use: GPL2
507
508							=item * All other users (including Commercial, Charity, Educational, Government)
509							must apply in writing for a licence for use from Nigel Horne at the
510							above e-mail.
511
512							=back
513
514							=cut
515
516							1;