File Coverage

blib/lib/Lingua/EN/NameParse.pm

Criterion	Covered	Total	%
statement	177	260	68.0
branch	58	104	55.7
condition	6	24	25.0
subroutine	21	23	91.3
pod	10	10	100.0
total	272	421	64.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Lingua::EN::NameParse - extract the components of a person or couples full name, presented as a text string
4
5							=head1 SYNOPSIS
6
7							use Lingua::EN::NameParse qw(clean case_surname);
8
9							# optional configuration arguments
10							my %args =
11							(
12							auto_clean => 1,
13							lc_prefix => 1,
14							initials => 3,
15							allow_reversed => 1,
16							joint_names => 0,
17							extended_titles => 0
18							);
19
20							my $name = Lingua::EN::NameParse->new(%args);
21
22							$error = $name->parse("Estate Of Lt Col AB Van Der Heiden (Hold Mail)");
23							unless ( $error )
24							{
25							print($name->report);
26
27							Case all : Estate Of Lt Col AB Van Der Heiden (Hold Mail)
28							Case all reversed : Van Der Heiden, Lt Col AB
29							Salutation : Dear Friend
30							Type : Mr_A_Smith
31							Parsing Error : 0
32							Error description : :
33							Parsing Warning : 1
34							Warning description : ;non_matching text found : (Hold Mail)
35
36							COMPONENTS
37							initials_1 : AB
38							non_matching : (Hold Mail)
39							precursor : Estate Of
40							surname_1 : Van Der Heiden
41							title_1 : Lt Col
42
43							%name_comps = $name->components;
44							$surname = $name_comps{surname_1};
45
46							$correct_casing = $name->case_all;
47
48							$correct_casing = $name->case_all_reversed ;
49
50							$salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend'));
51
52							$good_name = clean("Bad Na9me "); # "Bad Name"
53
54							%my_properties = $name->properties;
55							$number_surnames = $my_properties{number}; # 1
56							}
57
58
59							$lc_prefix = 0;
60							$correct_case = case_surname("DE SILVA-MACNAY",$lc_prefix); # A stand alone function, returns: De Silva-MacNay
61
62							$error = $name->parse("MR AS & D.E. DE LA MARE");
63							%my_properties = $name->properties;
64							$number_surnames = $my_properties{number}; # 2
65
66
67							=head1 DESCRIPTION
68
69
70							This module takes as input one person's name or a couples names in
71							free format text such as,
72
73							Mr AB & M/s CD MacNay-Smith
74							MR J.L. D'ANGELO
75							Estate Of The Late Lieutenant Colonel AB Van Der Heiden
76
77							and attempts to parse it. If successful, the name is broken
78							down into components and useful functions can be performed such as :
79
80							converting upper or lower case values to name case (Mr AB MacNay )
81							creating a personalised greeting or salutation (Dear Mr MacNay )
82							extracting the names individual components (Mr,AB,MacNay )
83							determining the type of format the name is in (Mr_A_Smith )
84
85
86							If the name(s) cannot be parsed you have the option of cleaning the name(s)
87							of bad characters, or extracting any portion that was parsed and the
88							portion that failed.
89
90							This module can be used for analysing and improving the quality of
91							lists of names.
92
93
94							=head1 DEFINITIONS
95
96							The following terms are used by NameParse to define the components
97							that can make up a name.
98
99							Precursor - Estate of (The Late), Right Honourable ...
100							Title - Mr, Mrs, Ms., Sir, Dr, Major, Reverend ...
101							Conjunction - word to separate two names, such as "And" or &
102							Initials - 1-3 letters, each with an optional space and/or dot
103							Surname - De Silva, Van Der Heiden, MacNay-Smith, O'Reilly ...
104							Suffix - Snr., Jnr, III, V ...
105
106							Refer to the component grammar defined within the code for a complete
107							list of combinations.
108
109							'Name casing' refers to the correct use of upper and lower case letters
110							in peoples names, such as Mr AB McNay.
111
112							To describe the formats supported by NameParse, a short hand representation
113							of the name is used. The following formats are currently supported :
114
115							Mr_John_Smith_&_Ms_Mary_Jones
116							Mr_A_Smith_&_Ms_B_Jones
117							Mr_&Ms_A_&_B_Smith
118							Mr_A_&_Ms_B_Smith
119							Mr_&_Ms_A_Smith
120							Mr_A_&_B_Smith
121							John_Smith_&_Mary_Jones
122							John_&_Mary_Smith
123							A_Smith_&_B_Jones
124
125							Mr_John_Adam_Smith
126							Mr_John_A_Smith
127							Mr_John_Smith
128							Mr_A_Smith
129							John_Adam_Smith
130							John_A_Smith
131							J_Adam_Smith
132							John_Smith
133							A_Smith
134							John
135
136							Precursors and suffixes may be applied to single names that have a surname
137
138
139							=head1 METHODS
140
141							=head2 new
142
143							The C method creates an instance of a name object and sets up
144							the grammar used to parse names. This must be called before any of the
145							following methods are invoked. Note that the object only needs to be
146							created ONCE, and should be reused with new input data. Calling C
147							repeatedly will significantly slow your program down.
148
149							Various setup options may be defined in a hash that is passed as an optional
150							argument to the C method. Note that all the arguments are optional. You
151							need to define the combination of arguments that are appropriate for your
152							usage.
153
154							my %args =
155							(
156							auto_clean => 1,
157							lc_prefix => 1,
158							initials => 3,
159							allow_reversed => 1
160							);
161
162
163							my $name = Lingua::EN::NameParse->new(%args);
164
165
166							=over 4
167
168							=item auto_clean
169
170							When this option is set to a positive value, any call to the C method
171							that fails will attempt to 'clean' the name and then reparse it. See the
172							C method for details. This is useful for dirty data with embedded
173							unprintable or non alphabetic characters.
174
175							=item lc_prefix
176
177							When this option is set to a positive value, it will force the C
178							and C methods to lower case the first letter of each word that
179							occurs in the prefix portion of a surname. For example, Mr AB de Silva,
180							or Ms AS von der Heiden.
181
182							=item initials
183
184							Allows the user to control the number of letters that can occur in the initials.
185							Valid settings are 1,2 or 3. If no value is supplied a default of 2 is used.
186
187							=item allow_reversed
188
189							When this option is set to a positive value, names in reverse order will be
190							processed. The only valid format is the surname followed by a comma and the
191							rest of the name, which can be in any of the combinations allowed by non
192							reversed names. Some examples are:
193
194							Smith, Mr AB
195							Jones, Jim
196							De Silva, Professor A.B.
197
198							The program changes the order of the name back to the non reversed format, and
199							then performs the normal parsing. Note that if the name can be parsed, the fact
200							that it's order was originally reversed, is not recorded as a property of the
201							name object.
202
203							=item joint_names
204
205							When this option is set to a positive value, joint names are accounted for:
206
207							Mr_A_Smith_&Ms_B_Jones
208							Mr_&Ms_A_&B_Smith
209							Mr_A_&Ms_B_Smith
210							Mr_&Ms_A_Smith
211							Mr_A_&B_Smith
212
213							Note that if this option is not specified, than by default joint names are
214							ignored. Disabling joint names speeds up the processing a lot.
215
216							=item extended_titles
217
218							When this option is set to a positive value, all combinations of titles,
219							such as Colonel, Mother Superior are used. If this value is not set, only
220							the following titles are accounted for:
221
222							Mr
223							Ms
224							M/s
225							Mrs
226							Miss
227							Dr
228							Sir
229							Dame
230
231
232							Note that if this option is not specified, than by default extended titles
233							are ignored. Disabling extended titles speeds up the parsing.
234
235							=back
236
237							=head2 parse
238
239							$error = $name->parse("MR AC DE SILVA");
240
241							The C method takes a single parameter of a text string containing a
242							name. It attempts to parse the name and break it down into the components
243
244							Returns an error flag. If the name was parsed successfully, it's value is 0,
245							otherwise a 1. This step is a prerequisite for the following methods.
246
247
248							=head2 case_all
249
250							$correct_casing = $name->case_all;
251
252							The C method converts the first letter of each component to
253							capitals and the remainder to lower case, with the following exceptions-
254
255							initials remain capitalised
256							surname spelling such as MacNay-Smith, O'Brien and Van Der Heiden are preserved
257							- see C for user defined exceptions
258
259							A complete definition of the capitalising rules can be found by studying
260							the case_surname function.
261
262							The method returns the entire cased name as text.
263
264							=head2 case_all_reversed
265
266							$correct_casing = $name->case_all_reversed;
267
268							The C method applies the same type of casing as
269							C. However, the name is returned as surname followed by a comma
270							and the rest of the name, which can be any of the combinations allowed
271							for a name, except the title. Some examples are: "Smith, John", "De Silva, A.B."
272							This is useful for sorting names alphabetically by surname.
273
274							The method returns the entire reverse order cased name as text.
275
276
277							=head2 components
278
279							%my_name = $name->components;
280							$cased_surname = $my_name{surname_1};
281
282
283							The C method does the same thing as the C method,
284							but returns the name cased components in a hash. The following keys are used
285							for each component:
286
287							precursor
288							title_1
289							title_2
290							given_name_1
291							given_name_2
292							initials_1
293							initials_2
294							middle_name
295							conjunction_1
296							conjunction_2
297							surname_1
298							surname_2
299							suffix
300
301							If a component has no matching data for a given name, it will not appear in the hash
302
303							If the name could not be parsed, this method returns null. If you assign the return
304							value to a hash, you should check the error status returned by the C method first.
305							Ohterwise, you will get an odd number of values assigned to the hash.
306
307
308							=head2 case_surname
309
310							$correct_casing = case_surname("DE SILVA-MACNAY" [,$lc_prefix]);
311
312							C is a stand alone function that does not require a name
313							object. The input is a text string. An optional input argument controls the
314							casing rules for prefix portions of a surname, as described above in the
315							C section.
316
317							The output is a string converted to the correct casing for surnames.
318							See C for user defined exceptions
319
320							This function is useful when you know you are only dealing with names that
321							do not have initials like "Mr John Jones". It is much faster than the case_all
322							method, but does not understand context, and cannot detect errors on strings
323							that are not personal names.
324
325
326							=head2 surname_prefs.txt
327
328							Some surnames can have more than one form of valid capitalisation, such as
329							MacQuarie or Macquarie. Where the user wants to specify one form as the default,
330							a text file called surname_prefs.txt should be created and placed in the same
331							location as the NameParse module. The text file should contain one surname per
332							line, in the capitalised form you want, such as
333
334							Macquarie
335							MacHado
336
337							NameParse will still operate if the file does not exist
338
339							=head2 salutation
340
341							$salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend',sal_type => 'given_name'));
342
343							The C method converts a name into a personal greeting,
344							such as "Dear Mr & Mrs O'Brien" or "Dear Sue and John"
345
346							Optional parameters may be specided in a hash as follows:
347
348
349							salutation:
350
351							The greeting word such as 'Dear' or 'Greetings'. If not spefied than 'Dear' is used
352
353							sal_default:
354
355							The default word used when a personalised salution cannot be generated. If not
356							specified, than 'Friend' is used.
357
358							sal_type:
359
360							Can be either 'given_name' such as 'Dear Sue' or 'title_plus_name' such as 'Dear Ms Smith'
361							If not specified, than 'given_name' is used.
362
363							If an error is detected during parsing, such as with the name "AB Smith & Associates",
364							then the value of sal_default is used instead of a given name, or a title and surname.
365							If the input string contains a conjunction, an 's' is added to the value of sal_default.
366
367							If the name contains a precursor, a default salutation is produced.
368
369							=head2 clean
370
371							$good_name = clean("Bad Na9me");
372
373							C is a stand alone function that does not require a name object.
374							The input is a text string and the output is the string with:
375
376							all repeating spaces removed
377							all characters not in the set (A-Z a-z - ' , . &) removed
378
379
380							=head2 properties
381
382							The C method returns all the properties of the name,
383							non_matching, number and type, as a hash.
384
385							=over 4
386
387							=item type
388
389							The type of format a name is in, as one of the following strings:
390
391							Mr_A_Smith_&Ms_B_Jones
392							Mr_&Ms_A_&B_Smith
393							Mr_A_&Ms_B_Smith
394							Mr_&Ms_A_Smith
395							Mr_A_&B_Smith
396							Mr_John_Adam_Smith
397							Mr_John_A_Smith
398							Mr_John_Smith
399							Mr_A_Smith
400							John_Adam_Smith
401							John_A_Smith
402							J_Adam_Smith
403							John_Smith
404							A_Smith
405							John
406							unknown
407
408
409							=item non_matching
410
411							Returns any unmatched section that was found.
412
413							=back
414
415							=head2 report
416
417							Create a formatted text report to standard output listing
418							- the input string,
419							- the name and value of each defined component
420							- any non matching component
421
422
423							=head1 LIMITATIONS
424
425							The huge number of character combinations that can form a valid names makes
426							it is impossible to correctly identify them all. Firstly, there are many
427							ambiguities, which have no right answer.
428
429							Macbeth or MacBeth, are both valid spellings
430							Is ED WOOD E.D. Wood or Edward Wood
431							Is 'Mr Rapid Print' a name or a company
432							Does John Bradfield Smith have a middle name of Bradfield, or a surname of Bradfield-Smith?
433
434							One approach is to have large lookup files of names and words, statistical rules
435							and fuzzy logic to attempt to derive context. This approach gives high levels of
436							accuracy but uses a lot of your computers time and resources.
437
438							NameParse takes the approach of using a limited set of rules, based on the
439							formats that are commonly used by business to represent peoples names. This
440							gives us fairly high accuracy, with acceptable speed and program size.
441
442							NameParse will accept names from many countries, like Van Der Heiden,
443							De La Mare and Le Fontain. Having said that, it is still biased toward English,
444							because the precursors, titles and conjunctions are based on English usage.
445
446							Names with two or more words, but no separating hyphen are not recognized.
447							This is a real quandary as Indian, Chinese and other names can have several
448							components. If these are allowed for, any component after the surname
449							will also be picked up. For example in "Mr AB Jones Trading As Jones Pty Ltd"
450							will return a surname of "Jones Trading".
451
452							Because of the large combination of possible names defined in the grammar, the
453							program is not very fast, except for the more limited C subroutine.
454							See the "Future Directions" section for possible speed ups.
455
456							As the parser has a very limited understanding of context, the "John_Adam_Smith"
457							name type is most likely to cause problems, as it contains no known tokens
458							like a title. A string such as "National Australia Bank" would be accepted
459							as a valid name, first name National etc. Supplying a list of common pronouns
460							as exceptions could solve this problem.
461
462
463							=head1 REFERENCES
464
465							"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997)
466
467							Australian Standard AS4212-1994 "Geographic Information Systems -
468							Data Dictionary for transfer of street addressing information"
469
470
471							=head1 FUTURE DIRECTIONS
472
473							Define grammar for other languages. Hopefully, all that would be needed is
474							to specify a new module with its own grammar, and inherit all the existing
475							methods. I don't have the knowledge of the naming conventions for non-english
476							languages.
477
478							=head1 REPOSITORY
479
480							L
481
482
483							=head1 SEE ALSO
484
485							L, L, L,
486							L, L
487
488
489							=head1 BUGS
490
491							Names with accented characters (acute, circumfelx etc) will not be parsed
492							correctly. A work around is to replace the character class [a-z] with \w
493							in the appropriate rules in the grammar tree, but this could lower the accuracy
494							of names based purely on ASCII text.
495
496							=head1 CREDITS
497
498							Thanks to all the people who provided ideas and suggestions, including -
499
500							Damian Conway, author of Parse::RecDescent
501							Mark Summerfield author of Lingua::EN::NameCase,
502							Ron Savage, Alastair Adam Huffman, Douglas Wilson
503							Peter Schendzielorz
504
505							=head1 AUTHOR
506
507							NameParse was written by Kim Ryan
508
509							=head1 COPYRIGHT AND LICENSE
510
511							Copyright (c) 2016 Kim Ryan. All rights reserved.
512
513							This library is free software; you can redistribute it and/or modify
514							it under the same terms as Perl itself.
515
516							=cut
517							#-------------------------------------------------------------------------------
518
519							package Lingua::EN::NameParse;
520
521	2			2		19427	use strict;
	2					3
	2					47
522	2			2		6	use warnings;
	2					2
	2					39
523
524	2			2		711	use Lingua::EN::NameParse::Grammar;
	2					9
	2					42
525	2			2		2012	use Parse::RecDescent;
	2					59747
	2					14
526
527	2			2		76	use Exporter;
	2					2
	2					65
528	2			2		7	use vars qw (@ISA @EXPORT_OK);
	2					2
	2					2459
529
530							our $VERSION = '1.36';
531							@ISA = qw(Exporter);
532							@EXPORT_OK = qw(clean case_surname);
533
534							#-------------------------------------------------------------------------------
535							# Create a new instance of a name parsing object. This step is time consuming
536							# and should normally only be called once in your program.
537
538							sub new
539							{
540	2			2	1	15	my $class = shift;
541	2					5	my %args = @_;
542
543	2					3	my $name = {};
544	2					3	bless($name,$class);
545
546							# Default to 2 initials per name. Can be overwritten if user defines
547							# 'initials' as a key in the hash supplied to new method.
548	2					12	$name->{initials} = 2;
549
550	2					14	my $current_key;
551	2					7	foreach my $current_key (keys %args)
552							{
553	8					11	$name->{$current_key} = $args{$current_key};
554							}
555
556	2					7	my $grammar = Lingua::EN::NameParse::Grammar::_create($name);
557	2					15	$name->{parse} = new Parse::RecDescent($grammar);
558
559	2					348263	return ($name);
560							}
561							#-------------------------------------------------------------------------------
562							# Attempt to parse a string and retrieve it's components and properties
563							# Requires a name object to have been created with the 'new' method'
564							# Returns: an error code, 0 for success, 1 for failure
565
566							sub parse
567							{
568	28			28	1	4271	my $name = shift;
569	28					34	my ($input_string) = @_;
570
571	28					36	chomp($input_string);
572
573							# If reverse ordered names are allowed, swap the surname component, before
574							# the comma, with the rest of the name. Rejoin the name, replacing comma
575							# with a space.
576
577	28	100	100			107	if ( $name->{allow_reversed} and $input_string =~ /,/ )
578							{
579	1					4	my ($first,$second) = split(/,/,$input_string);
580	1					4	$input_string = join(' ',$second,$first);
581							}
582
583	28					38	$name->{comps} = ();
584	28					92	$name->{properties} = ();
585	28					63	$name->{properties}{type} = 'unknown';
586	28					32	$name->{error} = 0;
587	28					33	$name->{error_desc} = '';
588	28					78	$name->{warning} = 0;
589	28					32	$name->{warning_desc} = '';
590
591	28					28	$name->{original_input} = $input_string;
592	28					30	$name->{input_string} = $input_string;
593
594	28					49	$name = _pre_parse($name);
595	28	50				56	unless ( $name->{error} )
596							{
597	28	100				43	if ( $name->{auto_clean} )
598							{
599	9					17	$name->{input_string} = clean($name->{input_string});
600							}
601	28					41	$name = _assemble($name);
602	28					44	_validate($name);
603							}
604
605	28					60	return($name->{error});
606							}
607							#-------------------------------------------------------------------------------
608							# Clean the input string. Can be called as a stand alone function.
609
610							sub clean
611							{
612	10			10	1	528	my ($input_string) = @_;
613
614							# remove illegal characters
615	10					19	$input_string =~ s/[^A-Za-z\-\'\.&\/ ]//go;
616
617							# remove repeating spaces
618	10					16	$input_string =~ s/ +/ /go ;
619
620							# remove any remaining leading or trailing space
621	10					14	$input_string =~ s/^ //;
622
623	10					18	return($input_string);
624							}
625
626							#-------------------------------------------------------------------------------
627							# Given a name object, apply correct capitalisation to each component of a person's name.
628							# Return all cased components in a hash.
629							# Else return no value.
630
631
632							sub components
633							{
634	35			35	1	32	my $name = shift;
635
636	35	50				76	if ( $name->{properties}{type} eq 'unknown' )
637							{
638	0					0	return;
639							}
640							else
641							{
642	35					29	my %orig_components = %{ $name->{comps} };
	35					190
643
644	35					48	my ($current_key,%cased_components);
645	35					78	foreach $current_key ( keys %orig_components )
646							{
647	490					329	my $cased_value;
648	490	100				930	if ( $current_key =~ /initials/ ) # initials_1, possibly initials_2
		100
		50
649							{
650	70					80	$cased_value = uc($orig_components{$current_key});
651							}
652							elsif ( $current_key =~ /surname\|suffix/ )
653							{
654	105					191	$cased_value = case_surname($orig_components{$current_key},$name->{lc_prefix});
655							}
656							elsif ( $current_key eq 'type')
657							{
658	0					0	$cased_value = $orig_components{$current_key};
659							}
660							else
661							{
662	315					316	$cased_value = _case_word($orig_components{$current_key});
663							}
664
665	490					559	$cased_components{$current_key} = $cased_value;
666							}
667	35					333	return(%cased_components);
668							}
669							}
670
671							#-------------------------------------------------------------------------------
672							# Hash of of lists, indicating the order that name components are assembled in.
673							# Each list element is itself the name of the key value in a name object.
674							# Used by the case_all and case_all_reversed methods.
675							# These hashes are created here globally, as quite a large overhead is
676							# imposed if the are created locally, each time the method is invoked
677
678							my %component_order=
679							(
680							'Mr_John_Smith_&_Ms_Mary_Jones' => ['title_1','given_name_1','surname_1','conjunction_1','title_2','given_name_2','surname_2'],
681							'Mr_A_Smith_&_Ms_B_Jones' => ['title_1','initials_1','surname_1','conjunction_1','title_2','initials_2','surname_2'],
682							'Mr_&_Ms_A_&_B_Smith' => ['title_1','conjunction_1','title_2','initials_1','conjunction_2','initials_2','surname_1'],
683							'Mr_A_&_Ms_B_Smith' => ['title_1','initials_1','conjunction_1','title_2','initials_2','surname_1'],
684							'Mr_&_Ms_A_Smith' => ['title_1','conjunction_1','title_2','initials_1','surname_1'],
685							'Mr_A_&_B_Smith' => ['title_1','initials_1','conjunction_1','initials_2','surname_1'],
686							'John_Smith_&Mary_Jones' => ['given_name_1','surname_1','conjunction_1','given_name_2','surname_2'],
687							'John_&_Mary_Smith' => ['given_name_1','conjunction_1','given_name_2','surname_1'],
688							'A_Smith_&_B_Jones' => ['initials_1','surname_1','conjunction_1','initials_2','surname_2'],
689
690							'Mr_John_Adam_Smith' => ['precursor','title_1','given_name_1','middle_name','surname_1','suffix'],
691							'Mr_John_A_Smith' => ['precursor','title_1','given_name_1','initials_1','surname_1','suffix'],
692							'Mr_John_Smith' => ['precursor','title_1','given_name_1','surname_1','suffix'],
693							'Mr_A_Smith' => ['precursor','title_1','initials_1','surname_1','suffix'],
694							'John_Adam_Smith' => ['precursor','given_name_1','middle_name','surname_1','suffix'],
695							'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'],
696							'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'],
697							'John_Smith' => ['precursor','given_name_1','surname_1','suffix'],
698							'A_Smith' => ['precursor','initials_1','surname_1','suffix'],
699							'John' => ['given_name_1']
700							);
701
702
703							# only include names with a single surname
704							my %reverse_component_order=
705							(
706							'Mr_&_Ms_A_&_B_Smith' => ['surname_1','title_1','conjunction_1','title_2','initials_1','conjunction_1','initials_2'],
707							'Mr_A_&_Ms_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','title_2','initials_2'],
708							'Mr_&_Ms_A_Smith' => ['surname_1','title_1','title_1','conjunction_1','title_2','initials_1'],
709							'Mr_A_&_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','initials_2'],
710							'John_&_Mary_Smith' => ['surname_1','given_name_1','conjunction_1','given_name_2'],
711
712							'Mr_John_Adam_Smith' => ['surname_1','title_1','given_name_1','middle_name','suffix'],
713							'Mr_John_A_Smith' => ['surname_1','title_1','given_name_1','initials_1','suffix'],
714							'Mr_John_Smith' => ['surname_1','title_1','given_name_1','suffix'],
715							'Mr_A_Smith' => ['surname_1','title_1','initials_1','suffix'],
716							'John_Adam_Smith' => ['surname_1','given_name_1','middle_name','suffix'],
717							'John_A_Smith' => ['surname_1','given_name_1','initials_1','suffix'],
718							'J_Adam_Smith' => ['surname_1','initials_1','middle_name','suffix'],
719							'John_Smith' => ['surname_1','given_name_1','suffix'],
720							'A_Smith' => ['surname_1','initials_1','suffix']
721							);
722
723							#-------------------------------------------------------------------------------
724							# Apply correct capitalisation to a person's entire name
725							# If the name type is unknown, return undef
726							# Else, return a string of all cased components in correct order
727
728							sub case_all
729							{
730	2			2	1	6	my $name = shift;
731
732	2					4	my @cased_name;
733
734	2	50				5	if ( $name->{properties}{type} eq 'unknown' )
735							{
736	0					0	return undef;
737							}
738
739	2	50				9	unless ( $component_order{$name->{properties}{type}} )
740							{
741							# component order missing in array defined above
742	0					0	warn "Component order not defined for: $name->{properties}{type}";
743	0					0	return;
744							}
745
746	2					4	my %component_vals = $name->components;
747	2					3	my @order = @{ $component_order{$name->{properties}{type}} };
	2					8
748
749	2					5	foreach my $component_key ( @order )
750							{
751							# As some components such as precursors are optional, they will appear
752							# in the order array but may or may not have have a value, so only
753							# process defined values
754	10	100				18	if ( $component_vals{$component_key} )
755							{
756	6					8	push(@cased_name,$component_vals{$component_key});
757							}
758							}
759	2	100				5	if ( $name->{comps}{non_matching} )
760							{
761							# Despite errors, try to name case non-matching section. As the format
762							# of this section is unknown, surname case will provide the best
763							# approximation, but still fail on initials of more than 1 letter
764	1					4	push(@cased_name,case_surname($name->{comps}{non_matching},$name->{lc_prefix}));
765							}
766
767	2					17	return(join(' ',@cased_name));
768							}
769
770							#-------------------------------------------------------------------------------
771							=head1 case_all_reversed
772
773							Apply correct capitalisation to a person's entire name and reverse the order
774							so that surname is first, followed by the other components, such as: Smith, Mr John A
775							Useful for creating a list of names that can be sorted by surname.
776
777							If name type is unknown , returns null
778
779							If the name type has a joint name, such as 'Mr_A_Smith_Ms_B_Jones', return null,
780							as it is ambiguous which surname to place at the start of the string
781
782							Else, returns a string of all cased components in correct reversed order
783
784							=cut
785
786							sub case_all_reversed
787							{
788	0			0	1	0	my $name = shift;
789
790	0					0	my @cased_name_reversed;
791
792	0	0				0	unless ( $name->{properties}{type} eq 'unknown' )
793							{
794	0	0				0	unless ( $reverse_component_order{$name->{properties}{type} } )
795							{
796							# this type of name should not be reversed, such as two surnames
797	0					0	return;
798							}
799	0					0	my %component_vals = $name->components;
800	0					0	my @reverse_order = @{ $reverse_component_order{$name->{properties}{type} } };
	0					0
801
802	0					0	foreach my $component_key ( @reverse_order )
803							{
804							# As some components such as precursors are optional, they will appear
805							# in the order array but may or may not have have a value, so only
806							# process defined values
807
808	0					0	my $component_value = $component_vals{$component_key};
809	0	0				0	if ( $component_value )
810							{
811	0	0				0	if ($component_key eq 'surname_1')
812							{
813	0					0	$component_value .= ',';
814							}
815	0					0	push(@cased_name_reversed,$component_value);
816							}
817							}
818							}
819	0					0	return(join(' ',@cased_name_reversed));
820							}
821							#-------------------------------------------------------------------------------
822							# The user may specify their own preferred spelling for surnames.
823							# These should be placed in a text file called surname_prefs.txt
824							# in the same location as the module itself.
825
826							BEGIN
827							{
828							# Obtain the full path to NameParse module, defined in the %INC hash.
829	2			2		5	my $prefs_file_location = $INC{"Lingua/EN/NameParse.pm"};
830							# Now substitute the name of the preferences file
831	2					17	$prefs_file_location =~ s/NameParse\.pm$/surname_prefs.txt/;
832
833	2	50				2989	if ( open(PREFERENCES_FH,"<$prefs_file_location") )
834							{
835	0					0	my @surnames = ;
836	0					0	foreach my $name ( @surnames )
837							{
838	0					0	chomp($name);
839							# Build hash, lower case name is key for case insensitive
840							# comparison, while value holds the actual capitalisation
841	0					0	$Lingua::EN::surname_preferences{lc($name)} = $name;
842							}
843	0					0	close(PREFERENCES_FH);
844							}
845							}
846							#-------------------------------------------------------------------------------
847							# Apply correct capitalisation to a person's surname. Can be called as a
848							# stand alone function.
849
850							sub case_surname
851							{
852	108			108	1	366	my ($surname,$lc_prefix) = @_;
853
854	108	100				159	unless ($surname)
855							{
856	65					94	return('');
857							}
858
859							# If the user has specified a preferred capitalisation for this
860							# surname in the surname_prefs.txt, it should be returned now.
861	43	50				91	if ($Lingua::EN::surname_preferences{lc($surname)} )
862							{
863	0					0	return($Lingua::EN::surname_preferences{lc($surname)});
864							}
865
866							# Lowercase everything
867	43					40	$surname = lc($surname);
868
869							# Now uppercase first letter of every word. By checking on word boundaries,
870							# we will account for apostrophes (D'Angelo) and hyphenated names
871	43					209	$surname =~ s/\b(\w)/\u$1/g;
872
873							# Name case Macs and Mcs
874							# Exclude names with 1-2 letters after prefix like Mack, Macky, Mace
875							# Exclude names ending in a,c,i,o,z or j, typically Polish or Italian
876
877	43	100				134	if ( $surname =~ /\bMac[a-z]{2,}[^a\|c\|i\|o\|z\|j]\b/i )
		50
878							{
879	4					15	$surname =~ s/\b(Mac)([a-z]+)/$1\u$2/ig;
880
881							# Now correct for "Mac" exceptions
882	4					7	$surname =~ s/MacHin/Machin/;
883	4					9	$surname =~ s/MacHlin/Machlin/;
884	4					3	$surname =~ s/MacHar/Machar/;
885	4					7	$surname =~ s/MacKle/Mackle/;
886	4					4	$surname =~ s/MacKlin/Macklin/;
887	4					3	$surname =~ s/MacKie/Mackie/;
888
889							# Portuguese
890	4					4	$surname =~ s/MacHado/Machado/;
891
892							# Lithuanian
893	4					4	$surname =~ s/MacEvicius/Macevicius/;
894	4					3	$surname =~ s/MacIulis/Maciulis/;
895	4					4	$surname =~ s/MacIas/Macias/;
896							}
897							elsif ( $surname =~ /\bMc/i )
898							{
899	0					0	$surname =~ s/\b(Mc)([a-z]+)/$1\u$2/ig;
900							}
901							# Exceptions (only 'Mac' name ending in 'o' ?)
902	43					38	$surname =~ s/Macmurdo/MacMurdo/;
903
904
905	43	100				61	if ( $lc_prefix )
906							{
907							# Lowercase first letter of every word in prefix. The trailing space
908							# prevents the surname from being altered. Note that spellings like
909							# d'Angelo are not accounted for.
910	1					10	$surname =~ s/\b(\w+ )/\l$1/g;
911							}
912
913							# Correct for possessives such as "John's" or "Australia's". Although this
914							# should not occur in a person's name, they are valid for proper names.
915							# As this subroutine may be used to capitalise words other than names,
916							# we may need to account for this case. Note that the 's' must be at the
917							# end of the string
918	43					63	$surname =~ s/(\w+)'S(\s+)/$1's$2/;
919	43					39	$surname =~ s/(\w+)'S$/$1's/;
920
921							# Correct for roman numerals, excluding single letter cases I,V and X,
922							# which will work with the above code
923	43					75	$surname =~ s/\b(I{2,3})\b/\U$1/i; # 2nd, 3rd
924	43					46	$surname =~ s/\b(IV)\b/\U$1/i; # 4th
925	43					42	$surname =~ s/\b(VI{1,3})\b/\U$1/i; # 6th, 7th, 8th
926	43					46	$surname =~ s/\b(IX)\b/\U$1/i; # 9th
927	43					44	$surname =~ s/\b(XI{1,3})\b/\U$1/i; # 11th, 12th, 13th
928
929	43					70	return($surname);
930							}
931							#-------------------------------------------------------------------------------
932							# Create a personalised greeting from one or two person's names
933							# Returns the salutation as a string, such as "Dear Mr Smith", or "Dear Sue"
934
935							sub salutation
936							{
937	3			3	1	4	my $name = shift;
938	3					4	my %args = @_;
939
940	3					5	my $salutation = 'Dear';
941	3					26	my $sal_default = 'Friend';
942	3					4	my $sal_type = 'title_plus_surname';
943
944							# Check to see if we should override defualts with any user specified preferences
945	3	100				6	if ( %args )
946							{
947	2					4	foreach my $current_key (keys %args)
948							{
949	2	50				5	$current_key eq 'salutation' and $salutation = $args{$current_key};
950	2	50				5	$current_key eq 'sal_default' and $sal_default = $args{$current_key};
951	2	50				7	$current_key eq 'sal_type' and $sal_type = $args{$current_key};
952							}
953							}
954
955
956	3					4	my @greeting;
957	3					4	push(@greeting,$salutation);
958
959							# Personalised salutations cannot be created for Estates or people
960							# without some title
961	3	50	33			13	if
			33
962							(
963							$name->{error} or
964							( $name->{comps}{precursor} and $name->{comps}{precursor} =~ /ESTATE/)
965							)
966							{
967							# Despite an error, the presence of a conjunction probably
968							# means we are dealing with 2 or more people.
969							# For example Mr AB Smith & John Jones
970	0	0				0	if ( $name->{input_string} =~ / (AND\|&) / )
971							{
972	0					0	$sal_default .= 's';
973							}
974	0					0	push(@greeting,$sal_default);
975							}
976							else
977							{
978	3					7	my %component_vals = $name->components;
979
980	3	100				14	if ( $sal_type eq 'given_name')
		50
981							{
982	1	50				4	if ( $component_vals{'given_name_1'} )
983							{
984	1					2	push(@greeting,$component_vals{'given_name_1'});
985	1	50				5	if ( $component_vals{'given_name_2'} )
986							{
987	0					0	push(@greeting,$component_vals{'conjunction_1'});
988	0					0	push(@greeting,$component_vals{'given_name_2'});
989							}
990							}
991							else
992							{
993							# No given name such as 'A_Smith','J_Adam_Smith','Mr_A_Smith'
994							# Must use default
995	0					0	push(@greeting,$sal_default);
996							}
997							}
998							elsif ( $sal_type eq 'title_plus_surname' )
999							{
1000	2	50				6	if ( $name->{properties}{number} == 1 )
		0
1001							{
1002	2	100				5	if ( $component_vals{'title_1'} )
1003							{
1004	1					2	push(@greeting,$component_vals{'title_1'});
1005	1					3	push(@greeting,$component_vals{'surname_1'});
1006							}
1007							else
1008							{
1009							# No title such as 'A_Smith','J_Adam_Smith', so must use default
1010	1					3	push(@greeting,$sal_default);
1011							}
1012							}
1013							elsif ( $name->{properties}{number} == 2 )
1014							{
1015							# a joint name
1016
1017	0					0	my $type = $name->{properties}{type};
1018	0	0	0			0	if ( $type eq 'Mr_&Ms_A_Smith' or $type eq 'Mr_A_&Ms_B_Smith' or $type eq 'Mr_&Ms_A_&B_Smith' )
		0	0
			0
1019							{
1020							# common surname
1021	0					0	push(@greeting,$component_vals{'title_1'});
1022	0					0	push(@greeting,$component_vals{'conjunction_1'});
1023	0					0	push(@greeting,$component_vals{'title_2'});
1024	0					0	push(@greeting,$component_vals{'surname_1'});
1025
1026							}
1027							elsif ( $type eq 'Mr_A_Smith_&Ms_B_Jones' or $type eq 'Mr_John_Smith_&Ms_Mary_Jones' )
1028							{
1029	0					0	push(@greeting,$component_vals{'title_1'});
1030	0					0	push(@greeting,$component_vals{'surname_1'});
1031	0					0	push(@greeting,$component_vals{'conjunction_1'});
1032	0					0	push(@greeting,$component_vals{'title_2'});
1033	0					0	push(@greeting,$component_vals{'surname_2'});
1034							}
1035							else
1036							{
1037							# No title such as A_Smith_&B_Jones', 'John_Smith_&Mary_Jones'
1038							# Must use default
1039	0					0	push(@greeting,$sal_default);
1040							}
1041							}
1042							}
1043							else
1044							{
1045	0					0	warn "Invalid sal_type : ", $sal_type;
1046	0					0	push(@greeting,$sal_default);
1047							}
1048							}
1049	3					19	return(join(' ',@greeting));
1050							}
1051							#-------------------------------------------------------------------------------
1052							# Return all name properties as a hash
1053
1054							sub properties
1055							{
1056	21			21	1	69	my $name = shift;
1057	21					15	return(%{ $name->{properties} });
	21					89
1058							}
1059
1060							#-------------------------------------------------------------------------------
1061							# Create a text report to standard output listing
1062							# - the input string,
1063							# - the name of each defined component, if it exists
1064							# - any non matching component
1065
1066							sub report
1067							{
1068	0			0	1	0	my $name = shift;
1069
1070	0					0	my %props = $name->properties;
1071
1072	0					0	my $fmt = "%-20.20s : %s\n";
1073
1074	0					0	printf($fmt,"Original Input",$name->{original_input});
1075	0					0	printf($fmt,"Cleaned Input",$name->{input_string});
1076	0					0	printf($fmt,"Case all",$name->case_all);
1077	0					0	printf($fmt,"Case all reversed",$name->case_all_reversed);
1078	0					0	printf($fmt,"Salutation",$name->salutation(salutation => 'Dear',sal_default => 'Friend', sal_type => 'title_plus_surname'));
1079	0					0	printf($fmt,"Type", $props{type});
1080	0					0	printf($fmt,"Number", $props{number});
1081	0					0	printf($fmt,"Parsing Error", $name->{error});
1082	0					0	printf($fmt,"Error description : ", $name->{error_desc});
1083	0					0	printf($fmt,"Parsing Warning", $name->{warning});
1084	0					0	printf($fmt,"Warning description", $name->{warning_desc});
1085
1086
1087	0	0				0	unless ($props{type} eq 'unknown')
1088							{
1089	0					0	my %comps = $name->components;
1090	0	0				0	if ( %comps )
1091							{
1092	0					0	print("\nCOMPONENTS\n");
1093	0					0	foreach my $value ( sort keys %comps)
1094							{
1095	0	0	0			0	if ($value and $comps{$value})
1096							{
1097	0					0	printf($fmt,$value,$comps{$value});
1098							}
1099							}
1100							}
1101							}
1102							}
1103							#-------------------------------------------------------------------------------
1104
1105							# PRIVATE METHODS
1106
1107							#-------------------------------------------------------------------------------
1108
1109							sub _pre_parse
1110							{
1111	28			28		28	my $name = shift;
1112
1113							# strip all full stops
1114	28					50	$name->{input_string} =~ s/\.//g;
1115
1116							# Fold all text to upper case, as these are used in all regular expressions withun thr grammar tree
1117	28					39	$name->{input_string} = uc($name->{input_string});
1118
1119							# Check that common reserved word (as found in company names) do not appear
1120	28	50				197	if ( $name->{input_string} =~
1121							/\BPTY LTD$\|\BLTD$\|\BPLC$\|ASSOCIATION\|DEPARTMENT\|NATIONAL\|SOCIETY/ )
1122							{
1123	0					0	$name->{error} = 1;
1124	0					0	$name->{comps}{non_matching} = $name->{input_string};
1125	0					0	$name->{error_desc} = 'Reserved words found in name';
1126							}
1127
1128							# For the case of a single name such as 'Voltaire' we need to add a trailing space
1129							# to the input string. This is because the grammar tree expects a terminator (the space)
1130							# optionally followed by other productions or non matching text
1131	28					33	$name->{input_string} .= ' ';
1132	28	50				148	if ( $name->{input_string} =~ /^[A-Z]{2,}(\-)?[A-Z]{0,}$/ )
1133							{
1134	0					0	$name->{input_string} .= ' ';
1135							}
1136	28					40	return($name);
1137
1138							}
1139							#-------------------------------------------------------------------------------
1140							# Initialise all components to empty string. Assemble hashes of components
1141							# and properties as part of the name object
1142							#
1143							sub _assemble
1144							{
1145	28			28		22	my $name = shift;
1146
1147							# Use Parse::RecDescent to do the parsing. 'full_name' is a label for the complete grammar tree
1148							# defined in Lingua::EN::NameParse::Grammar
1149	28					163	my $parsed_name = $name->{parse}->full_name($name->{input_string});
1150
1151							# Place components into a separate hash, so they can be easily returned
1152							# for the user to inspect and modify.
1153
1154	28					171857	my @all_comps = qw(precursor title_1 given_name_1 initials_1 middle_name surname_1 conjunction_1
1155							title_2 given_name_2 initials_2 surname_2 conjunction_2 suffix non_matching);
1156
1157	28					48	foreach my $comp (@all_comps)
1158							{
1159							# set all components to empty string, as any of them could be accessed, even if they don't exist
1160	392					380	$name->{comps}{$comp} = '';
1161	392	100				516	if (defined($parsed_name->{$comp}))
1162							{
1163							# Copy over existing components.
1164	135					156	$name->{comps}{$comp} = _trim_space($parsed_name->{$comp});
1165							}
1166							}
1167
1168	28					41	$name->{properties}{number} = 0;
1169	28					31	$name->{properties}{number} = $parsed_name->{number};
1170	28					35	$name->{properties}{type} = $parsed_name->{type};
1171
1172	28					73	return($name);
1173							}
1174							#-------------------------------------------------------------------------------
1175							# For correct matching, the grammar of each component must include the trailing space that separates it
1176							# from any following word. This should now be removed from the components, and will be restored by the
1177							# case_all and salutation methods, if called.
1178
1179							sub _trim_space
1180							{
1181	135			135		112	my ($string) = @_;
1182	135	100				157	if ($string)
1183							{
1184	109					213	$string =~ s/ $//;
1185							}
1186	135					191	return($string);
1187							}
1188							#-------------------------------------------------------------------------------
1189							# Check if any name components have illegal characters, or do not have the
1190							# correct syntax for a valid name.
1191
1192
1193							sub _validate
1194							{
1195	28			28		26	my $name = shift;
1196	28					67	my %comps = $name->components;
1197
1198	28	100				112	if ( $comps{non_matching} )
		50
1199							{
1200	2					3	$name->{warning} = 1;
1201	2					7	$name->{warning_desc} .= ";non_matching text found : $comps{non_matching}";
1202							}
1203							elsif ( $name->{input_string} =~ /[^A-Za-z\-\'\.,&\/ ]/ )
1204							{
1205							# illegal characters found
1206	0					0	$name->{error} = 1;
1207	0					0	$name->{error_desc} = 'illegal characters found';
1208							}
1209
1210
1211	28	50				44	if ( not _valid_name($comps{given_name_1}) )
		50
		50
		50
1212							{
1213	0					0	$name->{warning} = 1;
1214	0					0	$name->{warning_desc} .= ";no vowel sound in given_name_1 : $comps{given_name_1}";
1215							}
1216							elsif ( not _valid_name($comps{middle_name}) )
1217							{
1218	0					0	$name->{warning} = 1;
1219	0					0	$name->{warning_desc} .= ";no vowel sound in middle_name : $comps{middle_name}";
1220							}
1221
1222							elsif ( not _valid_name($comps{surname_1}) )
1223							{
1224	0					0	$name->{warning} = 1;
1225	0					0	$name->{warning_desc} .= ";no vowel sound in surname_1 : $comps{surname_1}";
1226
1227							}
1228							elsif ( not _valid_name($comps{surname_2}) )
1229							{
1230	0					0	$name->{warning} = 1;
1231	0					0	$name->{warning_desc} .= ";no vowel sound in surname_2 : $comps{surname_2}";
1232							}
1233							}
1234							#-------------------------------------------------------------------------------
1235							# If the name has an assigned value, check that it contains a vowel sound,
1236							# or matches the exceptions to this rule.
1237							# Returns 1 if name is valid, otherwise 0
1238
1239							sub _valid_name
1240							{
1241	112			112		86	my ($name) = @_;
1242	112	100	33			303	if ( not $name )
		50
1243							{
1244	65					150	return(1);
1245							}
1246							# Names should have a vowel sound,
1247							# valid exceptions are Ng, Tsz,Md, Cng,Hng,Chng etc
1248							elsif ( $name and $name =~ /[AEIOUYJ]\|^(NG\|TSZ\|MD\|(C?H\|[PTS])NG)$/i )
1249							{
1250	47					106	return(1);
1251							}
1252							else
1253							{
1254	0					0	return(0);
1255							}
1256							}
1257							#-------------------------------------------------------------------------------
1258							# Upper case first letter, lower case the rest, for all words in string
1259							sub _case_word
1260							{
1261	315			315		229	my ($word) = @_;
1262
1263	315	100				349	if ($word)
1264							{
1265	67					262	$word =~ s/(\w+)/\u\L$1/g;
1266							}
1267
1268	315					309	return($word);
1269							}
1270							#-------------------------------------------------------------------------------
1271							return(1);