File Coverage

blib/lib/Lingua/EN/TitleParse.pm

Criterion	Covered	Total	%
statement	60	60	100.0
branch	17	18	94.4
condition	8	9	88.8
subroutine	9	9	100.0
pod	2	4	50.0
total	96	100	96.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::EN::TitleParse;
2
3	1			1		999	use 5.006000;
	1					4
	1					39
4	1			1		6	use strict;
	1					2
	1					35
5	1			1		16	use warnings;
	1					2
	1					2859
6
7							our $VERSION = '0.01';
8
9							sub new {
10	4			4	0	5941	my ($class, %params) = @_;
11	4					6	my $self = {};
12	4					15	bless $self, $class;
13	4	100				17	$self->{titles} = $params{titles} ? $self->_load( $params{titles} ) : $self->_default_titles;
14	4	100				21	$self->{clean} = $params{clean} ? 1 : 0;
15	4					12	return $self;
16							}
17
18							# parse uses a hash-table of "normalised" titles to very efficiently identify
19							# titles, regardless of the number of titles required to look up.
20							# Normalised text for our purposes consists of lower-case \w characters,
21							# all other characters and spaces being \W.
22							# Using this technique we find titles regardless of case or other
23							# punctuation used. e.g. MR, Mr., mr, and Mr can all be found.
24							#
25							# Once we have identified the normalised title we then capture the real
26							# title, with the punctuation and case as in the original string, by
27							# counting forward the correct number of normalised characters and
28							# capturing non-normalised characters along the way.
29
30							sub parse {
31
32	16			16	1	9184	my ($self, $name) = @_;
33
34	16	50				48	return () unless defined $name;
35
36	16					27	my ($title, $remaining_name) = ('', $name);
37	16	100				44	my $titles = ref $self ? $self->{titles} : $self->_default_titles;
38
39							# Try to find a normalised title using a hash lookup.
40							# Split the name by spaces/non-word characters, then match in
41							# reverse order against a list of normalised titles.
42							# Take the largest matching title.
43	16					20	my $normalised_title;
44	16					86	my @name_chunks = split(/\W+/, lc $name);
45	16					619	while (pop @name_chunks) {
46	20					42	my $possible_title = join(" ", @name_chunks);
47	20	100				53	if (exists $titles->{$possible_title}) {
48	15					16	$normalised_title = $possible_title;
49	15					21	last;
50							}
51							}
52
53	16	100				26	if ($normalised_title) {
54							# Find the normalised title in the real string
55							# by counting the number of normalised characters
56							# (ignore any spaces in the count)
57	15					20	my $unspaced_title = $normalised_title;
58	15					34	$unspaced_title =~ s/\s//g;
59	15					17	my $character_count = length $unspaced_title;
60	15					91	my @characters = split (//, $name);
61	15					24	my @title_chars;
62	15		66			74	while ($character_count > 0 && scalar @characters > 0) {
63	89					160	my $character = shift @characters;
64	89					139	push (@title_chars, $character);
65							# only count down when we have a normalised character
66	89	100				468	$character_count-- if $character =~ /^\w$/;
67							}
68							# Now add any trailing un-normalised characters to the title too
69							# e.g. for "Mr." we want the "." in "Mr." too,
70	15					48	while ($characters[0] =~ /^\W$/) {
71	20					62	push (@title_chars, shift @characters);
72							}
73
74	15					35	$title = join("", @title_chars);
75	15					30	$remaining_name = join("", @characters);
76
77							# clean up any spaces at the point of separation
78	15					54	$title =~ s/\s+$//;
79	15					54	$remaining_name =~ s/^\s+//;
80							}
81
82							# Return a cleaned title if that option was set
83	16	100	100			103	$title = $titles->{$normalised_title} if $normalised_title && ref $self && $self->{clean};
			100
84
85	16					73	return ($title, $remaining_name);
86							}
87
88							# This method must match how parse()
89							# handles its input string.
90
91							sub normalise {
92	3			3	0	4	my ($self, $title) = @_;
93
94							# remove leading/trailing whitespace
95	3					7	$title =~ s/^\s+//;
96	3					8	$title =~ s/\s+$//;
97							# remove punctuation & consolidate spaces
98	3					9	$title =~ s/\W+/ /;
99							# lower-case
100	3					6	$title = lc($title);
101
102	3					7	return $title;
103							}
104
105							sub titles {
106	2			2	1	2087	my $self = shift;
107	2	100				11	my $titles = ref $self ? $self->{titles} : $self->_default_titles;
108	2					141	return sort values %$titles;
109							}
110
111							sub _load {
112	1			1		1	my ($self, $titles) = @_;
113	1					3	my $normalised_titles = {};
114	1					3	foreach my $title (@$titles) {
115	3					11	my $normalised_title = $self->normalise($title);
116							# Store the title in our hashref pointing at the original title
117	3					8	$normalised_titles->{$normalised_title} = $title;
118							}
119	1					3	return $normalised_titles;
120							}
121
122							sub _default_titles {
123							return {
124							# Basic titles
125	5			5		373	'mr' => 'Mr',
126							'ms' => 'Ms',
127							'mrs' => 'Mrs',
128							'miss' => 'Miss',
129							'mx' => 'Mx',
130							'dr' => 'Dr',
131							# Combined titles
132							'mr and mrs' => 'Mr and Mrs',
133							'mr mrs' => 'Mr & Mrs',
134							# Extended titles
135							'sir' => 'Sir',
136							'dame' => 'Dame',
137							'messrs' => 'Messrs',
138							'madame' => 'Madame',
139							'madam' => 'Madam',
140							'mme' => 'Mme',
141							'mister' => 'Mister',
142							'master' => 'Master',
143							'mast' => 'Mast',
144							'msgr' => 'Msgr',
145							'mgr' => 'Mgr',
146							'count' => 'Count',
147							'countess' => 'Countess',
148							'duke' => 'Duke',
149							'duchess' => 'Duchess',
150							'lord' => 'Lord',
151							'lady' => 'Lady',
152							'marquis' => 'Marquis',
153							'marquess' => 'Marquess',
154							# Medical
155							'doctor' => 'Doctor',
156							'sister' => 'Sister',
157							'matron' => 'Matron',
158							'nurse' => 'Nurse',
159							# Legal
160							'judge' => 'Judge',
161							'justice' => 'Justice',
162							'attorney' => 'Attorney',
163							'solicitor' => 'Solicitor',
164							'barrister' => 'Barrister',
165							'qc' => 'QC',
166							'kc' => 'KC',
167							# Police
168							'det' => 'Det',
169							'detective' => 'Detective',
170							'insp' => 'Insp',
171							'inspector' => 'Inspector',
172							# Military
173							'brig' => 'Brig',
174							'brigadier' => 'Brigadier',
175							'captain' => 'Captain',
176							'capt' => 'Capt',
177							'colonel' => 'Colonel',
178							'col' => 'Col',
179							'commander in chief' => 'Commander in Chief',
180							'commander' => 'Commander',
181							'commodore' => 'Commodore',
182							'cdr' => 'Cdr',
183							'field marshall' => 'Field Marshall',
184							'fl off' => 'Fl Off',
185							'flight officer' => 'Flight Officer',
186							'flt lt' => 'Flt Lt',
187							'flight lieutenant' => 'Flight Lieutenant',
188							'general of the army' => 'General of the Army',
189							'general' => 'General',
190							'gen' => 'Gen',
191							'pte' => 'Pte',
192							'private' => 'Private',
193							'sgt' => 'Sgt',
194							'sargent' => 'Sargent',
195							'air commander' => 'Air Commander',
196							'air commodore' => 'Air Commodore',
197							'air marshall' => 'Air Marshall',
198							'lieutenant colonel' => 'Lieutenant Colonel',
199							'lt col' => 'Lt Col',
200							'lt gen' => 'Lt Gen',
201							'lt cdr' => 'Lt Cdr',
202							'lieutenant' => 'Lieutenant',
203							'lt' => 'Lt',
204							'leut' => 'Leut',
205							'lieut' => 'Lieut',
206							'major general' => 'Major General',
207							'maj gen' => 'Maj Gen',
208							'major' => 'Major',
209							'maj' => 'Maj',
210							'pilot officer' => 'Pilot Officer',
211							# Religious
212							'rabbi' => 'Rabbi',
213							'bishop' => 'Bishop',
214							'brother' => 'Brother',
215							'chaplain' => 'Chaplain',
216							'father' => 'Father',
217							'pastor' => 'Pastor',
218							'mother superior' => 'Mother Superior',
219							'mother' => 'Mother',
220							'most reverend' => 'Most Reverend',
221							'most reverand' => 'Most Reverand',
222							'very reverend' => 'Very Reverend',
223							'very reverand' => 'Very Reverand',
224							'reverend' => 'Reverend',
225							'reverand' => 'Reverand',
226							'mt revd' => 'Mt Revd',
227							'v revd' => 'V Revd',
228							'revd' => 'Revd',
229							# Academic
230							'professor' => 'Professor',
231							'prof' => 'Prof',
232							'associate professor' => 'Associate Professor',
233							'assoc prof' => 'Assoc Prof',
234							# Other
235							'alderman' => 'Alderman',
236							'ald' => 'Ald',
237							# These might be followed by another title
238							# in which case we will fail to pick that up.
239							'his excellency' => 'His Excellency',
240							'his honour' => 'His Honour',
241							'his honor' => 'His Honor',
242							'her excellency' => 'Her Excellency',
243							'her honour' => 'Her Honour',
244							'her honor' => 'Her Honor',
245							'the right honourable' => 'The Right Honourable',
246							'the right honorable' => 'The Right Honorable',
247							'the honourable' => 'The Honourable',
248							'the honorable' => 'The Honorable',
249							'right honourable' => 'Right Honourable',
250							'right honorable' => 'Right Honorable',
251							'rt hon' => 'Rt Hon',
252							'rt hon' => 'Rt Hon',
253							'the hon' => 'The Hon',
254							'the hon' => 'The Hon',
255							};
256							}
257
258							1;
259							__END__