line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::EN::TitleParse; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
999
|
use 5.006000; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
39
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
35
|
|
5
|
1
|
|
|
1
|
|
16
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
2859
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Constructor.
#
# Named parameters:
#   titles - optional array reference of title strings; when given (and true)
#            it is handed to _load() to build the lookup table, otherwise the
#            table returned by _default_titles() is used.
#   clean  - optional flag, stored as 1 or 0.
#
# Returns the new object, blessed into $class.
sub new {
    my ($class, %args) = @_;

    my $self = bless {}, $class;

    if ($args{titles}) {
        $self->{titles} = $self->_load($args{titles});
    }
    else {
        $self->{titles} = $self->_default_titles;
    }

    # Collapse whatever truthy/falsy value was supplied to a plain 1 or 0.
    $self->{clean} = $args{clean} ? 1 : 0;

    return $self;
}
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# parse uses a hash-table of "normalised" titles to very efficiently identify |
19
|
|
|
|
|
|
|
# titles, regardless of the number of titles required to look up. |
20
|
|
|
|
|
|
|
# Normalised text for our purposes consists of lower-case \w characters, |
21
|
|
|
|
|
|
|
# all other characters and spaces being \W. |
22
|
|
|
|
|
|
|
# Using this technique we find titles regardless of case or other |
23
|
|
|
|
|
|
|
# punctuation used. e.g. MR, Mr., mr, and Mr can all be found. |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# Once we have identified the normalised title we then capture the real |
26
|
|
|
|
|
|
|
# title, with the punctuation and case as in the original string, by |
27
|
|
|
|
|
|
|
# counting forward the correct number of normalised characters and |
28
|
|
|
|
|
|
|
# capturing non-normalised characters along the way. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Split a leading title off a name.
#
# May be called on an object, in which case that object's title table and
# "clean" setting are used, or as a class method, in which case the table
# from _default_titles() is used and the title is never cleaned.
#
# Parameters:
#   $name - the name string to examine.
#
# Returns the two-element list ($title, $remaining_name). When no known title
# leads the name, $title is the empty string and $remaining_name is $name
# unchanged. Returns the empty list when $name is undefined.
sub parse {
    my ($self, $name) = @_;

    return () unless defined $name;

    my ($title, $remaining_name) = ('', $name);
    my $titles = ref $self ? $self->{titles} : $self->_default_titles;

    # Break the lower-cased name into runs of word characters.
    my @name_chunks = split(/\W+/, lc $name);

    # split() keeps a leading empty field when the name starts with
    # whitespace or punctuation. Left in place it would prefix every
    # candidate below with a space, so no title could ever match.
    shift @name_chunks if @name_chunks && $name_chunks[0] eq '';

    # Drop chunks from the right, one at a time, and look up what is left.
    # The first hit is therefore the longest leading title. The loop tests
    # the array itself rather than the popped value, so that a chunk of "0"
    # does not end the search early.
    my $normalised_title;
    while (@name_chunks) {
        pop @name_chunks;
        my $possible_title = join(' ', @name_chunks);
        if (exists $titles->{$possible_title}) {
            $normalised_title = $possible_title;
            last;
        }
    }

    if (defined($normalised_title) && length($normalised_title)) {
        # Locate the end of the title in the original string by counting
        # off as many word characters as the normalised title contains
        # (spaces in the normalised title are not counted).
        (my $unspaced_title = $normalised_title) =~ s/\s//g;
        my $character_count = length $unspaced_title;

        my @characters = split(//, $name);
        my @title_chars;
        while ($character_count > 0 && @characters) {
            my $character = shift @characters;
            push @title_chars, $character;
            # only count down when we have a normalised (word) character
            $character_count-- if $character =~ /^\w$/;
        }

        # Trailing non-word characters belong to the title too, e.g. the
        # "." in "Mr.". The emptiness guard avoids matching against undef
        # should the string run out.
        while (@characters && $characters[0] =~ /^\W$/) {
            push @title_chars, shift @characters;
        }

        $title          = join('', @title_chars);
        $remaining_name = join('', @characters);

        # Clean up whitespace around the title and at the point of separation.
        $title          =~ s/^\s+//;
        $title          =~ s/\s+$//;
        $remaining_name =~ s/^\s+//;

        # Return the table's own spelling of the title if that option was set.
        $title = $titles->{$normalised_title} if ref $self && $self->{clean};
    }

    return ($title, $remaining_name);
}
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# This method must match how parse() |
89
|
|
|
|
|
|
|
# handles its input string. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# Reduce a title to the normalised form used as a lookup key: lower-case
# runs of word characters separated by single spaces, with nothing leading
# or trailing. This mirrors parse(), which lower-cases its input and splits
# it on /\W+/ before joining the pieces with single spaces.
#
# Parameters:
#   $title - the title string to normalise.
#
# Returns the normalised string.
sub normalise {
    my ($self, $title) = @_;

    $title = lc $title;

    # Replace EVERY run of non-word characters (punctuation, whitespace)
    # with one space; without /g only the first run would be replaced and
    # "Mr. and Mrs." would keep its punctuation.
    $title =~ s/\W+/ /g;

    # Trim after the replacement, so that leading or trailing punctuation
    # does not leave a stray space behind.
    $title =~ s/^ //;
    $title =~ s/ $//;

    return $title;
}
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# List the titles that can be recognised.
#
# May be called on an object (its own title table is used) or as a class
# method (the table from _default_titles() is used).
#
# Returns the table's values, sorted as strings. In scalar context the
# number of titles is returned.
sub titles {
    my $self = shift;

    my $titles = ref $self ? $self->{titles} : $self->_default_titles;

    # Sort into an array first: "return sort ..." has undefined behaviour
    # when the caller imposes scalar context, whereas returning an array
    # gives the element count there.
    my @sorted_titles = sort values %$titles;

    return @sorted_titles;
}
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Build a title lookup table from a list of titles.
#
# Parameters:
#   $titles - array reference of title strings.
#
# Returns a hash reference keyed by the normalise()d form of each title,
# with the title exactly as supplied as the value. When two titles share a
# normalised form, the later one in the list wins.
sub _load {
    my ($self, $titles) = @_;

    my %title_for;
    for my $original (@{$titles}) {
        # Key on the normalised form, but remember the caller's spelling.
        $title_for{ $self->normalise($original) } = $original;
    }

    return \%title_for;
}
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# The built-in title table.
#
# Returns a reference to a newly built hash on every call. Keys are
# normalised titles (lower-case words separated by single spaces); values
# are the spelling reported when a "clean" title is requested.
#
# Some historical misspellings ('field marshall', 'air marshall', 'sargent',
# 'reverand') are kept so that input using them is still recognised; the
# correctly spelled ranks are listed alongside them.
sub _default_titles {
    return {
        # Basic titles
        'mr'                   => 'Mr',
        'ms'                   => 'Ms',
        'mrs'                  => 'Mrs',
        'miss'                 => 'Miss',
        'mx'                   => 'Mx',
        'dr'                   => 'Dr',
        # Combined titles
        'mr and mrs'           => 'Mr and Mrs',
        'mr mrs'               => 'Mr & Mrs',
        # Extended titles
        'sir'                  => 'Sir',
        'dame'                 => 'Dame',
        'messrs'               => 'Messrs',
        'madame'               => 'Madame',
        'madam'                => 'Madam',
        'mme'                  => 'Mme',
        'mister'               => 'Mister',
        'master'               => 'Master',
        'mast'                 => 'Mast',
        'msgr'                 => 'Msgr',
        'mgr'                  => 'Mgr',
        'count'                => 'Count',
        'countess'             => 'Countess',
        'duke'                 => 'Duke',
        'duchess'              => 'Duchess',
        'lord'                 => 'Lord',
        'lady'                 => 'Lady',
        'marquis'              => 'Marquis',
        'marquess'             => 'Marquess',
        # Medical
        'doctor'               => 'Doctor',
        'sister'               => 'Sister',
        'matron'               => 'Matron',
        'nurse'                => 'Nurse',
        # Legal
        'judge'                => 'Judge',
        'justice'              => 'Justice',
        'attorney'             => 'Attorney',
        'solicitor'            => 'Solicitor',
        'barrister'            => 'Barrister',
        'qc'                   => 'QC',
        'kc'                   => 'KC',
        # Police
        'det'                  => 'Det',
        'detective'            => 'Detective',
        'insp'                 => 'Insp',
        'inspector'            => 'Inspector',
        # Military
        'brig'                 => 'Brig',
        'brigadier'            => 'Brigadier',
        'captain'              => 'Captain',
        'capt'                 => 'Capt',
        'colonel'              => 'Colonel',
        'col'                  => 'Col',
        'commander in chief'   => 'Commander in Chief',
        'commander'            => 'Commander',
        'commodore'            => 'Commodore',
        'cdr'                  => 'Cdr',
        'field marshal'        => 'Field Marshal',
        'field marshall'       => 'Field Marshall',
        'fl off'               => 'Fl Off',
        'flight officer'       => 'Flight Officer',
        'flt lt'               => 'Flt Lt',
        'flight lieutenant'    => 'Flight Lieutenant',
        'general of the army'  => 'General of the Army',
        'general'              => 'General',
        'gen'                  => 'Gen',
        'pte'                  => 'Pte',
        'private'              => 'Private',
        'sgt'                  => 'Sgt',
        'sergeant'             => 'Sergeant',
        'sargent'              => 'Sargent',
        'air commander'        => 'Air Commander',
        'air commodore'        => 'Air Commodore',
        'air marshal'          => 'Air Marshal',
        'air marshall'         => 'Air Marshall',
        'lieutenant colonel'   => 'Lieutenant Colonel',
        'lt col'               => 'Lt Col',
        'lt gen'               => 'Lt Gen',
        'lt cdr'               => 'Lt Cdr',
        'lieutenant'           => 'Lieutenant',
        'lt'                   => 'Lt',
        'leut'                 => 'Leut',
        'lieut'                => 'Lieut',
        'major general'        => 'Major General',
        'maj gen'              => 'Maj Gen',
        'major'                => 'Major',
        'maj'                  => 'Maj',
        'pilot officer'        => 'Pilot Officer',
        # Religious
        'rabbi'                => 'Rabbi',
        'bishop'               => 'Bishop',
        'brother'              => 'Brother',
        'chaplain'             => 'Chaplain',
        'father'               => 'Father',
        'pastor'               => 'Pastor',
        'mother superior'      => 'Mother Superior',
        'mother'               => 'Mother',
        'most reverend'        => 'Most Reverend',
        'most reverand'        => 'Most Reverand',
        'very reverend'        => 'Very Reverend',
        'very reverand'        => 'Very Reverand',
        'reverend'             => 'Reverend',
        'reverand'             => 'Reverand',
        'mt revd'              => 'Mt Revd',
        'v revd'               => 'V Revd',
        'revd'                 => 'Revd',
        # Academic
        'professor'            => 'Professor',
        'prof'                 => 'Prof',
        'associate professor'  => 'Associate Professor',
        'assoc prof'           => 'Assoc Prof',
        # Other
        'alderman'             => 'Alderman',
        'ald'                  => 'Ald',
        # These might be followed by another title
        # in which case we will fail to pick that up.
        'his excellency'       => 'His Excellency',
        'his honour'           => 'His Honour',
        'his honor'            => 'His Honor',
        'her excellency'       => 'Her Excellency',
        'her honour'           => 'Her Honour',
        'her honor'            => 'Her Honor',
        'the right honourable' => 'The Right Honourable',
        'the right honorable'  => 'The Right Honorable',
        'the honourable'       => 'The Honourable',
        'the honorable'        => 'The Honorable',
        'right honourable'     => 'Right Honourable',
        'right honorable'      => 'Right Honorable',
        'rt hon'               => 'Rt Hon',
        'the hon'              => 'The Hon',
    };
}
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
1; |
259
|
|
|
|
|
|
|
__END__ |