| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::EN::TitleParse; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
999
|
use 5.006000; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
39
|
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
35
|
|
|
5
|
1
|
|
|
1
|
|
16
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
2859
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
sub new { |
|
10
|
4
|
|
|
4
|
0
|
5941
|
my ($class, %params) = @_; |
|
11
|
4
|
|
|
|
|
6
|
my $self = {}; |
|
12
|
4
|
|
|
|
|
15
|
bless $self, $class; |
|
13
|
4
|
100
|
|
|
|
17
|
$self->{titles} = $params{titles} ? $self->_load( $params{titles} ) : $self->_default_titles; |
|
14
|
4
|
100
|
|
|
|
21
|
$self->{clean} = $params{clean} ? 1 : 0; |
|
15
|
4
|
|
|
|
|
12
|
return $self; |
|
16
|
|
|
|
|
|
|
} |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# parse uses a hash-table of "normalised" titles to very efficiently identify |
|
19
|
|
|
|
|
|
|
# titles, regardless of the number of titles required to look up. |
|
20
|
|
|
|
|
|
|
# Normalised text for our purposes consists of lower-case \w characters, |
|
21
|
|
|
|
|
|
|
# all other characters and spaces being \W. |
|
22
|
|
|
|
|
|
|
# Using this technique we find titles regardless of case or other |
|
23
|
|
|
|
|
|
|
# punctuation used. e.g. MR, Mr., mr, and Mr can all be found. |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# Once we have identified the normalised title we then capture the real |
|
26
|
|
|
|
|
|
|
# title, with the punctuation and case as in the original string, by |
|
27
|
|
|
|
|
|
|
# counting forward the correct number of normalised characters and |
|
28
|
|
|
|
|
|
|
# capturing non-normalised characters along the way. |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub parse { |
|
31
|
|
|
|
|
|
|
|
|
32
|
16
|
|
|
16
|
1
|
9184
|
my ($self, $name) = @_; |
|
33
|
|
|
|
|
|
|
|
|
34
|
16
|
50
|
|
|
|
48
|
return () unless defined $name; |
|
35
|
|
|
|
|
|
|
|
|
36
|
16
|
|
|
|
|
27
|
my ($title, $remaining_name) = ('', $name); |
|
37
|
16
|
100
|
|
|
|
44
|
my $titles = ref $self ? $self->{titles} : $self->_default_titles; |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Try to find a normalised title using a hash lookup. |
|
40
|
|
|
|
|
|
|
# Split the name by spaces/non-word characters, then match in |
|
41
|
|
|
|
|
|
|
# reverse order against a list of normalised titles. |
|
42
|
|
|
|
|
|
|
# Take the largest matching title. |
|
43
|
16
|
|
|
|
|
20
|
my $normalised_title; |
|
44
|
16
|
|
|
|
|
86
|
my @name_chunks = split(/\W+/, lc $name); |
|
45
|
16
|
|
|
|
|
619
|
while (pop @name_chunks) { |
|
46
|
20
|
|
|
|
|
42
|
my $possible_title = join(" ", @name_chunks); |
|
47
|
20
|
100
|
|
|
|
53
|
if (exists $titles->{$possible_title}) { |
|
48
|
15
|
|
|
|
|
16
|
$normalised_title = $possible_title; |
|
49
|
15
|
|
|
|
|
21
|
last; |
|
50
|
|
|
|
|
|
|
} |
|
51
|
|
|
|
|
|
|
} |
|
52
|
|
|
|
|
|
|
|
|
53
|
16
|
100
|
|
|
|
26
|
if ($normalised_title) { |
|
54
|
|
|
|
|
|
|
# Find the normalised title in the real string |
|
55
|
|
|
|
|
|
|
# by counting the number of normalised characters |
|
56
|
|
|
|
|
|
|
# (ignore any spaces in the count) |
|
57
|
15
|
|
|
|
|
20
|
my $unspaced_title = $normalised_title; |
|
58
|
15
|
|
|
|
|
34
|
$unspaced_title =~ s/\s//g; |
|
59
|
15
|
|
|
|
|
17
|
my $character_count = length $unspaced_title; |
|
60
|
15
|
|
|
|
|
91
|
my @characters = split (//, $name); |
|
61
|
15
|
|
|
|
|
24
|
my @title_chars; |
|
62
|
15
|
|
66
|
|
|
74
|
while ($character_count > 0 && scalar @characters > 0) { |
|
63
|
89
|
|
|
|
|
160
|
my $character = shift @characters; |
|
64
|
89
|
|
|
|
|
139
|
push (@title_chars, $character); |
|
65
|
|
|
|
|
|
|
# only count down when we have a normalised character |
|
66
|
89
|
100
|
|
|
|
468
|
$character_count-- if $character =~ /^\w$/; |
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
# Now add any trailing un-normalised characters to the title too |
|
69
|
|
|
|
|
|
|
# e.g. for "Mr." we want the "." in "Mr." too, |
|
70
|
15
|
|
|
|
|
48
|
while ($characters[0] =~ /^\W$/) { |
|
71
|
20
|
|
|
|
|
62
|
push (@title_chars, shift @characters); |
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
|
|
74
|
15
|
|
|
|
|
35
|
$title = join("", @title_chars); |
|
75
|
15
|
|
|
|
|
30
|
$remaining_name = join("", @characters); |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# clean up any spaces at the point of separation |
|
78
|
15
|
|
|
|
|
54
|
$title =~ s/\s+$//; |
|
79
|
15
|
|
|
|
|
54
|
$remaining_name =~ s/^\s+//; |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Return a cleaned title if that option was set |
|
83
|
16
|
100
|
100
|
|
|
103
|
$title = $titles->{$normalised_title} if $normalised_title && ref $self && $self->{clean}; |
|
|
|
|
100
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
|
85
|
16
|
|
|
|
|
73
|
return ($title, $remaining_name); |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# This method must match how parse() |
|
89
|
|
|
|
|
|
|
# handles its input string. |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
sub normalise { |
|
92
|
3
|
|
|
3
|
0
|
4
|
my ($self, $title) = @_; |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# remove leading/trailing whitespace |
|
95
|
3
|
|
|
|
|
7
|
$title =~ s/^\s+//; |
|
96
|
3
|
|
|
|
|
8
|
$title =~ s/\s+$//; |
|
97
|
|
|
|
|
|
|
# remove punctuation & consolidate spaces |
|
98
|
3
|
|
|
|
|
9
|
$title =~ s/\W+/ /; |
|
99
|
|
|
|
|
|
|
# lower-case |
|
100
|
3
|
|
|
|
|
6
|
$title = lc($title); |
|
101
|
|
|
|
|
|
|
|
|
102
|
3
|
|
|
|
|
7
|
return $title; |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub titles { |
|
106
|
2
|
|
|
2
|
1
|
2087
|
my $self = shift; |
|
107
|
2
|
100
|
|
|
|
11
|
my $titles = ref $self ? $self->{titles} : $self->_default_titles; |
|
108
|
2
|
|
|
|
|
141
|
return sort values %$titles; |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub _load { |
|
112
|
1
|
|
|
1
|
|
1
|
my ($self, $titles) = @_; |
|
113
|
1
|
|
|
|
|
3
|
my $normalised_titles = {}; |
|
114
|
1
|
|
|
|
|
3
|
foreach my $title (@$titles) { |
|
115
|
3
|
|
|
|
|
11
|
my $normalised_title = $self->normalise($title); |
|
116
|
|
|
|
|
|
|
# Store the title in our hashref pointing at the original title |
|
117
|
3
|
|
|
|
|
8
|
$normalised_titles->{$normalised_title} = $title; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
1
|
|
|
|
|
3
|
return $normalised_titles; |
|
120
|
|
|
|
|
|
|
} |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
sub _default_titles { |
|
123
|
|
|
|
|
|
|
return { |
|
124
|
|
|
|
|
|
|
# Basic titles |
|
125
|
5
|
|
|
5
|
|
373
|
'mr' => 'Mr', |
|
126
|
|
|
|
|
|
|
'ms' => 'Ms', |
|
127
|
|
|
|
|
|
|
'mrs' => 'Mrs', |
|
128
|
|
|
|
|
|
|
'miss' => 'Miss', |
|
129
|
|
|
|
|
|
|
'mx' => 'Mx', |
|
130
|
|
|
|
|
|
|
'dr' => 'Dr', |
|
131
|
|
|
|
|
|
|
# Combined titles |
|
132
|
|
|
|
|
|
|
'mr and mrs' => 'Mr and Mrs', |
|
133
|
|
|
|
|
|
|
'mr mrs' => 'Mr & Mrs', |
|
134
|
|
|
|
|
|
|
# Extended titles |
|
135
|
|
|
|
|
|
|
'sir' => 'Sir', |
|
136
|
|
|
|
|
|
|
'dame' => 'Dame', |
|
137
|
|
|
|
|
|
|
'messrs' => 'Messrs', |
|
138
|
|
|
|
|
|
|
'madame' => 'Madame', |
|
139
|
|
|
|
|
|
|
'madam' => 'Madam', |
|
140
|
|
|
|
|
|
|
'mme' => 'Mme', |
|
141
|
|
|
|
|
|
|
'mister' => 'Mister', |
|
142
|
|
|
|
|
|
|
'master' => 'Master', |
|
143
|
|
|
|
|
|
|
'mast' => 'Mast', |
|
144
|
|
|
|
|
|
|
'msgr' => 'Msgr', |
|
145
|
|
|
|
|
|
|
'mgr' => 'Mgr', |
|
146
|
|
|
|
|
|
|
'count' => 'Count', |
|
147
|
|
|
|
|
|
|
'countess' => 'Countess', |
|
148
|
|
|
|
|
|
|
'duke' => 'Duke', |
|
149
|
|
|
|
|
|
|
'duchess' => 'Duchess', |
|
150
|
|
|
|
|
|
|
'lord' => 'Lord', |
|
151
|
|
|
|
|
|
|
'lady' => 'Lady', |
|
152
|
|
|
|
|
|
|
'marquis' => 'Marquis', |
|
153
|
|
|
|
|
|
|
'marquess' => 'Marquess', |
|
154
|
|
|
|
|
|
|
# Medical |
|
155
|
|
|
|
|
|
|
'doctor' => 'Doctor', |
|
156
|
|
|
|
|
|
|
'sister' => 'Sister', |
|
157
|
|
|
|
|
|
|
'matron' => 'Matron', |
|
158
|
|
|
|
|
|
|
'nurse' => 'Nurse', |
|
159
|
|
|
|
|
|
|
# Legal |
|
160
|
|
|
|
|
|
|
'judge' => 'Judge', |
|
161
|
|
|
|
|
|
|
'justice' => 'Justice', |
|
162
|
|
|
|
|
|
|
'attorney' => 'Attorney', |
|
163
|
|
|
|
|
|
|
'solicitor' => 'Solicitor', |
|
164
|
|
|
|
|
|
|
'barrister' => 'Barrister', |
|
165
|
|
|
|
|
|
|
'qc' => 'QC', |
|
166
|
|
|
|
|
|
|
'kc' => 'KC', |
|
167
|
|
|
|
|
|
|
# Police |
|
168
|
|
|
|
|
|
|
'det' => 'Det', |
|
169
|
|
|
|
|
|
|
'detective' => 'Detective', |
|
170
|
|
|
|
|
|
|
'insp' => 'Insp', |
|
171
|
|
|
|
|
|
|
'inspector' => 'Inspector', |
|
172
|
|
|
|
|
|
|
# Military |
|
173
|
|
|
|
|
|
|
'brig' => 'Brig', |
|
174
|
|
|
|
|
|
|
'brigadier' => 'Brigadier', |
|
175
|
|
|
|
|
|
|
'captain' => 'Captain', |
|
176
|
|
|
|
|
|
|
'capt' => 'Capt', |
|
177
|
|
|
|
|
|
|
'colonel' => 'Colonel', |
|
178
|
|
|
|
|
|
|
'col' => 'Col', |
|
179
|
|
|
|
|
|
|
'commander in chief' => 'Commander in Chief', |
|
180
|
|
|
|
|
|
|
'commander' => 'Commander', |
|
181
|
|
|
|
|
|
|
'commodore' => 'Commodore', |
|
182
|
|
|
|
|
|
|
'cdr' => 'Cdr', |
|
183
|
|
|
|
|
|
|
'field marshall' => 'Field Marshall', |
|
184
|
|
|
|
|
|
|
'fl off' => 'Fl Off', |
|
185
|
|
|
|
|
|
|
'flight officer' => 'Flight Officer', |
|
186
|
|
|
|
|
|
|
'flt lt' => 'Flt Lt', |
|
187
|
|
|
|
|
|
|
'flight lieutenant' => 'Flight Lieutenant', |
|
188
|
|
|
|
|
|
|
'general of the army' => 'General of the Army', |
|
189
|
|
|
|
|
|
|
'general' => 'General', |
|
190
|
|
|
|
|
|
|
'gen' => 'Gen', |
|
191
|
|
|
|
|
|
|
'pte' => 'Pte', |
|
192
|
|
|
|
|
|
|
'private' => 'Private', |
|
193
|
|
|
|
|
|
|
'sgt' => 'Sgt', |
|
194
|
|
|
|
|
|
|
'sargent' => 'Sargent', |
|
195
|
|
|
|
|
|
|
'air commander' => 'Air Commander', |
|
196
|
|
|
|
|
|
|
'air commodore' => 'Air Commodore', |
|
197
|
|
|
|
|
|
|
'air marshall' => 'Air Marshall', |
|
198
|
|
|
|
|
|
|
'lieutenant colonel' => 'Lieutenant Colonel', |
|
199
|
|
|
|
|
|
|
'lt col' => 'Lt Col', |
|
200
|
|
|
|
|
|
|
'lt gen' => 'Lt Gen', |
|
201
|
|
|
|
|
|
|
'lt cdr' => 'Lt Cdr', |
|
202
|
|
|
|
|
|
|
'lieutenant' => 'Lieutenant', |
|
203
|
|
|
|
|
|
|
'lt' => 'Lt', |
|
204
|
|
|
|
|
|
|
'leut' => 'Leut', |
|
205
|
|
|
|
|
|
|
'lieut' => 'Lieut', |
|
206
|
|
|
|
|
|
|
'major general' => 'Major General', |
|
207
|
|
|
|
|
|
|
'maj gen' => 'Maj Gen', |
|
208
|
|
|
|
|
|
|
'major' => 'Major', |
|
209
|
|
|
|
|
|
|
'maj' => 'Maj', |
|
210
|
|
|
|
|
|
|
'pilot officer' => 'Pilot Officer', |
|
211
|
|
|
|
|
|
|
# Religious |
|
212
|
|
|
|
|
|
|
'rabbi' => 'Rabbi', |
|
213
|
|
|
|
|
|
|
'bishop' => 'Bishop', |
|
214
|
|
|
|
|
|
|
'brother' => 'Brother', |
|
215
|
|
|
|
|
|
|
'chaplain' => 'Chaplain', |
|
216
|
|
|
|
|
|
|
'father' => 'Father', |
|
217
|
|
|
|
|
|
|
'pastor' => 'Pastor', |
|
218
|
|
|
|
|
|
|
'mother superior' => 'Mother Superior', |
|
219
|
|
|
|
|
|
|
'mother' => 'Mother', |
|
220
|
|
|
|
|
|
|
'most reverend' => 'Most Reverend', |
|
221
|
|
|
|
|
|
|
'most reverand' => 'Most Reverand', |
|
222
|
|
|
|
|
|
|
'very reverend' => 'Very Reverend', |
|
223
|
|
|
|
|
|
|
'very reverand' => 'Very Reverand', |
|
224
|
|
|
|
|
|
|
'reverend' => 'Reverend', |
|
225
|
|
|
|
|
|
|
'reverand' => 'Reverand', |
|
226
|
|
|
|
|
|
|
'mt revd' => 'Mt Revd', |
|
227
|
|
|
|
|
|
|
'v revd' => 'V Revd', |
|
228
|
|
|
|
|
|
|
'revd' => 'Revd', |
|
229
|
|
|
|
|
|
|
# Academic |
|
230
|
|
|
|
|
|
|
'professor' => 'Professor', |
|
231
|
|
|
|
|
|
|
'prof' => 'Prof', |
|
232
|
|
|
|
|
|
|
'associate professor' => 'Associate Professor', |
|
233
|
|
|
|
|
|
|
'assoc prof' => 'Assoc Prof', |
|
234
|
|
|
|
|
|
|
# Other |
|
235
|
|
|
|
|
|
|
'alderman' => 'Alderman', |
|
236
|
|
|
|
|
|
|
'ald' => 'Ald', |
|
237
|
|
|
|
|
|
|
# These might be followed by another title |
|
238
|
|
|
|
|
|
|
# in which case we will fail to pick that up. |
|
239
|
|
|
|
|
|
|
'his excellency' => 'His Excellency', |
|
240
|
|
|
|
|
|
|
'his honour' => 'His Honour', |
|
241
|
|
|
|
|
|
|
'his honor' => 'His Honor', |
|
242
|
|
|
|
|
|
|
'her excellency' => 'Her Excellency', |
|
243
|
|
|
|
|
|
|
'her honour' => 'Her Honour', |
|
244
|
|
|
|
|
|
|
'her honor' => 'Her Honor', |
|
245
|
|
|
|
|
|
|
'the right honourable' => 'The Right Honourable', |
|
246
|
|
|
|
|
|
|
'the right honorable' => 'The Right Honorable', |
|
247
|
|
|
|
|
|
|
'the honourable' => 'The Honourable', |
|
248
|
|
|
|
|
|
|
'the honorable' => 'The Honorable', |
|
249
|
|
|
|
|
|
|
'right honourable' => 'Right Honourable', |
|
250
|
|
|
|
|
|
|
'right honorable' => 'Right Honorable', |
|
251
|
|
|
|
|
|
|
'rt hon' => 'Rt Hon', |
|
252
|
|
|
|
|
|
|
'rt hon' => 'Rt Hon', |
|
253
|
|
|
|
|
|
|
'the hon' => 'The Hon', |
|
254
|
|
|
|
|
|
|
'the hon' => 'The Hon', |
|
255
|
|
|
|
|
|
|
}; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
1; |
|
259
|
|
|
|
|
|
|
__END__ |