line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Lingua::EN::AddressParse - extract components of a street address from free format text |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Lingua::EN::AddressParse; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
my %args = |
10
|
|
|
|
|
|
|
( |
11
|
|
|
|
|
|
|
country => 'US', |
12
|
|
|
|
|
|
|
auto_clean => 1, |
13
|
|
|
|
|
|
|
force_case => 1, |
14
|
|
|
|
|
|
|
abbreviate_subcountry => 0, |
15
|
|
|
|
|
|
|
abbreviated_subcountry_only => 0, |
16
|
|
|
|
|
|
|
force_post_code => 0 |
17
|
|
|
|
|
|
|
); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my $address = Lingua::EN::AddressParse->new(%args); |
20
|
|
|
|
|
|
|
$error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL"); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
print $address->report; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
Country address format 'US' |
25
|
|
|
|
|
|
|
Address type 'suburban' |
26
|
|
|
|
|
|
|
Non matching part 'HOLD MAIL ' |
27
|
|
|
|
|
|
|
Error '1' |
28
|
|
|
|
|
|
|
Error descriptions 'non matching section : HOLD MAIL ' |
29
|
|
|
|
|
|
|
Warning '1' |
30
|
|
|
|
|
|
|
Warning description '' |
31
|
|
|
|
|
|
|
Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100' |
32
|
|
|
|
|
|
|
COMPONENTS '' |
33
|
|
|
|
|
|
|
base_street_name 'Old Massachusetts' |
34
|
|
|
|
|
|
|
post_code '98100' |
35
|
|
|
|
|
|
|
property_identifier '40 1/2' |
36
|
|
|
|
|
|
|
street_direction_prefix 'N' |
37
|
|
|
|
|
|
|
street_name 'N Old Massachusetts' |
38
|
|
|
|
|
|
|
street_type 'Ave' |
39
|
|
|
|
|
|
|
sub_property_identifier '3B' |
40
|
|
|
|
|
|
|
sub_property_type 'Apt' |
41
|
|
|
|
|
|
|
subcountry 'WASHINGTON' |
42
|
|
|
|
|
|
|
suburb 'Washington Valley' |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
%address_components = $address->components; |
45
|
|
|
|
|
|
|
print $address_components{sub_property_type}; # APT |
46
|
|
|
|
|
|
|
print $address_components{sub_property_identifier}; # 3B |
47
|
|
|
|
|
|
|
print $address_components{property_identifier}; # 40 1/2 |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
%address_properties = $address->properties; |
50
|
|
|
|
|
|
|
print $address_properties{type}; # suburban |
51
|
|
|
|
|
|
|
print $address_properties{non_matching}; # : HOLD MAIL |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
$correct_casing = $address->case_all; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 DESCRIPTION |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This module takes as input a suburban, rural or postal address in free format |
59
|
|
|
|
|
|
|
text such as, |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
3080 28TH AVE N ST PETERSBURG, FL 33713-3810 |
62
|
|
|
|
|
|
|
12 1st Avenue N Suite # 2 Somewhere CA 12345 USA |
63
|
|
|
|
|
|
|
C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317 |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
9 Church Street, Abertillery, Mid Glamorgan NP13 1DA |
66
|
|
|
|
|
|
|
27 Bury Street, Abingdon, Oxfordshire OX14 3QT |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
2A O'CONNELL ST KEW NSW 2123 |
69
|
|
|
|
|
|
|
12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133 |
70
|
|
|
|
|
|
|
"OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA |
71
|
|
|
|
|
|
|
GPO Box K318, HAYMARKET, NSW 2000 |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
and attempts to parse it. If successful, the address is broken |
75
|
|
|
|
|
|
|
down into its components and useful functions can be performed such as: |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
converting upper or lower case values to title case (2A O'Connell St Kew NSW 2123) |
78
|
|
|
|
|
|
|
extracting the address's individual components (2A,O'Connell,St,KEW,NSW,2123) |
79
|
|
|
|
|
|
|
determining the type of format the address is in ('suburban') |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
If the address cannot be parsed you have the option of cleaning the address |
83
|
|
|
|
|
|
|
of bad characters, or extracting any portion that was parsed and the portion |
84
|
|
|
|
|
|
|
that failed. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
This module can be used for analysing and improving the quality of |
87
|
|
|
|
|
|
|
lists of residential and postal addresses. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
By using a large combination of regular expressions with look ahead analysis, patterns |
90
|
|
|
|
|
|
|
can be parsed that confuse many other parsers. Examples are |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Street names with several street types: Lane Cove Road |
93
|
|
|
|
|
|
|
Suburbs which include street types: Smith Road St Marys |
94
|
|
|
|
|
|
|
Suburbs that include state names: Fort Washington Washington |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 DEFINITIONS |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
The following terms are used by AddressParse to define the components that |
99
|
|
|
|
|
|
|
can make up an address. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Pre cursor : C/O MR A Smith... |
102
|
|
|
|
|
|
|
Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ... |
103
|
|
|
|
|
|
|
Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44, 2.5 |
104
|
|
|
|
|
|
|
Property name : "Old Regret" |
105
|
|
|
|
|
|
|
Post Box : GPO Box K123, LPO 2345, RMS 23 ... |
106
|
|
|
|
|
|
|
Road Box : RMB 24A, RMS 234 ... |
107
|
|
|
|
|
|
|
Street Direction: North, SE, Sth. etc |
108
|
|
|
|
|
|
|
Street name : O'Hare, New South Head, The Causeway, Broadway |
109
|
|
|
|
|
|
|
Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ... |
110
|
|
|
|
|
|
|
Suburb : Dee Why, St. John's Wood ... |
111
|
|
|
|
|
|
|
Sub country : NSW, New South Wales, ACT, NY, New Jersey, AZ ... |
112
|
|
|
|
|
|
|
Post (zip) code : 2062, 34532-1234, SG12A 9ET |
113
|
|
|
|
|
|
|
Country : Australia, UK, US or Canada |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
The main address formats currently supported are as follows. (a ? means the component is optional): |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?) country(?) |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
OR for the USA |
122
|
|
|
|
|
|
|
'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?) |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
'rural' : property_name suburb subcountry post_code(?) country(?) |
125
|
|
|
|
|
|
|
'post_box' : post_box suburb subcountry post_code(?) country(?) |
126
|
|
|
|
|
|
|
'road_box' : road_box street street_type suburb subcountry post_code(?) country(?) |
127
|
|
|
|
|
|
|
'road_box' : road_box suburb subcountry post_code(?) country(?) |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Note that suburb and subcountry are not optional. The accuracy of the parser is |
130
|
|
|
|
|
|
|
improved by providing as much context as possible. Providing a suburb can help to |
131
|
|
|
|
|
|
|
identify street names that would otherwise be ambiguous. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
For the case where you only have a street address, dummy (but still valid) values can be used |
134
|
|
|
|
|
|
|
for suburb (such as 'Somewhere') and sub country (such as 'NY'). These dummy values will |
135
|
|
|
|
|
|
|
be parsed but can be ignored. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
All formats may contain a precursor |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar |
140
|
|
|
|
|
|
|
module for a complete list of combinations. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head1 METHODS |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=head2 new |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
The C<new> method creates an instance of an address object and sets up |
148
|
|
|
|
|
|
|
the grammar used to parse addresses. This must be called before any of the |
149
|
|
|
|
|
|
|
following methods are invoked. Note that the object only needs to be |
150
|
|
|
|
|
|
|
created once, and can be reused with new input data. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Various setup options may be defined in a hash that is passed as an |
153
|
|
|
|
|
|
|
optional argument to the C<new> method. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
my %args = |
156
|
|
|
|
|
|
|
( |
157
|
|
|
|
|
|
|
country => 'US', |
158
|
|
|
|
|
|
|
auto_clean => 1, |
159
|
|
|
|
|
|
|
force_case => 1, |
160
|
|
|
|
|
|
|
abbreviate_subcountry => 1, |
161
|
|
|
|
|
|
|
abbreviated_subcountry_only => 1, |
162
|
|
|
|
|
|
|
force_post_code => 1 |
163
|
|
|
|
|
|
|
); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
my $address = Lingua::EN::AddressParse->new(%args); |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=over 4 |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item country |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
The country argument must be specified. It determines the possible list of |
172
|
|
|
|
|
|
|
valid sub countries (states, counties etc, defined in the Locale::SubCountry |
173
|
|
|
|
|
|
|
module) and post code formats. Either the full name or abbreviation may be |
174
|
|
|
|
|
|
|
specified. The currently supported country names and codes are: |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
AU or Australia |
177
|
|
|
|
|
|
|
CA or Canada |
178
|
|
|
|
|
|
|
GB or United Kingdom |
179
|
|
|
|
|
|
|
US or United States |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
All forms of upper/lower case are acceptable in the country's spelling. If a |
182
|
|
|
|
|
|
|
country name is supplied that the module doesn't recognise, it will die. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item force_case (optional) |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
This option only applies to the C<case_all> method, see below. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item auto_clean (optional) |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
When this option is set to a positive value, the input string is |
191
|
|
|
|
|
|
|
'cleaned' to try and normalise bad patterns. The type of cleaning |
192
|
|
|
|
|
|
|
includes |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
remove non alphanumeric characters |
195
|
|
|
|
|
|
|
remove full stops |
196
|
|
|
|
|
|
|
remove redundant white space |
197
|
|
|
|
|
|
|
add missing space separators |
198
|
|
|
|
|
|
|
expand abbreviations to more common forms |
199
|
|
|
|
|
|
|
remove bracketed annotations |
200
|
|
|
|
|
|
|
fix badly formed sub property identifiers |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=item abbreviate_subcountry (optional) |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
When this option is set to a positive value, the sub country is forced to its |
205
|
|
|
|
|
|
|
abbreviated form, so "New South Wales" becomes "NSW". If the sub country is |
206
|
|
|
|
|
|
|
already abbreviated then its value is not altered. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=item abbreviated_subcountry_only (optional) |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
When this option is set to a positive value, only the abbreviated form |
211
|
|
|
|
|
|
|
of sub country is allowed, such as "NSW" and not "New South Wales". This |
212
|
|
|
|
|
|
|
will make parsing quicker and ensure that addresses comply with postal |
213
|
|
|
|
|
|
|
standards that normally permit only abbreviated sub countries. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey' |
216
|
|
|
|
|
|
|
Normally, 'Washington' would be consumed as the sub country, but by first converting |
217
|
|
|
|
|
|
|
the address to 'Port Washington NJ' we avoid this problem |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item force_post_code (optional) |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
When this option is set to a positive value, the address must contain |
223
|
|
|
|
|
|
|
a post code. If it does not then an error flag is raised. If this option |
224
|
|
|
|
|
|
|
is set to 0 then a post code is optional. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
The default for this option is true. |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=back |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=head2 parse |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
$error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133"); |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
The C<parse> method takes a single parameter of a text string containing an |
235
|
|
|
|
|
|
|
address. It attempts to parse the address and break it down into the components |
236
|
|
|
|
|
|
|
described below. If the address is parsed successfully, a 0 is returned, |
237
|
|
|
|
|
|
|
otherwise a 1. |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Note that you can successfully parse all the components of an address and still |
240
|
|
|
|
|
|
|
have an error returned. This occurs when you have non matching data following |
241
|
|
|
|
|
|
|
a valid address. To check if the data is unusable, you also need to use the |
242
|
|
|
|
|
|
|
C<properties> method to check the address type is 'unknown' |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
This method is a prerequisite for all the following methods. |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=head2 components |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
%address = $address->components($upper_case_all); |
249
|
|
|
|
|
|
|
$suburb = $address{suburb}; |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
If the optional argument $upper_case_all is set to a positive value, all components |
252
|
|
|
|
|
|
|
are converted to upper case. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
The C<components> method returns all the address components in a hash. The |
256
|
|
|
|
|
|
|
following keys are used for each component: |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
pre_cursor - such as 'C/O Mr A Smith' |
260
|
|
|
|
|
|
|
po_box_type - such as 'Private Boxes' |
261
|
|
|
|
|
|
|
post_box |
262
|
|
|
|
|
|
|
road_box |
263
|
|
|
|
|
|
|
sub_property_type |
264
|
|
|
|
|
|
|
sub_property_identifier |
265
|
|
|
|
|
|
|
property_identifier |
266
|
|
|
|
|
|
|
property_name |
267
|
|
|
|
|
|
|
level - such as 12th Floor |
268
|
|
|
|
|
|
|
building - such as Tower A |
269
|
|
|
|
|
|
|
street_direction_prefix (such as East, NW, North etc) |
270
|
|
|
|
|
|
|
base_street_name (the name with direction removed, such as "Main" in "East Main St") |
271
|
|
|
|
|
|
|
street_name (the full street name such as "East Main") |
272
|
|
|
|
|
|
|
street_type |
273
|
|
|
|
|
|
|
street_direction_suffix (US only, abbreviated only such as N, SE etc) |
274
|
|
|
|
|
|
|
suburb |
275
|
|
|
|
|
|
|
subcountry |
276
|
|
|
|
|
|
|
post_code |
277
|
|
|
|
|
|
|
country |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
If a component has no matching data for a given address, its value will be |
280
|
|
|
|
|
|
|
set to the empty string. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
Each component is converted to title case, meaning the first letter of each |
283
|
|
|
|
|
|
|
component is set to capitals and the remainder to lower case. |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
Proper name capitalisations such as MacNay and O'Brien are observed |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
The following components are not converted to title case: |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
post_box |
290
|
|
|
|
|
|
|
road_box |
291
|
|
|
|
|
|
|
subcountry |
292
|
|
|
|
|
|
|
post_code |
293
|
|
|
|
|
|
|
country |
294
|
|
|
|
|
|
|
street_direction_suffix |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
If your input data is all upper case and you want to retain that format for parsed |
297
|
|
|
|
|
|
|
data, you will need to apply the 'uc' function to each component. |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=head2 case_all |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$correct_casing = $address->case_all; |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
The C<case_all> method does the same thing as the C<components> method except |
304
|
|
|
|
|
|
|
the entire address is returned as a title cased text string. |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
If the force_case option was set in the C<new> method above, address case the |
307
|
|
|
|
|
|
|
entire input string, including any unmatched sections after a recognisable address |
308
|
|
|
|
|
|
|
that failed parsing. This option is useful when you know you have invalid data, |
309
|
|
|
|
|
|
|
but you still want to title case what you have. |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=head2 properties |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
The C<properties> method returns several properties of the address as a hash. |
314
|
|
|
|
|
|
|
The following keys are used for each property - |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
type - either suburban, rural, post_box, road_box or unknown |
317
|
|
|
|
|
|
|
non_matching - any trailing string not part of the address |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
Additional properties can be accessed with the following |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
$address->{original_input} |
323
|
|
|
|
|
|
|
$address->{input_string} - string after auto_clean option has been applied |
324
|
|
|
|
|
|
|
$address->{country_code} - abbreviated Country address format (as defined in the C<new> method) |
325
|
|
|
|
|
|
|
$address->{error} - error flag, 0 = good, 1 = error |
326
|
|
|
|
|
|
|
$address->{error_desc} - text to describe the type of parsing error |
327
|
|
|
|
|
|
|
$address->{warning} - warning flag, 0 = good, 1 = warning |
328
|
|
|
|
|
|
|
$address->{warning_desc} - text to describe the type of parsing warning(s) |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Warnings mean that the address has parsed but there may still be errors within its components |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=head2 report |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Create a formatted text report |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
the input string |
338
|
|
|
|
|
|
|
the cleaned input string |
339
|
|
|
|
|
|
|
the country type |
340
|
|
|
|
|
|
|
the address type |
341
|
|
|
|
|
|
|
any non matching part of input string |
342
|
|
|
|
|
|
|
if any parsing errors occurred |
343
|
|
|
|
|
|
|
error description |
344
|
|
|
|
|
|
|
if any parsing warning occurred |
345
|
|
|
|
|
|
|
warning description |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
the name and value of each defined component |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Returns a string containing a multi line formatted text report |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
L<Lingua::EN::NameParse>, L<Locale::SubCountry>, L<Parse::RecDescent> |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=head1 BUGS |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=head1 LIMITATIONS |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a |
361
|
|
|
|
|
|
|
street type of null string. |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
The abbreviation 'St' can be interpreted as either street or Saint. This leads to |
364
|
|
|
|
|
|
|
ambiguities such as '12 East St Thomas Lane'. This could be 'East Street', suburb of |
365
|
|
|
|
|
|
|
'Thomas Lane' or 'East St Thomas Lane'. As the first pattern is the more common, |
366
|
|
|
|
|
|
|
that is what will match. |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
For US addresses, an ambiguity arises between a street directional suffix and |
369
|
|
|
|
|
|
|
a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South |
370
|
|
|
|
|
|
|
Main St, or South Springfield? The parser assumes that 'S' belongs to the street |
371
|
|
|
|
|
|
|
description. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
The huge number of character combinations that can form a valid address makes |
374
|
|
|
|
|
|
|
it impossible to correctly identify them all. |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
Valid addresses must contain: |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
property address, suburb, subcountry (aka state) in that order. |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
This format is widely accepted in Australia and the US. |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
UK addresses will often include suburb, town, city and county, formats that |
383
|
|
|
|
|
|
|
are very difficult to parse. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
Property names must be enclosed in single or double quotes like "Old Regret" |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
Because of the large combination of possible addresses defined in the grammar, |
388
|
|
|
|
|
|
|
the program is not very fast. |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head1 REFERENCES |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997) |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
Australian Standard AS4212-1994 "Geographic Information Systems - |
396
|
|
|
|
|
|
|
Data Dictionary for transfer of street addressing information" |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
ISO 3166-2:1998, Codes for the representation of names of countries |
399
|
|
|
|
|
|
|
and their subdivisions. Also released as AS/NZS 2632.2:1999 |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=head1 SEE ALSO |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
AddressParse is designed to identify properties, which have a unique physical |
405
|
|
|
|
|
|
|
location. L<Geo::StreetAddress::US> will also parse addresses for the USA, and can handle |
406
|
|
|
|
|
|
|
locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA" |
407
|
|
|
|
|
|
|
"Mission Street at Valencia Street, San Francisco, CA" |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
L |
411
|
|
|
|
|
|
|
L |
412
|
|
|
|
|
|
|
L |
413
|
|
|
|
|
|
|
L |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
See L |
416
|
|
|
|
|
|
|
for a list of different addressing formats from around the world. And also |
417
|
|
|
|
|
|
|
L |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=head1 REPOSITORY |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
L |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=head1 TO DO |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Define grammar for other languages. Hopefully, all that would be needed is |
426
|
|
|
|
|
|
|
to specify a new module with its own grammar, and inherit all the existing |
427
|
|
|
|
|
|
|
methods. I don't have the knowledge of the naming conventions for non-English |
428
|
|
|
|
|
|
|
languages. |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head1 AUTHOR |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
AddressParse was written by Kim Ryan |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Copyright (c) 2015 Kim Ryan. All rights reserved. |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
439
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=cut |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
package Lingua::EN::AddressParse; |
447
|
|
|
|
|
|
|
|
448
|
1
|
|
|
1
|
|
573
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
23
|
|
449
|
1
|
|
|
1
|
|
3
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
57
|
|
450
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
18
|
|
451
|
1
|
|
|
1
|
|
404
|
use Lingua::EN::AddressParse::Grammar; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
452
|
1
|
|
|
1
|
|
748
|
use Lingua::EN::NameParse; |
|
1
|
|
|
|
|
34374
|
|
|
1
|
|
|
|
|
52
|
|
453
|
1
|
|
|
1
|
|
7
|
use Parse::RecDescent; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
4
|
|
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
our $VERSION = '1.26'; |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
458
|
|
|
|
|
|
|
# Create a new instance of an address parsing object. This step is time |
459
|
|
|
|
|
|
|
# consuming and should normally only be called once in your program. |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# Constructor. Builds an address parsing object for one country and compiles
# the country specific grammar into a Parse::RecDescent parser.
#
# Arguments : the class name, followed by a hash of options. The 'country'
#             option is mandatory; every other key is copied onto the object
#             unchanged (no check is made for unknown keys).
# Returns   : the blessed object, with the compiled parser stored in {parse}
# Dies      : croaks if 'country' is missing or is not one of the spellings
#             listed in the pattern below
sub new
{
    my $class = shift;
    my %args = @_;

    # NOTE(review): the POD says any upper/lower case spelling of the country is
    # accepted, but this match is case sensitive. Left as is, because the code
    # that consumes {country} is not visible from here - confirm before relaxing.
    unless (defined $args{country} and $args{country} =~
        /^(AU|Australia|GB|United Kingdom|US|United States|CA|Canada)$/ )
    {
        # The accepted abbreviation is AU, the message used to wrongly advertise AUS
        croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AU,GB,US or CA.\n";
    }

    my $address = {};
    bless($address,$class);

    # option defaults, a post code is required unless the caller says otherwise
    $address->{'force_post_code'} = 1;

    # Caller supplied options override the defaults set above
    foreach my $curr_key (keys %args)
    {
        $address->{$curr_key} = $args{$curr_key};
    }

    # create the grammar tree (this is country dependent)
    my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);

    $address->{parse} = Parse::RecDescent->new($grammar);

    return ($address);
}
493
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
494
|
|
|
|
|
|
|
# Parse one free format address string.
#
# Arguments : the address object, then the text to parse
# Returns   : the list (object, error flag). In scalar context the comma
#             operator leaves just the error flag.
# Side effects : sets {original_input} and {input_string} on the object. The
#             object is then handed to _assemble and _validate.
sub parse
{
    my ($address, $input_string) = @_;

    # Keep the untouched text so the effect of auto cleaning can be checked later
    $address->{original_input} = $input_string;

    # The working copy is upper cased, which allows for faster regexp matching
    # in the grammar tree
    my $text = uc($input_string);
    chomp($text);

    (my $pre_cursor, $text) = _extract_precursor($text);

    # Commas can be used to chunk sections of an address, treat them as spaces
    $text =~ s/,/ /g;

    # _clean is handed the object, so the working copy must be stored first
    $address->{input_string} = $text;
    if ( $address->{auto_clean} )
    {
        $text = _clean($address);
    }

    # Each extractor returns the piece it found, plus the remaining text
    (my $po_box_type, $text) = _extract_po_box_type($text);
    (my $level,       $text) = _extract_level($text);
    (my $building,    $text) = _extract_building($text);

    # Normalise sub property ID, 4/22-24 => UNIT 4 22-24, 4 12 => UNIT 4 12
    if ($address->{country_code} ne 'US' and $text =~ /^(\d{1,4}[A-Z]{0,2})[\/| ](\d+[ \w-].*)$/ )
    {
        $text = "UNIT $1 $2";
    }

    # A trailing space is needed because the grammar tree expects a terminator
    # (the space) for every production, optionally followed by other productions
    # or any final non matching text. It is removed again in _assemble.
    $text .= ' ';
    $address->{input_string} = $text;

    $address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
    _validate($address);

    return($address,$address->{error});
}
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
550
|
|
|
|
|
|
|
# Apply correct capitalisation to each component of an address |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# Return the parsed address components as a hash, with capitalisation applied.
#
# Arguments : the address object, then an optional flag. When the flag is true
#             every component is simply upper cased.
# Returns   : a hash of component name => cased value
#
# Without the flag, casing depends on the component name:
#   - names starting with one of the prefixes in $title_cased are title cased
#     with Lingua::EN::NameParse::case_surname (empty/false values become '')
#   - street directions keep one or two letter values (N, SE) unchanged and
#     title case longer ones
#   - everything else (post code, sub country, country ...) is upper cased
sub components
{
    my ($address, $uc_all) = @_;

    # Work on a copy of the parsed values held in the object
    my %parsed = %{ $address->{components} };

    my $title_cased = qr/^(base_street_name|street_name|street_type|suburb|property_name|sub_property|pre_cursor|po_box_type|level|building)/;

    my %cased;
    foreach my $name ( keys %parsed )
    {
        my $value = $parsed{$name};

        if ($uc_all)
        {
            $cased{$name} = uc($value);
            next;
        }

        my $result;
        if ( $name =~ $title_cased )
        {
            if ( $name eq 'street_name' and $value =~ /^US HIGHWAY (.*)/ )
            {
                # 'US' must stay in capitals
                $result = "US Highway $1";
            }
            elsif ( $name eq 'sub_property_identifier' )
            {
                # UNIT, APT ... 12D etc, cased one word at a time
                $result = join(' ',
                    map {
                        my $word = $_;
                          $word =~ /^\d{1,3}(ST|ND|RD|TH)$/
                              ? lc($word)      # ordinal component, as in 3rd Floor
                        : ( length($word) > 1 && $word !~ /\d/ )
                              ? Lingua::EN::NameParse::case_surname($word)   # words such as UNIT
                        :       $word;         # identifiers such as 12D are left alone
                    } split(/ /,$value)
                );
            }
            else
            {
                # Surnames can be used for streets or suburbs, so surname casing
                # gives correct capitalisation for most cases
                $result = $value ? Lingua::EN::NameParse::case_surname($value) : '';
            }
        }
        elsif ( $name =~ /street_direction/ )
        {
            my $len = length($value);
            $result = ( $len == 1 || $len == 2 ) ? $value     # N, SE etc stay capitalised
                    : ( $len > 2 )               ? Lingua::EN::NameParse::case_surname($value)
                    :                              '';
        }
        else
        {
            # sub country, country, post code and boxes are kept in upper case
            $result = uc($value);
        }

        $cased{$name} = $result;
    }

    return(%cased);
}
648
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
649
|
|
|
|
|
|
|
# Apply correct capitalisation to an entire address |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# Build the whole address as one string: the components returned by
# $address->components, joined with single spaces in the conventional order
# for the address type and country format.
#
# Returns the assembled string. Components with an empty (false) value are
# left out. When the parse recorded an error and force_case is set, the non
# matching text is appended after being passed through
# Lingua::EN::NameParse::case_surname.

sub case_all
{
    my $address = shift;

    my @cased_address;

    my $type = $address->{properties}{type};

    # An undefined type is treated like 'unknown': there is nothing to assemble
    if ( defined($type) and $type ne 'unknown' )
    {
        # Hash of lists, indicating the order that address components are assembled in.
        # Each list element is itself the name of the key value in an address object.

        my %component_order =
        (
            'rural'   => [ qw/pre_cursor property_name suburb subcountry post_code country/ ],
            'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
            'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
        );

        # US format places the sub property (Apt 3B) after the street,
        # all other formats place it before the property identifier
        if ( $address->{country} eq 'US' )
        {
            $component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/ ];
        }
        else
        {
            $component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
        }

        my %component_vals = $address->components;

        # A type with no ordering defined contributes no components, rather
        # than dereferencing an undefined value
        my $order = $component_order{$type} || [];

        foreach my $component ( @{$order} )
        {
            # As some components such as property name are optional, they will appear
            # in the order array but may or may not have a value, so check
            # for undefined (or empty) values
            if ( $component_vals{$component} )
            {
                push(@cased_address,$component_vals{$component});
            }
        }
    }

    if ( $address->{error} and $address->{force_case} )
    {
        # Despite errors, try to name case non-matching section. As the format
        # of this section is unknown, surname case will provide the best
        # approximation
        push(@cased_address,Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
    }

    return(join(' ',@cased_address));
}
704
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
705
|
|
|
|
|
|
|
# Accessor: hands back a copy of the address properties as a flat
# key/value list (the code in this file stores 'type' and 'non_matching' there).

sub properties
{
    my ($address) = @_;

    my $properties_ref = $address->{properties};
    return(%{$properties_ref});
}
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
712
|
|
|
|
|
|
|
# Create a text report, returned as a single string, listing |
713
|
|
|
|
|
|
|
# - the input string, |
714
|
|
|
|
|
|
|
# - the name of each defined component |
715
|
|
|
|
|
|
|
# - any non matching component |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
# Build a multi-line text report describing the last parse and return it as
# a string (nothing is printed here). Each line is produced by
# _fmt_report_line as a padded label followed by the quoted value.
#
# The report lists the original and cleaned input, the country format, the
# address type (when set), the non matching text, the error and warning
# flags with their descriptions, the fully assembled address, and then every
# defined component in alphabetical order of component name.

sub report
{
    my $address = shift;

    my $report = '';

    _fmt_report_line(\$report,"Original Input",$address->{original_input});
    _fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
    _fmt_report_line(\$report,"Country address format",$address->{country_code});

    my %props = $address->properties;
    if ( $props{type} )
    {
        _fmt_report_line(\$report,"Address type",$props{type});
    }

    _fmt_report_line(\$report,"Non matching part",$props{non_matching});
    _fmt_report_line(\$report,"Error",$address->{error});
    _fmt_report_line(\$report,"Error descriptions",$address->{error_desc});

    # The warning flag is held separately from the error flag, so report
    # the warning flag itself on this line
    _fmt_report_line(\$report,"Warning",$address->{warning});
    _fmt_report_line(\$report,"Warning description",$address->{warning_desc});
    _fmt_report_line(\$report,"Case all",$address->case_all);

    # Heading line for the component list, it has no value of its own
    _fmt_report_line(\$report,"COMPONENTS",'');
    my %comps = $address->components;
    foreach my $comp ( sort keys %comps)
    {
        if (defined($comps{$comp}) )
        {
            _fmt_report_line(\$report,$comp,$comps{$comp});
        }
    }

    return($report);
}
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
# PRIVATE METHODS |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# Run the grammar over the input string and copy the results into the
# address object.
#
# Arguments: the address object, followed by the pre cursor, PO box type,
# level and building strings that are passed in by the caller.
#
# Side effects: replaces $address->{components} and $address->{properties}
# with freshly built hashes, then strips the single trailing space from
# each component. Returns the address object.

sub _assemble
{
    my ($address,$pre_cursor,$po_box_type,$level,$building) = @_;

    # Parse the address according to the rules defined in the AddressParse::Grammar module,
    # $::RD_TRACE = 1; # for debugging RecDescent output
    # Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
    my $parsed = $address->{parse}->full_address($address->{input_string});

    # Components live in their own hash, so they can be easily returned to
    # the user to inspect and modify
    my $components = $address->{components} = {};

    # Values handed in by the caller, empty string when absent
    my %pre_extracted =
    (
        pre_cursor  => $pre_cursor,
        level       => $level,
        building    => $building,
        po_box_type => $po_box_type,
    );
    foreach my $name ( keys %pre_extracted )
    {
        $components->{$name} = $pre_extracted{$name} || '';
    }

    # Components copied unchanged from the parse tree, empty string when absent
    foreach my $name ( qw/post_box road_box property_name property_identifier street_direction_suffix suburb post_code country/ )
    {
        $components->{$name} = $parsed->{$name} || '';
    }

    # Split the sub property into its type and identifier
    my ($sub_type,$sub_identifier) = ('','');
    my $sub_property = $parsed->{sub_property};
    if ( $sub_property )
    {
        if ( $sub_property =~ /^(#|[A-Z]{1,}) (.*)$/ )
        {
            # Such as Unit 24, # 4A etc
            ($sub_type,$sub_identifier) = ($1,$2);
        }
        elsif ( $sub_property =~ /^(\d\w\w) (.*)$/ )
        {
            # Such as 1st Floor
            ($sub_type,$sub_identifier) = ($2,$1);
        }
    }
    $components->{sub_property_type} = $sub_type;
    $components->{sub_property_identifier} = $sub_identifier;

    # Separate any leading direction (N, SOUTH etc) from the street name.
    # Without a direction the base name is simply the parsed street name,
    # whatever that is (it is not defaulted to an empty string)
    my $street_name = $parsed->{street_name};
    my ($direction,$base_name) = _get_street_direction($street_name);
    $components->{street_direction_prefix} = $direction ? $direction : '';
    $components->{base_street_name} = $direction ? $base_name : $street_name;

    $components->{street_name} = '';
    $components->{street_type} = '';
    if ( $street_name )
    {
        # Streets such as 'The Corso' will parse as street_name = 'The' and street_type = 'Corso', so separate out
        if ( $street_name eq 'THE ' )
        {
            $components->{street_name} = 'THE ' . $parsed->{street_type};
        }
        else
        {
            $components->{street_name} = $street_name;
            $components->{street_type} = $parsed->{street_type};
        }
    }

    $components->{subcountry} = '';
    if ( $parsed->{subcountry} )
    {
        my $subcountry = $parsed->{subcountry};

        # Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
        if ( $address->{abbreviate_subcountry} )
        {
            my $code = Locale::SubCountry->new($address->{country})->code($subcountry);

            # A code of 'unknown' means the sub country was already abbreviated
            $subcountry = $code if ( $code ne 'unknown' );
        }
        $components->{subcountry} = $subcountry;
    }

    $address->{properties} =
    {
        non_matching => $parsed->{non_matching} || '',
        type         => $parsed->{type},
    };

    _trim_trailing_space($address);

    return($address);
}
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
953
|
|
|
|
|
|
|
# |
954
|
|
|
|
|
|
|
# Split a leading compass direction off a street name.
#
# Takes the street name string. Returns the pair (direction, remaining name)
# when the name has more than one word and the first word is one of the
# recognised directions. Returns a pair of undefined values when no
# direction is found, and nothing at all for an empty street name.

sub _get_street_direction
{
    my ($street_name) = @_;

    return unless ($street_name);

    my ($first_word,@remaining_words) = split(/\s/,$street_name);

    my ($street_direction,$base_street_name);

    # A lone word is never treated as a direction, it is the street name itself
    if ( @remaining_words and $first_word =~ /^(N|NE|NW|E|S|SE|SW|W|NORTH|EAST|SOUTH|WEST|NTH|STH)$/ )
    {
        $street_direction = $1;
        $base_street_name = join(' ',@remaining_words);
    }

    return($street_direction,$base_street_name);
}
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
981
|
|
|
|
|
|
|
# Check for several different types of syntax errors |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
# Set the error and warning flags, and their descriptions, on the address
# object after parsing.
#
# Errors:   non matching text left over, unknown address format, missing
#           post code (only when force_post_code is set), illegal characters
# Warnings: a street or suburb word with no vowel sound (see _check_vowel)
#
# All four fields are reset on every call. Nothing useful is returned.

sub _validate
{
    my $address = shift;

    $address->{error} = 0;
    $address->{error_desc} = '';
    $address->{warning} = 0;
    $address->{warning_desc} = '';

    if ( $address->{properties}{non_matching} )
    {
        $address->{error} = 1;
        $address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
    }
    elsif ( $address->{properties}{type} eq 'unknown' )
    {
        $address->{error} = 1;
        $address->{error_desc} .= 'unknown address format';
    }
    else
    {
        if ($address->{force_post_code} and not $address->{components}{post_code})
        {
            $address->{error} = 1;
            $address->{error_desc} .= ':no post code';
        }

        # illegal characters found, note a '#' can appear as an abbreviation for number in USA addresses
        # and '&' can be part of a property name, so both belong to the legal set
        if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,&#\/ ]/ )
        {
            # Note, if auto_clean is on, illegal characters will have been removed
            # for second parsing and no error flag or message reported
            $address->{error} = 1;
            $address->{error_desc} .= ':illegal chars';
        }

        if ( $address->{properties}{type} eq 'suburban' )
        {
            my $street = $address->{components}{street_name};
            if ($street !~ /\d/ )
            {
                # Not an ordinal or single letter street type
                if ( _check_vowel($address->{components}{base_street_name}) )
                {
                    # street name must have a vowel sound,
                    $address->{warning} = 1;
                    $address->{warning_desc} .= ";no vowel sound in street word : $address->{components}{base_street_name}";
                }
            }
        }

        if ( _check_vowel($address->{components}{suburb}) )
        {
            $address->{warning} = 1;
            $address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
        }
    }
}
1042
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1043
|
|
|
|
|
|
|
# Purge the input string of illegal or redundant characters. |
1044
|
|
|
|
|
|
|
# Correct malformed patterns |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
# Purge the input string of illegal or redundant characters and correct
# malformed patterns, so that the grammar has a better chance of matching.
#
# Reads $address->{input_string} and $address->{country_code}. The address
# object is not modified, the cleaned string is returned.
#
# The patterns below only use upper case letters.
# NOTE(review): presumably the input has been upper cased before this is
# called, confirm against the caller.

sub _clean
{
    my $address = shift;

    my ($input) = $address->{input_string};

    # Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
    $input =~ s|\(.*\)||;

    # Normalise half house numbers, such as 12.5 to 12 1/2. This is needed now before full stops are stripped out
    $input =~ s|^(\d{1,4})\.5 |$1 1/2 |;

    # strip full stops, remove illegal characters
    # & can be part of property name
    # hash (#) may denote number for USA address
    # quotes can occur as property name delimiters
    # '&' and '#' therefore have to be in the set of retained characters,
    # otherwise the '#' handling for US addresses further down never fires

    $input =~ s|[^A-Za-z0-9&#/'" -]||g;

    # remove repeating, leading and trailing spaces
    $input =~ s| +| |g;
    $input =~ s|^ ||;
    $input =~ s| $||;

    # Expand abbreviations that are too short

    $input =~ s/LAKE ST (GEORGE|CLAIR)/LAKE SAINT $1/; # otherwise St gets consumed too early as 'Street'
    $input =~ s| CSEWY | CAUSEWAY |;

    # Standardise abbreviations

    $input =~ s|STR |ST |;
    $input =~ s|TERR |TERRACE |;

    $input =~ s|^FCTR?Y |FACTORY |;
    $input =~ s|^FACT?R?Y? |FACTORY |;

    $input =~ s|LVL |LEVEL |; # sub property identifiers
    $input =~ s|^UN? |UNIT |;
    $input =~ s|^U(\d+)|UNIT $1|;

    # Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
    $input =~ s|/ |/|;
    $input =~ s| /|/|;
    $input =~ s|- |-|;
    $input =~ s| -|-|;

    # Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St

    if ( $input !~ /^\d+ [A-Z] (ST|AVE)/ )
    {
        # Don't remove space before single letter streets such as 21 B Street
        if ( $address->{country_code} eq 'US' )
        {
            # Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
            # Assume that the direction prefix is the more likely case
            $input =~ s|^(\d+) ([A-DF-MO-RT-VX-Z] )|$1$2|;
        }
        else
        {
            $input =~ s|^(\d+) ([A-Z] )|$1$2|;
        }
    }

    # Normalise sub property identifiers
    if ( $address->{country_code} eq 'US' )
    {
        # Fix US sub property identifiers that appear after street name and type
        # add space between # or 'Apt' and the number so #2 becomes '# 2'
        $input =~ s| #(\d)| # $1|;
        $input =~ s| #([A-Z])| # $1|;
        $input =~ s| (APT)(\d)| $1 $2|i;

        $input =~ s| (APT)-(\w)| $1 $2|i;

        # remove redundant space so # 34 B becomes # 34B
        $input =~ s| # (\d+) (\w) | # $1$2 |;

        # remove redundant '#'
        $input =~ s/ (APT|SUITE|UNIT) #/ $1 /;
        # still to test
    }
    else
    {
        # Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
        $input =~ s/^(UNIT|LOT|APT|SHOP)(\d)/$1 $2/;
    }

    # Remove redundant slash or dash
    # Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
    $input =~ s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
    # Unit J1/ 39 becomes Unit J1 39
    $input =~ s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;

    # remove dash that does not form a sequence, such as D-5 or 22-A
    $input =~ s|([A-Z])-(\d)|$1$2|;
    $input =~ s|(\d)-([A-Z])|$1$2|;

    return($input);
}
1150
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1151
|
|
|
|
|
|
|
# Remove any "care of" type of precursor from the main address |
1152
|
|
|
|
|
|
|
# such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214 |
1153
|
|
|
|
|
|
|
# It will later be saved as an attribute in the address object |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
# Separate a leading "care of" or "attention" section from the address proper.
#
# The pre cursor must start with C/O or ATTN and runs up to the first
# number, or the words PO BOX, that is followed by more text.
# Returns (pre cursor, remaining address). When there is no pre cursor the
# first value is an empty string and the input is returned untouched.

sub _extract_precursor
{
    my ($input) = @_;

    # In list context a match hands back the three captured sections,
    # a failed match leaves all of them undefined
    my ($pre_cursor,$address_start,$address_end) =
        $input =~ m{^(C/O.*?|ATTN.*?) (\d+|PO BOX)( .*)};

    return('',$input) unless ( defined($pre_cursor) );

    return($pre_cursor, $address_start . $address_end);
}
1172
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1173
|
|
|
|
|
|
|
# Remove any level or floor info such as: |
1174
|
|
|
|
|
|
|
# 12 Smith St Floor 2 |
1175
|
|
|
|
|
|
|
# Level 22 Suite 3 12 Main St |
1176
|
|
|
|
|
|
|
# It will later be saved as an attribute in the address object |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
# Remove any level or floor information from the input, such as:
#    12 Smith St Floor 2
#    Level 22 Suite 3 12 Main St
#    Second Floor 12 Main St
#
# Returns (level, remaining input). The level is undefined and the input is
# returned unchanged when no level information is found. A '/' or '-' that
# separates the level from the following text is dropped from both returned
# strings, while a separating space is kept as the last character of the
# level.

sub _extract_level
{
    my ($input) = @_;
    my ($level);

    # Tried in this order, the first pattern to match wins.
    # Level info could be at start of string so first space is optional
    my @level_patterns =
    (
        qr/ ?((FIRST|SECOND|THIRD|FOURTH|FIFTH|SIXTH) (FLOOR|FLR|FL) )/,
        qr/ ?(\d{1,2}(ST|ND|RD|TH) (FLOOR|FLR|FL) )/,
        qr/ ?(LEVEL (\d{1,2}|[GM])[\/ -])/,
        qr/ ?((FLOOR|FLR|FL) \d{1,2}[\/ -])/,
    );

    foreach my $pattern (@level_patterns)
    {
        next unless ( $input =~ $pattern );

        my $matched_text = $1;

        $level = $matched_text;
        $level =~ s|/||;
        $level =~ s|-||;

        # Take out exactly what was matched, including any '/' or '-'
        # separator, so that no stray separator is left in the address
        $input =~ s/\Q$matched_text\E//;
        last;
    }

    return($level,$input);
}
1200
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1201
|
|
|
|
|
|
|
# Remove any building info such as: |
1202
|
|
|
|
|
|
|
# Building 2 Level 12 123 Smith St |
1203
|
|
|
|
|
|
|
# 12 Main St Tower A Level 2 |
1204
|
|
|
|
|
|
|
# It will later be saved as an attribute in the address object |
1205
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
# Take a building designation such as "Tower A" or "Building 2" out of the
# input string.
#
# Returns (building, remaining input). The building is undefined and the
# input unchanged when nothing is found. Only the designation itself is
# removed, the spaces that surrounded it stay in the returned input.

sub _extract_building
{
    my ($input) = @_;

    my $building_word = qr{BLOCK|BLDG?|BUILDING|TOWER};
    my $building_id = qr{[A-Z]|A[A-Z]|\d+|\d{1,3}[A-Z]|[A-Z]\d{1,3}}; # AA or 12 or 32C or C12

    # A designation inside the string is preferred, one at the very
    # start of the string is only looked for when that fails
    my ($building) = $input =~ / ($building_word $building_id) /;
    unless ( defined($building) )
    {
        ($building) = $input =~ /^($building_word $building_id) /;
    }

    if ( defined($building) )
    {
        $building =~ s{/}{};
        $building =~ s{-}{};
        $input =~ s/$building//;
    }
    # TO DO, North, East etc Building?

    return($building,$input);
}
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1230
|
|
|
|
|
|
|
# Remove any description that follows the suburb after the main address |
1231
|
|
|
|
|
|
|
# such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965 |
1232
|
|
|
|
|
|
|
# It will be saved as an address attribute |
1233
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
# Pull the words PRIVATE BOXES out of the address, they can follow the
# suburb of a post box address.
#
# Returns (po box type, remaining address). When the words are not present
# the first value is an empty string and the input is returned untouched.

sub _extract_po_box_type
{
    my ($input) = @_;

    # Text is needed on both sides of the description for it to be recognised
    my ($address_start,$po_box_type,$address_end) =
        $input =~ /^(.*?) (PRIVATE BOXES)( .*)$/;

    return('',$input) unless ( defined($po_box_type) );

    return($po_box_type, $address_start . $address_end);
}
1251
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1252
|
|
|
|
|
|
|
# For correct matching, the grammar of each component must include the |
1253
|
|
|
|
|
|
|
# trailing space that separates it from any following word. This should |
1254
|
|
|
|
|
|
|
# now be removed from each component |
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
# The grammar of each component includes the space that separates it from
# the following word. Strip that one trailing space from every component
# of the address object, in place.

sub _trim_trailing_space
{
    my ($address) = @_;

    # The loop variable is an alias to the stored value, so the
    # substitution changes the component hash directly
    foreach my $value ( values %{ $address->{components} } )
    {
        next unless ($value);
        $value =~ s/ $//g;
    }
}
1268
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
# Append one line to a report.
#
# Arguments: a reference to the report string, the label and the value.
# The label is left justified in a column of exactly 23 characters (longer
# labels are cut short), the value follows in single quotes.

sub _fmt_report_line
{
    my ($report_ref,$label,$value) = @_;

    my $report_line = sprintf("%-23.23s '%s'\n",$label,$value);
    ${$report_ref} .= $report_line;
}
1276
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
# Look for a word that has no vowel sound, which suggests a mistyped name.
#
# The string is split on single spaces. Returns 1 when any word of two or
# more characters contains neither a vowel (Y included) nor one of the
# accepted abbreviations, otherwise returns 0.

sub _check_vowel
{
    my ($str) = @_;

    # Accepted abbreviations are
    # Saint, Mount, Junior, Senior (as in Dr Martin Luther King Snr)
    my @vowelless_words =
        grep { length($_) > 1 and $_ !~ /[AEIOUY]|ST|MT|DR|JN?R|SN?R/ } split(/ /,$str);

    return( @vowelless_words ? 1 : 0 );
}
1293
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
# Finish the file with a true value, which Perl requires of a file loaded by use or require
return(1); |