line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Lingua::EN::AddressParse - extract components of a street address, presented as a text string |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Lingua::EN::AddressParse; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
my %args = |
10
|
|
|
|
|
|
|
( |
11
|
|
|
|
|
|
|
country => 'US', |
12
|
|
|
|
|
|
|
auto_clean => 1, |
13
|
|
|
|
|
|
|
force_case => 1, |
14
|
|
|
|
|
|
|
abbreviate_subcountry => 0, |
15
|
|
|
|
|
|
|
abbreviated_subcountry_only => 1, |
16
|
|
|
|
|
|
|
force_post_code => 0 |
17
|
|
|
|
|
|
|
); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my $address = Lingua::EN::AddressParse->new(%args); |
20
|
|
|
|
|
|
|
$error = $address->parse("40 1/2 N OLD MASSACHUSETTS AVE APT 3B Washington Valley Washington 98100: HOLD MAIL"); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
print $address->report; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
Country address format 'US' |
25
|
|
|
|
|
|
|
Address type 'suburban' |
26
|
|
|
|
|
|
|
Non matching part 'HOLD MAIL ' |
27
|
|
|
|
|
|
|
Error '1' |
28
|
|
|
|
|
|
|
Error descriptions 'non matching section : HOLD MAIL ' |
29
|
|
|
|
|
|
|
Warning '1' |
30
|
|
|
|
|
|
|
Warning description '' |
31
|
|
|
|
|
|
|
Case all '40 1/2 N Old Massachusetts Ave Apt 3B Washington Valley WA 98100' |
32
|
|
|
|
|
|
|
COMPONENTS '' |
33
|
|
|
|
|
|
|
base_street_name 'Old Massachusetts' |
34
|
|
|
|
|
|
|
post_code '98100' |
35
|
|
|
|
|
|
|
property_identifier '40 1/2' |
36
|
|
|
|
|
|
|
street_direction_prefix 'N' |
37
|
|
|
|
|
|
|
street_name 'N Old Massachusetts' |
38
|
|
|
|
|
|
|
street_type 'Ave' |
39
|
|
|
|
|
|
|
sub_property_identifier '3B' |
40
|
|
|
|
|
|
|
sub_property_type 'Apt' |
41
|
|
|
|
|
|
|
subcountry 'WA' |
42
|
|
|
|
|
|
|
suburb 'Washington Valley' |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
%address_components = $address->components; |
45
|
|
|
|
|
|
|
print $address_components{sub_property_type}; # APT |
46
|
|
|
|
|
|
|
print $address_components{sub_property_identifier}; # 3B |
47
|
|
|
|
|
|
|
print $address_components{property_identifier}; # 40 1/2 |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
%address_properties = $address->properties; |
50
|
|
|
|
|
|
|
print $address_properties{type}; # suburban |
51
|
|
|
|
|
|
|
print $address_properties{non_matching}; # : HOLD MAIL |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
$correct_casing = $address->case_all; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 DESCRIPTION |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This module takes as input a suburban, rural or postal address in free format |
59
|
|
|
|
|
|
|
text such as, |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
3080 28TH AVE N ST PETERSBURG, FL 33713-3810 |
62
|
|
|
|
|
|
|
12 1st Avenue N Suite # 2 Somewhere CA 12345 USA |
63
|
|
|
|
|
|
|
C/O JOHN, KENNETH JR POA 744 WIND RIVER DR SYLVANIA, OH 43560-4317 |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
9 Church Street, Abertillery, Mid Glamorgan NP13 1DA |
66
|
|
|
|
|
|
|
27 Bury Street, Abingdon, Oxfordshire OX14 3QT |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
2A LOW ST KEW NSW 2123 |
69
|
|
|
|
|
|
|
12/3-5 AUBREY ST MOUNT VICTORIA VICTORIA 3133 |
70
|
|
|
|
|
|
|
"OLD REGRET" WENTWORTH FALLS NSW 2782 AUSTRALIA |
71
|
|
|
|
|
|
|
GPO Box K318, HAYMARKET, NSW 2000 |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
and attempts to parse it. If successful, the address is broken |
75
|
|
|
|
|
|
|
down into its components and useful functions can be performed such as : |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
converting upper or lower case values to title case (2A Low St Kew NSW 2123) |
78
|
|
|
|
|
|
|
extracting the address's individual components (2A,Low,St,KEW,NSW,2123) |
79
|
|
|
|
|
|
|
determining the type of format the address is in ('suburban') |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
If the address cannot be parsed you have the option of cleaning the address |
83
|
|
|
|
|
|
|
of bad characters, or extracting any portion that was parsed and the portion |
84
|
|
|
|
|
|
|
that failed. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
This module can be used for analysing and improving the quality of |
87
|
|
|
|
|
|
|
lists of residential and postal addresses. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head1 DEFINITIONS |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
The following terms are used by AddressParse to define the components that |
92
|
|
|
|
|
|
|
can make up an address. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Pre cursor : C/O MR A Smith... |
95
|
|
|
|
|
|
|
Sub property identifier : Level 1A Unit 2, Apartment B, Lot 12, Suite # 12 ... |
96
|
|
|
|
|
|
|
Property Identifier : 12/66A, 24-34, 2A, 23B/12C, 12/42-44 |
97
|
|
|
|
|
|
|
Property name : "Old Regret" |
98
|
|
|
|
|
|
|
Post Box : GPO Box K123, LPO 2345, RMS 23 ... |
99
|
|
|
|
|
|
|
Road Box : RMB 24A, RMS 234 ... |
100
|
|
|
|
|
|
|
Street Direction: North, SE, Sth. etc |
101
|
|
|
|
|
|
|
Street name : O'Hare, New South Head, The Causeway |
102
|
|
|
|
|
|
|
Street type : Road, Rd., St, Lane, Highway, Crescent, Circuit ... |
103
|
|
|
|
|
|
|
Suburb : Dee Why, St. John's Wood ... |
104
|
|
|
|
|
|
|
Sub country : NSW, New South Wales, ACT, NY, New Jersey AZ ... |
105
|
|
|
|
|
|
|
Post (zip) code : 2062, 34532-1234, SG12A 9ET |
106
|
|
|
|
|
|
|
Country : Australia, UK, US or Canada |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
The main address formats currently supported are as follows. (a ? means the component is optional): |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
'suburban' : sub_property(?) property_identifier(?) street street_type suburb subcountry post_code(?)country(?) |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
OR for the USA |
115
|
|
|
|
|
|
|
'suburban' : property_identifier(?) street street_type sub_property(?) suburb subcountry post_code(?) country(?) |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
'rural' : property_name suburb subcountry post_code(?) country(?) |
118
|
|
|
|
|
|
|
'post_box' : post_box suburb subcountry post_code(?) country(?) |
119
|
|
|
|
|
|
|
'road_box' : road_box street street_type suburb subcountry post_code(?) country(?) |
120
|
|
|
|
|
|
|
'road_box' : road_box suburb subcountry post_code(?) country(?) |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
All formats may contain a precursor |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Refer to the component grammar defined in the Lingua::EN::AddressParse::Grammar |
125
|
|
|
|
|
|
|
module for a complete list of combinations. |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 METHODS |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head2 new |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
The C<new> method creates an instance of an address object and sets up |
133
|
|
|
|
|
|
|
the grammar used to parse addresses. This must be called before any of the |
134
|
|
|
|
|
|
|
following methods are invoked. Note that the object only needs to be |
135
|
|
|
|
|
|
|
created once, and can be reused with new input data. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Various setup options may be defined in a hash that is passed as an |
138
|
|
|
|
|
|
|
optional argument to the C<new> method. |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my %args = |
141
|
|
|
|
|
|
|
( |
142
|
|
|
|
|
|
|
country => 'Australia', |
143
|
|
|
|
|
|
|
auto_clean => 1, |
144
|
|
|
|
|
|
|
force_case => 1, |
145
|
|
|
|
|
|
|
abbreviate_subcountry => 1, |
146
|
|
|
|
|
|
|
abbreviated_subcountry_only => 1, |
147
|
|
|
|
|
|
|
force_post_code => 1 |
148
|
|
|
|
|
|
|
); |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
my $address = Lingua::EN::AddressParse->new(%args); |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=over 4 |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item country |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The country argument must be specified. It determines the possible list of |
157
|
|
|
|
|
|
|
valid sub countries (states, counties etc, defined in the Locale::SubCountry |
158
|
|
|
|
|
|
|
module) and post code formats. Either the full name or abbreviation may be |
159
|
|
|
|
|
|
|
specified. The currently supported country names and codes are: |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
AU or Australia |
162
|
|
|
|
|
|
|
CA or Canada |
163
|
|
|
|
|
|
|
GB or United Kingdom |
164
|
|
|
|
|
|
|
US or United States |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
All forms of upper/lower case are acceptable in the country's spelling. If a |
167
|
|
|
|
|
|
|
country name is supplied that the module doesn't recognise, it will die. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item force_case (optional) |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
This option only applies to the C<case_all> method, see below. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=item auto_clean (optional) |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
When this option is set to a positive value, the input string is |
176
|
|
|
|
|
|
|
'cleaned' to try and normalise bad patterns. The type of cleaning |
177
|
|
|
|
|
|
|
includes |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
remove non alphanumeric characters |
180
|
|
|
|
|
|
|
remove full stops |
181
|
|
|
|
|
|
|
remove redundant white space |
182
|
|
|
|
|
|
|
add missing space separators |
183
|
|
|
|
|
|
|
expand abbreviations to more common forms |
184
|
|
|
|
|
|
|
remove bracketed annotations |
185
|
|
|
|
|
|
|
fix badly formed sub property identifiers |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item abbreviate_subcountry (optional) |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
When this option is set to a positive value, the sub country is forced to its |
190
|
|
|
|
|
|
|
abbreviated form, so "New South Wales" becomes "NSW". If the sub country is |
191
|
|
|
|
|
|
|
already abbreviated then its value is not altered. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=item abbreviated_subcountry_only (optional) |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
When this option is set to a positive value, only the abbreviated form |
196
|
|
|
|
|
|
|
of sub country is allowed, such as "NSW" and not "New South Wales". This |
197
|
|
|
|
|
|
|
will make parsing quicker and ensure that addresses comply with postal |
198
|
|
|
|
|
|
|
standards that normally permit only abbreviated sub countries. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
It also avoids matching a sub_country name too early, as in the case of 'Port Washington New Jersey' |
201
|
|
|
|
|
|
|
Normally, 'Washington' would be consumed as the sub country, but by first converting |
202
|
|
|
|
|
|
|
the address to 'Port Washington NJ' we avoid this problem |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=item force_post_code (optional) |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
When this option is set to a positive value, the address must contain |
208
|
|
|
|
|
|
|
a post code. If it does not then an error flag is raised. If this option |
209
|
|
|
|
|
|
|
is set to 0 then a post code is optional. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
By default this option is true. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=back |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 parse |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
$error = $address->parse("12/3-5 AUBREY ST VERMONT VIC 3133"); |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
The C<parse> method takes a single parameter of a text string containing an |
220
|
|
|
|
|
|
|
address. It attempts to parse the address and break it down into the components |
221
|
|
|
|
|
|
|
described below. If the address is parsed successfully, a 0 is returned, |
222
|
|
|
|
|
|
|
otherwise a 1. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Note that you can successfully parse all the components of an address and still |
225
|
|
|
|
|
|
|
have an error returned. This occurs when you have non matching data following |
226
|
|
|
|
|
|
|
a valid address. To check if the data is unusable, you also need to use the |
227
|
|
|
|
|
|
|
C<properties> method to check whether the address type is 'unknown' |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
This method is a prerequisite for all the following methods. |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=head2 components |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
%address = $address->components($upper_case_all); |
234
|
|
|
|
|
|
|
$suburb = $address{suburb}; |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
If the optional argument $upper_case_all is set to a positive value, all components |
237
|
|
|
|
|
|
|
are converted to upper case. |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
The C<components> method returns all the address components in a hash. The |
241
|
|
|
|
|
|
|
following keys are used for each component: |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
pre_cursor - such as 'C/O Mr A Smith' |
245
|
|
|
|
|
|
|
po_box_type - such as 'Private Boxes' |
246
|
|
|
|
|
|
|
post_box |
247
|
|
|
|
|
|
|
road_box |
248
|
|
|
|
|
|
|
sub_property_type |
249
|
|
|
|
|
|
|
sub_property_identifier |
250
|
|
|
|
|
|
|
property_identifier |
251
|
|
|
|
|
|
|
property_name |
252
|
|
|
|
|
|
|
level - such as 12th Floor |
253
|
|
|
|
|
|
|
building - such as Tower A |
254
|
|
|
|
|
|
|
street_direction_prefix (such as East, NW, North etc) |
255
|
|
|
|
|
|
|
base_street_name (the name with direction removed, such as "Main" in "East Main St") |
256
|
|
|
|
|
|
|
street_name (the full street name such as "East Main") |
257
|
|
|
|
|
|
|
street_type |
258
|
|
|
|
|
|
|
street_direction_suffix (US only, abbreviated only such as N, SE etc) |
259
|
|
|
|
|
|
|
suburb |
260
|
|
|
|
|
|
|
subcountry |
261
|
|
|
|
|
|
|
post_code |
262
|
|
|
|
|
|
|
country |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
If a component has no matching data for a given address, its value will be |
265
|
|
|
|
|
|
|
set to the empty string. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Each component is converted to title case, meaning the first letter of each |
268
|
|
|
|
|
|
|
component is set to capitals and the remainder to lower case. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
Proper name capitalisations such as MacNay and O'Brien are observed |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
The following components are not converted to title case: |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
post_box |
275
|
|
|
|
|
|
|
road_box |
276
|
|
|
|
|
|
|
subcountry |
277
|
|
|
|
|
|
|
post_code |
278
|
|
|
|
|
|
|
country |
279
|
|
|
|
|
|
|
street_direction_suffix |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
If your input data is all upper case and you want to retain that format for parsed |
282
|
|
|
|
|
|
|
data, you will need to apply the 'uc' function to each component. |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=head2 case_all |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$correct_casing = $address->case_all; |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
The C<case_all> method does the same thing as the C<components> method except |
289
|
|
|
|
|
|
|
the entire address is returned as a title cased text string. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
If the force_case option was set in the C<new> method above, title case the |
292
|
|
|
|
|
|
|
entire input string, including any unmatched sections after a recognisable address |
293
|
|
|
|
|
|
|
that failed parsing. This option is useful when you know you have invalid data, |
294
|
|
|
|
|
|
|
but you still want to title case what you have. |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=head2 properties |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
The C<properties> method returns several properties of the address as a hash. |
299
|
|
|
|
|
|
|
The following keys are used for each property - |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
type - either suburban ,rural,post_box,road_box,unknown |
302
|
|
|
|
|
|
|
non_matching - any trailing string not part of the address |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Additional properties can be accessed with the following |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
$address->{original_input} |
308
|
|
|
|
|
|
|
$address->{input_string} - string after auto_clean option has been applied |
309
|
|
|
|
|
|
|
$address->{country_code} - abbreviated Country address format (as defined in the C<new> method) |
310
|
|
|
|
|
|
|
$address->{error} - error flag, 0 = good, 1 = error |
311
|
|
|
|
|
|
|
$address->{error_desc} - text to describe the type of parsing error |
312
|
|
|
|
|
|
|
$address->{warning} - warning flag, 0 = good, 1 = warning |
313
|
|
|
|
|
|
|
$address->{warning_desc} - text to describe the type of parsing warning(s) |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
Warnings mean that the address has parsed but there may still be errors within its components |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=head2 report |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
Create a formatted text report |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
the input string |
323
|
|
|
|
|
|
|
the cleaned input string |
324
|
|
|
|
|
|
|
the country type |
325
|
|
|
|
|
|
|
the address type |
326
|
|
|
|
|
|
|
any non matching part of input string |
327
|
|
|
|
|
|
|
if any parsing errors occurred |
328
|
|
|
|
|
|
|
error description |
329
|
|
|
|
|
|
|
if any parsing warning occurred |
330
|
|
|
|
|
|
|
warning description |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
the name and value of each defined component |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Returns a string containing a multi line formatted text report |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
L<Lingua::EN::NameParse>, L<Locale::SubCountry>, L<Parse::RecDescent> |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head1 BUGS |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head1 LIMITATIONS |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Streets such as 'The Esplanade' will return a street of 'The Esplanade' and a |
346
|
|
|
|
|
|
|
street type of null string. |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
For US addresses, an ambiguity arises between a street directional suffix and |
349
|
|
|
|
|
|
|
a suburb directional prefix, such as '12 Main St S Springfield CA 92345'. Is it South |
350
|
|
|
|
|
|
|
Main St, or South Springfield? The parser assumes that 'S' belongs to the street |
351
|
|
|
|
|
|
|
description. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
The huge number of character combinations that can form a valid address makes |
354
|
|
|
|
|
|
|
it impossible to correctly identify them all. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Valid addresses must contain: |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
property address, suburb, subcountry (aka state) in that order. |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
This format is widely accepted in Australia and the US. |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
UK addresses will often include suburb, town, city and county, formats that |
363
|
|
|
|
|
|
|
are very difficult to parse. |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
Property names must be enclosed in single or double quotes like "Old Regret" |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Because of the large combination of possible addresses defined in the grammar, |
368
|
|
|
|
|
|
|
the program is not very fast. |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head1 REFERENCES |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
"The Wordsworth Dictionary of Abbreviations & Acronyms" (1997) |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
Australian Standard AS4212-1994 "Geographic Information Systems - |
376
|
|
|
|
|
|
|
Data Dictionary for transfer of street addressing information" |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
ISO 3166-2:1998, Codes for the representation of names of countries |
379
|
|
|
|
|
|
|
and their subdivisions. Also released as AS/NZS 2632.2:1999 |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=head1 SEE ALSO |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
AddressParse is designed to identify properties, which have a unique physical |
385
|
|
|
|
|
|
|
location. L will also parse addresses for the USA, and can handle |
386
|
|
|
|
|
|
|
locations defined by street intersections, such as: "Hollywood & Vine, Los Angeles, CA" |
387
|
|
|
|
|
|
|
"Mission Street at Valencia Street, San Francisco, CA" |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
L |
391
|
|
|
|
|
|
|
L |
392
|
|
|
|
|
|
|
L |
393
|
|
|
|
|
|
|
L |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
See L |
396
|
|
|
|
|
|
|
for a list of different addressing formats from around the world. And also |
397
|
|
|
|
|
|
|
L |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=head1 REPOSITORY |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
L |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 TO DO |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Define grammar for other languages. Hopefully, all that would be needed is |
406
|
|
|
|
|
|
|
to specify a new module with its own grammar, and inherit all the existing |
407
|
|
|
|
|
|
|
methods. I don't have the knowledge of the naming conventions for non-english |
408
|
|
|
|
|
|
|
languages. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head1 AUTHOR |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
AddressParse was written by Kim Ryan |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Copyright (c) 2015 Kim Ryan. All rights reserved. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
419
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=cut |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
package Lingua::EN::AddressParse; |
427
|
|
|
|
|
|
|
|
428
|
2
|
|
|
2
|
|
89600
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
45
|
|
429
|
2
|
|
|
2
|
|
6
|
use Carp; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
115
|
|
430
|
2
|
|
|
2
|
|
7
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
40
|
|
431
|
2
|
|
|
2
|
|
829
|
use Lingua::EN::AddressParse::Grammar; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
74
|
|
432
|
2
|
|
|
2
|
|
1413
|
use Lingua::EN::NameParse; |
|
2
|
|
|
|
|
67956
|
|
|
2
|
|
|
|
|
99
|
|
433
|
2
|
|
|
2
|
|
12
|
use Parse::RecDescent; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
9
|
|
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
our $VERSION = '1.25'; |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
438
|
|
|
|
|
|
|
# Create a new instance of an address parsing object. This step is time |
439
|
|
|
|
|
|
|
# consuming and should normally only be called once in your program. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
sub new
{
    # Constructor: validates the mandatory 'country' option, copies all
    # supplied options into the new object, builds the country dependent
    # grammar and attaches a Parse::RecDescent parser for it.
    #
    # Arguments : class name, followed by a hash of options
    #             (country is mandatory, all others are optional)
    # Returns   : the blessed address object
    # Dies      : via croak if country is missing or not recognised

    my $class = shift;
    my %args = @_;

    # Canonical spelling for every accepted country name and code, keyed by
    # lower case so that any mix of upper/lower case is accepted, as the
    # documentation promises.
    my %canonical_country =
    (
        'au'             => 'AU',
        'australia'      => 'Australia',
        'gb'             => 'GB',
        'united kingdom' => 'United Kingdom',
        'us'             => 'US',
        'united states'  => 'United States',
        'ca'             => 'CA',
        'canada'         => 'Canada',
    );

    # List assignment leaves $matched undefined when the pattern fails
    my ($matched) = defined $args{country}
        ? $args{country} =~ /^(AU|Australia|GB|United Kingdom|US|United States|CA|Canada)$/i
        : ();

    unless ( defined $matched )
    {
        croak "Cannot start parser. You must specify a value for the country in the options hash.\nValid options are AU,GB,US or CA.\n";
    }

    my $address = {};
    bless($address,$class);

    # option defaults
    $address->{'force_post_code'} = 1;

    # Add error checking for invalid keys?
    foreach my $curr_key (keys %args)
    {
        $address->{$curr_key} = $args{$curr_key};
    }

    # Store the canonical spelling so that later exact string comparisons on
    # the country see the same values regardless of the caller's casing.
    # Spellings that were already accepted are stored unchanged.
    $address->{country} = $canonical_country{ lc $matched };

    # create the grammar tree (this is country dependent)
    my $grammar = Lingua::EN::AddressParse::Grammar::_create($address);

    $address->{parse} = Parse::RecDescent->new($grammar);

    return ($address);
}
473
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
474
|
|
|
|
|
|
|
sub parse
{
    # Parse one free format address string.
    #
    # The raw text is stored, upper cased and passed through a series of
    # extraction helpers, each of which returns the piece it recognised
    # together with the remaining text. What is left is then handed to
    # _assemble and _validate.
    #
    # Arguments : the address object, the text to parse
    # Returns   : the list (address object, error flag)

    my ($address, $input_string) = @_;

    # Keep the untouched text so the effect of auto cleaning can be checked
    $address->{original_input} = $input_string;

    # Work in upper case, which allows for faster regexp matching in the
    # grammar tree, and drop any trailing newline
    my $upper_cased = uc($input_string);
    chomp($upper_cased);
    $address->{input_string} = $upper_cased;

    (my $pre_cursor, $address->{input_string}) = _extract_precursor($address->{input_string});

    # Commas may be used to chunk sections of an address, treat them as spaces
    $address->{input_string} =~ tr/,/ /;

    $address->{input_string} = _clean($address) if $address->{auto_clean};

    (my $po_box_type, $address->{input_string}) = _extract_po_box_type($address->{input_string});

    (my $level, $address->{input_string}) = _extract_level($address->{input_string});

    (my $building, $address->{input_string}) = _extract_building($address->{input_string});

    # Normalise sub property ID for non US formats,
    # 4/22-24 => UNIT 4 22-24, 4 12 => UNIT 4 12
    if ( $address->{country_code} ne 'US' )
    {
        if ( $address->{input_string} =~ /^(\d{1,4}[A-Z]{0,2})[\/| ](\d+[ \w-].*)$/ )
        {
            my ($unit_id, $remainder) = ($1, $2);
            $address->{input_string} = "UNIT $unit_id $remainder";
        }
    }

    # The grammar tree expects a terminator (a space) for every production,
    # optionally followed by other productions or any final non matching
    # text, so append one here. It is removed again in the _assemble function.
    $address->{input_string} .= ' ';

    $address = _assemble($address,$pre_cursor,$po_box_type,$level,$building);
    _validate($address);

    return($address,$address->{error});
}
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
530
|
|
|
|
|
|
|
# Apply correct capitalisation to each component of an address |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
sub components
{
    # Return a hash of all address components with capitalisation applied.
    #
    # Arguments : the address object, and an optional flag; when the flag is
    #             true every component is simply converted to upper case
    # Returns   : a hash (as a list) of component name => cased value
    #
    # Without the flag, name like components (street, suburb, property name,
    # sub property type, precursor, PO box type, level, building) are title
    # cased with Lingua::EN::NameParse::case_surname, street directions keep
    # their case when abbreviated (1 or 2 characters), and every other
    # component is upper cased.

    my $address = shift;
    my ($uc_all) = @_;

    my %orig_components = %{ $address->{components} };

    my (%cased_components);
    foreach my $curr_key ( keys %orig_components )
    {
        my $cased_value;
        my $curr_value = $orig_components{$curr_key};

        # An undefined component is treated as the empty string. The result
        # is the same as before, without "uninitialized value" warnings.
        $curr_value = '' unless defined $curr_value;

        if ($uc_all)
        {
            $cased_components{$curr_key} = uc($curr_value);
            next;
        }

        if ( $curr_key =~ /^(base_street_name|street_name|street_type|suburb|property_name|sub_property|pre_cursor|po_box_type|level|building)/ )
        {
            if ( $curr_key eq 'street_name' and $curr_value =~ /^US HIGHWAY (.*)/ )
            {
                # Keep the 'US' prefix in capitals
                $cased_value = "US Highway $1";
            }
            elsif ( $curr_key eq 'sub_property_identifier' )
            {
                # UNIT, APT ... 12D etc, cased word by word
                my @cased_words;
                foreach my $word ( split(/ /,$curr_value) )
                {
                    my $cased_word;
                    if ( $word =~ /^\d{1,3}(ST|ND|RD|TH)$/ )
                    {
                        # ordinal component, as in 3rd Floor
                        $cased_word = lc($word);
                    }
                    elsif ( length($word) > 1 and $word !~ /\d/ )
                    {
                        # only need to title case words such as UNIT
                        $cased_word = Lingua::EN::NameParse::case_surname($word);
                    }
                    else
                    {
                        # identifiers such as 12D or B are left untouched
                        $cased_word = $word;
                    }
                    push(@cased_words,$cased_word);
                }
                $cased_value = join(' ',@cased_words);
            }
            else
            {
                if ($curr_value)
                {
                    # Surnames can be used for streets or suburbs so this method
                    # will give correct capitalisation for most cases
                    $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
                }
                else
                {
                    $cased_value = '';
                }
            }
        }
        # retain the capitalisation of abbreviated street directions
        elsif ($curr_key =~ /street_direction/)
        {
            if (length($curr_value) == 1 or length($curr_value) == 2)
            {
                # N, SE etc is capitalised
                $cased_value = $curr_value;
            }
            elsif (length($curr_value) > 2)
            {
                # full words such as NORTH are title cased
                $cased_value = Lingua::EN::NameParse::case_surname($curr_value);
            }
            else
            {
                $cased_value = '';
            }
        }
        # all remaining components, such as sub country and country, are upper cased
        else
        {
            $cased_value = uc($curr_value);
        }
        $cased_components{$curr_key} = $cased_value;
    }
    return(%cased_components);
}
628
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
629
|
|
|
|
|
|
|
# Apply correct capitalisation to an entire address |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
sub case_all
{
    # Build the whole address as one space separated string.
    #
    # The component values come from $address->components and are emitted in
    # an order that depends on the address type and, for suburban addresses,
    # on the country. Empty components are skipped. When the address has an
    # error and force_case is set, the non matching section is appended after
    # being passed through Lingua::EN::NameParse::case_surname.
    #
    # Returns the assembled string, which is empty when the type is unknown
    # (or not set) and there is nothing else to append.

    my $address = shift;

    my @cased_address;

    my $type = $address->{properties}{type};

    # An undefined type is treated like 'unknown' rather than being compared
    # and then used as a hash key
    if ( defined($type) and $type ne 'unknown' )
    {
        # Hash of lists, indicating the order that address components are assembled in.
        # Each list element is itself the name of the key value in an address object.

        my %component_order=
        (
           'rural'   => [ qw/pre_cursor property_name suburb subcountry post_code country/ ],
           'post_box'=> [ qw/pre_cursor post_box suburb po_box_type subcountry post_code country/ ],
           'road_box'=> [ qw/pre_cursor road_box street_name street_type suburb subcountry post_code country/ ]
        );

        # NOTE(review): other routines in this file test $address->{country_code}
        # for 'US'; confirm that $address->{country} holds the same code here.
        if ( $address->{country} eq 'US' )
        {
            # Sub property details follow the street
            $component_order{'suburban'} = [ qw/pre_cursor property_identifier street_name street_type street_direction_suffix building level sub_property_type sub_property_identifier suburb subcountry post_code country/ ];
        }
        else
        {
            # Sub property details precede the property identifier
            $component_order{'suburban'} = [ qw/pre_cursor building level sub_property_type sub_property_identifier property_identifier street_name street_type suburb subcountry post_code country/ ];
        }

        my %component_vals = $address->components;

        # A type with no entry in the table contributes no components,
        # instead of dereferencing an undefined value
        my $order = $component_order{$type} || [];

        foreach my $component ( @{$order} )
        {
            # As some components such as property name are optional, they will
            # appear in the order array but may or may not have a value.
            # Test for defined and non empty so that a value of '0' is kept.
            my $value = $component_vals{$component};
            if ( defined($value) and length($value) )
            {
                push(@cased_address,$value);
            }
        }
    }

    if ( $address->{error} and $address->{force_case} )
    {
        # Despite errors, try to name case non-matching section. As the format
        # of this section is unknown, surname case will provide the best
        # approximation
        push(@cased_address,Lingua::EN::NameParse::case_surname($address->{properties}{non_matching}));
    }

    return(join(' ',@cased_address));
}
684
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
685
|
|
|
|
|
|
|
sub properties
{
    # Return a copy of the address properties as a flat key/value list,
    # suitable for assigning to a hash.

    my ($self) = @_;

    my $props_ref = $self->{properties};
    return( %{$props_ref} );
}
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
692
|
|
|
|
|
|
|
# Create a text report, returned as a string, listing |
693
|
|
|
|
|
|
|
# - the input string, |
694
|
|
|
|
|
|
|
# - the name of each defined component |
695
|
|
|
|
|
|
|
# - any non matching component |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
sub report
{
    # Build a multi line text report describing the parsed address and return
    # it as a string. Each line is produced by _fmt_report_line and holds a
    # label followed by the quoted value. The report lists the original and
    # cleaned input, the country format, the address type (when set), the non
    # matching part, error and warning details, the fully cased address and
    # then every defined component in alphabetical key order.

    my $address = shift;

    my $report = '';

    _fmt_report_line(\$report,"Original Input",$address->{original_input});
    _fmt_report_line(\$report,"Cleaned Input",$address->{input_string});
    _fmt_report_line(\$report,"Country address format",$address->{country_code});

    my %props = $address->properties;
    if ( $props{type} )
    {
        _fmt_report_line(\$report,"Address type",$props{type});
    }

    _fmt_report_line(\$report,"Non matching part",$props{non_matching});
    _fmt_report_line(\$report,"Error",$address->{error});
    _fmt_report_line(\$report,"Error descriptions",$address->{error_desc});

    # Report the warning flag itself, the error flag was being shown here
    _fmt_report_line(\$report,"Warning",$address->{warning});
    _fmt_report_line(\$report,"Warning description",$address->{warning_desc});
    _fmt_report_line(\$report,"Case all",$address->case_all);

    _fmt_report_line(\$report,"COMPONENTS",'');
    my %comps = $address->components;
    foreach my $comp ( sort keys %comps )
    {
        if ( defined($comps{$comp}) )
        {
            _fmt_report_line(\$report,$comp,$comps{$comp});
        }
    }

    return($report);
}
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# PRIVATE METHODS |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
sub _assemble
{
    # Parse $address->{input_string} and store the results on the address object.
    #
    # Arguments after the object are the pre cursor, PO box type, level and
    # building strings that the caller supplies separately. They are stored
    # as components as is, or as '' when false.
    #
    # On return $address->{components} holds one entry per component name
    # ('' when the component was not found) and $address->{properties} holds
    # the non matching text and the address type. Returns the address object.

    my ($address,$pre_cursor,$po_box_type,$level,$building) = @_;

    # Parse the address according to the rules defined in the AddressParse::Grammar module,
    # $::RD_TRACE = 1;  # for debugging RecDescent output
    # Use Parse::RecDescent to do the parsing. 'full_address' is a label for the complete grammar tree
    my $parsed = $address->{parse}->full_address($address->{input_string});

    # Place components into a separate hash, so they can be easily returned to the user to inspect and modify
    $address->{components} = {};
    my $comps = $address->{components};

    # Values passed in by the caller
    $comps->{pre_cursor}  = $pre_cursor  ? $pre_cursor  : '';
    $comps->{level}       = $level       ? $level       : '';
    $comps->{building}    = $building    ? $building    : '';
    $comps->{po_box_type} = $po_box_type ? $po_box_type : '';

    # Components copied straight from the parse tree
    foreach my $name ( qw/post_box road_box property_name/ )
    {
        $comps->{$name} = $parsed->{$name} ? $parsed->{$name} : '';
    }

    # Split the sub property into its type and identifier
    $comps->{sub_property_identifier} = '';
    $comps->{sub_property_type} = '';

    my $sub_property = $parsed->{sub_property};
    if ( $sub_property )
    {
        if ( $sub_property =~ /^(#|[A-Z]{1,}) (.*)$/ )
        {
            # Such as Unit 24, # 4A etc
            ($comps->{sub_property_type},$comps->{sub_property_identifier}) = ($1,$2);
        }
        elsif ( $sub_property =~ /^(\d\w\w) (.*)$/ )
        {
            # Such as 1st Floor
            ($comps->{sub_property_identifier},$comps->{sub_property_type}) = ($1,$2);
        }
    }

    $comps->{property_identifier} = $parsed->{property_identifier} ? $parsed->{property_identifier} : '';

    # Separate any direction prefix from the rest of the street name
    my $street_name = $parsed->{street_name};
    my ($direction,$base_name) = _get_street_direction($street_name);
    if ( $direction )
    {
        $comps->{base_street_name} = $base_name;
        $comps->{street_direction_prefix} = $direction;
    }
    else
    {
        # No prefix, the base name is the street name as parsed
        $comps->{base_street_name} = $street_name;
        $comps->{street_direction_prefix} = '';
    }

    $comps->{street_name} = '';
    $comps->{street_type} = '';
    if ( $street_name )
    {
        if ( $street_name eq 'THE ' )
        {
            # Streets such as 'The Corso' will parse as street_name = 'The' and
            # street_type = 'Corso', so join them back into the street name
            $comps->{street_name} = 'THE ' . $parsed->{street_type};
        }
        else
        {
            $comps->{street_name} = $street_name;
            $comps->{street_type} = $parsed->{street_type};
        }
    }

    foreach my $name ( qw/street_direction_suffix suburb/ )
    {
        $comps->{$name} = $parsed->{$name} ? $parsed->{$name} : '';
    }

    $comps->{subcountry} = '';
    my $sub_country = $parsed->{subcountry};
    if ( $sub_country )
    {
        my $chosen = $sub_country;

        # Force sub country to abbreviated form, South Australia becomes SA, Michigan become MI etc
        if ( $address->{abbreviate_subcountry} )
        {
            my $country = Locale::SubCountry->new($address->{country});
            my $code = $country->code($sub_country);

            # A code of 'unknown' means the sub country is already abbreviated
            $chosen = $code unless $code eq 'unknown';
        }
        $comps->{subcountry} = $chosen;
    }

    foreach my $name ( qw/post_code country/ )
    {
        $comps->{$name} = $parsed->{$name} ? $parsed->{$name} : '';
    }

    $address->{properties} = {};
    $address->{properties}{non_matching} = $parsed->{non_matching} ? $parsed->{non_matching} : '';
    $address->{properties}{type} = $parsed->{type};

    _trim_trailing_space($address);

    return($address);
}
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
933
|
|
|
|
|
|
|
# |
934
|
|
|
|
|
|
|
sub _get_street_direction
{
    # Split a leading compass direction off a street name.
    #
    # Returns an empty list when the street name is false. Otherwise returns
    # (direction, base street name). Both are undefined unless the name has
    # more than one word and the first word is a direction such as N, SE,
    # NORTH or STH.

    my ($street_name) = @_;

    return unless $street_name;

    my ($street_direction,$base_street_name);

    my ($first_word,@remaining_words) = split(/\s/,$street_name);

    # A single word is never treated as a direction on its own
    if ( @remaining_words and $first_word =~ /^(N|NE|NW|E|S|SE|SW|W|NORTH|EAST|SOUTH|WEST|NTH|STH)$/ )
    {
        $street_direction = $1;
        $base_street_name = join(' ',@remaining_words);
    }

    return($street_direction,$base_street_name);
}
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
961
|
|
|
|
|
|
|
# Check for several different types of syntax errors |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
sub _validate
{
    # Set the error and warning flags and descriptions on the address object.
    #
    # A non matching section, or an address type of 'unknown', is an error and
    # stops further checking. Otherwise a missing post code (when
    # force_post_code is set) and illegal characters are errors, and a street
    # or suburb word with no vowel sound raises a warning.

    my ($address) = @_;

    # Start from a clean state each time
    @{$address}{ qw/error error_desc warning warning_desc/ } = (0,'',0,'');

    if ( $address->{properties}{non_matching} )
    {
        $address->{error} = 1;
        $address->{error_desc} = 'non matching section : ' . $address->{properties}{non_matching};
    }
    elsif ( $address->{properties}{type} eq 'unknown' )
    {
        $address->{error} = 1;
        $address->{error_desc} .= 'unknown address format';
    }
    else
    {
        if ( $address->{force_post_code} and not $address->{components}{post_code} )
        {
            $address->{error} = 1;
            $address->{error_desc} .= ':no post code';
        }

        # Illegal characters found.
        # NOTE(review): the character class below does not list '#', so a '#'
        # in the input is reported as illegal; confirm this is intended for
        # USA addresses that use '#' as an abbreviation for number.
        if ( $address->{input_string} =~ /[^"A-Z0-9'\-\.,\/ ]/ )
        {
            # Note, if auto_clean is on, illegal characters will have been removed
            # for second parsing and no error flag or message reported
            $address->{error} = 1;
            $address->{error_desc} .= ':illegal chars';
        }

        # Street names containing a digit are exempt from the vowel check
        # (ordinal or single letter street names)
        if ( $address->{properties}{type} eq 'suburban' and $address->{components}{street_name} !~ /\d/ )
        {
            my $base_name = $address->{components}{base_street_name};
            if ( _check_vowel($base_name) )
            {
                # street name must have a vowel sound,
                $address->{warning} = 1;
                $address->{warning_desc} .= ";no vowel sound in street word : $base_name";
            }
        }

        if ( _check_vowel($address->{components}{suburb}) )
        {
            $address->{warning} = 1;
            $address->{warning_desc} .= ";no vowel sound in suburb word : $address->{components}{suburb}";
        }
    }
}
1022
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1023
|
|
|
|
|
|
|
# Purge the input string of illegal or redundant characters. |
1024
|
|
|
|
|
|
|
# Correct malformed patterns |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
sub _clean
{
    # Return a cleaned copy of $address->{input_string}.
    #
    # Illegal and redundant characters are removed and a number of malformed
    # or abbreviated patterns are normalised. The substitutions depend on each
    # other, so they are applied strictly in the order written. The address
    # object itself is not modified.

    my ($address) = @_;

    my $input = $address->{input_string};
    my $is_us = ( $address->{country_code} eq 'US' );

    for ($input)
    {
        # Remove annotations enclosed in brackets, such as 1 Smith St (Cnr Brown St)
        s|\(.*\)||;

        # Normalise half house numbers, such as 12.5 to 12 1/2. This is needed
        # now, before full stops are stripped out
        s|^(\d{1,4})\.5 |$1 1/2 |;

        # Keep only letters, digits, slash, quotes, space and dash. Full stops
        # and all other characters are removed. Quotes can occur as property
        # name delimiters.
        # NOTE(review): '&' and '#' are removed here as well, so the '#'
        # substitutions in the US branch below cannot match; confirm whether
        # they were meant to be kept.
        s|[^A-Za-z0-9/'" -]||go;

        # remove repeating, leading and trailing spaces
        s| +| |go ;
        s|^ ||;
        s| $||;

        # Expand abbreviations that are too short
        s/LAKE ST (GEORGE|CLAIR)/LAKE SAINT $1/;  # otherwise St gets consumed too early as 'Street'
        s| CSEWY | CAUSEWAY |;

        s|^FCTR?Y |FACTORY |;
        s|^FACT?R?Y? |FACTORY |;

        # sub property identifiers
        s|LVL |LEVEL |;
        s|^UN? |UNIT |;
        s|^U(\d+)|UNIT $1|;

        # Fix badly formed number dividers such as home unit format of 14/ 12 becomes 14/12, 2- 7A becomes 2-7A
        s|/ |/|;
        s| /|/|;
        s|- |-|;
        s| -|-|;

        # Remove redundant spaces in property identifiers, 21 B Smith St becomes 21B Smith St
        # Don't remove space before single letter streets such as 21 B Street
        unless ( /^\d+ [A-Z] (ST|AVE)/ )
        {
            if ( $is_us )
            {
                # Note cannot use N,E,S,W as they can be street direction prefix, as in 1 E MAIN STREET
                # Assume that the direction prefix is the more likely case
                s|^(\d+) ([A-DF-MO-RT-VX-Z] )|$1$2|;
            }
            else
            {
                s|^(\d+) ([A-Z] )|$1$2|;
            }
        }

        # Add or remove spaces in sub property identifiers
        if ( $is_us )
        {
            # Fix US sub property identifiers that appear after street name and type
            # add space between # and the number so #2 becomes '# 2'
            s| #(\d)| # $1|;
            s| #([A-Z])| # $1|;
            s| (APT)(\d)| $1 $2|i;

            # remove redundant space so # 34 B becomes # 34B
            s| # (\d+) (\w) | # $1$2 |;

            # remove redundant '#'
            s| APT #| APT |;
        }
        else
        {
            # Add a space to separate sub property type from number, UNIT2 becomes UNIT 2
            s/^(UNIT|LOT|APT|SHOP)(\d)/$1 $2/;
        }

        # Remove redundant slash or dash
        # Unit 1B/22, becomes Unit 1B 22, Flat 2-12 becomes Flat 2 12
        # TO DO, add |# at start
        s/^([A-Z]{2,}) (\d+[A-Z]?)[\/-]/$1 $2 /;
        # Unit J1/ 39 becomes Unit J1 39
        s/^([A-Z]{2,}) ([A-Z]\d{0,3})[\/-]/$1 $2 /;

        # remove dash that is not from a sequence, such as D-5 or 22-A
        s|([A-Z])-(\d)|$1$2|;
        s|(\d)-([A-Z])|$1$2|;
    }

    return($input);
}
1125
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1126
|
|
|
|
|
|
|
# Remove any "care of" type of precursor from the main address |
1127
|
|
|
|
|
|
|
# such as: C/O BRAKEFIELD BETTY S PO BOX 214 GULF HAMMOCK, FL 32639-0214 |
1128
|
|
|
|
|
|
|
# It will later be saved as an attribute in the address object |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
sub _extract_precursor
{
    # Separate a leading "care of" or "attention" section from the address.
    #
    # The pre cursor must start the string with C/O or ATTN and is ended by
    # the first following number or 'PO BOX'. Returns (pre cursor, remaining
    # address), or ('', input) when no pre cursor is found.

    my ($input) = @_;

    if ( my ($pre_cursor,$address_start,$address_end) = $input =~ m{^(C/O.*?|ATTN.*?) (\d+|PO BOX)( .*)} )
    {
        return($pre_cursor, $address_start . $address_end);
    }

    return('',$input);
}
1147
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1148
|
|
|
|
|
|
|
# Remove any level or floor info such as: |
1149
|
|
|
|
|
|
|
# 12 Smith St Floor 2 |
1150
|
|
|
|
|
|
|
# Level 22 Suite 3 12 Main St |
1151
|
|
|
|
|
|
|
# It will later be saved as an attribute in the address object |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
sub _extract_level
{
    # Separate level or floor information from the address.
    #
    # Recognises forms such as '2ND FLOOR ', 'LEVEL 12 ', 'LEVEL G/' and
    # 'FLR 3-'. Returns (level, remaining input). The level is undefined and
    # the input is returned unchanged when no level is found. A '/' or '-'
    # that ended the level is removed from both the level and the input.

    my ($input) = @_;
    my ($level);

    if
    (
        # Level info could be at start of string so first space is optional
        $input =~ / ?(\d{1,2}(ST|ND|RD|TH) (FLOOR|FLR|FL) )/ or
        $input =~ / ?(LEVEL (\d{1,2}|[GM])[\/ -])/ or
        $input =~ / ?((FLOOR|FLR|FL) \d{1,2}[\/ -])/
    )
    {
        # Text exactly as it appears in the input, including any separator
        my $matched = $1;

        $level = $matched;
        $level =~ s|/||;
        $level =~ s|-||;

        # Remove the text as matched, not the stripped level, so that a
        # trailing '/' or '-' is not left at the front of what follows.
        # The text is quoted so that it is always treated literally.
        $input =~ s/\Q$matched\E//;
    }

    return($level,$input);
}
1174
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1175
|
|
|
|
|
|
|
# Remove any building info such as: |
1176
|
|
|
|
|
|
|
# Building 2 Level 12 123 Smith St |
1177
|
|
|
|
|
|
|
# 12 Main St Tower A Level 2 |
1178
|
|
|
|
|
|
|
# It will later be saved as an attribute in the address object |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
sub _extract_building
{
    # Separate building information from the address.
    #
    # A building is one of BLOCK, BLD, BLDG, BUILDING or TOWER followed by an
    # identifier and a space. Returns (building, remaining input). The
    # building is undefined and the input is returned unchanged when none is
    # found.

    my ($input) = @_;
    my ($building);

    my $bld = qr{BLOCK|BLDG?|BUILDING|TOWER};

    # Tried in this order, the first pattern that matches is used
    my @patterns =
    (
        qr/ ?(($bld) ([A-Z]{1,2}|\d+) )/,   # BLD 12 or AA
        qr/ ?(($bld) \d{1,3}[A-Z] )/,       # BLD 32C
        qr/ ?(($bld) [A-Z]\d{1,3} )/        # BLD C12
    );

    foreach my $pattern (@patterns)
    {
        next unless $input =~ $pattern;

        $building = $1;
        for ($building)
        {
            s|/||;
            s|-||;
        }
        $input =~ s/$building//;
        last;
    }

    return($building,$input);
}
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
1204
|
|
|
|
|
|
|
# Remove any description that follows the suburb after the main address |
1205
|
|
|
|
|
|
|
# such as: PO BOX 1305 BIBRA LAKE PRIVATE BOXES WA 6965" |
1206
|
|
|
|
|
|
|
# It will be saved as an address attribute |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
sub _extract_po_box_type
{
    # Separate a 'PRIVATE BOXES' description from the address.
    #
    # Returns (po box type, address with the description removed), or
    # ('', input) when the description is not present. The description must
    # have text on both sides of it.

    my ($input) = @_;

    if ( my ($address_start,$po_box_type,$address_end) = $input =~ /^(.*?) (PRIVATE BOXES)( .*)$/ )
    {
        return($po_box_type, $address_start . $address_end);
    }

    return('',$input);
}
1225
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1226
|
|
|
|
|
|
|
# For correct matching, the grammar of each component must include the |
1227
|
|
|
|
|
|
|
# trailing space that separates it from any following word. This should |
1228
|
|
|
|
|
|
|
# now be removed from each component |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
sub _trim_trailing_space
{
    # For correct matching, the grammar of each component includes the
    # trailing space that separates it from any following word. Remove one
    # such space from the end of every non empty component, in place.

    my ($address) = @_;

    # The loop variable aliases each hash value, so the hash is updated directly
    foreach my $value ( values %{ $address->{components} } )
    {
        $value =~ s/ $//g if $value;
    }
}
1242
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
sub _fmt_report_line
{
    # Append one line to the report held in the scalar that $report_ref
    # points to. The label is left justified and padded or truncated to 23
    # characters, followed by the value in single quotes and a newline.

    my ($report_ref,$label,$value) = @_;

    my $line = sprintf("%-23.23s '%s'\n",$label,$value);
    ${$report_ref} .= $line;
}
1250
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
sub _check_vowel
{
    # Return 1 if any space separated word in the string, longer than one
    # character, has no vowel sound, otherwise return 0. A word passes when
    # it contains A, E, I, O, U or Y, or one of the abbreviations ST, MT,
    # JR, JNR, SR or SNR (Saint, Mount, Junior, Senior as in Martin Luther
    # King Snr).

    my ($str) = @_;

    my @vowelless_words =
        grep { length($_) > 1 and $_ !~ /[AEIOUY]|ST|MT|JN?R|SN?R/ } split(/ /,$str);

    return( @vowelless_words ? 1 : 0 );
}
1267
|
|
|
|
|
|
|
#------------------------------------------------------------------------------ |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
return(1); |