File Coverage

lib/Geo/Address/Formatter.pm
Criterion Covered Total %
statement 528 602 87.7
branch 210 276 76.0
condition 54 76 71.0
subroutine 42 47 89.3
pod 4 4 100.0
total 838 1005 83.3


line stmt bran cond sub pod time code
1             package Geo::Address::Formatter;
2             $Geo::Address::Formatter::VERSION = '1.9992';
3             # ABSTRACT: take structured address data and format it according to the various global/country rules
4              
5 7     7   2492226 use strict;
  7         15  
  7         284  
6 7     7   35 use warnings;
  7         17  
  7         584  
7 7     7   58 use feature qw(say);
  7         12  
  7         1132  
8 7     7   3129 use Clone qw(clone);
  7         3331  
  7         613  
9 7     7   1243 use Data::Dumper;
  7         15293  
  7         549  
10             $Data::Dumper::Sortkeys = 1;
11 7     7   48 use File::Basename qw(dirname);
  7         26  
  7         394  
12 7     7   3377 use Ref::Util qw(is_hashref);
  7         15771  
  7         626  
13 7     7   65 use Scalar::Util qw(looks_like_number);
  7         13  
  7         447  
14 7     7   3540 use Text::Hogan::Compiler;
  7         56063  
  7         271  
15 7     7   3219 use Try::Catch;
  7         5946  
  7         630  
16 7     7   1653 use YAML::XS qw(LoadFile);
  7         13730  
  7         503  
17 7     7   50 use utf8;
  7         14  
  7         69  
18              
19             my $THC = Text::Hogan::Compiler->new;
20              
21             # optional params
22             my $show_warnings = 1;
23             my $debug = 0;
24             my $only_address = 0;
25              
26              
27              
28              
29             my $instance;
30              
31             sub instance {
32 0     0 1 0 my ($class, %params) = @_;
33            
34 0 0       0 unless ($instance) {
35 0         0 $instance = $class->new(%params);
36             }
37              
38 0 0       0 say STDERR "************* in Geo::Address::Formatter::instance ***" if ($debug);
39            
40             # clear these fields that change on each run
41 0         0 $instance->{final_components} = undef;
42 0         0 $instance->{set_district_alias} = {};
43              
44 0         0 return $instance;
45             }
46              
47             sub new {
48 20     20 1 45113 my ($class, %params) = @_;
49              
50 20         69 my $self = {};
51 20   50     179 my $conf_path = $params{conf_path} || die "no conf_path set";
52              
53             # optional params
54 20 50 66     96 if ( defined($params{no_warnings}) && ($params{no_warnings})){
55 1         3 $show_warnings = 0;
56             }
57 20   66     116 $only_address = (defined($params{only_address}) && $params{only_address}) // 0;
      50        
58 20   66     102 $debug = (defined($params{debug}) && $params{debug}) // 0;
      50        
59              
60 20         59 $self->{final_components} = undef;
61 20         44 $self->{set_district_alias} = {};
62              
63 20         49 bless($self, $class);
64              
65 20 50       63 say STDERR "************* in Geo::Address::Formatter::new ***" if ($debug);
66              
67             # is slow because lots of conf, and pre-compiling
68 20 100       99 if ($self->_read_configuration($conf_path)){
69 19         185 return $self;
70             }
71 1         21 die 'unable to read configuration';
72             }
73              
74             sub _read_configuration {
75 20     20   40 my $self = shift;
76 20         39 my $path = shift;
77              
78 20 50       798 return if (! -e $path);
79             # components file
80 20         78 my $compyaml = $path . '/components.yaml';
81 20 100       410 return if (! -e $compyaml);
82              
83 19         88 $self->{templates} = {};
84 19         50 $self->{component_aliases} = {};
85 19         44 $self->{component2type} = {};
86 19         74 $self->{ordered_components} = [];
87              
88             # read the config file(s)
89 19         38 my $loaded = 0;
90              
91 19         36 my $wwfile = $path . '/countries/worldwide.yaml';
92 19 50       53 say STDERR "loading templates $wwfile" if ($debug);
93              
94 19 50       341 return if (! -e $wwfile);
95              
96             try {
97 19     19   638 my $rh_templates = LoadFile($wwfile);
98 19         50482 foreach (keys %$rh_templates) {
99 3855         6804 $self->{templates}{$_} = $rh_templates->{$_};
100 3855         5451 $loaded = 1;
101             }
102             } catch {
103 0     0   0 warn "error parsing country configuration in $wwfile";
104 0         0 return;
105 19         914 };
106 19 50       703 return if ($loaded == 0); # no templates
107              
108             # see if we can load the components
109             try {
110 19 50   19   500 say STDERR "loading components" if ($debug);
111 19         79 my @c = LoadFile($compyaml);
112              
113 19 50       5069 if ($debug){
114 0         0 say STDERR Dumper \@c;
115             }
116              
117 19         54 foreach my $rh_c (@c) {
118 304 50       583 if (defined($rh_c->{name})){
119 304 100       499 if (defined($rh_c->{aliases})){
120 187         452 $self->{component_aliases}{$rh_c->{name}} = $rh_c->{aliases};
121             } else {
122 117         329 $self->{component_aliases}{$rh_c->{name}} = [];
123             }
124             }
125             }
126              
127 19         55 foreach my $rh_c (@c) {
128 304         370 push(@{$self->{ordered_components}}, $rh_c->{name});
  304         13358  
129 304         729 $self->{component2type}->{$rh_c->{name}} = $rh_c->{name};
130              
131 304 100       646 if (defined($rh_c->{aliases})) {
132 187         218 foreach my $alias (@{$rh_c->{aliases}}) {
  187         312  
133 625         753 push(@{$self->{ordered_components}}, $alias);
  625         1088  
134 625         1605 $self->{component2type}->{$alias} = $rh_c->{name};
135             }
136             }
137             }
138 19         33 my %h_known = map { $_ => 1 } @{$self->{ordered_components}};
  929         1791  
  19         78  
139 19         139 $self->{h_known} = \%h_known;
140              
141 19 50       299 if ($debug){
142 0         0 say STDERR 'component_aliases';
143 0         0 say STDERR Dumper $self->{component_aliases};
144 0         0 say STDERR 'component2type';
145 0         0 say STDERR Dumper $self->{component2type};
146             }
147             } catch {
148 0     0   0 warn "error parsing component configuration: $_";
149 19         246 };
150              
151             # get the county and state codes and country2lang conf
152 19         566 my @conf_files = qw(county_codes state_codes country2lang);
153 19         51 foreach my $cfile (@conf_files) {
154 57         136240 $self->{$cfile} = {};
155 57         132 my $yfile = $path . '/' . $cfile . '.yaml';
156 57 100       1614 if (-e $yfile) {
157             try {
158 51     51   1377 $self->{$cfile} = LoadFile($yfile);
159             } catch {
160 0     0   0 warn "error parsing $cfile configuration: $_";
161 51         688 };
162             }
163             }
164              
165 19         10771 my $abbrvdir = $path . '/abbreviations';
166              
167 19 100       485 if (-d $abbrvdir){
168 15 50       791 opendir my $dh, $abbrvdir
169             or die "Could not open '$abbrvdir' for read: $!\n";
170              
171 15         718 while (my $file = readdir $dh) {
172             # say STDERR "file: $file";
173 361 100       74590 if ($file =~ m/^(\w\w)\.yaml$/) {
174 331         1065 my $lang = $1; # two letter lang code like 'en'
175 331         732 my $abbrvfile = $abbrvdir . '/' . $file;
176             try {
177 331     331   8435 $self->{abbreviations}->{$lang} = LoadFile($abbrvfile);
178             } catch {
179 0     0   0 warn "error parsing abbrv conf in $abbrvfile: $_";
180 331         2538 };
181             }
182             }
183 15         3321 closedir $dh;
184             }
185              
186             #say Dumper $self->{abbreviations};
187             #say Dumper $self->{country2lang};
188              
189 19         165 $self->_precompile_rules();
190 19         126 $self->_build_code_lookups();
191 19         123 $self->_precompile_abbreviations();
192              
193 19         202 return 1;
194             }
195              
196             sub _precompile_rules {
197 19     19   50 my $self = shift;
198              
199 19         40 foreach my $cc (keys %{$self->{templates}}) {
  19         945  
200 3855         6410 my $tpl = $self->{templates}{$cc};
201 3855 100       6226 next unless is_hashref($tpl);
202              
203             # Pre-compile replace rules
204 3500 100       6258 if (defined($tpl->{replace})) {
205 251         293 my @compiled;
206 251         306 foreach my $ra_fromto (@{$tpl->{replace}}) {
  251         486  
207 853         1451 my $pattern = $ra_fromto->[0];
208 853         1262 my $replacement = $ra_fromto->[1];
209 853         1123 my ($comp_name, $exact_match, $re);
210              
211             # detect component-specific rules like "state=Some Value"
212 853 100       2618 if ($pattern =~ m/^(\w+)=(.+)$/) {
213 208         488 $comp_name = $1;
214 208         405 my $rest = $2;
215             # try to compile as regex; if it fails, it's an exact match
216 208         298 eval { $re = qr/$rest/i };
  208         2901  
217 208 50       466 if ($@) {
218 0         0 warn "invalid replacement regex '$rest' for $cc, skipping";
219 0         0 next;
220             }
221 208         300 $exact_match = $rest;
222             } else {
223 645         861 eval { $re = qr/$pattern/i };
  645         9739  
224 645 50       1434 if ($@) {
225 0         0 warn "invalid replacement regex '$pattern' for $cc, skipping";
226 0         0 next;
227             }
228             }
229 853         3640 push @compiled, {
230             component => $comp_name,
231             exact_match => $exact_match,
232             re => $re,
233             replacement => $replacement,
234             };
235             }
236 251         590 $tpl->{_compiled_replace} = \@compiled;
237             }
238              
239             # Pre-compile postformat_replace rules
240 3500 100       6524 if (defined($tpl->{postformat_replace})) {
241 461         613 my @compiled;
242 461         582 foreach my $ra_fromto (@{$tpl->{postformat_replace}}) {
  461         908  
243 801         1355 my $pattern = $ra_fromto->[0];
244 801         1109 my $replacement = $ra_fromto->[1];
245 801         1015 my $re;
246 801         1006 eval { $re = qr/$pattern/ };
  801         16231  
247 801 50       1797 if ($@) {
248 0         0 warn "invalid postformat regex '$pattern' for $cc, skipping";
249 0         0 next;
250             }
251 801         2665 push @compiled, {
252             re => $re,
253             replacement => $replacement,
254             };
255             }
256 461         1228 $tpl->{_compiled_postformat} = \@compiled;
257             }
258             }
259 19         478 return;
260             }
261              
262             sub _build_code_lookups {
263 19     19   62 my $self = shift;
264              
265 19         46 foreach my $type (qw(state_codes county_codes)) {
266 38         90 my $reverse_key = $type . '_reverse';
267 38         68 my $name_key = $type . '_name';
268 38         192 $self->{$reverse_key} = {};
269 38         114 $self->{$name_key} = {};
270              
271 38   50     186 my $data = $self->{$type} // next;
272 38         452 foreach my $cc (keys %$data) {
273 1593         3214 my $mapping = $data->{$cc};
274 1593         2267 my $rev = {};
275 1593         1998 my $name = {};
276              
277 1593         12742 foreach my $code (keys %$mapping) {
278 33864         58310 my $val = $mapping->{$code};
279 33864 100       53478 if (is_hashref($val)) {
280             # hash with default, alt, alt_XX keys
281 5676         10005 my $default_name = $val->{default};
282 5676 50       14653 $name->{$code} = $default_name if defined $default_name;
283 5676         12142 foreach my $v (values %$val) {
284 12223         41482 $rev->{uc($v)} = $code;
285             }
286             } else {
287             # simple string value
288 28188         52810 $name->{$code} = $val;
289 28188         75164 $rev->{uc($val)} = $code;
290             }
291             # code-to-code identity lookup
292 33864         67575 $rev->{$code} = $code;
293             }
294 1593         7537 $self->{$reverse_key}{$cc} = $rev;
295 1593         4450 $self->{$name_key}{$cc} = $name;
296             }
297             }
298 19         58 return;
299             }
300              
301             sub _precompile_abbreviations {
302 19     19   58 my $self = shift;
303 19   100     118 my $abbr = $self->{abbreviations} // return;
304              
305 15         178 foreach my $lang (keys %$abbr) {
306 331         576 foreach my $comp_name (keys %{$abbr->{$lang}}) {
  331         1236  
307 424         821 my $rh_pairs = $abbr->{$lang}{$comp_name};
308 424         551 my @compiled;
309 424         2685 foreach my $long (keys %$rh_pairs) {
310             push @compiled, {
311             re => qr/(^|\s)\Q$long\E\b/,
312 5715         170706 short => $rh_pairs->{$long},
313             };
314             }
315 424         3132 $self->{compiled_abbreviations}{$lang}{$comp_name} = \@compiled;
316             }
317             }
318 15         127 return;
319             }
320              
321              
322             sub final_components {
323 6     6 1 2594 my $self = shift;
324 6 100       29 if (defined($self->{final_components})) {
325 5         27 return $self->{final_components};
326             }
327 1 50       28 warn 'final_components not yet set' if ($show_warnings);
328 1         104 return;
329             }
330              
331              
332             sub format_address {
333 488     488 1 736384 my $self = shift;
334 488   50     16104 my $rh_components = clone(shift) || return;
335 488   100     2696 my $rh_options = shift || {};
336              
337             # 1. make sure empty at the beginning
338 488         2701 $self->{final_components} = undef;
339              
340 488 50       1671 if ($debug){
341 0         0 say STDERR "*** in format_address ***";
342 0         0 say STDERR Dumper $rh_options;
343 0         0 say STDERR Dumper $rh_components;
344             }
345              
346             # 2. deal with the options
347              
348 488         2170 foreach (grep { ! /^[a-z0-9_]+$/ } keys %$rh_components) {
  3802         10703  
349 7 50       17 if ($debug){
350 0         0 say STDERR "Skipping compontent $_";
351             }
352 7         16 delete $rh_components->{$_};
353             }
354              
355             # 2a. which country format will we use?
356             # might have been specified in options
357             # otherwise look at components
358             my $cc = $rh_options->{country}
359 488   100     2877 || $self->_determine_country_code($rh_components)
360             || '';
361              
362 488 100       1319 if ($cc) {
363 484         1006 $rh_components->{country_code} = $cc;
364 484         1486 $self->_set_district_alias($cc);
365             }
366              
367             # 2b. should we abbreviate?
368 488   100     1934 my $abbrv = $rh_options->{abbreviate} // 0;
369              
370             # 2c. was only_address set at the formatting level
371 488         839 my $oa = $only_address;
372 488 100       1136 if (defined($rh_options->{only_address})){
373 2         6 $oa = $rh_options->{only_address};
374             }
375              
376 488 50       1031 if ($debug){
377 0         0 say STDERR "component_aliases";
378 0         0 say STDERR Dumper $self->{component_aliases};
379             }
380             # done with the options
381              
382             # 3. deal wtih terrible inputs
383 488         1642 $self->_sanity_cleaning($rh_components);
384 488 50       1032 if ($debug){
385 0         0 say STDERR "after sanity_cleaning applied";
386 0         0 say STDERR Dumper $rh_components;
387             }
388              
389             # 4. set the aliases, unless this would overwrite something
390             # need to do this in the right order (as defined in the components file)
391             # For example:
392             # both 'city_district' and 'suburb' are aliases of 'neighbourhood'
393             # so which one should we use if both are present?
394             # We should use the one defined first in the list
395              
396 488         785 my $rhh_p2a;
397 488         1253 foreach my $c (keys %$rh_components){
398              
399             # might not need an alias as it is a primary type
400 3797 100       8746 next if (defined($self->{component_aliases}{$c}));
401              
402             # it is not a primary type
403             # is there an alias?
404 767 100       2082 if (defined($self->{component2type}{$c})){
405 528         1148 my $ptype = $self->{component2type}{$c};
406             # but is it already set?
407 528 100       1424 if (! defined($rh_components->{$ptype}) ){
408             # no, we will set it later
409 415         1492 $rhh_p2a->{$ptype}{$c} = 1;
410              
411             }
412             }
413             }
414              
415             # now we know which primary types have aliases
416 488         1494 foreach my $ptype (keys %$rhh_p2a){
417             # is there more than one?
418 374         528 my @aliases = keys %{$rhh_p2a->{$ptype}};
  374         1255  
419 374 100       1002 if (scalar @aliases == 1){
420 334         974 $rh_components->{$ptype} = $rh_components->{$aliases[0]};
421 334         822 next; # we are done with this ptype
422             }
423              
424             # if there is more than one we need to go through the list
425             # so we do them in the right order
426 40         77 foreach my $c (@{$self->{component_aliases}->{$ptype}}){
  40         128  
427 45 100       162 if (defined($rh_components->{$c})){
428 40         134 $rh_components->{$ptype} = $rh_components->{$c};
429 40         121 last; # we are done with this ptype
430             }
431             }
432             }
433              
434 488 50       1230 if ($debug){
435 0         0 say STDERR "after component_aliases applied";
436 0         0 say STDERR Dumper $rh_components;
437             }
438              
439             # 5. determine the template
440 488         802 my $template_text;
441 488   66     1923 my $rh_config = $self->{templates}{uc($cc)} || $self->{templates}{default};
442            
443 488 100       1127 if (defined($rh_options->{address_template})) {
444 2         6 $template_text = $rh_options->{address_template};
445             }
446             else {
447              
448 486 100       1217 if (defined($rh_config->{address_template})) {
    50          
449 484         1075 $template_text = $rh_config->{address_template};
450             } elsif (defined($self->{templates}{default}{address_template})) {
451 2         5 $template_text = $self->{templates}{default}{address_template};
452             }
453            
454             # do we have the minimal components for an address?
455             # or should we instead use the fallback template?
456 486 100       1459 if (!$self->_minimal_components($rh_components)) {
457 57 50       176 say STDERR "using fallback" if ($debug);
458 57 100       237 if (defined($rh_config->{fallback_template})) {
    100          
459 42         139 $template_text = $rh_config->{fallback_template};
460             } elsif (defined($self->{templates}{default}{fallback_template})) {
461 14         48 $template_text = $self->{templates}{default}{fallback_template};
462             }
463             # no fallback
464             }
465              
466             }
467              
468 488 50       1047 say STDERR 'template text: ' . $template_text if ($debug);
469              
470             # 6. clean up the components, possibly add codes
471 488         1621 $self->_fix_country($rh_components);
472 488 50       993 if ($debug){
473 0         0 say STDERR "after fix_country";
474 0         0 say STDERR Dumper $rh_components;
475             }
476              
477             # apply replacements if we have any
478 488 100       1197 if (defined($rh_config->{_compiled_replace})){
479 160         622 $self->_apply_replacements($rh_components, $rh_config->{_compiled_replace});
480 160 50       412 if ($debug){
481 0         0 say STDERR "after applying_replacements applied";
482 0         0 say STDERR Dumper $rh_components;
483             }
484             }
485              
486 488         1495 $self->_add_state_code($rh_components);
487 488         1440 $self->_add_county_code($rh_components);
488 488 50       987 if ($debug){
489 0         0 say STDERR "after adding codes";
490 0         0 say STDERR Dumper $rh_components;
491             }
492              
493             # 7. add the attention, if needed
494 488 50       942 if ($debug){
495 0         0 say STDERR "object level only_address: $only_address";
496 0         0 say STDERR "formatting level only_address: $oa";
497             }
498              
499 488 100       1125 if ($oa){
500 3 50       9 if ($debug){
501 0         0 say STDERR "not looking for unknown_components";
502 0         0 say STDERR "only_address was specified";
503             }
504             }
505             else {
506 485         1578 my $ra_unknown = $self->_find_unknown_components($rh_components);
507 485 100       1243 if (scalar(@$ra_unknown)){
508 230 50       569 if ($debug){
509 0         0 say STDERR "unknown_components:";
510 0         0 say STDERR Dumper $ra_unknown;
511             }
512             # need to sort for consistency
513             # FIXME - add better sorting based on meaning of the values
514             $rh_components->{attention} =
515 230         781 join(', ', map { $rh_components->{$_} } sort @$ra_unknown);
  236         1241  
516 230 50       710 if ($debug){
517 0         0 say STDERR "putting unknown_components in 'attention'";
518             }
519             }
520             }
521              
522             # 8. abbreviate, if needed
523 488 100       1124 if ($abbrv) {
524 10         32 $rh_components = $self->_abbreviate($rh_components);
525             }
526              
527             # 9. prepare the template
528 488         1514 $template_text = $self->_replace_template_lambdas($template_text);
529              
530             # 10. compiled the template
531 488   66     3384 my $compiled_template = $self->{compiled_template_cache}{$template_text}
532             //= $THC->compile($template_text, {'numeric_string_as_string' => 1});
533              
534 488 50       918039 if ($debug){
535 0         0 say STDERR "before _render_template";
536 0         0 say STDERR Dumper $rh_components;
537 0         0 say STDERR "template: ";
538 0         0 say STDERR Dumper $compiled_template;
539             }
540              
541              
542             # 11. render the template
543 488         1593 my $text = $self->_render_template($compiled_template, $rh_components);
544 488 50       1084 if ($debug){
545 0         0 say STDERR "text after _render_template $text";
546             }
547              
548             # 12. postformatting
549 488         2296 $text = $self->_postformat($text, $rh_config->{_compiled_postformat});
550              
551             # 13. clean again
552 488         4988 $text = $self->_clean($text);
553              
554             # 14. set final components (so we can get them later)
555 488         1330 $self->{final_components} = $rh_components;
556              
557             # all done
558 488         14334 return $text;
559             }
560              
561             # remove duplicates ("Berlin, Berlin"), do replacements and similar
562             sub _postformat {
563 490     490   2289 my $self = shift;
564 490         679 my $text = shift;
565 490         997 my $raa_rules = shift;
566              
567 490 50       1170 if ($debug){
568 0         0 say STDERR "entering _postformat: $text";
569 0         0 say STDERR Dumper $raa_rules;
570             }
571              
572             # remove duplicates
573 490         1339 my @before_pieces = split(/, /, $text);
574 490         873 my %seen;
575             my @after_pieces;
576 490         893 foreach my $piece (@before_pieces) {
577 608         1212 $piece =~ s/^\s+//g;
578 608         1469 $seen{$piece}++;
579 608 100       1809 if (lc($piece) ne 'new york') {
580 604 100       1327 next if ($seen{$piece} > 1);
581             }
582 607         1175 push(@after_pieces, $piece);
583             }
584 490         1220 $text = join(', ', @after_pieces);
585              
586             # do any country specific rules
587 490         1115 foreach my $rule (@$raa_rules) {
588 554         981 my $regexp = $rule->{re};
589 554         869 my $replacement = $rule->{replacement};
590              
591             # ultra hack to do substitution
592             # limited to $1 and $2, should really be a while loop
593             # doing every substitution
594              
595 554 100       1287 if ($replacement =~ m/\$\d/) {
596 50 100       541 if ($text =~ m/$regexp/) {
597 21         63 my $tmp1 = $1;
598 21         45 my $tmp2 = $2;
599 21         58 my $tmp3 = $3;
600 21         73 $replacement =~ s/\$1/$tmp1/;
601 21         70 $replacement =~ s/\$2/$tmp2/;
602 21         58 $replacement =~ s/\$3/$tmp3/;
603             }
604             }
605 554         2587 $text =~ s/$regexp/$replacement/;
606             }
607 490         1646 return $text;
608             }
609              
610             sub _sanity_cleaning {
611 489     489   2024 my $self = shift;
612 489   50     1164 my $rh_components = shift || return;
613              
614             # catch insane postcodes
615 489 100       2987 if (defined($rh_components->{'postcode'})) {
616 332 100       2041 if (length($rh_components->{'postcode'}) > 20) {
    100          
    100          
617 1         4 delete $rh_components->{'postcode'};
618             } elsif ($rh_components->{'postcode'} =~ m/\d+;\d+/) {
619             # sometimes OSM has postcode ranges
620 1         2 delete $rh_components->{'postcode'};
621             } elsif ($rh_components->{'postcode'} =~ m/^(\d{5}),\d{5}/) {
622 1         4 $rh_components->{'postcode'} = $1;
623             }
624             }
625              
626             # remove things that might be empty
627 489         2000 foreach my $c (keys %$rh_components) {
628             # catch empty values
629 3800 50       16289 if (!defined($rh_components->{$c})) {
    100          
    100          
630 0         0 delete $rh_components->{$c};
631             }
632             # no chars
633             elsif ($rh_components->{$c} !~ m/\w/) {
634 1         3 delete $rh_components->{$c};
635             }
636             # catch values containing URLs
637             elsif ($rh_components->{$c} =~ m|https?://|) {
638 1         4 delete $rh_components->{$c};
639             }
640             }
641 489         1233 return;
642             }
643              
644             sub _minimal_components {
645 486     486   786 my $self = shift;
646 486   50     1028 my $rh_components = shift || return;
647 486         1542 my @required_components = qw(road postcode); #FIXME - should be in conf
648 486         747 my $missing = 0; # num required components that are missing
649              
650 486         700 my $minimal_threshold = 2;
651 486         945 foreach my $c (@required_components) {
652 972 100       2432 $missing++ if (!defined($rh_components->{$c}));
653 972 100       2054 return 0 if ($missing == $minimal_threshold);
654             }
655 429         1442 return 1;
656             }
657              
658             my %valid_replacement_components = ('state' => 1,);
659              
660             # determines which country code to use
661             # may also override other configuration if we are dealing with
662             # a dependent territory
663             sub _determine_country_code {
664 489     489   926 my $self = shift;
665 489   50     1331 my $rh_components = shift || return;
666              
667             # FIXME - validate it is a valid country
668 489 100       1458 return if (!defined($rh_components->{country_code}));
669              
670 485 50       1882 if (my $cc = lc($rh_components->{country_code})) {
671              
672             # is it two letters long?
673 485 50       1268 return if (length($cc) != 2);
674 485 50       1197 return 'GB' if ($cc eq 'uk');
675              
676 485         1011 $cc = uc($cc);
677              
678             # check if the configuration tells us to use
679             # the configuration of another country
680             # used in cases of dependent territories like
681             # American Samoa (AS) and Puerto Rico (PR)
682 485 100 100     4692 if ( defined($self->{templates}{$cc})
683             && defined($self->{templates}{$cc}{use_country}))
684             {
685 46         100 my $old_cc = $cc;
686 46         119 $cc = $self->{templates}{$cc}{use_country};
687 46 100       172 if (defined($self->{templates}{$old_cc}{change_country})) {
688              
689 35         87 my $new_country = $self->{templates}{$old_cc}{change_country};
690 35 100       147 if ($new_country =~ m/\$(\w*)/) {
691 2         3 my $component = $1;
692 2 50       6 if (defined($rh_components->{$component})) {
693 2         28 $new_country =~ s/\$$component/$rh_components->{$component}/;
694             } else {
695 0         0 $new_country =~ s/\$$component//;
696             }
697             }
698 35         193 $rh_components->{country} = $new_country;
699             }
700 46 100       214 if (defined($self->{templates}{$old_cc}{add_component})) {
701 10         19 my $tmp = $self->{templates}{$old_cc}{add_component};
702 10         50 my ($k, $v) = split(/=/, $tmp);
703             # check whitelist of valid replacement components
704 10 100       39 if (defined($valid_replacement_components{$k})) {
705 9         29 $rh_components->{$k} = $v;
706             }
707             }
708             }
709              
710 485 100       1146 if ($cc eq 'NL') {
711 5 50       18 if (defined($rh_components->{state})) {
712 5 100       41 if ($rh_components->{state} eq 'Curaçao') {
    50          
    100          
713 1         3 $cc = 'CW';
714 1         3 $rh_components->{country} = 'Curaçao';
715             } elsif ($rh_components->{state} =~ m/^sint maarten/i) {
716 0         0 $cc = 'SX';
717 0         0 $rh_components->{country} = 'Sint Maarten';
718             } elsif ($rh_components->{state} =~ m/^Aruba/i) {
719 1         4 $cc = 'AW';
720 1         3 $rh_components->{country} = 'Aruba';
721             }
722             }
723             }
724 485         2559 return $cc;
725             }
726 0         0 return;
727             }
728              
729             # hacks for bad country data
730             sub _fix_country {
731 488     488   789 my $self = shift;
732 488   50     881 my $rh_c = shift || return;
733              
734             # is the country a number?
735             # if so, and there is a state, use state as country
736 488 100 100     2208 if (defined($rh_c->{country}) && defined($rh_c->{state})){
737 386 100       2238 if (looks_like_number($rh_c->{country})){
738 1         4 $rh_c->{country} = delete $rh_c->{state};
739             }
740             }
741 488         844 return;
742             }
743              
744             # sets and returns a state code
745             # note may also set other values in some odd edge cases
746             sub _add_state_code {
747 493     493   4880 my $self = shift;
748 493   50     1111 my $rh_components = shift // return;
749 493 100       1304 if (defined($rh_components->{state})){
750 391         1136 return $self->_add_code('state', $rh_components);
751             }
752 102         196 return;
753             }
754              
755             sub _add_county_code {
756 492     492   6547 my $self = shift;
757 492   50     1183 my $rh_components = shift // return;
758 492 100       1117 if (defined($rh_components->{county})){
759 216         504 return $self->_add_code('county', $rh_components);
760             }
761 276         463 return;
762             }
763              
764             sub _add_code {
765 607     607   842 my $self = shift;
766 607   50     1212 my $keyname = shift // return;
767 607         762 my $rh_components = shift;
768 607 100       1442 return if !$rh_components->{country_code}; # do we know country?
769 603 50       1472 return if !$rh_components->{$keyname}; # do we know state/county?
770              
771 603         1071 my $code = $keyname . '_code';
772              
773 603 100       1358 if (defined($rh_components->{$code})) {
774             # do we already have code?
775             # we could have situation where code and long name are same
776             # so we want to corrent
777 32 100       141 return if ($rh_components->{$code} ne $rh_components->{$keyname});
778             }
779              
780             # ensure country_code is uppercase as we use it as conf key
781 572         1064 my $cc = uc($rh_components->{country_code});
782              
783 572         879 my $reverse_key = $code . 's_reverse';
784 572         876 my $name_key = $code . 's_name';
785              
786 572 100       2476 if (my $rev = $self->{$reverse_key}{$cc}) {
787 304         651 my $name = $rh_components->{$keyname};
788 304         673 my $uc_name = uc($name);
789              
790 304 100       1208 if (my $found_code = $rev->{$uc_name}) {
791             # matched a name -> code
792 204         566 $rh_components->{$code} = $found_code;
793             # if input was a code (e.g. state => 'NC'), set keyname to full name
794 204 100       556 if ($uc_name eq $found_code) {
795 6         24 my $full_name = $self->{$name_key}{$cc}{$found_code};
796 6 50       31 $rh_components->{$keyname} = $full_name if defined $full_name;
797             }
798             }
799              
800             # US-specific edge cases
801 304 100 66     1390 if ($cc eq 'US' && $keyname eq 'state' && !defined($rh_components->{state_code})) {
      100        
802 2 50       8 if ($rh_components->{state} =~ m/^united states/i) {
803 0         0 my $state = $rh_components->{state};
804 0         0 $state =~ s/^United States/US/i;
805 0 0       0 if (my $fc = $rev->{uc($state)}) {
806 0         0 $rh_components->{state_code} = $fc;
807             }
808             }
809 2 50       19 if ($rh_components->{state} =~ m/^washington,? d\.?c\.?/i) {
810 2         6 $rh_components->{state_code} = 'DC';
811 2         4 $rh_components->{state} = 'District of Columbia';
812 2         10 $rh_components->{city} = 'Washington';
813             }
814             }
815             }
816 572         1387 return $rh_components->{$code};
817             }
818              
819             sub _apply_replacements {
820 163     163   1206 my $self = shift;
821 163         256 my $rh_components = shift;
822 163   50     446 my $ra_rules = shift // return; # bail out if no rules
823              
824 163 50       482 if ($debug){
825 0         0 say STDERR "in _apply_replacements";
826 0         0 say STDERR Dumper $ra_rules;
827             }
828              
829             # support legacy raw array-of-arrays format (used by tests)
830 163 100 66     1418 if (ref($ra_rules) eq 'ARRAY' && @$ra_rules && ref($ra_rules->[0]) eq 'ARRAY') {
      100        
831 2         8 foreach my $component (keys %$rh_components) {
832 2 50       7 next if ($component eq 'country_code');
833 2 50       7 next if ($component eq 'house_number');
834 2         5 foreach my $ra_fromto (@$ra_rules) {
835 3         24 my $regexp;
836 3 50       44 if ($ra_fromto->[0] =~ m/^$component=/) {
837 0         0 my $from = $ra_fromto->[0];
838 0         0 $from =~ s/^$component=//;
839 0 0       0 if ($rh_components->{$component} eq $from) {
840 0         0 $rh_components->{$component} = $ra_fromto->[1];
841             } else {
842 0         0 $regexp = $from;
843             }
844             } else {
845 3         7 $regexp = $ra_fromto->[0];
846             }
847 3 50       10 if (defined($regexp)) {
848             try {
849 3     3   184 my $re = qr/$regexp/i;
850 2         28 $rh_components->{$component} =~ s/$re/$ra_fromto->[1]/;
851             } catch {
852 1     1   35 warn "invalid replacement: " . join(', ', @$ra_fromto);
853 3         30 };
854             }
855             }
856             }
857 2         153 return $rh_components;
858             }
859              
860 161         537 foreach my $component (keys %$rh_components) {
861             # some components dont need replacements
862 1362 100       2495 next if ($component eq 'country_code');
863 1202 100       2014 next if ($component eq 'house_number');
864 1141         1568 foreach my $rule (@$ra_rules) {
865 7937 100       11869 if (defined($rule->{component})) {
866             # component-specific rule: only applies to matching component
867 1990 100       3832 next if ($rule->{component} ne $component);
868 197 100       427 if ($rh_components->{$component} eq $rule->{exact_match}) {
869 4         13 $rh_components->{$component} = $rule->{replacement};
870             } else {
871 193         1059 $rh_components->{$component} =~ s/$rule->{re}/$rule->{replacement}/;
872             }
873             } else {
874 5947         15535 $rh_components->{$component} =~ s/$rule->{re}/$rule->{replacement}/;
875             }
876             }
877             }
878 161         478 return $rh_components;
879             }
880              
881             sub _abbreviate {
882 12     12   1185 my $self = shift;
883 12   50     49 my $rh_comp = shift // return;
884              
885             # do we know the country?
886 12 100       36 if (!defined($rh_comp->{country_code})) {
887 2 100       6 if ($show_warnings){
888 1         4 my $error_msg = 'no country_code, unable to abbreviate';
889 1 50       4 if (defined($rh_comp->{country})) {
890 1         3 $error_msg .= ' - country: ' . $rh_comp->{country};
891             }
892 1         18 warn $error_msg
893             }
894 2         99 return;
895             }
896              
897             # do we have abbreviations for this country?
898 10         42 my $cc = uc($rh_comp->{country_code});
899              
900             # 1. which languages?
901 10 50       34 if (defined($self->{country2lang}{$cc})) {
902              
903 10         37 my @langs = split(/,/, $self->{country2lang}{$cc});
904              
905 10         24 foreach my $lang (@langs) {
906             # do we have compiled abbrv for this lang?
907 14 100       46 if (defined($self->{compiled_abbreviations}{$lang})) {
908 11         19 my $rh_compiled = $self->{compiled_abbreviations}{$lang};
909              
910 11         31 foreach my $comp_name (keys %$rh_compiled) {
911 16 100       43 next if (!defined($rh_comp->{$comp_name}));
912 15         23 foreach my $rule (@{$rh_compiled->{$comp_name}}) {
  15         104  
913 111         668 $rh_comp->{$comp_name} =~ s/$rule->{re}/$1$rule->{short}/;
914             }
915             }
916             } else {
917             #warn "no abbreviations defined for lang $lang";
918             }
919             }
920             }
921 10         28 return $rh_comp;
922             }
923              
924             # " abc,,def , ghi " => 'abc, def, ghi'
925             sub _clean {
926 982     982   4468 my $self = shift;
927 982   100     2475 my $out = shift // return;
928 981 50       2310 if ($debug){
929 0         0 say STDERR "entering _clean \n$out";
930             }
931              
932 981         1945 $out =~ s/\&#39\;/'/g;
933              
934 981         7920 $out =~ s/[\},\s]+$//;
935 981         2975 $out =~ s/^[,\s]+//;
936              
937 981         1783 $out =~ s/^- //; # line starting with dash due to a parameter missing
938              
939 981         1830 $out =~ s/,\s*,/, /g; # multiple commas to one
940 981         2374 $out =~ s/\h+,\h+/, /g; # one horiz whitespace behind comma
941 981         3303 $out =~ s/\h\h+/ /g; # multiple horiz whitespace to one
942 981         2680 $out =~ s/\h\n/\n/g; # horiz whitespace, newline to newline
943 981         1758 $out =~ s/\n,/\n/g; # newline comma to just newline
944 981         1534 $out =~ s/,,+/,/g; # multiple commas to one
945 981         1458 $out =~ s/,\n/\n/g; # comma newline to just newline
946 981         3395 $out =~ s/\n\h+/\n/g; # newline plus space to newline
947 981         2954 $out =~ s/\n\n+/\n/g; # multiple newline to one
948              
949             # final dedupe across and within lines
950 981         3279 my @before_pieces = split(/\n/, $out);
951 981         1792 my %seen_lines;
952             my @after_pieces;
953 981         1781 foreach my $line (@before_pieces) {
954 3720         6286 $line =~ s/^\h+//g;
955 3720         7361 $line =~ s/\h+$//g;
956 3720         8481 $seen_lines{$line}++;
957 3720 100       7932 next if ($seen_lines{$line} > 1);
958             # now dedupe within the line
959 3701         6877 my @before_words = split(/,/, $line);
960 3701         4844 my %seen_words;
961             my @after_words;
962 3701         5514 foreach my $w (@before_words) {
963 3924         6866 $w =~ s/^\h+//g;
964 3924         6792 $w =~ s/\h+$//g;
965 3924 100       8697 if (lc($w) ne 'new york') {
966 3914         6547 $seen_words{$w}++;
967             }
968 3924 100 100     13245 next if ((defined($seen_words{$w})) && ($seen_words{$w} > 1));
969 3922         7524 push(@after_words, $w);
970             }
971 3701         6299 $line = join(', ', @after_words);
972 3701         8950 push(@after_pieces, $line);
973             }
974 981         2226 $out = join("\n", @after_pieces);
975              
976 981         1911 $out =~ s/^\s+//; # remove leading whitespace
977 981         3520 $out =~ s/\s+$//; # remove end whitespace
978              
979 981         1589 $out .= "\n"; # add final newline
980 981         4594 return $out; # we are done
981             }
982              
983             sub _render_template {
984 489     489   2294 my $self = shift;
985 489         614 my $thtemplate = shift;
986 489         684 my $components = shift;
987              
988             # Mustache calls it context
989 489         689 my $context = $components;
990 489 50       1003 say STDERR 'context: ' . Dumper $context if ($debug);
991 489         2190 my $output = $thtemplate->render($context);
992              
993 489         596003 $output = $self->_evaluate_template_lamdas($output);
994              
995 489 50       1417 say STDERR "in _render before _clean: $output" if ($debug);
996 489         1772 $output = $self->_clean($output);
997              
998             # is it empty?
999 489 50       1435 if (length($output) == 0){
1000             # if yes and there is only one component then just use that one
1001 0         0 my @comps = keys %$components;
1002 0 0       0 if (scalar(@comps) == 1) {
1003 0         0 foreach my $k (@comps) {
1004 0         0 $output = $components->{$k};
1005             }
1006             } # FIXME what if more than one?
1007             }
1008 489         1253 return $output;
1009             }
1010              
1011             # Text::Hogan apparently caches lambdas when rendering templates. In the past
1012             # we needed our lambda 'first', example
1013             # {{#first}} {{{city}}} || {{{town}}} {{/first}}
1014             # to evaluate the components. Whenever the lambda was called with different
1015             # component values it consumed memory. Now replace with a simpler implementation
1016             #
1017             sub _replace_template_lambdas {
1018 489     489   1413 my $self = shift;
1019 489         874 my $template_text = shift;
1020 489         10705 $template_text =~ s!\Q{{#first}}\E(.+?)\Q{{/first}}\E!FIRSTSTART${1}FIRSTEND!g;
1021 489         1856 return $template_text;
1022             }
1023              
1024             # We only use a lambda named 'first'
1025             sub _evaluate_template_lamdas {
1026 489     489   1016 my $self = shift;
1027 489         4432 my $text = shift;
1028 489         6903 $text =~ s!FIRSTSTART\s*(.+?)\s*FIRSTEND!_select_first($1)!seg;
  920         1956  
1029 489         1527 return $text;
1030             }
1031              
1032             # '|| val1 || || val3' => 'val1'
1033             sub _select_first {
1034 920     920   1856 my $text = shift;
1035 920         7326 my @a_parts = grep { length($_) } split(/\s*\|\|\s*/, $text);
  2809         5688  
1036 920 100       8169 return scalar(@a_parts) ? $a_parts[0] : '';
1037             }
1038              
1039             my %small_district = (
1040             'BR' => 1,
1041             'CR' => 1,
1042             'ES' => 1,
1043             'NI' => 1,
1044             'PY' => 1,
1045             'RO' => 1,
1046             'TG' => 1,
1047             'TM' => 1,
1048             'XK' => 1,
1049             );
1050              
1051             # correct the alias for "district"
1052             # in OSM some countries use district to mean "city_district"
1053             # others to mean "state_district"
1054             sub _set_district_alias {
1055 484     484   766 my $self = shift;
1056 484   50     1323 my $cc = shift // return;
1057              
1058 484         914 my $ucc = uc($cc);
1059              
1060             # this may get called repeatedly
1061             # no need to do the work again
1062 484 100       2028 return if (defined($self->{set_district_alias}{$ucc}));
1063              
1064             # note that we are here so don't do this work again
1065 234         729 $self->{set_district_alias}{$ucc} = 1;
1066              
1067 234         461 my $oldalias;
1068 234 100       796 if (defined($small_district{$ucc})){
1069 11         48 $self->{component2type}{district} = 'neighbourhood';
1070 11         32 $oldalias = 'state_district';
1071              
1072             # add to the neighbourhood alias list
1073             # though of course we are just sticking it at the end
1074 11         24 push(@{$self->{component_aliases}{'neighbourhood'}}, 'district');
  11         63  
1075              
1076             } else {
1077             # set 'district' to be type 'state_district'
1078 223         669 $self->{component2type}{district} = 'state_district';
1079 223         397 $oldalias = 'neighbourhood';
1080              
1081             # add to the state_district alias list
1082 223         344 push(@{$self->{component_aliases}{'state_district'}}, 'district');
  223         947  
1083             }
1084              
1085             # remove from the old alias list
1086 3075         6145 my @temp = grep { $_ ne 'district' }
1087 234         437 @{$self->{component_aliases}{$oldalias}};
  234         854  
1088            
1089 234         1031 $self->{component_aliases}{$oldalias} = \@temp;
1090 234         570 return;
1091             }
1092              
1093              
1094             # returns []
1095             sub _find_unknown_components {
1096 486     486   1410 my $self = shift;
1097 486         756 my $rh_comp = shift;
1098              
1099 486         1864 my @a_unknown = grep { !exists($self->{h_known}->{$_}) } keys %$rh_comp;
  4337         9241  
1100 486         1382 return \@a_unknown;
1101             }
1102              
1103              
1104             1;
1105              
1106             __END__
1107              
1108             =pod
1109              
1110             =encoding UTF-8
1111              
1112             =head1 NAME
1113              
1114             Geo::Address::Formatter - take structured address data and format it according to the various global/country rules
1115              
1116             =head1 VERSION
1117              
1118             version 1.9992
1119              
1120             =head1 SYNOPSIS
1121              
1122             #
1123             # get the templates (or use your own)
1124             # git clone git@github.com:OpenCageData/address-formatting.git
1125             #
1126             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
1127              
1128             my $components = { ... }
1129             my $text = $GAF->format_address($components, { country => 'FR' } );
1130             my $rh_final_components = $GAF->final_components();
1131             #
1132             # or if we want shorter output
1133             #
1134             my $short_text = $GAF->format_address($components, { country => 'FR', abbreviate => 1, });
1135              
1136             =head2 new
1137              
1138             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
1139              
1140             Returns new object. The I<conf_path> is required.
1141              
1142             Optional parameters are:
1143              
1144             =over
1145              
1146             =item * I<debug>
1147              
1148             Prints tons of debugging info for use in development.
1149              
1150             =item * I<no_warnings>
1151              
1152             Turns off a few warnings if configuration is not optimal.
1153              
1154             =item * I<only_address>
1155              
1156             Formatted output will only contain known address components (will not
1157             include POI names). Can be overridden per-call via the I<only_address>
1158             option to L</format_address>.
1159              
1160             =back
1161              
1162             =head2 instance
1163              
1164             my $GAF = Geo::Address::Formatter->instance( conf_path => '/path/to/templates' );
1165              
1166             Returns new instance (potentially re-used) The I<conf_path> is required.
1167              
1168             Optional parameters are as in the B<new> method.
1169              
1170             =head2 final_components
1171              
1172             my $rh_components = $GAF->final_components();
1173              
1174             Returns a reference to a hash of the final components that were set during
1175             the most recent call to B<format_address>. Returns C<undef> if called
1176             before B<format_address> has been called. Warns in that case unless
1177             I<no_warnings> was set at object creation.
1178              
1179             =head2 format_address
1180              
1181             my $text = $GAF->format_address(\%components, \%options );
1182              
1183             Given structured address components (hashref) and options (hashref) returns a
1184             formatted address as a multiline string with a trailing newline.
1185              
1186             Possible options are:
1187              
1188             =over
1189              
1190             =item * I<abbreviate>
1191              
1192             If true, common abbreviations are applied to the resulting output.
1193              
1194             =item * I<address_template>
1195              
1196             A Mustache-format template to be used instead of the template defined in
1197             the configuration.
1198              
1199             =item * I<country>
1200              
1201             An uppercase ISO 3166-1 alpha-2 code, e.g. C<'GB'> for Great Britain,
1202             C<'DE'> for Germany, etc. If omitted the country is determined from
1203             the address components.
1204              
1205             =item * I<only_address>
1206              
1207             Same as the I<only_address> constructor option but applied per-call.
1208              
1209             =back
1210              
1211             =head1 DESCRIPTION
1212              
1213             You have a structured postal address (hash) and need to convert it into a
1214             readable address based on the format of the address country.
1215              
1216             For example, you have:
1217              
1218             {
1219             house_number => 12,
1220             street => 'Avenue Road',
1221             postcode => 45678,
1222             city => 'Deville'
1223             }
1224              
1225             you need:
1226              
1227             Great Britain: 12 Avenue Road, Deville 45678
1228             France: 12 Avenue Road, 45678 Deville
1229             Germany: Avenue Road 12, 45678 Deville
1230             Latvia: Avenue Road 12, Deville, 45678
1231              
1232             It gets more complicated with 200+ countries and territories and dozens more
1233             address components to consider.
1234              
1235             This module comes with a minimal configuration to run tests. Instead of
1236             developing your own configuration please use (and contribute to)
1237             those in https://github.com/OpenCageData/address-formatting
1238             which includes test cases.
1239              
1240             Together we can address the world!
1241              
1242             =head1 SEE ALSO
1243              
1244             L<https://github.com/OpenCageData/address-formatting> - the address
1245             formatting template configuration used by this module.
1246              
1247             L<https://opencagedata.com> - OpenCage geocoder.
1248              
1249             =head1 AUTHOR
1250              
1251             Ed Freyfogle
1252              
1253             =head1 COPYRIGHT AND LICENSE
1254              
1255             This software is copyright (c) 2026 by Opencage GmbH.
1256              
1257             This is free software; you can redistribute it and/or modify it under
1258             the same terms as the Perl 5 programming language system itself.
1259              
1260             =cut