line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Business::CompanyDesignator; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# Require perl 5.010 because the 'track' functionality of Regexp::Assemble |
4
|
|
|
|
|
|
|
# is unsafe for earlier versions. |
5
|
9
|
|
|
9
|
|
657930
|
use 5.010001; |
|
9
|
|
|
|
|
111
|
|
6
|
9
|
|
|
9
|
|
4651
|
use Moose; |
|
9
|
|
|
|
|
3675611
|
|
|
9
|
|
|
|
|
52
|
|
7
|
9
|
|
|
9
|
|
58348
|
use utf8; |
|
9
|
|
|
|
|
55
|
|
|
9
|
|
|
|
|
57
|
|
8
|
9
|
|
|
9
|
|
284
|
use warnings qw(FATAL utf8); |
|
9
|
|
|
|
|
19
|
|
|
9
|
|
|
|
|
377
|
|
9
|
9
|
|
|
9
|
|
2531
|
use FindBin qw($Bin); |
|
9
|
|
|
|
|
4935
|
|
|
9
|
|
|
|
|
1064
|
|
10
|
9
|
|
|
9
|
|
2054
|
use YAML; |
|
9
|
|
|
|
|
31124
|
|
|
9
|
|
|
|
|
478
|
|
11
|
9
|
|
|
9
|
|
5112
|
use File::ShareDir qw(dist_file); |
|
9
|
|
|
|
|
167941
|
|
|
9
|
|
|
|
|
570
|
|
12
|
9
|
|
|
9
|
|
67
|
use List::MoreUtils qw(uniq); |
|
9
|
|
|
|
|
23
|
|
|
9
|
|
|
|
|
37
|
|
13
|
9
|
|
|
9
|
|
11508
|
use Regexp::Assemble; |
|
9
|
|
|
|
|
157207
|
|
|
9
|
|
|
|
|
316
|
|
14
|
9
|
|
|
9
|
|
5066
|
use Unicode::Normalize; |
|
9
|
|
|
|
|
17381
|
|
|
9
|
|
|
|
|
632
|
|
15
|
9
|
|
|
9
|
|
69
|
use Carp; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
513
|
|
16
|
|
|
|
|
|
|
|
17
|
9
|
|
|
9
|
|
4207
|
use Business::CompanyDesignator::Record; |
|
9
|
|
|
|
|
37
|
|
|
9
|
|
|
|
|
449
|
|
18
|
9
|
|
|
9
|
|
4899
|
use Business::CompanyDesignator::SplitResult; |
|
9
|
|
|
|
|
28
|
|
|
9
|
|
|
|
|
9867
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $VERSION = '0.16'; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Hardcode the set of languages that we treat as 'continuous' i.e. their |
23
|
|
|
|
|
|
|
# designators don't require a word break before/after. |
24
|
|
|
|
|
|
|
our %LANG_CONTINUA = map { $_ => 1 } qw( |
25
|
|
|
|
|
|
|
zh |
26
|
|
|
|
|
|
|
ja |
27
|
|
|
|
|
|
|
ko |
28
|
|
|
|
|
|
|
); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
has 'datafile' => ( is => 'ro', default => sub { |
31
|
|
|
|
|
|
|
# Development/test version |
32
|
|
|
|
|
|
|
my $local_datafile = "$Bin/../share/company_designator_dev.yml"; |
33
|
|
|
|
|
|
|
return $local_datafile if -f $local_datafile; |
34
|
|
|
|
|
|
|
$local_datafile = "$Bin/../share/company_designator.yml"; |
35
|
|
|
|
|
|
|
return $local_datafile if -f $local_datafile; |
36
|
|
|
|
|
|
|
# Installed version |
37
|
|
|
|
|
|
|
return dist_file('Business-CompanyDesignator', 'company_designator.yml'); |
38
|
|
|
|
|
|
|
}); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# data is the raw dataset as loaded from datafile, keyed by long designator |
41
|
|
|
|
|
|
|
has data => ( is => 'ro', lazy_build => 1 ); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# regex_cache is a cache of regexes by language and type, since they're expensive to build |
44
|
|
|
|
|
|
|
has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref of |
47
|
|
|
|
|
|
|
# long designators (since abbreviations are not necessarily unique) |
48
|
|
|
|
|
|
|
has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 ); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# pattern_string_map is a hash mapping patterns back to their source string, |
51
|
|
|
|
|
|
|
# since we do things like add additional patterns without diacritics |
52
|
|
|
|
|
|
|
has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
53
|
|
|
|
|
|
|
# pattern_string_map_lang is a hash of hashes, mapping language codes to hashes |
54
|
|
|
|
|
|
|
# of patterns back to their source string |
55
|
|
|
|
|
|
|
has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub _build_data { |
58
|
8
|
|
|
8
|
|
57
|
my $self = shift; |
59
|
8
|
|
|
|
|
256
|
YAML::LoadFile($self->datafile); |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub _build_abbr_long_map { |
63
|
5
|
|
|
5
|
|
31
|
my $self = shift; |
64
|
5
|
|
|
|
|
14
|
my $map = {}; |
65
|
5
|
|
|
|
|
15
|
while (my ($long, $entry) = each %{ $self->data }) { |
|
895
|
|
|
|
|
22117
|
|
66
|
890
|
100
|
|
|
|
1718
|
if (my $abbr = $entry->{abbr_std}) { |
67
|
25
|
|
100
|
|
|
114
|
$map->{$abbr} ||= []; |
68
|
25
|
|
|
|
|
42
|
push @{ $map->{$abbr} }, $long; |
|
25
|
|
|
|
|
70
|
|
69
|
|
|
|
|
|
|
} |
70
|
890
|
100
|
|
|
|
1701
|
my $abbr_list = $entry->{abbr} or next; |
71
|
860
|
100
|
|
|
|
1592
|
$abbr_list = [ $abbr_list ] if ! ref $abbr_list; |
72
|
860
|
|
|
|
|
1296
|
for my $abbr (@$abbr_list) { |
73
|
1365
|
|
100
|
|
|
5791
|
$map->{$abbr} ||= []; |
74
|
1365
|
|
|
|
|
1687
|
push @{ $map->{$abbr} }, $long; |
|
1365
|
|
|
|
|
3495
|
|
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
} |
77
|
5
|
|
|
|
|
149
|
return $map; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub long_designators { |
81
|
4
|
|
|
4
|
1
|
1178
|
my $self = shift; |
82
|
4
|
|
|
|
|
9
|
sort keys %{ $self->data }; |
|
4
|
|
|
|
|
132
|
|
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub abbreviations { |
86
|
3
|
|
|
3
|
1
|
1055
|
my $self = shift; |
87
|
3
|
|
|
|
|
7
|
sort keys %{ $self->abbr_long_map }; |
|
3
|
|
|
|
|
107
|
|
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub designators { |
91
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
92
|
1
|
|
|
|
|
5
|
sort $self->long_designators, $self->abbreviations; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# Return the B::CD::Record for $long designator |
96
|
|
|
|
|
|
|
sub record { |
97
|
1590
|
|
|
1590
|
1
|
46348
|
my ($self, $long) = @_; |
98
|
1590
|
100
|
|
|
|
40880
|
my $entry = $self->data->{$long} |
99
|
|
|
|
|
|
|
or croak "Invalid long designator '$long'"; |
100
|
1589
|
|
|
|
|
46147
|
return Business::CompanyDesignator::Record->new( long => $long, record => $entry ); |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Return a list of B::CD::Records for $designator |
104
|
|
|
|
|
|
|
sub records { |
105
|
1243
|
|
|
1243
|
1
|
486211
|
my ($self, $designator) = @_; |
106
|
1243
|
50
|
|
|
|
2878
|
croak "Missing designator" if ! $designator; |
107
|
1243
|
100
|
|
|
|
38104
|
if (exists $self->data->{$designator}) { |
|
|
100
|
|
|
|
|
|
108
|
366
|
|
|
|
|
934
|
return ( $self->record($designator) ); |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
elsif (my $long_set = $self->abbr_long_map->{$designator}) { |
111
|
876
|
|
|
|
|
2168
|
return map { $self->record($_) } @$long_set |
|
1045
|
|
|
|
|
2433
|
|
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
else { |
114
|
1
|
|
|
|
|
21
|
croak "Invalid designator '$designator'"; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# Add $string to regex assembler |
119
|
|
|
|
|
|
|
sub _add_to_assembler { |
120
|
1896
|
|
|
1896
|
|
3362
|
my ($self, $assembler, $lang, $string, $reference_string) = @_; |
121
|
1896
|
|
66
|
|
|
4092
|
$reference_string ||= $string; |
122
|
|
|
|
|
|
|
# printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang || []}), $string, $reference_string; |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs: |
125
|
|
|
|
|
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=50228 |
126
|
|
|
|
|
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=74449 |
127
|
|
|
|
|
|
|
# $assembler->add($string) |
128
|
|
|
|
|
|
|
# Workaround by lexing and using insert() |
129
|
1896
|
|
|
|
|
2361
|
my $optional1 = '\\.?,?\\s*'; |
130
|
|
|
|
|
|
|
my @pattern = map { |
131
|
|
|
|
|
|
|
# Periods are treated as optional literals, with optional trailing commas and/or whitespace |
132
|
1896
|
100
|
|
|
|
6023
|
/\./ ? $optional1 : |
|
26451
|
100
|
|
|
|
62650
|
|
|
|
100
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Embedded spaces can be multiple, and include leading commas |
134
|
|
|
|
|
|
|
/ / ? ',?\s+' : |
135
|
|
|
|
|
|
|
# Escape other regex metacharacters |
136
|
|
|
|
|
|
|
/[()]/ ? "\\$_" : $_ |
137
|
|
|
|
|
|
|
} split //, $string; |
138
|
1896
|
|
|
|
|
6768
|
$assembler->insert(@pattern); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang |
141
|
1896
|
|
|
|
|
205850
|
my $pattern_string = join '', @pattern; |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Special case - optional match characters can cause clashes between |
144
|
|
|
|
|
|
|
# distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/ |
145
|
|
|
|
|
|
|
# We need to handle such cases as ambiguous with extra checks |
146
|
1896
|
|
|
|
|
3108
|
my $optional1e = "\Q$optional1\E"; |
147
|
1896
|
|
|
|
|
2345
|
my $alt_pattern_string1; |
148
|
1896
|
100
|
|
|
|
9834
|
if ($pattern_string =~ /^(\w)(\w)$/) { |
|
|
100
|
|
|
|
|
|
149
|
84
|
|
|
|
|
283
|
$alt_pattern_string1 = "$1$optional1$2$optional1"; |
150
|
|
|
|
|
|
|
} elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) { |
151
|
107
|
|
|
|
|
314
|
$alt_pattern_string1 = "$1$2"; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# If $pattern_string already exists in pattern_string_map then the pattern is ambiguous |
155
|
|
|
|
|
|
|
# across entries, and we can't unambiguously map back to a standard designator |
156
|
1896
|
100
|
66
|
|
|
56001
|
if (exists $self->pattern_string_map->{ $pattern_string }) { |
|
|
100
|
|
|
|
|
|
157
|
429
|
|
|
|
|
11785
|
my $current = $self->pattern_string_map->{ $pattern_string }; |
158
|
429
|
100
|
100
|
|
|
1468
|
if ($current && $current ne $reference_string) { |
159
|
|
|
|
|
|
|
# Reset to undef to mark ambiguity |
160
|
3
|
|
|
|
|
84
|
$self->pattern_string_map->{ $pattern_string } = undef; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
# Also check for the existence of $alt_pattern_string1, since this is also an ambiguity |
164
|
|
|
|
|
|
|
elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) { |
165
|
7
|
|
|
|
|
178
|
my $current = $self->pattern_string_map->{ $alt_pattern_string1 }; |
166
|
7
|
50
|
33
|
|
|
55
|
if ($current && $current ne $reference_string) { |
167
|
|
|
|
|
|
|
# Reset both pairs to undef to mark ambiguity |
168
|
7
|
|
|
|
|
203
|
$self->pattern_string_map->{ $pattern_string } = undef; |
169
|
7
|
|
|
|
|
198
|
$self->pattern_string_map->{ $alt_pattern_string1 } = undef; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
else { |
173
|
1460
|
|
|
|
|
36752
|
$self->pattern_string_map->{ $pattern_string } = $reference_string; |
174
|
|
|
|
|
|
|
} |
175
|
1896
|
100
|
|
|
|
3517
|
if ($lang) { |
176
|
638
|
|
|
|
|
1029
|
for my $l (@$lang) { |
177
|
766
|
100
|
|
|
|
20150
|
if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) { |
178
|
202
|
|
|
|
|
5331
|
my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string }; |
179
|
202
|
50
|
33
|
|
|
705
|
if ($current && $current ne $reference_string) { |
180
|
|
|
|
|
|
|
# Reset to undef to mark ambiguity |
181
|
0
|
|
|
|
|
0
|
$self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
else { |
185
|
564
|
|
|
|
|
14773
|
$self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# If $string contains unicode diacritics, also add a version without them for misspellings |
191
|
9
|
100
|
|
9
|
|
73
|
if ($string =~ m/\pM/) { |
|
9
|
|
|
|
|
19
|
|
|
9
|
|
|
|
|
135
|
|
|
1896
|
|
|
|
|
8060
|
|
192
|
217
|
|
|
|
|
381
|
my $stripped = $string; |
193
|
217
|
|
|
|
|
1170
|
$stripped =~ s/\pM//g; |
194
|
217
|
|
|
|
|
724
|
$self->_add_to_assembler($assembler, $lang, $stripped, $reference_string); |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Assemble designator regexes |
199
|
|
|
|
|
|
|
sub _build_regex { |
200
|
42
|
|
|
42
|
|
80
|
my $self = shift; |
201
|
42
|
|
|
|
|
98
|
my ($type, $lang) = @_; |
202
|
|
|
|
|
|
|
|
203
|
42
|
|
|
|
|
99
|
state $types = { map { $_ => 1 } qw(end end_cont begin) }; |
|
12
|
|
|
|
|
56
|
|
204
|
42
|
50
|
|
|
|
147
|
if (! $types->{$type}) { |
205
|
0
|
|
|
|
|
0
|
croak "invalid regex type '$type'"; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# RA constructor - case insensitive, with match tracking |
209
|
42
|
|
|
|
|
258
|
my $assembler = Regexp::Assemble->new->flags('i')->track(1); |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# Construct language regex if $lang is set |
212
|
42
|
|
|
|
|
3153
|
my $lang_re; |
213
|
42
|
100
|
|
|
|
115
|
if ($lang) { |
214
|
36
|
100
|
|
|
|
150
|
$lang = [ $lang ] if ! ref $lang; |
215
|
36
|
|
|
|
|
154
|
my $lang_str = join '|', sort @$lang; |
216
|
36
|
|
|
|
|
446
|
$lang_re = qr/^($lang_str)$/; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
42
|
|
|
|
|
88
|
my $count = 0; |
220
|
42
|
|
|
|
|
70
|
while (my ($long, $entry) = each %{ $self->data }) { |
|
7518
|
|
|
|
|
176560
|
|
221
|
|
|
|
|
|
|
# If $lang is set, restrict to entries that include $lang |
222
|
7476
|
100
|
100
|
|
|
29452
|
next if $lang_re && $entry->{lang} !~ $lang_re; |
223
|
|
|
|
|
|
|
# If $type is 'begin', restrict to 'lead' entries |
224
|
1448
|
100
|
100
|
|
|
3479
|
next if $type eq 'begin' && ! $entry->{lead}; |
225
|
|
|
|
|
|
|
# if $type is 'end_cont', restrict to languages in %LANG_CONTINUA |
226
|
964
|
100
|
100
|
|
|
2537
|
next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}}; |
227
|
|
|
|
|
|
|
|
228
|
648
|
|
|
|
|
810
|
$count++; |
229
|
648
|
|
|
|
|
3393
|
my $long_nfd = NFD($long); |
230
|
648
|
|
|
|
|
1690
|
$self->_add_to_assembler($assembler, $lang, $long_nfd); |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# Add all abbreviations |
233
|
648
|
100
|
|
|
|
2215
|
if (my $abbr_list = $entry->{abbr}) { |
234
|
616
|
100
|
|
|
|
1444
|
$abbr_list = [ $abbr_list ] if ! ref $abbr_list; |
235
|
616
|
|
|
|
|
1104
|
for my $abbr (@$abbr_list) { |
236
|
1031
|
|
|
|
|
3723
|
my $abbr_nfd = NFD($abbr); |
237
|
1031
|
|
66
|
|
|
4336
|
my $abbr_std = NFD($entry->{abbr_std} || $abbr); |
238
|
1031
|
|
|
|
|
2207
|
$self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std); |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# If no entries found (a strange/bogus language?), return undef |
244
|
42
|
100
|
|
|
|
266
|
return if $count == 0; |
245
|
|
|
|
|
|
|
|
246
|
30
|
50
|
|
|
|
190
|
return wantarray ? ( $assembler->re, $assembler ) : $assembler->re; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.) |
250
|
|
|
|
|
|
|
# $type defaults to 'end', $lang defaults to undef (for all) |
251
|
|
|
|
|
|
|
sub regex { |
252
|
2753
|
|
|
2753
|
1
|
3745
|
my $self = shift; |
253
|
2753
|
|
|
|
|
4785
|
my ($type, $lang) = @_; |
254
|
2753
|
|
50
|
|
|
5042
|
$type ||= 'end'; |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# $lang might be an arrayref containing multiple language codes |
257
|
2753
|
|
|
|
|
3313
|
my $lang_key; |
258
|
2753
|
100
|
|
|
|
4821
|
if ($lang) { |
259
|
968
|
|
|
|
|
1288
|
$lang_key = $lang; |
260
|
968
|
50
|
66
|
|
|
2011
|
if (ref $lang && ref $lang eq 'ARRAY' && @$lang) { |
|
|
|
66
|
|
|
|
|
261
|
8
|
50
|
|
|
|
18
|
if (@$lang == 1) { |
262
|
0
|
|
|
|
|
0
|
$lang_key = $lang->[0]; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
else { |
265
|
8
|
|
|
|
|
21
|
$lang_key = join '_', sort map { lc $_ } @$lang; |
|
16
|
|
|
|
|
64
|
|
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
2753
|
|
|
|
|
3625
|
my $cache_key = $type; |
271
|
2753
|
100
|
|
|
|
4934
|
$cache_key .= "_$lang_key" if $lang_key; |
272
|
|
|
|
|
|
|
|
273
|
2753
|
100
|
|
|
|
73585
|
if (my $entry = $self->regex_cache->{ $cache_key }) { |
274
|
2711
|
50
|
|
|
|
11172
|
return wantarray ? @$entry : $entry->[0]; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
42
|
|
|
|
|
154
|
my ($re, $assembler) = $self->_build_regex($type, $lang); |
278
|
42
|
|
|
|
|
218295
|
$self->regex_cache->{ $cache_key } = [ $re, $assembler ]; |
279
|
42
|
50
|
|
|
|
261
|
return wantarray ? ( $re, $assembler ) : $re; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# Helper to return split_designator results |
283
|
|
|
|
|
|
|
sub _split_designator_result { |
284
|
1079
|
|
|
1079
|
|
9895
|
my $self = shift; |
285
|
1079
|
|
|
|
|
4167
|
my ($lang, $before, $des, $after, $matched_pattern) = @_; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# $before can end in whitespace (that we don't want to consume in the RE |
288
|
|
|
|
|
|
|
# for technical reasons around handling punctuation like '& Co' in designators) |
289
|
|
|
|
|
|
|
# So trim here to handle that case. |
290
|
1079
|
100
|
|
|
|
4967
|
$before =~ s/\s+$// if $before; |
291
|
|
|
|
|
|
|
|
292
|
1079
|
|
|
|
|
1803
|
my $des_std; |
293
|
1079
|
100
|
|
|
|
2132
|
if ($matched_pattern) { |
294
|
819
|
100
|
|
|
|
8696
|
$des_std = $self->pattern_string_map_lang->{$lang}->{$matched_pattern} if $lang; |
295
|
819
|
|
100
|
|
|
20126
|
$des_std ||= $self->pattern_string_map->{$matched_pattern}; |
296
|
819
|
100
|
|
|
|
1738
|
if ($des_std) { |
297
|
|
|
|
|
|
|
# Always coalesce spaces and delete commas from $des_std |
298
|
799
|
|
|
|
|
1629
|
$des_std =~ s/,+/ /g; |
299
|
799
|
|
|
|
|
2071
|
$des_std =~ s/\s\s+/ /g; |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# Legacy interface - return a simple before / des / after tuple, plus $des_std |
304
|
1079
|
100
|
66
|
|
|
2698
|
return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std) |
|
1452
|
100
|
|
|
|
11041
|
|
305
|
|
|
|
|
|
|
if wantarray; |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# New scalar-context interface - return SplitResult object |
308
|
716
|
100
|
100
|
|
|
18662
|
Business::CompanyDesignator::SplitResult->new( |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
309
|
|
|
|
|
|
|
before => NFC($before // ''), |
310
|
|
|
|
|
|
|
designator => NFC($des // ''), |
311
|
|
|
|
|
|
|
designator_std => NFC($des_std // ''), |
312
|
|
|
|
|
|
|
after => NFC($after // ''), |
313
|
|
|
|
|
|
|
records => [ $des_std ? $self->records(NFC $des_std) : () ], |
314
|
|
|
|
|
|
|
); |
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# Split $company_name on (the first) company designator, returning a triplet of strings: |
318
|
|
|
|
|
|
|
# ($before, $designator, $after), plus the normalised form of the designator. If no |
319
|
|
|
|
|
|
|
# designator is found, just returns ($company_name). |
320
|
|
|
|
|
|
|
# e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for |
321
|
|
|
|
|
|
|
# the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator, |
322
|
|
|
|
|
|
|
# but "Ltée" for the normalised form |
323
|
|
|
|
|
|
|
sub split_designator { |
324
|
1079
|
|
|
1079
|
1
|
238445
|
my $self = shift; |
325
|
1079
|
|
|
|
|
3203
|
my ($company_name, %arg) = @_; |
326
|
1079
|
|
|
|
|
2191
|
my $lang = $arg{lang}; |
327
|
1079
|
|
|
|
|
1886
|
my $allow_embedded = $arg{allow_embedded}; |
328
|
1079
|
|
100
|
|
|
4394
|
$allow_embedded //= 1; # backwards-compatibility, unfortunately |
329
|
1079
|
|
|
|
|
6859
|
my $company_name_match = NFD($company_name); |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# Handle older perls without XPosixPunct |
332
|
1079
|
50
|
|
|
|
2012
|
state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ? |
|
4
|
|
|
|
|
36
|
|
333
|
|
|
|
|
|
|
'[\s\p{XPosixPunct}]' : |
334
|
|
|
|
|
|
|
'[\s[:punct:]]'; |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# Strip all brackets for continuous language matching |
337
|
1079
|
|
|
|
|
4429
|
(my $company_name_match_cont_stripped = $company_name_match) =~ s/[()\x{ff08}\x{ff09}]//g; |
338
|
|
|
|
|
|
|
|
339
|
1079
|
|
|
|
|
2008
|
my ($end_re, $end_asr, $end_cont_re, $end_cont_asr, $begin_re, $begin_asr); |
340
|
1079
|
100
|
|
|
|
2303
|
if ($lang) { |
341
|
484
|
100
|
|
|
|
1120
|
if ($LANG_CONTINUA{$lang}) { |
342
|
36
|
|
|
|
|
81
|
($end_cont_re, $end_cont_asr) = $self->regex('end_cont', $lang); |
343
|
|
|
|
|
|
|
} else { |
344
|
448
|
|
|
|
|
1251
|
($end_re, $end_asr) = $self->regex('end', $lang); |
345
|
|
|
|
|
|
|
} |
346
|
484
|
|
|
|
|
1151
|
($begin_re, $begin_asr) = $self->regex('begin', $lang); |
347
|
|
|
|
|
|
|
} else { |
348
|
595
|
|
|
|
|
1916
|
($end_re, $end_asr) = $self->regex('end'); |
349
|
595
|
|
|
|
|
1524
|
($end_cont_re, $end_cont_asr) = $self->regex('end_cont'); |
350
|
595
|
|
|
|
|
1381
|
($begin_re, $begin_asr) = $self->regex('begin'); |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# Designators are usually final, so try $end_re first |
354
|
1079
|
100
|
100
|
|
|
287582
|
if ($end_re && |
355
|
|
|
|
|
|
|
$company_name_match =~ m/^\s*(.*?)${punct_class}\s*\(?($end_re)\)?\s*$/) { |
356
|
572
|
|
|
|
|
1018642
|
return $self->_split_designator_result($lang, $1, $2, undef, $end_asr->source($^R)); |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# No final designator - retry without a word break for the subset of languages |
360
|
|
|
|
|
|
|
# that use continuous scripts (see %LANG_CONTINUA above) |
361
|
507
|
100
|
100
|
|
|
506032
|
if ($end_cont_re && |
362
|
|
|
|
|
|
|
$company_name_match_cont_stripped =~ m/^\s*(.*?)\(?($end_cont_re)\)?\s*$/) { |
363
|
114
|
|
|
|
|
1501
|
return $self->_split_designator_result($lang, $1, $2, undef, $end_cont_asr->source($^R)); |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
# No final designator - check for a lead designator instead (e.g. RU, NL, etc.) |
367
|
393
|
100
|
100
|
|
|
31529
|
if ($begin_re && |
368
|
|
|
|
|
|
|
$company_name_match =~ m/^\s*\(?($begin_re)\)?${punct_class}\s*(.*?)\s*$/) { |
369
|
91
|
|
|
|
|
29011
|
return $self->_split_designator_result($lang, undef, $1, $2, $begin_asr->source($^R)); |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
# No final or initial - check for an embedded designator with trailing content |
373
|
302
|
100
|
100
|
|
|
33225
|
if ($end_re && $allow_embedded && |
|
|
|
100
|
|
|
|
|
374
|
|
|
|
|
|
|
$company_name_match =~ m/(.*?)${punct_class}\s*\(?($end_re)\)?(?:\s+(.*?))?$/) { |
375
|
42
|
|
|
|
|
70358
|
return $self->_split_designator_result($lang, $1, $2, $3, $end_asr->source($^R)); |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
# No match - return $company_name unchanged |
379
|
260
|
|
|
|
|
15879
|
return $self->_split_designator_result($lang, $company_name); |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
1; |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
__END__ |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=encoding utf-8 |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=head1 NAME |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Business::CompanyDesignator - module for matching and stripping/manipulating the |
391
|
|
|
|
|
|
|
company designators appended to company names |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=head1 VERSION |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
Version: 0.16. |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
This module is considered a B<BETA> release. Interfaces may change and/or break |
398
|
|
|
|
|
|
|
without notice until the module reaches version 1.0. |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=head1 SYNOPSIS |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
Business::CompanyDesignator is a perl module for matching and stripping/manipulating |
403
|
|
|
|
|
|
|
the typical company designators appended (or sometimes, prepended) to company names. |
404
|
|
|
|
|
|
|
It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and |
405
|
|
|
|
|
|
|
abbreviations (e.g. Corp., Inc., Ltd., GmbH etc). |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
use Business::CompanyDesignator; |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# Constructor |
410
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new; |
411
|
|
|
|
|
|
|
# Optionally, you can provide your own company_designator.yml file, instead of the bundled one |
412
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml'); |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.) |
415
|
|
|
|
|
|
|
@des = $bcd->designators; |
416
|
|
|
|
|
|
|
@long = $bcd->long_designators; |
417
|
|
|
|
|
|
|
@abbrev = $bcd->abbreviations; |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# Lookup individual designator records (returns B::CD::Record objects) |
420
|
|
|
|
|
|
|
# Lookup record by long designator (unique) |
421
|
|
|
|
|
|
|
$record = $bcd->record($long_designator); |
422
|
|
|
|
|
|
|
# Lookup records by abbreviation or long designator (may not be unique) |
423
|
|
|
|
|
|
|
@records = $bcd->records($designator); |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# Get a regex for matching designators by type ('end'/'begin') and lang |
426
|
|
|
|
|
|
|
# By default, returns 'end' regexes for all languages |
427
|
|
|
|
|
|
|
$re = $bcd->regex; |
428
|
|
|
|
|
|
|
$company_name =~ $re and say 'designator found!'; |
429
|
|
|
|
|
|
|
$company_name =~ /$re\s*$/ and say 'final designator found!'; |
430
|
|
|
|
|
|
|
my $re_begin_en = $bcd->regex('begin', 'en'); |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
# Split $company_name on designator, returning a ($before, $designator, $after) triplet, |
433
|
|
|
|
|
|
|
# plus the normalised form of the designator matched (can pass to records(), for example) |
434
|
|
|
|
|
|
|
($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name); |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object |
437
|
|
|
|
|
|
|
$res = $bcd->split_designator($company_name, lang => 'en'); |
438
|
|
|
|
|
|
|
print join ' / ', $res->designator_std, $res->short_name, $res->extra; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=head1 DATASET |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
Business::CompanyDesignator uses the company designator dataset from here: |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
L<https://github.com/ProfoundNetworks/company_designator> |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
which is bundled with the module. You can use your own (updated or custom) |
448
|
|
|
|
|
|
|
version, if you prefer, by passing a 'datafile' parameter to the constructor. |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
The dataset defines multiple long form designators (like "Company", "Limited", |
451
|
|
|
|
|
|
|
or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.', |
452
|
|
|
|
|
|
|
'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry, |
453
|
|
|
|
|
|
|
for instance, looks like this: |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
Company: |
456
|
|
|
|
|
|
|
abbr: |
457
|
|
|
|
|
|
|
- Co. |
458
|
|
|
|
|
|
|
- '& Co.' |
459
|
|
|
|
|
|
|
- and Co. |
460
|
|
|
|
|
|
|
- and Company |
461
|
|
|
|
|
|
|
lang: en |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Long designators are unique across the dataset, but abbreviations are not |
464
|
|
|
|
|
|
|
e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée". |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=head1 METHODS |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head2 new() |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Creates a Business::CompanyDesignator object. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new; |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
By default this uses the bundled company_designator dataset. You may |
475
|
|
|
|
|
|
|
provide your own (updated or custom) version by passing via a 'datafile' |
476
|
|
|
|
|
|
|
parameter to the constructor. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml'); |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=head2 designators() |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
Returns the full list of company designator strings from the dataset |
483
|
|
|
|
|
|
|
(both long form and abbreviations). |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
@designators = $bcd->designators; |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=head2 long_designators() |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
Returns the full list of long form designators from the dataset. |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
@long = $bcd->long_designators; |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head2 abbreviations() |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
Returns the full list of abbreviation designators from the dataset. |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
@abbrev = $bcd->abbreviations; |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=head2 record($long_designator) |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
Returns the Business::CompanyDesignator::Record object for the given |
502
|
|
|
|
|
|
|
long designator (and dies if not found). |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head2 records($designator) |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
Returns a list of Business::CompanyDesignator::Record objects for the |
507
|
|
|
|
|
|
|
given abbreviation or long designator (for long designators there will |
508
|
|
|
|
|
|
|
only be a single record returned, but abbreviations may map to multiple |
509
|
|
|
|
|
|
|
records). |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Use this method for abbreviations, or if you're aren't sure of a |
512
|
|
|
|
|
|
|
designator's type. |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=head2 regex([$type], [$lang]) |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
Returns a regex for all matching designators for $type ('begin'/'end') and |
517
|
|
|
|
|
|
|
$lang (iso 639-1 language code e.g. 'en', 'es', de', etc.) from the dataset. |
518
|
|
|
|
|
|
|
$lang may be either a single language code scalar, or an arrayref of language |
519
|
|
|
|
|
|
|
codes, for multiple alternative languages. The returned regex is case-insensitive |
520
|
|
|
|
|
|
|
and non-anchored. |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
$type defaults to 'end', so without parameters regex() returns a regex |
523
|
|
|
|
|
|
|
matching all designators for all languages. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool]) |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
Attempts to split $company_name on (the first) company designator found. |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
In array context split_designator returns a list of four items - a triplet of |
530
|
|
|
|
|
|
|
strings from $company_name ( $before, $designator, $after ), plus the |
531
|
|
|
|
|
|
|
standardised version of the designator as a fourth element. |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name); |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult> |
536
|
|
|
|
|
|
|
object. |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
$res = $bcd->split_designator($company_name, lang => $lang); |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
The $des designator in array context, and the SplitResult $res->designator |
541
|
|
|
|
|
|
|
is the designator text as it matched in $company_name, while the array context |
542
|
|
|
|
|
|
|
$des_std, and the SplitResult $res->designator_std is the standardised version |
543
|
|
|
|
|
|
|
as found in the dataset. |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but |
546
|
|
|
|
|
|
|
"Pty. Ltd." as the stardardised form, and the latter would be what you |
547
|
|
|
|
|
|
|
would find in designators() or would lookup with records(). Similarly, |
548
|
|
|
|
|
|
|
"Accessoires XYZ Ltee" (without the french acute) would match, returning |
549
|
|
|
|
|
|
|
"Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the |
550
|
|
|
|
|
|
|
standardised form. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
split_designator accepts the following optional (named) parameters: |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=over 4 |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=item lang => $lang |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
$lang can be a scalar ISO 639-1 language code ('en', 'fr', 'cn', etc.), or an |
559
|
|
|
|
|
|
|
arrayref containing multiple language codes. If $lang is defined, split_designator |
560
|
|
|
|
|
|
|
will only match designators for the specified set of languages, which can improve |
561
|
|
|
|
|
|
|
the accuracy of the split by reducing false positive matches. |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=item allow_embedded => $boolean |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
allow_embedded is a boolean indicating whether or not designators can occur in |
566
|
|
|
|
|
|
|
the middle of strings, instead of only at the beginning or end. Defaults to true, |
567
|
|
|
|
|
|
|
for backwards compatibility, which yields more matches, but also more false |
568
|
|
|
|
|
|
|
positives. Setting to false is safer, but yields fewer matches (and embedded |
569
|
|
|
|
|
|
|
designators do occur surprisingly often in the wild.) |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
For more discussion, see L<AMBIGUITIES> below. |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=back |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=head2 AMBIGUITIES |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
Note that split_designator does not always get the split right. It checks for |
578
|
|
|
|
|
|
|
final designators first, then leading ones, and then finally looks for embedded |
579
|
|
|
|
|
|
|
designators (if allow_embedded is set to true). |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Leading and trailing designators are usually reasonably accurate, but embedded |
582
|
|
|
|
|
|
|
designators are problematic. For instance, embedded designators allow names like |
583
|
|
|
|
|
|
|
these to split correctly: |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
Amerihealth Insurance Company of NJ |
586
|
|
|
|
|
|
|
Trenkwalder Personal AG Schweiz |
587
|
|
|
|
|
|
|
Vicente Campano S L (COMERCIAL VICAM) |
588
|
|
|
|
|
|
|
Gvozdika, gostinitsa OOO ""Eko-Treyd"" |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
but it will also wrongly split names like the following: |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation') |
593
|
|
|
|
|
|
|
Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada') |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
If you do want to allow splitting on embedded designators, you might want to pass |
596
|
|
|
|
|
|
|
a 'lang' parameter to split_designator if you know the language(s) used for your |
597
|
|
|
|
|
|
|
company names, as this will reduce the number of false positives by restricting the |
598
|
|
|
|
|
|
|
set of designators matched against. It won't eliminate the issue altogether though, |
599
|
|
|
|
|
|
|
so some post-processing might be required. (And I'd love to hear of ideas on how |
600
|
|
|
|
|
|
|
to improve this.) |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=head1 SEE ALSO |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
Finance::CompanyNames |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=head1 AUTHOR |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
Gavin Carr <gavin@profound.net> |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENCE |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
Copyright (C) 2013-2021 Gavin Carr |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it |
615
|
|
|
|
|
|
|
under the same terms as Perl itself. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
=cut |