line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Business::CompanyDesignator; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# Require perl 5.010 because the 'track' functionality of Regexp::Assemble |
4
|
|
|
|
|
|
|
# is unsafe for earlier versions. |
5
|
9
|
|
|
9
|
|
660497
|
use 5.010001; |
|
9
|
|
|
|
|
106
|
|
6
|
9
|
|
|
9
|
|
4566
|
use Moose; |
|
9
|
|
|
|
|
3704066
|
|
|
9
|
|
|
|
|
70
|
|
7
|
9
|
|
|
9
|
|
61055
|
use utf8; |
|
9
|
|
|
|
|
60
|
|
|
9
|
|
|
|
|
57
|
|
8
|
9
|
|
|
9
|
|
314
|
use warnings qw(FATAL utf8); |
|
9
|
|
|
|
|
16
|
|
|
9
|
|
|
|
|
413
|
|
9
|
9
|
|
|
9
|
|
2618
|
use FindBin qw($Bin); |
|
9
|
|
|
|
|
5346
|
|
|
9
|
|
|
|
|
1148
|
|
10
|
9
|
|
|
9
|
|
2319
|
use YAML; |
|
9
|
|
|
|
|
32361
|
|
|
9
|
|
|
|
|
505
|
|
11
|
9
|
|
|
9
|
|
5562
|
use File::ShareDir qw(dist_file); |
|
9
|
|
|
|
|
174307
|
|
|
9
|
|
|
|
|
567
|
|
12
|
9
|
|
|
9
|
|
68
|
use List::MoreUtils qw(uniq); |
|
9
|
|
|
|
|
27
|
|
|
9
|
|
|
|
|
42
|
|
13
|
9
|
|
|
9
|
|
11876
|
use Regexp::Assemble; |
|
9
|
|
|
|
|
158772
|
|
|
9
|
|
|
|
|
342
|
|
14
|
9
|
|
|
9
|
|
5041
|
use Unicode::Normalize; |
|
9
|
|
|
|
|
17531
|
|
|
9
|
|
|
|
|
664
|
|
15
|
9
|
|
|
9
|
|
70
|
use Carp; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
434
|
|
16
|
|
|
|
|
|
|
|
17
|
9
|
|
|
9
|
|
4151
|
use Business::CompanyDesignator::Record; |
|
9
|
|
|
|
|
39
|
|
|
9
|
|
|
|
|
543
|
|
18
|
9
|
|
|
9
|
|
5097
|
use Business::CompanyDesignator::SplitResult; |
|
9
|
|
|
|
|
29
|
|
|
9
|
|
|
|
|
10042
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $VERSION = '0.17'; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Hardcode the set of languages that we treat as 'continuous' |
23
|
|
|
|
|
|
|
# i.e. their non-ascii designators don't require a word break |
24
|
|
|
|
|
|
|
# before/after. |
25
|
|
|
|
|
|
|
our %LANG_CONTINUA = map { $_ => 1 } qw( |
26
|
|
|
|
|
|
|
zh |
27
|
|
|
|
|
|
|
ja |
28
|
|
|
|
|
|
|
ko |
29
|
|
|
|
|
|
|
); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
has 'datafile' => ( is => 'ro', default => sub { |
32
|
|
|
|
|
|
|
# Development/test version |
33
|
|
|
|
|
|
|
my $local_datafile = "$Bin/../share/company_designator_dev.yml"; |
34
|
|
|
|
|
|
|
return $local_datafile if -f $local_datafile; |
35
|
|
|
|
|
|
|
$local_datafile = "$Bin/../share/company_designator.yml"; |
36
|
|
|
|
|
|
|
return $local_datafile if -f $local_datafile; |
37
|
|
|
|
|
|
|
# Installed version |
38
|
|
|
|
|
|
|
return dist_file('Business-CompanyDesignator', 'company_designator.yml'); |
39
|
|
|
|
|
|
|
}); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# data is the raw dataset as loaded from datafile, keyed by long designator |
42
|
|
|
|
|
|
|
has data => ( is => 'ro', lazy_build => 1 ); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# regex_cache is a cache of regexes by language and type, since they're expensive to build |
45
|
|
|
|
|
|
|
has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref of |
48
|
|
|
|
|
|
|
# long designators (since abbreviations are not necessarily unique) |
49
|
|
|
|
|
|
|
has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 ); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# pattern_string_map is a hash mapping patterns back to their source string, |
52
|
|
|
|
|
|
|
# since we do things like add additional patterns without diacritics |
53
|
|
|
|
|
|
|
has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
54
|
|
|
|
|
|
|
# pattern_string_map_lang is a hash of hashes, mapping language codes to hashes |
55
|
|
|
|
|
|
|
# of patterns back to their source string |
56
|
|
|
|
|
|
|
has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub _build_data { |
59
|
8
|
|
|
8
|
|
53
|
my $self = shift; |
60
|
8
|
|
|
|
|
256
|
YAML::LoadFile($self->datafile); |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub _build_abbr_long_map { |
64
|
5
|
|
|
5
|
|
28
|
my $self = shift; |
65
|
5
|
|
|
|
|
16
|
my $map = {}; |
66
|
5
|
|
|
|
|
13
|
while (my ($long, $entry) = each %{ $self->data }) { |
|
935
|
|
|
|
|
22972
|
|
67
|
930
|
100
|
|
|
|
1731
|
if (my $abbr = $entry->{abbr_std}) { |
68
|
25
|
|
100
|
|
|
112
|
$map->{$abbr} ||= []; |
69
|
25
|
|
|
|
|
39
|
push @{ $map->{$abbr} }, $long; |
|
25
|
|
|
|
|
75
|
|
70
|
|
|
|
|
|
|
} |
71
|
930
|
100
|
|
|
|
1735
|
my $abbr_list = $entry->{abbr} or next; |
72
|
900
|
100
|
|
|
|
1728
|
$abbr_list = [ $abbr_list ] if ! ref $abbr_list; |
73
|
900
|
|
|
|
|
1386
|
for my $abbr (@$abbr_list) { |
74
|
1440
|
|
100
|
|
|
6043
|
$map->{$abbr} ||= []; |
75
|
1440
|
|
|
|
|
1695
|
push @{ $map->{$abbr} }, $long; |
|
1440
|
|
|
|
|
3596
|
|
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
} |
78
|
5
|
|
|
|
|
138
|
return $map; |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub long_designators { |
82
|
4
|
|
|
4
|
1
|
1197
|
my $self = shift; |
83
|
4
|
|
|
|
|
9
|
sort keys %{ $self->data }; |
|
4
|
|
|
|
|
141
|
|
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
sub abbreviations { |
87
|
3
|
|
|
3
|
1
|
976
|
my $self = shift; |
88
|
3
|
|
|
|
|
6
|
sort keys %{ $self->abbr_long_map }; |
|
3
|
|
|
|
|
114
|
|
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
sub designators { |
92
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
93
|
1
|
|
|
|
|
5
|
sort $self->long_designators, $self->abbreviations; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# Return the B::CD::Record for $long designator |
97
|
|
|
|
|
|
|
sub record { |
98
|
1782
|
|
|
1782
|
1
|
48890
|
my ($self, $long) = @_; |
99
|
1782
|
100
|
|
|
|
45238
|
my $entry = $self->data->{$long} |
100
|
|
|
|
|
|
|
or croak "Invalid long designator '$long'"; |
101
|
1781
|
|
|
|
|
51476
|
return Business::CompanyDesignator::Record->new( long => $long, record => $entry ); |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# Return a list of B::CD::Records for $designator |
105
|
|
|
|
|
|
|
sub records { |
106
|
1427
|
|
|
1427
|
1
|
562913
|
my ($self, $designator) = @_; |
107
|
1427
|
50
|
|
|
|
3383
|
croak "Missing designator" if ! $designator; |
108
|
1427
|
100
|
|
|
|
43146
|
if (exists $self->data->{$designator}) { |
|
|
100
|
|
|
|
|
|
109
|
388
|
|
|
|
|
1127
|
return ( $self->record($designator) ); |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
elsif (my $long_set = $self->abbr_long_map->{$designator}) { |
112
|
1038
|
|
|
|
|
2413
|
return map { $self->record($_) } @$long_set |
|
1207
|
|
|
|
|
2818
|
|
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
else { |
115
|
1
|
|
|
|
|
19
|
croak "Invalid designator '$designator'"; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# Add $string to regex assembler |
120
|
|
|
|
|
|
|
sub _add_to_assembler { |
121
|
1973
|
|
|
1973
|
|
3716
|
my ($self, $assembler, $lang, $string, $reference_string) = @_; |
122
|
1973
|
|
66
|
|
|
4481
|
$reference_string ||= $string; |
123
|
|
|
|
|
|
|
# printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang || []}), $string, $reference_string; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs: |
126
|
|
|
|
|
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=50228 |
127
|
|
|
|
|
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=74449 |
128
|
|
|
|
|
|
|
# $assembler->add($string) |
129
|
|
|
|
|
|
|
# Workaround by lexing and using insert() |
130
|
1973
|
|
|
|
|
2572
|
my $optional1 = '\\.?,?\\s*'; |
131
|
|
|
|
|
|
|
my @pattern = map { |
132
|
|
|
|
|
|
|
# Periods are treated as optional literals, with optional trailing commas and/or whitespace |
133
|
1973
|
100
|
|
|
|
6542
|
/\./ ? $optional1 : |
|
27614
|
100
|
|
|
|
66046
|
|
|
|
100
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# Embedded spaces can be multiple, and include leading commas |
135
|
|
|
|
|
|
|
/ / ? ',?\s+' : |
136
|
|
|
|
|
|
|
# Escape other regex metacharacters |
137
|
|
|
|
|
|
|
/[()]/ ? "\\$_" : $_ |
138
|
|
|
|
|
|
|
} split //, $string; |
139
|
1973
|
|
|
|
|
7194
|
$assembler->insert(@pattern); |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang |
142
|
1973
|
|
|
|
|
218229
|
my $pattern_string = join '', @pattern; |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# Special case - optional match characters can cause clashes between |
145
|
|
|
|
|
|
|
# distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/ |
146
|
|
|
|
|
|
|
# We need to handle such cases as ambiguous with extra checks |
147
|
1973
|
|
|
|
|
3110
|
my $optional1e = "\Q$optional1\E"; |
148
|
1973
|
|
|
|
|
2362
|
my $alt_pattern_string1; |
149
|
1973
|
100
|
|
|
|
10447
|
if ($pattern_string =~ /^(\w)(\w)$/) { |
|
|
100
|
|
|
|
|
|
150
|
96
|
|
|
|
|
366
|
$alt_pattern_string1 = "$1$optional1$2$optional1"; |
151
|
|
|
|
|
|
|
} elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) { |
152
|
109
|
|
|
|
|
335
|
$alt_pattern_string1 = "$1$2"; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# If $pattern_string already exists in pattern_string_map then the pattern is ambiguous |
156
|
|
|
|
|
|
|
# across entries, and we can't unambiguously map back to a standard designator |
157
|
1973
|
100
|
66
|
|
|
58918
|
if (exists $self->pattern_string_map->{ $pattern_string }) { |
|
|
100
|
|
|
|
|
|
158
|
432
|
|
|
|
|
10812
|
my $current = $self->pattern_string_map->{ $pattern_string }; |
159
|
432
|
100
|
100
|
|
|
1518
|
if ($current && $current ne $reference_string) { |
160
|
|
|
|
|
|
|
# Reset to undef to mark ambiguity |
161
|
3
|
|
|
|
|
84
|
$self->pattern_string_map->{ $pattern_string } = undef; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
# Also check for the existence of $alt_pattern_string1, since this is also an ambiguity |
165
|
|
|
|
|
|
|
elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) { |
166
|
7
|
|
|
|
|
209
|
my $current = $self->pattern_string_map->{ $alt_pattern_string1 }; |
167
|
7
|
50
|
33
|
|
|
42
|
if ($current && $current ne $reference_string) { |
168
|
|
|
|
|
|
|
# Reset both pairs to undef to mark ambiguity |
169
|
7
|
|
|
|
|
195
|
$self->pattern_string_map->{ $pattern_string } = undef; |
170
|
7
|
|
|
|
|
181
|
$self->pattern_string_map->{ $alt_pattern_string1 } = undef; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
else { |
174
|
1534
|
|
|
|
|
38994
|
$self->pattern_string_map->{ $pattern_string } = $reference_string; |
175
|
|
|
|
|
|
|
} |
176
|
1973
|
100
|
|
|
|
3605
|
if ($lang) { |
177
|
681
|
|
|
|
|
1145
|
for my $l (@$lang) { |
178
|
821
|
100
|
|
|
|
21608
|
if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) { |
179
|
230
|
|
|
|
|
6226
|
my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string }; |
180
|
230
|
50
|
33
|
|
|
780
|
if ($current && $current ne $reference_string) { |
181
|
|
|
|
|
|
|
# Reset to undef to mark ambiguity |
182
|
0
|
|
|
|
|
0
|
$self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
else { |
186
|
591
|
|
|
|
|
15165
|
$self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# If $string contains unicode diacritics, also add a version without them for misspellings |
192
|
9
|
100
|
|
9
|
|
75
|
if ($string =~ m/\pM/) { |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
138
|
|
|
1973
|
|
|
|
|
8511
|
|
193
|
231
|
|
|
|
|
404
|
my $stripped = $string; |
194
|
231
|
|
|
|
|
1347
|
$stripped =~ s/\pM//g; |
195
|
231
|
|
|
|
|
750
|
$self->_add_to_assembler($assembler, $lang, $stripped, $reference_string); |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# Assemble designator regexes |
200
|
|
|
|
|
|
|
sub _build_regex { |
201
|
50
|
|
|
50
|
|
104
|
my $self = shift; |
202
|
50
|
|
|
|
|
148
|
my ($type, $lang) = @_; |
203
|
|
|
|
|
|
|
|
204
|
50
|
|
|
|
|
105
|
state $types = { map { $_ => 1 } qw(end end_cont begin) }; |
|
12
|
|
|
|
|
41
|
|
205
|
50
|
50
|
|
|
|
193
|
if (! $types->{$type}) { |
206
|
0
|
|
|
|
|
0
|
croak "invalid regex type '$type'"; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# RA constructor - case insensitive, with match tracking |
210
|
50
|
|
|
|
|
344
|
my $assembler = Regexp::Assemble->new->flags('i')->track(1); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# Construct language regex if $lang is set |
213
|
50
|
|
|
|
|
4053
|
my $lang_re; |
214
|
50
|
100
|
|
|
|
144
|
if ($lang) { |
215
|
44
|
100
|
|
|
|
204
|
$lang = [ $lang ] if ! ref $lang; |
216
|
44
|
|
|
|
|
200
|
my $lang_str = join '|', sort @$lang; |
217
|
44
|
|
|
|
|
698
|
$lang_re = qr/^($lang_str)$/; |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
50
|
|
|
|
|
113
|
my $count = 0; |
221
|
50
|
|
|
|
|
97
|
while (my ($long, $entry) = each %{ $self->data }) { |
|
9350
|
|
|
|
|
222252
|
|
222
|
|
|
|
|
|
|
# If $lang is set, restrict to entries that include $lang |
223
|
9300
|
100
|
100
|
|
|
38214
|
next if $lang_re && $entry->{lang} !~ $lang_re; |
224
|
|
|
|
|
|
|
# If $type is 'begin', restrict to 'lead' entries |
225
|
1528
|
100
|
100
|
|
|
3676
|
next if $type eq 'begin' && ! $entry->{lead}; |
226
|
|
|
|
|
|
|
# if $type is 'end_cont', restrict to languages in %LANG_CONTINUA |
227
|
1023
|
100
|
100
|
|
|
2742
|
next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}}; |
228
|
|
|
|
|
|
|
|
229
|
691
|
|
|
|
|
924
|
$count++; |
230
|
691
|
|
|
|
|
3535
|
my $long_nfd = NFD($long); |
231
|
691
|
|
|
|
|
1923
|
$self->_add_to_assembler($assembler, $lang, $long_nfd); |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Add all abbreviations |
234
|
691
|
100
|
|
|
|
2076
|
if (my $abbr_list = $entry->{abbr}) { |
235
|
659
|
100
|
|
|
|
1523
|
$abbr_list = [ $abbr_list ] if ! ref $abbr_list; |
236
|
659
|
|
|
|
|
1298
|
for my $abbr (@$abbr_list) { |
237
|
|
|
|
|
|
|
# Only treat non-ascii abbreviations as continuous |
238
|
1114
|
100
|
100
|
|
|
2440
|
next if $type eq 'end_cont' && $abbr =~ /^\p{ASCII}+$/; |
239
|
1051
|
|
|
|
|
3900
|
my $abbr_nfd = NFD($abbr); |
240
|
1051
|
|
66
|
|
|
4248
|
my $abbr_std = NFD($entry->{abbr_std} || $abbr); |
241
|
1051
|
|
|
|
|
2266
|
$self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std); |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# If no entries found (a strange/bogus language?), return undef |
247
|
50
|
100
|
|
|
|
405
|
return if $count == 0; |
248
|
|
|
|
|
|
|
|
249
|
35
|
50
|
|
|
|
208
|
return wantarray ? ( $assembler->re, $assembler ) : $assembler->re; |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.) |
253
|
|
|
|
|
|
|
# $type defaults to 'end', $lang defaults to undef (for all) |
254
|
|
|
|
|
|
|
sub regex { |
255
|
3282
|
|
|
3282
|
1
|
4604
|
my $self = shift; |
256
|
3282
|
|
|
|
|
5823
|
my ($type, $lang) = @_; |
257
|
3282
|
|
50
|
|
|
6010
|
$type ||= 'end'; |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
# $lang might be an arrayref containing multiple language codes |
260
|
3282
|
|
|
|
|
4049
|
my $lang_key; |
261
|
3282
|
100
|
|
|
|
5677
|
if ($lang) { |
262
|
1152
|
|
|
|
|
1525
|
$lang_key = $lang; |
263
|
1152
|
50
|
66
|
|
|
2657
|
if (ref $lang && ref $lang eq 'ARRAY' && @$lang) { |
|
|
|
66
|
|
|
|
|
264
|
8
|
50
|
|
|
|
22
|
if (@$lang == 1) { |
265
|
0
|
|
|
|
|
0
|
$lang_key = $lang->[0]; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
else { |
268
|
8
|
|
|
|
|
22
|
$lang_key = join '_', sort map { lc $_ } @$lang; |
|
16
|
|
|
|
|
78
|
|
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
3282
|
|
|
|
|
4359
|
my $cache_key = $type; |
274
|
3282
|
100
|
|
|
|
6168
|
$cache_key .= "_$lang_key" if $lang_key; |
275
|
|
|
|
|
|
|
|
276
|
3282
|
100
|
|
|
|
88646
|
if (my $entry = $self->regex_cache->{ $cache_key }) { |
277
|
3232
|
50
|
|
|
|
12012
|
return wantarray ? @$entry : $entry->[0]; |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
50
|
|
|
|
|
227
|
my ($re, $assembler) = $self->_build_regex($type, $lang); |
281
|
50
|
|
|
|
|
230461
|
$self->regex_cache->{ $cache_key } = [ $re, $assembler ]; |
282
|
50
|
50
|
|
|
|
354
|
return wantarray ? ( $re, $assembler ) : $re; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Helper to return split_designator results |
286
|
|
|
|
|
|
|
sub _split_designator_result { |
287
|
1286
|
|
|
1286
|
|
12125
|
my $self = shift; |
288
|
1286
|
|
|
|
|
5290
|
my ($lang, $before, $des, $after, $matched_pattern) = @_; |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# $before can end in whitespace (that we don't want to consume in the RE |
291
|
|
|
|
|
|
|
# for technical reasons around handling punctuation like '& Co' in designators) |
292
|
|
|
|
|
|
|
# So trim here to handle that case. |
293
|
1286
|
100
|
|
|
|
5959
|
$before =~ s/\s+$// if $before; |
294
|
|
|
|
|
|
|
|
295
|
1286
|
|
|
|
|
2214
|
my $des_std; |
296
|
1286
|
100
|
|
|
|
2782
|
if ($matched_pattern) { |
297
|
980
|
100
|
|
|
|
10705
|
$des_std = $self->pattern_string_map_lang->{$lang}->{$matched_pattern} if $lang; |
298
|
980
|
|
100
|
|
|
24003
|
$des_std ||= $self->pattern_string_map->{$matched_pattern}; |
299
|
980
|
100
|
|
|
|
2470
|
if ($des_std) { |
300
|
|
|
|
|
|
|
# Always coalesce spaces and delete commas from $des_std |
301
|
960
|
|
|
|
|
2274
|
$des_std =~ s/,+/ /g; |
302
|
960
|
|
|
|
|
2973
|
$des_std =~ s/\s\s+/ /g; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# Legacy interface - return a simple before / des / after tuple, plus $des_std |
307
|
1286
|
100
|
66
|
|
|
3390
|
return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std) |
|
1728
|
100
|
|
|
|
13422
|
|
308
|
|
|
|
|
|
|
if wantarray; |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# New scalar-context interface - return SplitResult object |
311
|
854
|
100
|
100
|
|
|
24006
|
Business::CompanyDesignator::SplitResult->new( |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
312
|
|
|
|
|
|
|
before => NFC($before // ''), |
313
|
|
|
|
|
|
|
designator => NFC($des // ''), |
314
|
|
|
|
|
|
|
designator_std => NFC($des_std // ''), |
315
|
|
|
|
|
|
|
after => NFC($after // ''), |
316
|
|
|
|
|
|
|
records => [ $des_std ? $self->records(NFC $des_std) : () ], |
317
|
|
|
|
|
|
|
); |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# Split $company_name on (the first) company designator, returning a triplet of strings: |
321
|
|
|
|
|
|
|
# ($before, $designator, $after), plus the normalised form of the designator. If no |
322
|
|
|
|
|
|
|
# designator is found, just returns ($company_name). |
323
|
|
|
|
|
|
|
# e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for |
324
|
|
|
|
|
|
|
# the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator, |
325
|
|
|
|
|
|
|
# but "Ltée" for the normalised form |
326
|
|
|
|
|
|
|
sub split_designator { |
327
|
1286
|
|
|
1286
|
1
|
285419
|
my $self = shift; |
328
|
1286
|
|
|
|
|
4113
|
my ($company_name, %arg) = @_; |
329
|
1286
|
|
|
|
|
2544
|
my $lang = $arg{lang}; |
330
|
1286
|
|
|
|
|
3041
|
my $allow_embedded = $arg{allow_embedded}; |
331
|
1286
|
|
100
|
|
|
5448
|
$allow_embedded //= 1; # backwards-compatibility, unfortunately |
332
|
1286
|
|
|
|
|
8759
|
my $company_name_match = NFD($company_name); |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Handle older perls without XPosixPunct |
335
|
1286
|
50
|
|
|
|
2326
|
state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ? |
|
4
|
|
|
|
|
35
|
|
336
|
|
|
|
|
|
|
'[\s\p{XPosixPunct}]' : |
337
|
|
|
|
|
|
|
'[\s[:punct:]]'; |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# Strip all brackets for continuous language matching |
340
|
1286
|
|
|
|
|
5330
|
(my $company_name_match_cont_stripped = $company_name_match) =~ s/[()\x{ff08}\x{ff09}]//g; |
341
|
|
|
|
|
|
|
|
342
|
1286
|
|
|
|
|
2444
|
my ($end_re, $end_asr, $end_cont_re, $end_cont_asr, $begin_re, $begin_asr); |
343
|
1286
|
100
|
|
|
|
2925
|
if ($lang) { |
344
|
576
|
100
|
|
|
|
1903
|
if ($LANG_CONTINUA{$lang}) { |
345
|
36
|
|
|
|
|
195
|
($end_cont_re, $end_cont_asr) = $self->regex('end_cont', $lang); |
346
|
|
|
|
|
|
|
} else { |
347
|
540
|
|
|
|
|
2023
|
($end_re, $end_asr) = $self->regex('end', $lang); |
348
|
|
|
|
|
|
|
} |
349
|
576
|
|
|
|
|
1584
|
($begin_re, $begin_asr) = $self->regex('begin', $lang); |
350
|
|
|
|
|
|
|
} else { |
351
|
710
|
|
|
|
|
2100
|
($end_re, $end_asr) = $self->regex('end'); |
352
|
710
|
|
|
|
|
1790
|
($end_cont_re, $end_cont_asr) = $self->regex('end_cont'); |
353
|
710
|
|
|
|
|
1574
|
($begin_re, $begin_asr) = $self->regex('begin'); |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# Designators are usually final, so try $end_re first |
357
|
1286
|
100
|
100
|
|
|
360951
|
if ($end_re && |
358
|
|
|
|
|
|
|
$company_name_match =~ m/^\s*(.*?)${punct_class}\s*\(?($end_re)\)?\s*$/) { |
359
|
677
|
|
|
|
|
1266193
|
return $self->_split_designator_result($lang, $1, $2, undef, $end_asr->source($^R)); |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
# No final designator - retry without a word break for the subset of languages |
363
|
|
|
|
|
|
|
# that use continuous scripts (see %LANG_CONTINUA above) |
364
|
609
|
100
|
100
|
|
|
632773
|
if ($end_cont_re && |
365
|
|
|
|
|
|
|
$company_name_match_cont_stripped =~ m/^\s*(.*?)\(?($end_cont_re)\)?\s*$/) { |
366
|
114
|
|
|
|
|
1902
|
return $self->_split_designator_result($lang, $1, $2, undef, $end_cont_asr->source($^R)); |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# No final designator - check for a lead designator instead (e.g. RU, NL, etc.) |
370
|
495
|
100
|
100
|
|
|
45453
|
if ($begin_re && |
371
|
|
|
|
|
|
|
$company_name_match =~ m/^\s*\(?($begin_re)\)?${punct_class}\s*(.*?)\s*$/) { |
372
|
147
|
|
|
|
|
53550
|
return $self->_split_designator_result($lang, undef, $1, $2, $begin_asr->source($^R)); |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# No final or initial - check for an embedded designator with trailing content |
376
|
348
|
100
|
100
|
|
|
34810
|
if ($end_re && $allow_embedded && |
|
|
|
100
|
|
|
|
|
377
|
|
|
|
|
|
|
$company_name_match =~ m/(.*?)${punct_class}\s*\(?($end_re)\)?(?:\s+(.*?))?$/) { |
378
|
42
|
|
|
|
|
70350
|
return $self->_split_designator_result($lang, $1, $2, $3, $end_asr->source($^R)); |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# No match - return $company_name unchanged |
382
|
306
|
|
|
|
|
16268
|
return $self->_split_designator_result($lang, $company_name); |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
1; |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
__END__ |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=encoding utf-8 |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head1 NAME |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
Business::CompanyDesignator - module for matching and stripping/manipulating the |
394
|
|
|
|
|
|
|
company designators appended to company names |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=head1 VERSION |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
Version: 0.17. |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
This module is considered a B<BETA> release. Interfaces may change and/or break |
401
|
|
|
|
|
|
|
without notice until the module reaches version 1.0. |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 SYNOPSIS |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Business::CompanyDesignator is a perl module for matching and stripping/manipulating |
406
|
|
|
|
|
|
|
the typical company designators appended (or sometimes, prepended) to company names. |
407
|
|
|
|
|
|
|
It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and |
408
|
|
|
|
|
|
|
abbreviations (e.g. Corp., Inc., Ltd., GmbH etc). |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
use Business::CompanyDesignator; |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# Constructor |
413
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new; |
414
|
|
|
|
|
|
|
# Optionally, you can provide your own company_designator.yml file, instead of the bundled one |
415
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml'); |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.) |
418
|
|
|
|
|
|
|
@des = $bcd->designators; |
419
|
|
|
|
|
|
|
@long = $bcd->long_designators; |
420
|
|
|
|
|
|
|
@abbrev = $bcd->abbreviations; |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# Lookup individual designator records (returns B::CD::Record objects) |
423
|
|
|
|
|
|
|
# Lookup record by long designator (unique) |
424
|
|
|
|
|
|
|
$record = $bcd->record($long_designator); |
425
|
|
|
|
|
|
|
# Lookup records by abbreviation or long designator (may not be unique) |
426
|
|
|
|
|
|
|
@records = $bcd->records($designator); |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# Get a regex for matching designators by type ('end'/'begin') and lang |
429
|
|
|
|
|
|
|
# By default, returns 'end' regexes for all languages |
430
|
|
|
|
|
|
|
$re = $bcd->regex; |
431
|
|
|
|
|
|
|
$company_name =~ $re and say 'designator found!'; |
432
|
|
|
|
|
|
|
$company_name =~ /$re\s*$/ and say 'final designator found!'; |
433
|
|
|
|
|
|
|
my $re_begin_en = $bcd->regex('begin', 'en'); |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# Split $company_name on designator, returning a ($before, $designator, $after) triplet, |
436
|
|
|
|
|
|
|
# plus the normalised form of the designator matched (can pass to records(), for example) |
437
|
|
|
|
|
|
|
($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name); |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
# Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object |
440
|
|
|
|
|
|
|
$res = $bcd->split_designator($company_name, lang => 'en'); |
441
|
|
|
|
|
|
|
print join ' / ', $res->designator_std, $res->short_name, $res->extra; |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=head1 DATASET |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
Business::CompanyDesignator uses the company designator dataset from here: |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
L<https://github.com/ProfoundNetworks/company_designator> |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
which is bundled with the module. You can use your own (updated or custom) |
451
|
|
|
|
|
|
|
version, if you prefer, by passing a 'datafile' parameter to the constructor. |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
The dataset defines multiple long form designators (like "Company", "Limited", |
454
|
|
|
|
|
|
|
or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.', |
455
|
|
|
|
|
|
|
'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry, |
456
|
|
|
|
|
|
|
for instance, looks like this: |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
Company: |
459
|
|
|
|
|
|
|
abbr: |
460
|
|
|
|
|
|
|
- Co. |
461
|
|
|
|
|
|
|
- '& Co.' |
462
|
|
|
|
|
|
|
- and Co. |
463
|
|
|
|
|
|
|
- and Company |
464
|
|
|
|
|
|
|
lang: en |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
Long designators are unique across the dataset, but abbreviations are not |
467
|
|
|
|
|
|
|
e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée". |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
=head1 METHODS |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=head2 new() |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
Creates a Business::CompanyDesignator object. |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new; |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
By default this uses the bundled company_designator dataset. You may |
478
|
|
|
|
|
|
|
provide your own (updated or custom) version by passing via a 'datafile' |
479
|
|
|
|
|
|
|
parameter to the constructor. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml'); |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=head2 designators() |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
Returns the full list of company designator strings from the dataset |
486
|
|
|
|
|
|
|
(both long form and abbreviations). |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
@designators = $bcd->designators; |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=head2 long_designators() |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
Returns the full list of long form designators from the dataset. |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
@long = $bcd->long_designators; |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=head2 abbreviations() |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
Returns the full list of abbreviation designators from the dataset. |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
@abbrev = $bcd->abbreviations; |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=head2 record($long_designator) |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
Returns the Business::CompanyDesignator::Record object for the given |
505
|
|
|
|
|
|
|
long designator (and dies if not found). |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=head2 records($designator) |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
Returns a list of Business::CompanyDesignator::Record objects for the |
510
|
|
|
|
|
|
|
given abbreviation or long designator (for long designators there will |
511
|
|
|
|
|
|
|
only be a single record returned, but abbreviations may map to multiple |
512
|
|
|
|
|
|
|
records). |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
Use this method for abbreviations, or if you're aren't sure of a |
515
|
|
|
|
|
|
|
designator's type. |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head2 regex([$type], [$lang]) |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
Returns a regex for all matching designators for $type ('begin'/'end') and |
520
|
|
|
|
|
|
|
$lang (iso 639-1 language code e.g. 'en', 'es', de', etc.) from the dataset. |
521
|
|
|
|
|
|
|
$lang may be either a single language code scalar, or an arrayref of language |
522
|
|
|
|
|
|
|
codes, for multiple alternative languages. The returned regex is case-insensitive |
523
|
|
|
|
|
|
|
and non-anchored. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
$type defaults to 'end', so without parameters regex() returns a regex |
526
|
|
|
|
|
|
|
matching all designators for all languages. |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool]) |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
Attempts to split $company_name on (the first) company designator found. |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
In array context split_designator returns a list of four items - a triplet of |
533
|
|
|
|
|
|
|
strings from $company_name ( $before, $designator, $after ), plus the |
534
|
|
|
|
|
|
|
standardised version of the designator as a fourth element. |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name); |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult> |
539
|
|
|
|
|
|
|
object. |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
$res = $bcd->split_designator($company_name, lang => $lang); |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
The $des designator in array context, and the SplitResult $res->designator |
544
|
|
|
|
|
|
|
is the designator text as it matched in $company_name, while the array context |
545
|
|
|
|
|
|
|
$des_std, and the SplitResult $res->designator_std is the standardised version |
546
|
|
|
|
|
|
|
as found in the dataset. |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but |
549
|
|
|
|
|
|
|
"Pty. Ltd." as the stardardised form, and the latter would be what you |
550
|
|
|
|
|
|
|
would find in designators() or would lookup with records(). Similarly, |
551
|
|
|
|
|
|
|
"Accessoires XYZ Ltee" (without the french acute) would match, returning |
552
|
|
|
|
|
|
|
"Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the |
553
|
|
|
|
|
|
|
standardised form. |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
split_designator accepts the following optional (named) parameters: |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=over 4 |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=item lang => $lang |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
$lang can be a scalar ISO 639-1 language code ('en', 'fr', 'cn', etc.), or an |
562
|
|
|
|
|
|
|
arrayref containing multiple language codes. If $lang is defined, split_designator |
563
|
|
|
|
|
|
|
will only match designators for the specified set of languages, which can improve |
564
|
|
|
|
|
|
|
the accuracy of the split by reducing false positive matches. |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=item allow_embedded => $boolean |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
allow_embedded is a boolean indicating whether or not designators can occur in |
569
|
|
|
|
|
|
|
the middle of strings, instead of only at the beginning or end. Defaults to true, |
570
|
|
|
|
|
|
|
for backwards compatibility, which yields more matches, but also more false |
571
|
|
|
|
|
|
|
positives. Setting to false is safer, but yields fewer matches (and embedded |
572
|
|
|
|
|
|
|
designators do occur surprisingly often in the wild.) |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
For more discussion, see L<AMBIGUITIES> below. |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=back |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=head2 AMBIGUITIES |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
Note that split_designator does not always get the split right. It checks for |
581
|
|
|
|
|
|
|
final designators first, then leading ones, and then finally looks for embedded |
582
|
|
|
|
|
|
|
designators (if allow_embedded is set to true). |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
Leading and trailing designators are usually reasonably accurate, but embedded |
585
|
|
|
|
|
|
|
designators are problematic. For instance, embedded designators allow names like |
586
|
|
|
|
|
|
|
these to split correctly: |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
Amerihealth Insurance Company of NJ |
589
|
|
|
|
|
|
|
Trenkwalder Personal AG Schweiz |
590
|
|
|
|
|
|
|
Vicente Campano S L (COMERCIAL VICAM) |
591
|
|
|
|
|
|
|
Gvozdika, gostinitsa OOO ""Eko-Treyd"" |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
but it will also wrongly split names like the following: |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation') |
596
|
|
|
|
|
|
|
Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada') |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
If you do want to allow splitting on embedded designators, you might want to pass |
599
|
|
|
|
|
|
|
a 'lang' parameter to split_designator if you know the language(s) used for your |
600
|
|
|
|
|
|
|
company names, as this will reduce the number of false positives by restricting the |
601
|
|
|
|
|
|
|
set of designators matched against. It won't eliminate the issue altogether though, |
602
|
|
|
|
|
|
|
so some post-processing might be required. (And I'd love to hear of ideas on how |
603
|
|
|
|
|
|
|
to improve this.) |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head1 SEE ALSO |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Finance::CompanyNames |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
=head1 AUTHOR |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
Gavin Carr <gavin@profound.net> |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENCE |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
Copyright (C) 2013-2021 Gavin Carr |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it |
618
|
|
|
|
|
|
|
under the same terms as Perl itself. |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
=cut |