line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Business::CompanyDesignator; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# Require perl 5.010 because the 'track' functionality of Regexp::Assemble |
4
|
|
|
|
|
|
|
# is unsafe for earlier versions. |
5
|
9
|
|
|
9
|
|
711708
|
use 5.010001; |
|
9
|
|
|
|
|
117
|
|
6
|
9
|
|
|
9
|
|
4951
|
use Moose; |
|
9
|
|
|
|
|
4092652
|
|
|
9
|
|
|
|
|
69
|
|
7
|
9
|
|
|
9
|
|
65865
|
use utf8; |
|
9
|
|
|
|
|
65
|
|
|
9
|
|
|
|
|
66
|
|
8
|
9
|
|
|
9
|
|
323
|
use warnings qw(FATAL utf8); |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
472
|
|
9
|
9
|
|
|
9
|
|
2863
|
use FindBin qw($Bin); |
|
9
|
|
|
|
|
5416
|
|
|
9
|
|
|
|
|
1209
|
|
10
|
9
|
|
|
9
|
|
2282
|
use YAML; |
|
9
|
|
|
|
|
33410
|
|
|
9
|
|
|
|
|
585
|
|
11
|
9
|
|
|
9
|
|
5869
|
use File::ShareDir qw(dist_file); |
|
9
|
|
|
|
|
186548
|
|
|
9
|
|
|
|
|
719
|
|
12
|
9
|
|
|
9
|
|
86
|
use List::MoreUtils qw(uniq); |
|
9
|
|
|
|
|
29
|
|
|
9
|
|
|
|
|
44
|
|
13
|
9
|
|
|
9
|
|
13354
|
use Regexp::Assemble; |
|
9
|
|
|
|
|
173264
|
|
|
9
|
|
|
|
|
428
|
|
14
|
9
|
|
|
9
|
|
5826
|
use Unicode::Normalize; |
|
9
|
|
|
|
|
19119
|
|
|
9
|
|
|
|
|
765
|
|
15
|
9
|
|
|
9
|
|
90
|
use Carp; |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
587
|
|
16
|
|
|
|
|
|
|
|
17
|
9
|
|
|
9
|
|
4640
|
use Business::CompanyDesignator::Record; |
|
9
|
|
|
|
|
41
|
|
|
9
|
|
|
|
|
704
|
|
18
|
9
|
|
|
9
|
|
5671
|
use Business::CompanyDesignator::SplitResult; |
|
9
|
|
|
|
|
37
|
|
|
9
|
|
|
|
|
10730
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $VERSION = '0.15'; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# datafile is the path of the YAML dataset to load. Unless supplied to the
# constructor, the candidates below are tried in order and the first one that
# exists as a plain file wins; otherwise the File::ShareDir copy is used.
has 'datafile' => ( is => 'ro', default => sub {
  # Development/test version (dev dataset first, then the regular dataset)
  for my $local_datafile (
    "$Bin/../share/company_designator_dev.yml",
    "$Bin/../share/company_designator.yml",
  ) {
    return $local_datafile if -f $local_datafile;
  }
  # Installed version
  return dist_file('Business-CompanyDesignator', 'company_designator.yml');
});

# data is the raw dataset as loaded from datafile, keyed by long designator
# (built on first use by _build_data)
has 'data' => ( is => 'ro', lazy_build => 1 );

# regex_cache is a cache of regexes by language and type, since they're
# expensive to build
has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );

# abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref
# of long designators (since abbreviations are not necessarily unique); built
# on first use by _build_abbr_long_map
has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 );

# pattern_string_map is a hash mapping patterns back to their source string,
# since we do things like add additional patterns without diacritics
has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );

# pattern_string_map_lang is a hash of hashes, mapping language codes to
# hashes of patterns back to their source string
has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Builder for the lazy 'data' attribute: hands the configured datafile path to
# YAML::LoadFile and returns whatever it produces.
sub _build_data {
  my ($self) = @_;
  return YAML::LoadFile($self->datafile);
}
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Builder for the lazy 'abbr_long_map' attribute.
#
# Returns a hashref mapping each abbreviation (the entry's 'abbr_std', if any,
# plus every item of its 'abbr' scalar-or-arrayref) to an arrayref of the long
# designators that use it.
#
# Each long designator is listed at most once per abbreviation, even when
# 'abbr_std' is repeated in 'abbr', and long designators are visited in sorted
# order so the arrayrefs are the same from run to run.
sub _build_abbr_long_map {
  my ($self) = @_;
  my $data = $self->data;

  my %map;
  my %seen;    # $seen{$abbr}{$long} is true once $long is recorded for $abbr
  for my $long (sort keys %$data) {
    my $entry = $data->{$long};

    # Collect every abbreviation of this entry, standard form first
    my @abbrs;
    push @abbrs, $entry->{abbr_std} if $entry->{abbr_std};
    if (my $abbr_list = $entry->{abbr}) {
      push @abbrs, ref $abbr_list ? @$abbr_list : $abbr_list;
    }

    for my $abbr (@abbrs) {
      next if $seen{$abbr}{$long}++;
      push @{ $map{$abbr} }, $long;
    }
  }

  return \%map;
}
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Returns the sorted list of long designators (the keys of the dataset).
# The result goes through an array because the behaviour of a bare sort() in
# scalar context is undefined; a scalar-context call now yields the number of
# long designators.
sub long_designators {
  my ($self) = @_;
  my @long = sort keys %{ $self->data };
  return @long;
}
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Returns the sorted list of abbreviations (the keys of abbr_long_map).
# The result goes through an array because the behaviour of a bare sort() in
# scalar context is undefined; a scalar-context call now yields the number of
# abbreviations.
sub abbreviations {
  my ($self) = @_;
  my @abbreviations = sort keys %{ $self->abbr_long_map };
  return @abbreviations;
}
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Returns the sorted, combined list of long designators and abbreviations.
# The result goes through an array because the behaviour of a bare sort() in
# scalar context is undefined; a scalar-context call now yields the number of
# designators.
sub designators {
  my ($self) = @_;
  my @all = ( $self->long_designators, $self->abbreviations );
  my @designators = sort @all;
  return @designators;
}
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Look up $long in the dataset and wrap its entry in a
# Business::CompanyDesignator::Record. Croaks if the dataset has no (true)
# entry for $long.
sub record {
  my ($self, $long) = @_;

  my $entry = $self->data->{$long};
  croak "Invalid long designator '$long'" unless $entry;

  return Business::CompanyDesignator::Record->new( long => $long, record => $entry );
}
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# Return the list of B::CD::Records for $designator, which may be either a
# long designator (one record) or an abbreviation (one record per long
# designator listed for it in abbr_long_map). Croaks when $designator is
# false, or is neither a dataset key nor a known abbreviation.
sub records {
  my ($self, $designator) = @_;
  croak "Missing designator" unless $designator;

  # Long designators are looked up first
  return ( $self->record($designator) )
    if exists $self->data->{$designator};

  # Otherwise it has to be an abbreviation
  my $long_set = $self->abbr_long_map->{$designator}
    or croak "Invalid designator '$designator'";

  return map { $self->record($_) } @$long_set;
}
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Add $string to the regex assembler, and record which standard designator
# ($reference_string, defaulting to $string) the generated pattern stands for.
#
#   $assembler        - object whose insert() receives the lexed pattern tokens
#   $lang             - false, or an arrayref of language codes (it is
#                       dereferenced as an array below)
#   $string           - designator text to add
#   $reference_string - standard form to map the pattern back to
#
# Mappings are stored in pattern_string_map and, per language code, in
# pattern_string_map_lang. A value of undef marks a pattern as ambiguous
# (i.e. it was registered for more than one reference string).
#
# If $string contains combining marks (\pM), the method recurses to add a
# variant with those marks removed, mapped to the same $reference_string.
sub _add_to_assembler {
  my ($self, $assembler, $lang, $string, $reference_string) = @_;
  $reference_string ||= $string;
# printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang || []}), $string, $reference_string;

  # FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs:
  # https://rt.cpan.org/Public/Bug/Display.html?id=50228
  # https://rt.cpan.org/Public/Bug/Display.html?id=74449
  # $assembler->add($string)
  # Workaround by lexing and using insert()
  my $optional1 = '\\.?,?\\s*';
  my @pattern = map {
    # Periods are treated as optional literals, with optional trailing commas and/or whitespace
    /\./   ? $optional1 :
    # Embedded spaces can be multiple, and include leading commas
    / /    ? ',?\s+' :
    # Escape other regex metacharacters
    /[()]/ ? "\\$_" : $_
  } split //, $string;
  $assembler->insert(@pattern);

  # Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
  my $pattern_string = join '', @pattern;

  # Special case - optional match characters can cause clashes between
  # distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/
  # We need to handle such cases as ambiguous with extra checks
  my $optional1e = "\Q$optional1\E";
  my $alt_pattern_string1;
  if ($pattern_string =~ /^(\w)(\w)$/) {
    $alt_pattern_string1 = "$1$optional1$2$optional1";
  } elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
    $alt_pattern_string1 = "$1$2";
  }

  my $global_map = $self->pattern_string_map;

  # If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
  # across entries, and we can't unambiguously map back to a standard designator
  if (exists $global_map->{ $pattern_string }) {
    my $current = $global_map->{ $pattern_string };
    if ($current && $current ne $reference_string) {
      # Reset to undef to mark ambiguity
      $global_map->{ $pattern_string } = undef;
    }
  }
  # Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
  elsif ($alt_pattern_string1 && exists $global_map->{ $alt_pattern_string1 }) {
    my $current = $global_map->{ $alt_pattern_string1 };
    if ($current && $current eq $reference_string) {
      # Both spellings stand for the same standard designator, so there is no
      # ambiguity: record this spelling too. (Previously it was left unmapped,
      # so matches reported under this pattern had no standard form.)
      $global_map->{ $pattern_string } = $reference_string;
    }
    else {
      # Different reference strings (or the alternate spelling is already
      # ambiguous): reset both pairs to undef to mark ambiguity
      $global_map->{ $pattern_string } = undef;
      $global_map->{ $alt_pattern_string1 } = undef;
    }
  }
  else {
    $global_map->{ $pattern_string } = $reference_string;
  }

  # Same bookkeeping per language code (without the alternate-spelling check)
  if ($lang) {
    for my $lang_code (@$lang) {
      my $lang_map = $self->pattern_string_map_lang->{$lang_code} ||= {};
      if (exists $lang_map->{ $pattern_string }) {
        my $current = $lang_map->{ $pattern_string };
        if ($current && $current ne $reference_string) {
          # Reset to undef to mark ambiguity
          $lang_map->{ $pattern_string } = undef;
        }
      }
      else {
        $lang_map->{ $pattern_string } = $reference_string;
      }
    }
  }

  # If $string contains unicode diacritics, also add a version without them for misspellings
  if ($string =~ m/\pM/) {
    my $stripped = $string;
    $stripped =~ s/\pM//g;
    $self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
  }

  return;
}
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Assemble the designator regex for $type ('begin' restricts to entries with a
# true 'lead' flag) and optional $lang (a language code, or an arrayref of
# language codes, that an entry's 'lang' value must equal).
#
# Returns nothing if no dataset entry qualifies. Otherwise returns the
# assembled regex, followed in list context by the Regexp::Assemble object.
#
# Language codes are quotemeta-escaped before being interpolated into the
# filter regex. Entries are visited in sorted key order, so the patterns are
# added in the same order on every build.
sub _build_regex {
  my ($self, $type, $lang) = @_;

  # RA constructor - case insensitive, with match tracking
  my $assembler = Regexp::Assemble->new->flags('i')->track(1);

  # Construct language regex if $lang is set
  my $lang_re;
  if ($lang) {
    $lang = [ $lang ] if ! ref $lang;
    my $lang_str = join '|', map { quotemeta($_) } sort @$lang;
    $lang_re = qr/^(?:$lang_str)$/;
  }

  my $data  = $self->data;
  my $count = 0;
  for my $long (sort keys %$data) {
    my $entry = $data->{$long};

    # If $type is begin, restrict to 'lead' entries
    next if $type eq 'begin' && ! $entry->{lead};
    # If $lang is set, restrict to entries that include $lang
    # (an entry without a 'lang' value is compared as the empty string)
    next if $lang_re && ($entry->{lang} // '') !~ $lang_re;

    $count++;
    my $long_nfd = NFD($long);
    $self->_add_to_assembler($assembler, $lang, $long_nfd);

    # Add all abbreviations, mapped back to abbr_std where the entry has one
    if (my $abbr_list = $entry->{abbr}) {
      $abbr_list = [ $abbr_list ] if ! ref $abbr_list;
      for my $abbr (@$abbr_list) {
        my $abbr_nfd = NFD($abbr);
        my $abbr_std = NFD($entry->{abbr_std} || $abbr);
        $self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
      }
    }
  }

  # If no entries found (a strange/bogus language?), return undef
  return if $count == 0;

  return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
}
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
# Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.)
# $type defaults to 'end', $lang defaults to undef (for all)
#
# $lang may be a single language code or an arrayref of codes. Results are
# cached in regex_cache under "$type" or "${type}_${lang_key}".
#
# Returns the regex in scalar context, and ( $regex, $assembler ) in list
# context; both are undef when _build_regex finds nothing to assemble.
#
# The language part of the cache key is always derived from the *contents* of
# an arrayref, including an empty one (key "${type}_"). An empty arrayref used
# to be keyed by its stringified address, which missed the cache and added a
# new cache entry on every call.
sub regex {
  my ($self, $type, $lang) = @_;
  $type ||= 'end';

  my $cache_key = $type;
  if ($lang) {
    # $lang might be an arrayref containing multiple language codes
    my $lang_key = $lang;
    if (ref $lang && ref $lang eq 'ARRAY') {
      $lang_key = @$lang == 1
                ? $lang->[0]
                : join '_', sort map { lc $_ } @$lang;
    }
    # 'defined' rather than truth, so '' and '0' still get a key of their own
    # instead of sharing the all-languages entry
    $cache_key .= "_$lang_key" if defined $lang_key;
  }

  my $cache = $self->regex_cache;
  if (my $entry = $cache->{ $cache_key }) {
    return wantarray ? @$entry : $entry->[0];
  }

  my ($re, $assembler) = $self->_build_regex($type, $lang);
  $cache->{ $cache_key } = [ $re, $assembler ];
  return wantarray ? ( $re, $assembler ) : $re;
}
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# Helper to return split_designator results.
#
#   $lang            - false, a language code, or an arrayref of language codes
#   $before/$des/$after - the pieces of the company name (any may be undef)
#   $matched_pattern - the pattern string that matched, if any; it is mapped
#                      back to the standard designator via
#                      pattern_string_map_lang (per language) and then
#                      pattern_string_map
#
# In list context returns ($before, $des, $after, $des_std), NFC-normalised,
# with '' for undefined items. In scalar context returns a
# Business::CompanyDesignator::SplitResult.
#
# An arrayref $lang is looked up one language code at a time, in the order
# given, and the first true per-language mapping wins. It used to be used
# directly as a hash key, which never matched and autovivified a junk entry
# in pattern_string_map_lang.
sub _split_designator_result {
  my ($self, $lang, $before, $des, $after, $matched_pattern) = @_;

  my $des_std;
  if ($matched_pattern) {
    if ($lang) {
      my $lang_maps = $self->pattern_string_map_lang;
      for my $lang_code (ref $lang eq 'ARRAY' ? @$lang : $lang) {
        my $lang_map = $lang_maps->{$lang_code} or next;
        $des_std = $lang_map->{$matched_pattern} and last;
      }
    }
    $des_std ||= $self->pattern_string_map->{$matched_pattern};
    if ($des_std) {
      # Always coalesce spaces and delete commas from $des_std
      $des_std =~ s/,+/ /g;
      $des_std =~ s/\s\s+/ /g;
    }
  }

  # Legacy interface - return a simple before / des / after tuple, plus $des_std
  return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std)
    if wantarray;

  # New scalar-context interface - return SplitResult object
  return Business::CompanyDesignator::SplitResult->new(
    before          => NFC($before // ''),
    designator      => NFC($des // ''),
    designator_std  => NFC($des_std // ''),
    after           => NFC($after // ''),
    records         => [ $des_std ? $self->records(NFC $des_std) : () ],
  );
}
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Split $company_name on a company designator. The name is matched in NFD
# form against the 'end' regex anchored at the end of the string, then the
# 'begin' regex anchored at the start, then (only if allow_embedded is true,
# which is the default) the 'end' regex anywhere in the string.
#
# The pieces are handed to _split_designator_result, which returns
# ($before, $designator, $after, $normalised_designator) in list context and a
# SplitResult object in scalar context. If nothing matches, $company_name is
# passed through as the 'before' part.
# e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for
# the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator,
# but "Ltée" for the normalised form
sub split_designator {
  my ($self, $company_name, %arg) = @_;
  my $lang = $arg{lang};
  # allow_embedded defaults to true (backwards-compatibility, unfortunately)
  my $allow_embedded = $arg{allow_embedded} // 1;
  my $company_name_match = NFD($company_name);

  # Handle older perls without XPosixPunct
  state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ? '[\s\p{XPosixPunct}]' : '[\s[:punct:]]';

  my ($re, $assembler) = $self->regex('end', $lang);
  my ($lead_re, $lead_assembler) = $self->regex('begin', $lang);

  # Without an 'end' regex nothing is attempted - return $company_name unchanged
  return $self->_split_designator_result($lang, $company_name)
    unless $re;

  # Designators are usually final, so try that first
  if ($company_name_match =~ m/^\s*(.*?)${punct_class}\s*($re)\s*$/) {
    # $^R carries the tracking value of the alternative that matched
    my ($before, $des, $tracked) = ($1, $2, $^R);
    return $self->_split_designator_result($lang, $before, $des, undef, $assembler->source($tracked));
  }

  # Not final - check for a lead designator instead (e.g. RU, NL, etc.)
  if ($lead_re && $company_name_match =~ m/^\s*($lead_re)${punct_class}\s*(.*?)\s*$/) {
    my ($des, $after, $tracked) = ($1, $2, $^R);
    return $self->_split_designator_result($lang, undef, $des, $after, $lead_assembler->source($tracked));
  }

  # Not final - check for an embedded designator with trailing content
  if ($allow_embedded && $company_name_match =~ m/(.*?)${punct_class}\s*($re)(?:\s+(.*?))?$/) {
    my ($before, $des, $after, $tracked) = ($1, $2, $3, $^R);
    return $self->_split_designator_result($lang, $before, $des, $after, $assembler->source($tracked));
  }

  # No match - return $company_name unchanged
  return $self->_split_designator_result($lang, $company_name);
}
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
1; |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
__END__ |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
=encoding utf-8 |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=head1 NAME |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Business::CompanyDesignator - module for matching and stripping/manipulating the |
345
|
|
|
|
|
|
|
company designators appended to company names |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=head1 VERSION |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Version: 0.15. |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
This module is considered a B<BETA> release. Interfaces may change and/or break |
352
|
|
|
|
|
|
|
without notice until the module reaches version 1.0. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head1 SYNOPSIS |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Business::CompanyDesignator is a perl module for matching and stripping/manipulating |
357
|
|
|
|
|
|
|
the typical company designators appended (or sometimes, prepended) to company names. |
358
|
|
|
|
|
|
|
It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and |
359
|
|
|
|
|
|
|
abbreviations (e.g. Corp., Inc., Ltd., GmbH etc). |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
use Business::CompanyDesignator; |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# Constructor |
364
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new; |
365
|
|
|
|
|
|
|
# Optionally, you can provide your own company_designator.yml file, instead of the bundled one |
366
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml'); |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.) |
369
|
|
|
|
|
|
|
@des = $bcd->designators; |
370
|
|
|
|
|
|
|
@long = $bcd->long_designators; |
371
|
|
|
|
|
|
|
@abbrev = $bcd->abbreviations; |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# Lookup individual designator records (returns B::CD::Record objects) |
374
|
|
|
|
|
|
|
# Lookup record by long designator (unique) |
375
|
|
|
|
|
|
|
$record = $bcd->record($long_designator); |
376
|
|
|
|
|
|
|
# Lookup records by abbreviation or long designator (may not be unique) |
377
|
|
|
|
|
|
|
@records = $bcd->records($designator); |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# Get a regex for matching designators by type ('end'/'begin') and lang |
380
|
|
|
|
|
|
|
# By default, returns 'end' regexes for all languages |
381
|
|
|
|
|
|
|
$re = $bcd->regex; |
382
|
|
|
|
|
|
|
$company_name =~ $re and say 'designator found!'; |
383
|
|
|
|
|
|
|
$company_name =~ /$re\s*$/ and say 'final designator found!'; |
384
|
|
|
|
|
|
|
my $re_begin_en = $bcd->regex('begin', 'en'); |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# Split $company_name on designator, returning a ($before, $designator, $after) triplet, |
387
|
|
|
|
|
|
|
# plus the normalised form of the designator matched (can pass to records(), for example) |
388
|
|
|
|
|
|
|
($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name); |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
# Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object |
391
|
|
|
|
|
|
|
$res = $bcd->split_designator($company_name, lang => 'en'); |
392
|
|
|
|
|
|
|
print join ' / ', $res->designator_std, $res->short_name, $res->extra; |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=head1 DATASET |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Business::CompanyDesignator uses the company designator dataset from here: |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
L<https://github.com/ProfoundNetworks/company_designator> |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
which is bundled with the module. You can use your own (updated or custom) |
402
|
|
|
|
|
|
|
version, if you prefer, by passing a 'datafile' parameter to the constructor. |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
The dataset defines multiple long form designators (like "Company", "Limited", |
405
|
|
|
|
|
|
|
or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.', |
406
|
|
|
|
|
|
|
'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry, |
407
|
|
|
|
|
|
|
for instance, looks like this: |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
Company: |
410
|
|
|
|
|
|
|
abbr: |
411
|
|
|
|
|
|
|
- Co. |
412
|
|
|
|
|
|
|
- '& Co.' |
413
|
|
|
|
|
|
|
- and Co. |
414
|
|
|
|
|
|
|
lang: en |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Long designators are unique across the dataset, but abbreviations are not |
417
|
|
|
|
|
|
|
e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée". |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=head1 METHODS |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=head2 new() |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
Creates a Business::CompanyDesignator object. |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new; |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
By default this uses the bundled company_designator dataset. You may |
428
|
|
|
|
|
|
|
provide your own (updated or custom) version by passing via a 'datafile' |
429
|
|
|
|
|
|
|
parameter to the constructor. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml'); |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=head2 designators() |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Returns the full list of company designator strings from the dataset |
436
|
|
|
|
|
|
|
(both long form and abbreviations). |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
@designators = $bcd->designators; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=head2 long_designators() |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Returns the full list of long form designators from the dataset. |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
@long = $bcd->long_designators; |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 abbreviations() |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Returns the full list of abbreviation designators from the dataset. |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
@abbrev = $bcd->abbreviations; |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=head2 record($long_designator) |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Returns the Business::CompanyDesignator::Record object for the given |
455
|
|
|
|
|
|
|
long designator (and dies if not found). |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=head2 records($designator) |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
Returns a list of Business::CompanyDesignator::Record objects for the |
460
|
|
|
|
|
|
|
given abbreviation or long designator (for long designators there will |
461
|
|
|
|
|
|
|
only be a single record returned, but abbreviations may map to multiple |
462
|
|
|
|
|
|
|
records). |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
Use this method for abbreviations, or if you aren't sure of a |
465
|
|
|
|
|
|
|
designator's type. |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=head2 regex([$type], [$lang]) |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Returns a regex for all matching designators for $type ('begin'/'end') and |
470
|
|
|
|
|
|
|
$lang (ISO 639-1 language code e.g. 'en', 'es', 'de', etc.) from the dataset. |
471
|
|
|
|
|
|
|
$lang may be either a single language code scalar, or an arrayref of language |
472
|
|
|
|
|
|
|
codes, for multiple alternative languages. The returned regex is case-insensitive |
473
|
|
|
|
|
|
|
and non-anchored. |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
$type defaults to 'end', so without parameters regex() returns a regex |
476
|
|
|
|
|
|
|
matching all designators for all languages. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool]) |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
Attempts to split $company_name on (the first) company designator found. |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
In array context split_designator returns a list of four items - a triplet of |
483
|
|
|
|
|
|
|
strings from $company_name ( $before, $designator, $after ), plus the |
484
|
|
|
|
|
|
|
standardised version of the designator as a fourth element. |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name); |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult> |
489
|
|
|
|
|
|
|
object. |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
$res = $bcd->split_designator($company_name, lang => $lang); |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
The $des designator in array context, and the SplitResult $res->designator |
494
|
|
|
|
|
|
|
is the designator text as it matched in $company_name, while the array context |
495
|
|
|
|
|
|
|
$des_std, and the SplitResult $res->designator_std is the standardised version |
496
|
|
|
|
|
|
|
as found in the dataset. |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but |
499
|
|
|
|
|
|
|
"Pty. Ltd." as the standardised form, and the latter would be what you |
500
|
|
|
|
|
|
|
would find in designators() or would lookup with records(). Similarly, |
501
|
|
|
|
|
|
|
"Accessoires XYZ Ltee" (without the french acute) would match, returning |
502
|
|
|
|
|
|
|
"Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the |
503
|
|
|
|
|
|
|
standardised form. |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
split_designator accepts the following optional (named) parameters: |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=over 4 |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=item lang => $lang |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
$lang can be a scalar ISO 639-1 language code ('en', 'fr', 'zh', etc.), or an |
512
|
|
|
|
|
|
|
arrayref containing multiple language codes. If $lang is defined, split_designator |
513
|
|
|
|
|
|
|
will only match designators for the specified set of languages, which can improve |
514
|
|
|
|
|
|
|
the accuracy of the split by reducing false positive matches. |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=item allow_embedded => $boolean |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
allow_embedded is a boolean indicating whether or not designators can occur in |
519
|
|
|
|
|
|
|
the middle of strings, instead of only at the beginning or end. Defaults to true, |
520
|
|
|
|
|
|
|
for backwards compatibility, which yields more matches, but also more false |
521
|
|
|
|
|
|
|
positives. Setting to false is safer, but yields fewer matches (and embedded |
522
|
|
|
|
|
|
|
designators do occur surprisingly often in the wild.) |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
For more discussion, see L</AMBIGUITIES> below. |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=back |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=head2 AMBIGUITIES |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
Note that split_designator does not always get the split right. It checks for |
531
|
|
|
|
|
|
|
final designators first, then leading ones, and then finally looks for embedded |
532
|
|
|
|
|
|
|
designators (if allow_embedded is set to true). |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
Leading and trailing designators are usually reasonably accurate, but embedded |
535
|
|
|
|
|
|
|
designators are problematic. For instance, embedded designators allow names like |
536
|
|
|
|
|
|
|
these to split correctly: |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
Amerihealth Insurance Company of NJ |
539
|
|
|
|
|
|
|
Trenkwalder Personal AG Schweiz |
540
|
|
|
|
|
|
|
Vicente Campano S L (COMERCIAL VICAM) |
541
|
|
|
|
|
|
|
Gvozdika, gostinitsa OOO ""Eko-Treyd"" |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
but it will also wrongly split names like the following: |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation') |
546
|
|
|
|
|
|
|
Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada') |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
If you do want to allow splitting on embedded designators, you might want to pass |
549
|
|
|
|
|
|
|
a 'lang' parameter to split_designator if you know the language(s) used for your |
550
|
|
|
|
|
|
|
company names, as this will reduce the number of false positives by restricting the |
551
|
|
|
|
|
|
|
set of designators matched against. It won't eliminate the issue altogether though, |
552
|
|
|
|
|
|
|
so some post-processing might be required. (And I'd love to hear of ideas on how |
553
|
|
|
|
|
|
|
to improve this.) |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=head1 SEE ALSO |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
Finance::CompanyNames |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=head1 AUTHOR |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
Gavin Carr <gavin@profound.net> |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENCE |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Copyright (C) 2013-2016 Gavin Carr |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify it |
568
|
|
|
|
|
|
|
under the same terms as Perl itself. |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=cut |