File Coverage

blib/lib/Business/CompanyDesignator.pm

Criterion	Covered	Total	%
statement	192	195	98.4
branch	92	102	90.2
condition	59	70	84.2
subroutine	26	26	100.0
pod	7	7	100.0
total	376	400	94.0

line	stmt	bran	cond	sub	pod	time	code
1							package Business::CompanyDesignator;
2
3							# Require perl 5.010 because the 'track' functionality of Regexp::Assemble
4							# is unsafe for earlier versions.
5	9			9		657930	use 5.010001;
	9					111
6	9			9		4651	use Moose;
	9					3675611
	9					52
7	9			9		58348	use utf8;
	9					55
	9					57
8	9			9		284	use warnings qw(FATAL utf8);
	9					19
	9					377
9	9			9		2531	use FindBin qw($Bin);
	9					4935
	9					1064
10	9			9		2054	use YAML;
	9					31124
	9					478
11	9			9		5112	use File::ShareDir qw(dist_file);
	9					167941
	9					570
12	9			9		67	use List::MoreUtils qw(uniq);
	9					23
	9					37
13	9			9		11508	use Regexp::Assemble;
	9					157207
	9					316
14	9			9		5066	use Unicode::Normalize;
	9					17381
	9					632
15	9			9		69	use Carp;
	9					18
	9					513
16
17	9			9		4207	use Business::CompanyDesignator::Record;
	9					37
	9					449
18	9			9		4899	use Business::CompanyDesignator::SplitResult;
	9					28
	9					9867
19
20							our $VERSION = '0.16';
21
22							# Hardcode the set of languages that we treat as 'continuous' i.e. their
23							# designators don't require a word break before/after.
24							our %LANG_CONTINUA = map { $_ => 1 } qw(
25							zh
26							ja
27							ko
28							);
29
30							has 'datafile' => ( is => 'ro', default => sub {
31							# Development/test version
32							my $local_datafile = "$Bin/../share/company_designator_dev.yml";
33							return $local_datafile if -f $local_datafile;
34							$local_datafile = "$Bin/../share/company_designator.yml";
35							return $local_datafile if -f $local_datafile;
36							# Installed version
37							return dist_file('Business-CompanyDesignator', 'company_designator.yml');
38							});
39
40							# data is the raw dataset as loaded from datafile, keyed by long designator
41							has data => ( is => 'ro', lazy_build => 1 );
42
43							# regex_cache is a cache of regexes by language and type, since they're expensive to build
44							has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
45
46							# abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref of
47							# long designators (since abbreviations are not necessarily unique)
48							has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 );
49
50							# pattern_string_map is a hash mapping patterns back to their source string,
51							# since we do things like add additional patterns without diacritics
52							has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
53							# pattern_string_map_lang is a hash of hashes, mapping language codes to hashes
54							# of patterns back to their source string
55							has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
56
57							sub _build_data {
58	8			8		57	my $self = shift;
59	8					256	YAML::LoadFile($self->datafile);
60							}
61
62							sub _build_abbr_long_map {
63	5			5		31	my $self = shift;
64	5					14	my $map = {};
65	5					15	while (my ($long, $entry) = each %{ $self->data }) {
	895					22117
66	890	100				1718	if (my $abbr = $entry->{abbr_std}) {
67	25		100			114	$map->{$abbr} \|\|= [];
68	25					42	push @{ $map->{$abbr} }, $long;
	25					70
69							}
70	890	100				1701	my $abbr_list = $entry->{abbr} or next;
71	860	100				1592	$abbr_list = [ $abbr_list ] if ! ref $abbr_list;
72	860					1296	for my $abbr (@$abbr_list) {
73	1365		100			5791	$map->{$abbr} \|\|= [];
74	1365					1687	push @{ $map->{$abbr} }, $long;
	1365					3495
75							}
76							}
77	5					149	return $map;
78							}
79
80							sub long_designators {
81	4			4	1	1178	my $self = shift;
82	4					9	sort keys %{ $self->data };
	4					132
83							}
84
85							sub abbreviations {
86	3			3	1	1055	my $self = shift;
87	3					7	sort keys %{ $self->abbr_long_map };
	3					107
88							}
89
90							sub designators {
91	1			1	1	3	my $self = shift;
92	1					5	sort $self->long_designators, $self->abbreviations;
93							}
94
95							# Return the B::CD::Record for $long designator
96							sub record {
97	1590			1590	1	46348	my ($self, $long) = @_;
98	1590	100				40880	my $entry = $self->data->{$long}
99							or croak "Invalid long designator '$long'";
100	1589					46147	return Business::CompanyDesignator::Record->new( long => $long, record => $entry );
101							}
102
103							# Return a list of B::CD::Records for $designator
104							sub records {
105	1243			1243	1	486211	my ($self, $designator) = @_;
106	1243	50				2878	croak "Missing designator" if ! $designator;
107	1243	100				38104	if (exists $self->data->{$designator}) {
		100
108	366					934	return ( $self->record($designator) );
109							}
110							elsif (my $long_set = $self->abbr_long_map->{$designator}) {
111	876					2168	return map { $self->record($_) } @$long_set
	1045					2433
112							}
113							else {
114	1					21	croak "Invalid designator '$designator'";
115							}
116							}
117
118							# Add $string to regex assembler
119							sub _add_to_assembler {
120	1896			1896		3362	my ($self, $assembler, $lang, $string, $reference_string) = @_;
121	1896		66			4092	$reference_string \|\|= $string;
122							# printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang \|\| []}), $string, $reference_string;
123
124							# FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs:
125							# https://rt.cpan.org/Public/Bug/Display.html?id=50228
126							# https://rt.cpan.org/Public/Bug/Display.html?id=74449
127							# $assembler->add($string)
128							# Workaround by lexing and using insert()
129	1896					2361	my $optional1 = '\\.?,?\\s*';
130							my @pattern = map {
131							# Periods are treated as optional literals, with optional trailing commas and/or whitespace
132	1896	100				6023	/\./ ? $optional1 :
	26451	100				62650
		100
133							# Embedded spaces can be multiple, and include leading commas
134							/ / ? ',?\s+' :
135							# Escape other regex metacharacters
136							/[()]/ ? "\\$_" : $_
137							} split //, $string;
138	1896					6768	$assembler->insert(@pattern);
139
140							# Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
141	1896					205850	my $pattern_string = join '', @pattern;
142
143							# Special case - optional match characters can cause clashes between
144							# distinct pattern_strings e.g. /A\.?,?\sS\.?,?\s/ clashes with /AS/
145							# We need to handle such cases as ambiguous with extra checks
146	1896					3108	my $optional1e = "\Q$optional1\E";
147	1896					2345	my $alt_pattern_string1;
148	1896	100				9834	if ($pattern_string =~ /^(\w)(\w)$/) {
		100
149	84					283	$alt_pattern_string1 = "$1$optional1$2$optional1";
150							} elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
151	107					314	$alt_pattern_string1 = "$1$2";
152							}
153
154							# If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
155							# across entries, and we can't unambiguously map back to a standard designator
156	1896	100	66			56001	if (exists $self->pattern_string_map->{ $pattern_string }) {
		100
157	429					11785	my $current = $self->pattern_string_map->{ $pattern_string };
158	429	100	100			1468	if ($current && $current ne $reference_string) {
159							# Reset to undef to mark ambiguity
160	3					84	$self->pattern_string_map->{ $pattern_string } = undef;
161							}
162							}
163							# Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
164							elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) {
165	7					178	my $current = $self->pattern_string_map->{ $alt_pattern_string1 };
166	7	50	33			55	if ($current && $current ne $reference_string) {
167							# Reset both pairs to undef to mark ambiguity
168	7					203	$self->pattern_string_map->{ $pattern_string } = undef;
169	7					198	$self->pattern_string_map->{ $alt_pattern_string1 } = undef;
170							}
171							}
172							else {
173	1460					36752	$self->pattern_string_map->{ $pattern_string } = $reference_string;
174							}
175	1896	100				3517	if ($lang) {
176	638					1029	for my $l (@$lang) {
177	766	100				20150	if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) {
178	202					5331	my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string };
179	202	50	33			705	if ($current && $current ne $reference_string) {
180							# Reset to undef to mark ambiguity
181	0					0	$self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef;
182							}
183							}
184							else {
185	564					14773	$self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string;
186							}
187							}
188							}
189
190							# If $string contains unicode diacritics, also add a version without them for misspellings
191	9	100		9		73	if ($string =~ m/\pM/) {
	9					19
	9					135
	1896					8060
192	217					381	my $stripped = $string;
193	217					1170	$stripped =~ s/\pM//g;
194	217					724	$self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
195							}
196							}
197
198							# Assemble designator regexes
199							sub _build_regex {
200	42			42		80	my $self = shift;
201	42					98	my ($type, $lang) = @_;
202
203	42					99	state $types = { map { $_ => 1 } qw(end end_cont begin) };
	12					56
204	42	50				147	if (! $types->{$type}) {
205	0					0	croak "invalid regex type '$type'";
206							}
207
208							# RA constructor - case insensitive, with match tracking
209	42					258	my $assembler = Regexp::Assemble->new->flags('i')->track(1);
210
211							# Construct language regex if $lang is set
212	42					3153	my $lang_re;
213	42	100				115	if ($lang) {
214	36	100				150	$lang = [ $lang ] if ! ref $lang;
215	36					154	my $lang_str = join '\|', sort @$lang;
216	36					446	$lang_re = qr/^($lang_str)$/;
217							}
218
219	42					88	my $count = 0;
220	42					70	while (my ($long, $entry) = each %{ $self->data }) {
	7518					176560
221							# If $lang is set, restrict to entries that include $lang
222	7476	100	100			29452	next if $lang_re && $entry->{lang} !~ $lang_re;
223							# If $type is 'begin', restrict to 'lead' entries
224	1448	100	100			3479	next if $type eq 'begin' && ! $entry->{lead};
225							# if $type is 'end_cont', restrict to languages in %LANG_CONTINUA
226	964	100	100			2537	next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}};
227
228	648					810	$count++;
229	648					3393	my $long_nfd = NFD($long);
230	648					1690	$self->_add_to_assembler($assembler, $lang, $long_nfd);
231
232							# Add all abbreviations
233	648	100				2215	if (my $abbr_list = $entry->{abbr}) {
234	616	100				1444	$abbr_list = [ $abbr_list ] if ! ref $abbr_list;
235	616					1104	for my $abbr (@$abbr_list) {
236	1031					3723	my $abbr_nfd = NFD($abbr);
237	1031		66			4336	my $abbr_std = NFD($entry->{abbr_std} \|\| $abbr);
238	1031					2207	$self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
239							}
240							}
241							}
242
243							# If no entries found (a strange/bogus language?), return undef
244	42	100				266	return if $count == 0;
245
246	30	50				190	return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
247							}
248
249							# Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.)
250							# $type defaults to 'end', $lang defaults to undef (for all)
251							sub regex {
252	2753			2753	1	3745	my $self = shift;
253	2753					4785	my ($type, $lang) = @_;
254	2753		50			5042	$type \|\|= 'end';
255
256							# $lang might be an arrayref containing multiple language codes
257	2753					3313	my $lang_key;
258	2753	100				4821	if ($lang) {
259	968					1288	$lang_key = $lang;
260	968	50	66			2011	if (ref $lang && ref $lang eq 'ARRAY' && @$lang) {
			66
261	8	50				18	if (@$lang == 1) {
262	0					0	$lang_key = $lang->[0];
263							}
264							else {
265	8					21	$lang_key = join '_', sort map { lc $_ } @$lang;
	16					64
266							}
267							}
268							}
269
270	2753					3625	my $cache_key = $type;
271	2753	100				4934	$cache_key .= "_$lang_key" if $lang_key;
272
273	2753	100				73585	if (my $entry = $self->regex_cache->{ $cache_key }) {
274	2711	50				11172	return wantarray ? @$entry : $entry->[0];
275							}
276
277	42					154	my ($re, $assembler) = $self->_build_regex($type, $lang);
278	42					218295	$self->regex_cache->{ $cache_key } = [ $re, $assembler ];
279	42	50				261	return wantarray ? ( $re, $assembler ) : $re;
280							}
281
282							# Helper to return split_designator results
283							sub _split_designator_result {
284	1079			1079		9895	my $self = shift;
285	1079					4167	my ($lang, $before, $des, $after, $matched_pattern) = @_;
286
287							# $before can end in whitespace (that we don't want to consume in the RE
288							# for technical reasons around handling punctuation like '& Co' in designators)
289							# So trim here to handle that case.
290	1079	100				4967	$before =~ s/\s+$// if $before;
291
292	1079					1803	my $des_std;
293	1079	100				2132	if ($matched_pattern) {
294	819	100				8696	$des_std = $self->pattern_string_map_lang->{$lang}->{$matched_pattern} if $lang;
295	819		100			20126	$des_std \|\|= $self->pattern_string_map->{$matched_pattern};
296	819	100				1738	if ($des_std) {
297							# Always coalesce spaces and delete commas from $des_std
298	799					1629	$des_std =~ s/,+/ /g;
299	799					2071	$des_std =~ s/\s\s+/ /g;
300							}
301							}
302
303							# Legacy interface - return a simple before / des / after tuple, plus $des_std
304	1079	100	66			2698	return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std)
	1452	100				11041
305							if wantarray;
306
307							# New scalar-context interface - return SplitResult object
308	716	100	100			18662	Business::CompanyDesignator::SplitResult->new(
			100
			100
			100
309							before => NFC($before // ''),
310							designator => NFC($des // ''),
311							designator_std => NFC($des_std // ''),
312							after => NFC($after // ''),
313							records => [ $des_std ? $self->records(NFC $des_std) : () ],
314							);
315							}
316
317							# Split $company_name on (the first) company designator, returning a triplet of strings:
318							# ($before, $designator, $after), plus the normalised form of the designator. If no
319							# designator is found, just returns ($company_name).
320							# e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for
321							# the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator,
322							# but "Ltée" for the normalised form
323							sub split_designator {
324	1079			1079	1	238445	my $self = shift;
325	1079					3203	my ($company_name, %arg) = @_;
326	1079					2191	my $lang = $arg{lang};
327	1079					1886	my $allow_embedded = $arg{allow_embedded};
328	1079		100			4394	$allow_embedded //= 1; # backwards-compatibility, unfortunately
329	1079					6859	my $company_name_match = NFD($company_name);
330
331							# Handle older perls without XPosixPunct
332	1079	50				2012	state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ?
	4					36
333							'[\s\p{XPosixPunct}]' :
334							'[\s[:punct:]]';
335
336							# Strip all brackets for continuous language matching
337	1079					4429	(my $company_name_match_cont_stripped = $company_name_match) =~ s/[()\x{ff08}\x{ff09}]//g;
338
339	1079					2008	my ($end_re, $end_asr, $end_cont_re, $end_cont_asr, $begin_re, $begin_asr);
340	1079	100				2303	if ($lang) {
341	484	100				1120	if ($LANG_CONTINUA{$lang}) {
342	36					81	($end_cont_re, $end_cont_asr) = $self->regex('end_cont', $lang);
343							} else {
344	448					1251	($end_re, $end_asr) = $self->regex('end', $lang);
345							}
346	484					1151	($begin_re, $begin_asr) = $self->regex('begin', $lang);
347							} else {
348	595					1916	($end_re, $end_asr) = $self->regex('end');
349	595					1524	($end_cont_re, $end_cont_asr) = $self->regex('end_cont');
350	595					1381	($begin_re, $begin_asr) = $self->regex('begin');
351							}
352
353							# Designators are usually final, so try $end_re first
354	1079	100	100			287582	if ($end_re &&
355							$company_name_match =~ m/^\s(.?)${punct_class}\s$?($end_re)$?\s$/) {
356	572					1018642	return $self->_split_designator_result($lang, $1, $2, undef, $end_asr->source($^R));
357							}
358
359							# No final designator - retry without a word break for the subset of languages
360							# that use continuous scripts (see %LANG_CONTINUA above)
361	507	100	100			506032	if ($end_cont_re &&
362							$company_name_match_cont_stripped =~ m/^\s(.?)$?($end_cont_re)$?\s*$/) {
363	114					1501	return $self->_split_designator_result($lang, $1, $2, undef, $end_cont_asr->source($^R));
364							}
365
366							# No final designator - check for a lead designator instead (e.g. RU, NL, etc.)
367	393	100	100			31529	if ($begin_re &&
368							$company_name_match =~ m/^\s$?($begin_re)$?${punct_class}\s(.?)\s$/) {
369	91					29011	return $self->_split_designator_result($lang, undef, $1, $2, $begin_asr->source($^R));
370							}
371
372							# No final or initial - check for an embedded designator with trailing content
373	302	100	100			33225	if ($end_re && $allow_embedded &&
			100
374							$company_name_match =~ m/(.?)${punct_class}\s$?($end_re)$?(?:\s+(.*?))?$/) {
375	42					70358	return $self->_split_designator_result($lang, $1, $2, $3, $end_asr->source($^R));
376							}
377
378							# No match - return $company_name unchanged
379	260					15879	return $self->_split_designator_result($lang, $company_name);
380							}
381
382							1;
383
384							__END__
385
386							=encoding utf-8
387
388							=head1 NAME
389
390							Business::CompanyDesignator - module for matching and stripping/manipulating the
391							company designators appended to company names
392
393							=head1 VERSION
394
395							Version: 0.16.
396
397							This module is considered a B<BETA> release. Interfaces may change and/or break
398							without notice until the module reaches version 1.0.
399
400							=head1 SYNOPSIS
401
402							Business::CompanyDesignator is a perl module for matching and stripping/manipulating
403							the typical company designators appended (or sometimes, prepended) to company names.
404							It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and
405							abbreviations (e.g. Corp., Inc., Ltd., GmbH etc).
406
407							use Business::CompanyDesignator;
408
409							# Constructor
410							$bcd = Business::CompanyDesignator->new;
411							# Optionally, you can provide your own company_designator.yml file, instead of the bundled one
412							$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
413
414							# Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.)
415							@des = $bcd->designators;
416							@long = $bcd->long_designators;
417							@abbrev = $bcd->abbreviations;
418
419							# Lookup individual designator records (returns B::CD::Record objects)
420							# Lookup record by long designator (unique)
421							$record = $bcd->record($long_designator);
422							# Lookup records by abbreviation or long designator (may not be unique)
423							@records = $bcd->records($designator);
424
425							# Get a regex for matching designators by type ('end'/'begin') and lang
426							# By default, returns 'end' regexes for all languages
427							$re = $bcd->regex;
428							$company_name =~ $re and say 'designator found!';
429							$company_name =~ /$re\s*$/ and say 'final designator found!';
430							my $re_begin_en = $bcd->regex('begin', 'en');
431
432							# Split $company_name on designator, returning a ($before, $designator, $after) triplet,
433							# plus the normalised form of the designator matched (can pass to records(), for example)
434							($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name);
435
436							# Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object
437							$res = $bcd->split_designator($company_name, lang => 'en');
438							print join ' / ', $res->designator_std, $res->short_name, $res->extra;
439
440
441							=head1 DATASET
442
443							Business::CompanyDesignator uses the company designator dataset from here:
444
445							L<https://github.com/ProfoundNetworks/company_designator>
446
447							which is bundled with the module. You can use your own (updated or custom)
448							version, if you prefer, by passing a 'datafile' parameter to the constructor.
449
450							The dataset defines multiple long form designators (like "Company", "Limited",
451							or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.',
452							'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry,
453							for instance, looks like this:
454
455							Company:
456							abbr:
457							- Co.
458							- '& Co.'
459							- and Co.
460							- and Company
461							lang: en
462
463							Long designators are unique across the dataset, but abbreviations are not
464							e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée".
465
466							=head1 METHODS
467
468							=head2 new()
469
470							Creates a Business::CompanyDesignator object.
471
472							$bcd = Business::CompanyDesignator->new;
473
474							By default this uses the bundled company_designator dataset. You may
475							provide your own (updated or custom) version by passing via a 'datafile'
476							parameter to the constructor.
477
478							$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
479
480							=head2 designators()
481
482							Returns the full list of company designator strings from the dataset
483							(both long form and abbreviations).
484
485							@designators = $bcd->designators;
486
487							=head2 long_designators()
488
489							Returns the full list of long form designators from the dataset.
490
491							@long = $bcd->long_designators;
492
493							=head2 abbreviations()
494
495							Returns the full list of abbreviation designators from the dataset.
496
497							@abbrev = $bcd->abbreviations;
498
499							=head2 record($long_designator)
500
501							Returns the Business::CompanyDesignator::Record object for the given
502							long designator (and dies if not found).
503
504							=head2 records($designator)
505
506							Returns a list of Business::CompanyDesignator::Record objects for the
507							given abbreviation or long designator (for long designators there will
508							only be a single record returned, but abbreviations may map to multiple
509							records).
510
511							Use this method for abbreviations, or if you're aren't sure of a
512							designator's type.
513
514							=head2 regex([$type], [$lang])
515
516							Returns a regex for all matching designators for $type ('begin'/'end') and
517							$lang (iso 639-1 language code e.g. 'en', 'es', de', etc.) from the dataset.
518							$lang may be either a single language code scalar, or an arrayref of language
519							codes, for multiple alternative languages. The returned regex is case-insensitive
520							and non-anchored.
521
522							$type defaults to 'end', so without parameters regex() returns a regex
523							matching all designators for all languages.
524
525							=head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool])
526
527							Attempts to split $company_name on (the first) company designator found.
528
529							In array context split_designator returns a list of four items - a triplet of
530							strings from $company_name ( $before, $designator, $after ), plus the
531							standardised version of the designator as a fourth element.
532
533							($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name);
534
535							In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult>
536							object.
537
538							$res = $bcd->split_designator($company_name, lang => $lang);
539
540							The $des designator in array context, and the SplitResult $res->designator
541							is the designator text as it matched in $company_name, while the array context
542							$des_std, and the SplitResult $res->designator_std is the standardised version
543							as found in the dataset.
544
545							For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but
546							"Pty. Ltd." as the stardardised form, and the latter would be what you
547							would find in designators() or would lookup with records(). Similarly,
548							"Accessoires XYZ Ltee" (without the french acute) would match, returning
549							"Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the
550							standardised form.
551
552							split_designator accepts the following optional (named) parameters:
553
554							=over 4
555
556							=item lang => $lang
557
558							$lang can be a scalar ISO 639-1 language code ('en', 'fr', 'cn', etc.), or an
559							arrayref containing multiple language codes. If $lang is defined, split_designator
560							will only match designators for the specified set of languages, which can improve
561							the accuracy of the split by reducing false positive matches.
562
563							=item allow_embedded => $boolean
564
565							allow_embedded is a boolean indicating whether or not designators can occur in
566							the middle of strings, instead of only at the beginning or end. Defaults to true,
567							for backwards compatibility, which yields more matches, but also more false
568							positives. Setting to false is safer, but yields fewer matches (and embedded
569							designators do occur surprisingly often in the wild.)
570
571							For more discussion, see L<AMBIGUITIES> below.
572
573							=back
574
575							=head2 AMBIGUITIES
576
577							Note that split_designator does not always get the split right. It checks for
578							final designators first, then leading ones, and then finally looks for embedded
579							designators (if allow_embedded is set to true).
580
581							Leading and trailing designators are usually reasonably accurate, but embedded
582							designators are problematic. For instance, embedded designators allow names like
583							these to split correctly:
584
585							Amerihealth Insurance Company of NJ
586							Trenkwalder Personal AG Schweiz
587							Vicente Campano S L (COMERCIAL VICAM)
588							Gvozdika, gostinitsa OOO ""Eko-Treyd""
589
590							but it will also wrongly split names like the following:
591
592							XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation')
593							Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada')
594
595							If you do want to allow splitting on embedded designators, you might want to pass
596							a 'lang' parameter to split_designator if you know the language(s) used for your
597							company names, as this will reduce the number of false positives by restricting the
598							set of designators matched against. It won't eliminate the issue altogether though,
599							so some post-processing might be required. (And I'd love to hear of ideas on how
600							to improve this.)
601
602							=head1 SEE ALSO
603
604							Finance::CompanyNames
605
606							=head1 AUTHOR
607
608							Gavin Carr <gavin@profound.net>
609
610							=head1 COPYRIGHT AND LICENCE
611
612							Copyright (C) 2013-2021 Gavin Carr
613
614							This library is free software; you can redistribute it and/or modify it
615							under the same terms as Perl itself.
616
617							=cut