File Coverage

blib/lib/Business/CompanyDesignator.pm

Criterion	Covered	Total	%
statement	193	196	98.4
branch	94	104	90.3
condition	62	73	84.9
subroutine	26	26	100.0
pod	7	7	100.0
total	382	406	94.0

line	stmt	bran	cond	sub	pod	time	code
1							package Business::CompanyDesignator;
2
3							# Require perl 5.010 because the 'track' functionality of Regexp::Assemble
4							# is unsafe for earlier versions.
5	9			9		660497	use 5.010001;
	9					106
6	9			9		4566	use Moose;
	9					3704066
	9					70
7	9			9		61055	use utf8;
	9					60
	9					57
8	9			9		314	use warnings qw(FATAL utf8);
	9					16
	9					413
9	9			9		2618	use FindBin qw($Bin);
	9					5346
	9					1148
10	9			9		2319	use YAML;
	9					32361
	9					505
11	9			9		5562	use File::ShareDir qw(dist_file);
	9					174307
	9					567
12	9			9		68	use List::MoreUtils qw(uniq);
	9					27
	9					42
13	9			9		11876	use Regexp::Assemble;
	9					158772
	9					342
14	9			9		5041	use Unicode::Normalize;
	9					17531
	9					664
15	9			9		70	use Carp;
	9					18
	9					434
16
17	9			9		4151	use Business::CompanyDesignator::Record;
	9					39
	9					543
18	9			9		5097	use Business::CompanyDesignator::SplitResult;
	9					29
	9					10042
19
20							our $VERSION = '0.17';
21
22							# Hardcode the set of languages that we treat as 'continuous'
23							# i.e. their non-ascii designators don't require a word break
24							# before/after.
25							our %LANG_CONTINUA = map { $_ => 1 } qw(
26							zh
27							ja
28							ko
29							);
30
31							has 'datafile' => ( is => 'ro', default => sub {
32							# Development/test version
33							my $local_datafile = "$Bin/../share/company_designator_dev.yml";
34							return $local_datafile if -f $local_datafile;
35							$local_datafile = "$Bin/../share/company_designator.yml";
36							return $local_datafile if -f $local_datafile;
37							# Installed version
38							return dist_file('Business-CompanyDesignator', 'company_designator.yml');
39							});
40
41							# data is the raw dataset as loaded from datafile, keyed by long designator
42							has data => ( is => 'ro', lazy_build => 1 );
43
44							# regex_cache is a cache of regexes by language and type, since they're expensive to build
45							has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
46
47							# abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref of
48							# long designators (since abbreviations are not necessarily unique)
49							has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 );
50
51							# pattern_string_map is a hash mapping patterns back to their source string,
52							# since we do things like add additional patterns without diacritics
53							has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
54							# pattern_string_map_lang is a hash of hashes, mapping language codes to hashes
55							# of patterns back to their source string
56							has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
57
58							sub _build_data {
59	8			8		53	my $self = shift;
60	8					256	YAML::LoadFile($self->datafile);
61							}
62
63							sub _build_abbr_long_map {
64	5			5		28	my $self = shift;
65	5					16	my $map = {};
66	5					13	while (my ($long, $entry) = each %{ $self->data }) {
	935					22972
67	930	100				1731	if (my $abbr = $entry->{abbr_std}) {
68	25		100			112	$map->{$abbr} \|\|= [];
69	25					39	push @{ $map->{$abbr} }, $long;
	25					75
70							}
71	930	100				1735	my $abbr_list = $entry->{abbr} or next;
72	900	100				1728	$abbr_list = [ $abbr_list ] if ! ref $abbr_list;
73	900					1386	for my $abbr (@$abbr_list) {
74	1440		100			6043	$map->{$abbr} \|\|= [];
75	1440					1695	push @{ $map->{$abbr} }, $long;
	1440					3596
76							}
77							}
78	5					138	return $map;
79							}
80
81							sub long_designators {
82	4			4	1	1197	my $self = shift;
83	4					9	sort keys %{ $self->data };
	4					141
84							}
85
86							sub abbreviations {
87	3			3	1	976	my $self = shift;
88	3					6	sort keys %{ $self->abbr_long_map };
	3					114
89							}
90
91							sub designators {
92	1			1	1	2	my $self = shift;
93	1					5	sort $self->long_designators, $self->abbreviations;
94							}
95
96							# Return the B::CD::Record for $long designator
97							sub record {
98	1782			1782	1	48890	my ($self, $long) = @_;
99	1782	100				45238	my $entry = $self->data->{$long}
100							or croak "Invalid long designator '$long'";
101	1781					51476	return Business::CompanyDesignator::Record->new( long => $long, record => $entry );
102							}
103
104							# Return a list of B::CD::Records for $designator
105							sub records {
106	1427			1427	1	562913	my ($self, $designator) = @_;
107	1427	50				3383	croak "Missing designator" if ! $designator;
108	1427	100				43146	if (exists $self->data->{$designator}) {
		100
109	388					1127	return ( $self->record($designator) );
110							}
111							elsif (my $long_set = $self->abbr_long_map->{$designator}) {
112	1038					2413	return map { $self->record($_) } @$long_set
	1207					2818
113							}
114							else {
115	1					19	croak "Invalid designator '$designator'";
116							}
117							}
118
119							# Add $string to regex assembler
120							sub _add_to_assembler {
121	1973			1973		3716	my ($self, $assembler, $lang, $string, $reference_string) = @_;
122	1973		66			4481	$reference_string \|\|= $string;
123							# printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang \|\| []}), $string, $reference_string;
124
125							# FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs:
126							# https://rt.cpan.org/Public/Bug/Display.html?id=50228
127							# https://rt.cpan.org/Public/Bug/Display.html?id=74449
128							# $assembler->add($string)
129							# Workaround by lexing and using insert()
130	1973					2572	my $optional1 = '\\.?,?\\s*';
131							my @pattern = map {
132							# Periods are treated as optional literals, with optional trailing commas and/or whitespace
133	1973	100				6542	/\./ ? $optional1 :
	27614	100				66046
		100
134							# Embedded spaces can be multiple, and include leading commas
135							/ / ? ',?\s+' :
136							# Escape other regex metacharacters
137							/[()]/ ? "\\$_" : $_
138							} split //, $string;
139	1973					7194	$assembler->insert(@pattern);
140
141							# Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
142	1973					218229	my $pattern_string = join '', @pattern;
143
144							# Special case - optional match characters can cause clashes between
145							# distinct pattern_strings e.g. /A\.?,?\sS\.?,?\s/ clashes with /AS/
146							# We need to handle such cases as ambiguous with extra checks
147	1973					3110	my $optional1e = "\Q$optional1\E";
148	1973					2362	my $alt_pattern_string1;
149	1973	100				10447	if ($pattern_string =~ /^(\w)(\w)$/) {
		100
150	96					366	$alt_pattern_string1 = "$1$optional1$2$optional1";
151							} elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
152	109					335	$alt_pattern_string1 = "$1$2";
153							}
154
155							# If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
156							# across entries, and we can't unambiguously map back to a standard designator
157	1973	100	66			58918	if (exists $self->pattern_string_map->{ $pattern_string }) {
		100
158	432					10812	my $current = $self->pattern_string_map->{ $pattern_string };
159	432	100	100			1518	if ($current && $current ne $reference_string) {
160							# Reset to undef to mark ambiguity
161	3					84	$self->pattern_string_map->{ $pattern_string } = undef;
162							}
163							}
164							# Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
165							elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) {
166	7					209	my $current = $self->pattern_string_map->{ $alt_pattern_string1 };
167	7	50	33			42	if ($current && $current ne $reference_string) {
168							# Reset both pairs to undef to mark ambiguity
169	7					195	$self->pattern_string_map->{ $pattern_string } = undef;
170	7					181	$self->pattern_string_map->{ $alt_pattern_string1 } = undef;
171							}
172							}
173							else {
174	1534					38994	$self->pattern_string_map->{ $pattern_string } = $reference_string;
175							}
176	1973	100				3605	if ($lang) {
177	681					1145	for my $l (@$lang) {
178	821	100				21608	if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) {
179	230					6226	my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string };
180	230	50	33			780	if ($current && $current ne $reference_string) {
181							# Reset to undef to mark ambiguity
182	0					0	$self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef;
183							}
184							}
185							else {
186	591					15165	$self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string;
187							}
188							}
189							}
190
191							# If $string contains unicode diacritics, also add a version without them for misspellings
192	9	100		9		75	if ($string =~ m/\pM/) {
	9					22
	9					138
	1973					8511
193	231					404	my $stripped = $string;
194	231					1347	$stripped =~ s/\pM//g;
195	231					750	$self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
196							}
197							}
198
199							# Assemble designator regexes
200							sub _build_regex {
201	50			50		104	my $self = shift;
202	50					148	my ($type, $lang) = @_;
203
204	50					105	state $types = { map { $_ => 1 } qw(end end_cont begin) };
	12					41
205	50	50				193	if (! $types->{$type}) {
206	0					0	croak "invalid regex type '$type'";
207							}
208
209							# RA constructor - case insensitive, with match tracking
210	50					344	my $assembler = Regexp::Assemble->new->flags('i')->track(1);
211
212							# Construct language regex if $lang is set
213	50					4053	my $lang_re;
214	50	100				144	if ($lang) {
215	44	100				204	$lang = [ $lang ] if ! ref $lang;
216	44					200	my $lang_str = join '\|', sort @$lang;
217	44					698	$lang_re = qr/^($lang_str)$/;
218							}
219
220	50					113	my $count = 0;
221	50					97	while (my ($long, $entry) = each %{ $self->data }) {
	9350					222252
222							# If $lang is set, restrict to entries that include $lang
223	9300	100	100			38214	next if $lang_re && $entry->{lang} !~ $lang_re;
224							# If $type is 'begin', restrict to 'lead' entries
225	1528	100	100			3676	next if $type eq 'begin' && ! $entry->{lead};
226							# if $type is 'end_cont', restrict to languages in %LANG_CONTINUA
227	1023	100	100			2742	next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}};
228
229	691					924	$count++;
230	691					3535	my $long_nfd = NFD($long);
231	691					1923	$self->_add_to_assembler($assembler, $lang, $long_nfd);
232
233							# Add all abbreviations
234	691	100				2076	if (my $abbr_list = $entry->{abbr}) {
235	659	100				1523	$abbr_list = [ $abbr_list ] if ! ref $abbr_list;
236	659					1298	for my $abbr (@$abbr_list) {
237							# Only treat non-ascii abbreviations as continuous
238	1114	100	100			2440	next if $type eq 'end_cont' && $abbr =~ /^\p{ASCII}+$/;
239	1051					3900	my $abbr_nfd = NFD($abbr);
240	1051		66			4248	my $abbr_std = NFD($entry->{abbr_std} \|\| $abbr);
241	1051					2266	$self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
242							}
243							}
244							}
245
246							# If no entries found (a strange/bogus language?), return undef
247	50	100				405	return if $count == 0;
248
249	35	50				208	return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
250							}
251
252							# Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.)
253							# $type defaults to 'end', $lang defaults to undef (for all)
254							sub regex {
255	3282			3282	1	4604	my $self = shift;
256	3282					5823	my ($type, $lang) = @_;
257	3282		50			6010	$type \|\|= 'end';
258
259							# $lang might be an arrayref containing multiple language codes
260	3282					4049	my $lang_key;
261	3282	100				5677	if ($lang) {
262	1152					1525	$lang_key = $lang;
263	1152	50	66			2657	if (ref $lang && ref $lang eq 'ARRAY' && @$lang) {
			66
264	8	50				22	if (@$lang == 1) {
265	0					0	$lang_key = $lang->[0];
266							}
267							else {
268	8					22	$lang_key = join '_', sort map { lc $_ } @$lang;
	16					78
269							}
270							}
271							}
272
273	3282					4359	my $cache_key = $type;
274	3282	100				6168	$cache_key .= "_$lang_key" if $lang_key;
275
276	3282	100				88646	if (my $entry = $self->regex_cache->{ $cache_key }) {
277	3232	50				12012	return wantarray ? @$entry : $entry->[0];
278							}
279
280	50					227	my ($re, $assembler) = $self->_build_regex($type, $lang);
281	50					230461	$self->regex_cache->{ $cache_key } = [ $re, $assembler ];
282	50	50				354	return wantarray ? ( $re, $assembler ) : $re;
283							}
284
285							# Helper to return split_designator results
286							sub _split_designator_result {
287	1286			1286		12125	my $self = shift;
288	1286					5290	my ($lang, $before, $des, $after, $matched_pattern) = @_;
289
290							# $before can end in whitespace (that we don't want to consume in the RE
291							# for technical reasons around handling punctuation like '& Co' in designators)
292							# So trim here to handle that case.
293	1286	100				5959	$before =~ s/\s+$// if $before;
294
295	1286					2214	my $des_std;
296	1286	100				2782	if ($matched_pattern) {
297	980	100				10705	$des_std = $self->pattern_string_map_lang->{$lang}->{$matched_pattern} if $lang;
298	980		100			24003	$des_std \|\|= $self->pattern_string_map->{$matched_pattern};
299	980	100				2470	if ($des_std) {
300							# Always coalesce spaces and delete commas from $des_std
301	960					2274	$des_std =~ s/,+/ /g;
302	960					2973	$des_std =~ s/\s\s+/ /g;
303							}
304							}
305
306							# Legacy interface - return a simple before / des / after tuple, plus $des_std
307	1286	100	66			3390	return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std)
	1728	100				13422
308							if wantarray;
309
310							# New scalar-context interface - return SplitResult object
311	854	100	100			24006	Business::CompanyDesignator::SplitResult->new(
			100
			100
			100
312							before => NFC($before // ''),
313							designator => NFC($des // ''),
314							designator_std => NFC($des_std // ''),
315							after => NFC($after // ''),
316							records => [ $des_std ? $self->records(NFC $des_std) : () ],
317							);
318							}
319
320							# Split $company_name on (the first) company designator, returning a triplet of strings:
321							# ($before, $designator, $after), plus the normalised form of the designator. If no
322							# designator is found, just returns ($company_name).
323							# e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for
324							# the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator,
325							# but "Ltée" for the normalised form
326							sub split_designator {
327	1286			1286	1	285419	my $self = shift;
328	1286					4113	my ($company_name, %arg) = @_;
329	1286					2544	my $lang = $arg{lang};
330	1286					3041	my $allow_embedded = $arg{allow_embedded};
331	1286		100			5448	$allow_embedded //= 1; # backwards-compatibility, unfortunately
332	1286					8759	my $company_name_match = NFD($company_name);
333
334							# Handle older perls without XPosixPunct
335	1286	50				2326	state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ?
	4					35
336							'[\s\p{XPosixPunct}]' :
337							'[\s[:punct:]]';
338
339							# Strip all brackets for continuous language matching
340	1286					5330	(my $company_name_match_cont_stripped = $company_name_match) =~ s/[()\x{ff08}\x{ff09}]//g;
341
342	1286					2444	my ($end_re, $end_asr, $end_cont_re, $end_cont_asr, $begin_re, $begin_asr);
343	1286	100				2925	if ($lang) {
344	576	100				1903	if ($LANG_CONTINUA{$lang}) {
345	36					195	($end_cont_re, $end_cont_asr) = $self->regex('end_cont', $lang);
346							} else {
347	540					2023	($end_re, $end_asr) = $self->regex('end', $lang);
348							}
349	576					1584	($begin_re, $begin_asr) = $self->regex('begin', $lang);
350							} else {
351	710					2100	($end_re, $end_asr) = $self->regex('end');
352	710					1790	($end_cont_re, $end_cont_asr) = $self->regex('end_cont');
353	710					1574	($begin_re, $begin_asr) = $self->regex('begin');
354							}
355
356							# Designators are usually final, so try $end_re first
357	1286	100	100			360951	if ($end_re &&
358							$company_name_match =~ m/^\s(.?)${punct_class}\s$?($end_re)$?\s$/) {
359	677					1266193	return $self->_split_designator_result($lang, $1, $2, undef, $end_asr->source($^R));
360							}
361
362							# No final designator - retry without a word break for the subset of languages
363							# that use continuous scripts (see %LANG_CONTINUA above)
364	609	100	100			632773	if ($end_cont_re &&
365							$company_name_match_cont_stripped =~ m/^\s(.?)$?($end_cont_re)$?\s*$/) {
366	114					1902	return $self->_split_designator_result($lang, $1, $2, undef, $end_cont_asr->source($^R));
367							}
368
369							# No final designator - check for a lead designator instead (e.g. RU, NL, etc.)
370	495	100	100			45453	if ($begin_re &&
371							$company_name_match =~ m/^\s$?($begin_re)$?${punct_class}\s(.?)\s$/) {
372	147					53550	return $self->_split_designator_result($lang, undef, $1, $2, $begin_asr->source($^R));
373							}
374
375							# No final or initial - check for an embedded designator with trailing content
376	348	100	100			34810	if ($end_re && $allow_embedded &&
			100
377							$company_name_match =~ m/(.?)${punct_class}\s$?($end_re)$?(?:\s+(.*?))?$/) {
378	42					70350	return $self->_split_designator_result($lang, $1, $2, $3, $end_asr->source($^R));
379							}
380
381							# No match - return $company_name unchanged
382	306					16268	return $self->_split_designator_result($lang, $company_name);
383							}
384
385							1;
386
387							__END__
388
389							=encoding utf-8
390
391							=head1 NAME
392
393							Business::CompanyDesignator - module for matching and stripping/manipulating the
394							company designators appended to company names
395
396							=head1 VERSION
397
398							Version: 0.17.
399
400							This module is considered a B<BETA> release. Interfaces may change and/or break
401							without notice until the module reaches version 1.0.
402
403							=head1 SYNOPSIS
404
405							Business::CompanyDesignator is a perl module for matching and stripping/manipulating
406							the typical company designators appended (or sometimes, prepended) to company names.
407							It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and
408							abbreviations (e.g. Corp., Inc., Ltd., GmbH etc).
409
410							use Business::CompanyDesignator;
411
412							# Constructor
413							$bcd = Business::CompanyDesignator->new;
414							# Optionally, you can provide your own company_designator.yml file, instead of the bundled one
415							$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
416
417							# Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.)
418							@des = $bcd->designators;
419							@long = $bcd->long_designators;
420							@abbrev = $bcd->abbreviations;
421
422							# Lookup individual designator records (returns B::CD::Record objects)
423							# Lookup record by long designator (unique)
424							$record = $bcd->record($long_designator);
425							# Lookup records by abbreviation or long designator (may not be unique)
426							@records = $bcd->records($designator);
427
428							# Get a regex for matching designators by type ('end'/'begin') and lang
429							# By default, returns 'end' regexes for all languages
430							$re = $bcd->regex;
431							$company_name =~ $re and say 'designator found!';
432							$company_name =~ /$re\s*$/ and say 'final designator found!';
433							my $re_begin_en = $bcd->regex('begin', 'en');
434
435							# Split $company_name on designator, returning a ($before, $designator, $after) triplet,
436							# plus the normalised form of the designator matched (can pass to records(), for example)
437							($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name);
438
439							# Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object
440							$res = $bcd->split_designator($company_name, lang => 'en');
441							print join ' / ', $res->designator_std, $res->short_name, $res->extra;
442
443
444							=head1 DATASET
445
446							Business::CompanyDesignator uses the company designator dataset from here:
447
448							L<https://github.com/ProfoundNetworks/company_designator>
449
450							which is bundled with the module. You can use your own (updated or custom)
451							version, if you prefer, by passing a 'datafile' parameter to the constructor.
452
453							The dataset defines multiple long form designators (like "Company", "Limited",
454							or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.',
455							'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry,
456							for instance, looks like this:
457
458							Company:
459							abbr:
460							- Co.
461							- '& Co.'
462							- and Co.
463							- and Company
464							lang: en
465
466							Long designators are unique across the dataset, but abbreviations are not
467							e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée".
468
469							=head1 METHODS
470
471							=head2 new()
472
473							Creates a Business::CompanyDesignator object.
474
475							$bcd = Business::CompanyDesignator->new;
476
477							By default this uses the bundled company_designator dataset. You may
478							provide your own (updated or custom) version by passing via a 'datafile'
479							parameter to the constructor.
480
481							$bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
482
483							=head2 designators()
484
485							Returns the full list of company designator strings from the dataset
486							(both long form and abbreviations).
487
488							@designators = $bcd->designators;
489
490							=head2 long_designators()
491
492							Returns the full list of long form designators from the dataset.
493
494							@long = $bcd->long_designators;
495
496							=head2 abbreviations()
497
498							Returns the full list of abbreviation designators from the dataset.
499
500							@abbrev = $bcd->abbreviations;
501
502							=head2 record($long_designator)
503
504							Returns the Business::CompanyDesignator::Record object for the given
505							long designator (and dies if not found).
506
507							=head2 records($designator)
508
509							Returns a list of Business::CompanyDesignator::Record objects for the
510							given abbreviation or long designator (for long designators there will
511							only be a single record returned, but abbreviations may map to multiple
512							records).
513
514							Use this method for abbreviations, or if you're aren't sure of a
515							designator's type.
516
517							=head2 regex([$type], [$lang])
518
519							Returns a regex for all matching designators for $type ('begin'/'end') and
520							$lang (iso 639-1 language code e.g. 'en', 'es', de', etc.) from the dataset.
521							$lang may be either a single language code scalar, or an arrayref of language
522							codes, for multiple alternative languages. The returned regex is case-insensitive
523							and non-anchored.
524
525							$type defaults to 'end', so without parameters regex() returns a regex
526							matching all designators for all languages.
527
528							=head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool])
529
530							Attempts to split $company_name on (the first) company designator found.
531
532							In array context split_designator returns a list of four items - a triplet of
533							strings from $company_name ( $before, $designator, $after ), plus the
534							standardised version of the designator as a fourth element.
535
536							($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name);
537
538							In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult>
539							object.
540
541							$res = $bcd->split_designator($company_name, lang => $lang);
542
543							The $des designator in array context, and the SplitResult $res->designator
544							is the designator text as it matched in $company_name, while the array context
545							$des_std, and the SplitResult $res->designator_std is the standardised version
546							as found in the dataset.
547
548							For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but
549							"Pty. Ltd." as the stardardised form, and the latter would be what you
550							would find in designators() or would lookup with records(). Similarly,
551							"Accessoires XYZ Ltee" (without the french acute) would match, returning
552							"Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the
553							standardised form.
554
555							split_designator accepts the following optional (named) parameters:
556
557							=over 4
558
559							=item lang => $lang
560
561							$lang can be a scalar ISO 639-1 language code ('en', 'fr', 'cn', etc.), or an
562							arrayref containing multiple language codes. If $lang is defined, split_designator
563							will only match designators for the specified set of languages, which can improve
564							the accuracy of the split by reducing false positive matches.
565
566							=item allow_embedded => $boolean
567
568							allow_embedded is a boolean indicating whether or not designators can occur in
569							the middle of strings, instead of only at the beginning or end. Defaults to true,
570							for backwards compatibility, which yields more matches, but also more false
571							positives. Setting to false is safer, but yields fewer matches (and embedded
572							designators do occur surprisingly often in the wild.)
573
574							For more discussion, see L<AMBIGUITIES> below.
575
576							=back
577
578							=head2 AMBIGUITIES
579
580							Note that split_designator does not always get the split right. It checks for
581							final designators first, then leading ones, and then finally looks for embedded
582							designators (if allow_embedded is set to true).
583
584							Leading and trailing designators are usually reasonably accurate, but embedded
585							designators are problematic. For instance, embedded designators allow names like
586							these to split correctly:
587
588							Amerihealth Insurance Company of NJ
589							Trenkwalder Personal AG Schweiz
590							Vicente Campano S L (COMERCIAL VICAM)
591							Gvozdika, gostinitsa OOO ""Eko-Treyd""
592
593							but it will also wrongly split names like the following:
594
595							XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation')
596							Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada')
597
598							If you do want to allow splitting on embedded designators, you might want to pass
599							a 'lang' parameter to split_designator if you know the language(s) used for your
600							company names, as this will reduce the number of false positives by restricting the
601							set of designators matched against. It won't eliminate the issue altogether though,
602							so some post-processing might be required. (And I'd love to hear of ideas on how
603							to improve this.)
604
605							=head1 SEE ALSO
606
607							Finance::CompanyNames
608
609							=head1 AUTHOR
610
611							Gavin Carr <gavin@profound.net>
612
613							=head1 COPYRIGHT AND LICENCE
614
615							Copyright (C) 2013-2021 Gavin Carr
616
617							This library is free software; you can redistribute it and/or modify it
618							under the same terms as Perl itself.
619
620							=cut