File Coverage

blib/lib/Lingua/LO/NLP/Data.pm

Criterion	Covered	Total	%
statement	47	47	100.0
branch	2	2	100.0
condition			n/a
subroutine	17	17	100.0
pod	5	5	100.0
total	71	71	100.0

line	stmt	bran	sub	pod	time	code
1						package Lingua::LO::NLP::Data;
2	8		8		170908	use strict;
	8				102
	8				262
3	8		8		53	use warnings;
	8				18
	8				236
4	8		8		158	use 5.012000;
	8				38
5	8		8		66	use utf8;
	8				21
	8				48
6	8		8		260	use feature 'unicode_strings';
	8				18
	8				852
7	8		8		755	use version 0.77; our $VERSION = version->declare('v1.0.1');
	8				4313
	8				58
8	8		8		1875	use charnames qw/ :full lao /;
	8				64829
	8				55
9	8		8		29474	use parent 'Exporter';
	8				2272
	8				46
10
11						=encoding utf8
12
13						=head1 NAME
14
15						Lingua::LO::NLP::Data - Helper module to keep common read-only data
16
17						=head1 FUNCTION
18
19						Provides a few functions that return regular expressions for matching and
20						extracting parts from Lao syllables. Instead of hardcoding these expressions as
21						strings, they are constructed from fragments at runtime, trading maintainability
22						for a small one-time initialization cost.
23
24						Also holds common read-only data such as vowel classifications.
25
26						You will probably not want to use this module on its own. If you do, see the
27						other L modules for examples.
28
29						=cut
30
31						our %EXPORT_TAGS = (
32						all => [ qw/
33						get_sylre_basic get_sylre_full get_sylre_named is_long_vowel
34						normalize_tone_marks
35						/
36						]
37						);
38						our @EXPORT_OK = @{ $EXPORT_TAGS{all} };
39
40						# Character classes
41						my $TONE_MARKS = "\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}" .
42						"\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}";
43
44						my $CONSONANTS = "\N{LAO LETTER KO}\N{LAO LETTER KHO SUNG}\N{LAO LETTER KHO TAM}" .
45						"\N{LAO LETTER NGO}\N{LAO LETTER CO}\N{LAO LETTER SO TAM}\N{LAO LETTER NYO}" .
46						"\N{LAO LETTER DO}\N{LAO LETTER TO}\N{LAO LETTER THO SUNG}\N{LAO LETTER THO TAM}" .
47						"\N{LAO LETTER NO}\N{LAO LETTER BO}\N{LAO LETTER PO}\N{LAO LETTER PHO SUNG}" .
48						"\N{LAO LETTER FO TAM}\N{LAO LETTER PHO TAM}\N{LAO LETTER FO SUNG}" .
49						"\N{LAO LETTER MO}\N{LAO LETTER YO}\N{LAO LETTER LO LING}\N{LAO LETTER LO LOOT}" .
50						"\N{LAO LETTER WO}\N{LAO LETTER SO SUNG}\N{LAO LETTER HO SUNG}\N{LAO LETTER O}" .
51						"\N{LAO LETTER HO TAM}";
52
53						my $VOWELS_COMBINING = "\N{LAO VOWEL SIGN MAI KAN}" .
54						"\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" .
55						"\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" .
56						"\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO NIGGAHITA}";
57
58						my $VOWELS = "\N{LAO VOWEL SIGN A}\N{LAO VOWEL SIGN MAI KAN}\N{LAO VOWEL SIGN AA}" .
59						"\N{LAO VOWEL SIGN AM}\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" .
60						"\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" .
61						"\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO SEMIVOWEL SIGN LO}" .
62						"\N{LAO SEMIVOWEL SIGN NYO}\N{LAO VOWEL SIGN E}\N{LAO VOWEL SIGN EI}" .
63						"\N{LAO VOWEL SIGN O}\N{LAO VOWEL SIGN AY}\N{LAO VOWEL SIGN AI}" .
64						"\N{LAO NIGGAHITA}";
65
66						# Regular expression fragments. The cryptic names correspond to the naming
67						# in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking
68						# Using what looks like interpolated variables in single-quoted strings is
69						# intentional; the interpolation is done manually later to be able to construct
70						# expressions with and without named captures.
71						my %regexp_fragments = (
72						x0_1 => 'ເ',
73						x0_2 => 'ແ',
74						x0_3 => 'ໂ',
75						x0_4 => 'ໄ',
76						x0_5 => 'ໃ',
77
78						x1 => 'ຫ',
79
80						x => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]',
81
82						x2 => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]",
83
84						x3 => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]",
85
86						x4_12 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]",
87						x4_34 => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
88						x4_5 => "\N{LAO NIGGAHITA}",
89						x4_6 => "\N{LAO VOWEL SIGN MAI KON}",
90						x4_7 => "\N{LAO VOWEL SIGN MAI KAN}",
91						x4_1t4 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
92
93						x5 => "[$TONE_MARKS]",
94
95						x6_1 => 'ວ',
96						x6_2 => 'ອ',
97						x6_3 => 'ຽ',
98						x6 => '[ວອຽ]',
99
100						x7_1 => 'ະ',
101						x7_2 => 'າ',
102						x7_3 => "\N{LAO VOWEL SIGN AM}",
103
104						x8_3t8 => '[ຍດນມຢບ]',
105						x8 => '[ກງຍດນມຢບວ]',
106
107						x9 => '[ຈສຊພຟລ]',
108
109						x10_12 => '[ຯໆ]',
110						x10_3 => "\N{LAO CANCELLATION MARK}",
111
112						x9a10_3 => '(?: $x9 $x10_3)',
113						);
114						my $re1_all = '$x0_1 $x1? $x $x2?';
115						## See naming explanation above
116						## no critic(Bangs::ProhibitNumberedNames)
117						my $re1_1 = '$x5? $x8? $x9a10_3?';
118						my $re1_2 = '$x4_12 $x5? $x8? $x9a10_3?';
119						my $re1_3 = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?';
120						my $re1_4 = '$x7_2? $x7_1';
121						my $re1_5 = '$x4_6 $x5? $x7_2';
122						my $re1_6 = '$x4_7 $x5? $x8 $x9a10_3?';
123						my $re1_8 = '$x4_7? $x5? $x6_3';
124
125						my $re2_all = '$x0_2 $x1? $x $x2?';
126						my $re2_1 = '$x5? $x6? $x8? $x9a10_3?';
127						my $re2_2 = '$x7_1';
128						my $re2_3 = '$x4_7 $x5? $x8 $x9a10_3?';
129
130						my $re3_all = '$x0_3 $x1? $x $x2?';
131						my $re3_1 = '$x5? $x8? $x9a10_3?';
132						my $re3_2 = '$x7_1';
133						my $re3_3 = '$x4_7 $x5? $x8_3t8?';
134
135						my $re4 = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?';
136
137						my $re5 = '$x0_5 $x1? $x $x2? $x5? $x6_1?';
138
139						my $re6 = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?';
140
141						my $re7 = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?';
142
143						my $re8 = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?';
144
145						my $re9 = '$x1? $x $x2? $x4_6 $x5? ( $x8 $x9a10_3? \| $x6_1 $x7_1 )';
146
147						my $re10 = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?';
148
149						my $re11 = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?';
150
151						my $re12 = '$x1? $x $x2? $x5? $x7_1';
152
153						my $re13 = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?';
154
155						my $re14 = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?';
156
157						my $re_num = '[໑໒໓໔໕໖໗໘໙໐]';
158
159						my $rex1012 = '$x10_12';
160
161						# This is the basic regexp that matches a syllable, still with variables to be
162						# substituted
163						my $re_basic = <<"EOF";
164						(
165						(
166						( $re1_all ( $re1_1 \| $re1_2 \| $re1_3 \| $re1_4 \| $re1_5 \| $re1_6 \| $re1_8 ) ) \|
167						( $re2_all ( $re2_1 \| $re2_2 \| $re2_3 ) ) \|
168						( $re3_all ( $re3_1 \| $re3_2 \| $re3_3 ) ) \|
169						$re4 \| $re5 \| $re6 \| $re7 \| $re8 \| $re9 \|
170						$re10 \| $re11 \| $re12 \| $re13 \| $re14
171						) $rex1012? \|
172						$re_num+
173						)
174						EOF
175
176						# A simplified lookahead expression that matches only the syllables not starting
177						# in x0/x1. For the latter, matching their first character is sufficient.
178						my $re_lookahead = <<"EOF";
179						(
180						$re6 \| $re7 \| $re8 \| $re9 \| $re10 \| $re11 \| $re12 \| $re13 \| $re14 \| $re_num+
181						)
182						EOF
183
184						# Fix up regexen
185						for( $re_basic, $re_lookahead ) {
186						s/\n//gs; # Remove newlines
187						s/\(/(?:/gs; # Make all groups non-capturing
188						s/\s+//g;
189						}
190
191						# Functional names for all the x-something groups from the original paper
192						# Used for named captures.
193						my %CAPTURE_NAMES = (
194						'x' => 'consonant',
195						'x0_\d' => 'vowel0',
196						'x1' => 'h',
197						'x2' => 'semivowel',
198						'x3' => 'vowel1',
199						'x4_[1-9t]{1,3}'=> 'vowel1',
200						'x5' => 'tone_mark',
201						'x6' => 'vowel2',
202						'x6_\d' => 'vowel2',
203						'x7_2' => 'vowel2',
204						'x7_[13]' => 'vowel3',
205						'x8' => 'end_consonant',
206						'x8_3t8' => 'end_consonant',
207						'x9' => 'foreign_consonant',
208						'x10_12' => 'extra',
209						'x10_3' => 'cancel',
210						);
211
212						# Substitute longer fragment names first so their matches don't get swallowed
213						# by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)'
214						# so we have to do it first.
215						my @SORTED_X_NAMES = ('x9a10_3', reverse sort { length $a <=> length $b } keys %CAPTURE_NAMES);
216
217						our %VOWEL_LENGTH = (
218						### Monophthongs
219						'Xະ' => 0, # /a/
220						'Xັ' => 0, # /a/ with end consonant
221						'Xາ' => 1, # /aː/
222
223						'Xິ' => 0, # /i/
224						'Xີ' => 1, # /iː/
225
226						'Xຶ' => 0, # /ɯ/
227						'Xື' => 1, # /ɯː/
228
229						'Xຸ' => 0, # /u/
230						'Xູ' => 1, # /uː/
231
232						'ເXະ' => 0, # /e/
233						'ເXັ' => 0, # /e/ with end consonant
234						'ເX' => 1, # /eː/
235
236						'ແXະ' => 0, # /ɛ/
237						'ແXັ' => 0, # /ɛ/ with end consonant
238						'ແX' => 1, # /ɛː/
239
240						'ໂXະ' => 0, # /o/
241						'Xົ' => 0, # /o/
242						'ໂX' => 1, # /oː/
243
244						'ເXາະ' => 0, # /ɔ/
245						'Xັອ' => 0, # /ɔ/ with end consonant
246						'Xໍ' => 1, # /ɔː/
247						'Xອ' => 1, # /ɔː/ with end consonant
248
249						'ເXິ' => 0, # /ɤ/
250						'ເXີ' => 1, # /ɤː/
251
252						###' Diphthongs
253						'ເXັຍ' => 0, # /iə/
254						'Xັຽ' => 0, # /iə/
255						'ເXຍ' => 1, # /iːə/
256						'Xຽ' => 1, # /iːə/
257
258						'ເXຶອ' => 0, # /ɯə/
259						'ເXືອ' => 1, # /ɯːə/
260
261						'Xົວະ' => 0, # /uə/
262						'Xັວ' => 0, # /uə/
263						'Xົວ' => 1, # /uːə/
264						'Xວ' => 1, # /uːə/ with end consonant
265
266						'ໄX' => 1, # /aj/ - Actually short but counts as long for rules
267						'ໃX' => 1, # /aj/ - Actually short but counts as long for rules
268						'Xາຍ' => 1, # /aj/ - Actually short but counts as long for rules
269						'Xັຍ' => 0, # /aj/
270
271						'ເXົາ' => 0, # /aw/
272						'Xໍາ' => 0, # /am/
273						);
274						{
275						# Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit.
276						my %v;
277						foreach my $v (keys %VOWEL_LENGTH) {
278						(my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/;
279						$v{$w} = $VOWEL_LENGTH{$v};
280						}
281						%VOWEL_LENGTH = %v;
282						}
283
284
285						=head1 FUNCTIONS
286
287						=head2 get_sylre_basic
288
289						Returns a basic regexp that can match a Lao syllable. It consists of a bunch of
290						alternations and will thus return the I possible match which is neither
291						guaranteed to be the longest nor the appropriate one in a longer sequence of
292						characters. It is useful as a building block and for verifying syllables
293						though.
294
295						=cut
296
297						sub get_sylre_basic {
298	7		7	1	98	my $syl_re = _subst_regexp_fragments($re_basic);
299	7				3201	return qr/ $syl_re /x;
300						}
301
302						=head2 get_sylre_full
303
304						In addition to the matching done by L, this one makes sure
305						matches are either followed by another complete syllable (or what can only be
306						the start of one), a space, the end of string/line or some non-Lao character.
307						This ensures correct matching of ambiguous syllable boundaries where the core
308						consonant of a following syllable could also be an end consonant of the current
309						one.
310
311						=cut
312
313						sub get_sylre_full {
314	6		6	1	26	return _format_syllable_regexp(
315						_subst_regexp_fragments($re_basic),
316						);
317						}
318
319						=head2 get_sylre_named
320
321						The expression returned is the same as for L but also includes
322						named captures that upon a successful match allow to get the syllable's parts
323						from C<%+>.
324
325						=cut
326
327						sub get_sylre_named {
328	6		6	1	27	return _format_syllable_regexp(
329						_subst_regexp_fragments_named($re_basic),
330						);
331						}
332
333						=head2 is_long_vowel
334
335						C
336
337						Returns a boolean indicating whether the vowel passed in is long. Consonant
338						placeholders must be included in the form of DOTTED CIRCLE (U+25CC). Note that
339						for speed there is no check if the vowel actually exists in the data, so
340						passing many bogus values may lead to uncontrolled growth of the
341						C<%VOWEL_LENGTH> hash due to autovivification!
342
343						=cut
344
345	289		289	1	896	sub is_long_vowel { return $VOWEL_LENGTH{+shift} }
346
347						=head2 normalize_tone_marks
348
349						C
350
351						Normalize tone mark order in C<$text>. Usually when using a combining vowel
352						such as ◌ິ, ◌ຸ or ◌ໍ with a tone mark, they have to be typed in the order
353						I as renderers are supposed to stack above-consonant
354						signs in the order they appear in the text, and tone marks are supposed to go
355						on top. As some renderers will put them on top no matter what, these sequences
356						are sometimes incorrectly entered as I and would thus
357						not be parsed correctly.
358
359						This function is just meant for internal use and modifies its argument in place
360						for speed!
361
362						=cut
363
364						## Modify-in-place for speed reasons
365						## no critic(Subroutines::RequireFinalReturn)
366						sub normalize_tone_marks {
367	87		87	1	146	my $t = $_[0];
368	87				489	$_[0] =~ s/([$CONSONANTS])([$TONE_MARKS])([$VOWELS_COMBINING])/$1$3$2/og;
369						}
370
371						sub _named_capture {
372	966		966		2429	my ($fragments, $atom, $match) = @_;
373
374						return sprintf(
375						'(?<%s> %s)',
376						$CAPTURE_NAMES{$atom}, $fragments->{$match}
377	966	100			6478	) if defined $CAPTURE_NAMES{$atom};
378
379	96				488	return $fragments->{$match};
380						}
381
382						sub _subst_regexp_fragments_named {
383	6		6		34	my $re = shift;
384
385	6				22	for my $atom (@SORTED_X_NAMES) {
386	102				2315	$re =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg;
	966				2110
387						}
388
389	6				73	return $re;
390						}
391
392						sub _subst_regexp_fragments {
393	25		25		65	my $re = shift;
394
395	25				64	for my $atom (@SORTED_X_NAMES) {
396	425				6813	$re =~ s/\$($atom)/$regexp_fragments{$1}/eg;
	3041				13122
397						}
398
399	25				120	return $re;
400						}
401
402						sub _format_syllable_regexp {
403	12		12		36	my $re = shift;
404	12				43	my $lookahead = _subst_regexp_fragments($re_lookahead);
405
406						my $expr = sprintf(
407						'%s (?= [%s] \| \s \| \P{Lao} \| $ \| %s )',
408						$re,
409	12				238	join('', @regexp_fragments{qw/ x0_1 x0_2 x0_3 x0_4 x0_5 x1 /} ),
410						$lookahead
411						);
412
413	12				5902	return qr/ $expr /x;
414						}
415
416
417						1;