File Coverage

blib/lib/Lingua/LO/Transform/Data.pm

Criterion	Covered	Total	%
statement	40	43	93.0
branch	2	2	100.0
condition			n/a
subroutine	13	16	81.2
pod	3	7	42.8
total	58	68	85.2

line	stmt	bran	sub	pod	time	code
1						package Lingua::LO::Transform::Data;
2	5		5		46946	use strict;
	5				6
	5				113
3	5		5		14	use warnings;
	5				5
	5				95
4	5		5		68	use 5.012000;
	5				16
5	5		5		14	use utf8;
	5				5
	5				23
6	5		5		85	use feature 'unicode_strings';
	5				651
	5				375
7	5		5		819	use version 0.77; our $VERSION = version->declare('v0.0.1');
	5				2559
	5				24
8	5		5		1313	use charnames qw/ :full lao /;
	5				42919
	5				23
9	5		5		21169	use parent 'Exporter';
	5				1150
	5				22
10
11						=encoding UTF-8
12
13						=head1 NAME
14
15						Lingua::LO::Transform::Data - Helper module to keep common read-only data
16
17						=head1 FUNCTION
18
19						Provides a few functions that return regular expressions for matching and
20						extracting parts from Lao syllables. Instead of hardcoding these expressions as
21						strings, they are constructed from ragments at runtime, trading maintainability
22						for a small one-time initialization cost.
23
24						Also holds common read-only data such as vowel classifications.
25
26						You will probably not want to use this module on its own. If you do, see the
27						other L modules for examples.
28
29						=cut
30
31						our %EXPORT_TAGS = (
32						all => [ qw/
33						get_sylre_basic get_sylre_full get_sylre_named is_long_vowel
34						get_consonants get_vowels get_tone_marks
35						/
36						]
37						);
38						our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
39
40						# Character classes
41						my $TONE_MARKS = "\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}" .
42						"\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}";
43						my $CONSONANTS = "\N{LAO LETTER KO}\N{LAO LETTER KHO SUNG}\N{LAO LETTER KHO TAM}" .
44						"\N{LAO LETTER NGO}\N{LAO LETTER CO}\N{LAO LETTER SO TAM}\N{LAO LETTER NYO}" .
45						"\N{LAO LETTER DO}\N{LAO LETTER TO}\N{LAO LETTER THO SUNG}\N{LAO LETTER THO TAM}" .
46						"\N{LAO LETTER NO}\N{LAO LETTER BO}\N{LAO LETTER PO}\N{LAO LETTER PHO SUNG}" .
47						"\N{LAO LETTER FO TAM}\N{LAO LETTER PHO TAM}\N{LAO LETTER FO SUNG}" .
48						"\N{LAO LETTER MO}\N{LAO LETTER YO}\N{LAO LETTER LO LING}\N{LAO LETTER LO LOOT}" .
49						"\N{LAO LETTER WO}\N{LAO LETTER SO SUNG}\N{LAO LETTER HO SUNG}\N{LAO LETTER O}" .
50						"\N{LAO LETTER HO TAM}";
51						my $VOWELS = "\N{LAO VOWEL SIGN A}\N{LAO VOWEL SIGN MAI KAN}\N{LAO VOWEL SIGN AA}" .
52						"\N{LAO VOWEL SIGN AM}\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" .
53						"\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" .
54						"\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO SEMIVOWEL SIGN LO}" .
55						"\N{LAO SEMIVOWEL SIGN NYO}\N{LAO VOWEL SIGN E}\N{LAO VOWEL SIGN EI}" .
56						"\N{LAO VOWEL SIGN O}\N{LAO VOWEL SIGN AY}\N{LAO VOWEL SIGN AI}" .
57						"\N{LAO NIGGAHITA}";
58
59						# Regular expression fragments. The cryptic names correspond to the naming
60						# in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking
61						# Using what looks like interpolated variables in single-quoted strings is
62						# intentional; the interpolation is done manually later to be able to construct
63						# expressions with and without named captures.
64						my %regexp_fragments = (
65						x0_1 => 'ເ',
66						x0_2 => 'ແ',
67						x0_3 => 'ໂ',
68						x0_4 => 'ໄ',
69						x0_5 => 'ໃ',
70
71						x1 => 'ຫ',
72
73						x => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]',
74
75						x2 => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]",
76
77						x3 => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]",
78
79						x4_12 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]",
80						x4_34 => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
81						x4_5 => "\N{LAO NIGGAHITA}",
82						x4_6 => "\N{LAO VOWEL SIGN MAI KON}",
83						x4_7 => "\N{LAO VOWEL SIGN MAI KAN}",
84						x4_1t4 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
85
86						x5 => "[$TONE_MARKS]",
87
88						x6_1 => 'ວ',
89						x6_2 => 'ອ',
90						x6_3 => 'ຽ',
91						x6 => '[ວອຽ]',
92
93						x7_1 => 'ະ',
94						x7_2 => 'າ',
95						x7_3 => "\N{LAO VOWEL SIGN AM}",
96
97						x8_3t8 => '[ຍດນມຢບ]',
98						x8 => '[ກງຍດນມຢບວ]',
99
100						x9 => '[ຈສຊພຟລ]',
101
102						x10_12 => '[ຯໆ]',
103						x10_3 => "\N{LAO CANCELLATION MARK}",
104
105						x9a10_3 => '(?: $x9 $x10_3)',
106						);
107						my $re1_all = '$x0_1 $x1? $x $x2?';
108						my $re1_1 = '$x5? $x8? $x9a10_3?';
109						my $re1_2 = '$x4_12 $x5? $x8? $x9a10_3?';
110						my $re1_3 = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?';
111						my $re1_4 = '$x7_2? $x7_1';
112						my $re1_5 = '$x4_6 $x5? $x7_2';
113						my $re1_6 = '$x4_7 $x5? $x8 $x9a10_3?';
114						my $re1_8 = '$x4_7? $x5? $x6_3';
115
116						my $re2_all = '$x0_2 $x1? $x $x2?';
117						my $re2_1 = '$x5? $x6? $x8? $x9a10_3?';
118						my $re2_2 = '$x7_1';
119						my $re2_3 = '$x4_7 $x5? $x8 $x9a10_3?';
120
121						my $re3_all = '$x0_3 $x1? $x $x2?';
122						my $re3_1 = '$x5? $x8? $x9a10_3?';
123						my $re3_2 = '$x7_1';
124						my $re3_3 = '$x4_7 $x5? $x8_3t8?';
125
126						my $re4 = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?';
127
128						my $re5 = '$x0_5 $x1? $x $x2? $x5? $x6_1?';
129
130						my $re6 = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?';
131
132						my $re7 = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?';
133
134						my $re8 = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?';
135
136						my $re9 = '$x1? $x $x2? $x4_6 $x5? (?: $x8 $x9a10_3? \| $x6_1 $x7_1 )';
137
138						my $re10 = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?';
139
140						my $re11 = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?';
141
142						my $re12 = '$x1? $x $x2? $x5? $x7_1';
143
144						my $re13 = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?';
145
146						my $re14 = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?';
147
148						my $re_num = '[໑໒໓໔໕໖໗໘໙໐]';
149
150						my $rex1012 = '$x10_12';
151
152						# This is the basic regexp that matches a syllable, still with variables to be
153						# substituted
154						my $re_basic = <
155						(?:
156						(?:
157						(?: $re1_all (?: $re1_1 \| $re1_2 \| $re1_3 \| $re1_4 \| $re1_5 \| $re1_6 \| $re1_8 ) ) \|
158						(?: $re2_all (?: $re2_1 \| $re2_2 \| $re2_3 ) ) \|
159						(?: $re3_all (?: $re3_1 \| $re3_2 \| $re3_3 ) ) \|
160						$re4 \| $re5 \| $re6 \| $re7 \| $re8 \| $re9 \|
161						$re10 \| $re11 \| $re12 \| $re13 \| $re14
162						) $rex1012? \|
163						$re_num+
164						)
165						EOF
166						$re_basic =~ s/\n//gs;
167						$re_basic =~ s/\s+/ /g; # keep it a bit more readable. could use s/\s+//g
168
169						# Functional names for all the x-something groups from the original paper
170						# Used for named catures.
171						my %capture_names = (
172						'x' => 'consonant',
173						'x0_\d' => 'vowel0',
174						'x1' => 'h',
175						'x2' => 'semivowel',
176						'x3' => 'vowel1',
177						'x4_[1-9t]{1,3}'=> 'vowel1',
178						'x5' => 'tone_mark',
179						'x6' => 'vowel2',
180						'x6_\d' => 'vowel2',
181						'x7_\d' => 'vowel3',
182						'x8' => 'end_consonant',
183						'x8_3t8' => 'end_consonant',
184						'x9' => 'foreign_consonant',
185						'x10_12' => 'extra',
186						'x10_3' => 'cancel',
187						);
188
189						# Substitute longer fragment names first so their matches don't get swallowed
190						# by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)'
191						# so we have to do it first.
192						my @sorted_x_names = ('x9a10_3', reverse sort { length $a <=> length $b } keys %capture_names);
193
194						my %VOWEL_LENGTH = (
195						### Monophthongs
196						'Xະ' => 0, # /a/
197						'Xັ' => 0, # /a/ with end consonant
198						'Xາ' => 1, # /aː/
199
200						'Xິ' => 0, # /i/
201						'Xີ' => 1, # /iː/
202
203						'Xຶ' => 0, # /ɯ/
204						'Xື' => 1, # /ɯː/
205
206						'Xຸ' => 0, # /u/
207						'Xູ' => 1, # /uː/
208
209						'ເXະ' => 0, # /e/
210						'ເXັ' => 0, # /e/ with end consonant
211						'ເX' => 1, # /eː/
212
213						'ແXະ' => 0, # /ɛ/
214						'ແXັ' => 0, # /ɛ/ with end consonant
215						'ແX' => 1, # /ɛː/
216
217						'ໂXະ' => 0, # /o/
218						'Xົ' => 0, # /o/
219						'ໂX' => 1, # /oː/
220
221						'ເXາະ' => 0, # /ɔ/
222						'Xັອ' => 0, # /ɔ/ with end consonant
223						'Xໍ' => 1, # /ɔː/
224						'Xອ' => 1, # /ɔː/ with end consonant
225
226						'ເXິ' => 0, # /ɤ/
227						'ເXີ' => 1, # /ɤː/
228
229						###' Diphthongs
230						'ເXັຍ' => 0, # /iə/
231						'Xັຽ' => 0, # /iə/
232						'ເXຍ' => 1, # /iːə/
233						'Xຽ' => 1, # /iːə/
234
235						'ເXຶອ' => 0, # /ɯə/
236						'ເXືອ' => 1, # /ɯːə/
237
238						'Xົວະ' => 0, # /uə/
239						'Xັວ' => 0, # /uə/
240						'Xົວ' => 1, # /uːə/
241						'Xວ' => 1, # /uːə/ with end consonant
242
243						'ໄX' => 1, # /aj/ - Actually short but counts as long for rules
244						'ໃX' => 1, # /aj/ - Actually short but counts as long for rules
245						'Xາຍ' => 1, # /aj/ - Actually short but counts as long for rules
246						'Xັຍ' => 0, # /aj/
247
248						'ເXົາ' => 0, # /aw/
249						'Xໍາ' => 0, # /am/
250						);
251						{
252						# Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit.
253						my %v;
254						foreach my $v (keys %VOWEL_LENGTH) {
255						(my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/;
256						$v{$w} = $VOWEL_LENGTH{$v};
257						}
258						%VOWEL_LENGTH = %v;
259						}
260
261
262						=head1 FUNCTIONS
263
264						=head2 get_sylre_basic
265
266						Returns a basic regexp that can match a Lao syllable. It consists of a bunch of
267						alternations and will thus return the I possible match which is neither
268						guaranteed to be the longest nor the appropriate one in a longer sequence of
269						characters. It is useful as a building block and for verifying syllables
270						though.
271
272						=cut
273
274						sub get_sylre_basic {
275	11		11	1	26	my $syl_re = $re_basic;
276	11				20	for my $atom (@sorted_x_names) {
277	176				2129	$syl_re =~ s/\$($atom)/$regexp_fragments{$1}/eg;
	1771				3758
278						}
279
280	11				2010	return qr/ $syl_re /x;
281						}
282
283						=head2 get_sylre_full
284
285						In addition to the matching done by L, this one makes sure
286						matches are either followed by another complete syllable, a blank, the end of
287						string/line or some non-Lao character. This ensures correct matching of
288						ambiguous syllable boundaries where the core consonant of a following syllable
289						could also be an end consonant of the current one.
290
291						=cut
292
293						sub get_sylre_full {
294	4		4	1	451	my $syl_short = get_sylre_basic();
295	4				1582	return qr/ $syl_short (?= \P{Lao} \| \s \| $ \| $syl_short ) /x;
296						}
297
298						=head2 get_sylre_named
299
300						The expression returned is the same as for L but also includes
301						named captures that upon a successful match allow to get the syllable's parts
302						from C<%+>.
303
304						=cut
305
306						sub get_sylre_named {
307	3		3	1	6	my $syl_short = get_sylre_basic();
308	3				13	my $syl_capture = $re_basic;
309	3				5	for my $atom (@sorted_x_names) {
310	48				749	$syl_capture =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg;
	483				479
311						}
312
313	3				2757	return qr/ $syl_capture (?= \P{Lao} \| \s \| $ \| $syl_short )/x;
314						}
315
316	0		0	0	0	sub get_consonants { return $CONSONANTS; }
317	0		0	0	0	sub get_vowels { return $VOWELS; }
318	0		0	0	0	sub get_tone_marks { return $TONE_MARKS; }
319
320	112		112	0	277	sub is_long_vowel { return $VOWEL_LENGTH{+shift} }
321
322						sub _named_capture {
323	483		483		461	my ($fragments, $atom, $match) = @_;
324
325						return sprintf(
326						'(?<%s> %s)',
327						$capture_names{$atom}, $fragments->{$match}
328	483	100			1971	) if defined $capture_names{$atom};
329
330	48				106	return $fragments->{$match};
331						}
332
333						1;