File Coverage

blib/lib/Lingua/LO/Transform/Data.pm

Criterion	Covered	Total	%
statement	40	40	100.0
branch	2	2	100.0
condition			n/a
subroutine	13	13	100.0
pod	3	4	75.0
total	58	59	98.3

line	stmt	bran	sub	pod	time	code
1						package Lingua::LO::Transform::Data;
2	5		5		49238	use strict;
	5				6
	5				118
3	5		5		15	use warnings;
	5				5
	5				120
4	5		5		81	use 5.012000;
	5				17
5	5		5		16	use utf8;
	5				9
	5				780
6	5		5		102	use feature 'unicode_strings';
	5				5
	5				516
7	5		5		843	use version 0.77; our $VERSION = version->declare('v0.0.1');
	5				2670
	5				24
8	5		5		1292	use charnames qw/ :full lao /;
	5				43438
	5				32
9	5		5		21260	use parent 'Exporter';
	5				1244
	5				25
10
11						=encoding UTF-8
12
13						=head1 NAME
14
15						Lingua::LO::Transform::Data - Helper module to keep common read-only data
16
17						=head1 FUNCTION
18
19						Provides a few functions that return regular expressions for matching and
20						extracting parts from Lao syllables. Instead of hardcoding these expressions as
21						strings, they are constructed from ragments at runtime, trading maintainability
22						for a small one-time initialization cost.
23
24						Also holds common read-only data such as vowel classifications.
25
26						You will probably not want to use this module on its own. If you do, see the
27						other L modules for examples.
28
29						=cut
30
31						our @EXPORT_OK = qw/ get_sylre_basic get_sylre_full get_sylre_named is_long_vowel /;
32
33						# Regular expression fragments. The cryptic names correspond to the naming
34						# in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking
35						# Using what looks like interpolated variables in single-quoted strings is
36						# intentional; the interpolation is done manually later to be able to construct
37						# expressions with and without named captures.
38						my %regexp_fragments = (
39						x0_1 => 'ເ',
40						x0_2 => 'ແ',
41						x0_3 => 'ໂ',
42						x0_4 => 'ໄ',
43						x0_5 => 'ໃ',
44
45						x1 => 'ຫ',
46
47						x => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]',
48
49						x2 => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]",
50
51						x3 => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]",
52
53						x4_12 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]",
54						x4_34 => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
55						x4_5 => "\N{LAO NIGGAHITA}",
56						x4_6 => "\N{LAO VOWEL SIGN MAI KON}",
57						x4_7 => "\N{LAO VOWEL SIGN MAI KAN}",
58						x4_1t4 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
59
60						x5 => "[\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}]",
61
62						x6_1 => 'ວ',
63						x6_2 => 'ອ',
64						x6_3 => 'ຽ',
65						x6 => '[ວອຽ]',
66
67						x7_1 => 'ະ',
68						x7_2 => 'າ',
69						x7_3 => "\N{LAO VOWEL SIGN AM}",
70
71						x8_3t8 => '[ຍດນມຢບ]',
72						x8 => '[ກງຍດນມຢບວ]',
73
74						x9 => '[ຈສຊພຟລ]',
75
76						x10_12 => '[ຯໆ]',
77						x10_3 => "\N{LAO CANCELLATION MARK}",
78
79						x9a10_3 => '(?: $x9 $x10_3)',
80						);
81						my $re1_all = '$x0_1 $x1? $x $x2?';
82						my $re1_1 = '$x5? $x8? $x9a10_3?';
83						my $re1_2 = '$x4_12 $x5? $x8? $x9a10_3?';
84						my $re1_3 = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?';
85						my $re1_4 = '$x7_2? $x7_1';
86						my $re1_5 = '$x4_6 $x5? $x7_2';
87						my $re1_6 = '$x4_7 $x5? $x8 $x9a10_3?';
88						my $re1_8 = '$x4_7? $x5? $x6_3';
89
90						my $re2_all = '$x0_2 $x1? $x $x2?';
91						my $re2_1 = '$x5? $x6? $x8? $x9a10_3?';
92						my $re2_2 = '$x7_1';
93						my $re2_3 = '$x4_7 $x5? $x8 $x9a10_3?';
94
95						my $re3_all = '$x0_3 $x1? $x $x2?';
96						my $re3_1 = '$x5? $x8? $x9a10_3?';
97						my $re3_2 = '$x7_1';
98						my $re3_3 = '$x4_7 $x5? $x8_3t8?';
99
100						my $re4 = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?';
101
102						my $re5 = '$x0_5 $x1? $x $x2? $x5? $x6_1?';
103
104						my $re6 = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?';
105
106						my $re7 = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?';
107
108						my $re8 = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?';
109
110						my $re9 = '$x1? $x $x2? $x4_6 $x5? (?: $x8 $x9a10_3? \| $x6_1 $x7_1 )';
111
112						my $re10 = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?';
113
114						my $re11 = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?';
115
116						my $re12 = '$x1? $x $x2? $x5? $x7_1';
117
118						my $re13 = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?';
119
120						my $re14 = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?';
121
122						my $re_num = '[໑໒໓໔໕໖໗໘໙໐]';
123
124						my $rex1012 = '$x10_12';
125
126						# This is the basic regexp that matches a syllable, still with variables to be
127						# substituted
128						my $re_basic = <
129						(?:
130						(?:
131						(?: $re1_all (?: $re1_1 \| $re1_2 \| $re1_3 \| $re1_4 \| $re1_5 \| $re1_6 \| $re1_8 ) ) \|
132						(?: $re2_all (?: $re2_1 \| $re2_2 \| $re2_3 ) ) \|
133						(?: $re3_all (?: $re3_1 \| $re3_2 \| $re3_3 ) ) \|
134						$re4 \| $re5 \| $re6 \| $re7 \| $re8 \| $re9 \|
135						$re10 \| $re11 \| $re12 \| $re13 \| $re14
136						) $rex1012? \|
137						$re_num+
138						)
139						EOF
140						$re_basic =~ s/\n//gs;
141						$re_basic =~ s/\s+/ /g; # keep it a bit more readable. could use s/\s+//g
142
143						# Functional names for all the x-something groups from the original paper
144						# Used for named catures.
145						my %capture_names = (
146						'x' => 'consonant',
147						'x0_\d' => 'vowel0',
148						'x1' => 'h',
149						'x2' => 'semivowel',
150						'x3' => 'vowel1',
151						'x4_[1-9t]{1,3}'=> 'vowel1',
152						'x5' => 'tone_mark',
153						'x6' => 'vowel2',
154						'x6_\d' => 'vowel2',
155						'x7_\d' => 'vowel3',
156						'x8' => 'end_consonant',
157						'x8_3t8' => 'end_consonant',
158						'x9' => 'foreign_consonant',
159						'x10_12' => 'extra',
160						'x10_3' => 'cancel',
161						);
162
163						# Substitute longer fragment names first so their matches don't get swallowed
164						# by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)'
165						# so we have to do it first.
166						my @sorted_x_names = ('x9a10_3', reverse sort { length $a <=> length $b } keys %capture_names);
167
168						my %VOWEL_LENGTH = (
169						### Monophthongs
170						'Xະ' => 0, # /a/
171						'Xັ' => 0, # /a/ with end consonant
172						'Xາ' => 1, # /aː/
173
174						'Xິ' => 0, # /i/
175						'Xີ' => 1, # /iː/
176
177						'Xຶ' => 0, # /ɯ/
178						'Xື' => 1, # /ɯː/
179
180						'Xຸ' => 0, # /u/
181						'Xູ' => 1, # /uː/
182
183						'ເXະ' => 0, # /e/
184						'ເXັ' => 0, # /e/ with end consonant
185						'ເX' => 1, # /eː/
186
187						'ແXະ' => 0, # /ɛ/
188						'ແXັ' => 0, # /ɛ/ with end consonant
189						'ແX' => 1, # /ɛː/
190
191						'ໂXະ' => 0, # /o/
192						'Xົ' => 0, # /o/
193						'ໂX' => 1, # /oː/
194
195						'ເXາະ' => 0, # /ɔ/
196						'Xັອ' => 0, # /ɔ/ with end consonant
197						'Xໍ' => 1, # /ɔː/
198						'Xອ' => 1, # /ɔː/ with end consonant
199
200						'ເXິ' => 0, # /ɤ/
201						'ເXີ' => 1, # /ɤː/
202
203						###' Diphthongs
204						'ເXັຍ' => 0, # /iə/
205						'Xັຽ' => 0, # /iə/
206						'ເXຍ' => 1, # /iːə/
207						'Xຽ' => 1, # /iːə/
208
209						'ເXຶອ' => 0, # /ɯə/
210						'ເXືອ' => 1, # /ɯːə/
211
212						'Xົວະ' => 0, # /uə/
213						'Xັວ' => 0, # /uə/
214						'Xົວ' => 1, # /uːə/
215						'Xວ' => 1, # /uːə/ with end consonant
216
217						'ໄX' => 1, # /aj/ - Actually short but counts as long for rules
218						'ໃX' => 1, # /aj/ - Actually short but counts as long for rules
219						'Xາຍ' => 1, # /aj/ - Actually short but counts as long for rules
220						'Xັຍ' => 0, # /aj/
221
222						'ເXົາ' => 0, # /aw/
223						'Xໍາ' => 0, # /am/
224						);
225						{
226						# Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit.
227						my %v;
228						foreach my $v (keys %VOWEL_LENGTH) {
229						(my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/;
230						$v{$w} = $VOWEL_LENGTH{$v};
231						}
232						%VOWEL_LENGTH = %v;
233						}
234
235
236						=head1 FUNCTIONS
237
238						=head2 get_sylre_basic
239
240						Returns a basic regexp that can match a Lao syllable. It consists of a bunch of
241						alternations and will thus return the I possible match which is neither
242						guaranteed to be the longest nor the appropriate one in a longer sequence of
243						characters. It is useful as a building block and for verifying syllables
244						though.
245
246						=cut
247
248						sub get_sylre_basic {
249	11		11	1	28	my $syl_re = $re_basic;
250	11				19	for my $atom (@sorted_x_names) {
251	176				2151	$syl_re =~ s/\$($atom)/$regexp_fragments{$1}/eg;
	1771				4058
252						}
253
254	11				2119	return qr/ $syl_re /x;
255						}
256
257						=head2 get_sylre_full
258
259						In addition to the matching done by L, this one makes sure
260						matches are either followed by another complete syllable, a blank, the end of
261						string/line or some non-Lao character. This ensures correct matching of
262						ambiguous syllable boundaries where the core consonant of a following syllable
263						could also be an end consonant of the current one.
264
265						=cut
266
267						sub get_sylre_full {
268	4		4	1	458	my $syl_short = get_sylre_basic();
269	4				1680	return qr/ $syl_short (?= \P{Lao} \| \s \| $ \| $syl_short ) /x;
270						}
271
272						=head2 get_sylre_named
273
274						The expression returned is the same as for L but also includes
275						named captures that upon a successful match allow to get the syllable's parts
276						from C<%+>.
277
278						=cut
279
280						sub get_sylre_named {
281	3		3	1	8	my $syl_short = get_sylre_basic();
282	3				12	my $syl_capture = $re_basic;
283	3				5	for my $atom (@sorted_x_names) {
284	48				739	$syl_capture =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg;
	483				499
285						}
286
287	3				1602	return qr/ $syl_capture (?= \P{Lao} \| \s \| $ \| $syl_short )/x;
288						}
289
290	112		112	0	283	sub is_long_vowel { return $VOWEL_LENGTH{+shift} }
291
292						sub _named_capture {
293	483		483		460	my ($fragments, $atom, $match) = @_;
294
295						return sprintf(
296						'(?<%s> %s)',
297						$capture_names{$atom}, $fragments->{$match}
298	483	100			1902	) if defined $capture_names{$atom};
299
300	48				105	return $fragments->{$match};
301						}
302
303						1;