| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Lingua::LO::Transform::Data; | 
| 2 | 5 |  |  | 5 |  | 46946 | use strict; | 
|  | 5 |  |  |  |  | 6 |  | 
|  | 5 |  |  |  |  | 113 |  | 
| 3 | 5 |  |  | 5 |  | 14 | use warnings; | 
|  | 5 |  |  |  |  | 5 |  | 
|  | 5 |  |  |  |  | 95 |  | 
| 4 | 5 |  |  | 5 |  | 68 | use 5.012000; | 
|  | 5 |  |  |  |  | 16 |  | 
| 5 | 5 |  |  | 5 |  | 14 | use utf8; | 
|  | 5 |  |  |  |  | 5 |  | 
|  | 5 |  |  |  |  | 23 |  | 
| 6 | 5 |  |  | 5 |  | 85 | use feature 'unicode_strings'; | 
|  | 5 |  |  |  |  | 651 |  | 
|  | 5 |  |  |  |  | 375 |  | 
| 7 | 5 |  |  | 5 |  | 819 | use version 0.77; our $VERSION = version->declare('v0.0.1'); | 
|  | 5 |  |  |  |  | 2559 |  | 
|  | 5 |  |  |  |  | 24 |  | 
| 8 | 5 |  |  | 5 |  | 1313 | use charnames qw/ :full lao /; | 
|  | 5 |  |  |  |  | 42919 |  | 
|  | 5 |  |  |  |  | 23 |  | 
| 9 | 5 |  |  | 5 |  | 21169 | use parent 'Exporter'; | 
|  | 5 |  |  |  |  | 1150 |  | 
|  | 5 |  |  |  |  | 22 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | =encoding UTF-8 | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | =head1 NAME | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | Lingua::LO::Transform::Data - Helper module to keep common read-only data | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | =head1 FUNCTION | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | Provides a few functions that return regular expressions for matching and | 
| 20 |  |  |  |  |  |  | extracting parts from Lao syllables. Instead of hardcoding these expressions as | 
| 21 |  |  |  |  |  |  | strings, they are constructed from ragments at runtime, trading maintainability | 
| 22 |  |  |  |  |  |  | for a small one-time initialization cost. | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | Also holds common read-only data such as vowel classifications. | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | You will probably not want to use this module on its own. If you do, see the | 
| 27 |  |  |  |  |  |  | other L modules for examples. | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | =cut | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | our %EXPORT_TAGS = ( | 
| 32 |  |  |  |  |  |  | all => [ qw/ | 
| 33 |  |  |  |  |  |  | get_sylre_basic get_sylre_full get_sylre_named is_long_vowel | 
| 34 |  |  |  |  |  |  | get_consonants get_vowels get_tone_marks | 
| 35 |  |  |  |  |  |  | / | 
| 36 |  |  |  |  |  |  | ] | 
| 37 |  |  |  |  |  |  | ); | 
| 38 |  |  |  |  |  |  | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | # Character classes | 
| 41 |  |  |  |  |  |  | my $TONE_MARKS = "\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}" . | 
| 42 |  |  |  |  |  |  | "\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}"; | 
| 43 |  |  |  |  |  |  | my $CONSONANTS = "\N{LAO LETTER KO}\N{LAO LETTER KHO SUNG}\N{LAO LETTER KHO TAM}" . | 
| 44 |  |  |  |  |  |  | "\N{LAO LETTER NGO}\N{LAO LETTER CO}\N{LAO LETTER SO TAM}\N{LAO LETTER NYO}" . | 
| 45 |  |  |  |  |  |  | "\N{LAO LETTER DO}\N{LAO LETTER TO}\N{LAO LETTER THO SUNG}\N{LAO LETTER THO TAM}" . | 
| 46 |  |  |  |  |  |  | "\N{LAO LETTER NO}\N{LAO LETTER BO}\N{LAO LETTER PO}\N{LAO LETTER PHO SUNG}" . | 
| 47 |  |  |  |  |  |  | "\N{LAO LETTER FO TAM}\N{LAO LETTER PHO TAM}\N{LAO LETTER FO SUNG}" . | 
| 48 |  |  |  |  |  |  | "\N{LAO LETTER MO}\N{LAO LETTER YO}\N{LAO LETTER LO LING}\N{LAO LETTER LO LOOT}" . | 
| 49 |  |  |  |  |  |  | "\N{LAO LETTER WO}\N{LAO LETTER SO SUNG}\N{LAO LETTER HO SUNG}\N{LAO LETTER O}" . | 
| 50 |  |  |  |  |  |  | "\N{LAO LETTER HO TAM}"; | 
| 51 |  |  |  |  |  |  | my $VOWELS = "\N{LAO VOWEL SIGN A}\N{LAO VOWEL SIGN MAI KAN}\N{LAO VOWEL SIGN AA}" . | 
| 52 |  |  |  |  |  |  | "\N{LAO VOWEL SIGN AM}\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" . | 
| 53 |  |  |  |  |  |  | "\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" . | 
| 54 |  |  |  |  |  |  | "\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO SEMIVOWEL SIGN LO}" . | 
| 55 |  |  |  |  |  |  | "\N{LAO SEMIVOWEL SIGN NYO}\N{LAO VOWEL SIGN E}\N{LAO VOWEL SIGN EI}" . | 
| 56 |  |  |  |  |  |  | "\N{LAO VOWEL SIGN O}\N{LAO VOWEL SIGN AY}\N{LAO VOWEL SIGN AI}" . | 
| 57 |  |  |  |  |  |  | "\N{LAO NIGGAHITA}"; | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | # Regular expression fragments. The cryptic names correspond to the naming | 
| 60 |  |  |  |  |  |  | # in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking | 
| 61 |  |  |  |  |  |  | # Using what looks like interpolated variables in single-quoted strings is | 
| 62 |  |  |  |  |  |  | # intentional; the interpolation is done manually later to be able to construct | 
| 63 |  |  |  |  |  |  | # expressions with and without named captures. | 
| 64 |  |  |  |  |  |  | my %regexp_fragments = ( | 
| 65 |  |  |  |  |  |  | x0_1    => 'ເ', | 
| 66 |  |  |  |  |  |  | x0_2    => 'ແ', | 
| 67 |  |  |  |  |  |  | x0_3    => 'ໂ', | 
| 68 |  |  |  |  |  |  | x0_4    => 'ໄ', | 
| 69 |  |  |  |  |  |  | x0_5    => 'ໃ', | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | x1      => 'ຫ', | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | x       => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]', | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | x2      => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]", | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | x3      => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]", | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | x4_12   => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]", | 
| 80 |  |  |  |  |  |  | x4_34   => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]", | 
| 81 |  |  |  |  |  |  | x4_5    => "\N{LAO NIGGAHITA}", | 
| 82 |  |  |  |  |  |  | x4_6    => "\N{LAO VOWEL SIGN MAI KON}", | 
| 83 |  |  |  |  |  |  | x4_7    => "\N{LAO VOWEL SIGN MAI KAN}", | 
| 84 |  |  |  |  |  |  | x4_1t4  => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]", | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | x5      => "[$TONE_MARKS]", | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | x6_1    => 'ວ', | 
| 89 |  |  |  |  |  |  | x6_2    => 'ອ', | 
| 90 |  |  |  |  |  |  | x6_3    => 'ຽ', | 
| 91 |  |  |  |  |  |  | x6      => '[ວອຽ]', | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | x7_1    => 'ະ', | 
| 94 |  |  |  |  |  |  | x7_2    => 'າ', | 
| 95 |  |  |  |  |  |  | x7_3    => "\N{LAO VOWEL SIGN AM}", | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | x8_3t8  => '[ຍດນມຢບ]', | 
| 98 |  |  |  |  |  |  | x8      => '[ກງຍດນມຢບວ]', | 
| 99 |  |  |  |  |  |  |  | 
| 100 |  |  |  |  |  |  | x9      => '[ຈສຊພຟລ]', | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | x10_12  => '[ຯໆ]', | 
| 103 |  |  |  |  |  |  | x10_3   => "\N{LAO CANCELLATION MARK}", | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | x9a10_3 => '(?: $x9 $x10_3)', | 
| 106 |  |  |  |  |  |  | ); | 
| 107 |  |  |  |  |  |  | my $re1_all = '$x0_1 $x1? $x $x2?'; | 
| 108 |  |  |  |  |  |  | my $re1_1   = '$x5? $x8? $x9a10_3?'; | 
| 109 |  |  |  |  |  |  | my $re1_2   = '$x4_12 $x5? $x8? $x9a10_3?'; | 
| 110 |  |  |  |  |  |  | my $re1_3   = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?'; | 
| 111 |  |  |  |  |  |  | my $re1_4   = '$x7_2? $x7_1'; | 
| 112 |  |  |  |  |  |  | my $re1_5   = '$x4_6 $x5? $x7_2'; | 
| 113 |  |  |  |  |  |  | my $re1_6   = '$x4_7 $x5? $x8  $x9a10_3?'; | 
| 114 |  |  |  |  |  |  | my $re1_8   = '$x4_7? $x5? $x6_3'; | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | my $re2_all = '$x0_2 $x1? $x $x2?'; | 
| 117 |  |  |  |  |  |  | my $re2_1   = '$x5? $x6? $x8? $x9a10_3?'; | 
| 118 |  |  |  |  |  |  | my $re2_2   = '$x7_1'; | 
| 119 |  |  |  |  |  |  | my $re2_3   = '$x4_7 $x5? $x8 $x9a10_3?'; | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | my $re3_all = '$x0_3 $x1? $x $x2?'; | 
| 122 |  |  |  |  |  |  | my $re3_1   = '$x5? $x8? $x9a10_3?'; | 
| 123 |  |  |  |  |  |  | my $re3_2   = '$x7_1'; | 
| 124 |  |  |  |  |  |  | my $re3_3   = '$x4_7 $x5? $x8_3t8?'; | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  | my $re4     = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?'; | 
| 127 |  |  |  |  |  |  |  | 
| 128 |  |  |  |  |  |  | my $re5     = '$x0_5 $x1? $x $x2? $x5? $x6_1?'; | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | my $re6     = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?'; | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | my $re7     = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?'; | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | my $re8     = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?'; | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | my $re9     = '$x1? $x $x2? $x4_6 $x5? (?: $x8 $x9a10_3? | $x6_1 $x7_1 )'; | 
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | my $re10    = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?'; | 
| 139 |  |  |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | my $re11    = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?'; | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | my $re12    = '$x1? $x $x2? $x5? $x7_1'; | 
| 143 |  |  |  |  |  |  |  | 
| 144 |  |  |  |  |  |  | my $re13    = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?'; | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | my $re14    = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?'; | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | my $re_num  = '[໑໒໓໔໕໖໗໘໙໐]'; | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | my $rex1012 = '$x10_12'; | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | # This is the basic regexp that matches a syllable, still with variables to be | 
| 153 |  |  |  |  |  |  | # substituted | 
| 154 |  |  |  |  |  |  | my $re_basic = < | 
| 155 |  |  |  |  |  |  | (?: | 
| 156 |  |  |  |  |  |  | (?: | 
| 157 |  |  |  |  |  |  | (?: $re1_all (?: $re1_1 | $re1_2 | $re1_3 | $re1_4 | $re1_5 | $re1_6 | $re1_8 ) ) | | 
| 158 |  |  |  |  |  |  | (?: $re2_all (?: $re2_1 | $re2_2 | $re2_3 ) ) | | 
| 159 |  |  |  |  |  |  | (?: $re3_all (?: $re3_1 | $re3_2 | $re3_3 ) ) | | 
| 160 |  |  |  |  |  |  | $re4  | $re5  | $re6  | $re7  | $re8  | $re9  | | 
| 161 |  |  |  |  |  |  | $re10 | $re11 | $re12 | $re13 | $re14 | 
| 162 |  |  |  |  |  |  | ) $rex1012? | | 
| 163 |  |  |  |  |  |  | $re_num+ | 
| 164 |  |  |  |  |  |  | ) | 
| 165 |  |  |  |  |  |  | EOF | 
| 166 |  |  |  |  |  |  | $re_basic =~ s/\n//gs; | 
| 167 |  |  |  |  |  |  | $re_basic =~ s/\s+/ /g; # keep it a bit more readable. could use s/\s+//g | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | # Functional names for all the x-something groups from the original paper | 
| 170 |  |  |  |  |  |  | # Used for named catures. | 
| 171 |  |  |  |  |  |  | my %capture_names = ( | 
| 172 |  |  |  |  |  |  | 'x'             => 'consonant', | 
| 173 |  |  |  |  |  |  | 'x0_\d'         => 'vowel0', | 
| 174 |  |  |  |  |  |  | 'x1'            => 'h', | 
| 175 |  |  |  |  |  |  | 'x2'            => 'semivowel', | 
| 176 |  |  |  |  |  |  | 'x3'            => 'vowel1', | 
| 177 |  |  |  |  |  |  | 'x4_[1-9t]{1,3}'=> 'vowel1', | 
| 178 |  |  |  |  |  |  | 'x5'            => 'tone_mark', | 
| 179 |  |  |  |  |  |  | 'x6'            => 'vowel2', | 
| 180 |  |  |  |  |  |  | 'x6_\d'         => 'vowel2', | 
| 181 |  |  |  |  |  |  | 'x7_\d'         => 'vowel3', | 
| 182 |  |  |  |  |  |  | 'x8'            => 'end_consonant', | 
| 183 |  |  |  |  |  |  | 'x8_3t8'        => 'end_consonant', | 
| 184 |  |  |  |  |  |  | 'x9'            => 'foreign_consonant', | 
| 185 |  |  |  |  |  |  | 'x10_12'        => 'extra', | 
| 186 |  |  |  |  |  |  | 'x10_3'         => 'cancel', | 
| 187 |  |  |  |  |  |  | ); | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | # Substitute longer fragment names first so their matches don't get swallowed | 
| 190 |  |  |  |  |  |  | # by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)' | 
| 191 |  |  |  |  |  |  | # so we have to do it first. | 
| 192 |  |  |  |  |  |  | my @sorted_x_names = ('x9a10_3', reverse sort { length $a <=> length $b } keys %capture_names); | 
| 193 |  |  |  |  |  |  |  | 
| 194 |  |  |  |  |  |  | my %VOWEL_LENGTH = ( | 
| 195 |  |  |  |  |  |  | ### Monophthongs | 
| 196 |  |  |  |  |  |  | 'Xະ'   => 0,  # /a/ | 
| 197 |  |  |  |  |  |  | 'Xັ'    => 0,  # /a/ with end consonant | 
| 198 |  |  |  |  |  |  | 'Xາ'   => 1,  # /aː/ | 
| 199 |  |  |  |  |  |  |  | 
| 200 |  |  |  |  |  |  | 'Xິ'    => 0,  # /i/ | 
| 201 |  |  |  |  |  |  | 'Xີ'    => 1,  # /iː/ | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | 'Xຶ'    => 0,  # /ɯ/ | 
| 204 |  |  |  |  |  |  | 'Xື'    => 1,  # /ɯː/ | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | 'Xຸ'    => 0,  # /u/ | 
| 207 |  |  |  |  |  |  | 'Xູ'    => 1,  # /uː/ | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | 'ເXະ'  => 0,  # /e/ | 
| 210 |  |  |  |  |  |  | 'ເXັ'   => 0,  # /e/ with end consonant | 
| 211 |  |  |  |  |  |  | 'ເX'   => 1,  # /eː/ | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | 'ແXະ'  => 0,  # /ɛ/ | 
| 214 |  |  |  |  |  |  | 'ແXັ'   => 0,  # /ɛ/ with end consonant | 
| 215 |  |  |  |  |  |  | 'ແX'   => 1,  # /ɛː/ | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | 'ໂXະ'  => 0,  # /o/ | 
| 218 |  |  |  |  |  |  | 'Xົ'    => 0,  # /o/ | 
| 219 |  |  |  |  |  |  | 'ໂX'   => 1,  # /oː/ | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | 'ເXາະ' => 0,  # /ɔ/ | 
| 222 |  |  |  |  |  |  | 'Xັອ'   => 0,  # /ɔ/ with end consonant | 
| 223 |  |  |  |  |  |  | 'Xໍ'    => 1,  # /ɔː/ | 
| 224 |  |  |  |  |  |  | 'Xອ'   => 1,  # /ɔː/ with end consonant | 
| 225 |  |  |  |  |  |  |  | 
| 226 |  |  |  |  |  |  | 'ເXິ'   => 0,  # /ɤ/ | 
| 227 |  |  |  |  |  |  | 'ເXີ'   => 1,  # /ɤː/ | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | ###' Diphthongs | 
| 230 |  |  |  |  |  |  | 'ເXັຍ'  => 0,  # /iə/ | 
| 231 |  |  |  |  |  |  | 'Xັຽ'   => 0,  # /iə/ | 
| 232 |  |  |  |  |  |  | 'ເXຍ'  => 1,  # /iːə/ | 
| 233 |  |  |  |  |  |  | 'Xຽ'   => 1,  # /iːə/ | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | 'ເXຶອ'  => 0,  # /ɯə/ | 
| 236 |  |  |  |  |  |  | 'ເXືອ'  => 1,  # /ɯːə/ | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | 'Xົວະ'  => 0,  # /uə/ | 
| 239 |  |  |  |  |  |  | 'Xັວ'   => 0,  # /uə/ | 
| 240 |  |  |  |  |  |  | 'Xົວ'   => 1,  # /uːə/ | 
| 241 |  |  |  |  |  |  | 'Xວ'   => 1,  # /uːə/ with end consonant | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | 'ໄX'   => 1,  # /aj/ - Actually short but counts as long for rules | 
| 244 |  |  |  |  |  |  | 'ໃX'   => 1,  # /aj/ - Actually short but counts as long for rules | 
| 245 |  |  |  |  |  |  | 'Xາຍ'  => 1,  # /aj/ - Actually short but counts as long for rules | 
| 246 |  |  |  |  |  |  | 'Xັຍ'   => 0,  # /aj/ | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | 'ເXົາ'  => 0,  # /aw/ | 
| 249 |  |  |  |  |  |  | 'Xໍາ'   => 0,  # /am/ | 
| 250 |  |  |  |  |  |  | ); | 
| 251 |  |  |  |  |  |  | { | 
| 252 |  |  |  |  |  |  | # Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit. | 
| 253 |  |  |  |  |  |  | my %v; | 
| 254 |  |  |  |  |  |  | foreach my $v (keys %VOWEL_LENGTH) { | 
| 255 |  |  |  |  |  |  | (my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/; | 
| 256 |  |  |  |  |  |  | $v{$w} = $VOWEL_LENGTH{$v}; | 
| 257 |  |  |  |  |  |  | } | 
| 258 |  |  |  |  |  |  | %VOWEL_LENGTH = %v; | 
| 259 |  |  |  |  |  |  | } | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | =head1 FUNCTIONS | 
| 263 |  |  |  |  |  |  |  | 
| 264 |  |  |  |  |  |  | =head2 get_sylre_basic | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  | Returns a basic regexp that can match a Lao syllable. It consists of a bunch of | 
| 267 |  |  |  |  |  |  | alternations and will thus return the I possible match which is neither | 
| 268 |  |  |  |  |  |  | guaranteed to be the longest nor the appropriate one in a longer sequence of | 
| 269 |  |  |  |  |  |  | characters. It is useful as a building block and for verifying syllables | 
| 270 |  |  |  |  |  |  | though. | 
| 271 |  |  |  |  |  |  |  | 
| 272 |  |  |  |  |  |  | =cut | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | sub get_sylre_basic { | 
| 275 | 11 |  |  | 11 | 1 | 26 | my $syl_re = $re_basic; | 
| 276 | 11 |  |  |  |  | 20 | for my $atom (@sorted_x_names) { | 
| 277 | 176 |  |  |  |  | 2129 | $syl_re =~ s/\$($atom)/$regexp_fragments{$1}/eg; | 
|  | 1771 |  |  |  |  | 3758 |  | 
| 278 |  |  |  |  |  |  | } | 
| 279 |  |  |  |  |  |  |  | 
| 280 | 11 |  |  |  |  | 2010 | return qr/ $syl_re /x; | 
| 281 |  |  |  |  |  |  | } | 
| 282 |  |  |  |  |  |  |  | 
| 283 |  |  |  |  |  |  | =head2 get_sylre_full | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | In addition to the matching done by L, this one makes sure | 
| 286 |  |  |  |  |  |  | matches are either followed by another complete syllable, a blank, the end of | 
| 287 |  |  |  |  |  |  | string/line or some non-Lao character. This ensures correct matching of | 
| 288 |  |  |  |  |  |  | ambiguous syllable boundaries where the core consonant of a following syllable | 
| 289 |  |  |  |  |  |  | could also be an end consonant of the current one. | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | =cut | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | sub get_sylre_full { | 
| 294 | 4 |  |  | 4 | 1 | 451 | my $syl_short = get_sylre_basic(); | 
| 295 | 4 |  |  |  |  | 1582 | return qr/ $syl_short (?= \P{Lao} | \s | $ | $syl_short ) /x; | 
| 296 |  |  |  |  |  |  | } | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | =head2 get_sylre_named | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | The expression returned is the same as for L but also includes | 
| 301 |  |  |  |  |  |  | named captures that upon a successful match allow to get the syllable's parts | 
| 302 |  |  |  |  |  |  | from C<%+>. | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | =cut | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | sub get_sylre_named { | 
| 307 | 3 |  |  | 3 | 1 | 6 | my $syl_short = get_sylre_basic(); | 
| 308 | 3 |  |  |  |  | 13 | my $syl_capture = $re_basic; | 
| 309 | 3 |  |  |  |  | 5 | for my $atom (@sorted_x_names) { | 
| 310 | 48 |  |  |  |  | 749 | $syl_capture =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg; | 
|  | 483 |  |  |  |  | 479 |  | 
| 311 |  |  |  |  |  |  | } | 
| 312 |  |  |  |  |  |  |  | 
| 313 | 3 |  |  |  |  | 2757 | return qr/ $syl_capture (?= \P{Lao} | \s | $ | $syl_short )/x; | 
| 314 |  |  |  |  |  |  | } | 
| 315 |  |  |  |  |  |  |  | 
| 316 | 0 |  |  | 0 | 0 | 0 | sub get_consonants { return $CONSONANTS; } | 
| 317 | 0 |  |  | 0 | 0 | 0 | sub get_vowels { return $VOWELS; } | 
| 318 | 0 |  |  | 0 | 0 | 0 | sub get_tone_marks { return $TONE_MARKS; } | 
| 319 |  |  |  |  |  |  |  | 
| 320 | 112 |  |  | 112 | 0 | 277 | sub is_long_vowel { return $VOWEL_LENGTH{+shift} } | 
| 321 |  |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  | sub _named_capture { | 
| 323 | 483 |  |  | 483 |  | 461 | my ($fragments, $atom, $match) = @_; | 
| 324 |  |  |  |  |  |  |  | 
| 325 |  |  |  |  |  |  | return sprintf( | 
| 326 |  |  |  |  |  |  | '(?<%s> %s)', | 
| 327 |  |  |  |  |  |  | $capture_names{$atom}, $fragments->{$match} | 
| 328 | 483 | 100 |  |  |  | 1971 | ) if defined $capture_names{$atom}; | 
| 329 |  |  |  |  |  |  |  | 
| 330 | 48 |  |  |  |  | 106 | return $fragments->{$match}; | 
| 331 |  |  |  |  |  |  | } | 
| 332 |  |  |  |  |  |  |  | 
| 333 |  |  |  |  |  |  | 1; |