| blib/lib/Encode/Korean/TransliteratorGenerator.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 89 | 267 | 33.3 |
| branch | 18 | 84 | 21.4 |
| condition | 2 | 21 | 9.5 |
| subroutine | 16 | 35 | 45.7 |
| pod | 0 | 32 | 0.0 |
| total | 125 | 439 | 28.4 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | # Korean Transliterator Generator | ||||||
| 2 | |||||||
| 3 | # $Id: TransliteratorGenerator.pm,v 1.7 2007/11/29 14:25:31 you Exp $ | ||||||
| 4 | |||||||
| 5 | package Encode::Korean::TransliteratorGenerator; | ||||||
| 6 | |||||||
| 7 | our $VERSION = do { q$Revision: 1.7 $ =~ /\d+\.(\d+)/; sprintf "%.2f", $1 / 100 }; | ||||||
| 8 | |||||||
| 9 | 11 | 11 | 22151 | use 5.008008; | |||
| 11 | 44 | ||||||
| 11 | 493 | ||||||
| 10 | 11 | 11 | 57 | use strict; | |||
| 11 | 22 | ||||||
| 11 | 364 | ||||||
| 11 | 11 | 11 | 53 | use warnings; | |||
| 11 | 21 | ||||||
| 11 | 40286 | ||||||
| 12 | |||||||
| 13 | # == CONSTANTS == | ||||||
| 14 | my $NotFound = '-1'; | ||||||
| 15 | |||||||
| 16 | my $CamelCase = 0; | ||||||
| 17 | my $GREEDY_SEP =1; | ||||||
| 18 | my $SMART_SEP = 2; | ||||||
| 19 | |||||||
| 20 | my %MODE = ( | ||||||
| 21 | 'CamelCase' => $CamelCase, | ||||||
| 22 | 'camel' => $CamelCase, | ||||||
| 23 | |||||||
| 24 | 'greedy_sep' => $GREEDY_SEP, | ||||||
| 25 | 'greedy' => $GREEDY_SEP, | ||||||
| 26 | |||||||
| 27 | 'smart_sep' => $SMART_SEP, | ||||||
| 28 | 'smart' => $SMART_SEP | ||||||
| 29 | ); | ||||||
| 30 | |||||||
| 31 | # == CONSTRUCTOR == | ||||||
| 32 | sub new { | ||||||
| 33 | 11 | 11 | 0 | 35 | my ($class) = @_; | ||
| 34 | 11 | 110 | my $self = { | ||||
| 35 | CONSONANTS => [], | ||||||
| 36 | VOWELS => [], | ||||||
| 37 | EL => undef, | ||||||
| 38 | ELL => undef, | ||||||
| 39 | NAUGHT => undef, | ||||||
| 40 | SEP => undef, | ||||||
| 41 | ENMODE => [], | ||||||
| 42 | DEMODE => [], | ||||||
| 43 | HEAD => [], | ||||||
| 44 | BODY => [], | ||||||
| 45 | FOOT => [], | ||||||
| 46 | HEADMAP => {}, | ||||||
| 47 | BODYMAP => {}, | ||||||
| 48 | FOOTMAP => {} | ||||||
| 49 | }; | ||||||
| 50 | |||||||
| 51 | 11 | 36 | bless $self, $class; | ||||
| 52 | 11 | 41 | return $self; | ||||
| 53 | } | ||||||
| 54 | |||||||
| 55 | # == METHODS == | ||||||
| 56 | # accessor | ||||||
| 57 | sub consonants { | ||||||
| 58 | 67 | 67 | 0 | 291 | my $self = shift; | ||
| 59 | 67 | 100 | 191 | if(@_) { | |||
| 60 | 11 | 19 | @{ $self->{CONSONANTS} } = @_; | ||||
| 11 | 90 | ||||||
| 61 | 11 | 41 | $self->head(@_); | ||||
| 62 | 11 | 264 | @ { $self->{FOOT} } = ( | ||||
| 11 | 73 | ||||||
| 63 | '', # NULL | ||||||
| 64 | $self->{CONSONANTS}->[0], # kiyeok (ㄱ) | ||||||
| 65 | $self->{CONSONANTS}->[1], # ssangkiyeok (ㄲ) | ||||||
| 66 | $self->{CONSONANTS}->[0] . $self->{CONSONANTS}->[9], # kiyeok sios (ㄳ) | ||||||
| 67 | $self->{CONSONANTS}->[2], # nieun (ㄴ) | ||||||
| 68 | $self->{CONSONANTS}->[2] . $self->{CONSONANTS}->[12], # nieun cieuc (ㄵ) | ||||||
| 69 | $self->{CONSONANTS}->[2] . $self->{CONSONANTS}->[18], # nieun hieuh (ㄶ) | ||||||
| 70 | $self->{CONSONANTS}->[3], # tikeut (ㄷ) | ||||||
| 71 | $self->{CONSONANTS}->[5], # rieul (ㄹ) | ||||||
| 72 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[0], # rieul kiyeok (ㄺ) | ||||||
| 73 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[6], # rieul mieum (ㄻ) | ||||||
| 74 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[7], # rieul pieup (ㄼ) | ||||||
| 75 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[9], # rieul sios (ㄽ) | ||||||
| 76 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[16], # rieul thieuth (ㄾ) | ||||||
| 77 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[17], # rieul phieuph (ㄿ) | ||||||
| 78 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[18], # rieul hieuh (ㅀ) | ||||||
| 79 | $self->{CONSONANTS}->[6], # mieum (ㅁ) | ||||||
| 80 | $self->{CONSONANTS}->[7], # pieup (ㅂ) | ||||||
| 81 | $self->{CONSONANTS}->[7] . $self->{CONSONANTS}->[9], # pieup sios (ㅄ) | ||||||
| 82 | $self->{CONSONANTS}->[9], # sios (ㅅ) | ||||||
| 83 | $self->{CONSONANTS}->[10], # ssangsios (ㅆ) | ||||||
| 84 | $self->{CONSONANTS}->[11], # ieung (ㅇ) | ||||||
| 85 | $self->{CONSONANTS}->[12], # cieuc (ㅈ) | ||||||
| 86 | $self->{CONSONANTS}->[14], # chieuch (ㅊ) | ||||||
| 87 | $self->{CONSONANTS}->[15], # khieukh (ㅋ) | ||||||
| 88 | $self->{CONSONANTS}->[16], # thieuth (ㅌ) | ||||||
| 89 | $self->{CONSONANTS}->[17], # phieuph (ㅍ) | ||||||
| 90 | $self->{CONSONANTS}->[18] # hieuh (ㅎ) | ||||||
| 91 | ); | ||||||
| 92 | } | ||||||
| 93 | 67 | 277 | return $self->{CONSONANTS}; | ||||
| 94 | } | ||||||
| 95 | |||||||
| 96 | sub head { | ||||||
| 97 | 858 | 858 | 0 | 888 | my $self = shift; | ||
| 98 | 858 | 100 | 1388 | if (@_) { @{ $self->{HEAD} } = @_; } | |||
| 11 | 23 | ||||||
| 11 | 49 | ||||||
| 99 | 858 | 2407 | return $self->{HEAD}; | ||||
| 100 | } | ||||||
| 101 | |||||||
| 102 | sub foot { | ||||||
| 103 | 691 | 691 | 0 | 750 | my $self = shift; | ||
| 104 | 691 | 50 | 1133 | if (@_) { @{ $self->{FOOT} } = @_; } | |||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 105 | 691 | 1907 | return $self->{FOOT}; | ||||
| 106 | } | ||||||
| 107 | |||||||
| 108 | # accessor | ||||||
| 109 | sub vowels { | ||||||
| 110 | 11 | 11 | 0 | 22 | my $self = shift; | ||
| 111 | 11 | 50 | 52 | if(@_) { | |||
| 112 | 11 | 23 | @{ $self->{VOWELS} } = @_; | ||||
| 11 | 62 | ||||||
| 113 | 11 | 46 | $self->body(@_); | ||||
| 114 | } | ||||||
| 115 | 11 | 32 | return $self->{VOWELS}; | ||||
| 116 | } | ||||||
| 117 | |||||||
| 118 | sub body { | ||||||
| 119 | 1177 | 1177 | 0 | 1160 | my $self = shift; | ||
| 120 | 1177 | 100 | 2094 | if (@_) { @{ $self->{BODY} } = @_; } | |||
| 11 | 22 | ||||||
| 11 | 45 | ||||||
| 121 | 1177 | 2948 | return $self->{BODY}; | ||||
| 122 | } | ||||||
| 123 | |||||||
| 124 | |||||||
| 125 | |||||||
| 126 | # accessor | ||||||
| 127 | sub el { | ||||||
| 128 | 8 | 8 | 0 | 15 | my $self = shift; | ||
| 129 | 8 | 50 | 26 | if(@_) { | |||
| 130 | 8 | 16 | $self->{EL} = shift; | ||||
| 131 | |||||||
| 132 | # Sets jongseongs with rieul | ||||||
| 133 | 8 | 28 | $self->foot->[8] = $self->{EL}; # rieul (ㄹ) | ||||
| 134 | 8 | 36 | $self->foot->[9] = $self->{EL} . $self->consonants->[0]; # rieul kiyeok (ㄺ) | ||||
| 135 | 8 | 63 | $self->foot->[10] = $self->{EL} . $self->consonants->[6]; # rieul mieum (ㄻ) | ||||
| 136 | 8 | 33 | $self->foot->[11] = $self->{EL} . $self->consonants->[7]; # rieul pieup (ㄼ) | ||||
| 137 | 8 | 28 | $self->foot->[12] = $self->{EL} . $self->consonants->[9]; # rieul sios (ㄽ) | ||||
| 138 | 8 | 43 | $self->foot->[13] = $self->{EL} . $self->consonants->[16]; # rieul thieuth (ㄾ) | ||||
| 139 | 8 | 24 | $self->foot->[14] = $self->{EL} . $self->consonants->[17]; # rieul phieuph (ㄿ) | ||||
| 140 | 8 | 28 | $self->foot->[15] = $self->{EL} . $self->consonants->[18]; # rieul hieuh (ㅀ) | ||||
| 141 | } | ||||||
| 142 | 8 | 23 | return $self->{EL}; | ||||
| 143 | } | ||||||
| 144 | |||||||
| 145 | # accessor | ||||||
| 146 | sub ell { | ||||||
| 147 | 8 | 8 | 0 | 14 | my $self = shift; | ||
| 148 | 8 | 50 | 36 | if(@_) { | |||
| 149 | 8 | 143 | $self->{ELL} = shift; | ||||
| 150 | } | ||||||
| 151 | 8 | 25 | return $self->{ELL}; | ||||
| 152 | } | ||||||
| 153 | |||||||
| 154 | # accessor | ||||||
| 155 | sub naught { | ||||||
| 156 | 10 | 10 | 0 | 21 | my $self = shift; | ||
| 157 | 10 | 50 | 39 | if(@_) { | |||
| 158 | 10 | 22 | $self->{NAUGHT} = shift; | ||||
| 159 | 10 | 25 | $self->{HEAD}->[11] = $self->{NAUGHT}; | ||||
| 160 | } | ||||||
| 161 | 10 | 38 | return $self->{NAUGHT}; | ||||
| 162 | } | ||||||
| 163 | |||||||
| 164 | # accessor | ||||||
| 165 | sub sep { | ||||||
| 166 | 11 | 11 | 0 | 23 | my $self = shift; | ||
| 167 | 11 | 50 | 52 | if(@_) { | |||
| 168 | 11 | 33 | $self->{SEP} = shift; | ||||
| 169 | } | ||||||
| 170 | 11 | 45 | return $self->{SEP}; | ||||
| 171 | } | ||||||
| 172 | |||||||
| 173 | # accessor | ||||||
| 174 | sub enmode { | ||||||
| 175 | 12 | 12 | 0 | 25 | my $self = shift; | ||
| 176 | 12 | 50 | 56 | if(@_) { | |||
| 177 | 12 | 35 | $self->{ENMODE} = shift; | ||||
| 178 | } | ||||||
| 179 | 12 | 46 | return $self->{ENMODE}; | ||||
| 180 | } | ||||||
| 181 | |||||||
| 182 | sub demode { | ||||||
| 183 | 10 | 10 | 0 | 18 | my $self = shift; | ||
| 184 | 10 | 50 | 37 | if(@_) { | |||
| 185 | 10 | 45 | $self->{DEMODE} = shift; | ||||
| 186 | } | ||||||
| 187 | 10 | 29 | return $self->{DEMODE}; | ||||
| 188 | } | ||||||
| 189 | |||||||
| 190 | |||||||
| 191 | |||||||
| 192 | sub make { | ||||||
| 193 | 11 | 11 | 0 | 52 | my $self = shift; | ||
| 194 | |||||||
| 195 | 11 | 29 | for ( my $i=0; $i <= $#{$self->head}; ++$i ) { | ||||
| 220 | 360 | ||||||
| 196 | 209 | 50 | 66 | 368 | if ($self->head->[$i] eq "" && $i != 11) { | ||
| 197 | #printf "error: empty slot. fill the transliteration for /%s/! ", |
||||||
| 198 | # encode::encode("utf8", $han_consonant[$i]); exit(1); | ||||||
| 199 | } | ||||||
| 200 | 209 | 50 | 414 | if (exists $self->{HEADMAP}->{$self->head->[$i]}) { | |||
| 201 | #print_mapping_error($self::head[$i], $self::head{$self::head[$i]}, $i); | ||||||
| 202 | 0 | 0 | exit(1); | ||||
| 203 | } else { | ||||||
| 204 | 209 | 394 | $self->{HEADMAP}->{$self->head->[$i]} = $i; | ||||
| 205 | }; | ||||||
| 206 | } | ||||||
| 207 | |||||||
| 208 | 11 | 25 | for ( my $i=0; $i <= $#{$self->body}; ++$i ) { | ||||
| 242 | 415 | ||||||
| 209 | 231 | 50 | 361 | if ($self->body->[$i] eq "") { | |||
| 210 | #printf "error: empty slot. fill the transliteration for /%s/! ", |
||||||
| 211 | # Encode::encode("utf8", $HAN_VOWEL[$i]); | ||||||
| 212 | 0 | 0 | exit(1); | ||||
| 213 | } | ||||||
| 214 | 231 | 50 | 477 | if (exists $self->{BODYMAP}->{$self->body->[$i]}) { | |||
| 215 | #print_mapping_error($self::BODY[$i], $self::BODY{$self::BODY[$i]}, $i); | ||||||
| 216 | 0 | 0 | exit(1); | ||||
| 217 | } else { | ||||||
| 218 | 231 | 508 | $self->{BODYMAP}->{$self->body->[$i]} = $i; | ||||
| 219 | }; | ||||||
| 220 | 231 | 458 | $self->{BODYMAP}->{$self->body->[$i]} = $i; | ||||
| 221 | } | ||||||
| 222 | 11 | 26 | for ( my $i=0; $i <= $#{$self->foot}; ++$i ) { | ||||
| 319 | 674 | ||||||
| 223 | 308 | 565 | $self->{FOOTMAP}->{$self->foot->[$i]} = $i; | ||||
| 224 | } | ||||||
| 225 | |||||||
| 226 | 11 | 43 | return $self; | ||||
| 227 | } | ||||||
| 228 | |||||||
| 229 | |||||||
| 230 | # encode($string [,$check]) | ||||||
| 231 | # = transliteration (romanization) | ||||||
| 232 | sub encode($$;$) { | ||||||
| 233 | 0 | 0 | 0 | my ($obj, $str, $chk) = @_; | |||
| 234 | 0 | my $tr = $obj->transliterate($str); | |||||
| 235 | 0 | 0 | $_[1] = '' if $chk; | ||||
| 236 | 0 | return $tr; | |||||
| 237 | } | ||||||
| 238 | |||||||
| 239 | # decode($octets [,$check]) | ||||||
| 240 | sub decode ($$;$) { | ||||||
| 241 | 0 | 0 | 0 | my ($obj, $str, $chk) = @_; | |||
| 242 | 0 | my $han = $obj->hangulize($str); | |||||
| 243 | 0 | 0 | $_[1] = '' if $chk; | ||||
| 244 | 0 | return $han; | |||||
| 245 | } | ||||||
| 246 | |||||||
| 247 | # to work with encoding pragma | ||||||
| 248 | # cat_decode($destination, $octets, $offset, $terminator [,$check]) | ||||||
| 249 | |||||||
| 250 | |||||||
| 251 | |||||||
| 252 | |||||||
| 253 | |||||||
| 254 | |||||||
| 255 | # = HAN TRANSLITERATOR = | ||||||
| 256 | # romanizer and hangulizer | ||||||
| 257 | |||||||
| 258 | # == hangul composer and decomposer == | ||||||
| 259 | # | ||||||
| 260 | # Unicode : 0xAC00 (가) -- 0xD7A3 (힣) | ||||||
| 261 | # | ||||||
| 262 | # foot (28 types) : 가각갂갃간갅갆갇갈갉갊갋갌갍갎갏감갑값갓갔강갖갗갘같갚갛 | ||||||
| 263 | # body (21 types) : 가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기 | ||||||
| 264 | # head (19 types) : 가까나다따라마바빠사싸아자짜차카타파하 | ||||||
| 265 | # | ||||||
| 266 | |||||||
| 267 | |||||||
| 268 | # === decompose === | ||||||
| 269 | # decomposes an unicode hangul chr into a hancode ($head, $body, $foot) | ||||||
| 270 | # for example, decompose('한') returns (18, 0, 4) | ||||||
| 271 | sub decompose { | ||||||
| 272 | 0 | 0 | 0 | my $self = shift; | |||
| 273 | |||||||
| 274 | 0 | my($chr) = @_; | |||||
| 275 | 0 | my $unicode = ord($chr); | |||||
| 276 | 0 | my $head = int(($unicode - 0xAC00) / (28*21)); | |||||
| 277 | 0 | my $body = int(($unicode - 0xAC00 - $head*28*21) /28); | |||||
| 278 | 0 | my $foot = $unicode - 0xAC00 - $head*28*21 - $body*28; | |||||
| 279 | 0 | return ($head, $body, $foot); | |||||
| 280 | } | ||||||
| 281 | |||||||
| 282 | # === compose === | ||||||
| 283 | # composes an unicode hangul chr from a hancode ($head, $body, $foot) | ||||||
| 284 | # for example, compose((18,0,4)) returns '한' | ||||||
| 285 | sub compose { | ||||||
| 286 | 0 | 0 | 0 | my $self = shift; | |||
| 287 | |||||||
| 288 | 0 | my($head, $body, $foot) = @_; | |||||
| 289 | 0 | my $unicode = 0xAC00 + $head*28*21 + $body*28 + $foot; | |||||
| 290 | 0 | return chr($unicode); | |||||
| 291 | } | ||||||
| 292 | |||||||
| 293 | |||||||
| 294 | |||||||
| 295 | # == ROMANIZE (TRANSLITERATE) == | ||||||
| 296 | |||||||
| 297 | # === transliterates a hangul chr (unicode hangul syllable) === | ||||||
| 298 | # for example, transliterate('한') returns ('h', 'a', 'n') | ||||||
| 299 | sub transliterate_chr { | ||||||
| 300 | 0 | 0 | 0 | my $self = shift; | |||
| 301 | 0 | my($chr) = @_; | |||||
| 302 | 0 | my($head,$body,$foot) = $self->decompose($chr); | |||||
| 303 | #return ($self->head->[$head], $self->body->[$body], $self->foot->[$foot]); | ||||||
| 304 | 0 | 0 | 0 | if ($self->enmode eq 'greedy' && $head == 11) { | |||
| 305 | 0 | return $self->body->[$body] . $self->foot->[$foot]; | |||||
| 306 | } else { | ||||||
| 307 | 0 | return $self->head->[$head] . $self->body->[$body] . $self->foot->[$foot]; | |||||
| 308 | } | ||||||
| 309 | } | ||||||
| 310 | sub transliterate_first_chr_of_word { | ||||||
| 311 | 0 | 0 | 0 | my $self = shift; | |||
| 312 | 0 | my($chr) = @_; | |||||
| 313 | 0 | my($head, $body, $foot) = $self->decompose ($chr); | |||||
| 314 | 0 | 0 | if ($head == 11) { | ||||
| 315 | 0 | return $self->body->[$body] . $self->foot->[$foot]; | |||||
| 316 | } else { | ||||||
| 317 | 0 | return $self->head->[$head] . $self->body->[$body] . $self->foot->[$foot]; | |||||
| 318 | } | ||||||
| 319 | |||||||
| 320 | } | ||||||
| 321 | |||||||
| 322 | # === transliterate a hangul word === | ||||||
| 323 | # Transliterates a hangul word (a string containing | ||||||
| 324 | # only hangul syllables) | ||||||
| 325 | sub transliterate_hangul_word { | ||||||
| 326 | 0 | 0 | 0 | my $self = shift; | |||
| 327 | 0 | my($word) = @_; | |||||
| 328 | 0 | my(@char) = split //, $word; | |||||
| 329 | 0 | my $tr = $self->transliterate_first_chr_of_word($char[0]); | |||||
| 330 | 0 | for (my $i=1; $i <= $#char; ++$i) { | |||||
| 331 | 0 | 0 | if ($MODE{$self->enmode} == $GREEDY_SEP) { | ||||
| 332 | 0 | $tr = $tr . $self->sep . $self->transliterate_chr($char[$i]); | |||||
| 333 | } else { | ||||||
| 334 | 0 | $tr = $tr . $self->transliterate_chr($char[$i]); | |||||
| 335 | } | ||||||
| 336 | } | ||||||
| 337 | 0 | return $tr; | |||||
| 338 | } | ||||||
| 339 | |||||||
| 340 | # === transliterate a string === | ||||||
| 341 | # The input string may contain any character. | ||||||
| 342 | # Transliterates only unicode hangul syllables (AC00-D7A3), | ||||||
| 343 | # returns other characters including hangul jamo (1100-11F9) | ||||||
| 344 | # and hangul compatibility jamo. | ||||||
| 345 | sub transliterate_line { | ||||||
| 346 | 0 | 0 | 0 | my($str) = @_; | |||
| 347 | 0 | my $tr; | |||||
| 348 | 0 | my(@char) = split(//,$str); | |||||
| 349 | 0 | foreach my $c (@char) { | |||||
| 350 | 0 | 0 | 0 | if (ord($c)>=0xAC00 && ord($c)<=0xD7A3){ | |||
| 351 | 0 | $tr = $tr . transliterate_chr($c); | |||||
| 352 | } else { | ||||||
| 353 | 0 | $tr = $tr . $c; | |||||
| 354 | } | ||||||
| 355 | } | ||||||
| 356 | 0 | return $tr; | |||||
| 357 | } | ||||||
| 358 | |||||||
| 359 | # === transliterate === | ||||||
| 360 | # Transliterates word by word | ||||||
| 361 | sub transliterate { | ||||||
| 362 | 0 | 0 | 0 | my $self = shift; | |||
| 363 | |||||||
| 364 | #my($str) = @_; | ||||||
| 365 | 0 | my $str = shift; | |||||
| 366 | 0 | my $tr; | |||||
| 367 | 0 | my(@word) = split /([^\x{AC00}-\x{D7A3}]+)/, $str; | |||||
| 368 | 0 | foreach my $w (@word) { | |||||
| 369 | 0 | 0 | if ($w =~ m/^[\x{AC00}-\x{D7A3}]+$/) { | ||||
| 370 | 0 | $tr = $tr . $self->transliterate_hangul_word($w); | |||||
| 371 | } else { | ||||||
| 372 | 0 | $tr = $tr . $w; | |||||
| 373 | } | ||||||
| 374 | } | ||||||
| 375 | |||||||
| 376 | 0 | return $tr; | |||||
| 377 | } | ||||||
| 378 | |||||||
| 379 | |||||||
| 380 | # | ||||||
| 381 | # == HANGULIZE (REVERSE TRANSLITERATION) == | ||||||
| 382 | # | ||||||
| 383 | # H: head, B: body, F: foot | ||||||
| 384 | # H?BF?(HBF?)* | ||||||
| 385 | |||||||
| 386 | # === hangulize === | ||||||
| 387 | # reverse transliteration : hangulizes a transliterated strings | ||||||
| 388 | # for example: hangulize('hangugmal') returns '한국말' | ||||||
| 389 | sub hangulize { | ||||||
| 390 | 0 | 0 | 0 | my $self = shift; | |||
| 391 | 0 | my $sep = $self->sep; | |||||
| 392 | |||||||
| 393 | 0 | my($str) = @_; | |||||
| 394 | 0 | my $h; | |||||
| 395 | |||||||
| 396 | 0 | 0 | if ($sep ne '') { | ||||
| 397 | 0 | my @word = split(/\Q$sep\E/, $str); | |||||
| 398 | 0 | foreach(@word) { $h = $h . $self->get_han($_); } | |||||
| 0 | |||||||
| 399 | } else { | ||||||
| 400 | 0 | $h = $h . $self->get_han($str); | |||||
| 401 | } | ||||||
| 402 | 0 | return $h; | |||||
| 403 | } | ||||||
| 404 | |||||||
| 405 | #------------------------------ | ||||||
| 406 | # hangulizes an array of alphabets into one hangul chr | ||||||
| 407 | # for example, hangulize_code(('h', 'a', 'n')) returns '한' | ||||||
| 408 | sub hangulize_code { | ||||||
| 409 | 0 | 0 | 0 | my $self = shift; | |||
| 410 | |||||||
| 411 | 0 | my($head, $body, $foot) = @_; | |||||
| 412 | 0 | my @hancode = ($self->{HEADMAP}->{$head}, $self->{BODYMAP}->{$body}, $self->{FOOTMAP}->{$foot}); | |||||
| 413 | 0 | return $self->compose(@hancode); | |||||
| 414 | } | ||||||
| 415 | |||||||
| 416 | |||||||
| 417 | #------------------------------- | ||||||
| 418 | # lookup $str, @list_of_jamo_transliteration | ||||||
| 419 | # eg. lookup('ssan', @CONSONANT) returns ('ss', 'an') | ||||||
| 420 | # where @CONSONANT has an item 'ss' | ||||||
| 421 | sub lookup { | ||||||
| 422 | 0 | 0 | 0 | my $self = shift; | |||
| 423 | |||||||
| 424 | 0 | my($str, @where) = @_; | |||||
| 425 | 0 | my $found = $NotFound; | |||||
| 426 | 0 | my $rest = $str; | |||||
| 427 | 0 | foreach(@where) { | |||||
| 428 | 0 | 0 | if ($_ eq substr($str, 0, length($_))) { | ||||
| 429 | 0 | 0 | if ($found eq $NotFound) { | ||||
| 0 | |||||||
| 430 | 0 | $found = $_; | |||||
| 431 | 0 | $rest = substr($str, length($_)); | |||||
| 432 | } elsif (length($found) < length($_)) { | ||||||
| 433 | 0 | $found = $_; | |||||
| 434 | 0 | $rest = substr($str, length($_)); | |||||
| 435 | } | ||||||
| 436 | } | ||||||
| 437 | } | ||||||
| 438 | # if($found eq $NotFound) { | ||||||
| 439 | # if(@where == @HEAD) {$found = $HEAD[11];} | ||||||
| 440 | # elsif (@where == @BODY) {$found = $NotFound;} | ||||||
| 441 | # elsif (@where == @FOOT) {$found = $FOOT[0];} | ||||||
| 442 | # $rest = $str; | ||||||
| 443 | # } | ||||||
| 444 | 0 | return ($found, $rest); | |||||
| 445 | } | ||||||
| 446 | |||||||
| 447 | #------------------------------- | ||||||
| 448 | # $SEP = "/"; $NAUGHT = "'"; | ||||||
| 449 | # isse = 이써 | ||||||
| 450 | # iss'e = 있어 : is/se = 잇서 | ||||||
| 451 | # ibsi = 입시 | ||||||
| 452 | # ibs'i = 잆이 | ||||||
| 453 | # ibsse = 입써 : ibs/se = 잆서 | ||||||
| 454 | # ibssse = 잆써 | ||||||
| 455 | |||||||
| 456 | #------------------------------- | ||||||
| 457 | # get_head($str) | ||||||
| 458 | # eg. get_head("ssan") retunrs ("ss", "an") | ||||||
| 459 | sub get_head { | ||||||
| 460 | 0 | 0 | 0 | my $self = shift; | |||
| 461 | |||||||
| 462 | 0 | my($str) = @_; | |||||
| 463 | 0 | my($head, $rest) = $self->lookup($str, @{$self->head}); | |||||
| 0 | |||||||
| 464 | 0 | return ($head, $rest); | |||||
| 465 | } | ||||||
| 466 | |||||||
| 467 | #------------------------------- | ||||||
| 468 | # get_body($str) | ||||||
| 469 | # eg. get_body("wan") returns ("wa", "n") | ||||||
| 470 | sub get_body { | ||||||
| 471 | 0 | 0 | 0 | my $self = shift; | |||
| 472 | |||||||
| 473 | 0 | my($str) = @_; | |||||
| 474 | 0 | my($body, $rest) = $self->lookup($str, @{$self->body}); | |||||
| 0 | |||||||
| 475 | 0 | return ($body, $rest); | |||||
| 476 | } | ||||||
| 477 | #------------------------------- | ||||||
| 478 | # get_foot($str) | ||||||
| 479 | # eg. get_foot("bssan") returns ("bs", "san") | ||||||
| 480 | sub get_foot { | ||||||
| 481 | 0 | 0 | 0 | my $self = shift; | |||
| 482 | |||||||
| 483 | 0 | my($str) = @_; | |||||
| 484 | 0 | my($foot, $rest) = $self->lookup($str, @{$self->foot}); | |||||
| 0 | |||||||
| 485 | 0 | return ($foot, $rest); | |||||
| 486 | } | ||||||
| 487 | |||||||
| 488 | #------------------------------- | ||||||
| 489 | # look_ahead for the next head - body sequence | ||||||
| 490 | # case : | ||||||
| 491 | # normal : look_ahead("mal") == "m"; | ||||||
| 492 | # no_head: look_ahead("an") == ""; | ||||||
| 493 | # no_body: look_ahead("kkkkk") eq $NotFound; | ||||||
| 494 | sub look_ahead { | ||||||
| 495 | 0 | 0 | 0 | my $self = shift; | |||
| 496 | |||||||
| 497 | 0 | my ($right) = @_; | |||||
| 498 | 0 | my $head; | |||||
| 499 | my $body; | ||||||
| 500 | 0 | ($head, $right) = $self->get_head($right); | |||||
| 501 | 0 | ($body, $right) = $self->get_body($right); | |||||
| 502 | |||||||
| 503 | 0 | 0 | if ($body eq $NotFound) { return $NotFound;} | ||||
| 0 | 0 | ||||||
| 0 | |||||||
| 504 | elsif($head eq $NotFound) {return "";} | ||||||
| 505 | 0 | else { return $head;} | |||||
| 506 | } | ||||||
| 507 | |||||||
| 508 | #------------------------------- | ||||||
| 509 | # get a hangul string from a transliteration : | ||||||
| 510 | # Makes the first hangul syllable from a transliterated string | ||||||
| 511 | # and recursively processes the rest. | ||||||
| 512 | # for example: get_han('hangugmal') returns unicode string '한국말' | ||||||
| 513 | sub get_han { | ||||||
| 514 | 0 | 0 | 0 | my $self = shift; | |||
| 515 | 0 | my $NAUGHT = $self->naught; | |||||
| 516 | 0 | my $FILL = ""; # jongseong filler | |||||
| 517 | |||||||
| 518 | 0 | my ($right) = @_; | |||||
| 519 | 0 | my $head; | |||||
| 520 | my $body; | ||||||
| 521 | 0 | my $foot; | |||||
| 522 | 0 | my $look_ahead_token; | |||||
| 523 | 0 | my $h; | |||||
| 524 | |||||||
| 525 | |||||||
| 526 | 0 | show_process(0, "begin", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 527 | |||||||
| 528 | 0 | ($head, $right) = $self->get_head($right); | |||||
| 529 | 0 | show_process(1, "get_head", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 530 | |||||||
| 531 | 0 | ($body, $right) = $self->get_body($right); | |||||
| 532 | 0 | show_process(2, "get_body", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 533 | |||||||
| 534 | 0 | 0 | 0 | if ($head eq $NotFound && $body eq $NotFound ) { | |||
| 0 | 0 | ||||||
| 535 | 0 | $h = $h . substr($right,0,1); | |||||
| 536 | 0 | show_process(21, "no head", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 537 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han(substr($right,1));} | ||||
| 0 | |||||||
| 538 | } elsif ($head ne $NotFound && $body eq $NotFound) { | ||||||
| 539 | 0 | $h = $h . $head; | |||||
| 540 | 0 | show_process(22, "no body", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 541 | 0 | 0 | if($right ne "") {$h = $h . substr($right, 0, 1) . $self->get_han(substr($right,1));} | ||||
| 0 | |||||||
| 542 | } else { | ||||||
| 543 | 0 | 0 | if($head eq $NotFound) { $head = $NAUGHT; } | ||||
| 0 | |||||||
| 544 | 0 | ($foot, $right) = $self->get_foot($right); | |||||
| 545 | 0 | show_process(3, "get_foot", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 546 | 0 | 0 | 0 | if ($foot eq $NotFound || $foot eq $FILL) { | |||
| 0 | |||||||
| 547 | 0 | $h = $h . $self->hangulize_code($head, $body, $FILL); | |||||
| 548 | 0 | show_process(31, "no foot", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 549 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han($right);} | ||||
| 0 | |||||||
| 550 | } elsif($right eq "") { | ||||||
| 551 | 0 | $h = $h . $self->hangulize_code($head, $body, $foot); | |||||
| 552 | 0 | show_process(32, "eof", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 553 | } else { | ||||||
| 554 | 0 | $look_ahead_token = $self->look_ahead($right); | |||||
| 555 | 0 | show_process(4, "look_ahead", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 556 | 0 | 0 | 0 | if ($look_ahead_token eq $NotFound || $look_ahead_token eq $NAUGHT) { | |||
| 557 | 0 | $h = $h . $self->hangulize_code($head, $body, $foot); | |||||
| 558 | 0 | show_process(41, "no look", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 559 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han($right);} | ||||
| 0 | |||||||
| 560 | } else { | ||||||
| 561 | 0 | ($foot, $right) = $self->get_correct_foot($foot, $look_ahead_token, $right); | |||||
| 562 | 0 | $h = $h . $self->hangulize_code($head, $body, $foot); | |||||
| 563 | 0 | show_process(42, "get_correct_foot", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
| 564 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han($right);} | ||||
| 0 | |||||||
| 565 | } | ||||||
| 566 | |||||||
| 567 | } | ||||||
| 568 | } | ||||||
| 569 | 0 | return $h; | |||||
| 570 | } | ||||||
| 571 | |||||||
| 572 | |||||||
| 573 | |||||||
| 574 | $, = "\t"; | ||||||
| 575 | sub show_process { | ||||||
| 576 | 0 | 0 | 0 | if(0) { | |||
| 577 | my($id, $desc, $h, $head, $body, $foot, $look_ahead_token, $right) = @_; | ||||||
| 578 | print $id , $desc, $h, $head, $body, $foot, $look_ahead_token, $right, "\n"; | ||||||
| 579 | } | ||||||
| 580 | } | ||||||
| 581 | |||||||
| 582 | #------------------------------- | ||||||
| 583 | # correct foot | ||||||
| 584 | # |
||||||
| 585 | # |
||||||
| 586 | # |
||||||
| 587 | #my $foot_p, my $look_ahead_token, my $right_p; | ||||||
| 588 | #my $foot, my $right; | ||||||
| 589 | sub get_correct_foot { | ||||||
| 590 | 0 | 0 | 0 | my $self = shift; | |||
| 591 | |||||||
| 592 | 0 | my ($foot_p, $look_ahead_token, $right_p) = @_; | |||||
| 593 | 0 | my $foot, my $right; | |||||
| 594 | 0 | $foot_p = $foot_p . $look_ahead_token;; | |||||
| 595 | 0 | $right_p = substr($right_p, length($look_ahead_token)); | |||||
| 596 | 0 | $foot = $foot_p; | |||||
| 597 | 0 | $right = $right_p; | |||||
| 598 | 0 | my $found = $NotFound; | |||||
| 599 | |||||||
| 600 | 0 | foreach(@{$self->head}) { | |||||
| 0 | |||||||
| 601 | 0 | 0 | if ($_ eq substr($foot_p, length($foot_p) - length($_))) { | ||||
| 602 | 0 | 0 | if ($found eq $NotFound) { | ||||
| 0 | |||||||
| 603 | 0 | $found = $_; | |||||
| 604 | 0 | $foot = substr($foot_p, 0, length($foot)-length($found)); | |||||
| 605 | 0 | $right = $found . $right_p; | |||||
| 606 | } elsif (length($found) < length($_)) { | ||||||
| 607 | 0 | $found = $_; | |||||
| 608 | 0 | $foot = substr($foot_p, 0, length($foot_p)-length($found)); | |||||
| 609 | 0 | $right = $found . $right_p; | |||||
| 610 | } | ||||||
| 611 | } | ||||||
| 612 | } | ||||||
| 613 | |||||||
| 614 | 0 | return ($foot, $right); | |||||
| 615 | } | ||||||
| 616 | |||||||
| 617 | 1; | ||||||
| 618 | __END__ |