| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Lingua::Han::PinYin; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 6 |  |  | 6 |  | 104019 | use strict; | 
|  | 6 |  |  |  |  | 10 |  | 
|  | 6 |  |  |  |  | 215 |  | 
| 4 | 6 |  |  | 6 |  | 25 | use warnings; | 
|  | 6 |  |  |  |  | 8 |  | 
|  | 6 |  |  |  |  | 213 |  | 
| 5 |  |  |  |  |  |  | our $VERSION = '0.21'; | 
| 6 |  |  |  |  |  |  |  | 
| 7 | 6 |  |  | 6 |  | 26 | use File::Spec (); | 
|  | 6 |  |  |  |  | 11 |  | 
|  | 6 |  |  |  |  | 103 |  | 
| 8 | 6 |  |  | 6 |  | 2900 | use Lingua::Han::Utils qw/Unihan_value/; | 
|  | 6 |  |  |  |  | 330183 |  | 
|  | 6 |  |  |  |  | 5105 |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | sub new { | 
| 11 | 6 |  |  | 6 | 0 | 65 | my $class = shift; | 
| 12 |  |  |  |  |  |  |  | 
| 13 | 6 |  |  |  |  | 11 | my $dir   = __FILE__; | 
| 14 | 6 |  |  |  |  | 37 | $dir =~ s/\.pm//o; | 
| 15 | 6 | 50 |  |  |  | 219 | -d $dir or die "Directory $dir does not exists, please consider to reinstall this module."; | 
| 16 |  |  |  |  |  |  |  | 
| 17 | 6 | 50 |  |  |  | 43 | my %args = (@_ % 2 == 1) ? %{ $_[0] } : (@_); | 
|  | 0 |  |  |  |  | 0 |  | 
| 18 |  |  |  |  |  |  |  | 
| 19 | 6 |  |  |  |  | 10 | my %py; | 
| 20 | 6 |  |  |  |  | 137 | my $file = File::Spec->catfile( $dir, 'Mandarin.dat' ); | 
| 21 | 6 | 50 |  |  |  | 236 | open(my $fh, '<', $file) or die "Can't open $file: $!"; | 
| 22 | 6 |  |  |  |  | 198 | while (my $line = <$fh>) { | 
| 23 | 247260 |  |  |  |  | 171179 | chomp($line); | 
| 24 | 247260 |  |  |  |  | 309859 | my ( $uni, $py ) = split(/\s+/, $line); | 
| 25 | 247260 |  |  |  |  | 617166 | $py{$uni} = $py; | 
| 26 |  |  |  |  |  |  | } | 
| 27 | 6 |  |  |  |  | 295 | close($fh); | 
| 28 |  |  |  |  |  |  |  | 
| 29 | 6 |  |  |  |  | 26 | $args{'py'} = \%py; | 
| 30 |  |  |  |  |  |  |  | 
| 31 | 6 |  |  |  |  | 113 | return bless \%args => $class; | 
| 32 |  |  |  |  |  |  | } | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | sub han2pinyin1 { | 
| 35 | 5 |  |  | 5 | 1 | 1892 | my ($self, $word) = @_; | 
| 36 | 5 |  |  |  |  | 16 | my $code = Unihan_value($word); | 
| 37 | 5 |  |  |  |  | 7450 | my $value = $self->{'py'}->{$code}; | 
| 38 | 5 | 50 |  |  |  | 11 | if (defined $value) { | 
| 39 | 5 |  |  |  |  | 11 | $value = $self->_fix_val( $value ); | 
| 40 |  |  |  |  |  |  | } else { | 
| 41 |  |  |  |  |  |  | # not found in dictionary, return original word | 
| 42 | 0 |  |  |  |  | 0 | $value = $word; | 
| 43 |  |  |  |  |  |  | } | 
| 44 | 5 |  |  |  |  | 13 | return $value; | 
| 45 |  |  |  |  |  |  | } | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | sub han2pinyin { | 
| 48 | 36 |  |  | 36 | 1 | 15039 | my ( $self, $hanzi ) = @_; | 
| 49 |  |  |  |  |  |  |  | 
| 50 | 36 |  |  |  |  | 101 | my @code = Unihan_value($hanzi); | 
| 51 |  |  |  |  |  |  |  | 
| 52 | 36 |  |  |  |  | 11248 | my @result; | 
| 53 | 36 |  |  |  |  | 62 | foreach my $code (@code) { | 
| 54 | 69 |  |  |  |  | 153 | my $value = $self->{'py'}->{$code}; | 
| 55 | 69 | 100 |  |  |  | 108 | if ( defined $value ) { | 
| 56 | 51 |  |  |  |  | 82 | $value = $self->_fix_val( $value ); | 
| 57 |  |  |  |  |  |  | } | 
| 58 |  |  |  |  |  |  | else { | 
| 59 |  |  |  |  |  |  | # if it's not a Chinese, return original word | 
| 60 | 18 |  |  |  |  | 33 | $value = pack( "U*", hex $code ); | 
| 61 |  |  |  |  |  |  | } | 
| 62 | 69 | 100 |  |  |  | 192 | push @result, ($self->{capitalize} ? ucfirst $value : $value); | 
| 63 |  |  |  |  |  |  | } | 
| 64 |  |  |  |  |  |  |  | 
| 65 | 36 | 50 |  |  |  | 131 | return wantarray ? @result : join( '', @result ); | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | } | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | sub gb2pinyin { | 
| 70 | 1 |  |  | 1 | 1 | 418 | my ($self, $hanzi) = @_; | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | # convert only normal Chinese letter. Ignore Chinese symbols | 
| 73 |  |  |  |  |  |  | # which fall within [0xa1,0xb0) region. 0xb0==0260 | 
| 74 |  |  |  |  |  |  | # if it is not normal Chinese, retain original characters | 
| 75 | 1 |  |  |  |  | 5 | $hanzi =~ s/[\260-\377][\200-\377]/$self->han2pinyin1($&)/ge; | 
|  | 2 |  |  |  |  | 4 |  | 
| 76 | 1 |  |  |  |  | 3 | return $hanzi; | 
| 77 |  |  |  |  |  |  | } | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | sub _fix_val { | 
| 80 | 56 |  |  | 56 |  | 66 | my ( $self, $value ) = @_; | 
| 81 |  |  |  |  |  |  |  | 
| 82 | 56 | 100 |  |  |  | 112 | if ($self->{unicode}) { | 
| 83 | 5 |  |  |  |  | 6 | return $value; | 
| 84 |  |  |  |  |  |  | } | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | # convert into ascii | 
| 87 | 51 | 100 |  |  |  | 105 | $value =~ s/ū/u/g and $value .= '1'; | 
| 88 | 51 | 50 |  |  |  | 86 | $value =~ s/ǖ/u/g and $value .= '1'; | 
| 89 | 51 | 50 |  |  |  | 79 | $value =~ s/ī/i/g and $value .= '1'; | 
| 90 | 51 | 100 |  |  |  | 84 | $value =~ s/ō/o/g and $value .= '1'; | 
| 91 | 51 | 50 |  |  |  | 81 | $value =~ s/ā/a/g and $value .= '1'; | 
| 92 | 51 | 100 |  |  |  | 82 | $value =~ s/ē/e/g and $value .= '1'; | 
| 93 |  |  |  |  |  |  |  | 
| 94 | 51 | 50 |  |  |  | 73 | $value =~ s/í/i/g and $value .= '2'; | 
| 95 | 51 | 100 |  |  |  | 80 | $value =~ s/é/e/g and $value .= '2'; | 
| 96 | 51 | 100 |  |  |  | 88 | $value =~ s/ú/u/g and $value .= '2'; | 
| 97 | 51 | 50 |  |  |  | 80 | $value =~ s/ó/o/g and $value .= '2'; | 
| 98 | 51 | 100 |  |  |  | 74 | $value =~ s/ǘ/v/g and $value .= '2'; | 
| 99 | 51 | 100 |  |  |  | 79 | $value =~ s/á/a/g and $value .= '2'; | 
| 100 |  |  |  |  |  |  |  | 
| 101 | 51 | 50 |  |  |  | 76 | $value =~ s/ě/e/g and $value .= '3'; | 
| 102 | 51 | 100 |  |  |  | 83 | $value =~ s/ǎ/a/g and $value .= '3'; | 
| 103 | 51 | 100 |  |  |  | 104 | $value =~ s/ǒ/o/g and $value .= '3'; | 
| 104 | 51 | 50 |  |  |  | 78 | $value =~ s/ǔ/u/g and $value .= '3'; | 
| 105 | 51 | 100 |  |  |  | 86 | $value =~ s/ǚ/v/g and $value .= '3'; | 
| 106 | 51 | 100 |  |  |  | 109 | $value =~ s/ǐ/i/g and $value .= '3'; | 
| 107 |  |  |  |  |  |  |  | 
| 108 | 51 | 50 |  |  |  | 85 | $value =~ s/ò/o/g and $value .= '4'; | 
| 109 | 51 | 100 |  |  |  | 95 | $value =~ s/à/a/g and $value .= '4'; | 
| 110 | 51 | 50 |  |  |  | 154 | $value =~ s/è/e/g and $value .= '4'; | 
| 111 | 51 | 50 |  |  |  | 87 | $value =~ s/ù/u/g and $value .= '4'; | 
| 112 | 51 | 100 |  |  |  | 77 | $value =~ s/ǜ/v/g and $value .= '4'; | 
| 113 | 51 | 100 |  |  |  | 90 | $value =~ s/ì/i/g and $value .= '4'; | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 51 | 100 |  |  |  | 147 | $value =~ s/\d//g unless $self->{tone}; | 
| 116 | 51 |  |  |  |  | 77 | return $value; | 
| 117 |  |  |  |  |  |  | } | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | 1; | 
| 120 |  |  |  |  |  |  | __END__ |