line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package eGuideDog::Dict::Mandarin; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
74630
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
87
|
|
4
|
2
|
|
|
2
|
|
13
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
66
|
|
5
|
2
|
|
|
2
|
|
1111
|
use utf8; |
|
2
|
|
|
|
|
17
|
|
|
2
|
|
|
|
|
9
|
|
6
|
2
|
|
|
2
|
|
2175
|
use Encode::CNMap; |
|
2
|
|
|
|
|
49433
|
|
|
2
|
|
|
|
|
325
|
|
7
|
2
|
|
|
2
|
|
3578
|
use Storable; |
|
2
|
|
|
|
|
11759
|
|
|
2
|
|
|
|
|
9664
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require Exporter; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export |
14
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead. |
15
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# This allows declaration use eGuideDog::Dict::Mandarin ':all'; |
18
|
|
|
|
|
|
|
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
19
|
|
|
|
|
|
|
# will save memory. |
20
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
) ] ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @EXPORT = qw( |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
our $VERSION = '0.5'; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Preloaded methods go here. |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub new() { |
36
|
1
|
|
|
1
|
1
|
19
|
my $self = {}; |
37
|
1
|
|
|
|
|
3
|
$self->{pinyin} = {}; # The most probably phonetic symbol |
38
|
1
|
|
|
|
|
3
|
$self->{chars} = {}; # all phonetic symbols (array ref) |
39
|
1
|
|
|
|
|
3
|
$self->{words} = {}; # word phonetic symbols (array ref) |
40
|
1
|
|
|
|
|
4
|
$self->{word_index} = {}; # the first char to words (array ref) |
41
|
1
|
|
|
|
|
2
|
$self->{char_rate} = {}; |
42
|
1
|
|
|
|
|
4
|
bless $self, __PACKAGE__; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# load dictionary file. |
45
|
1
|
|
|
|
|
3
|
my $dir = __FILE__; |
46
|
1
|
|
|
|
|
5
|
$dir =~ s/[.]pm$//; |
47
|
|
|
|
|
|
|
|
48
|
1
|
50
|
|
|
|
64
|
if(-e "$dir/Mandarin.dict") { |
49
|
1
|
|
|
|
|
8
|
my $dict = retrieve("$dir/Mandarin.dict"); |
50
|
1
|
|
|
|
|
451771
|
$self->{pinyin} = $dict->{pinyin}; |
51
|
1
|
|
|
|
|
6
|
$self->{chars} = $dict->{chars}; |
52
|
1
|
|
|
|
|
4
|
$self->{words} = $dict->{words}; |
53
|
1
|
|
|
|
|
4
|
$self->{word_index} = $dict->{word_index}; |
54
|
1
|
|
|
|
|
4
|
$self->{char_rate} = $dict->{char_rate}; |
55
|
1
|
|
|
|
|
4
|
$self->{symbol_size} = $dict->{symbol_size}; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
1
|
|
|
|
|
10
|
return $self; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub update_symbol_size { |
62
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
63
|
0
|
|
|
|
|
0
|
my $all = "a ai an ang ao ba bai ban bang bao bei ben beng bi bian biao bie bin bing bo bu ca cai can cang cao ce cen ceng cha chai chan chang chao che chen cheng chi chong chou chu chua chuai chuan chuang chui chun chuo ci cong cou cu cuan cui cun cuo da dai dan dang dao de dei den deng di dia dian diao die ding diu dong dou du duan dui dun duo e ei en eng er fa fan fang fei fen feng fo fou fu ga gai gan gang gao ge gei gen geng gong gou gu gua guai guan guang gui gun guo ha hai han hang hao he hei hen heng hong hou hu hua huai huan huang hui hun huo ji jia jian jiang jiao jie jin jing jiong jiu ju juan jue jun ka kai kan kang kao ke kei ken keng kong kou ku kua kuai kuan kuang kui kun kuo la lai lan lang lao le lei leng li lia lian liang liao lie lin ling liu lo long lou lu lu: luan lu:e lun luo ma mai man mang mao me mei men meng mi mian miao mie min ming miu mo mou mu na nai nan nang nao ne nei nen neng ng ni nia nian niang niao nie nin ning niu nong nou nu nu: nuan nu:e nuo o ou pa pai pan pang pao pei pen peng pi pian piao pie pin ping po pou pu qi qia qian qiang qiao qie qin qing qiong qiu qu quan que qun ran rang rao re ren reng ri rong rou ru rua ruan rui run ruo sa sai san sang sao se sen seng sha shai shan shang shao she shei shen sheng shi shou shu shua shuai shuan shuang shui shun shuo si song sou su suan sui sun suo ta tai tan tang tao te teng ti tian tiao tie ting tong tou tu tuan tui tun tuo wa wai wan wang wei wen weng wo wu xi xia xian xiang xiao xie xin xing xiong xiu xu xuan xue xun ya yan yang yao ye yi yin ying yo yong you yu yuan yue yun za zai zan zang zao ze zei zen zeng zha zhai zhan zhang zhao zhe zhei zhen zheng zhi zhong zhou zhu zhua zhuai zhuan zhuang zhui zhun zhuo zi zong zou zu zuan zui zun zuo"; |
64
|
0
|
|
|
|
|
0
|
$all =~ s/u:/v/g; |
65
|
0
|
|
|
|
|
0
|
my @all_pinyin = split(' ', $all); |
66
|
|
|
|
|
|
|
|
67
|
0
|
|
|
|
|
0
|
foreach my $py (@all_pinyin) { |
68
|
0
|
|
|
|
|
0
|
for (1 .. 5) { |
69
|
0
|
|
|
|
|
0
|
my $pytone = $py . $_; |
70
|
0
|
|
|
|
|
0
|
system("espeak -vzh \"$pytone\" -w /tmp/espeak_size.1"); |
71
|
0
|
|
|
|
|
0
|
system("espeak -vzh \"$pytone$pytone\" -w /tmp/espeak_size.2"); |
72
|
0
|
|
|
|
|
0
|
my $size = (-s '/tmp/espeak_size.2') - (-s '/tmp/espeak_size.1'); |
73
|
0
|
|
|
|
|
0
|
$self->{symbol_size}->{$pytone} = $size; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub import_symbol_size { |
79
|
0
|
|
|
0
|
0
|
0
|
my ($self, $file) = @_; |
80
|
0
|
|
|
|
|
0
|
open(SYMBOL_SIZE, '<', $file); |
81
|
0
|
|
|
|
|
0
|
while () { |
82
|
0
|
|
|
|
|
0
|
my @pair = split(/\s/, $_); |
83
|
0
|
|
|
|
|
0
|
$self->{symbol_size}->{$pair[0]} = $pair[1]; |
84
|
|
|
|
|
|
|
} |
85
|
0
|
|
|
|
|
0
|
close(SYMBOL_SIZE); |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub get_symbol_size { |
89
|
0
|
|
|
0
|
0
|
0
|
my ($self, $symbol) = @_; |
90
|
0
|
0
|
|
|
|
0
|
if ($self->{symbol_size}->{$symbol}) { |
91
|
0
|
|
|
|
|
0
|
return $self->{symbol_size}->{$symbol}; |
92
|
|
|
|
|
|
|
} else { |
93
|
0
|
|
|
|
|
0
|
warn "$symbol size not exist"; |
94
|
0
|
|
|
|
|
0
|
return undef; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub print_symbol_size_list { |
99
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
100
|
0
|
|
|
|
|
0
|
foreach (sort(keys %{$self->{symbol_size}})) { |
|
0
|
|
|
|
|
0
|
|
101
|
0
|
|
|
|
|
0
|
print $_, "\t", $self->{symbol_size}->{$_}, "\n"; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub update_dict { |
106
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
107
|
|
|
|
|
|
|
|
108
|
0
|
|
|
|
|
0
|
$self->{pinyin} = {}; |
109
|
0
|
|
|
|
|
0
|
$self->{chars} = {}; |
110
|
0
|
|
|
|
|
0
|
$self->{words} = {}; |
111
|
0
|
|
|
|
|
0
|
$self->{word_index} = {}; |
112
|
0
|
|
|
|
|
0
|
$self->{char_rate} = {}; |
113
|
|
|
|
|
|
|
|
114
|
0
|
|
|
|
|
0
|
$self->import_unihan("HanyuPinlu.txt"); |
115
|
|
|
|
|
|
|
# if a character is not exist in HanyuPinlu, it will look up in Mandarin.txt. |
116
|
0
|
|
|
|
|
0
|
$self->import_unihan("Mandarin.txt"); |
117
|
0
|
|
|
|
|
0
|
$self->import_zh_list("zh_list"); |
118
|
0
|
|
|
|
|
0
|
$self->import_zh_list("zh_listx"); |
119
|
0
|
|
|
|
|
0
|
$self->import_char_rate("HanyuPinlu.txt"); |
120
|
0
|
|
|
|
|
0
|
$self->import_symbol_size("symbol_size_list"); |
121
|
|
|
|
|
|
|
|
122
|
0
|
|
|
|
|
0
|
my $dict = {pinyin => $self->{pinyin}, |
123
|
|
|
|
|
|
|
chars => $self->{chars}, |
124
|
|
|
|
|
|
|
words => $self->{words}, |
125
|
|
|
|
|
|
|
word_index => $self->{word_index}, |
126
|
|
|
|
|
|
|
char_rate => $self->{char_rate}, |
127
|
|
|
|
|
|
|
symbol_size => $self->{symbol_size}, |
128
|
|
|
|
|
|
|
}; |
129
|
0
|
|
|
|
|
0
|
store($dict, "Mandarin.dict"); |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub import_char_rate { |
133
|
0
|
|
|
0
|
0
|
0
|
my ($self, $file) = @_; |
134
|
0
|
|
|
|
|
0
|
open(DATA_FILE, '<', $file); |
135
|
0
|
|
|
|
|
0
|
while(my $line = ) { |
136
|
0
|
|
|
|
|
0
|
chomp($line); |
137
|
0
|
|
|
|
|
0
|
my @items = split(/\s+/, $line); |
138
|
0
|
|
|
|
|
0
|
my $rate = 0; |
139
|
0
|
|
|
|
|
0
|
foreach (@items[1 .. $#items]) { |
140
|
0
|
|
|
|
|
0
|
/\((.*)\)/; |
141
|
0
|
|
|
|
|
0
|
$rate += $1; |
142
|
|
|
|
|
|
|
} |
143
|
0
|
|
|
|
|
0
|
my $char = chr(hex($items[0])); |
144
|
0
|
|
|
|
|
0
|
$self->{char_rate}->{$char} = $rate; |
145
|
|
|
|
|
|
|
# my $char_simp = utf8_to_simputf8($char); |
146
|
|
|
|
|
|
|
# if ($char_simp !~ /[?]/) { |
147
|
|
|
|
|
|
|
# $self->{char_rate}->{$char_simp} = $rate; |
148
|
|
|
|
|
|
|
# } |
149
|
|
|
|
|
|
|
# my $char_trad = utf8_to_tradutf8($char); |
150
|
|
|
|
|
|
|
# if ($char_trad !~ /[?]/) { |
151
|
|
|
|
|
|
|
# $self->{char_rate}->{$char_trad} = $rate; |
152
|
|
|
|
|
|
|
# } |
153
|
|
|
|
|
|
|
} |
154
|
0
|
|
|
|
|
0
|
close(DATA_FILE); |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub import_unihan { |
158
|
0
|
|
|
0
|
0
|
0
|
my ($self, $file) = @_; |
159
|
0
|
|
|
|
|
0
|
open(DATA_FILE, '<', $file); |
160
|
0
|
|
|
|
|
0
|
while(my $line = ) { |
161
|
0
|
|
|
|
|
0
|
chomp($line); |
162
|
0
|
|
|
|
|
0
|
$line = lc($line); # specific to Mandarin.txt |
163
|
0
|
|
|
|
|
0
|
my @items = split(/\s+/, $line); |
164
|
0
|
|
|
|
|
0
|
s/\(.*\)// foreach (@items); # specific to HanyuPinlu |
165
|
0
|
|
|
|
|
0
|
my $char = chr(hex($items[0])); |
166
|
0
|
|
|
|
|
0
|
my @phons = @items[1 .. $#items]; |
167
|
0
|
0
|
|
|
|
0
|
if (not defined $self->{chars}->{$char}) { |
168
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = \@phons; |
169
|
|
|
|
|
|
|
} |
170
|
0
|
|
|
|
|
0
|
my $char_simp = utf8_to_simputf8($char); |
171
|
0
|
0
|
|
|
|
0
|
if ($char_simp !~ /[?]/) { |
172
|
0
|
0
|
|
|
|
0
|
if (!defined $self->{chars}->{$char_simp}) { |
173
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char_simp} = \@phons; |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
} |
176
|
0
|
|
|
|
|
0
|
my $char_trad = utf8_to_tradutf8($char); |
177
|
0
|
0
|
|
|
|
0
|
if ($char_trad !~ /[?]/) { |
178
|
0
|
0
|
|
|
|
0
|
if (!defined $self->{chars}->{$char_trad}) { |
179
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char_trad} = \@phons; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
} |
183
|
0
|
|
|
|
|
0
|
close(DATA_FILE); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
sub add_symbol { |
187
|
0
|
|
|
0
|
0
|
0
|
my ($self, $char, $symbol) = @_; |
188
|
|
|
|
|
|
|
|
189
|
0
|
0
|
|
|
|
0
|
if (not $self->{chars}->{$char}) { |
190
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = [$symbol]; |
191
|
0
|
|
|
|
|
0
|
return 1; |
192
|
|
|
|
|
|
|
} else { |
193
|
0
|
|
|
|
|
0
|
foreach (@{$self->{chars}->{$char}}) { |
|
0
|
|
|
|
|
0
|
|
194
|
0
|
0
|
|
|
|
0
|
if ($symbol eq $_) { |
195
|
0
|
|
|
|
|
0
|
return 0; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
} |
198
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = [@{$self->{chars}->{$char}}, $symbol]; |
|
0
|
|
|
|
|
0
|
|
199
|
0
|
|
|
|
|
0
|
return 1; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
sub import_zh_list { |
204
|
0
|
|
|
0
|
0
|
0
|
my ($self, $zh_list) = @_; |
205
|
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
0
|
open(ZH_LIST, '<:utf8', $zh_list); |
207
|
0
|
|
|
|
|
0
|
while (my $line = ) { |
208
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^(.)\s([^\s]*)\s$/) { |
|
|
0
|
|
|
|
|
|
209
|
0
|
0
|
0
|
|
|
0
|
if ($1 && $2) { |
210
|
0
|
|
|
|
|
0
|
my $ch = $1; |
211
|
0
|
|
|
|
|
0
|
my $py = $2; |
212
|
0
|
0
|
|
|
|
0
|
if ($py =~ /^[a-z]*[1-5]$/) { |
213
|
0
|
|
|
|
|
0
|
$self->{pinyin}->{$ch} = $py; |
214
|
0
|
|
|
|
|
0
|
$self->add_symbol($ch, $py); |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
} elsif ($line =~ /^[(]([^)]*)[)]\s([^\s]*)\s$/) { |
218
|
0
|
|
|
|
|
0
|
my @chars = split(/ /, $1); |
219
|
0
|
|
|
|
|
0
|
my $phon = $2; |
220
|
0
|
|
|
|
|
0
|
my @symbols; |
221
|
0
|
0
|
|
|
|
0
|
if ($phon =~ /[|]/) { |
222
|
0
|
|
|
|
|
0
|
@symbols = split(/[|]/, $phon); |
223
|
|
|
|
|
|
|
} else { |
224
|
0
|
|
0
|
|
|
0
|
while($phon && $phon =~ /^([a-z]*[0-9])(.*)/) { |
225
|
0
|
|
|
|
|
0
|
push(@symbols, $1); |
226
|
0
|
|
|
|
|
0
|
$phon = $2; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
0
|
0
|
|
|
|
0
|
if ($#chars != $#symbols) { |
230
|
0
|
|
|
|
|
0
|
warn "Dictionary error:" . "@chars" . "-" . "@symbols"; |
231
|
0
|
|
|
|
|
0
|
next; |
232
|
|
|
|
|
|
|
} |
233
|
0
|
|
|
|
|
0
|
my $word = join("", @chars); |
234
|
0
|
0
|
|
|
|
0
|
if ($self->{word_index}->{$chars[0]}) { |
235
|
0
|
|
|
|
|
0
|
push(@{$self->{word_index}->{$chars[0]}}, $word); |
|
0
|
|
|
|
|
0
|
|
236
|
|
|
|
|
|
|
} else { |
237
|
0
|
|
|
|
|
0
|
$self->{word_index}->{$chars[0]} = [$word]; |
238
|
|
|
|
|
|
|
} |
239
|
0
|
|
|
|
|
0
|
$self->{words}->{$word} = \@symbols; |
240
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i <= $#chars; $i++) { |
241
|
0
|
|
|
|
|
0
|
$self->add_symbol($chars[$i], $symbols[$i]); |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
} |
245
|
0
|
|
|
|
|
0
|
close(ZH_LIST); |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
sub get_pinyin { |
249
|
3
|
|
|
3
|
1
|
8
|
my ($self, $str) = @_; |
250
|
|
|
|
|
|
|
|
251
|
3
|
50
|
|
|
|
19
|
if (not utf8::is_utf8($str)) { |
|
|
50
|
|
|
|
|
|
252
|
0
|
0
|
|
|
|
0
|
if (not utf8::decode($str)) { |
253
|
0
|
|
|
|
|
0
|
warn "$str is not in utf8 encoding."; |
254
|
0
|
|
|
|
|
0
|
return undef; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
} elsif (not $str) { |
257
|
0
|
|
|
|
|
0
|
return undef; |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
3
|
100
|
|
|
|
9
|
if (wantarray) { |
261
|
1
|
|
|
|
|
3
|
my @pinyin; |
262
|
1
|
|
|
|
|
7
|
for (my $i = 0; $i < length($str); $i++) { |
263
|
2
|
|
|
|
|
5
|
my $char = substr($str, $i, 1); |
264
|
2
|
|
|
|
|
7
|
my @words = $self->get_words($char); |
265
|
2
|
|
|
|
|
3
|
my $longest_word = ''; |
266
|
2
|
|
|
|
|
5
|
foreach my $word (@words) { |
267
|
8
|
50
|
|
|
|
19
|
if (index($str, $word) == 0) { |
268
|
0
|
0
|
|
|
|
0
|
if (length($word) > length($longest_word)) { |
269
|
0
|
|
|
|
|
0
|
$longest_word = $word; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
} |
273
|
2
|
50
|
|
|
|
5
|
if ($longest_word) { |
274
|
0
|
|
|
|
|
0
|
push(@pinyin, @{$self->{words}->{$longest_word}}); |
|
0
|
|
|
|
|
0
|
|
275
|
0
|
|
|
|
|
0
|
$i += $#{$self->{words}->{$longest_word}}; |
|
0
|
|
|
|
|
0
|
|
276
|
|
|
|
|
|
|
} else { |
277
|
2
|
|
|
|
|
13
|
push(@pinyin, $self->{pinyin}->{$char}); |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
} |
280
|
1
|
|
|
|
|
8
|
return @pinyin; |
281
|
|
|
|
|
|
|
} else { |
282
|
2
|
|
|
|
|
10
|
my $char = substr($str, 0, 1); |
283
|
2
|
|
|
|
|
9
|
my @words = $self->get_words($char); |
284
|
2
|
|
|
|
|
9
|
my $longest_word = ''; |
285
|
2
|
|
|
|
|
5
|
foreach my $word (@words) { |
286
|
184
|
100
|
|
|
|
337
|
if (index($str, $word) == 0) { |
287
|
1
|
50
|
|
|
|
32
|
if (length($word) > length($longest_word)) { |
288
|
1
|
|
|
|
|
4
|
$longest_word = $word; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
} |
292
|
2
|
100
|
|
|
|
7
|
if ($longest_word) { |
293
|
1
|
|
|
|
|
12
|
return $self->{words}->{$longest_word}->[0]; |
294
|
|
|
|
|
|
|
} else { |
295
|
1
|
|
|
|
|
10
|
return $self->{pinyin}->{$char}; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
sub get_words { |
301
|
5
|
|
|
5
|
1
|
10
|
my ($self, $char) = @_; |
302
|
|
|
|
|
|
|
|
303
|
5
|
50
|
|
|
|
19
|
if ($self->{word_index}->{$char}) { |
304
|
5
|
|
|
|
|
7
|
return @{$self->{word_index}->{$char}}; |
|
5
|
|
|
|
|
98
|
|
305
|
|
|
|
|
|
|
} else { |
306
|
0
|
|
|
|
|
|
return (); |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub is_multi_phon { |
311
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
312
|
0
|
|
|
|
|
|
return $#{$self->{chars}->{$char}}; |
|
0
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
sub get_multi_phon { |
316
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
317
|
0
|
0
|
|
|
|
|
if ($self->{chars}->{$char}) { |
318
|
0
|
|
|
|
|
|
return @{$self->{chars}->{$char}}; |
|
0
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
} else { |
320
|
0
|
|
|
|
|
|
return undef; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
sub print_phon_char_list { |
325
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
326
|
0
|
|
|
|
|
|
my $all = "a ai an ang ao ba bai ban bang bao bei ben beng bi bian biao bie bin bing bo bu ca cai can cang cao ce cen ceng cha chai chan chang chao che chen cheng chi chong chou chu chua chuai chuan chuang chui chun chuo ci cong cou cu cuan cui cun cuo da dai dan dang dao de dei den deng di dia dian diao die ding diu dong dou du duan dui dun duo e ei en eng er fa fan fang fei fen feng fo fou fu ga gai gan gang gao ge gei gen geng gong gou gu gua guai guan guang gui gun guo ha hai han hang hao he hei hen heng hong hou hu hua huai huan huang hui hun huo ji jia jian jiang jiao jie jin jing jiong jiu ju juan jue jun ka kai kan kang kao ke kei ken keng kong kou ku kua kuai kuan kuang kui kun kuo la lai lan lang lao le lei leng li lia lian liang liao lie lin ling liu lo long lou lu lu: luan lu:e lun luo ma mai man mang mao me mei men meng mi mian miao mie min ming miu mo mou mu na nai nan nang nao ne nei nen neng ng ni nia nian niang niao nie nin ning niu nong nou nu nu: nuan nu:e nuo o ou pa pai pan pang pao pei pen peng pi pian piao pie pin ping po pou pu qi qia qian qiang qiao qie qin qing qiong qiu qu quan que qun ran rang rao re ren reng ri rong rou ru rua ruan rui run ruo sa sai san sang sao se sen seng sha shai shan shang shao she shei shen sheng shi shou shu shua shuai shuan shuang shui shun shuo si song sou su suan sui sun suo ta tai tan tang tao te teng ti tian tiao tie ting tong tou tu tuan tui tun tuo wa wai wan wang wei wen weng wo wu xi xia xian xiang xiao xie xin xing xiong xiu xu xuan xue xun ya yan yang yao ye yi yin ying yo yong you yu yuan yue yun za zai zan zang zao ze zei zen zeng zha zhai zhan zhang zhao zhe zhei zhen zheng zhi zhong zhou zhu zhua zhuai zhuan zhuang zhui zhun zhuo zi zong zou zu zuan zui zun zuo"; |
327
|
0
|
|
|
|
|
|
$all =~ s/u:/v/g; |
328
|
0
|
|
|
|
|
|
my @all_pinyin = split(' ', $all); |
329
|
|
|
|
|
|
|
|
330
|
0
|
|
|
|
|
|
my %phonh; |
331
|
0
|
|
|
|
|
|
foreach my $char (keys %{$self->{chars}}) { |
|
0
|
|
|
|
|
|
|
332
|
0
|
|
|
|
|
|
my $phons = $self->{chars}->{$char}; |
333
|
0
|
|
|
|
|
|
foreach my $phon (@{$phons}) { |
|
0
|
|
|
|
|
|
|
334
|
0
|
0
|
|
|
|
|
if ($phonh{$phon}) { |
335
|
0
|
|
|
|
|
|
push(@{$phonh{$phon}}, $char); |
|
0
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
} else { |
337
|
0
|
|
|
|
|
|
my @p = ($char); |
338
|
0
|
|
|
|
|
|
$phonh{$phon} = \@p; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
} |
342
|
0
|
|
|
|
|
|
foreach my $py (@all_pinyin) { |
343
|
0
|
|
|
|
|
|
for (1 .. 5) { |
344
|
0
|
|
|
|
|
|
my $pytone = $py . $_; |
345
|
0
|
0
|
|
|
|
|
if ($phonh{$pytone}) { |
346
|
0
|
|
|
|
|
|
my @p1 = @{$phonh{$pytone}}; |
|
0
|
|
|
|
|
|
|
347
|
0
|
|
0
|
|
|
|
my @p2 = sort {($self->{char_rate}->{$b} || 0) <=> ($self->{char_rate}->{$a} || 0)} @p1; |
|
0
|
|
0
|
|
|
|
|
348
|
0
|
|
|
|
|
|
print "$pytone: @p2\n"; |
349
|
|
|
|
|
|
|
# foreach (@p2) { |
350
|
|
|
|
|
|
|
# print $_, "(", $self->{char_rate}->{$_} || 0, ") "; |
351
|
|
|
|
|
|
|
# } |
352
|
|
|
|
|
|
|
# print "\n"; |
353
|
|
|
|
|
|
|
} else { |
354
|
0
|
|
|
|
|
|
print "$pytone:\n"; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
1; |
361
|
|
|
|
|
|
|
__END__ |