line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package eGuideDog::Dict::Cantonese; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
64923
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
86
|
|
4
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
61
|
|
5
|
2
|
|
|
2
|
|
1564
|
use utf8; |
|
2
|
|
|
|
|
17
|
|
|
2
|
|
|
|
|
10
|
|
6
|
2
|
|
|
2
|
|
1974
|
use Encode::CNMap; |
|
2
|
|
|
|
|
34520
|
|
|
2
|
|
|
|
|
318
|
|
7
|
2
|
|
|
2
|
|
2517
|
use Storable; |
|
2
|
|
|
|
|
7910
|
|
|
2
|
|
|
|
|
4107
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require Exporter; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export |
14
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead. |
15
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# This allows declaration use eGuideDog::Dict::Cantonese ':all'; |
18
|
|
|
|
|
|
|
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
19
|
|
|
|
|
|
|
# will save memory. |
20
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
) ] ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @EXPORT = qw( |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
our $VERSION = '0.41'; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Preloaded methods go here. |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub new() { |
36
|
1
|
|
|
1
|
1
|
15
|
my $self = {}; |
37
|
1
|
|
|
|
|
4
|
$self->{jyutping} = {}; # The most probably phonetic symbol |
38
|
1
|
|
|
|
|
5
|
$self->{chars} = {}; # all phonetic symbols (array ref) |
39
|
1
|
|
|
|
|
3
|
$self->{words} = {}; # word phonetic symbols (array ref) |
40
|
1
|
|
|
|
|
3
|
$self->{word_index} = {}; # the first char to words (array ref) |
41
|
1
|
|
|
|
|
6
|
bless $self, __PACKAGE__; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# load zhy_list |
44
|
1
|
|
|
|
|
4
|
my $dir = __FILE__; |
45
|
1
|
|
|
|
|
10
|
$dir =~ s/[.]pm$//; |
46
|
|
|
|
|
|
|
|
47
|
1
|
50
|
|
|
|
56
|
if(-e "$dir/Cantonese.dict") { |
48
|
1
|
|
|
|
|
10
|
my $dict = retrieve("$dir/Cantonese.dict"); |
49
|
1
|
|
|
|
|
104962
|
$self->{jyutping} = $dict->{jyutping}; |
50
|
1
|
|
|
|
|
5
|
$self->{chars} = $dict->{chars}; |
51
|
1
|
|
|
|
|
5
|
$self->{words} = $dict->{words}; |
52
|
1
|
|
|
|
|
5
|
$self->{word_index} = $dict->{word_index}; |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
1
|
|
|
|
|
11
|
return $self; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub update_dict { |
59
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
60
|
|
|
|
|
|
|
|
61
|
0
|
|
|
|
|
0
|
$self->{jyutping} = {}; |
62
|
0
|
|
|
|
|
0
|
$self->{chars} = {}; |
63
|
0
|
|
|
|
|
0
|
$self->{words} = {}; |
64
|
0
|
|
|
|
|
0
|
$self->{word_index} = {}; |
65
|
|
|
|
|
|
|
|
66
|
0
|
|
|
|
|
0
|
$self->import_unihan("Cantonese.txt"); |
67
|
0
|
|
|
|
|
0
|
$self->import_zhy_list("zhy_list"); |
68
|
|
|
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
my $dict = {jyutping => $self->{jyutping}, |
70
|
|
|
|
|
|
|
chars => $self->{chars}, |
71
|
|
|
|
|
|
|
words => $self->{words}, |
72
|
|
|
|
|
|
|
word_index => $self->{word_index}, |
73
|
|
|
|
|
|
|
}; |
74
|
0
|
|
|
|
|
0
|
store($dict, "Cantonese.dict"); |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub import_unihan { |
78
|
0
|
|
|
0
|
0
|
0
|
my ($self, $cantonese_txt) = @_; |
79
|
0
|
|
|
|
|
0
|
open(DATA_FILE, '<', $cantonese_txt); |
80
|
0
|
|
|
|
|
0
|
while() { |
81
|
0
|
|
|
|
|
0
|
chomp; |
82
|
0
|
|
|
|
|
0
|
my @line = split(/\s+/, $_); |
83
|
0
|
|
|
|
|
0
|
my $char = chr(hex($line[0])); |
84
|
0
|
|
|
|
|
0
|
my @phons = @line[1 .. $#line]; |
85
|
0
|
0
|
|
|
|
0
|
if (not defined $self->{chars}->{$char}) { |
86
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = \@phons; |
87
|
|
|
|
|
|
|
} |
88
|
0
|
|
|
|
|
0
|
my $char_simp = utf8_to_simputf8($char); |
89
|
0
|
0
|
|
|
|
0
|
if ($char_simp !~ /[?]/) { |
90
|
0
|
0
|
|
|
|
0
|
if (!defined $self->{chars}->{$char_simp}) { |
91
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char_simp} = \@phons; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
} |
94
|
0
|
|
|
|
|
0
|
my $char_trad = utf8_to_tradutf8($char); |
95
|
0
|
0
|
|
|
|
0
|
if ($char_trad !~ /[?]/) { |
96
|
0
|
0
|
|
|
|
0
|
if (!defined $self->{chars}->{$char_trad}) { |
97
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char_trad} = \@phons; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
} |
101
|
0
|
|
|
|
|
0
|
close(DATA_FILE); |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub add_symbol { |
105
|
0
|
|
|
0
|
0
|
0
|
my ($self, $char, $symbol) = @_; |
106
|
|
|
|
|
|
|
|
107
|
0
|
0
|
|
|
|
0
|
if (not $self->{chars}->{$char}) { |
108
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = [$symbol]; |
109
|
0
|
|
|
|
|
0
|
return 1; |
110
|
|
|
|
|
|
|
} else { |
111
|
0
|
|
|
|
|
0
|
foreach (@{$self->{chars}->{$char}}) { |
|
0
|
|
|
|
|
0
|
|
112
|
0
|
0
|
|
|
|
0
|
if ($symbol eq $_) { |
113
|
0
|
|
|
|
|
0
|
return 0; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
} |
116
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = [@{$self->{chars}->{$char}}, $symbol]; |
|
0
|
|
|
|
|
0
|
|
117
|
0
|
|
|
|
|
0
|
return 1; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub import_zhy_list { |
122
|
0
|
|
|
0
|
0
|
0
|
my ($self, $zhy_list) = @_; |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
0
|
open(ZHY_LIST, '<:utf8', $zhy_list); |
125
|
0
|
|
|
|
|
0
|
while (my $line = ) { |
126
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^(.)\s([^\s]*)\s$/) { |
|
|
0
|
|
|
|
|
|
127
|
0
|
0
|
0
|
|
|
0
|
if ($1 && $2) { |
128
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{$1} = $2; |
129
|
0
|
|
|
|
|
0
|
$self->add_symbol($1, $2); |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
} elsif ($line =~ /^[(]([^)]*)[)]\s([^\s]*)\s$/) { |
132
|
0
|
|
|
|
|
0
|
my @chars = split(/ /, $1); |
133
|
0
|
|
|
|
|
0
|
my @symbols = split(/[|]/, $2); |
134
|
0
|
0
|
|
|
|
0
|
if ($#chars != $#symbols) { |
135
|
0
|
|
|
|
|
0
|
warn "Dictionary error:" . "@chars" . "-" . "@symbols"; |
136
|
0
|
|
|
|
|
0
|
next; |
137
|
|
|
|
|
|
|
} |
138
|
0
|
|
|
|
|
0
|
my $word = join("", @chars); |
139
|
0
|
0
|
|
|
|
0
|
if ($self->{word_index}->{$chars[0]}) { |
140
|
0
|
|
|
|
|
0
|
push(@{$self->{word_index}->{$chars[0]}}, $word); |
|
0
|
|
|
|
|
0
|
|
141
|
|
|
|
|
|
|
} else { |
142
|
0
|
|
|
|
|
0
|
$self->{word_index}->{$chars[0]} = [$word]; |
143
|
|
|
|
|
|
|
} |
144
|
0
|
|
|
|
|
0
|
$self->{words}->{$word} = \@symbols; |
145
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i <= $#chars; $i++) { |
146
|
0
|
|
|
|
|
0
|
$self->add_symbol($chars[$i], $symbols[$i]); |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
0
|
|
|
|
|
0
|
close(ZHY_LIST); |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# add numbers |
153
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"0"} = "ling4"; |
154
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"1"} = "jat1"; |
155
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"2"} = "ji6"; |
156
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"3"} = "saam1"; |
157
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"4"} = "sei3"; |
158
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"5"} = "ng5"; |
159
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"6"} = "luk6"; |
160
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"7"} = "cat1"; |
161
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"8"} = "baat3"; |
162
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"9"} = "gau2"; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub get_jyutping { |
166
|
3
|
|
|
3
|
1
|
8
|
my ($self, $str) = @_; |
167
|
|
|
|
|
|
|
|
168
|
3
|
50
|
|
|
|
21
|
if (not utf8::is_utf8($str)) { |
|
|
50
|
|
|
|
|
|
169
|
0
|
0
|
|
|
|
0
|
if (not utf8::decode($str)) { |
170
|
0
|
|
|
|
|
0
|
warn "$str is not in utf8 encoding."; |
171
|
0
|
|
|
|
|
0
|
return undef; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
} elsif (not $str) { |
174
|
0
|
|
|
|
|
0
|
return undef; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
3
|
100
|
|
|
|
9
|
if (wantarray) { |
178
|
1
|
|
|
|
|
2
|
my @jyutping; |
179
|
1
|
|
|
|
|
6
|
for (my $i = 0; $i < length($str); $i++) { |
180
|
2
|
|
|
|
|
8
|
my $char = substr($str, $i, 1); |
181
|
2
|
|
|
|
|
6
|
my @words = $self->get_words($char); |
182
|
2
|
|
|
|
|
3
|
my $longest_word = ''; |
183
|
2
|
|
|
|
|
5
|
foreach my $word (@words) { |
184
|
0
|
0
|
|
|
|
0
|
if (index($str, $word) == 0) { |
185
|
0
|
0
|
|
|
|
0
|
if (length($word) > length($longest_word)) { |
186
|
0
|
|
|
|
|
0
|
$longest_word = $word; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
} |
190
|
2
|
50
|
|
|
|
6
|
if ($longest_word) { |
191
|
0
|
|
|
|
|
0
|
push(@jyutping, @{$self->{words}->{$longest_word}}); |
|
0
|
|
|
|
|
0
|
|
192
|
0
|
|
|
|
|
0
|
$i += $#{$self->{words}->{$longest_word}}; |
|
0
|
|
|
|
|
0
|
|
193
|
|
|
|
|
|
|
} else { |
194
|
2
|
|
|
|
|
12
|
push(@jyutping, $self->{jyutping}->{$char}); |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
1
|
|
|
|
|
17
|
return @jyutping; |
198
|
|
|
|
|
|
|
} else { |
199
|
2
|
|
|
|
|
11
|
my $char = substr($str, 0, 1); |
200
|
2
|
|
|
|
|
7
|
my @words = $self->get_words($char); |
201
|
2
|
|
|
|
|
6
|
my $longest_word = ''; |
202
|
2
|
|
|
|
|
4
|
foreach my $word (@words) { |
203
|
52
|
100
|
|
|
|
101
|
if (index($str, $word) == 0) { |
204
|
1
|
50
|
|
|
|
6
|
if (length($word) > length($longest_word)) { |
205
|
1
|
|
|
|
|
4
|
$longest_word = $word; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
2
|
100
|
|
|
|
7
|
if ($longest_word) { |
210
|
1
|
|
|
|
|
9
|
return $self->{words}->{$longest_word}->[0]; |
211
|
|
|
|
|
|
|
} else { |
212
|
1
|
|
|
|
|
10
|
return $self->{jyutping}->{$char}; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub get_words { |
218
|
5
|
|
|
5
|
1
|
13
|
my ($self, $char) = @_; |
219
|
|
|
|
|
|
|
|
220
|
5
|
100
|
|
|
|
18
|
if ($self->{word_index}->{$char}) { |
221
|
3
|
|
|
|
|
6
|
return @{$self->{word_index}->{$char}}; |
|
3
|
|
|
|
|
30
|
|
222
|
|
|
|
|
|
|
} else { |
223
|
2
|
|
|
|
|
6
|
return (); |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub is_multi_phon { |
228
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
229
|
0
|
|
|
|
|
|
return $#{$self->{chars}->{$char}}; |
|
0
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub get_multi_phon { |
233
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
234
|
0
|
0
|
|
|
|
|
if ($self->{chars}->{$char}) { |
235
|
0
|
|
|
|
|
|
return @{$self->{chars}->{$char}}; |
|
0
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
} else { |
237
|
0
|
|
|
|
|
|
return undef; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
1; |
242
|
|
|
|
|
|
|
__END__ |