line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::ZH::CCDICT; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
31
|
|
4
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
26
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
29
|
use 5.006001; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
5
|
use vars qw ($VERSION); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
67
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
$VERSION = '0.05'; |
11
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
7
|
use File::Spec; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
13
|
1
|
|
|
1
|
|
302713
|
use Params::Validate qw(:all); |
|
1
|
|
|
|
|
421955
|
|
|
1
|
|
|
|
|
321
|
|
14
|
1
|
|
|
1
|
|
12
|
use Sub::Name qw( subname ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
53
|
|
15
|
1
|
|
|
1
|
|
741
|
use Lingua::ZH::CCDICT::Romanization; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
27
|
|
16
|
1
|
|
|
1
|
|
607
|
use Lingua::ZH::CCDICT::Romanization::Pinyin; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
50
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my %storage; |
19
|
1
|
|
|
1
|
|
2
|
BEGIN { %storage = map { lc $_ => $_ } ( qw( InMemory XML BerkeleyDB ) ) } |
|
3
|
|
|
|
|
1386
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub new |
23
|
|
|
|
|
|
|
{ |
24
|
1
|
|
|
1
|
1
|
13
|
my $class = shift; |
25
|
|
|
|
|
|
|
my %p = validate_with( params => \@_, |
26
|
|
|
|
|
|
|
spec => |
27
|
|
|
|
|
|
|
{ storage => |
28
|
|
|
|
|
|
|
{ callbacks => |
29
|
|
|
|
|
|
|
{ 'is a valid storage type' => |
30
|
1
|
|
|
1
|
|
15
|
sub { $storage{ lc $_[0] } } }, |
31
|
|
|
|
|
|
|
}, |
32
|
|
|
|
|
|
|
}, |
33
|
1
|
|
|
|
|
53
|
allow_extra => 1, |
34
|
|
|
|
|
|
|
); |
35
|
|
|
|
|
|
|
|
36
|
1
|
|
|
|
|
14
|
my $storage_class = __PACKAGE__ . '::Storage::' . $storage{ lc delete $p{storage} }; |
37
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
|
8
|
eval "use $storage_class"; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
16
|
|
|
1
|
|
|
|
|
98
|
|
39
|
1
|
50
|
|
|
|
5
|
die $@ if $@; |
40
|
|
|
|
|
|
|
|
41
|
1
|
|
|
|
|
8
|
return $storage_class->new(%p); |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my %Ignore = map { $_ => 1 } qw( fUTF8 fCNS11643 fGB fBig5 ); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
my %CCDICTToInternal = |
47
|
|
|
|
|
|
|
( fTotalStrokes => 'stroke_count', |
48
|
|
|
|
|
|
|
fCangjie => 'cangjie', |
49
|
|
|
|
|
|
|
fFourCorner => 'four_corner', |
50
|
|
|
|
|
|
|
); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my %RomanizationToInternal = |
53
|
|
|
|
|
|
|
( fHakka => 'pinjim', |
54
|
|
|
|
|
|
|
fCantonese => 'jyutping', |
55
|
|
|
|
|
|
|
fMandarin => 'pinyin', |
56
|
|
|
|
|
|
|
); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my %RomanizationType = map { $_ => 1 } values %RomanizationToInternal; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub parse_source_file |
61
|
|
|
|
|
|
|
{ |
62
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
63
|
1
|
|
|
|
|
3
|
my $file = shift; |
64
|
1
|
50
|
|
|
|
5
|
my $status_fh = $ENV{CCDICT_VERBOSE} ? \*STDERR : undef; |
65
|
1
|
|
|
|
|
2
|
my $lines_per = 5000; |
66
|
|
|
|
|
|
|
|
67
|
1
|
50
|
|
|
|
6
|
unless ( defined $file ) |
68
|
|
|
|
|
|
|
{ |
69
|
1
|
|
|
|
|
31
|
my $pack_as_file = File::Spec->catfile( split /::/, __PACKAGE__ ); |
70
|
1
|
|
|
|
|
3
|
$pack_as_file .= '.pm'; |
71
|
|
|
|
|
|
|
|
72
|
1
|
|
|
|
|
7
|
(my $dir = $INC{$pack_as_file}) =~ s/\.pm$//; |
73
|
|
|
|
|
|
|
|
74
|
1
|
|
|
|
|
9
|
$file = File::Spec->catfile( $dir, 'Data.pm' ); |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
1
|
50
|
|
|
|
71
|
open my $fh, '<', $file |
78
|
|
|
|
|
|
|
or die "Cannot read $file: $!"; |
79
|
|
|
|
|
|
|
|
80
|
1
|
|
|
|
|
2
|
my $last_char; |
81
|
|
|
|
|
|
|
my %entry; |
82
|
1
|
|
|
|
|
52
|
while (<$fh>) |
83
|
|
|
|
|
|
|
{ |
84
|
298555
|
|
|
|
|
359846
|
chomp; |
85
|
|
|
|
|
|
|
|
86
|
298555
|
100
|
|
|
|
703149
|
next unless substr( $_, 0, 1 ) eq 'U'; |
87
|
|
|
|
|
|
|
|
88
|
298541
|
|
|
|
|
864822
|
my ( $unicode, $type, $data ) = split /\t/, $_, 3; |
89
|
|
|
|
|
|
|
|
90
|
298541
|
|
|
|
|
3137863
|
s/^\s+|\s+$//g for $unicode, $type, $data; |
91
|
|
|
|
|
|
|
|
92
|
298541
|
100
|
|
|
|
904113
|
next if $Ignore{$type}; |
93
|
|
|
|
|
|
|
|
94
|
204180
|
|
|
|
|
919148
|
my ( $codepoint, $homograph ) = $unicode =~ /U\+([\dABCDEF]+)\.(\d)/; |
95
|
|
|
|
|
|
|
|
96
|
204180
|
50
|
|
|
|
471488
|
die "Bad line (line #$.):\n$_\n\n" unless $codepoint; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# not sure how to handle this, to be honest. |
99
|
204180
|
100
|
|
|
|
397277
|
next if $homograph; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# generate a real Unicode character in Perl |
102
|
197284
|
|
|
|
|
448090
|
my $unicode_char = chr( hex($codepoint) ); |
103
|
|
|
|
|
|
|
|
104
|
197284
|
100
|
|
|
|
403431
|
$last_char = $unicode_char unless defined $last_char; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# this relies on the fact that data for each char is grouped |
107
|
|
|
|
|
|
|
# together on consecutive lines in the ccdict.txt file. |
108
|
197284
|
100
|
|
|
|
420691
|
if ( $unicode_char ne $last_char ) |
109
|
|
|
|
|
|
|
{ |
110
|
27493
|
|
|
|
|
316238
|
$self->_add_entry( $last_char, { %entry, unicode => $last_char } ); |
111
|
|
|
|
|
|
|
|
112
|
27493
|
|
|
|
|
129590
|
%entry = (); |
113
|
|
|
|
|
|
|
|
114
|
27493
|
|
|
|
|
43437
|
$last_char = $unicode_char; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
197284
|
100
|
|
|
|
531646
|
if ( exists $CCDICTToInternal{$type} ) |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
118
|
|
|
|
|
|
|
{ |
119
|
79290
|
|
|
|
|
105209
|
my $internal = $CCDICTToInternal{$type}; |
120
|
|
|
|
|
|
|
|
121
|
79290
|
|
|
|
|
185006
|
$entry{$internal} = $data; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
elsif ( exists $RomanizationToInternal{$type} ) |
124
|
|
|
|
|
|
|
{ |
125
|
68974
|
|
|
|
|
98637
|
my $internal = $RomanizationToInternal{$type}; |
126
|
|
|
|
|
|
|
|
127
|
68974
|
|
|
|
|
166604
|
my $class = $self->_romanization_class($internal); |
128
|
|
|
|
|
|
|
|
129
|
68974
|
|
|
|
|
94736
|
my $is_obsolete = 0; |
130
|
68974
|
|
|
|
|
256916
|
foreach my $syl ( split /[\s;]+/, $data ) |
131
|
|
|
|
|
|
|
{ |
132
|
|
|
|
|
|
|
# Deal with various mysterious bits and mistakes |
133
|
134089
|
100
|
|
|
|
271639
|
next if $syl eq '}'; |
134
|
134086
|
100
|
|
|
|
266721
|
next if $syl eq '(also)'; |
135
|
128294
|
100
|
|
|
|
245365
|
next if $syl eq '(old)'; |
136
|
|
|
|
|
|
|
|
137
|
128293
|
100
|
|
|
|
375107
|
next if $syl =~ /^p?\d+$/; |
138
|
|
|
|
|
|
|
|
139
|
128287
|
100
|
|
|
|
283974
|
next if $syl =~ /^\(coll/; |
140
|
|
|
|
|
|
|
|
141
|
128285
|
|
|
|
|
192995
|
$syl =~ s/^{?([^}]+)}$/$1/; |
142
|
128285
|
|
|
|
|
148913
|
$syl =~ s/{[^}]+}?$//; |
143
|
|
|
|
|
|
|
|
144
|
128285
|
100
|
|
|
|
264807
|
if ( $syl eq 'obs' ) |
145
|
|
|
|
|
|
|
{ |
146
|
23
|
|
|
|
|
42
|
$is_obsolete = 1; |
147
|
23
|
|
|
|
|
65
|
next; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
128262
|
|
|
|
|
489025
|
my $romanized = $class->new( syllable => $syl, |
151
|
|
|
|
|
|
|
obsolete => $is_obsolete, |
152
|
|
|
|
|
|
|
); |
153
|
|
|
|
|
|
|
|
154
|
128262
|
100
|
|
|
|
399903
|
next unless $romanized; |
155
|
|
|
|
|
|
|
|
156
|
128140
|
|
|
|
|
151566
|
push @{ $entry{$internal} }, $romanized; |
|
128140
|
|
|
|
|
520078
|
|
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
elsif ( $type eq 'fR/S' ) |
160
|
|
|
|
|
|
|
{ |
161
|
27495
|
|
|
|
|
71459
|
my ( $radical, $index ) = split /\./, $data; |
162
|
|
|
|
|
|
|
|
163
|
27495
|
|
|
|
|
54776
|
$entry{radical} = $radical; |
164
|
27495
|
|
|
|
|
63466
|
$entry{index} = $index; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
elsif ( $type eq 'fEnglish' ) |
167
|
|
|
|
|
|
|
{ |
168
|
41503
|
50
|
|
|
|
211722
|
$entry{english} = |
169
|
21525
|
|
|
|
|
104006
|
[ grep { defined && length } split /\s*\[\d\d?\]\s*/, $data ]; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
else |
172
|
|
|
|
|
|
|
{ |
173
|
0
|
|
|
|
|
0
|
die "Invalid line: $_\n"; |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
197284
|
50
|
66
|
|
|
1125478
|
if ( ! ( $. % $lines_per ) && $status_fh ) |
177
|
|
|
|
|
|
|
{ |
178
|
0
|
|
|
|
|
0
|
print $status_fh "$. lines processed\n"; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
1
|
|
|
|
|
9
|
$self->_add_entry( $last_char, { %entry, unicode => $last_char } ); |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
sub _add_entry |
186
|
|
|
|
|
|
|
{ |
187
|
27494
|
|
|
27494
|
|
42612
|
my $self = shift; |
188
|
27494
|
|
|
|
|
37786
|
my $unicode = shift; |
189
|
27494
|
|
|
|
|
28460
|
my $entry = shift; |
190
|
|
|
|
|
|
|
|
191
|
27494
|
100
|
|
|
|
64282
|
return unless defined $entry->{radical}; |
192
|
|
|
|
|
|
|
|
193
|
27484
|
|
|
|
|
95436
|
$self->_real_add_entry( $unicode, $entry ); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub _romanization_class |
197
|
|
|
|
|
|
|
{ |
198
|
|
|
|
|
|
|
return |
199
|
68974
|
100
|
|
68974
|
|
181562
|
( $_[1] eq 'pinyin' ? |
200
|
|
|
|
|
|
|
'Lingua::ZH::CCDICT::Romanization::Pinyin' : |
201
|
|
|
|
|
|
|
'Lingua::ZH::CCDICT::Romanization' |
202
|
|
|
|
|
|
|
); |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub _is_romanization_type |
206
|
|
|
|
|
|
|
{ |
207
|
8
|
|
|
8
|
|
41
|
return $RomanizationType{ $_[1] }; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
sub _InternalTypes |
211
|
|
|
|
|
|
|
{ |
212
|
1
|
|
|
1
|
|
5
|
return values %CCDICTToInternal, values %RomanizationToInternal, 'index', 'radical'; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Some of these may be overridden in subclasses, but they provide an |
216
|
|
|
|
|
|
|
# easy default |
217
|
|
|
|
|
|
|
foreach my $type ( __PACKAGE__->_InternalTypes() ) |
218
|
|
|
|
|
|
|
{ |
219
|
|
|
|
|
|
|
my $sub = |
220
|
|
|
|
|
|
|
__PACKAGE__->_is_romanization_type($type) |
221
|
1
|
|
|
1
|
1
|
656
|
? sub { my $s = shift; |
|
|
|
|
1
|
1
|
|
|
|
|
|
|
1
|
1
|
|
|
222
|
1
|
|
|
|
|
3
|
$s->_match( $type => map { lc } @_ ) } |
|
1
|
|
|
|
|
9
|
|
223
|
2
|
|
|
2
|
1
|
838
|
: sub { my $s = shift; |
|
|
|
|
2
|
1
|
|
|
|
|
|
|
2
|
1
|
|
|
|
|
|
|
2
|
1
|
|
|
|
|
|
|
2
|
1
|
|
|
224
|
2
|
|
|
|
|
7
|
$s->_match( $type => @_ ) }; |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
my $sub_name = "match_$type"; |
227
|
1
|
|
|
1
|
|
8
|
no strict 'refs'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
74
|
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
*{$sub_name} = subname $sub_name => $sub; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
1; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
__END__ |