File Coverage

blib/lib/Data/Kanji/Kanjidic.pm
Criterion Covered Total %
statement 107 177 60.4
branch 37 76 48.6
condition 3 6 50.0
subroutine 12 19 63.1
pod 7 11 63.6
total 166 289 57.4


line stmt bran cond sub pod time code
1             # See Kanjidic.pod for documentation
2              
3             package Data::Kanji::Kanjidic;
4             require Exporter;
5 3     3   213434 use warnings;
  3         26  
  3         103  
6 3     3   17 use strict;
  3         5  
  3         269  
7             our @ISA = qw(Exporter);
8             our @EXPORT_OK = qw/
9             %codes
10             %has_dupes
11             grade
12             grade_stroke_order
13             kanji_dictionary_order
14             kanjidic_order
15             parse_entry
16             parse_kanjidic
17             stroke_radical_jis_order
18             /;
19              
20             our %EXPORT_TAGS = (
21             all => \@EXPORT_OK,
22             );
23             our $VERSION = '0.17';
24 3     3   21 use strict;
  3         7  
  3         116  
25 3     3   18 use warnings;
  3         13  
  3         105  
26 3     3   1828 use Encode;
  3         31840  
  3         218  
27 3     3   1187 use utf8;
  3         30  
  3         20  
28 3     3   87 use Carp;
  3         5  
  3         6316  
29              
30             our $AUTHOR;
31              
32             # Parse one string from kanjidic and return it in an associative array.
33              
34             our %codes = (
35             'W' => 'Korean pronunciation',
36             'Y' => 'Pinyin pronunciation',
37             'B' => 'Bushu (radical as defined by the Nelson kanji dictionary)',
38             'C' => 'Classic radical (the usual radical, where this is different from the Nelson radical)',
39             'U' => 'Unicode code point as a hexadecimal number',
40             'G' => 'Year of elementary school this kanji is taught',
41             'Q' => 'Four-corner code',
42             'S' => 'Stroke count',
43             'P' => 'SKIP code',
44             'J' => 'Japanese proficiency test level',
45             'N' => 'Nelson code from original Nelson dictionary',
46             'V' => 'Nelson code from the "New Nelson" dictionary',
47             'L' => 'Code from "Remembering the Kanji" by James Heisig',
48             'O' => 'The numbers used in P.G. O\'Neill\'s "Japanese Names"',
49             'K' => 'The index in the Gakken Kanji Dictionary (A New Dictionary of Kanji Usage)',
50             'E' => 'The numbers used in Kenneth Henshall\'s kanji book',
51             'I' => 'The Spahn-Hadamitzky book number',
52             'IN' => 'The Spahn-Hadamitzky kanji-kana book number',
53              
54             'MP' => 'Morohashi volume/page',
55             'MN' => 'Morohashi index number',
56             'H' => 'Number in Jack Halpern dictionary',
57             'F' => 'Frequency of kanji',
58              
59             'X' => 'Cross reference',
60             'DA' => 'The index numbers used in the 2011 edition of the Kanji & Kana book, by Spahn & Hadamitzky',
61             'DB' => 'Japanese for Busy People textbook numbers',
62             'DC' => 'The index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley',
63             'DF' => '"Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki',
64             'DG' => 'The index numbers used in the "Kodansha Compact Kanji Guide"',
65             'DH' => 'The index numbers used in the 3rd edition of "A Guide To Reading and Writing Japanese" edited by Kenneth Hensall et al',
66             'DJ' => 'The index numbers used in the "Kanji in Context" by Nishiguchi and Kono',
67             'DK' => 'The index numbers used by Jack Halpern in his Kanji Learners Dictionary',
68             'DL' => 'The index numbers used in the 2013 edition of Halpern\'s Kanji Learners Dictionary',
69             'DM' => 'The index numbers from the French-language version of "Remembering the kanji"',
70             'DN' => 'The index number used in "Remembering The Kanji, 6th Edition" by James Heisig',
71             'DP' => 'The index numbers used by Jack Halpern in his Kodansha Kanji Dictionary (2013), which is the revised version of the "New Japanese-English Kanji Dictionary" of 1990',
72             'DO' => 'The index numbers used in P.G. O\'Neill\'s Essential Kanji',
73             'DR' => 'The codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bonjinsha)',
74             'DS' => 'The index numbers used in the early editions of "A Guide To Reading and Writing Japanese" edited by Florence Sakade',
75             'DT' => 'The index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask',
76             'XJ' => 'Cross-reference',
77             'XO' => 'Cross-reference',
78             'XH' => 'Cross-reference',
79             'XI' => 'Cross-reference',
80             'XN' => 'Nelson cross-reference',
81             'XDR' => 'De Roo cross-reference',
82             'T' => 'SPECIAL',
83             'ZPP' => 'SKIP misclassification by position',
84             'ZRP' => 'SKIP classification disagreement',
85             'ZSP' => 'SKIP misclassification by stroke count',
86             'ZBP' => 'SKIP misclassification by both stroke count and position',
87             );
88              
89             # Fields which are allowed to have duplicates.
90              
91             our @dupes = qw/
92             DA
93             O
94             Q
95             S
96             V
97             W
98             XDR
99             XH
100             XJ
101             XN
102             Y
103             ZBP
104             ZPP
105             ZRP
106             ZSP
107             /;
108              
109             our %has_dupes;
110              
111             @has_dupes{@dupes} = @dupes;
112              
113             sub parse_entry
114             {
115 57     57 1 126 my ($input) = @_;
116              
117             # Remove the English entries first.
118              
119 57         221 my @english;
120             my @onyomi;
121 57         0 my @kunyomi;
122 57         0 my @nanori;
123              
124             # Return value
125              
126 57         0 my %values;
127              
128             # The English-language "meanings" are between { and }.
129              
130 57         493 while ($input =~ s/\{([^\}]+)\}//) {
131 165         398 my $meaning = $1;
132              
133             # Mark as a "kokuji".
134              
135 165 50       319 if ($meaning =~ m/\(kokuji\)/) {
136 0         0 $values{"kokuji"} = 1;
137             }
138             else {
139 165         966 push (@english, $meaning);
140             }
141             }
142              
143 57         539 (my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
144 57         121 $values{kanji} = $kanji;
145             # Flag to detect the start of nanori readings.
146 57         81 my $in_nanori;
147 57         107 foreach my $entry (@entries) {
148 1722         2095 my $found;
149 1722 100       4348 if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
150 1431 100       2848 if ($entry eq 'T1') {
151 27         43 $in_nanori = 1;
152 27         48 next;
153             }
154 1404         2202 my $field = $1;
155 1404         1961 my $value = $2;
156 1404 50       2669 if ($codes{$field}) {
157 1404 100       2270 if ($has_dupes{$field}) {
158 432         546 push @{$values{$field}}, $value;
  432         1109  
159             }
160             else {
161 972 50       1646 if (!$values{$field}) {
162 972         2022 $values{$field} = $2;
163             }
164             else {
165 0         0 die "duplicate values for key $field.\n";
166             }
167             }
168 1404         2124 $found = 1;
169             }
170             else {
171             # Unknown field is ignored.
172             }
173              
174             # Kanjidic contains hiragana, katakana, ".", "-" and "ー" (Japanese
175             # "chouon") characters.
176             }
177             else {
178 291 100       450 if ($in_nanori) {
179 87         166 push @nanori, $entry;
180 87         111 $found = 1;
181             }
182             else {
183 204 100       1128 if ($entry =~ m/^([あ-んー\.-]+)$/) {
    50          
184 129         227 push @kunyomi, $entry;
185 129         180 $found = 1;
186             }
187             elsif ($entry =~ m/^([ア-ンー\.-]+)$/) {
188 75         154 push @onyomi, $entry;
189 75         103 $found = 1;
190             }
191             }
192             }
193 1695 50 33     3290 if ($AUTHOR && ! $found) {
194 0         0 die "kanjidic:$.: Mystery entry \"$entry\"\n";
195             }
196             }
197 57         80 my %morohashi;
198 57 100       116 if ($values{MP}) {
199 54         292 @morohashi{qw/volume page/} = ($values{MP} =~ /(\d+)\.(\d+)/);
200             }
201 57 100       142 if ($values{MN}) {
202 54         125 $morohashi{index} = $values{MN};
203             }
204 57 100 66     156 if ($values{MN} || $values{MP}) {
205 54         113 $values{morohashi} = \%morohashi;
206             }
207 57 50       127 if (@english) {
208 57         126 $values{"english"} = \@english;
209             }
210 57 50       109 if (@onyomi) {
211 57         100 $values{"onyomi"} = \@onyomi;
212             }
213 57 50       108 if (@kunyomi) {
214 57         99 $values{"kunyomi"} = \@kunyomi;
215             }
216 57 100       109 if (@nanori) {
217 27         41 $values{"nanori"} = \@nanori;
218             }
219              
220             # Kanjidic uses the bogus radical numbers of Nelson rather than
221             # the correct ones.
222              
223 57         134 $values{radical} = $values{B};
224 57 100       115 $values{radical} = $values{C} if $values{C};
225              
226             # Just in case there is a problem in kanjidic, this will tell us
227             # the line where the problem was:
228              
229 57         132 $values{"line_number"} = $.;
230 57         1752 return %values;
231             }
232              
233             # Order of kanji in a kanji dictionary.
234              
235             sub kanji_dictionary_order
236             {
237 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
238             # print "$a, $b,\n";
239 0         0 my $valuea = $kanjidic_ref->{$a};
240 0         0 my $valueb = $kanjidic_ref->{$b};
241 0         0 my $radval = $$valuea{radical} - $$valueb{radical};
242 0 0       0 return $radval if $radval;
243 0         0 my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
244 0 0       0 return $strokeval if $strokeval;
245 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
246 0 0       0 return $jisval if $jisval;
247 0         0 return 0;
248             }
249              
250             # Order of kanji in a kanji dictionary.
251              
252             sub stroke_radical_jis_order
253             {
254 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
255             # print "$a, $b,\n";
256 0         0 my $valuea = $kanjidic_ref->{$a};
257 0         0 my $valueb = $kanjidic_ref->{$b};
258 0         0 my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
259 0 0       0 return $strokeval if $strokeval;
260 0         0 my $radval = $$valuea{radical} - $$valueb{radical};
261 0 0       0 return $radval if $radval;
262 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
263 0 0       0 return $jisval if $jisval;
264             # They must be the same kanji.
265 0         0 return 0;
266             }
267              
268             # Comparison function to sort by grade and then stroke order, then JIS
269             # code value if those are both the same.
270              
271             sub grade_stroke_order
272             {
273 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
274             # print "$a, $b,\n";
275 0         0 my $valuea = $kanjidic_ref->{$a};
276 0         0 my $valueb = $kanjidic_ref->{$b};
277 0 0       0 if ($valuea->{G}) {
    0          
278 0 0       0 if ($valueb->{G}) {
279 0         0 my $gradeval = $$valuea{G} - $$valueb{G};
280 0 0       0 return $gradeval if $gradeval;
281             }
282             else {
283 0         0 return -1;
284             }
285             }
286             elsif ($valueb->{G}) {
287 0         0 return 1;
288             }
289 0         0 my $strokeval = $$valuea{S} - $$valueb{S};
290 0 0       0 return $strokeval if $strokeval;
291 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
292 0 0       0 return $jisval if $jisval;
293 0         0 return 0;
294             }
295              
296             sub parse_kanjidic
297             {
298 3     3 1 790 my ($file_name) = @_;
299 3 50       13 if (! $file_name) {
300 0         0 croak "Please supply a file name";
301             }
302 3         8 my $KANJIDIC;
303              
304             my %kanjidic;
305              
306 3 50       80 if (! -f $file_name) {
307 0         0 croak "No such file '$file_name'";
308             }
309              
310 3 50   3   46 open $KANJIDIC, "<:encoding(euc-jp)", $file_name
  3         7  
  3         24  
  3         108  
311             or die "Could not open '$file_name': $!";
312 3         26936 while (<$KANJIDIC>) {
313             # Skip the comment line.
314 60 100       186 next if ( m/^\#/ );
315 57         129 my %values = parse_entry ($_);
316 57         432 my @skip = split ("-", $values{P});
317 57         141 $values{skip} = \@skip;
318 57         593 $kanjidic{$values{kanji}} = \%values;
319             }
320 3         54 close $KANJIDIC;
321 3         32 return \%kanjidic;
322             }
323              
324             sub kanjidic_order
325             {
326 1     1 1 7 my ($kanjidic_ref) = @_;
327             my @kanjidic_order =
328             sort {
329 1         12 hex ($kanjidic_ref->{$a}->{jiscode}) <=>
330             hex ($kanjidic_ref->{$b}->{jiscode})
331 60         133 }
332             keys %$kanjidic_ref;
333 1         4 my $count = 0;
334 1         3 for my $kanji (@kanjidic_order) {
335 19         32 $kanjidic_ref->{$kanji}->{kanji_id} = $count;
336 19         24 $count++;
337             }
338 1         7 return @kanjidic_order;
339             }
340              
341             sub new
342             {
343 0     0 0 0 my ($package, $file) = @_;
344 0         0 my $kanjidic = {};
345 0         0 $kanjidic->{file} = $file;
346 0         0 undef $file;
347 0         0 $kanjidic->{data} = parse_kanjidic ($kanjidic->{file});
348 0         0 bless $kanjidic;
349 0         0 return $kanjidic;
350             }
351              
352             # Make indices going from each type of key back to the data.
353              
354             sub make_indices
355             {
356 0     0 0 0 my ($kanjidic) = @_;
357 0         0 my %indices;
358 0         0 my $data = $kanjidic->{data};
359 0         0 for my $kanji (keys %$data) {
360 0         0 my $kdata = $data->{$kanji};
361 0         0 for my $key (keys %$kdata) {
362 0         0 $indices{$key}{$kdata->{$key}} = $kdata;
363             }
364             }
365 0         0 $kanjidic->{indices} = \%indices;
366             }
367              
368             sub find_key
369             {
370 0     0 0 0 my ($kanjidic, $key, $value) = @_;
371 0 0       0 if (! $kanjidic->{indices}) {
372 0         0 make_indices ($kanjidic);
373             }
374 0         0 my $index = $kanjidic->{indices}{$key};
375 0         0 return $index->{$value};
376             }
377              
378             sub kanji_to_order
379             {
380 0     0 0 0 my ($kanjidic, $kanji) = @_;
381 0 0       0 if (! $kanjidic->{order}) {
382 0         0 my @order = kanjidic_order ($kanjidic->{data});
383 0         0 my %index;
384 0         0 my $count = 0;
385 0         0 for my $k (@order) {
386 0         0 $index{$k} = $count;
387 0         0 $count++;
388             }
389 0         0 $kanjidic->{order} = \@order;
390 0         0 $kanjidic->{index} = \%index;
391             }
392 0         0 return $kanjidic->{index}->{$kanji};
393             }
394              
395             sub grade
396             {
397 1     1 1 9 my ($kanjidic, $grade) = @_;
398 1         2 my @grade_kanjis;
399 1         7 for my $k (keys %$kanjidic) {
400 19         29 my $kgrade = $kanjidic->{$k}->{G};
401 19 100       35 next unless $kgrade;
402 15 100       31 push @grade_kanjis, $k if $kgrade == $grade;
403             }
404 1         4 return \@grade_kanjis;
405             }
406              
407             1;
408