File Coverage

blib/lib/Data/Kanji/Kanjidic.pm

Criterion	Covered	Total	%
statement	107	177	60.4
branch	37	76	48.6
condition	3	6	50.0
subroutine	12	19	63.1
pod	7	11	63.6
total	166	289	57.4

line	stmt	bran	cond	sub	pod	time	code
1							# See Kanjidic.pod for documentation
2
3							package Data::Kanji::Kanjidic;
4							require Exporter;
5	3			3		213434	use warnings;
	3					26
	3					103
6	3			3		17	use strict;
	3					5
	3					269
7							our @ISA = qw(Exporter);
8							our @EXPORT_OK = qw/
9							%codes
10							%has_dupes
11							grade
12							grade_stroke_order
13							kanji_dictionary_order
14							kanjidic_order
15							parse_entry
16							parse_kanjidic
17							stroke_radical_jis_order
18							/;
19
20							our %EXPORT_TAGS = (
21							all => \@EXPORT_OK,
22							);
23							our $VERSION = '0.17';
24	3			3		21	use strict;
	3					7
	3					116
25	3			3		18	use warnings;
	3					13
	3					105
26	3			3		1828	use Encode;
	3					31840
	3					218
27	3			3		1187	use utf8;
	3					30
	3					20
28	3			3		87	use Carp;
	3					5
	3					6316
29
30							our $AUTHOR;
31
32							# Parse one string from kanjidic and return it in an associative array.
33
34							our %codes = (
35							'W' => 'Korean pronunciation',
36							'Y' => 'Pinyin pronunciation',
37							'B' => 'Bushu (radical as defined by the Nelson kanji dictionary)',
38							'C' => 'Classic radical (the usual radical, where this is different from the Nelson radical)',
39							'U' => 'Unicode code point as a hexadecimal number',
40							'G' => 'Year of elementary school this kanji is taught',
41							'Q' => 'Four-corner code',
42							'S' => 'Stroke count',
43							'P' => 'SKIP code',
44							'J' => 'Japanese proficiency test level',
45							'N' => 'Nelson code from original Nelson dictionary',
46							'V' => 'Nelson code from the "New Nelson" dictionary',
47							'L' => 'Code from "Remembering the Kanji" by James Heisig',
48							'O' => 'The numbers used in P.G. O\'Neill\'s "Japanese Names"',
49							'K' => 'The index in the Gakken Kanji Dictionary (A New Dictionary of Kanji Usage)',
50							'E' => 'The numbers used in Kenneth Henshall\'s kanji book',
51							'I' => 'The Spahn-Hadamitzky book number',
52							'IN' => 'The Spahn-Hadamitzky kanji-kana book number',
53
54							'MP' => 'Morohashi volume/page',
55							'MN' => 'Morohashi index number',
56							'H' => 'Number in Jack Halpern dictionary',
57							'F' => 'Frequency of kanji',
58
59							'X' => 'Cross reference',
60							'DA' => 'The index numbers used in the 2011 edition of the Kanji & Kana book, by Spahn & Hadamitzky',
61							'DB' => 'Japanese for Busy People textbook numbers',
62							'DC' => 'The index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley',
63							'DF' => '"Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki',
64							'DG' => 'The index numbers used in the "Kodansha Compact Kanji Guide"',
65							'DH' => 'The index numbers used in the 3rd edition of "A Guide To Reading and Writing Japanese" edited by Kenneth Hensall et al',
66							'DJ' => 'The index numbers used in the "Kanji in Context" by Nishiguchi and Kono',
67							'DK' => 'The index numbers used by Jack Halpern in his Kanji Learners Dictionary',
68							'DL' => 'The index numbers used in the 2013 edition of Halpern\'s Kanji Learners Dictionary',
69							'DM' => 'The index numbers from the French-language version of "Remembering the kanji"',
70							'DN' => 'The index number used in "Remembering The Kanji, 6th Edition" by James Heisig',
71							'DP' => 'The index numbers used by Jack Halpern in his Kodansha Kanji Dictionary (2013), which is the revised version of the "New Japanese-English Kanji Dictionary" of 1990',
72							'DO' => 'The index numbers used in P.G. O\'Neill\'s Essential Kanji',
73							'DR' => 'The codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bonjinsha)',
74							'DS' => 'The index numbers used in the early editions of "A Guide To Reading and Writing Japanese" edited by Florence Sakade',
75							'DT' => 'The index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask',
76							'XJ' => 'Cross-reference',
77							'XO' => 'Cross-reference',
78							'XH' => 'Cross-reference',
79							'XI' => 'Cross-reference',
80							'XN' => 'Nelson cross-reference',
81							'XDR' => 'De Roo cross-reference',
82							'T' => 'SPECIAL',
83							'ZPP' => 'SKIP misclassification by position',
84							'ZRP' => 'SKIP classification disagreement',
85							'ZSP' => 'SKIP misclassification by stroke count',
86							'ZBP' => 'SKIP misclassification by both stroke count and position',
87							);
88
89							# Fields which are allowed to have duplicates.
90
91							our @dupes = qw/
92							DA
93							O
94							Q
95							S
96							V
97							W
98							XDR
99							XH
100							XJ
101							XN
102							Y
103							ZBP
104							ZPP
105							ZRP
106							ZSP
107							/;
108
109							our %has_dupes;
110
111							@has_dupes{@dupes} = @dupes;
112
113							sub parse_entry
114							{
115	57			57	1	126	my ($input) = @_;
116
117							# Remove the English entries first.
118
119	57					221	my @english;
120							my @onyomi;
121	57					0	my @kunyomi;
122	57					0	my @nanori;
123
124							# Return value
125
126	57					0	my %values;
127
128							# The English-language "meanings" are between { and }.
129
130	57					493	while ($input =~ s/\{([^\}]+)\}//) {
131	165					398	my $meaning = $1;
132
133							# Mark as a "kokuji".
134
135	165	50				319	if ($meaning =~ m/$kokuji$/) {
136	0					0	$values{"kokuji"} = 1;
137							}
138							else {
139	165					966	push (@english, $meaning);
140							}
141							}
142
143	57					539	(my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
144	57					121	$values{kanji} = $kanji;
145							# Flag to detect the start of nanori readings.
146	57					81	my $in_nanori;
147	57					107	foreach my $entry (@entries) {
148	1722					2095	my $found;
149	1722	100				4348	if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
150	1431	100				2848	if ($entry eq 'T1') {
151	27					43	$in_nanori = 1;
152	27					48	next;
153							}
154	1404					2202	my $field = $1;
155	1404					1961	my $value = $2;
156	1404	50				2669	if ($codes{$field}) {
157	1404	100				2270	if ($has_dupes{$field}) {
158	432					546	push @{$values{$field}}, $value;
	432					1109
159							}
160							else {
161	972	50				1646	if (!$values{$field}) {
162	972					2022	$values{$field} = $2;
163							}
164							else {
165	0					0	die "duplicate values for key $field.\n";
166							}
167							}
168	1404					2124	$found = 1;
169							}
170							else {
171							# Unknown field is ignored.
172							}
173
174							# Kanjidic contains hiragana, katakana, ".", "-" and "ー" (Japanese
175							# "chouon") characters.
176							}
177							else {
178	291	100				450	if ($in_nanori) {
179	87					166	push @nanori, $entry;
180	87					111	$found = 1;
181							}
182							else {
183	204	100				1128	if ($entry =~ m/^([あ-んー\.-]+)$/) {
		50
184	129					227	push @kunyomi, $entry;
185	129					180	$found = 1;
186							}
187							elsif ($entry =~ m/^([ア-ンー\.-]+)$/) {
188	75					154	push @onyomi, $entry;
189	75					103	$found = 1;
190							}
191							}
192							}
193	1695	50	33			3290	if ($AUTHOR && ! $found) {
194	0					0	die "kanjidic:$.: Mystery entry \"$entry\"\n";
195							}
196							}
197	57					80	my %morohashi;
198	57	100				116	if ($values{MP}) {
199	54					292	@morohashi{qw/volume page/} = ($values{MP} =~ /(\d+)\.(\d+)/);
200							}
201	57	100				142	if ($values{MN}) {
202	54					125	$morohashi{index} = $values{MN};
203							}
204	57	100	66			156	if ($values{MN} \|\| $values{MP}) {
205	54					113	$values{morohashi} = \%morohashi;
206							}
207	57	50				127	if (@english) {
208	57					126	$values{"english"} = \@english;
209							}
210	57	50				109	if (@onyomi) {
211	57					100	$values{"onyomi"} = \@onyomi;
212							}
213	57	50				108	if (@kunyomi) {
214	57					99	$values{"kunyomi"} = \@kunyomi;
215							}
216	57	100				109	if (@nanori) {
217	27					41	$values{"nanori"} = \@nanori;
218							}
219
220							# Kanjidic uses the bogus radical numbers of Nelson rather than
221							# the correct ones.
222
223	57					134	$values{radical} = $values{B};
224	57	100				115	$values{radical} = $values{C} if $values{C};
225
226							# Just in case there is a problem in kanjidic, this will tell us
227							# the line where the problem was:
228
229	57					132	$values{"line_number"} = $.;
230	57					1752	return %values;
231							}
232
233							# Order of kanji in a kanji dictionary.
234
235							sub kanji_dictionary_order
236							{
237	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
238							# print "$a, $b,\n";
239	0					0	my $valuea = $kanjidic_ref->{$a};
240	0					0	my $valueb = $kanjidic_ref->{$b};
241	0					0	my $radval = $$valuea{radical} - $$valueb{radical};
242	0	0				0	return $radval if $radval;
243	0					0	my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
244	0	0				0	return $strokeval if $strokeval;
245	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
246	0	0				0	return $jisval if $jisval;
247	0					0	return 0;
248							}
249
250							# Order of kanji in a kanji dictionary.
251
252							sub stroke_radical_jis_order
253							{
254	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
255							# print "$a, $b,\n";
256	0					0	my $valuea = $kanjidic_ref->{$a};
257	0					0	my $valueb = $kanjidic_ref->{$b};
258	0					0	my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
259	0	0				0	return $strokeval if $strokeval;
260	0					0	my $radval = $$valuea{radical} - $$valueb{radical};
261	0	0				0	return $radval if $radval;
262	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
263	0	0				0	return $jisval if $jisval;
264							# They must be the same kanji.
265	0					0	return 0;
266							}
267
268							# Comparison function to sort by grade and then stroke order, then JIS
269							# code value if those are both the same.
270
271							sub grade_stroke_order
272							{
273	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
274							# print "$a, $b,\n";
275	0					0	my $valuea = $kanjidic_ref->{$a};
276	0					0	my $valueb = $kanjidic_ref->{$b};
277	0	0				0	if ($valuea->{G}) {
		0
278	0	0				0	if ($valueb->{G}) {
279	0					0	my $gradeval = $$valuea{G} - $$valueb{G};
280	0	0				0	return $gradeval if $gradeval;
281							}
282							else {
283	0					0	return -1;
284							}
285							}
286							elsif ($valueb->{G}) {
287	0					0	return 1;
288							}
289	0					0	my $strokeval = $$valuea{S} - $$valueb{S};
290	0	0				0	return $strokeval if $strokeval;
291	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
292	0	0				0	return $jisval if $jisval;
293	0					0	return 0;
294							}
295
296							sub parse_kanjidic
297							{
298	3			3	1	790	my ($file_name) = @_;
299	3	50				13	if (! $file_name) {
300	0					0	croak "Please supply a file name";
301							}
302	3					8	my $KANJIDIC;
303
304							my %kanjidic;
305
306	3	50				80	if (! -f $file_name) {
307	0					0	croak "No such file '$file_name'";
308							}
309
310	3	50		3		46	open $KANJIDIC, "<:encoding(euc-jp)", $file_name
	3					7
	3					24
	3					108
311							or die "Could not open '$file_name': $!";
312	3					26936	while (<$KANJIDIC>) {
313							# Skip the comment line.
314	60	100				186	next if ( m/^\#/ );
315	57					129	my %values = parse_entry ($_);
316	57					432	my @skip = split ("-", $values{P});
317	57					141	$values{skip} = \@skip;
318	57					593	$kanjidic{$values{kanji}} = \%values;
319							}
320	3					54	close $KANJIDIC;
321	3					32	return \%kanjidic;
322							}
323
324							sub kanjidic_order
325							{
326	1			1	1	7	my ($kanjidic_ref) = @_;
327							my @kanjidic_order =
328							sort {
329	1					12	hex ($kanjidic_ref->{$a}->{jiscode}) <=>
330							hex ($kanjidic_ref->{$b}->{jiscode})
331	60					133	}
332							keys %$kanjidic_ref;
333	1					4	my $count = 0;
334	1					3	for my $kanji (@kanjidic_order) {
335	19					32	$kanjidic_ref->{$kanji}->{kanji_id} = $count;
336	19					24	$count++;
337							}
338	1					7	return @kanjidic_order;
339							}
340
341							sub new
342							{
343	0			0	0	0	my ($package, $file) = @_;
344	0					0	my $kanjidic = {};
345	0					0	$kanjidic->{file} = $file;
346	0					0	undef $file;
347	0					0	$kanjidic->{data} = parse_kanjidic ($kanjidic->{file});
348	0					0	bless $kanjidic;
349	0					0	return $kanjidic;
350							}
351
352							# Make indices going from each type of key back to the data.
353
354							sub make_indices
355							{
356	0			0	0	0	my ($kanjidic) = @_;
357	0					0	my %indices;
358	0					0	my $data = $kanjidic->{data};
359	0					0	for my $kanji (keys %$data) {
360	0					0	my $kdata = $data->{$kanji};
361	0					0	for my $key (keys %$kdata) {
362	0					0	$indices{$key}{$kdata->{$key}} = $kdata;
363							}
364							}
365	0					0	$kanjidic->{indices} = \%indices;
366							}
367
368							sub find_key
369							{
370	0			0	0	0	my ($kanjidic, $key, $value) = @_;
371	0	0				0	if (! $kanjidic->{indices}) {
372	0					0	make_indices ($kanjidic);
373							}
374	0					0	my $index = $kanjidic->{indices}{$key};
375	0					0	return $index->{$value};
376							}
377
378							sub kanji_to_order
379							{
380	0			0	0	0	my ($kanjidic, $kanji) = @_;
381	0	0				0	if (! $kanjidic->{order}) {
382	0					0	my @order = kanjidic_order ($kanjidic->{data});
383	0					0	my %index;
384	0					0	my $count = 0;
385	0					0	for my $k (@order) {
386	0					0	$index{$k} = $count;
387	0					0	$count++;
388							}
389	0					0	$kanjidic->{order} = \@order;
390	0					0	$kanjidic->{index} = \%index;
391							}
392	0					0	return $kanjidic->{index}->{$kanji};
393							}
394
395							sub grade
396							{
397	1			1	1	9	my ($kanjidic, $grade) = @_;
398	1					2	my @grade_kanjis;
399	1					7	for my $k (keys %$kanjidic) {
400	19					29	my $kgrade = $kanjidic->{$k}->{G};
401	19	100				35	next unless $kgrade;
402	15	100				31	push @grade_kanjis, $k if $kgrade == $grade;
403							}
404	1					4	return \@grade_kanjis;
405							}
406
407							1;
408