line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::JA::Sort::ReadableKey; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
1033
|
use 5.006; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
44
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
5
|
1
|
|
|
1
|
|
24
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
114
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
require Exporter; |
8
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
9
|
|
|
|
|
|
|
our @EXPORT = qw( kanji_to_kana japanese_pronunciation japanese_sort_order ); |
10
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
11
|
1
|
|
|
1
|
|
1091
|
use Encode; |
|
1
|
|
|
|
|
13867
|
|
|
1
|
|
|
|
|
162
|
|
12
|
1
|
|
|
1
|
|
1285
|
use Encode::JP; |
|
1
|
|
|
|
|
11685
|
|
|
1
|
|
|
|
|
718
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
my $repeat = qr/[\x{3033}\x{309d}\x{30fd}]/; |
15
|
|
|
|
|
|
|
my $voiced_repeat = qr/[\x{309e}\x{30fe}]/; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my %romanize; |
18
|
|
|
|
|
|
|
my %rank; |
19
|
|
|
|
|
|
|
while () { |
20
|
|
|
|
|
|
|
Encode::_utf8_on($_); |
21
|
|
|
|
|
|
|
/^(\w)\s+(\w+\d)/ or die "Bad line $_"; |
22
|
|
|
|
|
|
|
$romanize{$1} = $2; |
23
|
|
|
|
|
|
|
$rank{$2} = $.; |
24
|
|
|
|
|
|
|
} |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub japanese_sort_order { |
27
|
0
|
|
|
0
|
1
|
|
my $string = _key(shift); |
28
|
0
|
0
|
|
|
|
|
$string =~ s#([A-Za-z]+\d)#exists $rank{$1} ? chr(33+$rank{$1}) : $1#eg; |
|
0
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
|
return $string; |
30
|
|
|
|
|
|
|
} |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
my %mutations = (qw( |
33
|
|
|
|
|
|
|
K G |
34
|
|
|
|
|
|
|
T D |
35
|
|
|
|
|
|
|
S Z |
36
|
|
|
|
|
|
|
H B |
37
|
|
|
|
|
|
|
)); |
38
|
|
|
|
|
|
|
sub japanese_pronunciation { |
39
|
0
|
|
|
0
|
1
|
|
my $string = _key(shift); |
40
|
0
|
|
|
|
|
|
$string =~ s/\/.*?$//; |
41
|
|
|
|
|
|
|
# First deal with tenten |
42
|
0
|
|
|
|
|
|
$string =~ s/0//g; |
43
|
0
|
|
|
|
|
|
while (my ($k,$v) = each %mutations) { |
44
|
0
|
|
|
|
|
|
$string =~ s/$k(.)1/$v$1/g; |
45
|
|
|
|
|
|
|
} |
46
|
0
|
|
|
|
|
|
$string =~ s/H(.)2/p$1/g; |
47
|
0
|
|
|
|
|
|
$string =~ s/ZI/JI/g; |
48
|
|
|
|
|
|
|
# Now geminate consonants |
49
|
0
|
|
|
|
|
|
$string =~ s/tu(.)/$1$1/g; |
50
|
|
|
|
|
|
|
# Now smallchars |
51
|
0
|
|
|
|
|
|
$string =~ s/(.)(y?[aeiou])/$2/g; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Finally, Kunreishiki->Hepburn |
54
|
0
|
|
|
|
|
|
$string = lc $string; |
55
|
0
|
|
|
|
|
|
$string =~ s/si/shi/g; |
56
|
0
|
|
|
|
|
|
$string =~ s/sy/sh/g; |
57
|
0
|
|
|
|
|
|
$string =~ s/jy/j/g; |
58
|
0
|
|
|
|
|
|
$string =~ s/ti/chi/g; |
59
|
0
|
|
|
|
|
|
$string =~ s/tu/tsu/g; |
60
|
0
|
|
|
|
|
|
$string =~ s/n([bp])/m$1/g; |
61
|
0
|
|
|
|
|
|
return $string; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
sub kanji_to_kana { |
65
|
0
|
|
|
0
|
1
|
|
my $string = shift; |
66
|
0
|
0
|
|
|
|
|
return $string unless /[\x{4e00}-\x{9fff}]/; |
67
|
0
|
|
|
|
|
|
require Text::ChaSen; |
68
|
0
|
|
|
|
|
|
my $string_euc = encode("euc-jp", $string); |
69
|
0
|
|
|
|
|
|
Text::ChaSen::getopt_argv('chasen-perl', '-F', '%a0'); |
70
|
0
|
|
0
|
|
|
|
$string = decode("euc-jp", Text::ChaSen::sparse_tostr($string_euc)) |
71
|
|
|
|
|
|
|
|| return $string; |
72
|
0
|
|
|
|
|
|
chomp $string; |
73
|
1
|
|
|
1
|
|
995
|
$string =~ tr/\x{30fc}/\x{30a6}/; # Hack |
|
1
|
|
|
|
|
10
|
|
|
1
|
|
|
|
|
106
|
|
|
0
|
|
|
|
|
|
|
74
|
0
|
|
|
|
|
|
$string =~ tr/\x{30a1}-\x{30ff}/\x{3041}-\x{309f}/; |
75
|
0
|
|
|
|
|
|
$string; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub _key { |
79
|
0
|
|
|
0
|
|
|
my $string = shift; |
80
|
0
|
|
|
|
|
|
my $type = "hiragana"; |
81
|
0
|
|
|
|
|
|
my $last = ""; |
82
|
0
|
|
|
|
|
|
my @code; |
83
|
0
|
|
|
|
|
|
$string = kanji_to_kana($string); |
84
|
0
|
|
|
|
|
|
$string =~ s/(.)$repeat/$1$1/g; |
85
|
0
|
|
|
|
|
|
$string =~ s/(.)$voiced_repeat/$1.chr(1+ord($1))/eg; |
|
0
|
|
|
|
|
|
|
86
|
0
|
0
|
|
|
|
|
$type ="katakana" if $string =~ tr/\x{30a1}-\x{30ff}/\x{3041}-\x{309f}/; |
87
|
0
|
0
|
|
|
|
|
$string =~ s/(.)/exists $romanize{$1} ? $romanize{$1} : $1 /eg; |
|
0
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
|
return $string."/$type"; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
1; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=head1 NAME |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
Lingua::JA::Sort::ReadableKey - Sorting and Romanizing Japanese |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 SYNOPSIS |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
use Lingua::JA::Sort::ReadableKey; |
99
|
|
|
|
|
|
|
for ( map { $_->[0] } |
100
|
|
|
|
|
|
|
sort { $a->[1] cmp $b->[1] } |
101
|
|
|
|
|
|
|
map { [ $_, japanese_sortorder($_) ] } @utf8 ) { |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head1 DESCRIPTION |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
First, does L do what you want? Look at that |
106
|
|
|
|
|
|
|
first. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
It may not do what you want if you want |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=over 3 |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=item * |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Kanji phrases sorted in their reading order, rather than as a separate block. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=item * |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
A machine-readable or storable key so that comparisons and sorting can |
119
|
|
|
|
|
|
|
be done by a non-Japanese-aware system later. |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=back |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
This module uses C to do kanji-kana conversion, and then |
124
|
|
|
|
|
|
|
produces a comparable ASCII key for sorting. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
All text should be in "real" UTF-8 - that is, strings in Perl's internal |
127
|
|
|
|
|
|
|
format with the UTF-8 flag on. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head2 EXPORT |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
The following methods are exported: |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head3 kanji_to_kana |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Use ChaSen to convert a kanji sequence into hiragana. You obviously need |
136
|
|
|
|
|
|
|
to install ChaSen, and its Perl interface C to make this |
137
|
|
|
|
|
|
|
work. You can get ChaSen from http://chasen.aist-nara.ac.jp/ and |
138
|
|
|
|
|
|
|
C is bundled with it. If you have Debian, install the |
139
|
|
|
|
|
|
|
packages "chasen" and "libtext-chasen-perl". This code will work with |
140
|
|
|
|
|
|
|
both ChaSen1 and ChaSen2. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head3 japanese_pronunciation |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
This turns a Japanese string into an ASCII representation of its |
145
|
|
|
|
|
|
|
reading. You can't sort on this, because Japanese don't sort according |
146
|
|
|
|
|
|
|
to the Latin alphabet, but you can use to label Japanese things for |
147
|
|
|
|
|
|
|
people who can't read Japanese. This will automatically call |
148
|
|
|
|
|
|
|
C if necessary to get the reading of kanji strings. |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head3 japanese_sort_order |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
This returns an ASCII string which represents, in some bizarre magic |
153
|
|
|
|
|
|
|
encoding, the sort order of the Japanese input string, such that |
154
|
|
|
|
|
|
|
comparing the C of two UTF-8 strings will tell you |
155
|
|
|
|
|
|
|
how they should be sorted in a Japanese dictionary. |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
By "bizarre" and "magic", I mean that for each character, we find its |
158
|
|
|
|
|
|
|
order in the Japanese alphabet, and then replace that with |
159
|
|
|
|
|
|
|
C so that it can be compared with C. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
This also calls C if there are any kanji strings. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 SEE ALSO |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
L, L. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
http://chasen.aist-nara.ac.jp/ |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head1 AUTHOR |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Simon Cozens, Esimon@cpan.orgE |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Copyright (C) 2004 by Simon Cozens |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
178
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
179
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=cut |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
__DATA__ |