line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
2
|
|
|
|
|
|
|
# Unicode::Japanese |
3
|
|
|
|
|
|
|
# Unicode::Japanese::PurePerl |
4
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
5
|
|
|
|
|
|
|
# $Id: Japanese_stub.pm 5239 2008-01-16 09:49:16Z hio $ |
6
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
7
|
|
|
|
|
|
|
package Unicode::Japanese::PurePerl; |
8
|
|
|
|
|
|
|
package Unicode::Japanese; |
9
|
|
|
|
|
|
|
|
10
|
47
|
|
|
47
|
|
200024
|
use strict; |
|
47
|
|
|
|
|
62
|
|
|
47
|
|
|
|
|
692
|
|
11
|
41
|
|
|
41
|
|
140
|
use vars qw($VERSION $PurePerl $xs_loaderror); |
|
41
|
|
|
|
|
44
|
|
|
41
|
|
|
|
|
3559
|
|
12
|
|
|
|
|
|
|
$VERSION = '0.44_02'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# `use bytes' and `use Encode' if we are on perl-5.8.0 or later. |
15
|
|
|
|
|
|
|
if( $] >= 5.008 ) |
16
|
|
|
|
|
|
|
{ |
17
|
|
|
|
|
|
|
my $evalerr; |
18
|
|
|
|
|
|
|
{ |
19
|
|
|
|
|
|
|
local($SIG{__DIE__}) = 'DEFAULT'; |
20
|
|
|
|
|
|
|
local($@); |
21
|
23
|
|
|
23
|
|
9703
|
eval 'use bytes;use Encode;'; |
|
23
|
|
|
23
|
|
526
|
|
|
23
|
|
|
|
|
83
|
|
|
23
|
|
|
|
|
10433
|
|
|
23
|
|
|
|
|
156532
|
|
|
23
|
|
|
|
|
1226
|
|
22
|
|
|
|
|
|
|
$evalerr = $@; |
23
|
|
|
|
|
|
|
} |
24
|
|
|
|
|
|
|
$evalerr and CORE::die($evalerr); |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
28
|
|
|
|
|
|
|
# import |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
sub import |
31
|
|
|
|
|
|
|
{ |
32
|
23
|
|
|
23
|
|
294
|
my $pkg = shift; |
33
|
23
|
|
|
|
|
46
|
my ($callerpkg) = caller; |
34
|
23
|
|
|
|
|
84
|
my %exp = |
35
|
|
|
|
|
|
|
( |
36
|
|
|
|
|
|
|
'&unijp' => \&unijp, |
37
|
|
|
|
|
|
|
); |
38
|
23
|
|
|
|
|
29
|
my @na; |
39
|
23
|
50
|
|
|
|
68
|
my @add = (grep{$_ eq ':all'} @_) ? keys %exp : (); |
|
4
|
|
|
|
|
14
|
|
40
|
23
|
|
|
|
|
51
|
foreach(@_, @add) |
41
|
|
|
|
|
|
|
{ |
42
|
4
|
50
|
|
|
|
9
|
$_ eq 'PurePerl' and $PurePerl=1, next; |
43
|
4
|
50
|
33
|
|
|
27
|
if( $exp{$_} || $exp{'&'.$_} ) |
|
|
50
|
|
|
|
|
|
44
|
|
|
|
|
|
|
{ |
45
|
35
|
|
|
35
|
|
162
|
no strict 'refs'; |
|
35
|
|
|
|
|
44
|
|
|
35
|
|
|
|
|
7882
|
|
46
|
1
|
|
|
|
|
5
|
(my $name = $_) =~ s/^\W//; |
47
|
1
|
|
0
|
|
|
1
|
my $obj = $exp{$_} || $exp{'&'.$_}; |
48
|
1
|
|
|
|
|
3
|
*{$callerpkg.'::'.$name} = $obj; |
|
1
|
|
|
|
|
6
|
|
49
|
|
|
|
|
|
|
}elsif( $_ eq 'no_I18N_Japanese' ) |
50
|
|
|
|
|
|
|
{ |
51
|
4
|
|
|
|
|
6
|
$^H &= ~0x0f00_0000; |
52
|
|
|
|
|
|
|
package Unicode::Japanese::PurePerl; |
53
|
4
|
|
|
|
|
7
|
$^H &= ~0x0f00_0000; |
54
|
|
|
|
|
|
|
package Unicode::Japanese; |
55
|
4
|
|
|
|
|
10
|
next; |
56
|
|
|
|
|
|
|
} |
57
|
1
|
|
|
|
|
3
|
push(@na,$_); |
58
|
|
|
|
|
|
|
} |
59
|
23
|
50
|
|
|
|
12614
|
if( @na ) |
60
|
|
|
|
|
|
|
{ |
61
|
|
|
|
|
|
|
#use Carp; |
62
|
|
|
|
|
|
|
#croak("invalid parameter (".join(',',@na).")"); |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
67
|
|
|
|
|
|
|
# DESTROY |
68
|
|
|
|
|
|
|
# |
69
|
|
|
|
|
|
|
sub DESTROY |
70
|
|
|
|
1
|
|
|
{ |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
74
|
|
|
|
|
|
|
# load_xs. |
75
|
|
|
|
|
|
|
# loading xs-subs. |
76
|
|
|
|
|
|
|
# this method is called from new (through new=>_init_table=>load_xs) |
77
|
|
|
|
|
|
|
# |
78
|
|
|
|
|
|
|
sub load_xs |
79
|
|
|
|
|
|
|
{ |
80
|
|
|
|
|
|
|
#print STDERR "load_xs\n"; |
81
|
22
|
50
|
|
22
|
0
|
64
|
if( $PurePerl ) |
82
|
|
|
|
|
|
|
{ |
83
|
|
|
|
|
|
|
#print STDERR "PurePerl mode\n"; |
84
|
1
|
|
|
|
|
1
|
$xs_loaderror = 'disabled'; |
85
|
1
|
|
|
|
|
3
|
return; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
#print STDERR "XS mode\n"; |
88
|
|
|
|
|
|
|
|
89
|
22
|
|
|
|
|
28
|
my $use_xs; |
90
|
|
|
|
|
|
|
LoadXS: |
91
|
|
|
|
|
|
|
{ |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
#print STDERR "* * bootstrap...\n"; |
94
|
22
|
|
|
|
|
26
|
eval q |
95
|
22
|
|
|
22
|
|
118
|
{ |
|
22
|
|
|
22
|
|
22
|
|
|
22
|
|
|
|
|
597
|
|
|
22
|
|
|
|
|
113
|
|
|
22
|
|
|
|
|
19
|
|
|
22
|
|
|
|
|
1231
|
|
|
22
|
|
|
|
|
1170
|
|
96
|
|
|
|
|
|
|
use strict; |
97
|
|
|
|
|
|
|
require DynaLoader; |
98
|
|
|
|
|
|
|
use vars qw(@ISA); |
99
|
|
|
|
|
|
|
@ISA = qw(DynaLoader); |
100
|
|
|
|
|
|
|
local($SIG{__DIE__}) = 'DEFAULT'; |
101
|
|
|
|
|
|
|
Unicode::Japanese->bootstrap($VERSION); |
102
|
|
|
|
|
|
|
}; |
103
|
|
|
|
|
|
|
#print STDERR "* * the trial has been done.\n"; |
104
|
|
|
|
|
|
|
#undef @ISA; |
105
|
22
|
50
|
|
|
|
130
|
if( $@ ) |
106
|
|
|
|
|
|
|
{ |
107
|
|
|
|
|
|
|
#print STDERR "failed.\n"; |
108
|
|
|
|
|
|
|
#print STDERR "$@\n"; |
109
|
1
|
|
|
|
|
2
|
$use_xs = 0; |
110
|
1
|
|
|
|
|
2
|
$xs_loaderror = $@; |
111
|
1
|
|
|
|
|
4
|
undef $@; |
112
|
1
|
|
|
|
|
2
|
last LoadXS; |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
#print STDERR "succeeded.\n"; |
115
|
22
|
|
|
|
|
30
|
$use_xs = 1; |
116
|
|
|
|
|
|
|
eval q |
117
|
22
|
|
|
107
|
|
1345
|
{ |
|
359
|
|
|
|
|
4058
|
|
118
|
|
|
|
|
|
|
#print STDERR "overriding _s2u,_u2s\n"; |
119
|
|
|
|
|
|
|
do_memmap(); |
120
|
|
|
|
|
|
|
#print STDERR "memmap done\n"; |
121
|
|
|
|
|
|
|
END{ do_memunmap(); } |
122
|
|
|
|
|
|
|
#print STDERR "binding xsubs has been done.\n"; |
123
|
|
|
|
|
|
|
}; |
124
|
22
|
50
|
|
|
|
106
|
if( $@ ) |
125
|
|
|
|
|
|
|
{ |
126
|
|
|
|
|
|
|
#print STDERR "error in the last part of operation to load XS.\n"; |
127
|
1
|
|
|
|
|
4
|
$xs_loaderror = $@; |
128
|
1
|
|
|
|
|
5
|
CORE::die($@); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
#print STDERR "done.\n"; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
22
|
50
|
|
|
|
58
|
if( $@ ) |
135
|
|
|
|
|
|
|
{ |
136
|
1
|
|
|
|
|
4
|
$xs_loaderror = $@; |
137
|
1
|
|
|
|
|
4
|
CORE::die("Cannot load Unicode::Japanese of neither XS nor PurePerl side\n$@"); |
138
|
|
|
|
|
|
|
} |
139
|
22
|
50
|
|
|
|
87
|
if( !$use_xs ) |
140
|
|
|
|
|
|
|
{ |
141
|
|
|
|
|
|
|
#print STDERR "no xs.\n"; |
142
|
|
|
|
|
|
|
eval q |
143
|
1
|
|
|
|
|
3
|
{ |
144
|
|
|
|
|
|
|
sub do_memmap($){} |
145
|
|
|
|
|
|
|
sub do_memunmap($){} |
146
|
|
|
|
|
|
|
}; |
147
|
|
|
|
|
|
|
} |
148
|
22
|
50
|
|
|
|
133
|
$xs_loaderror = '' if( !defined($xs_loaderror) ); |
149
|
|
|
|
|
|
|
#print STDERR "load_xs done.\n"; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
153
|
|
|
|
|
|
|
# Unicode::Japanese->new(); |
154
|
|
|
|
|
|
|
# cache for char conversion. |
155
|
|
|
|
|
|
|
# 2bytes. |
156
|
|
|
|
|
|
|
# JIS C 6226-1979 \e$@ |
157
|
|
|
|
|
|
|
# JIS X 0208-1983 \e$B |
158
|
|
|
|
|
|
|
# JIS X 0208-1990 \e&@\e$B |
159
|
|
|
|
|
|
|
# JIS X 0212-1990 \e$(D |
160
|
|
|
|
|
|
|
# 1byte. |
161
|
|
|
|
|
|
|
# JIS ROMAN \e(J |
162
|
|
|
|
|
|
|
# JIS ROMAN \e(H |
163
|
|
|
|
|
|
|
# ASCII \e(B |
164
|
|
|
|
|
|
|
# JIS KANA \e(I |
165
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
166
|
|
|
|
|
|
|
# $unijp = Unicode::Japanese->new([$str,[$icode]]); |
167
|
|
|
|
|
|
|
# |
168
|
|
|
|
|
|
|
sub new |
169
|
|
|
|
|
|
|
{ |
170
|
234
|
|
|
234
|
1
|
23812
|
my $pkg = shift; |
171
|
234
|
|
|
|
|
262
|
my $this = {}; |
172
|
|
|
|
|
|
|
|
173
|
234
|
50
|
|
|
|
432
|
if( defined($pkg) ) |
174
|
|
|
|
|
|
|
{ |
175
|
234
|
|
|
|
|
256
|
bless $this, $pkg; |
176
|
234
|
|
|
|
|
794
|
$this->_init_table; |
177
|
|
|
|
|
|
|
}else |
178
|
|
|
|
|
|
|
{ |
179
|
1
|
|
|
|
|
4
|
bless $this; |
180
|
1
|
|
|
|
|
1
|
$this->_init_table; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
234
|
100
|
|
|
|
4168
|
@_ and $this->set(@_); |
184
|
|
|
|
|
|
|
|
185
|
234
|
|
|
|
|
1881
|
$this; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
190
|
|
|
|
|
|
|
# _got_undefined_subroutine |
191
|
|
|
|
|
|
|
# die with message 'undefiend subroutine'. |
192
|
|
|
|
|
|
|
# |
193
|
|
|
|
|
|
|
sub _got_undefined_subroutine |
194
|
|
|
|
|
|
|
{ |
195
|
1
|
|
|
1
|
|
1
|
my $subname = pop; |
196
|
1
|
|
|
|
|
112
|
CORE::die "Undefined subroutine \&$subname got called.\n"; |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
200
|
|
|
|
|
|
|
# AUTOLOAD |
201
|
|
|
|
|
|
|
# AUTOLOAD of Unicode::Japanese. |
202
|
|
|
|
|
|
|
# imports PurePerl methods. |
203
|
|
|
|
|
|
|
# |
204
|
|
|
|
|
|
|
AUTOLOAD |
205
|
|
|
|
|
|
|
{ |
206
|
|
|
|
|
|
|
# load pure perl subs. |
207
|
35
|
|
|
35
|
|
145
|
use vars qw($AUTOLOAD); |
|
35
|
|
|
|
|
39
|
|
|
35
|
|
|
|
|
3435
|
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
#print "AUTOLOAD... $AUTOLOAD\n"; |
210
|
|
|
|
|
|
|
|
211
|
140
|
100
|
|
140
|
|
3760
|
if(!defined($Unicode::Japanese::xs_loaderror) ) |
212
|
|
|
|
|
|
|
{ |
213
|
22
|
|
|
|
|
61
|
Unicode::Japanese::PurePerl::_init_table(); |
214
|
22
|
50
|
|
|
|
74
|
if( defined(&$AUTOLOAD) ) |
215
|
|
|
|
|
|
|
{ |
216
|
1
|
|
|
|
|
5
|
return &$AUTOLOAD; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
140
|
50
|
|
|
|
116
|
my ($pkg, $subname) = do{ |
221
|
140
|
|
|
|
|
273
|
local($1, $2); |
222
|
140
|
|
|
|
|
810
|
$AUTOLOAD =~ /^(.*)::(\w+)$/ |
223
|
|
|
|
|
|
|
} or got_undefined_subroutine($AUTOLOAD); |
224
|
|
|
|
|
|
|
|
225
|
140
|
|
|
|
|
219
|
my $pppkg = $pkg . '::PurePerl'; |
226
|
140
|
|
|
|
|
218
|
my $ppsubname = $pkg . '::PurePerl::' . $subname; |
227
|
140
|
100
|
|
|
|
527
|
if( !defined(&$ppsubname) ) |
228
|
|
|
|
|
|
|
{ |
229
|
116
|
|
|
|
|
123
|
my $save = $@; |
230
|
116
|
|
|
|
|
173
|
my @BAK = @_; |
231
|
116
|
|
|
|
|
250
|
$pppkg->_loadsub($ppsubname); |
232
|
113
|
|
|
|
|
96
|
$@ = $save; |
233
|
113
|
|
|
|
|
250
|
@_ = @BAK; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
137
|
|
|
|
|
300
|
my $sub = \&$ppsubname; |
237
|
|
|
|
|
|
|
{ |
238
|
24
|
|
|
24
|
|
105
|
no strict 'refs'; |
|
24
|
|
|
|
|
29
|
|
|
24
|
|
|
|
|
1267
|
|
|
137
|
|
|
|
|
121
|
|
239
|
137
|
|
|
|
|
266
|
*$AUTOLOAD = $sub; # copy. |
240
|
|
|
|
|
|
|
} |
241
|
137
|
|
|
|
|
2428
|
goto &$sub; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
245
|
|
|
|
|
|
|
# Unicode::Japanese::PurePerl |
246
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
247
|
|
|
|
|
|
|
package Unicode::Japanese::PurePerl; |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
|
250
|
24
|
|
|
24
|
|
93
|
use strict; |
|
24
|
|
|
|
|
28
|
|
|
24
|
|
|
|
|
489
|
|
251
|
24
|
|
|
24
|
|
81
|
use vars qw(%CHARCODE %ESC %RE @CHARSET_LIST); |
|
24
|
|
|
|
|
29
|
|
|
24
|
|
|
|
|
1236
|
|
252
|
|
|
|
|
|
|
|
253
|
24
|
|
|
24
|
|
92
|
use vars qw(@J2S @S2J @S2E @E2S @U2T %T2U %S2U %U2S %SA2U1 %U2SA1 %SA2U2 %U2SA2); |
|
24
|
|
|
|
|
24
|
|
|
24
|
|
|
|
|
7137
|
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
%CHARCODE = ( |
256
|
|
|
|
|
|
|
UNDEF_EUC => "\xa2\xae", |
257
|
|
|
|
|
|
|
UNDEF_SJIS => "\x81\xac", |
258
|
|
|
|
|
|
|
UNDEF_JIS => "\xa2\xf7", |
259
|
|
|
|
|
|
|
UNDEF_UNICODE => "\x20\x20", |
260
|
|
|
|
|
|
|
); |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
%ESC = ( |
263
|
|
|
|
|
|
|
JIS_0208 => "\e\$B", |
264
|
|
|
|
|
|
|
JIS_0212 => "\e\$(D", |
265
|
|
|
|
|
|
|
ASC => "\e\(B", |
266
|
|
|
|
|
|
|
KANA => "\e\(I", |
267
|
|
|
|
|
|
|
E_JSKY_START => "\e\$", |
268
|
|
|
|
|
|
|
E_JSKY_END => "\x0f", |
269
|
|
|
|
|
|
|
); |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
%RE = |
272
|
|
|
|
|
|
|
( |
273
|
|
|
|
|
|
|
ASCII => '[\x00-\x7f]', |
274
|
|
|
|
|
|
|
EUC_0212 => '\x8f[\xa1-\xfe][\xa1-\xfe]', |
275
|
|
|
|
|
|
|
EUC_C => '[\xa1-\xfe][\xa1-\xfe]', |
276
|
|
|
|
|
|
|
EUC_KANA => '\x8e[\xa1-\xdf]', |
277
|
|
|
|
|
|
|
JIS_0208 => '\e\$\@|\e\$B|\e&\@\e\$B', |
278
|
|
|
|
|
|
|
JIS_0212 => "\e" . '\$\(D', |
279
|
|
|
|
|
|
|
JIS_ASC => "\e" . '\([BJ]', |
280
|
|
|
|
|
|
|
JIS_KANA => "\e" . '\(I', |
281
|
|
|
|
|
|
|
SJIS_DBCS => '[\x81-\x9f\xe0-\xef\xfa-\xfc][\x40-\x7e\x80-\xfc]', |
282
|
|
|
|
|
|
|
SJIS_KANA => '[\xa1-\xdf]', |
283
|
|
|
|
|
|
|
UTF8 => '[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\xf8-\xfb][\x80-\xbf]{4}|[\xfc-\xfd][\x80-\xbf]{5}', |
284
|
|
|
|
|
|
|
BOM2_BE => '\xfe\xff', |
285
|
|
|
|
|
|
|
BOM2_LE => '\xff\xfe', |
286
|
|
|
|
|
|
|
BOM4_BE => '\x00\x00\xfe\xff', |
287
|
|
|
|
|
|
|
BOM4_LE => '\xff\xfe\x00\x00', |
288
|
|
|
|
|
|
|
UTF32_BE => '\x00[\x00-\x10][\x00-\xff]{2}', |
289
|
|
|
|
|
|
|
UTF32_LE => '[\x00-\xff]{2}[\x00-\x10]\x00', |
290
|
|
|
|
|
|
|
E_IMODEv1 => '\xf8[\x9f-\xfc]|\xf9[\x40-\x49\x50-\x52\x55-\x57\x5b-\x5e\x72-\x7e\x80-\xb0]', |
291
|
|
|
|
|
|
|
E_IMODEv2 => '\xf9[\xb1-\xfc]', |
292
|
|
|
|
|
|
|
E_IMODE => '\xf8[\x9f-\xfc]|\xf9[\x40-\x49\x50-\x52\x55-\x57\x5b-\x5e\x72-\x7e\x80-\xfc]', |
293
|
|
|
|
|
|
|
E_JSKY1 => '[EFGOPQ]', |
294
|
|
|
|
|
|
|
E_JSKY1v1 => '[EFG]', |
295
|
|
|
|
|
|
|
E_JSKY1v2 => '[OPQ]', |
296
|
|
|
|
|
|
|
E_JSKY2 => '[\!-z]', |
297
|
|
|
|
|
|
|
E_DOTI => '\xf0[\x40-\x7e\x80-\xfc]|\xf1[\x40-\x7e\x80-\xd6]|\xf2[\x40-\x7e\x80-\xab\xb0-\xd5\xdf-\xfc]|\xf3[\x40-\x7e\x80-\xfa]|\xf4[\x40-\x4f\x80\x84-\x8a\x8c-\x8e\x90\x94-\x96\x98-\x9c\xa0-\xa4\xa8-\xaf\xb4\xb5\xbc-\xbe\xc4\xc5\xc8\xcc]', |
298
|
|
|
|
|
|
|
E_JIS_AU => '[\x75-\x7b][\x21-\x7e]', |
299
|
|
|
|
|
|
|
E_SJIS_AU => '[\xf3\xf4\xf6\xf7][\x40-\xfc]', |
300
|
|
|
|
|
|
|
E_ICON_AU_START => '
301
|
|
|
|
|
|
|
E_ICON_AU_END => '">', |
302
|
|
|
|
|
|
|
E_JSKY_START => quotemeta($ESC{E_JSKY_START}), |
303
|
|
|
|
|
|
|
E_JSKY_END => '(?:'.quotemeta($ESC{E_JSKY_END}).'|\z)', |
304
|
|
|
|
|
|
|
E_JSKYv1_UTF8 => '\xee(?:\x80[\x81-\xbf]|\x81[\x80-\x9a]|\x84[\x81-\xbf]|\x85[\x80-\x9a]|\x88[\x81-\xbf]|\x89[\x80-\x9a])', |
305
|
|
|
|
|
|
|
E_JSKYv2_UTF8 => '\xee(?:\x8c[\x81-\xbf]|\x8d[\x80-\x8d]|\x90[\x81-\xbf]|\x91[\x80-\x8c]|\x94[\x81-\xb7])', |
306
|
|
|
|
|
|
|
); |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
$]<5.005 and $RE{E_JSKY_END} =~ s/\\z/\$/; |
309
|
|
|
|
|
|
|
$RE{E_JSKY} = $RE{E_JSKY_START} |
310
|
|
|
|
|
|
|
. $RE{E_JSKY1} . $RE{E_JSKY2} . '+' |
311
|
|
|
|
|
|
|
. $RE{E_JSKY_END}; |
312
|
|
|
|
|
|
|
$RE{E_JSKYv1} = $RE{E_JSKY_START} |
313
|
|
|
|
|
|
|
. $RE{E_JSKY1v1} . $RE{E_JSKY2} . '+' |
314
|
|
|
|
|
|
|
. $RE{E_JSKY_END}; |
315
|
|
|
|
|
|
|
$RE{E_JSKYv2} = $RE{E_JSKY_START} |
316
|
|
|
|
|
|
|
. $RE{E_JSKY1v2} . $RE{E_JSKY2} . '+' |
317
|
|
|
|
|
|
|
. $RE{E_JSKY_END}; |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
@CHARSET_LIST = qw( |
320
|
|
|
|
|
|
|
utf8 |
321
|
|
|
|
|
|
|
ucs2 |
322
|
|
|
|
|
|
|
ucs4 |
323
|
|
|
|
|
|
|
utf16 |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
sjis |
326
|
|
|
|
|
|
|
sjis-imode |
327
|
|
|
|
|
|
|
sjis-doti |
328
|
|
|
|
|
|
|
sjis-jsky |
329
|
|
|
|
|
|
|
sjis-icon-au |
330
|
|
|
|
|
|
|
cp932 |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
jis |
333
|
|
|
|
|
|
|
jis-jsky |
334
|
|
|
|
|
|
|
jis-au |
335
|
|
|
|
|
|
|
jis-icon-au |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
euc |
338
|
|
|
|
|
|
|
euc-jp |
339
|
|
|
|
|
|
|
euc-icon-au |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
utf8-jsky |
342
|
|
|
|
|
|
|
utf8-icon-au |
343
|
|
|
|
|
|
|
); |
344
|
|
|
|
|
|
|
|
345
|
24
|
|
|
24
|
|
97
|
use vars qw($s2u_table $u2s_table); |
|
24
|
|
|
|
|
24
|
|
|
24
|
|
|
|
|
837
|
|
346
|
23
|
|
|
23
|
|
175
|
use vars qw($ei2u1 $ei2u2 $ed2u $ej2u1 $ej2u2 $ea2u1 $ea2u2 $ea2u1s $ea2u2s); |
|
23
|
|
|
|
|
25
|
|
|
23
|
|
|
|
|
1405
|
|
347
|
23
|
|
|
23
|
|
116
|
use vars qw($eu2i1 $eu2i2 $eu2d $eu2j1 $eu2j2 $eu2a1 $eu2a2 $eu2a1s $eu2a2s); |
|
23
|
|
|
|
|
28
|
|
|
23
|
|
|
|
|
1481
|
|
348
|
|
|
|
|
|
|
|
349
|
23
|
|
|
23
|
|
67
|
use vars qw(%_h2zNum %_z2hNum %_h2zAlpha %_z2hAlpha %_h2zSym %_z2hSym %_h2zKanaK %_z2hKanaK %_h2zKanaD %_z2hKanaD %_hira2kata %_kata2hira); |
|
23
|
|
|
|
|
32
|
|
|
23
|
|
|
|
|
1646
|
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
|
353
|
23
|
|
|
23
|
|
73
|
use vars qw($FH $TABLE $HEADLEN $PROGLEN); |
|
23
|
|
|
|
|
30
|
|
|
23
|
|
|
|
|
981
|
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
356
|
|
|
|
|
|
|
# AUTOLOAD |
357
|
|
|
|
|
|
|
# AUTOLOAD of Unicode::Japanese::PurePerl. |
358
|
|
|
|
|
|
|
# load PurePerl methods from embedded data. |
359
|
|
|
|
|
|
|
# |
360
|
|
|
|
|
|
|
AUTOLOAD |
361
|
|
|
|
|
|
|
{ |
362
|
23
|
|
|
23
|
|
86
|
use strict; |
|
23
|
|
|
|
|
19
|
|
|
23
|
|
|
|
|
435
|
|
363
|
23
|
|
|
23
|
|
60
|
use vars qw($AUTOLOAD); |
|
23
|
|
|
|
|
24
|
|
|
23
|
|
|
|
|
2882
|
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
#print "AUTOLOAD... $AUTOLOAD\n"; |
366
|
|
|
|
|
|
|
|
367
|
58
|
|
|
58
|
|
1433
|
my $save = $@; |
368
|
58
|
|
|
|
|
103
|
my @BAK = @_; |
369
|
|
|
|
|
|
|
|
370
|
58
|
50
|
|
|
|
55
|
my ($pkg, $subname) = do{ |
371
|
58
|
|
|
|
|
112
|
local($1, $2); |
372
|
58
|
|
|
|
|
333
|
$AUTOLOAD =~ /^(.*)::(\w+)$/ |
373
|
|
|
|
|
|
|
} or got_undefined_subroutine($AUTOLOAD); |
374
|
|
|
|
|
|
|
|
375
|
58
|
|
|
|
|
116
|
$pkg->_loadsub($AUTOLOAD); |
376
|
|
|
|
|
|
|
|
377
|
58
|
|
|
|
|
63
|
$@ = $save; |
378
|
58
|
|
|
|
|
124
|
@_ = @BAK; |
379
|
58
|
|
|
|
|
1349
|
goto &$AUTOLOAD; |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
sub _loadsub |
383
|
|
|
|
|
|
|
{ |
384
|
173
|
|
|
173
|
|
168
|
my $pkg = shift; |
385
|
173
|
|
|
|
|
127
|
my $fullsubname = shift; |
386
|
|
|
|
|
|
|
#print "subs..\n",join("\n",keys %$TABLE,''); |
387
|
23
|
|
|
23
|
|
85
|
use vars qw($AUTOLOAD); |
|
23
|
|
|
|
|
26
|
|
|
23
|
|
|
|
|
2304
|
|
388
|
|
|
|
|
|
|
|
389
|
173
|
|
|
|
|
262
|
local($1, $2); |
390
|
173
|
50
|
|
|
|
618
|
my ($subpkg,$subname) = $fullsubname =~ /^(.*)::(\w+)$/ |
391
|
|
|
|
|
|
|
or got_undefined_subroutine($fullsubname); |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# check |
394
|
173
|
50
|
|
|
|
381
|
if(!defined($TABLE->{$subname}{offset})) |
395
|
|
|
|
|
|
|
{ |
396
|
1
|
|
|
|
|
5
|
_init_table(); |
397
|
1
|
0
|
|
|
|
1
|
if( !defined($TABLE->{$subname}{offset}) ) |
398
|
|
|
|
|
|
|
{ |
399
|
1
|
0
|
|
|
|
9
|
if( $subname eq 'DESTROY' ) |
400
|
|
|
|
|
|
|
{ |
401
|
1
|
|
|
1
|
|
5
|
my $sub = sub{}; |
402
|
|
|
|
|
|
|
{ |
403
|
23
|
|
|
23
|
|
76
|
no strict 'refs'; |
|
23
|
|
|
|
|
26
|
|
|
23
|
|
|
|
|
5435
|
|
|
1
|
|
|
|
|
1
|
|
404
|
1
|
|
|
|
|
2
|
*$fullsubname = $sub; |
405
|
|
|
|
|
|
|
} |
406
|
0
|
|
|
|
|
0
|
return $sub; |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
|
409
|
0
|
|
|
|
|
0
|
CORE::die "Undefined subroutine \&$fullsubname got called.\n"; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
} |
412
|
172
|
50
|
|
|
|
294
|
if($TABLE->{$subname}{offset} == -1) |
413
|
|
|
|
|
|
|
{ |
414
|
0
|
|
|
|
|
0
|
CORE::die "\&$fullsubname is getting loaded twice. There must be a problem in AUTOLOAD.\n"; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
|
417
|
172
|
50
|
|
|
|
694
|
seek($FH, $PROGLEN + $HEADLEN + $TABLE->{$subname}{offset}, 0) |
418
|
|
|
|
|
|
|
or die "Can't seek $subname. [$!]\n"; |
419
|
|
|
|
|
|
|
|
420
|
172
|
|
|
|
|
130
|
my $sub; |
421
|
172
|
50
|
|
|
|
1461
|
read($FH, $sub, $TABLE->{$subname}{'length'}) |
422
|
|
|
|
|
|
|
or die "Can't read $subname. [$!]\n"; |
423
|
|
|
|
|
|
|
|
424
|
172
|
50
|
|
|
|
315
|
if( $]>=5.008 ) |
425
|
|
|
|
|
|
|
{ |
426
|
172
|
|
|
|
|
387
|
$sub = 'use bytes;'.$sub; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
172
|
100
|
100
|
193
|
|
8961
|
CORE::eval(($sub=~/(.*)/s)[0]); |
|
95
|
100
|
100
|
110
|
|
162
|
|
|
205
|
50
|
66
|
120
|
|
480
|
|
|
96
|
100
|
0
|
60
|
|
974
|
|
|
128
|
100
|
33
|
92
|
|
10497
|
|
|
130
|
100
|
33
|
30
|
|
248
|
|
|
186
|
100
|
66
|
144
|
|
3851
|
|
|
135
|
100
|
33
|
79
|
|
457
|
|
|
132
|
100
|
33
|
39
|
|
1384
|
|
|
140
|
100
|
66
|
142
|
|
227
|
|
|
86
|
100
|
33
|
64
|
|
269
|
|
|
93
|
100
|
66
|
2
|
|
241
|
|
|
41
|
100
|
33
|
2
|
|
72
|
|
|
370
|
100
|
0
|
1
|
|
1869
|
|
|
228
|
100
|
0
|
1
|
|
25246
|
|
|
223
|
100
|
66
|
|
|
2076
|
|
|
51
|
100
|
66
|
|
|
4545
|
|
|
97
|
100
|
33
|
|
|
1390
|
|
|
44
|
100
|
33
|
|
|
42
|
|
|
99
|
100
|
33
|
|
|
233
|
|
|
139
|
100
|
0
|
|
|
19739
|
|
|
125
|
50
|
33
|
|
|
1373
|
|
|
81
|
100
|
66
|
|
|
75
|
|
|
79
|
100
|
33
|
|
|
173
|
|
|
121
|
100
|
33
|
|
|
18609
|
|
|
89
|
100
|
33
|
|
|
972
|
|
|
139
|
100
|
0
|
|
|
19431
|
|
|
76
|
50
|
33
|
|
|
1147
|
|
|
68
|
100
|
66
|
|
|
104
|
|
|
17
|
100
|
33
|
|
|
531
|
|
|
113
|
100
|
33
|
|
|
20182
|
|
|
60
|
100
|
33
|
|
|
689
|
|
|
60
|
100
|
66
|
|
|
20347
|
|
|
60
|
100
|
33
|
|
|
636
|
|
|
16
|
100
|
66
|
|
|
115
|
|
|
16
|
50
|
|
|
|
201
|
|
|
65
|
50
|
|
|
|
19728
|
|
|
112
|
100
|
|
|
|
848
|
|
|
112
|
100
|
|
|
|
20935
|
|
|
114
|
100
|
|
|
|
1373
|
|
|
127
|
100
|
|
|
|
3101
|
|
|
127
|
100
|
|
|
|
327
|
|
|
180
|
100
|
|
|
|
15802
|
|
|
89
|
100
|
|
|
|
1318
|
|
|
110
|
100
|
|
|
|
3056
|
|
|
21
|
50
|
|
|
|
161
|
|
|
98
|
50
|
|
|
|
82
|
|
|
7
|
50
|
|
|
|
12
|
|
|
7
|
100
|
|
|
|
50
|
|
|
12
|
50
|
|
|
|
18
|
|
|
9
|
50
|
|
|
|
14
|
|
|
3
|
100
|
|
|
|
9
|
|
|
2
|
50
|
|
|
|
2
|
|
|
95
|
100
|
|
|
|
375
|
|
|
46
|
100
|
|
|
|
550
|
|
|
97
|
50
|
|
|
|
376
|
|
|
35
|
50
|
|
|
|
36
|
|
|
35
|
50
|
|
|
|
35
|
|
|
35
|
100
|
|
|
|
64
|
|
|
0
|
50
|
|
|
|
0
|
|
|
35
|
50
|
|
|
|
65
|
|
|
0
|
100
|
|
|
|
0
|
|
|
35
|
100
|
|
|
|
50
|
|
|
2
|
0
|
|
|
|
7
|
|
|
35
|
0
|
|
|
|
32
|
|
|
10
|
0
|
|
|
|
80
|
|
|
10
|
0
|
|
|
|
125
|
|
|
10
|
50
|
|
|
|
210
|
|
|
6
|
50
|
|
|
|
8
|
|
|
6
|
50
|
|
|
|
6
|
|
|
6
|
50
|
|
|
|
11
|
|
|
25
|
50
|
|
|
|
72
|
|
|
31
|
50
|
|
|
|
346
|
|
|
26
|
50
|
|
|
|
78
|
|
|
33
|
50
|
|
|
|
37
|
|
|
29
|
100
|
|
|
|
35
|
|
|
33
|
100
|
|
|
|
58
|
|
|
6
|
0
|
|
|
|
77
|
|
|
35
|
0
|
|
|
|
121
|
|
|
6
|
0
|
|
|
|
123
|
|
|
33
|
0
|
|
|
|
43
|
|
|
7
|
50
|
|
|
|
11
|
|
|
33
|
50
|
|
|
|
35
|
|
|
0
|
50
|
|
|
|
0
|
|
|
6
|
50
|
|
|
|
12
|
|
|
0
|
50
|
|
|
|
0
|
|
|
6
|
50
|
|
|
|
11
|
|
|
1
|
50
|
|
|
|
4
|
|
|
6
|
50
|
|
|
|
5
|
|
|
27
|
50
|
|
|
|
79
|
|
|
33
|
100
|
|
|
|
378
|
|
|
33
|
0
|
|
|
|
252
|
|
|
36
|
0
|
|
|
|
157
|
|
|
36
|
0
|
|
|
|
163
|
|
|
33
|
0
|
|
|
|
63
|
|
|
3
|
50
|
|
|
|
5
|
|
|
33
|
50
|
|
|
|
52
|
|
|
1
|
50
|
|
|
|
4
|
|
|
33
|
50
|
|
|
|
58
|
|
|
1
|
50
|
|
|
|
2
|
|
|
33
|
50
|
|
|
|
29
|
|
|
1
|
50
|
|
|
|
3
|
|
|
3
|
50
|
|
|
|
4
|
|
|
0
|
50
|
|
|
|
0
|
|
|
3
|
100
|
|
|
|
4
|
|
|
3
|
0
|
|
|
|
63
|
|
|
3
|
0
|
|
|
|
54
|
|
|
33
|
0
|
|
|
|
148
|
|
|
38
|
0
|
|
|
|
422
|
|
|
38
|
50
|
|
|
|
102
|
|
|
35
|
50
|
|
|
|
42
|
|
|
27
|
50
|
|
|
|
29
|
|
|
35
|
50
|
|
|
|
62
|
|
|
1
|
50
|
|
|
|
3
|
|
|
35
|
50
|
|
|
|
57
|
|
|
1
|
50
|
|
|
|
2
|
|
|
35
|
50
|
|
|
|
45
|
|
|
1
|
0
|
|
|
|
3
|
|
|
35
|
50
|
|
|
|
31
|
|
|
8
|
50
|
|
|
|
90
|
|
|
8
|
50
|
|
|
|
74
|
|
|
8
|
50
|
|
|
|
144
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
27
|
100
|
|
|
|
77
|
|
|
27
|
100
|
|
|
|
372
|
|
|
27
|
100
|
|
|
|
79
|
|
|
27
|
100
|
|
|
|
84
|
|
|
30
|
100
|
|
|
|
31
|
|
|
30
|
50
|
|
|
|
30
|
|
|
30
|
50
|
|
|
|
56
|
|
|
0
|
50
|
|
|
|
0
|
|
|
30
|
50
|
|
|
|
42
|
|
|
0
|
100
|
|
|
|
0
|
|
|
30
|
100
|
|
|
|
37
|
|
|
1
|
50
|
|
|
|
3
|
|
|
30
|
100
|
|
|
|
24
|
|
|
0
|
100
|
|
|
|
0
|
|
|
0
|
100
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
100
|
|
|
|
0
|
|
|
0
|
100
|
|
|
|
0
|
|
|
30
|
50
|
|
|
|
85
|
|
|
30
|
100
|
|
|
|
405
|
|
|
30
|
100
|
|
|
|
85
|
|
|
30
|
50
|
|
|
|
84
|
|
|
8
|
50
|
|
|
|
7
|
|
|
8
|
50
|
|
|
|
7
|
|
|
8
|
50
|
|
|
|
13
|
|
|
0
|
100
|
|
|
|
0
|
|
|
8
|
100
|
|
|
|
8
|
|
|
8
|
50
|
|
|
|
18
|
|
|
8
|
100
|
|
|
|
42
|
|
|
8
|
100
|
|
|
|
138
|
|
|
82
|
50
|
|
|
|
89
|
|
|
82
|
100
|
|
|
|
73
|
|
|
82
|
100
|
|
|
|
134
|
|
|
0
|
50
|
|
|
|
0
|
|
|
82
|
100
|
|
|
|
69
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
82
|
0
|
|
|
|
228
|
|
|
82
|
0
|
|
|
|
357
|
|
|
2
|
0
|
|
|
|
4
|
|
|
2
|
0
|
|
|
|
4
|
|
|
2
|
0
|
|
|
|
8
|
|
|
2
|
0
|
|
|
|
6
|
|
|
2
|
0
|
|
|
|
3
|
|
|
2
|
0
|
|
|
|
7
|
|
|
72
|
50
|
|
|
|
173
|
|
|
72
|
|
|
|
|
127
|
|
|
72
|
|
|
|
|
350
|
|
|
82
|
|
|
|
|
535
|
|
|
17
|
|
|
|
|
17
|
|
|
17
|
|
|
|
|
16
|
|
|
17
|
|
|
|
|
13
|
|
|
17
|
|
|
|
|
19
|
|
|
17
|
|
|
|
|
12
|
|
|
17
|
|
|
|
|
225
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
18
|
|
|
1
|
|
|
|
|
18
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
16
|
|
|
1
|
|
|
|
|
18
|
|
|
1
|
|
|
|
|
17
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
18
|
|
|
1
|
|
|
|
|
17
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
9
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
18
|
|
|
1
|
|
|
|
|
18
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
8
|
|
|
1
|
|
|
|
|
8
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
17
|
|
|
1
|
|
|
|
|
17
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
8
|
|
|
1
|
|
|
|
|
12
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
17
|
|
|
|
|
31
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
17
|
|
|
|
|
42
|
|
|
6
|
|
|
|
|
29
|
|
|
6
|
|
|
|
|
98
|
|
|
51
|
|
|
|
|
19052
|
|
|
51
|
|
|
|
|
844
|
|
|
51
|
|
|
|
|
18681
|
|
|
51
|
|
|
|
|
824
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
23
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
21
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
24
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
430
|
172
|
100
|
|
|
|
19414
|
if ($@) |
431
|
|
|
|
|
|
|
{ |
432
|
3
|
|
|
|
|
284
|
CORE::die $@; |
433
|
|
|
|
|
|
|
} |
434
|
169
|
|
|
|
|
276
|
$DB::sub = $fullsubname; # Now debugger knows where we are. |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# evaled |
437
|
169
|
|
|
|
|
506
|
$TABLE->{$subname}{offset} = -1; |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
442
|
|
|
|
|
|
|
# Unicode::Japanese::PurePerl->new() |
443
|
|
|
|
|
|
|
# |
444
|
|
|
|
|
|
|
sub new |
445
|
|
|
|
|
|
|
{ |
446
|
201
|
|
|
202
|
|
2326
|
goto &Unicode::Japanese::new; |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
450
|
|
|
|
|
|
|
# DESTROY |
451
|
|
|
|
|
|
|
# |
452
|
|
|
|
|
|
|
sub DESTROY |
453
|
|
|
|
151
|
|
|
{ |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
457
|
|
|
|
|
|
|
# gensym |
458
|
|
|
|
|
|
|
# |
459
|
|
|
|
|
|
|
sub gensym { |
460
|
|
|
|
|
|
|
package Unicode::Japanese::Symbol; |
461
|
23
|
|
|
23
|
|
84
|
no strict; |
|
23
|
|
|
|
|
20
|
|
|
23
|
|
|
|
|
15869
|
|
462
|
134
|
|
|
365
|
|
151
|
$genpkg = "Unicode::Japanese::Symbol::"; |
463
|
134
|
|
|
|
|
507
|
$genseq = 0; |
464
|
143
|
|
|
|
|
1382
|
my $name = "GEN" . $genseq++; |
465
|
147
|
|
|
|
|
44300
|
my $ref = \*{$genpkg . $name}; |
|
162
|
|
|
|
|
286
|
|
466
|
465
|
|
|
|
|
20734
|
delete $$genpkg{$name}; |
467
|
465
|
|
|
|
|
618
|
$ref; |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
471
|
|
|
|
|
|
|
# _init_table |
472
|
|
|
|
|
|
|
# |
473
|
|
|
|
|
|
|
sub _init_table { |
474
|
|
|
|
|
|
|
|
475
|
691
|
100
|
|
536
|
|
977
|
if(!defined($HEADLEN)) |
476
|
|
|
|
|
|
|
{ |
477
|
436
|
|
|
|
|
776
|
$FH = gensym; |
478
|
|
|
|
|
|
|
|
479
|
355
|
|
|
|
|
542
|
my $file = "Unicode/Japanese.pm"; |
480
|
|
|
|
|
|
|
OPEN: |
481
|
|
|
|
|
|
|
{ |
482
|
145
|
100
|
|
|
|
278
|
if( $INC{$file} ) |
|
355
|
|
|
|
|
492
|
|
483
|
|
|
|
|
|
|
{ |
484
|
133
|
100
|
|
|
|
887
|
open($FH,$INC{$file}) || CORE::die("could not open file [$INC{$file}] for input : $!"); |
485
|
445
|
|
|
|
|
636
|
last OPEN; |
486
|
|
|
|
|
|
|
} |
487
|
28
|
|
|
|
|
286
|
foreach my $path (@INC) |
488
|
|
|
|
|
|
|
{ |
489
|
347
|
|
|
|
|
620
|
my $mypath = $path; |
490
|
364
|
|
|
|
|
1218
|
$mypath =~ s#/$##; |
491
|
331
|
100
|
|
|
|
635
|
if (-f "$mypath/$file") |
492
|
|
|
|
|
|
|
{ |
493
|
53
|
100
|
|
|
|
218
|
open($FH,"$mypath/$file") || CORE::die("could not open file [$INC{$file}] for input : $!"); |
494
|
43
|
|
|
|
|
112
|
last OPEN; |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
} |
497
|
43
|
|
|
|
|
83
|
CORE::die "Can't find Japanese.pm in \@INC\n"; |
498
|
|
|
|
|
|
|
} |
499
|
335
|
|
|
|
|
446
|
binmode($FH); |
500
|
|
|
|
|
|
|
|
501
|
186
|
|
|
|
|
657
|
local($/) = "\n"; |
502
|
71
|
|
|
|
|
166
|
my $line; |
503
|
408
|
|
|
|
|
1168
|
while(defined($line = <$FH>)) |
504
|
|
|
|
|
|
|
{ |
505
|
32432
|
100
|
|
|
|
54111
|
last if($line =~ m/^__DATA__/); |
506
|
|
|
|
|
|
|
} |
507
|
52
|
|
|
|
|
212
|
$PROGLEN = tell($FH); |
508
|
|
|
|
|
|
|
|
509
|
29
|
100
|
|
|
|
131
|
read($FH, $HEADLEN, 4) |
510
|
|
|
|
|
|
|
or die "Can't read the table. [$!]\n"; |
511
|
53
|
|
|
|
|
251
|
$HEADLEN = unpack('N', $HEADLEN); |
512
|
382
|
100
|
|
|
|
3251
|
read($FH, $TABLE, $HEADLEN) |
513
|
|
|
|
|
|
|
or die "Can't seek the table. [$!]\n"; |
514
|
53
|
|
|
|
|
444
|
$TABLE =~ /(.*)/s; |
515
|
165
|
|
|
|
|
12659
|
$TABLE = eval(($TABLE=~/(.*)/s)[0]); |
516
|
142
|
100
|
|
|
|
620
|
if($@) |
517
|
|
|
|
|
|
|
{ |
518
|
100
|
|
|
|
|
192
|
die "Internal Error. [$@]\n"; |
519
|
|
|
|
|
|
|
} |
520
|
56
|
100
|
|
|
|
619
|
if(!defined($TABLE)) |
521
|
|
|
|
|
|
|
{ |
522
|
170
|
|
|
|
|
294
|
die "Internal Error.\n"; |
523
|
|
|
|
|
|
|
} |
524
|
185
|
|
|
|
|
606
|
$HEADLEN += 4; |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# load xs. |
527
|
162
|
|
|
|
|
275
|
Unicode::Japanese::load_xs(); |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
532
|
|
|
|
|
|
|
# _getFile |
533
|
|
|
|
|
|
|
# load embedded file data. |
534
|
|
|
|
|
|
|
# |
535
|
|
|
|
|
|
|
sub _getFile { |
536
|
67
|
|
|
142
|
|
273
|
my $this = shift; |
537
|
|
|
|
|
|
|
|
538
|
221
|
|
|
|
|
584
|
my $file = shift; |
539
|
|
|
|
|
|
|
|
540
|
69
|
100
|
|
|
|
181
|
exists($TABLE->{$file}) |
541
|
|
|
|
|
|
|
or die "no such file [$file]\n"; |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
#my $offset16 = $TABLE->{$file}{offset} % 16; |
544
|
|
|
|
|
|
|
#print STDERR "_getFile($file, $TABLE->{$file}{offset}, $TABLE->{$file}{'length'}, $offset16)\n"; |
545
|
202
|
100
|
|
|
|
380
|
seek($FH, $PROGLEN + $HEADLEN + $TABLE->{$file}{offset}, 0) |
546
|
|
|
|
|
|
|
or die "Can't seek $file. [$!]\n"; |
547
|
|
|
|
|
|
|
|
548
|
154
|
|
|
|
|
275
|
my $data; |
549
|
156
|
100
|
|
|
|
1665
|
read($FH, $data, $TABLE->{$file}{'length'}) |
550
|
|
|
|
|
|
|
or die "Can't read $file. [$!]\n"; |
551
|
|
|
|
|
|
|
|
552
|
156
|
|
|
|
|
959
|
$data; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
556
|
|
|
|
|
|
|
# use_I18N_Japanese |
557
|
|
|
|
|
|
|
# copied from I18N::Japanese in jperl-5.5.3 |
558
|
|
|
|
|
|
|
# |
559
|
|
|
|
|
|
|
sub use_I18N_Japanese |
560
|
|
|
|
|
|
|
{ |
561
|
121
|
|
|
113
|
|
431
|
shift; |
562
|
33
|
100
|
|
|
|
69
|
if( @_ ) |
563
|
|
|
|
|
|
|
{ |
564
|
83
|
|
|
|
|
589
|
my $bits = 0; |
565
|
40
|
|
|
|
|
131
|
foreach( @_ ) |
566
|
|
|
|
|
|
|
{ |
567
|
92
|
100
|
|
|
|
298
|
$bits |= 0x1000000 if $_ eq 're'; |
568
|
82
|
100
|
|
|
|
1017
|
$bits |= 0x2000000 if $_ eq 'tr'; |
569
|
83
|
100
|
|
|
|
722
|
$bits |= 0x4000000 if $_ eq 'format'; |
570
|
69
|
100
|
|
|
|
106
|
$bits |= 0x8000000 if $_ eq 'string'; |
571
|
|
|
|
|
|
|
} |
572
|
141
|
|
|
|
|
996
|
$^H |= $bits; |
573
|
|
|
|
|
|
|
}else |
574
|
|
|
|
|
|
|
{ |
575
|
77
|
|
|
|
|
137
|
$^H |= 0x0f00_0000; |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
# ----------------------------------------------------------------------------- |
580
|
|
|
|
|
|
|
# no_I18N_Japanese |
581
|
|
|
|
|
|
|
# copied from I18N::Japanese in jperl-5.5.3 |
582
|
|
|
|
|
|
|
# |
583
|
|
|
|
|
|
|
sub no_I18N_Japanese |
584
|
|
|
|
|
|
|
{ |
585
|
18
|
|
|
181
|
|
18
|
shift; |
586
|
76
|
100
|
|
|
|
218
|
if( @_ ) |
587
|
|
|
|
|
|
|
{ |
588
|
62
|
|
|
|
|
227
|
my $bits = 0; |
589
|
125
|
|
|
|
|
169
|
foreach( @_ ) |
590
|
|
|
|
|
|
|
{ |
591
|
66
|
100
|
|
|
|
276
|
$bits |= 0x1000000 if $_ eq 're'; |
592
|
65
|
100
|
|
|
|
1251
|
$bits |= 0x2000000 if $_ eq 'tr'; |
593
|
65
|
100
|
|
|
|
384
|
$bits |= 0x4000000 if $_ eq 'format'; |
594
|
112
|
100
|
|
|
|
226
|
$bits |= 0x8000000 if $_ eq 'string'; |
595
|
|
|
|
|
|
|
} |
596
|
105
|
|
|
|
|
108
|
$^H &= ~$bits; |
597
|
|
|
|
|
|
|
}else |
598
|
|
|
|
|
|
|
{ |
599
|
105
|
|
|
|
|
305
|
$^H &= ~0x0f00_0000; |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
1; |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=encoding utf-8 |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=head1 NAME |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
Unicode::Japanese - Convert encoding of japanese text |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=head1 SYNOPSIS |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
use Unicode::Japanese; |
615
|
|
|
|
|
|
|
use Unicode::Japanese qw(unijp); |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# convert utf8 -> sjis |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
print Unicode::Japanese->new($str)->sjis; |
620
|
|
|
|
|
|
|
print unijp($str)->sjis; # same as above. |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# convert sjis -> utf8 |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
print Unicode::Japanese->new($str,'sjis')->get; |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
# convert sjis (imode_EMOJI) -> utf8 |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
print Unicode::Japanese->new($str,'sjis-imode')->get; |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# convert zenkaku (utf8) -> hankaku (utf8) |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
print Unicode::Japanese->new($str)->z2h->get; |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
=head1 DESCRIPTION |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
The Unicode::Japanese module converts encoding of japanese text from one |
637
|
|
|
|
|
|
|
encoding to another. |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=head2 FEATURES |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=over 2 |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
=item * |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
An instance of Unicode::Japanese internally holds a string in UTF-8. |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=item * |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
This module is implemented in two ways: XS and pure perl. If efficiency is |
656
|
|
|
|
|
|
|
important for you, you should build and install the XS module. If you don't want |
657
|
|
|
|
|
|
|
to, or if you can't build the XS module, you may use the pure perl module |
658
|
|
|
|
|
|
|
instead. In that case, only you have to do is to copy Japanese.pm into somewhere |
659
|
|
|
|
|
|
|
in @INC. |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=item * |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
This module can convert characters from zenkaku (full-width) form to hankaku |
667
|
|
|
|
|
|
|
(half-width) form, and vice versa. Conversion between hiragana (one of two sets |
668
|
|
|
|
|
|
|
of japanese phonetical alphabet) and katakana (another set of japanese |
669
|
|
|
|
|
|
|
phonetical alphabet) is also supported. |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=item * |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
This module has mapping tables for emoji (graphic characters) defined by various |
677
|
|
|
|
|
|
|
japanese mobile phones; DoCoMo i-mode, ASTEL dot-i and J-PHONE J-Sky. Those |
678
|
|
|
|
|
|
|
letters are mapped on Unicode Private Use Area so unicode strings it outputs are |
679
|
|
|
|
|
|
|
still valid even if they contain emoji, and you can safely pass them to other |
680
|
|
|
|
|
|
|
softwares that can handle Unicode. |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=item * |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
This module can map some emoji from one set to another. Different mobile phones |
688
|
|
|
|
|
|
|
define different sets of emoji, so mapping each other is not always |
689
|
|
|
|
|
|
|
possible. But since some emoji exist in two or more sets with similar |
690
|
|
|
|
|
|
|
appearance, this module considers those emoji to be the same. |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=item * |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
This module uses the mapping table for MS-CP932 instead of the standard |
698
|
|
|
|
|
|
|
Shift_JIS. The Shift_JIS encoding used by MS-Windows (MS-SJIS/MS-CP932) slightly |
699
|
|
|
|
|
|
|
differs from the standard. |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=item * |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
When the module converts strings from Unicode to Shift_JIS, EUC-JP or |
707
|
|
|
|
|
|
|
ISO-2022-JP, unicode letters which can't be represented in those encodings will |
708
|
|
|
|
|
|
|
be encoded in "dddd;" form (decimal character reference). Note, however, that |
709
|
|
|
|
|
|
|
letters in Unicode Private Use Area will be replaced with '?' mark ('QUESTION |
710
|
|
|
|
|
|
|
MARK'; U+003F) instead of being encoded. In addition, encoding to character sets |
711
|
|
|
|
|
|
|
for mobile phones makes every unrepresentable letters being '?' mark. |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
=item * |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
On perl-5.8.0 or later, this module handles the UTF-8 flag: the method utf8() |
719
|
|
|
|
|
|
|
returns UTF-8 I string, and the method getu() returns UTF-8 I |
720
|
|
|
|
|
|
|
string. |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
Currently the method get() returns UTF-8 I string but this behavior may be |
724
|
|
|
|
|
|
|
changed in the future. |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
Methods like sjis(), jis(), utf8(), and such like return I string. new(), |
728
|
|
|
|
|
|
|
set(), getcode() methods just ignore the UTF-8 flag of strings they take. |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=back |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
=head1 REQUIREMENT |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=over 4 |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
=item * |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
perl 5.10.x, 5.8.x, etc. (5.005_03 and later) |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=item * |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
(optional) |
749
|
|
|
|
|
|
|
C Compiler. |
750
|
|
|
|
|
|
|
This module supports both XS and Pure Perl. |
751
|
|
|
|
|
|
|
If you have no C Compilers, |
752
|
|
|
|
|
|
|
Unicode::Japanese will be installed as Pure Perl module. |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=item * |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
(optional) |
760
|
|
|
|
|
|
|
Test.pm and Test::More for testing. |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=back |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=head1 METHODS |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
=over 4 |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=item $s = Unicode::Japanese->new($str [, $icode [, $encode]]) |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
Create a new instance of Unicode::Japanese. |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
Any given parameters will be internally passed to the method set(). |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=item unijp($str [, $icode [, $encode]]) |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Same as Unicode::Janaese->new(...). |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=item $s->set($str [, $icode [, $encode]]) |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
=over 2 |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
=item $str: string |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
=item $icode: optional character encoding (default: 'utf8') |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=item $encode: optional binary encoding (default: no binary encodings are assumed) |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=back |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Store a string into the instance. |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
Possible character encodings are: |
798
|
|
|
|
|
|
|
'auto', 'utf8', 'ucs2', 'ucs4', 'utf16-be', 'utf16-le', 'utf16', |
799
|
|
|
|
|
|
|
'utf32-be', 'utf32-le', 'utf32', 'jis', 'euc', 'euc-jp', |
800
|
|
|
|
|
|
|
'sjis', 'cp932', 'sjis-imode', 'sjis-imode1', 'sjis-imode2', |
801
|
|
|
|
|
|
|
'sjis-doti', 'sjis-doti1', 'sjis-jsky', 'sjis-jsky1', 'sjis-jsky2', |
802
|
|
|
|
|
|
|
'jis-jsky', 'jis-jsky1', 'jis-jsky2', 'jis-au', 'jis-au1', 'jis-au2', |
803
|
|
|
|
|
|
|
'sjis-au', 'sjis-au1', 'sjis-au2', 'sjis-icon-au', 'sjis-icon-au1', 'sjis-icon-au2', |
804
|
|
|
|
|
|
|
'euc-icon-au', 'euc-icon-au1', 'euc-icon-au2', 'jis-icon-au', 'jis-icon-au1', 'jis-icon-au2', |
805
|
|
|
|
|
|
|
'utf8-icon-au', 'utf8-icon-au1', 'utf8-icon-au2', 'ascii', 'binary' |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
If you want the Unicode::Japanese detect the character encoding of string, you |
809
|
|
|
|
|
|
|
must explicitly specify 'auto' as the second argument. In that case, the given |
810
|
|
|
|
|
|
|
string will be passed to the method getcode() to guess the encoding. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
For binary encodings, only 'base64' is currently supported. If you specify |
814
|
|
|
|
|
|
|
'base64' as the third argument, the given string will be decoded using Base64 |
815
|
|
|
|
|
|
|
decoder. |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
Specify 'binary' as the second argument if you want your string to be stored |
819
|
|
|
|
|
|
|
without modification. |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
When you specify 'sjis-imode' or 'sjis-doti' as the character encoding, any |
823
|
|
|
|
|
|
|
occurences of 'dddd;' (decimal character reference) in the string will be |
824
|
|
|
|
|
|
|
interpreted and decoded as code point of emoji, just like emoji implanted into |
825
|
|
|
|
|
|
|
the string in binary form. |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
Since encoded forms of strings in various encodings are not clearly distinctive |
829
|
|
|
|
|
|
|
to each other, it is not always certainly possible to detect what encoding is |
830
|
|
|
|
|
|
|
used for a given string. |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
When a given string is possibly interpreted as both Shift_JIS and UTF-8 string, |
834
|
|
|
|
|
|
|
this module considers such a string to be encoded in Shift_JIS. And if the |
835
|
|
|
|
|
|
|
encoding is not distinguishable between 'sjis-au' and 'sjis-doti', this module |
836
|
|
|
|
|
|
|
considers it 'sjis-au'. |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=item $str = $s->get |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=over 2 |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
=item $str: string (UTF-8) |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
=back |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
Get the internal string in UTF-8. |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
This method currently returns a byte string (whose UTF-8 flag is turned off), |
851
|
|
|
|
|
|
|
but this behavior may be changed in the future. |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
If you absolutely want a byte string, you should use the method utf8() |
855
|
|
|
|
|
|
|
instead. And if you want a character string (whose UTF-8 flag is turned on), you |
856
|
|
|
|
|
|
|
have to use the method getu(). |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
=item $str = $s->getu |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
=over 2 |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
=item $str: string (UTF-8) |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=back |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
Get the internal string in UTF-8. |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
On perl-5.8.0 or later, this method returns a character string with its UTF-8 |
871
|
|
|
|
|
|
|
flag turned on. |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=item $code = $s->getcode($str) |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
=over 2 |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=item $str: string |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
=item $code: name of character encoding |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
=back |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
Detect the character encoding of given string. |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
Note that this method, exceptionaly, doesn't deal with the internal string of an |
888
|
|
|
|
|
|
|
instance. |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
To guess the encoding, the following algorithm is used: |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
(For pure perl implementation) |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
=over 4 |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
=item 1 |
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
If the string has an UTF-32 BOM, its encoding is 'utf32'. |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
=item 2 |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
If it has an UTF-16 BOM, its encoding is 'utf16'. |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
=item 3 |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
If it is valid for UTF-32BE, its encoding is 'utf32-be'. |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
=item 4 |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
If it is valid for UTF-32LE, its encoding is 'utf32-le'. |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
=item 5 |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
If it contains no ESC characters or bytes whose eighth bit is on, its encoding |
932
|
|
|
|
|
|
|
is 'ascii'. Every ASCII control characters (0x00-0x1F and 0x7F) except ESC |
933
|
|
|
|
|
|
|
(0x1B) are considered to be in the range of 'ascii'. |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=item 6 |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
|
940
|
|
|
|
|
|
|
If it contains escape sequences of ISO-2022-JP, its encoding is 'jis'. |
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=item 7 |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
If it contains any emoji defined for J-PHONE, its encoding is 'sjis-jsky'. |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
=item 8 |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
If it is valid for EUC-JP, its encoding is 'euc'. |
955
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
=item 9 |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
If it is valid for Shift_JIS, its encoding is 'sjis'. |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
If it contains any emoji defined for au, and everything else is valid for |
965
|
|
|
|
|
|
|
Shift_JIS, its encoding is 'sjis-au'. |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
=item 10 |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
If it contains any emoji defined for i-mode, and everything else is valid for |
973
|
|
|
|
|
|
|
Shift_JIS, its encoding is 'sjis-imode'. |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
=item 11 |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
If it contains any emoji defined for dot-i, and everything else is valid for |
981
|
|
|
|
|
|
|
Shift_JIS, its encoding is 'sjis-doti'. |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
=item 12 |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
If it is valid for UTF-8, its encoding is 'utf8'. |
989
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
=item 13 |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
If no conditions above are fulfilled, its encoding is 'unknown'. |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
=back |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
(For XS implementation) |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
=over 4 |
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
=item 1 |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
If the string has an UTF-32 BOM, its encoding is 'utf32'. |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
=item 2 |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
If it has an UTF-16 BOM, its encoding is 'utf16'. |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
=item 3 |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
Find all possible encodings that might have been applied to the string from the |
1024
|
|
|
|
|
|
|
following: |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
|
1027
|
|
|
|
|
|
|
ascii / euc-jp / sjis / jis / utf8 / utf32-be / utf32-le / sjis-jsky / |
1028
|
|
|
|
|
|
|
sjis-imode / sjis-au / sjis-doti |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
=item 4 |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
If any encodings have been found possible, this module picks out one encoding |
1036
|
|
|
|
|
|
|
having the highest priority among them. The priority order is as follows: |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
utf32-be / utf32-le / ascii / jis / euc-jp / sjis / sjis-jsky / sjis-imode / |
1040
|
|
|
|
|
|
|
sjis-au / sjis-doti / utf8 |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
=item 5 |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
If no conditions above are fulfilled, its encoding is 'unknown'. |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
=back |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
Pay attention to the following pitfalls in the above algorithm: |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
=over 2 |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
=item * |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
UTF-8 strings might be accidentally considered to be encoded in Shift_JIS. |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
=item * |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
UCS-2 strings (sequence of raw UCS-2 letters in big-endian; each letters has |
1069
|
|
|
|
|
|
|
always 2 bytes) can't be detected because they look like nothing but sequences |
1070
|
|
|
|
|
|
|
of random bytes whose length is an even number. |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
=item * |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
UTF-16 strings must have BOM to be detected. |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
=item * |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
Emoji are only be recognized if they are implanted into the string in binary |
1085
|
|
|
|
|
|
|
form. If they are described in 'dddd;' form, they aren't considered to be |
1086
|
|
|
|
|
|
|
emoji. |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=back |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
Since the XS and pure perl implementations use different algorithms to guess |
1092
|
|
|
|
|
|
|
encoding, they may guess differently for the same string. Especially, the pure |
1093
|
|
|
|
|
|
|
perl implementation finds Shift_JIS strings containing ESC character (0x1B) to |
1094
|
|
|
|
|
|
|
be actually encoded in Shift_JIS but XS implementation doesn't. This is because |
1095
|
|
|
|
|
|
|
such strings can hardly be distinguished from 'sjis-jsky'. In addition, EUC-JP |
1096
|
|
|
|
|
|
|
strings containing ESC character are also rejected for the same reason. |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
=item $code = $s->getcodelist($str) |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
=over 2 |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=item $str: string |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
=item $code: name of character encodings |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
=back |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
Detect the character encoding of given string. |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
Unlike the method getcode(), getcodelist() returns a list of possible encodings. |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
=item $str = $s->conv($ocode, $encode) |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
=over 2 |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
=item $ocode: character encoding (possible encodings: 'utf8', 'euc', 'euc-jp', 'jis', 'sjis', 'cp932', |
1120
|
|
|
|
|
|
|
'sjis-imode', 'sjis-imode1', 'sjis-imode2', 'sjis-doti', 'sjis-doti1', 'sjis-jsky', 'sjis-jsky1', 'sjis-jsky2', |
1121
|
|
|
|
|
|
|
'jis-jsky', 'jis-jsky1', 'jis-jsky2', 'jis-au', 'jis-au1', 'jis-au2', 'sjis-au', 'sjis-au1', 'sjis-au2', |
1122
|
|
|
|
|
|
|
'sjis-icon-au', 'sjis-icon-au1', 'sjis-icon-au2', 'euc-icon-au', 'euc-icon-au1', 'euc-icon-au2', |
1123
|
|
|
|
|
|
|
'jis-icon-au', 'jis-icon-au1', 'jis-icon-au2', 'utf8-icon-au', 'utf8-icon-au1', 'utf8-icon-au2', |
1124
|
|
|
|
|
|
|
'ucs2', 'ucs4', 'utf16', 'binary') |
1125
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
Some encodings for mobile phones have a trailing digit like 'sjis-au2'. Those |
1127
|
|
|
|
|
|
|
digits represent the version number of encodings. Such encodings have a variant |
1128
|
|
|
|
|
|
|
with no trailing digits, like 'sjis-au', which is the same as the latest version |
1129
|
|
|
|
|
|
|
among its variants. |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
=item $encode: optional binary encoding |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
=item $str: string |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=back |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
Get the internal string of instance with encoding it using a given character |
1139
|
|
|
|
|
|
|
encoding method. |
1140
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
If you want the resulting string to be encoded in Base64, specify 'base64' as |
1143
|
|
|
|
|
|
|
the second argument. |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
On perl-5.8.0 or later, the UTF-8 flag of resulting string is turned off even if |
1147
|
|
|
|
|
|
|
you specify 'utf8' to the first argument. |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
=item $s->tag2bin |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
Interpret decimal character references (dddd;) in the instance, and replaces |
1153
|
|
|
|
|
|
|
them with single characters they represent. |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=item $s->z2h |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
Replace zenkaku (full-width) letters in the instance with hankaku (half-width) |
1159
|
|
|
|
|
|
|
letters. |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
=item $s->h2z |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
Replace hankaku (half-width) letters in the instance with zenkaku (full-width) |
1165
|
|
|
|
|
|
|
letters. |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=item $s->hira2kata |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
Replace any hiragana in the instance with katakana. |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
=item $s->kata2hira |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
Replace any katakana in the instance with hiragana. |
1176
|
|
|
|
|
|
|
|
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
=item $str = $s->jis |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
$str: byte string in ISO-2022-JP |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in ISO-2022-JP. |
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
=item $str = $s->euc |
1187
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
$str: byte string in EUC-JP |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in EUC-JP. |
1192
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
=item $str = $s->utf8 |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
$str: byte string in UTF-8 |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
Get the internal UTF-8 string of instance. |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
On perl-5.8.0 or later, the UTF-8 flag of resulting string is turned off. |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=item $str = $s->ucs2 |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
$str: byte string in UCS-2 |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
Get the internal string of instance as a sequence of raw UCS-2 letters in |
1211
|
|
|
|
|
|
|
big-endian. Note that this is different from UTF-16BE as raw UCS-2 sequence has |
1212
|
|
|
|
|
|
|
no concept of surrogate pair. |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
=item $str = $s->ucs4 |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
$str: byte string in UCS-4 |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
Get the internal string of instance as a sequence of raw UCS-4 letters in |
1221
|
|
|
|
|
|
|
big-endian. This is practically the same as UTF-32BE. |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=item $str = $s->utf16 |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
$str: byte string in UTF-16 |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
Get the insternal string of instance with encoding it in UTF-16 in big-endian |
1230
|
|
|
|
|
|
|
with no BOM prepended. |
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
=item $str = $s->sjis |
1234
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
$str: byte string in Shift_JIS |
1236
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in Shift_JIS (MS-SJIS / |
1239
|
|
|
|
|
|
|
MS-CP932). |
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
=item $str = $s->sjis_imode |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
$str: byte string in 'sjis-imode' |
1245
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-imode'. |
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
=item $str = $s->sjis_imode1 |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
$str: byte string in 'sjis-imode1' |
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-imode1'. |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
=item $str = $s->sjis_imode2 |
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
$str: byte string in 'sjis-imode2' |
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-imode2'. |
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
=item $str = $s->sjis_doti |
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
$str: byte string in 'sjis-doti' |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-doti'. |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
=item $str = $s->sjis_jsky |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
$str: byte string in 'sjis-jsky' |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-jsky'. |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
=item $str = $s->sjis_jsky1 |
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
$str: byte string in 'sjis-jsky1' |
1285
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-jsky1'. |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
=item $str = $s->sjis_jsky |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
$str: byte string in 'sjis-jsky' |
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-jsky'. |
1296
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
=item $str = $s->sjis_icon_au |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
$str: byte string in 'sjis-icon-au' |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
Get the internal string of instance with encoding it in 'sjis-icon-au'. |
1304
|
|
|
|
|
|
|
|
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
=item $str_arrayref = $s->strcut($len) |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
=over 2 |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
=item $len: maximum length of each chunks (in number of |
1311
|
|
|
|
|
|
|
full-width characters) |
1312
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
=item $str_arrayref: reference to array of strings |
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
=back |
1316
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
Split the internal string of instance into chunks of a given length. |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
On perl-5.8.0 or later, UTF-8 flags of each chunks are turned on. |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
=item $len = $s->strlen |
1324
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
$len: character width of the internal string |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
Calculate the character width of the internal string. Half-width characters have |
1329
|
|
|
|
|
|
|
width of one unit, and full-width characters have width of two units. |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
|
1332
|
|
|
|
|
|
|
=item $s->join_csv(@values); |
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
@values: array of strings |
1335
|
|
|
|
|
|
|
|
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
Build a line of CSV from the arguments, and store it into the instance. The |
1338
|
|
|
|
|
|
|
resulting line has a trailing line break ("\n"). |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
=item @values = $s->split_csv; |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
@values: array of strings |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
Parse a line of CSV in the instance and return each columns. The line will be |
1347
|
|
|
|
|
|
|
chomp()ed before getting parsed. |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
|
1350
|
|
|
|
|
|
|
If the internal string was decoded from 'binary' encoding (see methods new() and |
1351
|
|
|
|
|
|
|
set()), the UTF-8 flags of the resulting array of strings are turned |
1352
|
|
|
|
|
|
|
off. Otherwise the flags are turned on. |
1353
|
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
|
|
1355
|
|
|
|
|
|
|
=back |
1356
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
=head1 DESCRIPTION OF UNICODE MAPPING |
1358
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
Transcoding between Unicode encodings and other ones is performed as below: |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
=over 2 |
1363
|
|
|
|
|
|
|
|
1364
|
|
|
|
|
|
|
=item Shift_JIS |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
This module uses the mapping table of MS-CP932. |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
L<< ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT >> |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
When the module tries to convert Unicode string to Shift_JIS, it represents most |
1373
|
|
|
|
|
|
|
letters which isn't available in Shift_JIS as decimal character reference |
1374
|
|
|
|
|
|
|
('dddd;'). There is one exception to this: every graphic characters for mobile |
1375
|
|
|
|
|
|
|
phones are replaced with '?' mark. |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
For variants of Shift_JIS defined for mobile phones, every unrepresentable |
1379
|
|
|
|
|
|
|
characters are replaced with '?' mark unlike the plain Shift_JIS. |
1380
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
=item EUC-JP/ISO-2022-JP |
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
This module doesn't directly convert Unicode string from/to EUC-JP or |
1385
|
|
|
|
|
|
|
ISO-2022-JP: it once converts from/to Shift_JIS and then do the rest |
1386
|
|
|
|
|
|
|
translation. So characters which aren't available in the Shift_JIS can not be |
1387
|
|
|
|
|
|
|
properly translated. |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
|
1390
|
|
|
|
|
|
|
=item DoCoMo i-mode |
1391
|
|
|
|
|
|
|
|
1392
|
|
|
|
|
|
|
This module maps emoji in the range of F800 - F9FF to U+0FF800 - U+0FF9FF. |
1393
|
|
|
|
|
|
|
|
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
=item ASTEL dot-i |
1396
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
This module maps emoji in the range of F000 - F4FF to U+0FF000 - U+0FF4FF. |
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
|
1400
|
|
|
|
|
|
|
=item J-PHONE J-SKY |
1401
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
The encoding method defined by J-SKY is as follows: first an escape sequence |
1403
|
|
|
|
|
|
|
"\e\$" comes to indicate the beginning of emoji, then the first byte of an emoji |
1404
|
|
|
|
|
|
|
comes next, then the second bytes of at least one emoji comes next, then "\x0f" |
1405
|
|
|
|
|
|
|
comes last to indicate the end of emoji. If a string contains a series of emoji |
1406
|
|
|
|
|
|
|
whose first bytes are identical, such sequence can be compressed by cascading |
1407
|
|
|
|
|
|
|
second bytes of them to the single first byte. |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
This module considers a pair of those first and second bytes to be one letter, |
1411
|
|
|
|
|
|
|
and map them from 4500 - 47FF to U+0FFB00 - U+0FFDFF. |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
|
1414
|
|
|
|
|
|
|
When the module encodes J-SKY emoji, it performs the compression automatically. |
1415
|
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
=item AU |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
This module maps AU emoji to U+0FF500 - U+0FF6FF. |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
=back |
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
=head1 PurePerl mode |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
use Unicode::Japanese qw(PurePerl); |
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
If you want to explicitly take the pure perl implementation, pass |
1429
|
|
|
|
|
|
|
C<'PurePerl'> to the argument of the C |
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
=head1 BUGS |
1433
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
Please report bugs and requests to C or |
1435
|
|
|
|
|
|
|
L. If you |
1436
|
|
|
|
|
|
|
report them to the web interface, any progress to your report will be |
1437
|
|
|
|
|
|
|
automatically sent back to you. |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
=over 2 |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
=item * |
1443
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
This module doesn't directly convert Unicode string from/to EUC-JP or |
1447
|
|
|
|
|
|
|
ISO-2022-JP: it once converts from/to Shift_JIS and then do the rest |
1448
|
|
|
|
|
|
|
translation. So characters which aren't available in the Shift_JIS can not be |
1449
|
|
|
|
|
|
|
properly translated. |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
=item * |
1453
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
|
1455
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
The XS implementation of getcode() fails to detect the encoding when the given |
1457
|
|
|
|
|
|
|
string contains \e while its encoding is EUC-JP or Shift_JIS. |
1458
|
|
|
|
|
|
|
|
1459
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
=item * |
1461
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
Japanese.pm is composed of textual perl script and binary character conversion |
1465
|
|
|
|
|
|
|
table. If you transfer it on FTP using ASCII mode, the file will collapse. |
1466
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
=back |
1469
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
=head1 SUPPORT |
1471
|
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
1473
|
|
|
|
|
|
|
|
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
perldoc Unicode::Japanese |
1476
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
You can find more information at: |
1478
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
=over 4 |
1481
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
L |
1485
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
=item * CPAN Ratings |
1488
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
L |
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
1493
|
|
|
|
|
|
|
|
1494
|
|
|
|
|
|
|
L |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
|
1497
|
|
|
|
|
|
|
=item * Search CPAN |
1498
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
L |
1500
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
=back |
1503
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
=head1 CREDITS |
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
Thanks very much to: |
1507
|
|
|
|
|
|
|
|
1508
|
|
|
|
|
|
|
|
1509
|
|
|
|
|
|
|
NAKAYAMA Nao |
1510
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
SUGIURA Tatsuki & Debian JP Project |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
1516
|
|
|
|
|
|
|
|
1517
|
|
|
|
|
|
|
Copyright 2001-2008 |
1518
|
|
|
|
|
|
|
SANO Taku (SAWATARI Mikage) and YAMASHINA Hio, |
1519
|
|
|
|
|
|
|
all rights reserved. |
1520
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
1523
|
|
|
|
|
|
|
under the same terms as Perl itself. |
1524
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
|
1527
|
|
|
|
|
|
|
=cut |
1528
|
|
|
|
|
|
|
|
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
__DATA__ |