File Coverage

blib/lib/Encode/X11.pm

Criterion	Covered	Total	%
statement	111	135	82.2
branch	37	56	66.0
condition	18	23	78.2
subroutine	9	10	90.0
pod	2	2	100.0
total	177	226	78.3

line	stmt	bran	cond	sub	pod	time	code
1							# Copyright 2011, 2012, 2013, 2014 Kevin Ryde
2
3							# This file is part of X11-Protocol-Other.
4							#
5							# X11-Protocol-Other is free software; you can redistribute it and/or modify
6							# it under the terms of the GNU General Public License as published by the
7							# Free Software Foundation; either version 3, or (at your option) any later
8							# version.
9							#
10							# X11-Protocol-Other is distributed in the hope that it will be useful, but
11							# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12							# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13							# for more details.
14							#
15							# You should have received a copy of the GNU General Public License along
16							# with X11-Protocol-Other. If not, see .
17
18
19							# /usr/share/doc/xorg-docs/specs/CTEXT/ctext.txt.gz
20							# lcCT.c
21							# lcUTF8.c
22							# RFC2237 2022-jp
23							# RFC1557 2022-kr
24
25
26							package Encode::X11;
27	1			1		8246	use 5.008;
	1					3
28	1			1		3	use strict;
	1					1
	1					15
29	1			1		3	use warnings;
	1					1
	1					17
30	1			1		2	use Carp;
	1					2
	1					45
31	1			1		3	use Encode ();
	1					1
	1					11
32	1			1		2	use Encode::Encoding;
	1					1
	1					1124
33
34							our $VERSION = 30;
35							our @ISA = ('Encode::Encoding');
36
37							# uncomment this to run the ### lines
38							#use Devel::Comments; # '###';
39
40							__PACKAGE__->Define('x11-compound-text');
41
42
43							#------------------------------------------------------------------------------
44							# encode
45
46							my @coding = ('iso-8859-1',
47							'iso-8859-2',
48							'iso-8859-3',
49							'iso-8859-4',
50							'iso-8859-7',
51							'iso-8859-6',
52							'iso-8859-8',
53							'iso-8859-5',
54							'iso-8859-9',
55							'jis0201-raw',
56
57							'jis0208-raw',
58							'ksc5601-raw',
59							'jis0212-raw',
60							'gb2312-raw',
61							);
62							# $esc[$i] corresponding to $coding[$i]
63							my @esc = (
64							# Esc 0x2D switch GR 0x80-0xFF
65							"\x1B\x2D\x41", # iso-8859-1 GR Esc-A
66							"\x1B\x2D\x42", # iso-8859-2 GR Esc-B
67							"\x1B\x2D\x43", # iso-8859-3 GR Esc-C
68							"\x1B\x2D\x44", # iso-8859-4 GR Esc-D
69							"\x1B\x2D\x46", # iso-8859-7 GR Esc-F
70							"\x1B\x2D\x47", # iso-8859-6 GR
71							"\x1B\x2D\x48", # iso-8859-8 GR
72							"\x1B\x2D\x4C", # iso-8859-5 GR
73							"\x1B\x2D\x4D", # iso-8859-9 GR
74							"\x1B\x29\x49", # jis 201 right half GR
75
76							"\x1B\x24\x28\x42", # jis 208 GL Esc$(B
77							"\x1B\x24\x28\x43", # ksc 5601 GL Esc$(C
78							"\x1B\x24\x28\x44", # jis 212 GL Esc$(D
79							"\x1B\x24\x28\x41", # gb 2312 GL Esc$(A
80
81							# FIXME: any merit generating these, when available?
82							# "\x1B\x24\x28\x47" => 'cns11643-1', # Encode::HanExtra
83							# "\x1B\x24\x28\x48" => 'cns11643-2',
84							# "\x1B\x24\x28\x49" => 'cns11643-3',
85							# "\x1B\x24\x28\x4A" => 'cns11643-4',
86							# "\x1B\x24\x28\x4B" => 'cns11643-5',
87							# "\x1B\x24\x28\x4C" => 'cns11643-6',
88							# "\x1B\x24\x28\x4D" => 'cns11643-7',
89							);
90
91							# xfree86 utf8 in compound: ESC % G --UTF-8-BYTES-- ESC % @
92							# 1B 25 47 1B 25 40
93
94							# return true if any of the @coding encodings is able to encode $str
95							sub _encodable_char {
96	1			1		4	my ($str) = @_;
97	1					2	foreach my $coding (@coding) {
98	1					1	my $input = $str;
99	1					4	Encode::encode ($coding, $input, Encode::FB_QUIET());
100	1	50				15	if (! length($input)) {
101	1					3	return 1;
102							}
103							}
104	0					0	return 0;
105							}
106
107							my $use_utf8 = 1;
108
109							sub encode {
110	5			5	1	1499	my ($self, $str, $chk) = @_;
111							### Encode-X11 encode(): 'len='.length($str)
112
113							# FIXME: don't think want to preserve esc state across multiple encode()
114							# calls, except for perlio ...
115	5					10	local $self->{'gl_non_ascii'};
116
117							# as much initial latin1 as possible
118	5					14	my $ret = Encode::encode ('iso-8859-1', $str, Encode::FB_QUIET());
119
120	5					74	my $in_latin1 = 1;
121
122	5					13	while (length($str)) {
123							### str length: length($str)
124
125	4					5	my $longest_bytes = '';
126	4					2	my $esc;
127	4					4	my $remainder = $str;
128	4					9	foreach my $i (0 .. $#coding) {
129	47	100				64	last unless length($remainder);
130	44					26	my $input = $str;
131	44					75	my $bytes = Encode::encode ($coding[$i], $input, Encode::FB_QUIET());
132	44	100				6003	if (length($input) < length($remainder)) {
133							### coding: $coding[$i]
134							### length: length($bytes)
135	3					3	$longest_bytes = $bytes;
136	3					4	$esc = $esc[$i];
137	3					2	$remainder = $input;
138	3					5	$in_latin1 = ($i == 0);
139							}
140							}
141							### $longest_bytes
142							### $esc
143
144	4	100				8	if (length($longest_bytes)) {
145	3	50				6	if ($esc eq "\x1B\x29\x49") {
		0
146							# 0x49 right half jis0201 in GR
147	3	100				10	if ($longest_bytes !~ /[\x80-\xFF]/) {
148	1					1	$esc = '';
149							}
150	3	100				9	if ($longest_bytes =~ /[\x00-\x7F]/) {
151							# 0x7E overline U+203E switch GL to jis0201
152	1					2	$esc .= "\x1B\x28\x4A"; # 0x4A left half in GL
153	1					2	$self->{'gl_non_ascii'} = 1;
154							}
155							} elsif (length($esc) == 3) {
156							### want ascii in GL
157	0					0	$ret .= _encode_ensure_ascii($self);
158							} else {
159	0					0	$self->{'gl_non_ascii'} = 1;
160							}
161	3					2	$ret .= $esc;
162	3					4	$ret .= $longest_bytes;
163	3					6	$str = $remainder;
164
165							} else {
166							### unconvertable: ord(substr($str,0,1))
167
168	1	50				4	if ($use_utf8) {
169	1					1	my $ulen = 1;
170	1					1	for (;;) {
171	1	50				4	if (_encodable_char(substr($str,$ulen,1))) {
172	1					2	last;
173							}
174	0					0	$ulen++;
175							}
176	1					3	my $input = substr($str,0,$ulen);
177	1					2	$str = substr($str,$ulen);
178
179	1					4	my $bytes = Encode::encode ('utf-8', $input, Encode::FB_QUIET());
180	1					23	$ret .= "\x1B\x25\x47";
181	1					2	$ret .= $bytes;
182	1					1	$ret .= "\x1B\x25\x40";
183	1	50				3	if (length($input)) {
184							### oops, unencodable as utf-8 too
185	0					0	$str = $input . $str;
186							} else {
187	1					3	next;
188							}
189							}
190
191	0	0				0	if ($chk) {
192							### stop
193	0					0	last;
194							} else {
195							### substitute "?" char
196	0					0	$ret .= _encode_ensure_ascii($self);
197	0					0	$ret .= '?';
198	0					0	$str = substr ($str, 1);
199							}
200							}
201							}
202							# if (! $in_latin1) {
203							# $ret .= $esc[0];
204							# }
205	5	50				7	if ($chk) {
206	5					5	$_[1] = $str; # unconverted part, if any
207							}
208							### encode final: $ret
209	5					12	return $ret;
210							}
211							sub _encode_ensure_ascii {
212	0			0		0	my ($self) = @_;
213	0	0				0	if ($self->{'gl_non_ascii'}) {
214	0					0	$self->{'gl_non_ascii'} = 0;
215	0					0	return "\x1B\x28\x42"; # ascii GL
216							} else {
217	0					0	return '';
218							}
219							}
220
221
222							#------------------------------------------------------------------------------
223							# decode()
224
225							# xfree86 utf8 in compound: ESC % G --UTF-8-BYTES-- ESC % @
226							# 25 47 25 40
227
228							my %esc_to_coding =
229							(
230							# esc[] table above
231							(map { $esc[$_] => $coding[$_] } 0 .. $#coding),
232
233							"\x1B\x28\x4A" => 'jis0201-raw', # jis0201 GL ascii except 0x7E
234							"\x1B\x29\x49" => 'jis0201-raw', # jis0201 right GR japanese
235
236							# but supposed to have 0x4A jis0201 left only in GL, and 0x49 jis0201
237							# right only in GR
238							"\x1B\x28\x49" => 'ascii', # jis0201 GL
239							"\x1B\x29\x4A" => 'iso-8859-1', # jis0201 GR
240
241							"\x1B\x28\x42" => 'ascii',
242
243							# "\x1B\x2D\x44" => 'jis0212-raw', # GL 1-bytes 96 chars
244
245							# \x24 means 2-bytes per char
246							# "\x1B\x24\x28\x41" => 'gb2312',
247							# "\x1B\x24\x28\x42" => 'jis0208-raw',# 208-1983 or 208-1990
248							"\x1B\x24\x28\x43" => 'ksc5601-raw',
249							"\x1B\x24\x28\x44" => 'jis0212-raw',# 212-1990
250
251							# http://www.itscj.ipsj.or.jp/ISO-IR/2-4.htm
252							"\x1B\x24\x28\x47" => 'cns11643-1', # Encode::HanExtra
253							"\x1B\x24\x28\x48" => 'cns11643-2',
254							"\x1B\x24\x28\x49" => 'cns11643-3',
255							"\x1B\x24\x28\x4A" => 'cns11643-4',
256							"\x1B\x24\x28\x4B" => 'cns11643-5',
257							"\x1B\x24\x28\x4C" => 'cns11643-6',
258							"\x1B\x24\x28\x4D" => 'cns11643-7',
259
260							"\x1B\x2D\x56" => 'iso-8859-10', # V
261							"\x1B\x2D\x54" => 'iso-8859-11', # T
262							"\x1B\x2D\x59" => 'iso-8859-13', # Y
263							"\x1B\x2D\x5F" => 'iso-8859-14', # "_"
264							"\x1B\x2D\x62" => 'iso-8859-15', # b
265							"\x1B\x2D\x66" => 'iso-8859-16', # f
266
267							# Emacs chinese-big5-1, A141-C67E
268							# "\x1B\x24\x28\x30" => 'big5-eten', # E0
269							# Emacs chinese-big5-2, C940-FEFE
270							# "\x1B\x24\x28\x31" => 'big5-hkscs',
271
272							# Emacs mule ipa or chinese-sisheng ?
273							# "\x1B\x2D\x30" => 'ipa',
274							# Emacs mule viscii ?
275							# "\x1B\x2D\x31" => 'viscii-lower',
276							# "\x1B\x2D\x32" => 'viscii-upper',
277
278							);
279
280							my %coding_is_lo = ('ascii' => 1,
281							'jis0208-raw' => 1,
282							'jis0212-raw' => 1,
283							'ksc5601-raw' => 1,
284							'gb2312-raw' => 1,
285							'cns11643-1' => 1,
286							'cns11643-2' => 1,
287							'cns11643-3' => 1,
288							'cns11643-4' => 1,
289							'cns11643-5' => 1,
290							'cns11643-6' => 1,
291							'cns11643-7' => 1,
292							);
293							my %coding_is_hi = ('big5-eten' => 1,
294							'big5-hkscs' => 1,
295							);
296
297							sub decode {
298	8			8	1	2897	my ($self, $bytes, $chk) = @_;
299							### Encode-X11 decode(): 'len='.length($bytes)
300
301	8					8	my $ret = ''; # wide chars to return
302	8					7	my $gl_coding = 'ascii';
303	8					6	my $gr_coding = 'iso-8859-1';
304	8					6	my $in_utf8 = 0;
305
306	8		100			33	while ((pos($bytes)\|\|0) < length $bytes) {
307	18	50				63	$bytes =~ m{\G(.*?) # $1 part
308							(\x1B # $2 esc
309							(?:[\x28\x29\x2D]. # 1-byte 94, 94GR, or 96GR
310							\|\x24[\x28\x29\x2D]. # 2-byte 94^2 or 96^2
311							\|\x25[\x47\x40] # xfree86 utf-8
312							)
313							\|$)
314							}gx or die;
315	18					26	my $part_bytes = $1;
316	18					17	my $esc = $2;
317
318							### $gl_coding
319							### $gr_coding
320							### part_bytes len: length($part_bytes)
321							#### part_bytes: $part_bytes
322	18					9	for (;;) {
323	28					20	my $coding;
324							my $half_bytes;
325	28	100	66			101	if ($in_utf8 && length($part_bytes) && ! pos($part_bytes)) {
		100	100
		100
326							### utf8 bytes
327	2					2	$half_bytes = $part_bytes;
328	2					4	pos($part_bytes) = length($part_bytes);
329	2					2	$coding = 'utf-8';
330
331							} elsif ($part_bytes =~ /\G([\x00-\x7F]+)/gc) {
332							### run of GL low bytes ...
333	3					3	$half_bytes = $1;
334	3					3	$coding = $gl_coding;
335	3	50				5	if ($coding_is_hi{$coding}) {
336	0					0	$half_bytes =~ tr/\x21-\x7E/\xA1-\xFE/;
337							}
338							} elsif ($part_bytes =~ /\G([^\x00-\x7F]+)/gc) {
339							### run of GR high bytes ...
340	5					5	$half_bytes = $1;
341	5					3	$coding = $gr_coding;
342	5	100				10	if ($coding_is_lo{$coding}) {
343							### pos: pos($part_bytes)
344	1					2	$half_bytes =~ tr/\xA1-\xFE/\x21-\x7E/;
345							### pos: pos($part_bytes)
346							}
347							} else {
348	18					13	last;
349							}
350
351	10					14	while (length $half_bytes) {
352							### $coding
353							### half_bytes len: length($half_bytes)
354							#### $half_bytes
355	10	50				43	$ret .= Encode::decode ($coding, $half_bytes,
356							$chk ? Encode::FB_QUIET() : Encode::FB_DEFAULT());
357							### half_bytes left: length($half_bytes)
358							### now ret len: length($ret)
359							#### now ret: $ret
360	10	50				9723	if (length $half_bytes) {
361							### decode error at: sprintf("%#X",pos($bytes))
362	0	0				0	if ($chk) {
363	0					0	$_[1] = substr ($bytes,
364							pos($bytes) - length($esc)
365							- length($part_bytes) + pos($part_bytes)
366							- length($half_bytes));
367	0					0	return $ret;
368
369							} else {
370	0					0	$ret .= chr(0xFFFD);
371							# or skip two for a 2-byte encoding ?
372	0					0	$half_bytes = substr($half_bytes, 1);
373							}
374							}
375							}
376							}
377
378							### esc: join(' ',map {sprintf("%02X",ord(substr($esc,$_,1)))} 0 .. length($esc)-1)
379							# XFree86
380							# http://www.itscj.ipsj.or.jp/ISO-IR/2-8-1.htm
381	18	100				29	if ($esc eq "\x1B\x25\x47") {
382	2					3	$in_utf8 = 1;
383	2					4	next;
384							}
385	16	100				21	if ($esc eq "\x1B\x25\x40") {
386	2					3	$in_utf8 = 0; # back to GL/GR style ...
387	2					6	next;
388							}
389
390	14					12	my $coding = $esc_to_coding{$esc};
391	14					11	my $gref;
392	14	100	100			68	if (($esc =~ s/\x1B\x29/\x1B\x28/) # 1-byte 94-char
			66
393							\|\|
394							($esc =~ s/\x1B\x24[\x29\x2D]/\x1B\x24\x28/) # 2-byte 94^2-char
395							\|\|
396							($esc =~ /\x1B\x2D/) # 1-byte 96-char
397							) {
398	4					4	$gref = \$gr_coding;
399							} else {
400	10					11	$gref = \$gl_coding;
401							}
402							### mangled esc: join(' ',map {sprintf("%02X",ord(substr($esc,$_,1)))} 0 .. length($esc)-1)
403
404	14		100			30	$coding \|\|= $esc_to_coding{$esc};
405	14	100	33			33	if (! defined $coding
			66
406							\|\| ($coding =~ /^cns/
407	0					0	&& ! eval { require Encode::HanExtra; 1 })) {
	0					0
408							### no coding: $coding
409	8	50				10	if ($chk) {
410	8					25	pos($bytes) -= length($esc);
411	8					14	last;
412							} else {
413	0					0	$ret .= chr(0xFFFD);
414							}
415							}
416	6					13	$$gref = $coding;
417							}
418
419							### final len: length($ret)
420							#### final ret: $ret
421	8					10	$_[1] = substr ($bytes, pos($bytes));
422	8					12	return $ret;
423							}
424
425							1;
426							__END__