File Coverage

blib/lib/Encode/CN/HZ.pm

Criterion	Covered	Total	%
statement	70	112	62.5
branch	32	60	53.3
condition			n/a
subroutine	11	12	91.6
pod	4	4	100.0
total	117	188	62.2

line	stmt	bran	sub	pod	time	code
1						package Encode::CN::HZ;
2
3	9		9		54	use strict;
	9				19
	9				217
4	9		9		41	use warnings;
	9				58
	9				207
5	9		9		4321	use utf8 ();
	9				110
	9				214
6
7	9		9		45	use vars qw($VERSION);
	9				21
	9				735
8						$VERSION = do { my @r = ( q$Revision: 2.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
9
10	9		9		58	use Encode qw(:fallbacks);
	9				19
	9				1285
11
12	9		9		56	use parent qw(Encode::Encoding);
	9				21
	9				55
13						__PACKAGE__->Define('hz');
14
15						# HZ is a combination of ASCII and escaped GB, so we implement it
16						# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
17
18						# not ported for EBCDIC. Which should be used, "~" or "\x7E"?
19
20	4		4	1	44	sub needs_lines { 1 }
21
22						sub decode ($$;$) {
23	12		12	1	405	my ( $obj, $str, $chk ) = @_;
24	12	100			41	return undef unless defined $str;
25
26	11				37	my $GB = Encode::find_encoding('gb2312-raw');
27	11				31	my $ret = substr($str, 0, 0); # to propagate taintedness
28	11				20	my $in_ascii = 1; # default mode is ASCII.
29
30	11				36	while ( length $str ) {
31	3011	100			5121	if ($in_ascii) { # ASCII mode
32	1509	100			7348	if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII
		50
		50
		50
33	758				2234	$ret .= $1;
34
35						# EBCDIC should need ascii2native, but not ported.
36						}
37						elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde
38	0				0	$ret .= '~';
39						}
40						elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
41	0				0	1; # no-op
42						}
43						elsif ( $str =~ s/^\x7E\x7B// ) { # '~{'
44	751				1800	$in_ascii = 0; # to GB
45						}
46						else { # encounters an invalid escape, \x80 or greater
47	0				0	last;
48						}
49						}
50						else { # GB mode; the byte ranges are as in RFC 1843.
51	9		9		2195	no warnings 'uninitialized';
	9				22
	9				5111
52	1502	100			7846	if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
		50
53	751				1439	my $prefix = $1;
54	751				6781	$ret .= $GB->decode( $prefix, $chk );
55						}
56						elsif ( $str =~ s/^\x7E\x7D// ) { # '~}'
57	751				1729	$in_ascii = 1;
58						}
59						else { # invalid
60	0				0	last;
61						}
62						}
63						}
64	11	100			43	$_[1] = '' if $chk; # needs_lines guarantees no partial character
65	11				231	return $ret;
66						}
67
68						sub cat_decode {
69	0		0	1	0	my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
70	0				0	my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
71
72	0				0	my $GB = Encode::find_encoding('gb2312-raw');
73	0				0	my $ret = '';
74	0				0	my $in_ascii = 1; # default mode is ASCII.
75
76	0				0	my $ini_pos = pos($$rsrc);
77
78	0				0	substr( $src, 0, $pos ) = '';
79
80	0				0	my $ini_len = bytes::length($src);
81
82						# $trm is the first of the pair '~~', then 2nd tilde is to be removed.
83						# XXX: Would C<$src =~ s/^\x7E// or die if ...> be better?
84	0	0			0	$src =~ s/^\x7E// if $trm eq "\x7E";
85
86	0				0	while ( length $src ) {
87	0				0	my $now;
88	0	0			0	if ($in_ascii) { # ASCII mode
89	0	0			0	if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII
		0
		0
		0
90	0				0	$now = $1;
91						}
92						elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde
93	0				0	$now = '~';
94						}
95						elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
96	0				0	next;
97						}
98						elsif ( $src =~ s/^\x7E\x7B// ) { # '~{'
99	0				0	$in_ascii = 0; # to GB
100	0				0	next;
101						}
102						else { # encounters an invalid escape, \x80 or greater
103	0				0	last;
104						}
105						}
106						else { # GB mode; the byte ranges are as in RFC 1843.
107	0	0			0	if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
		0
108	0				0	$now = $GB->decode( $1, $chk );
109						}
110						elsif ( $src =~ s/^\x7E\x7D// ) { # '~}'
111	0				0	$in_ascii = 1;
112	0				0	next;
113						}
114						else { # invalid
115	0				0	last;
116						}
117						}
118
119	0	0			0	next if !defined $now;
120
121	0				0	$ret .= $now;
122
123	0	0			0	if ( $now eq $trm ) {
124	0				0	$$rdst .= $ret;
125	0				0	$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
126	0				0	pos($$rsrc) = $ini_pos;
127	0				0	return 1;
128						}
129						}
130
131	0				0	$$rdst .= $ret;
132	0				0	$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
133	0				0	pos($$rsrc) = $ini_pos;
134	0				0	return ''; # terminator not found
135						}
136
137						sub encode($$;$) {
138	491		491	1	2461	my ( $obj, $str, $chk ) = @_;
139	491	100			1103	return undef unless defined $str;
140
141	490				1195	my $GB = Encode::find_encoding('gb2312-raw');
142	490				1458	my $ret = substr($str, 0, 0); # to propagate taintedness;
143	490				694	my $in_ascii = 1; # default mode is ASCII.
144
145	9		9		79	no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.
	9				21
	9				2340
146
147	490				1000	while ( length $str ) {
148	31438	100			776669	if ( $str =~ s/^([[:ascii:]]+)// ) {
		50
149	1482				2878	my $tmp = $1;
150	1482				2773	$tmp =~ s/~/~~/g; # escapes tildes
151	1482	100			3136	if ( !$in_ascii ) {
152	994				1702	$ret .= "\x7E\x7D"; # '~}'
153	994				1592	$in_ascii = 1;
154						}
155	1482				15980	$ret .= pack 'a*', $tmp; # remove UTF8 flag.
156						}
157						elsif ( $str =~ s/(.)// ) {
158	29956				54959	my $s = $1;
159	29956				79441	my $tmp = $GB->encode( $s, $chk );
160	29956	50			63742	last if !defined $tmp;
161	29956	100			50550	if ( length $tmp == 2 ) { # maybe a valid GB char (XXX)
		50
162	29954	100			54869	if ($in_ascii) {
163	996				1707	$ret .= "\x7E\x7B"; # '~{'
164	996				1637	$in_ascii = 0;
165						}
166	29954				382077	$ret .= $tmp;
167						}
168						elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX)
169	2	50			6	if ( !$in_ascii ) {
170	0				0	$ret .= "\x7E\x7D"; # '~}'
171	0				0	$in_ascii = 1;
172						}
173	2				10	$ret .= $tmp;
174						}
175						}
176						else { # if $str is malformed UTF8 and if length $str != 0.
177	0				0	last;
178						}
179						}
180	490	100			1132	$_[1] = $str if $chk;
181
182						# The state at the end of the chunk is discarded, even if in GB mode.
183						# That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
184						# Perhaps it is harmless, but further investigation may be required...
185
186	490	100			989	if ( !$in_ascii ) {
187	2				4	$ret .= "\x7E\x7D"; # '~}'
188	2				3	$in_ascii = 1;
189						}
190	490				1054	utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
191	490				6309	return $ret;
192						}
193
194						1;
195						__END__