File Coverage

lib/Encode/CN/HZ.pm

Criterion	Covered	Total	%
statement	65	109	59.6
branch	25	56	44.6
condition			n/a
subroutine	11	12	91.7
total	101	177	57.1

line	stmt	bran	sub	time	code
1					package Encode::CN::HZ;
2
3	6		6	813404	use strict;
	6			20
	6			250
4	6		6	63	use warnings;
	6			16
	6			191
5	6		6	2361	use utf8 ();
	6			60
	6			177
6
7	6		6	36	use vars qw($VERSION);
	6			15
	6			675
8					$VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
9
10	6		6	34	use Encode qw(:fallbacks);
	6			16
	6			1464
11
12	6		6	86	use base qw(Encode::Encoding);
	6			16
	6			547
13					__PACKAGE__->Define('hz');
14
15					# HZ is a combination of ASCII and escaped GB, so we implement it
16					# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
17
18					# not ported for EBCDIC. Which should be used, "~" or "\x7E"?
19
20	4		4	5846	sub needs_lines { 1 }
21
22					sub decode ($$;$) {
23	9		9	1280	my ( $obj, $str, $chk ) = @_;
24
25	9			39	my $GB = Encode::find_encoding('gb2312-raw');
26	9			138	my $ret = substr($str, 0, 0); # to propagate taintedness
27	9			22	my $in_ascii = 1; # default mode is ASCII.
28
29	9			37	while ( length $str ) {
30	3009	100		6113	if ($in_ascii) { # ASCII mode
31	1507	100		10247	if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII
		50
		50
		50
32	756			3327	$ret .= $1;
33
34					# EBCDIC should need ascii2native, but not ported.
35					}
36					elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde
37	0			0	$ret .= '~';
38					}
39					elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
40	0			0	1; # no-op
41					}
42					elsif ( $str =~ s/^\x7E\x7B// ) { # '~{'
43	751			2252	$in_ascii = 0; # to GB
44					}
45					else { # encounters an invalid escape, \x80 or greater
46	0			0	last;
47					}
48					}
49					else { # GB mode; the byte ranges are as in RFC 1843.
50	6		6	2301	no warnings 'uninitialized';
	6			23
	6			5410
51	1502	100		10001	if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
		50
52	751			6953	$ret .= $GB->decode( $1, $chk );
53					}
54					elsif ( $str =~ s/^\x7E\x7D// ) { # '~}'
55	751			2189	$in_ascii = 1;
56					}
57					else { # invalid
58	0			0	last;
59					}
60					}
61					}
62	9	100		37	$_[1] = '' if $chk; # needs_lines guarantees no partial character
63	9			468	return $ret;
64					}
65
66					sub cat_decode {
67	0		0	0	my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
68	0			0	my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
69
70	0			0	my $GB = Encode::find_encoding('gb2312-raw');
71	0			0	my $ret = '';
72	0			0	my $in_ascii = 1; # default mode is ASCII.
73
74	0			0	my $ini_pos = pos($$rsrc);
75
76	0			0	substr( $src, 0, $pos ) = '';
77
78	0			0	my $ini_len = bytes::length($src);
79
80					# $trm is the first of the pair '~~', then 2nd tilde is to be removed.
81					# XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
82	0	0		0	$src =~ s/^\x7E// if $trm eq "\x7E";
83
84	0			0	while ( length $src ) {
85	0			0	my $now;
86	0	0		0	if ($in_ascii) { # ASCII mode
87	0	0		0	if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII
		0
		0
		0
88	0			0	$now = $1;
89					}
90					elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde
91	0			0	$now = '~';
92					}
93					elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
94	0			0	next;
95					}
96					elsif ( $src =~ s/^\x7E\x7B// ) { # '~{'
97	0			0	$in_ascii = 0; # to GB
98	0			0	next;
99					}
100					else { # encounters an invalid escape, \x80 or greater
101	0			0	last;
102					}
103					}
104					else { # GB mode; the byte ranges are as in RFC 1843.
105	0	0		0	if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
		0
106	0			0	$now = $GB->decode( $1, $chk );
107					}
108					elsif ( $src =~ s/^\x7E\x7D// ) { # '~}'
109	0			0	$in_ascii = 1;
110	0			0	next;
111					}
112					else { # invalid
113	0			0	last;
114					}
115					}
116
117	0	0		0	next if !defined $now;
118
119	0			0	$ret .= $now;
120
121	0	0		0	if ( $now eq $trm ) {
122	0			0	$$rdst .= $ret;
123	0			0	$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
124	0			0	pos($$rsrc) = $ini_pos;
125	0			0	return 1;
126					}
127					}
128
129	0			0	$$rdst .= $ret;
130	0			0	$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
131	0			0	pos($$rsrc) = $ini_pos;
132	0			0	return ''; # terminator not found
133					}
134
135					sub encode($$;$) {
136	488		488	37941	my ( $obj, $str, $chk ) = @_;
137
138	488			1626	my $GB = Encode::find_encoding('gb2312-raw');
139	488			7415	my $ret = substr($str, 0, 0); # to propagate taintedness;
140	488			925	my $in_ascii = 1; # default mode is ASCII.
141
142	6		6	39	no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.
	6			15
	6			2584
143
144	488			1414	while ( length $str ) {
145	31434	100		1007765	if ( $str =~ s/^([[:ascii:]]+)// ) {
		50
146	1480			3836	my $tmp = $1;
147	1480			3317	$tmp =~ s/~/~~/g; # escapes tildes
148	1480	100		3834	if ( !$in_ascii ) {
149	994			2195	$ret .= "\x7E\x7D"; # '~}'
150	994			2129	$in_ascii = 1;
151					}
152	1480			20267	$ret .= pack 'a*', $tmp; # remove UTF8 flag.
153					}
154					elsif ( $str =~ s/(.)// ) {
155	29954			73875	my $s = $1;
156	29954			117476	my $tmp = $GB->encode( $s, $chk );
157	29954	50		83985	last if !defined $tmp;
158	29954	50		71417	if ( length $tmp == 2 ) { # maybe a valid GB char (XXX)
		0
159	29954	100		72975	if ($in_ascii) {
160	996			2244	$ret .= "\x7E\x7B"; # '~{'
161	996			1934	$in_ascii = 0;
162					}
163	29954			492816	$ret .= $tmp;
164					}
165					elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX)
166	0	0		0	if ( !$in_ascii ) {
167	0			0	$ret .= "\x7E\x7D"; # '~}'
168	0			0	$in_ascii = 1;
169					}
170	0			0	$ret .= $tmp;
171					}
172					}
173					else { # if $str is malformed UTF8 and if length $str != 0.
174	0			0	last;
175					}
176					}
177	488	100		1584	$_[1] = $str if $chk;
178
179					# The state at the end of the chunk is discarded, even if in GB mode.
180					# That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
181					# Parhaps it is harmless, but further investigations may be required...
182
183	488	100		1178	if ( !$in_ascii ) {
184	2			6	$ret .= "\x7E\x7D"; # '~}'
185	2			5	$in_ascii = 1;
186					}
187	488			1199	utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
188	488			6811	return $ret;
189					}
190
191					1;
192					__END__