File Coverage

blib/lib/Encode/CN/HZ.pm

Criterion	Covered	Total	%
statement	70	112	62.5
branch	32	60	53.3
condition	2	2	100.0
subroutine	11	12	91.6
pod	4	4	100.0
total	119	190	62.6

line	stmt	bran	cond	sub	pod	time	code
1							package Encode::CN::HZ;
2
3	9			9		54	use strict;
	9					14
	9					226
4	9			9		48	use warnings;
	9					15
	9					239
5	9			9		2875	use utf8 ();
	9					100
	9					209
6
7	9			9		44	use vars qw($VERSION);
	9					17
	9					734
8							$VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
9
10	9			9		47	use Encode qw(:fallbacks);
	9					16
	9					1291
11
12	9			9		53	use parent qw(Encode::Encoding);
	9					14
	9					54
13							__PACKAGE__->Define('hz');
14
15							# HZ is a combination of ASCII and escaped GB, so we implement it
16							# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
17
18							# not ported for EBCDIC. Which should be used, "~" or "\x7E"?
19
20	4			4	1	40	sub needs_lines { 1 }
21
22							sub decode ($$;$) {
23	12			12	1	343	my ( $obj, $str, $chk ) = @_;
24	12	100				36	return undef unless defined $str;
25
26	11					30	my $GB = Encode::find_encoding('gb2312-raw');
27	11					31	my $ret = substr($str, 0, 0); # to propagate taintedness
28	11					18	my $in_ascii = 1; # default mode is ASCII.
29
30	11					32	while ( length $str ) {
31	3011	100				3810	if ($in_ascii) { # ASCII mode
32	1509	100				6547	if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII
		50
		50
		50
33	758					1874	$ret .= $1;
34
35							# EBCDIC should need ascii2native, but not ported.
36							}
37							elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde
38	0					0	$ret .= '~';
39							}
40							elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
41	0					0	1; # no-op
42							}
43							elsif ( $str =~ s/^\x7E\x7B// ) { # '~{'
44	751					1300	$in_ascii = 0; # to GB
45							}
46							else { # encounters an invalid escape, \x80 or greater
47	0					0	last;
48							}
49							}
50							else { # GB mode; the byte ranges are as in RFC 1843.
51	9			9		1985	no warnings 'uninitialized';
	9					16
	9					4842
52	1502	100				6492	if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
		50
53	751					1241	my $prefix = $1;
54	751					6018	$ret .= $GB->decode( $prefix, $chk );
55							}
56							elsif ( $str =~ s/^\x7E\x7D// ) { # '~}'
57	751					1276	$in_ascii = 1;
58							}
59							else { # invalid
60	0					0	last;
61							}
62							}
63							}
64	11	100				26	$_[1] = '' if $chk; # needs_lines guarantees no partial character
65	11					209	return $ret;
66							}
67
68							sub cat_decode {
69	0			0	1	0	my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
70	0					0	my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
71
72	0					0	my $GB = Encode::find_encoding('gb2312-raw');
73	0					0	my $ret = '';
74	0					0	my $in_ascii = 1; # default mode is ASCII.
75
76	0					0	my $ini_pos = pos($$rsrc);
77
78	0					0	substr( $src, 0, $pos ) = '';
79
80	0					0	my $ini_len = bytes::length($src);
81
82							# If $trm is the first tilde of an escaped pair '~~', the 2nd tilde must be removed.
83							# XXX: Would C<$src =~ s/^\x7E// or die if ...> be better?
84	0	0				0	$src =~ s/^\x7E// if $trm eq "\x7E";
85
86	0					0	while ( length $src ) {
87	0					0	my $now;
88	0	0				0	if ($in_ascii) { # ASCII mode
89	0	0				0	if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII
		0
		0
		0
90	0					0	$now = $1;
91							}
92							elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde
93	0					0	$now = '~';
94							}
95							elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
96	0					0	next;
97							}
98							elsif ( $src =~ s/^\x7E\x7B// ) { # '~{'
99	0					0	$in_ascii = 0; # to GB
100	0					0	next;
101							}
102							else { # encounters an invalid escape, \x80 or greater
103	0					0	last;
104							}
105							}
106							else { # GB mode; the byte ranges are as in RFC 1843.
107	0	0				0	if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
		0
108	0					0	$now = $GB->decode( $1, $chk );
109							}
110							elsif ( $src =~ s/^\x7E\x7D// ) { # '~}'
111	0					0	$in_ascii = 1;
112	0					0	next;
113							}
114							else { # invalid
115	0					0	last;
116							}
117							}
118
119	0	0				0	next if !defined $now;
120
121	0					0	$ret .= $now;
122
123	0	0				0	if ( $now eq $trm ) {
124	0					0	$$rdst .= $ret;
125	0					0	$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
126	0					0	pos($$rsrc) = $ini_pos;
127	0					0	return 1;
128							}
129							}
130
131	0					0	$$rdst .= $ret;
132	0					0	$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
133	0					0	pos($$rsrc) = $ini_pos;
134	0					0	return ''; # terminator not found
135							}
136
137							sub encode($$;$) {
138	491			491	1	2004	my ( $obj, $str, $chk ) = @_;
139	491	100				823	return undef unless defined $str;
140
141	490					927	my $GB = Encode::find_encoding('gb2312-raw');
142	490					1250	my $ret = substr($str, 0, 0); # to propagate taintedness;
143	490					602	my $in_ascii = 1; # default mode is ASCII.
144
145	9			9		68	no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.
	9					15
	9					2226
146
147	490					784	while ( length $str ) {
148	31438	100				815866	if ( $str =~ s/^([[:ascii:]]+)// ) {
		50
149	1482					2648	my $tmp = $1;
150	1482					2548	$tmp =~ s/~/~~/g; # escapes tildes
151	1482	100				2364	if ( !$in_ascii ) {
152	994					1503	$ret .= "\x7E\x7D"; # '~}'
153	994					1363	$in_ascii = 1;
154							}
155	1482					15407	$ret .= pack 'a*', $tmp; # remove UTF8 flag.
156							}
157							elsif ( $str =~ s/(.)// ) {
158	29956					51691	my $s = $1;
159	29956		100			101497	my $tmp = $GB->encode( $s, $chk || 0 );
160	29956	50				52740	last if !defined $tmp;
161	29956	100				42585	if ( length $tmp == 2 ) { # maybe a valid GB char (XXX)
		50
162	29954	100				41958	if ($in_ascii) {
163	996					1439	$ret .= "\x7E\x7B"; # '~{'
164	996					1322	$in_ascii = 0;
165							}
166	29954					389213	$ret .= $tmp;
167							}
168							elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX)
169	2	50				7	if ( !$in_ascii ) {
170	0					0	$ret .= "\x7E\x7D"; # '~}'
171	0					0	$in_ascii = 1;
172							}
173	2					8	$ret .= $tmp;
174							}
175							}
176							else { # if $str is malformed UTF8 and if length $str != 0.
177	0					0	last;
178							}
179							}
180	490	100				846	$_[1] = $str if $chk;
181
182							# The state at the end of the chunk is discarded, even if in GB mode.
183							# That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
184							# Perhaps it is harmless, but further investigation may be required...
185
186	490	100				688	if ( !$in_ascii ) {
187	2					2	$ret .= "\x7E\x7D"; # '~}'
188	2					4	$in_ascii = 1;
189							}
190	490					820	utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
191	490					4514	return $ret;
192							}
193
194							1;
195							__END__