File Coverage

blib/lib/Encode/ZapCP1252.pm

Criterion	Covered	Total	%
statement	51	52	98.0
branch	14	16	87.5
condition	3	3	100.0
subroutine	9	9	100.0
pod	0	2	0.0
total	77	82	93.9

line	stmt	bran	cond	sub	pod	time	code
1							package Encode::ZapCP1252;
2
3	3			3		205428	use strict;
	3					23
	3					125
4							require Exporter;
5	3			3		19	use vars qw($VERSION @ISA @EXPORT);
	3					6
	3					187
6	3			3		79	use 5.006_002;
	3					10
7
8							$VERSION = '0.40';
9							@ISA = qw(Exporter);
10							@EXPORT = qw(zap_cp1252 fix_cp1252);
11	3			3		28	use constant PERL588 => $] >= 5.008_008;
	3					6
	3					326
12	3			3		1757	use Encode ();
	3					31001
	3					1413
13
14							our %ascii_for = (
15							# https://en.wikipedia.org/wiki/Windows-1252
16							"\x80" => 'e', # EURO SIGN
17							"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
18							"\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK
19							"\x84" => ',,', # DOUBLE LOW-9 QUOTATION MARK
20							"\x85" => '...', # HORIZONTAL ELLIPSIS
21							"\x86" => '+', # DAGGER
22							"\x87" => '++', # DOUBLE DAGGER
23							"\x88" => '^', # MODIFIER LETTER CIRCUMFLEX ACCENT
24							"\x89" => '%', # PER MILLE SIGN
25							"\x8a" => 'S', # LATIN CAPITAL LETTER S WITH CARON
26							"\x8b" => '<', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
27							"\x8c" => 'OE', # LATIN CAPITAL LIGATURE OE
28							"\x8e" => 'Z', # LATIN CAPITAL LETTER Z WITH CARON
29							"\x91" => "'", # LEFT SINGLE QUOTATION MARK
30							"\x92" => "'", # RIGHT SINGLE QUOTATION MARK
31							"\x93" => '"', # LEFT DOUBLE QUOTATION MARK
32							"\x94" => '"', # RIGHT DOUBLE QUOTATION MARK
33							"\x95" => '*', # BULLET
34							"\x96" => '-', # EN DASH
35							"\x97" => '--', # EM DASH
36							"\x98" => '~', # SMALL TILDE
37							"\x99" => '(tm)', # TRADE MARK SIGN
38							"\x9a" => 's', # LATIN SMALL LETTER S WITH CARON
39							"\x9b" => '>', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
40							"\x9c" => 'oe', # LATIN SMALL LIGATURE OE
41							"\x9e" => 'z', # LATIN SMALL LETTER Z WITH CARON
42							"\x9f" => 'Y', # LATIN CAPITAL LETTER Y WITH DIAERESIS
43							);
44
45							our %utf8_for = (
46							# https://en.wikipedia.org/wiki/Windows-1252
47							"\x80" => '€', # EURO SIGN
48							"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
49							"\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
50							"\x84" => '„', # DOUBLE LOW-9 QUOTATION MARK
51							"\x85" => '…', # HORIZONTAL ELLIPSIS
52							"\x86" => '†', # DAGGER
53							"\x87" => '‡', # DOUBLE DAGGER
54							"\x88" => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT
55							"\x89" => '‰', # PER MILLE SIGN
56							"\x8a" => 'Š', # LATIN CAPITAL LETTER S WITH CARON
57							"\x8b" => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
58							"\x8c" => 'Œ', # LATIN CAPITAL LIGATURE OE
59							"\x8e" => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON
60							"\x91" => '‘', # LEFT SINGLE QUOTATION MARK
61							"\x92" => '’', # RIGHT SINGLE QUOTATION MARK
62							"\x93" => '“', # LEFT DOUBLE QUOTATION MARK
63							"\x94" => '”', # RIGHT DOUBLE QUOTATION MARK
64							"\x95" => '•', # BULLET
65							"\x96" => '–', # EN DASH
66							"\x97" => '—', # EM DASH
67							"\x98" => '˜', # SMALL TILDE
68							"\x99" => '™', # TRADE MARK SIGN
69							"\x9a" => 'š', # LATIN SMALL LETTER S WITH CARON
70							"\x9b" => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
71							"\x9c" => 'œ', # LATIN SMALL LIGATURE OE
72							"\x9e" => 'ž', # LATIN SMALL LETTER Z WITH CARON
73							"\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
74							);
75
76							my @utf8_skip = (
77							# This translates a utf-8-encoded byte into how many bytes the full utf8
78							# character occupies. Illegal start bytes have a negative count.
79
80							# UTF-8 is a variable-length encoding. The 128 ASCII characters were very
81							# deliberately set to be themselves, so UTF-8 would be backwards compatible
82							# with 7-bit applications. Every other character has 2 - 13 bytes comprising
83							# it.
84							#
85							# If the first bit of the first byte in a character is 0, it is one of those
86							# 128 ASCII characters with length 1.
87
88							# Otherwise, the first bit is 1, and if the second bit is also one, this byte
89							# starts the sequence of bytes that represent the character. The bytes C0-FF
90							# have the characteristic that the first two bits are both one. The number of
91							# bytes that form a character corresponds to the number of consecutive leading
92							# bits that are all one in the start byte. In the case of FE, the first 7
93							# bits are one, so the number of bytes in the character it represents is 7.
94							# FF is a special case, and Perl has arbitrarily set it to 13 instead of the
95							# expected 8.
96							#
97							# The remaining bytes begin with '10', from 80..9F. They are called
98							# continuation bytes, and a UTF-8 character is comprised of a start byte
99							# indicating 'n' bytes total in it, then 'n-1' of these continuation bytes.
100							# What the character is that each sequence represents is derived by shifting
101							# and adding the other bits in the bytes. (C0 and C1 aren't actually legal
102							# start bytes for security reasons that need not concern us here, hence are
103							# marked as negative in the table below.)
104
105							# 0 1 2 3 4 5 6 7 8 9 A B C D E F
106							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0
107							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1
108							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2
109							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3
110							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4
111							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5
112							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6
113							1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7
114							-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8
115							-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9
116							-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A
117							-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B
118							-1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C
119							2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D
120							3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E
121							4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F
122							);
123
124							BEGIN {
125	3	50		3		21	my $proto = $] >= 5.010000 ? '_' : '$';
126	3			10	0	219	eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }";
	10					3300
	10					34
127	3			11	0	1494	eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }";
	11					4984
	11					35
128							}
129
130							# These are the bytes that CP1252 redefines
131							my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/;
132
133							sub _tweakit {
134	21			21		45	my $table = shift;
135	21	100				64	return unless defined $_[0];
136	19	100				57	local $_[0] = $_[0] if defined wantarray;
137	19					57	my $is_utf8 = PERL588 && Encode::is_utf8($_[0]);
138	19		100			66	my $valid_utf8 = $is_utf8 && utf8::valid($_[0]);
139	19	100				51	if (!$is_utf8) {
		100
140
141							# Here is non-UTF-8. Change the 1252 characters to their UTF-8
142							# counterparts. These bytes are very rarely used in real world
143							# applications, so their presence likely indicates that CP1252 was
144							# meant.
145	12					124	$_[0] =~ s/($cp1252_re)/$table->{$1}/gems;
	272					647
146							} elsif ($valid_utf8) {
147
148							# Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on
149							# and the string is held as bytes. Change the 1252 characters to their
150							# Unicode counterparts.
151	5					50	$_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems;
	81					1324
152							} else { # Invalid UTF-8. Look for single-byte CP1252 gremlins
153
154							# Turn off the UTF-8 flag so that we can go through the string
155							# byte-by-byte.
156	2					7	Encode::_utf8_off($_[0]);
157
158	2					3	my $i = 0;
159	2					5	my $length = length $_[0];
160	2					3	my $fixed = ""; # The input after being fixed up by this loop
161	2					6	while ($i < $length) {
162
163							# Each time through the loop, we should here be ready to look at a
164							# new character, and it's 0th byte is called a 'start byte'
165	106					151	my $start_byte = substr($_[0], $i, 1);
166	106					142	my $skip = $utf8_skip[ord $start_byte];
167
168							# The table is set up so that legal UTF-8 start bytes have a
169							# positive byte length. Simply add all the bytes in the character
170							# to the output, and go on to handle the next character in the
171							# next loop iteration.
172	106	100				166	if ($skip > 0) {
173	52					73	$fixed .= substr($_[0], $i, $skip);
174	52					62	$i += $skip;
175	52					85	next;
176							}
177
178							# Here we have a byte that isn't a start byte in a position that
179							# should oughta be a start byte. The whole point of this loop is
180							# to find such bytes that are CP1252 ones and which were
181							# incorrectly inserted by the upstream process into an otherwise
182							# valid UTF-8 string. So, if we have such a one, change it into
183							# its corresponding correct character.
184	54	50				192	if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) {
	54					152
185
186							# The correct character may be UTF-8 bytes. We treat them as
187							# just a sequence of non-UTF-8 bytes, because that's what
188							# $fixed has in it so far. After everything is consistently
189							# added, we turn the UTF-8 flag back on before returning at
190							# the end.
191	54					114	Encode::_utf8_off($start_byte);
192	54					78	$fixed .= $start_byte;
193	54					64	$i++;
194	54					113	next;
195							}
196
197							# Here the byte isn't a CP1252 one.
198	0					0	die "Unexpected continuation byte: %02x", ord $start_byte;
199							}
200
201							# $fixed now has everything properly in it, but set to return it in
202							# $_[0], marked as UTF-8.
203	2					5	$_[0] = $fixed;
204	2					5	Encode::_utf8_on($_[0]);
205							}
206	19	100				175	return $_[0] if defined wantarray;
207							}
208
209							1;
210							__END__