line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Encode::ZapCP1252; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
205428
|
use strict; |
|
3
|
|
|
|
|
23
|
|
|
3
|
|
|
|
|
125
|
|
4
|
|
|
|
|
|
|
require Exporter; |
5
|
3
|
|
|
3
|
|
19
|
use vars qw($VERSION @ISA @EXPORT); |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
187
|
|
6
|
3
|
|
|
3
|
|
79
|
use 5.006_002; |
|
3
|
|
|
|
|
10
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
$VERSION = '0.40'; |
9
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
10
|
|
|
|
|
|
|
@EXPORT = qw(zap_cp1252 fix_cp1252); |
11
|
3
|
|
|
3
|
|
28
|
use constant PERL588 => $] >= 5.008_008; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
326
|
|
12
|
3
|
|
|
3
|
|
1757
|
use Encode (); |
|
3
|
|
|
|
|
31001
|
|
|
3
|
|
|
|
|
1413
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our %ascii_for = ( |
15
|
|
|
|
|
|
|
# https://en.wikipedia.org/wiki/Windows-1252 |
16
|
|
|
|
|
|
|
"\x80" => 'e', # EURO SIGN |
17
|
|
|
|
|
|
|
"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK |
18
|
|
|
|
|
|
|
"\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK |
19
|
|
|
|
|
|
|
"\x84" => ',,', # DOUBLE LOW-9 QUOTATION MARK |
20
|
|
|
|
|
|
|
"\x85" => '...', # HORIZONTAL ELLIPSIS |
21
|
|
|
|
|
|
|
"\x86" => '+', # DAGGER |
22
|
|
|
|
|
|
|
"\x87" => '++', # DOUBLE DAGGER |
23
|
|
|
|
|
|
|
"\x88" => '^', # MODIFIER LETTER CIRCUMFLEX ACCENT |
24
|
|
|
|
|
|
|
"\x89" => '%', # PER MILLE SIGN |
25
|
|
|
|
|
|
|
"\x8a" => 'S', # LATIN CAPITAL LETTER S WITH CARON |
26
|
|
|
|
|
|
|
"\x8b" => '<', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
27
|
|
|
|
|
|
|
"\x8c" => 'OE', # LATIN CAPITAL LIGATURE OE |
28
|
|
|
|
|
|
|
"\x8e" => 'Z', # LATIN CAPITAL LETTER Z WITH CARON |
29
|
|
|
|
|
|
|
"\x91" => "'", # LEFT SINGLE QUOTATION MARK |
30
|
|
|
|
|
|
|
"\x92" => "'", # RIGHT SINGLE QUOTATION MARK |
31
|
|
|
|
|
|
|
"\x93" => '"', # LEFT DOUBLE QUOTATION MARK |
32
|
|
|
|
|
|
|
"\x94" => '"', # RIGHT DOUBLE QUOTATION MARK |
33
|
|
|
|
|
|
|
"\x95" => '*', # BULLET |
34
|
|
|
|
|
|
|
"\x96" => '-', # EN DASH |
35
|
|
|
|
|
|
|
"\x97" => '--', # EM DASH |
36
|
|
|
|
|
|
|
"\x98" => '~', # SMALL TILDE |
37
|
|
|
|
|
|
|
"\x99" => '(tm)', # TRADE MARK SIGN |
38
|
|
|
|
|
|
|
"\x9a" => 's', # LATIN SMALL LETTER S WITH CARON |
39
|
|
|
|
|
|
|
"\x9b" => '>', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
40
|
|
|
|
|
|
|
"\x9c" => 'oe', # LATIN SMALL LIGATURE OE |
41
|
|
|
|
|
|
|
"\x9e" => 'z', # LATIN SMALL LETTER Z WITH CARON |
42
|
|
|
|
|
|
|
"\x9f" => 'Y', # LATIN CAPITAL LETTER Y WITH DIAERESIS |
43
|
|
|
|
|
|
|
); |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
our %utf8_for = ( |
46
|
|
|
|
|
|
|
# https://en.wikipedia.org/wiki/Windows-1252 |
47
|
|
|
|
|
|
|
"\x80" => '€', # EURO SIGN |
48
|
|
|
|
|
|
|
"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK |
49
|
|
|
|
|
|
|
"\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK |
50
|
|
|
|
|
|
|
"\x84" => '„', # DOUBLE LOW-9 QUOTATION MARK |
51
|
|
|
|
|
|
|
"\x85" => '…', # HORIZONTAL ELLIPSIS |
52
|
|
|
|
|
|
|
"\x86" => '†', # DAGGER |
53
|
|
|
|
|
|
|
"\x87" => '‡', # DOUBLE DAGGER |
54
|
|
|
|
|
|
|
"\x88" => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT |
55
|
|
|
|
|
|
|
"\x89" => '‰', # PER MILLE SIGN |
56
|
|
|
|
|
|
|
"\x8a" => 'Š', # LATIN CAPITAL LETTER S WITH CARON |
57
|
|
|
|
|
|
|
"\x8b" => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
58
|
|
|
|
|
|
|
"\x8c" => 'Œ', # LATIN CAPITAL LIGATURE OE |
59
|
|
|
|
|
|
|
"\x8e" => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON |
60
|
|
|
|
|
|
|
"\x91" => '‘', # LEFT SINGLE QUOTATION MARK |
61
|
|
|
|
|
|
|
"\x92" => '’', # RIGHT SINGLE QUOTATION MARK |
62
|
|
|
|
|
|
|
"\x93" => '“', # LEFT DOUBLE QUOTATION MARK |
63
|
|
|
|
|
|
|
"\x94" => '”', # RIGHT DOUBLE QUOTATION MARK |
64
|
|
|
|
|
|
|
"\x95" => '•', # BULLET |
65
|
|
|
|
|
|
|
"\x96" => '–', # EN DASH |
66
|
|
|
|
|
|
|
"\x97" => '—', # EM DASH |
67
|
|
|
|
|
|
|
"\x98" => '˜', # SMALL TILDE |
68
|
|
|
|
|
|
|
"\x99" => '™', # TRADE MARK SIGN |
69
|
|
|
|
|
|
|
"\x9a" => 'š', # LATIN SMALL LETTER S WITH CARON |
70
|
|
|
|
|
|
|
"\x9b" => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
71
|
|
|
|
|
|
|
"\x9c" => 'œ', # LATIN SMALL LIGATURE OE |
72
|
|
|
|
|
|
|
"\x9e" => 'ž', # LATIN SMALL LETTER Z WITH CARON |
73
|
|
|
|
|
|
|
"\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS |
74
|
|
|
|
|
|
|
); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
my @utf8_skip = ( |
77
|
|
|
|
|
|
|
# This translates a utf-8-encoded byte into how many bytes the full utf8 |
78
|
|
|
|
|
|
|
# character occupies. Illegal start bytes have a negative count. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# UTF-8 is a variable-length encoding. The 128 ASCII characters were very |
81
|
|
|
|
|
|
|
# deliberately set to be themselves, so UTF-8 would be backwards compatible |
82
|
|
|
|
|
|
|
# with 7-bit applications. Every other character has 2 - 13 bytes comprising |
83
|
|
|
|
|
|
|
# it. |
84
|
|
|
|
|
|
|
# |
85
|
|
|
|
|
|
|
# If the first bit of the first byte in a character is 0, it is one of those |
86
|
|
|
|
|
|
|
# 128 ASCII characters with length 1. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Otherwise, the first bit is 1, and if the second bit is also one, this byte |
89
|
|
|
|
|
|
|
# starts the sequence of bytes that represent the character. The bytes C0-FF |
90
|
|
|
|
|
|
|
# have the characteristic that the first two bits are both one. The number of |
91
|
|
|
|
|
|
|
# bytes that form a character corresponds to the number of consecutive leading |
92
|
|
|
|
|
|
|
# bits that are all one in the start byte. In the case of FE, the first 7 |
93
|
|
|
|
|
|
|
# bits are one, so the number of bytes in the character it represents is 7. |
94
|
|
|
|
|
|
|
# FF is a special case, and Perl has arbitrarily set it to 13 instead of the |
95
|
|
|
|
|
|
|
# expected 8. |
96
|
|
|
|
|
|
|
# |
97
|
|
|
|
|
|
|
# The remaining bytes begin with '10', from 80..9F. They are called |
98
|
|
|
|
|
|
|
# continuation bytes, and a UTF-8 character is comprised of a start byte |
99
|
|
|
|
|
|
|
# indicating 'n' bytes total in it, then 'n-1' of these continuation bytes. |
100
|
|
|
|
|
|
|
# What the character is that each sequence represents is derived by shifting |
101
|
|
|
|
|
|
|
# and adding the other bits in the bytes. (C0 and C1 aren't actually legal |
102
|
|
|
|
|
|
|
# start bytes for security reasons that need not concern us here, hence are |
103
|
|
|
|
|
|
|
# marked as negative in the table below.) |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# 0 1 2 3 4 5 6 7 8 9 A B C D E F |
106
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0 |
107
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1 |
108
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2 |
109
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3 |
110
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4 |
111
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5 |
112
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6 |
113
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7 |
114
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8 |
115
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9 |
116
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A |
117
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B |
118
|
|
|
|
|
|
|
-1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C |
119
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D |
120
|
|
|
|
|
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E |
121
|
|
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F |
122
|
|
|
|
|
|
|
); |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
BEGIN { |
125
|
3
|
50
|
|
3
|
|
21
|
my $proto = $] >= 5.010000 ? '_' : '$'; |
126
|
3
|
|
|
10
|
0
|
219
|
eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }"; |
|
10
|
|
|
|
|
3300
|
|
|
10
|
|
|
|
|
34
|
|
127
|
3
|
|
|
11
|
0
|
1494
|
eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }"; |
|
11
|
|
|
|
|
4984
|
|
|
11
|
|
|
|
|
35
|
|
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# These are the bytes that CP1252 redefines |
131
|
|
|
|
|
|
|
my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/; |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _tweakit { |
134
|
21
|
|
|
21
|
|
45
|
my $table = shift; |
135
|
21
|
100
|
|
|
|
64
|
return unless defined $_[0]; |
136
|
19
|
100
|
|
|
|
57
|
local $_[0] = $_[0] if defined wantarray; |
137
|
19
|
|
|
|
|
57
|
my $is_utf8 = PERL588 && Encode::is_utf8($_[0]); |
138
|
19
|
|
100
|
|
|
66
|
my $valid_utf8 = $is_utf8 && utf8::valid($_[0]); |
139
|
19
|
100
|
|
|
|
51
|
if (!$is_utf8) { |
|
|
100
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Here is non-UTF-8. Change the 1252 characters to their UTF-8 |
142
|
|
|
|
|
|
|
# counterparts. These bytes are very rarely used in real world |
143
|
|
|
|
|
|
|
# applications, so their presence likely indicates that CP1252 was |
144
|
|
|
|
|
|
|
# meant. |
145
|
12
|
|
|
|
|
124
|
$_[0] =~ s/($cp1252_re)/$table->{$1}/gems; |
|
272
|
|
|
|
|
647
|
|
146
|
|
|
|
|
|
|
} elsif ($valid_utf8) { |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on |
149
|
|
|
|
|
|
|
# and the string is held as bytes. Change the 1252 characters to their |
150
|
|
|
|
|
|
|
# Unicode counterparts. |
151
|
5
|
|
|
|
|
50
|
$_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems; |
|
81
|
|
|
|
|
1324
|
|
152
|
|
|
|
|
|
|
} else { # Invalid UTF-8. Look for single-byte CP1252 gremlins |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Turn off the UTF-8 flag so that we can go through the string |
155
|
|
|
|
|
|
|
# byte-by-byte. |
156
|
2
|
|
|
|
|
7
|
Encode::_utf8_off($_[0]); |
157
|
|
|
|
|
|
|
|
158
|
2
|
|
|
|
|
3
|
my $i = 0; |
159
|
2
|
|
|
|
|
5
|
my $length = length $_[0]; |
160
|
2
|
|
|
|
|
3
|
my $fixed = ""; # The input after being fixed up by this loop |
161
|
2
|
|
|
|
|
6
|
while ($i < $length) { |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# Each time through the loop, we should here be ready to look at a |
164
|
|
|
|
|
|
|
# new character, and it's 0th byte is called a 'start byte' |
165
|
106
|
|
|
|
|
151
|
my $start_byte = substr($_[0], $i, 1); |
166
|
106
|
|
|
|
|
142
|
my $skip = $utf8_skip[ord $start_byte]; |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# The table is set up so that legal UTF-8 start bytes have a |
169
|
|
|
|
|
|
|
# positive byte length. Simply add all the bytes in the character |
170
|
|
|
|
|
|
|
# to the output, and go on to handle the next character in the |
171
|
|
|
|
|
|
|
# next loop iteration. |
172
|
106
|
100
|
|
|
|
166
|
if ($skip > 0) { |
173
|
52
|
|
|
|
|
73
|
$fixed .= substr($_[0], $i, $skip); |
174
|
52
|
|
|
|
|
62
|
$i += $skip; |
175
|
52
|
|
|
|
|
85
|
next; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Here we have a byte that isn't a start byte in a position that |
179
|
|
|
|
|
|
|
# should oughta be a start byte. The whole point of this loop is |
180
|
|
|
|
|
|
|
# to find such bytes that are CP1252 ones and which were |
181
|
|
|
|
|
|
|
# incorrectly inserted by the upstream process into an otherwise |
182
|
|
|
|
|
|
|
# valid UTF-8 string. So, if we have such a one, change it into |
183
|
|
|
|
|
|
|
# its corresponding correct character. |
184
|
54
|
50
|
|
|
|
192
|
if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) { |
|
54
|
|
|
|
|
152
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# The correct character may be UTF-8 bytes. We treat them as |
187
|
|
|
|
|
|
|
# just a sequence of non-UTF-8 bytes, because that's what |
188
|
|
|
|
|
|
|
# $fixed has in it so far. After everything is consistently |
189
|
|
|
|
|
|
|
# added, we turn the UTF-8 flag back on before returning at |
190
|
|
|
|
|
|
|
# the end. |
191
|
54
|
|
|
|
|
114
|
Encode::_utf8_off($start_byte); |
192
|
54
|
|
|
|
|
78
|
$fixed .= $start_byte; |
193
|
54
|
|
|
|
|
64
|
$i++; |
194
|
54
|
|
|
|
|
113
|
next; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# Here the byte isn't a CP1252 one. |
198
|
0
|
|
|
|
|
0
|
die "Unexpected continuation byte: %02x", ord $start_byte; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# $fixed now has everything properly in it, but set to return it in |
202
|
|
|
|
|
|
|
# $_[0], marked as UTF-8. |
203
|
2
|
|
|
|
|
5
|
$_[0] = $fixed; |
204
|
2
|
|
|
|
|
5
|
Encode::_utf8_on($_[0]); |
205
|
|
|
|
|
|
|
} |
206
|
19
|
100
|
|
|
|
175
|
return $_[0] if defined wantarray; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
1; |
210
|
|
|
|
|
|
|
__END__ |