File Coverage

blib/lib/Encode/ZapCP1252.pm
Criterion Covered Total %
statement 51 52 98.0
branch 14 16 87.5
condition 3 3 100.0
subroutine 9 9 100.0
pod 0 2 0.0
total 77 82 93.9


line stmt bran cond sub pod time code
1             package Encode::ZapCP1252;
2              
3 3     3   205428 use strict;
  3         23  
  3         125  
4             require Exporter;
5 3     3   19 use vars qw($VERSION @ISA @EXPORT);
  3         6  
  3         187  
6 3     3   79 use 5.006_002;
  3         10  
7              
8             $VERSION = '0.40';
9             @ISA = qw(Exporter);
10             @EXPORT = qw(zap_cp1252 fix_cp1252);
11 3     3   28 use constant PERL588 => $] >= 5.008_008;
  3         6  
  3         326  
12 3     3   1757 use Encode ();
  3         31001  
  3         1413  
13              
14             our %ascii_for = (
15             # https://en.wikipedia.org/wiki/Windows-1252
16             "\x80" => 'e', # EURO SIGN
17             "\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
18             "\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK
19             "\x84" => ',,', # DOUBLE LOW-9 QUOTATION MARK
20             "\x85" => '...', # HORIZONTAL ELLIPSIS
21             "\x86" => '+', # DAGGER
22             "\x87" => '++', # DOUBLE DAGGER
23             "\x88" => '^', # MODIFIER LETTER CIRCUMFLEX ACCENT
24             "\x89" => '%', # PER MILLE SIGN
25             "\x8a" => 'S', # LATIN CAPITAL LETTER S WITH CARON
26             "\x8b" => '<', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
27             "\x8c" => 'OE', # LATIN CAPITAL LIGATURE OE
28             "\x8e" => 'Z', # LATIN CAPITAL LETTER Z WITH CARON
29             "\x91" => "'", # LEFT SINGLE QUOTATION MARK
30             "\x92" => "'", # RIGHT SINGLE QUOTATION MARK
31             "\x93" => '"', # LEFT DOUBLE QUOTATION MARK
32             "\x94" => '"', # RIGHT DOUBLE QUOTATION MARK
33             "\x95" => '*', # BULLET
34             "\x96" => '-', # EN DASH
35             "\x97" => '--', # EM DASH
36             "\x98" => '~', # SMALL TILDE
37             "\x99" => '(tm)', # TRADE MARK SIGN
38             "\x9a" => 's', # LATIN SMALL LETTER S WITH CARON
39             "\x9b" => '>', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
40             "\x9c" => 'oe', # LATIN SMALL LIGATURE OE
41             "\x9e" => 'z', # LATIN SMALL LETTER Z WITH CARON
42             "\x9f" => 'Y', # LATIN CAPITAL LETTER Y WITH DIAERESIS
43             );
44              
45             our %utf8_for = (
46             # https://en.wikipedia.org/wiki/Windows-1252
47             "\x80" => '€', # EURO SIGN
48             "\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
49             "\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
50             "\x84" => '„', # DOUBLE LOW-9 QUOTATION MARK
51             "\x85" => '…', # HORIZONTAL ELLIPSIS
52             "\x86" => '†', # DAGGER
53             "\x87" => '‡', # DOUBLE DAGGER
54             "\x88" => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT
55             "\x89" => '‰', # PER MILLE SIGN
56             "\x8a" => 'Š', # LATIN CAPITAL LETTER S WITH CARON
57             "\x8b" => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
58             "\x8c" => 'Œ', # LATIN CAPITAL LIGATURE OE
59             "\x8e" => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON
60             "\x91" => '‘', # LEFT SINGLE QUOTATION MARK
61             "\x92" => '’', # RIGHT SINGLE QUOTATION MARK
62             "\x93" => '“', # LEFT DOUBLE QUOTATION MARK
63             "\x94" => '”', # RIGHT DOUBLE QUOTATION MARK
64             "\x95" => '•', # BULLET
65             "\x96" => '–', # EN DASH
66             "\x97" => '—', # EM DASH
67             "\x98" => '˜', # SMALL TILDE
68             "\x99" => '™', # TRADE MARK SIGN
69             "\x9a" => 'š', # LATIN SMALL LETTER S WITH CARON
70             "\x9b" => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
71             "\x9c" => 'œ', # LATIN SMALL LIGATURE OE
72             "\x9e" => 'ž', # LATIN SMALL LETTER Z WITH CARON
73             "\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
74             );
75              
76             my @utf8_skip = (
77             # This translates a utf-8-encoded byte into how many bytes the full utf8
78             # character occupies. Illegal start bytes have a negative count.
79              
80             # UTF-8 is a variable-length encoding. The 128 ASCII characters were very
81             # deliberately set to be themselves, so UTF-8 would be backwards compatible
82             # with 7-bit applications. Every other character has 2 - 13 bytes comprising
83             # it.
84             #
85             # If the first bit of the first byte in a character is 0, it is one of those
86             # 128 ASCII characters with length 1.
87              
88             # Otherwise, the first bit is 1, and if the second bit is also one, this byte
89             # starts the sequence of bytes that represent the character. The bytes C0-FF
90             # have the characteristic that the first two bits are both one. The number of
91             # bytes that form a character corresponds to the number of consecutive leading
92             # bits that are all one in the start byte. In the case of FE, the first 7
93             # bits are one, so the number of bytes in the character it represents is 7.
94             # FF is a special case, and Perl has arbitrarily set it to 13 instead of the
95             # expected 8.
96             #
97             # The remaining bytes begin with '10', from 80..9F. They are called
98             # continuation bytes, and a UTF-8 character is comprised of a start byte
99             # indicating 'n' bytes total in it, then 'n-1' of these continuation bytes.
100             # What the character is that each sequence represents is derived by shifting
101             # and adding the other bits in the bytes. (C0 and C1 aren't actually legal
102             # start bytes for security reasons that need not concern us here, hence are
103             # marked as negative in the table below.)
104              
105             # 0 1 2 3 4 5 6 7 8 9 A B C D E F
106             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0
107             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1
108             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2
109             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3
110             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4
111             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5
112             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6
113             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7
114             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8
115             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9
116             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A
117             -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B
118             -1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C
119             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D
120             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E
121             4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F
122             );
123              
124             BEGIN {
125 3 50   3   21 my $proto = $] >= 5.010000 ? '_' : '$';
126 3     10 0 219 eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }";
  10         3300  
  10         34  
127 3     11 0 1494 eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }";
  11         4984  
  11         35  
128             }
129              
130             # These are the bytes that CP1252 redefines
131             my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/;
132              
133             sub _tweakit {
134 21     21   45 my $table = shift;
135 21 100       64 return unless defined $_[0];
136 19 100       57 local $_[0] = $_[0] if defined wantarray;
137 19         57 my $is_utf8 = PERL588 && Encode::is_utf8($_[0]);
138 19   100     66 my $valid_utf8 = $is_utf8 && utf8::valid($_[0]);
139 19 100       51 if (!$is_utf8) {
    100          
140              
141             # Here is non-UTF-8. Change the 1252 characters to their UTF-8
142             # counterparts. These bytes are very rarely used in real world
143             # applications, so their presence likely indicates that CP1252 was
144             # meant.
145 12         124 $_[0] =~ s/($cp1252_re)/$table->{$1}/gems;
  272         647  
146             } elsif ($valid_utf8) {
147              
148             # Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on
149             # and the string is held as bytes. Change the 1252 characters to their
150             # Unicode counterparts.
151 5         50 $_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems;
  81         1324  
152             } else { # Invalid UTF-8. Look for single-byte CP1252 gremlins
153              
154             # Turn off the UTF-8 flag so that we can go through the string
155             # byte-by-byte.
156 2         7 Encode::_utf8_off($_[0]);
157              
158 2         3 my $i = 0;
159 2         5 my $length = length $_[0];
160 2         3 my $fixed = ""; # The input after being fixed up by this loop
161 2         6 while ($i < $length) {
162              
163             # Each time through the loop, we should here be ready to look at a
164             # new character, and it's 0th byte is called a 'start byte'
165 106         151 my $start_byte = substr($_[0], $i, 1);
166 106         142 my $skip = $utf8_skip[ord $start_byte];
167              
168             # The table is set up so that legal UTF-8 start bytes have a
169             # positive byte length. Simply add all the bytes in the character
170             # to the output, and go on to handle the next character in the
171             # next loop iteration.
172 106 100       166 if ($skip > 0) {
173 52         73 $fixed .= substr($_[0], $i, $skip);
174 52         62 $i += $skip;
175 52         85 next;
176             }
177              
178             # Here we have a byte that isn't a start byte in a position that
179             # should oughta be a start byte. The whole point of this loop is
180             # to find such bytes that are CP1252 ones and which were
181             # incorrectly inserted by the upstream process into an otherwise
182             # valid UTF-8 string. So, if we have such a one, change it into
183             # its corresponding correct character.
184 54 50       192 if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) {
  54         152  
185              
186             # The correct character may be UTF-8 bytes. We treat them as
187             # just a sequence of non-UTF-8 bytes, because that's what
188             # $fixed has in it so far. After everything is consistently
189             # added, we turn the UTF-8 flag back on before returning at
190             # the end.
191 54         114 Encode::_utf8_off($start_byte);
192 54         78 $fixed .= $start_byte;
193 54         64 $i++;
194 54         113 next;
195             }
196              
197             # Here the byte isn't a CP1252 one.
198 0         0 die "Unexpected continuation byte: %02x", ord $start_byte;
199             }
200              
201             # $fixed now has everything properly in it, but set to return it in
202             # $_[0], marked as UTF-8.
203 2         5 $_[0] = $fixed;
204 2         5 Encode::_utf8_on($_[0]);
205             }
206 19 100       175 return $_[0] if defined wantarray;
207             }
208              
209             1;
210             __END__