| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Encode::ZapCP1252; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 3 |  |  | 3 |  | 205428 | use strict; | 
|  | 3 |  |  |  |  | 23 |  | 
|  | 3 |  |  |  |  | 125 |  | 
| 4 |  |  |  |  |  |  | require Exporter; | 
| 5 | 3 |  |  | 3 |  | 19 | use vars qw($VERSION @ISA @EXPORT); | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 187 |  | 
| 6 | 3 |  |  | 3 |  | 79 | use 5.006_002; | 
|  | 3 |  |  |  |  | 10 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | $VERSION = '0.40'; | 
| 9 |  |  |  |  |  |  | @ISA     = qw(Exporter); | 
| 10 |  |  |  |  |  |  | @EXPORT  = qw(zap_cp1252 fix_cp1252); | 
| 11 | 3 |  |  | 3 |  | 28 | use constant PERL588 => $] >= 5.008_008; | 
|  | 3 |  |  |  |  | 6 |  | 
|  | 3 |  |  |  |  | 326 |  | 
| 12 | 3 |  |  | 3 |  | 1757 | use Encode (); | 
|  | 3 |  |  |  |  | 31001 |  | 
|  | 3 |  |  |  |  | 1413 |  | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | our %ascii_for = ( | 
| 15 |  |  |  |  |  |  | # https://en.wikipedia.org/wiki/Windows-1252 | 
| 16 |  |  |  |  |  |  | "\x80" => 'e',    # EURO SIGN | 
| 17 |  |  |  |  |  |  | "\x82" => ',',    # SINGLE LOW-9 QUOTATION MARK | 
| 18 |  |  |  |  |  |  | "\x83" => 'f',    # LATIN SMALL LETTER F WITH HOOK | 
| 19 |  |  |  |  |  |  | "\x84" => ',,',   # DOUBLE LOW-9 QUOTATION MARK | 
| 20 |  |  |  |  |  |  | "\x85" => '...',  # HORIZONTAL ELLIPSIS | 
| 21 |  |  |  |  |  |  | "\x86" => '+',    # DAGGER | 
| 22 |  |  |  |  |  |  | "\x87" => '++',   # DOUBLE DAGGER | 
| 23 |  |  |  |  |  |  | "\x88" => '^',    # MODIFIER LETTER CIRCUMFLEX ACCENT | 
| 24 |  |  |  |  |  |  | "\x89" => '%',    # PER MILLE SIGN | 
| 25 |  |  |  |  |  |  | "\x8a" => 'S',    # LATIN CAPITAL LETTER S WITH CARON | 
| 26 |  |  |  |  |  |  | "\x8b" => '<',    # SINGLE LEFT-POINTING ANGLE QUOTATION MARK | 
| 27 |  |  |  |  |  |  | "\x8c" => 'OE',   # LATIN CAPITAL LIGATURE OE | 
| 28 |  |  |  |  |  |  | "\x8e" => 'Z',    # LATIN CAPITAL LETTER Z WITH CARON | 
| 29 |  |  |  |  |  |  | "\x91" => "'",    # LEFT SINGLE QUOTATION MARK | 
| 30 |  |  |  |  |  |  | "\x92" => "'",    # RIGHT SINGLE QUOTATION MARK | 
| 31 |  |  |  |  |  |  | "\x93" => '"',    # LEFT DOUBLE QUOTATION MARK | 
| 32 |  |  |  |  |  |  | "\x94" => '"',    # RIGHT DOUBLE QUOTATION MARK | 
| 33 |  |  |  |  |  |  | "\x95" => '*',    # BULLET | 
| 34 |  |  |  |  |  |  | "\x96" => '-',    # EN DASH | 
| 35 |  |  |  |  |  |  | "\x97" => '--',   # EM DASH | 
| 36 |  |  |  |  |  |  | "\x98" => '~',    # SMALL TILDE | 
| 37 |  |  |  |  |  |  | "\x99" => '(tm)', # TRADE MARK SIGN | 
| 38 |  |  |  |  |  |  | "\x9a" => 's',    # LATIN SMALL LETTER S WITH CARON | 
| 39 |  |  |  |  |  |  | "\x9b" => '>',    # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK | 
| 40 |  |  |  |  |  |  | "\x9c" => 'oe',   # LATIN SMALL LIGATURE OE | 
| 41 |  |  |  |  |  |  | "\x9e" => 'z',    # LATIN SMALL LETTER Z WITH CARON | 
| 42 |  |  |  |  |  |  | "\x9f" => 'Y',    # LATIN CAPITAL LETTER Y WITH DIAERESIS | 
| 43 |  |  |  |  |  |  | ); | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | our %utf8_for = ( | 
| 46 |  |  |  |  |  |  | # https://en.wikipedia.org/wiki/Windows-1252 | 
| 47 |  |  |  |  |  |  | "\x80" => '€',    # EURO SIGN | 
| 48 |  |  |  |  |  |  | "\x82" => ',',    # SINGLE LOW-9 QUOTATION MARK | 
| 49 |  |  |  |  |  |  | "\x83" => 'ƒ',    # LATIN SMALL LETTER F WITH HOOK | 
| 50 |  |  |  |  |  |  | "\x84" => '„',    # DOUBLE LOW-9 QUOTATION MARK | 
| 51 |  |  |  |  |  |  | "\x85" => '…',    # HORIZONTAL ELLIPSIS | 
| 52 |  |  |  |  |  |  | "\x86" => '†',    # DAGGER | 
| 53 |  |  |  |  |  |  | "\x87" => '‡',    # DOUBLE DAGGER | 
| 54 |  |  |  |  |  |  | "\x88" => 'ˆ',    # MODIFIER LETTER CIRCUMFLEX ACCENT | 
| 55 |  |  |  |  |  |  | "\x89" => '‰',    # PER MILLE SIGN | 
| 56 |  |  |  |  |  |  | "\x8a" => 'Š',    # LATIN CAPITAL LETTER S WITH CARON | 
| 57 |  |  |  |  |  |  | "\x8b" => '‹',    # SINGLE LEFT-POINTING ANGLE QUOTATION MARK | 
| 58 |  |  |  |  |  |  | "\x8c" => 'Œ',    # LATIN CAPITAL LIGATURE OE | 
| 59 |  |  |  |  |  |  | "\x8e" => 'Ž',    # LATIN CAPITAL LETTER Z WITH CARON | 
| 60 |  |  |  |  |  |  | "\x91" => '‘',    # LEFT SINGLE QUOTATION MARK | 
| 61 |  |  |  |  |  |  | "\x92" => '’',    # RIGHT SINGLE QUOTATION MARK | 
| 62 |  |  |  |  |  |  | "\x93" => '“',    # LEFT DOUBLE QUOTATION MARK | 
| 63 |  |  |  |  |  |  | "\x94" => '”',    # RIGHT DOUBLE QUOTATION MARK | 
| 64 |  |  |  |  |  |  | "\x95" => '•',    # BULLET | 
| 65 |  |  |  |  |  |  | "\x96" => '–',    # EN DASH | 
| 66 |  |  |  |  |  |  | "\x97" => '—',    # EM DASH | 
| 67 |  |  |  |  |  |  | "\x98" => '˜',    # SMALL TILDE | 
| 68 |  |  |  |  |  |  | "\x99" => '™',    # TRADE MARK SIGN | 
| 69 |  |  |  |  |  |  | "\x9a" => 'š',    # LATIN SMALL LETTER S WITH CARON | 
| 70 |  |  |  |  |  |  | "\x9b" => '›',    # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK | 
| 71 |  |  |  |  |  |  | "\x9c" => 'œ',    # LATIN SMALL LIGATURE OE | 
| 72 |  |  |  |  |  |  | "\x9e" => 'ž',    # LATIN SMALL LETTER Z WITH CARON | 
| 73 |  |  |  |  |  |  | "\x9f" => 'Ÿ',    # LATIN CAPITAL LETTER Y WITH DIAERESIS | 
| 74 |  |  |  |  |  |  | ); | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | my @utf8_skip = ( | 
| 77 |  |  |  |  |  |  | # This translates a utf-8-encoded byte into how many bytes the full utf8 | 
| 78 |  |  |  |  |  |  | # character occupies.  Illegal start bytes have a negative count. | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | # UTF-8 is a variable-length encoding.  The 128 ASCII characters were very | 
| 81 |  |  |  |  |  |  | # deliberately set to be themselves, so UTF-8 would be backwards compatible | 
| 82 |  |  |  |  |  |  | # with 7-bit applications.  Every other character has 2 - 13 bytes comprising | 
| 83 |  |  |  |  |  |  | # it. | 
| 84 |  |  |  |  |  |  | # | 
| 85 |  |  |  |  |  |  | # If the first bit of the first byte in a character is 0, it is one of those | 
| 86 |  |  |  |  |  |  | # 128 ASCII characters with length 1. | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | # Otherwise, the first bit is 1, and if the second bit is also one, this byte | 
| 89 |  |  |  |  |  |  | # starts the sequence of bytes that represent the character.  The bytes C0-FF | 
| 90 |  |  |  |  |  |  | # have the characteristic that the first two bits are both one.  The number of | 
| 91 |  |  |  |  |  |  | # bytes that form a character corresponds to the number of consecutive leading | 
| 92 |  |  |  |  |  |  | # bits that are all one in the start byte.  In the case of FE, the first 7 | 
| 93 |  |  |  |  |  |  | # bits are one, so the number of bytes in the character it represents is 7. | 
| 94 |  |  |  |  |  |  | # FF is a special case, and Perl has arbitrarily set it to 13 instead of the | 
| 95 |  |  |  |  |  |  | # expected 8. | 
| 96 |  |  |  |  |  |  | # | 
| 97 |  |  |  |  |  |  | # The remaining bytes begin with '10', from 80..9F.  They are called | 
| 98 |  |  |  |  |  |  | # continuation bytes, and a UTF-8 character is comprised of a start byte | 
| 99 |  |  |  |  |  |  | # indicating 'n' bytes total in it, then 'n-1' of these continuation bytes. | 
| 100 |  |  |  |  |  |  | # What the character is that each sequence represents is derived by shifting | 
| 101 |  |  |  |  |  |  | # and adding the other bits in the bytes.  (C0 and C1 aren't actually legal | 
| 102 |  |  |  |  |  |  | # start bytes for security reasons that need not concern us here, hence are | 
| 103 |  |  |  |  |  |  | # marked as negative in the table below.) | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F | 
| 106 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0 | 
| 107 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 1 | 
| 108 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 2 | 
| 109 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 3 | 
| 110 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 4 | 
| 111 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 5 | 
| 112 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 6 | 
| 113 |  |  |  |  |  |  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 7 | 
| 114 |  |  |  |  |  |  | -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,  # 8 | 
| 115 |  |  |  |  |  |  | -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,  # 9 | 
| 116 |  |  |  |  |  |  | -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,  # A | 
| 117 |  |  |  |  |  |  | -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,  # B | 
| 118 |  |  |  |  |  |  | -1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # C | 
| 119 |  |  |  |  |  |  | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # D | 
| 120 |  |  |  |  |  |  | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # E | 
| 121 |  |  |  |  |  |  | 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13,  # F | 
| 122 |  |  |  |  |  |  | ); | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | BEGIN { | 
| 125 | 3 | 50 |  | 3 |  | 21 | my $proto = $] >= 5.010000 ? '_' : '$'; | 
| 126 | 3 |  |  | 10 | 0 | 219 | eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }"; | 
|  | 10 |  |  |  |  | 3300 |  | 
|  | 10 |  |  |  |  | 34 |  | 
| 127 | 3 |  |  | 11 | 0 | 1494 | eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for;  &_tweakit; }"; | 
|  | 11 |  |  |  |  | 4984 |  | 
|  | 11 |  |  |  |  | 35 |  | 
| 128 |  |  |  |  |  |  | } | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | # These are the bytes that CP1252 redefines | 
| 131 |  |  |  |  |  |  | my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/; | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | sub _tweakit { | 
| 134 | 21 |  |  | 21 |  | 45 | my $table = shift; | 
| 135 | 21 | 100 |  |  |  | 64 | return unless defined $_[0]; | 
| 136 | 19 | 100 |  |  |  | 57 | local $_[0] = $_[0] if defined wantarray; | 
| 137 | 19 |  |  |  |  | 57 | my $is_utf8 = PERL588 && Encode::is_utf8($_[0]); | 
| 138 | 19 |  | 100 |  |  | 66 | my $valid_utf8 = $is_utf8 && utf8::valid($_[0]); | 
| 139 | 19 | 100 |  |  |  | 51 | if (!$is_utf8) { | 
|  |  | 100 |  |  |  |  |  | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | # Here is non-UTF-8. Change the 1252 characters to their UTF-8 | 
| 142 |  |  |  |  |  |  | # counterparts. These bytes are very rarely used in real world | 
| 143 |  |  |  |  |  |  | # applications, so their presence likely indicates that CP1252 was | 
| 144 |  |  |  |  |  |  | # meant. | 
| 145 | 12 |  |  |  |  | 124 | $_[0] =~ s/($cp1252_re)/$table->{$1}/gems; | 
|  | 272 |  |  |  |  | 647 |  | 
| 146 |  |  |  |  |  |  | } elsif ($valid_utf8) { | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | # Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on | 
| 149 |  |  |  |  |  |  | # and the string is held as bytes. Change the 1252 characters to their | 
| 150 |  |  |  |  |  |  | # Unicode counterparts. | 
| 151 | 5 |  |  |  |  | 50 | $_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems; | 
|  | 81 |  |  |  |  | 1324 |  | 
| 152 |  |  |  |  |  |  | } else {    # Invalid UTF-8.  Look for single-byte CP1252 gremlins | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | # Turn off the UTF-8 flag so that we can go through the string | 
| 155 |  |  |  |  |  |  | # byte-by-byte. | 
| 156 | 2 |  |  |  |  | 7 | Encode::_utf8_off($_[0]); | 
| 157 |  |  |  |  |  |  |  | 
| 158 | 2 |  |  |  |  | 3 | my $i = 0; | 
| 159 | 2 |  |  |  |  | 5 | my $length = length $_[0]; | 
| 160 | 2 |  |  |  |  | 3 | my $fixed = "";     # The input after being fixed up by this loop | 
| 161 | 2 |  |  |  |  | 6 | while ($i < $length) { | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | # Each time through the loop, we should here be ready to look at a | 
| 164 |  |  |  |  |  |  | # new character, and it's 0th byte is called a 'start byte' | 
| 165 | 106 |  |  |  |  | 151 | my $start_byte = substr($_[0], $i, 1); | 
| 166 | 106 |  |  |  |  | 142 | my $skip = $utf8_skip[ord $start_byte]; | 
| 167 |  |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  | # The table is set up so that legal UTF-8 start bytes have a | 
| 169 |  |  |  |  |  |  | # positive byte length.  Simply add all the bytes in the character | 
| 170 |  |  |  |  |  |  | # to the output, and go on to handle the next character in the | 
| 171 |  |  |  |  |  |  | # next loop iteration. | 
| 172 | 106 | 100 |  |  |  | 166 | if ($skip > 0) { | 
| 173 | 52 |  |  |  |  | 73 | $fixed .= substr($_[0], $i, $skip); | 
| 174 | 52 |  |  |  |  | 62 | $i += $skip; | 
| 175 | 52 |  |  |  |  | 85 | next; | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | # Here we have a byte that isn't a start byte in a position that | 
| 179 |  |  |  |  |  |  | # should oughta be a start byte.  The whole point of this loop is | 
| 180 |  |  |  |  |  |  | # to find such bytes that are CP1252 ones and which were | 
| 181 |  |  |  |  |  |  | # incorrectly inserted by the upstream process into an otherwise | 
| 182 |  |  |  |  |  |  | # valid UTF-8 string.  So, if we have such a one, change it into | 
| 183 |  |  |  |  |  |  | # its corresponding correct character. | 
| 184 | 54 | 50 |  |  |  | 192 | if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) { | 
|  | 54 |  |  |  |  | 152 |  | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | # The correct character may be UTF-8 bytes.  We treat them as | 
| 187 |  |  |  |  |  |  | # just a sequence of non-UTF-8 bytes, because that's what | 
| 188 |  |  |  |  |  |  | # $fixed has in it so far.  After everything is consistently | 
| 189 |  |  |  |  |  |  | # added, we turn the UTF-8 flag back on before returning at | 
| 190 |  |  |  |  |  |  | # the end. | 
| 191 | 54 |  |  |  |  | 114 | Encode::_utf8_off($start_byte); | 
| 192 | 54 |  |  |  |  | 78 | $fixed .= $start_byte; | 
| 193 | 54 |  |  |  |  | 64 | $i++; | 
| 194 | 54 |  |  |  |  | 113 | next; | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | # Here the byte isn't a CP1252 one. | 
| 198 | 0 |  |  |  |  | 0 | die "Unexpected continuation byte: %02x", ord $start_byte; | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | # $fixed now has everything properly in it, but set to return it in | 
| 202 |  |  |  |  |  |  | # $_[0], marked as UTF-8. | 
| 203 | 2 |  |  |  |  | 5 | $_[0] = $fixed; | 
| 204 | 2 |  |  |  |  | 5 | Encode::_utf8_on($_[0]); | 
| 205 |  |  |  |  |  |  | } | 
| 206 | 19 | 100 |  |  |  | 175 | return $_[0] if defined wantarray; | 
| 207 |  |  |  |  |  |  | } | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | 1; | 
| 210 |  |  |  |  |  |  | __END__ |