| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Search::Tools::UTF8; | 
| 2 | 38 |  |  | 38 |  | 252401 | use strict; | 
|  | 38 |  |  |  |  | 97 |  | 
|  | 38 |  |  |  |  | 902 |  | 
| 3 | 38 |  |  | 38 |  | 152 | use warnings; | 
|  | 38 |  |  |  |  | 56 |  | 
|  | 38 |  |  |  |  | 733 |  | 
| 4 | 38 |  |  | 38 |  | 140 | use Carp; | 
|  | 38 |  |  |  |  | 58 |  | 
|  | 38 |  |  |  |  | 1624 |  | 
| 5 | 38 |  |  | 38 |  | 3677 | use Search::Tools;    # XS stuff | 
|  | 38 |  |  |  |  | 70 |  | 
|  | 38 |  |  |  |  | 820 |  | 
| 6 | 38 |  |  | 38 |  | 13064 | use Encode; | 
|  | 38 |  |  |  |  | 246328 |  | 
|  | 38 |  |  |  |  | 2283 |  | 
| 7 | 38 |  |  | 38 |  | 14968 | use charnames ':full'; | 
|  | 38 |  |  |  |  | 939019 |  | 
|  | 38 |  |  |  |  | 213 |  | 
| 8 | 38 |  |  | 38 |  | 10125 | use Data::Dump qw( dump ); | 
|  | 38 |  |  |  |  | 39814 |  | 
|  | 38 |  |  |  |  | 1910 |  | 
| 9 | 38 |  |  | 38 |  | 198 | use base qw( Exporter ); | 
|  | 38 |  |  |  |  | 65 |  | 
|  | 38 |  |  |  |  | 47624 |  | 
| 10 |  |  |  |  |  |  | our @EXPORT = qw( | 
| 11 |  |  |  |  |  |  | to_utf8 | 
| 12 |  |  |  |  |  |  | is_valid_utf8 | 
| 13 |  |  |  |  |  |  | is_flagged_utf8 | 
| 14 |  |  |  |  |  |  | is_perl_utf8_string | 
| 15 |  |  |  |  |  |  | is_ascii | 
| 16 |  |  |  |  |  |  | is_latin1 | 
| 17 |  |  |  |  |  |  | is_sane_utf8 | 
| 18 |  |  |  |  |  |  | find_bad_utf8 | 
| 19 |  |  |  |  |  |  | find_bad_ascii | 
| 20 |  |  |  |  |  |  | find_bad_latin1 | 
| 21 |  |  |  |  |  |  | find_bad_latin1_report | 
| 22 |  |  |  |  |  |  | byte_length | 
| 23 |  |  |  |  |  |  | looks_like_cp1252 | 
| 24 |  |  |  |  |  |  | fix_cp1252_codepoints_in_utf8 | 
| 25 |  |  |  |  |  |  | debug_bytes | 
| 26 |  |  |  |  |  |  | ); | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | our $Debug = ( $ENV{PERL_DEBUG} && $ENV{PERL_DEBUG} > 2 ) ? 1 : 0; | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | our $VERSION = '1.006'; | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | sub to_utf8 { | 
| 33 | 604 |  |  | 604 | 1 | 2158 | my $str = shift; | 
| 34 | 604 | 50 |  |  |  | 1068 | Carp::cluck("\$str is undefined") unless defined $str; | 
| 35 |  |  |  |  |  |  |  | 
| 36 | 604 |  | 100 |  |  | 1163 | my $charset = shift || 'iso-8859-1'; | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | # checks first | 
| 39 | 604 | 100 |  |  |  | 965 | if ( is_flagged_utf8($str) ) { | 
| 40 | 393 | 50 |  |  |  | 617 | $Debug and carp "string '$str' is flagged utf8 already"; | 
| 41 | 393 |  |  |  |  | 1159 | return $str; | 
| 42 |  |  |  |  |  |  | } | 
| 43 | 211 | 100 |  |  |  | 1153 | if ( is_ascii($str) ) { | 
| 44 | 193 |  |  |  |  | 660 | Encode::_utf8_on($str); | 
| 45 | 193 | 50 |  |  |  | 398 | $Debug and carp "string '$str' is ascii; utf8 flag turned on"; | 
| 46 | 193 |  |  |  |  | 1196 | return $str; | 
| 47 |  |  |  |  |  |  | } | 
| 48 | 18 | 100 |  |  |  | 53 | if ( is_valid_utf8($str) ) { | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | # we got here only because the flag was off and it wasn't ascii. | 
| 51 |  |  |  |  |  |  | # however, is_valid_utf8() claims that it is valid internal UTF8, | 
| 52 |  |  |  |  |  |  | # so just turn the flag on. | 
| 53 | 6 |  |  |  |  | 19 | Encode::_utf8_on($str); | 
| 54 | 6 | 50 |  |  |  | 14 | $Debug and carp "string '$str' is valid utf8; utf8 flag turned on"; | 
| 55 | 6 |  |  |  |  | 18 | return $str; | 
| 56 |  |  |  |  |  |  | } | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | $Debug | 
| 59 | 12 | 50 |  |  |  | 33 | and carp "converting $str from $charset -> utf8"; | 
| 60 | 12 |  |  |  |  | 73 | my $c = Encode::decode( $charset, $str ); | 
| 61 | 12 | 50 |  |  |  | 1292 | $Debug and carp "converted $c"; | 
| 62 |  |  |  |  |  |  |  | 
| 63 | 12 | 50 |  |  |  | 43 | unless ( is_sane_utf8( $c, 1 ) ) { | 
| 64 | 0 |  |  |  |  | 0 | carp "not sane: $c"; | 
| 65 |  |  |  |  |  |  | } | 
| 66 |  |  |  |  |  |  |  | 
| 67 | 12 |  |  |  |  | 153 | return $c; | 
| 68 |  |  |  |  |  |  | } | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | sub is_flagged_utf8 { | 
| 71 | 611 |  |  | 611 | 1 | 1849 | return Encode::is_utf8( $_[0] ); | 
| 72 |  |  |  |  |  |  | } | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | my $re_bit = join "|", | 
| 75 |  |  |  |  |  |  | map { Encode::encode( "utf8", chr($_) ) } ( 127 .. 255 ); | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | #binmode STDERR, ":utf8"; | 
| 78 |  |  |  |  |  |  | #print STDERR $re_bit; | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | sub is_sane_utf8 { | 
| 81 | 15 |  |  | 15 | 1 | 46 | my $string = shift; | 
| 82 | 15 |  | 50 |  |  | 52 | my $warnings = shift || $Debug || 0; | 
| 83 |  |  |  |  |  |  |  | 
| 84 | 15 |  |  |  |  | 25 | my $is_insane = 0; | 
| 85 | 15 |  |  |  |  | 3154 | while ( $string =~ /($re_bit)/go ) { | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | # work out what the double encoded string was | 
| 88 | 0 |  |  |  |  | 0 | my $bytes = $1; | 
| 89 |  |  |  |  |  |  |  | 
| 90 | 0 |  |  |  |  | 0 | my $index = $+[0] - length($bytes); | 
| 91 | 0 |  |  |  |  | 0 | my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, | 
|  | 0 |  |  |  |  | 0 |  | 
| 92 |  |  |  |  |  |  | $bytes; | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | # what character does that represent? | 
| 95 | 0 |  |  |  |  | 0 | my $char = Encode::decode( "utf8", $bytes ); | 
| 96 | 0 |  |  |  |  | 0 | my $ord  = ord($char); | 
| 97 | 0 |  |  |  |  | 0 | my $hex  = sprintf '%00x', $ord; | 
| 98 | 0 |  |  |  |  | 0 | $char = charnames::viacode($ord); | 
| 99 |  |  |  |  |  |  |  | 
| 100 |  |  |  |  |  |  | # print out diagnostic messages | 
| 101 | 0 | 0 |  |  |  | 0 | if ($warnings) { | 
| 102 |  |  |  |  |  |  |  | 
| 103 | 0 |  |  |  |  | 0 | warn(qq{Found dodgy chars "$codes" at char $index\n}); | 
| 104 | 0 | 0 |  |  |  | 0 | if ( Encode::is_utf8($string) ) { | 
| 105 | 0 |  |  |  |  | 0 | warn("Chars in utf8 string look like utf8 byte sequence."); | 
| 106 |  |  |  |  |  |  | } | 
| 107 |  |  |  |  |  |  | else { | 
| 108 | 0 |  |  |  |  | 0 | warn("String not flagged as utf8...was it meant to be?\n"); | 
| 109 |  |  |  |  |  |  | } | 
| 110 | 0 |  |  |  |  | 0 | warn( | 
| 111 |  |  |  |  |  |  | "Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n" | 
| 112 |  |  |  |  |  |  | ); | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | } | 
| 115 | 0 |  |  |  |  | 0 | $is_insane++; | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  |  | 
| 118 | 15 | 50 |  |  |  | 122 | return $is_insane ? 0 : 1; | 
| 119 |  |  |  |  |  |  | } | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | sub is_valid_utf8 { | 
| 122 | 39 | 100 | 100 | 39 | 1 | 4963 | if (   is_latin1( $_[0] ) | 
|  |  |  | 100 |  |  |  |  | 
| 123 |  |  |  |  |  |  | && !is_ascii( $_[0] ) | 
| 124 |  |  |  |  |  |  | && !is_perl_utf8_string( $_[0] ) ) | 
| 125 |  |  |  |  |  |  | { | 
| 126 | 8 |  |  |  |  | 30 | return 0; | 
| 127 |  |  |  |  |  |  | } | 
| 128 | 31 |  |  |  |  | 155 | return is_perl_utf8_string( $_[0] ); | 
| 129 |  |  |  |  |  |  | } | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | sub find_bad_latin1_report { | 
| 132 | 2 |  |  | 2 | 1 | 223 | my $bad = find_bad_latin1(@_); | 
| 133 | 2 | 50 |  |  |  | 5 | if ($bad) { | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | # explain why we failed | 
| 136 | 2 |  |  |  |  | 5 | my $char = substr( $_[0], $bad, 1 ); | 
| 137 | 2 |  |  |  |  | 3 | my $dec  = ord($char); | 
| 138 | 2 |  |  |  |  | 7 | my $hex  = sprintf '%x', $dec; | 
| 139 | 2 |  |  |  |  | 277 | carp("byte $bad ($char) is not Latin1 (it's $dec dec / $hex hex)"); | 
| 140 |  |  |  |  |  |  | } | 
| 141 | 2 |  |  |  |  | 16 | return $bad; | 
| 142 |  |  |  |  |  |  | } | 
| 143 |  |  |  |  |  |  |  | 
| 144 |  |  |  |  |  |  | sub looks_like_cp1252 { | 
| 145 | 6 | 100 | 33 | 6 | 1 | 59 | if (   !is_latin1( $_[0] ) | 
|  |  |  | 66 |  |  |  |  | 
| 146 |  |  |  |  |  |  | && !is_ascii( $_[0] ) | 
| 147 |  |  |  |  |  |  | && $_[0] =~ m/[\x80-\x9f]/ ) | 
| 148 |  |  |  |  |  |  | { | 
| 149 | 5 |  |  |  |  | 22 | return 1; | 
| 150 |  |  |  |  |  |  | } | 
| 151 | 1 |  |  |  |  | 5 | return 0; | 
| 152 |  |  |  |  |  |  | } | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | my %win1252 = ( | 
| 155 |  |  |  |  |  |  | "\x80" => "\x{20AC}",    #EURO SIGN | 
| 156 |  |  |  |  |  |  | "\x81" => '',            #UNDEFINED | 
| 157 |  |  |  |  |  |  | "\x82" => "\x{201A}",    #SINGLE LOW-9 QUOTATION MARK | 
| 158 |  |  |  |  |  |  | "\x83" => "\x{0192}",    #LATIN SMALL LETTER F WITH HOOK | 
| 159 |  |  |  |  |  |  | "\x84" => "\x{201E}",    #DOUBLE LOW-9 QUOTATION MARK | 
| 160 |  |  |  |  |  |  | "\x85" => "\x{2026}",    #HORIZONTAL ELLIPSIS | 
| 161 |  |  |  |  |  |  | "\x86" => "\x{2020}",    #DAGGER | 
| 162 |  |  |  |  |  |  | "\x87" => "\x{2021}",    #DOUBLE DAGGER | 
| 163 |  |  |  |  |  |  | "\x88" => "\x{02C6}",    #MODIFIER LETTER CIRCUMFLEX ACCENT | 
| 164 |  |  |  |  |  |  | "\x89" => "\x{2030}",    #PER MILLE SIGN | 
| 165 |  |  |  |  |  |  | "\x8A" => "\x{0160}",    #LATIN CAPITAL LETTER S WITH CARON | 
| 166 |  |  |  |  |  |  | "\x8B" => "\x{2039}",    #SINGLE LEFT-POINTING ANGLE QUOTATION MARK | 
| 167 |  |  |  |  |  |  | "\x8C" => "\x{0152}",    #LATIN CAPITAL LIGATURE OE | 
| 168 |  |  |  |  |  |  | "\x8D" => '',            #UNDEFINED | 
| 169 |  |  |  |  |  |  | "\x8E" => "\x{017D}",    #LATIN CAPITAL LETTER Z WITH CARON | 
| 170 |  |  |  |  |  |  | "\x8F" => '',            #UNDEFINED | 
| 171 |  |  |  |  |  |  | "\x90" => '',            #UNDEFINED | 
| 172 |  |  |  |  |  |  | "\x91" => "\x{2018}",    #LEFT SINGLE QUOTATION MARK | 
| 173 |  |  |  |  |  |  | "\x92" => "\x{2019}",    #RIGHT SINGLE QUOTATION MARK | 
| 174 |  |  |  |  |  |  | "\x93" => "\x{201C}",    #LEFT DOUBLE QUOTATION MARK | 
| 175 |  |  |  |  |  |  | "\x94" => "\x{201D}",    #RIGHT DOUBLE QUOTATION MARK | 
| 176 |  |  |  |  |  |  | "\x95" => "\x{2022}",    #BULLET | 
| 177 |  |  |  |  |  |  | "\x96" => "\x{2013}",    #EN DASH | 
| 178 |  |  |  |  |  |  | "\x97" => "\x{2014}",    #EM DASH | 
| 179 |  |  |  |  |  |  | "\x98" => "\x{02DC}",    #SMALL TILDE | 
| 180 |  |  |  |  |  |  | "\x99" => "\x{2122}",    #TRADE MARK SIGN | 
| 181 |  |  |  |  |  |  | "\x9A" => "\x{0161}",    #LATIN SMALL LETTER S WITH CARON | 
| 182 |  |  |  |  |  |  | "\x9B" => "\x{203A}",    #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK | 
| 183 |  |  |  |  |  |  | "\x9C" => "\x{0153}",    #LATIN SMALL LIGATURE OE | 
| 184 |  |  |  |  |  |  | "\x9D" => '',            #UNDEFINED | 
| 185 |  |  |  |  |  |  | "\x9E" => "\x{017E}",    #LATIN SMALL LETTER Z WITH CARON | 
| 186 |  |  |  |  |  |  | "\x9F" => "\x{0178}",    #LATIN CAPITAL LETTER Y WITH DIAERESIS | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | ); | 
| 189 |  |  |  |  |  |  |  | 
| 190 |  |  |  |  |  |  | # fix_latin (used in Transliterate) lacks the check for the | 
| 191 |  |  |  |  |  |  | # prefixed \xc2 byte, but the UTF-8 encoding for these | 
| 192 |  |  |  |  |  |  | # Windows codepoints has the leading \xc2 byte. | 
| 193 |  |  |  |  |  |  | sub fix_cp1252_codepoints_in_utf8 { | 
| 194 | 1 |  |  | 1 | 1 | 217 | my $buf = shift; | 
| 195 | 1 | 50 |  |  |  | 2 | unless ( is_valid_utf8($buf) ) { | 
| 196 | 0 |  |  |  |  | 0 | my $badbyte = find_bad_utf8($buf); | 
| 197 | 0 |  |  |  |  | 0 | croak "bad UTF-8 byte(s) at $badbyte [ " . dump($buf) . " ]"; | 
| 198 |  |  |  |  |  |  | } | 
| 199 | 1 | 50 |  |  |  | 3 | $Debug and warn "converting $buf\n"; | 
| 200 | 1 |  |  |  |  | 2 | my $bytes = Encode::encode_utf8( to_utf8($buf) ); | 
| 201 | 1 |  |  |  |  | 14 | $bytes =~ s/\xc2([\x80-\x9f])/$win1252{$1}/g; | 
| 202 | 1 |  |  |  |  | 2 | return to_utf8($bytes); | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | 1; | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | __END__ |