line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Tools::UTF8; |
2
|
38
|
|
|
38
|
|
252401
|
use strict; |
|
38
|
|
|
|
|
97
|
|
|
38
|
|
|
|
|
902
|
|
3
|
38
|
|
|
38
|
|
152
|
use warnings; |
|
38
|
|
|
|
|
56
|
|
|
38
|
|
|
|
|
733
|
|
4
|
38
|
|
|
38
|
|
140
|
use Carp; |
|
38
|
|
|
|
|
58
|
|
|
38
|
|
|
|
|
1624
|
|
5
|
38
|
|
|
38
|
|
3677
|
use Search::Tools; # XS stuff |
|
38
|
|
|
|
|
70
|
|
|
38
|
|
|
|
|
820
|
|
6
|
38
|
|
|
38
|
|
13064
|
use Encode; |
|
38
|
|
|
|
|
246328
|
|
|
38
|
|
|
|
|
2283
|
|
7
|
38
|
|
|
38
|
|
14968
|
use charnames ':full'; |
|
38
|
|
|
|
|
939019
|
|
|
38
|
|
|
|
|
213
|
|
8
|
38
|
|
|
38
|
|
10125
|
use Data::Dump qw( dump ); |
|
38
|
|
|
|
|
39814
|
|
|
38
|
|
|
|
|
1910
|
|
9
|
38
|
|
|
38
|
|
198
|
use base qw( Exporter ); |
|
38
|
|
|
|
|
65
|
|
|
38
|
|
|
|
|
47624
|
|
10
|
|
|
|
|
|
|
our @EXPORT = qw( |
11
|
|
|
|
|
|
|
to_utf8 |
12
|
|
|
|
|
|
|
is_valid_utf8 |
13
|
|
|
|
|
|
|
is_flagged_utf8 |
14
|
|
|
|
|
|
|
is_perl_utf8_string |
15
|
|
|
|
|
|
|
is_ascii |
16
|
|
|
|
|
|
|
is_latin1 |
17
|
|
|
|
|
|
|
is_sane_utf8 |
18
|
|
|
|
|
|
|
find_bad_utf8 |
19
|
|
|
|
|
|
|
find_bad_ascii |
20
|
|
|
|
|
|
|
find_bad_latin1 |
21
|
|
|
|
|
|
|
find_bad_latin1_report |
22
|
|
|
|
|
|
|
byte_length |
23
|
|
|
|
|
|
|
looks_like_cp1252 |
24
|
|
|
|
|
|
|
fix_cp1252_codepoints_in_utf8 |
25
|
|
|
|
|
|
|
debug_bytes |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our $Debug = ( $ENV{PERL_DEBUG} && $ENV{PERL_DEBUG} > 2 ) ? 1 : 0; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
sub to_utf8 { |
33
|
604
|
|
|
604
|
1
|
2158
|
my $str = shift; |
34
|
604
|
50
|
|
|
|
1068
|
Carp::cluck("\$str is undefined") unless defined $str; |
35
|
|
|
|
|
|
|
|
36
|
604
|
|
100
|
|
|
1163
|
my $charset = shift || 'iso-8859-1'; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# checks first |
39
|
604
|
100
|
|
|
|
965
|
if ( is_flagged_utf8($str) ) { |
40
|
393
|
50
|
|
|
|
617
|
$Debug and carp "string '$str' is flagged utf8 already"; |
41
|
393
|
|
|
|
|
1159
|
return $str; |
42
|
|
|
|
|
|
|
} |
43
|
211
|
100
|
|
|
|
1153
|
if ( is_ascii($str) ) { |
44
|
193
|
|
|
|
|
660
|
Encode::_utf8_on($str); |
45
|
193
|
50
|
|
|
|
398
|
$Debug and carp "string '$str' is ascii; utf8 flag turned on"; |
46
|
193
|
|
|
|
|
1196
|
return $str; |
47
|
|
|
|
|
|
|
} |
48
|
18
|
100
|
|
|
|
53
|
if ( is_valid_utf8($str) ) { |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# we got here only because the flag was off and it wasn't ascii. |
51
|
|
|
|
|
|
|
# however, is_valid_utf8() claims that it is valid internal UTF8, |
52
|
|
|
|
|
|
|
# so just turn the flag on. |
53
|
6
|
|
|
|
|
19
|
Encode::_utf8_on($str); |
54
|
6
|
50
|
|
|
|
14
|
$Debug and carp "string '$str' is valid utf8; utf8 flag turned on"; |
55
|
6
|
|
|
|
|
18
|
return $str; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
$Debug |
59
|
12
|
50
|
|
|
|
33
|
and carp "converting $str from $charset -> utf8"; |
60
|
12
|
|
|
|
|
73
|
my $c = Encode::decode( $charset, $str ); |
61
|
12
|
50
|
|
|
|
1292
|
$Debug and carp "converted $c"; |
62
|
|
|
|
|
|
|
|
63
|
12
|
50
|
|
|
|
43
|
unless ( is_sane_utf8( $c, 1 ) ) { |
64
|
0
|
|
|
|
|
0
|
carp "not sane: $c"; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
12
|
|
|
|
|
153
|
return $c; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub is_flagged_utf8 { |
71
|
611
|
|
|
611
|
1
|
1849
|
return Encode::is_utf8( $_[0] ); |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my $re_bit = join "|", |
75
|
|
|
|
|
|
|
map { Encode::encode( "utf8", chr($_) ) } ( 127 .. 255 ); |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
#binmode STDERR, ":utf8"; |
78
|
|
|
|
|
|
|
#print STDERR $re_bit; |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub is_sane_utf8 { |
81
|
15
|
|
|
15
|
1
|
46
|
my $string = shift; |
82
|
15
|
|
50
|
|
|
52
|
my $warnings = shift || $Debug || 0; |
83
|
|
|
|
|
|
|
|
84
|
15
|
|
|
|
|
25
|
my $is_insane = 0; |
85
|
15
|
|
|
|
|
3154
|
while ( $string =~ /($re_bit)/go ) { |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# work out what the double encoded string was |
88
|
0
|
|
|
|
|
0
|
my $bytes = $1; |
89
|
|
|
|
|
|
|
|
90
|
0
|
|
|
|
|
0
|
my $index = $+[0] - length($bytes); |
91
|
0
|
|
|
|
|
0
|
my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, |
|
0
|
|
|
|
|
0
|
|
92
|
|
|
|
|
|
|
$bytes; |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# what character does that represent? |
95
|
0
|
|
|
|
|
0
|
my $char = Encode::decode( "utf8", $bytes ); |
96
|
0
|
|
|
|
|
0
|
my $ord = ord($char); |
97
|
0
|
|
|
|
|
0
|
my $hex = sprintf '%00x', $ord; |
98
|
0
|
|
|
|
|
0
|
$char = charnames::viacode($ord); |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# print out diagnostic messages |
101
|
0
|
0
|
|
|
|
0
|
if ($warnings) { |
102
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
0
|
warn(qq{Found dodgy chars "$codes" at char $index\n}); |
104
|
0
|
0
|
|
|
|
0
|
if ( Encode::is_utf8($string) ) { |
105
|
0
|
|
|
|
|
0
|
warn("Chars in utf8 string look like utf8 byte sequence."); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
else { |
108
|
0
|
|
|
|
|
0
|
warn("String not flagged as utf8...was it meant to be?\n"); |
109
|
|
|
|
|
|
|
} |
110
|
0
|
|
|
|
|
0
|
warn( |
111
|
|
|
|
|
|
|
"Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n" |
112
|
|
|
|
|
|
|
); |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
} |
115
|
0
|
|
|
|
|
0
|
$is_insane++; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
15
|
50
|
|
|
|
122
|
return $is_insane ? 0 : 1; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub is_valid_utf8 { |
122
|
39
|
100
|
100
|
39
|
1
|
4963
|
if ( is_latin1( $_[0] ) |
|
|
|
100
|
|
|
|
|
123
|
|
|
|
|
|
|
&& !is_ascii( $_[0] ) |
124
|
|
|
|
|
|
|
&& !is_perl_utf8_string( $_[0] ) ) |
125
|
|
|
|
|
|
|
{ |
126
|
8
|
|
|
|
|
30
|
return 0; |
127
|
|
|
|
|
|
|
} |
128
|
31
|
|
|
|
|
155
|
return is_perl_utf8_string( $_[0] ); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub find_bad_latin1_report { |
132
|
2
|
|
|
2
|
1
|
223
|
my $bad = find_bad_latin1(@_); |
133
|
2
|
50
|
|
|
|
5
|
if ($bad) { |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# explain why we failed |
136
|
2
|
|
|
|
|
5
|
my $char = substr( $_[0], $bad, 1 ); |
137
|
2
|
|
|
|
|
3
|
my $dec = ord($char); |
138
|
2
|
|
|
|
|
7
|
my $hex = sprintf '%x', $dec; |
139
|
2
|
|
|
|
|
277
|
carp("byte $bad ($char) is not Latin1 (it's $dec dec / $hex hex)"); |
140
|
|
|
|
|
|
|
} |
141
|
2
|
|
|
|
|
16
|
return $bad; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub looks_like_cp1252 { |
145
|
6
|
100
|
33
|
6
|
1
|
59
|
if ( !is_latin1( $_[0] ) |
|
|
|
66
|
|
|
|
|
146
|
|
|
|
|
|
|
&& !is_ascii( $_[0] ) |
147
|
|
|
|
|
|
|
&& $_[0] =~ m/[\x80-\x9f]/ ) |
148
|
|
|
|
|
|
|
{ |
149
|
5
|
|
|
|
|
22
|
return 1; |
150
|
|
|
|
|
|
|
} |
151
|
1
|
|
|
|
|
5
|
return 0; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
my %win1252 = ( |
155
|
|
|
|
|
|
|
"\x80" => "\x{20AC}", #EURO SIGN |
156
|
|
|
|
|
|
|
"\x81" => '', #UNDEFINED |
157
|
|
|
|
|
|
|
"\x82" => "\x{201A}", #SINGLE LOW-9 QUOTATION MARK |
158
|
|
|
|
|
|
|
"\x83" => "\x{0192}", #LATIN SMALL LETTER F WITH HOOK |
159
|
|
|
|
|
|
|
"\x84" => "\x{201E}", #DOUBLE LOW-9 QUOTATION MARK |
160
|
|
|
|
|
|
|
"\x85" => "\x{2026}", #HORIZONTAL ELLIPSIS |
161
|
|
|
|
|
|
|
"\x86" => "\x{2020}", #DAGGER |
162
|
|
|
|
|
|
|
"\x87" => "\x{2021}", #DOUBLE DAGGER |
163
|
|
|
|
|
|
|
"\x88" => "\x{02C6}", #MODIFIER LETTER CIRCUMFLEX ACCENT |
164
|
|
|
|
|
|
|
"\x89" => "\x{2030}", #PER MILLE SIGN |
165
|
|
|
|
|
|
|
"\x8A" => "\x{0160}", #LATIN CAPITAL LETTER S WITH CARON |
166
|
|
|
|
|
|
|
"\x8B" => "\x{2039}", #SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
167
|
|
|
|
|
|
|
"\x8C" => "\x{0152}", #LATIN CAPITAL LIGATURE OE |
168
|
|
|
|
|
|
|
"\x8D" => '', #UNDEFINED |
169
|
|
|
|
|
|
|
"\x8E" => "\x{017D}", #LATIN CAPITAL LETTER Z WITH CARON |
170
|
|
|
|
|
|
|
"\x8F" => '', #UNDEFINED |
171
|
|
|
|
|
|
|
"\x90" => '', #UNDEFINED |
172
|
|
|
|
|
|
|
"\x91" => "\x{2018}", #LEFT SINGLE QUOTATION MARK |
173
|
|
|
|
|
|
|
"\x92" => "\x{2019}", #RIGHT SINGLE QUOTATION MARK |
174
|
|
|
|
|
|
|
"\x93" => "\x{201C}", #LEFT DOUBLE QUOTATION MARK |
175
|
|
|
|
|
|
|
"\x94" => "\x{201D}", #RIGHT DOUBLE QUOTATION MARK |
176
|
|
|
|
|
|
|
"\x95" => "\x{2022}", #BULLET |
177
|
|
|
|
|
|
|
"\x96" => "\x{2013}", #EN DASH |
178
|
|
|
|
|
|
|
"\x97" => "\x{2014}", #EM DASH |
179
|
|
|
|
|
|
|
"\x98" => "\x{02DC}", #SMALL TILDE |
180
|
|
|
|
|
|
|
"\x99" => "\x{2122}", #TRADE MARK SIGN |
181
|
|
|
|
|
|
|
"\x9A" => "\x{0161}", #LATIN SMALL LETTER S WITH CARON |
182
|
|
|
|
|
|
|
"\x9B" => "\x{203A}", #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
183
|
|
|
|
|
|
|
"\x9C" => "\x{0153}", #LATIN SMALL LIGATURE OE |
184
|
|
|
|
|
|
|
"\x9D" => '', #UNDEFINED |
185
|
|
|
|
|
|
|
"\x9E" => "\x{017E}", #LATIN SMALL LETTER Z WITH CARON |
186
|
|
|
|
|
|
|
"\x9F" => "\x{0178}", #LATIN CAPITAL LETTER Y WITH DIAERESIS |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
); |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# fix_latin (used in Transliterate) lacks the check for the |
191
|
|
|
|
|
|
|
# prefixed \xc2 byte, but the UTF-8 encoding for these |
192
|
|
|
|
|
|
|
# Windows codepoints has the leading \xc2 byte. |
193
|
|
|
|
|
|
|
sub fix_cp1252_codepoints_in_utf8 { |
194
|
1
|
|
|
1
|
1
|
217
|
my $buf = shift; |
195
|
1
|
50
|
|
|
|
2
|
unless ( is_valid_utf8($buf) ) { |
196
|
0
|
|
|
|
|
0
|
my $badbyte = find_bad_utf8($buf); |
197
|
0
|
|
|
|
|
0
|
croak "bad UTF-8 byte(s) at $badbyte [ " . dump($buf) . " ]"; |
198
|
|
|
|
|
|
|
} |
199
|
1
|
50
|
|
|
|
3
|
$Debug and warn "converting $buf\n"; |
200
|
1
|
|
|
|
|
2
|
my $bytes = Encode::encode_utf8( to_utf8($buf) ); |
201
|
1
|
|
|
|
|
14
|
$bytes =~ s/\xc2([\x80-\x9f])/$win1252{$1}/g; |
202
|
1
|
|
|
|
|
2
|
return to_utf8($bytes); |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
1; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
__END__ |