| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Search::Tools::UTF8; |
|
2
|
38
|
|
|
38
|
|
252401
|
use strict; |
|
|
38
|
|
|
|
|
97
|
|
|
|
38
|
|
|
|
|
902
|
|
|
3
|
38
|
|
|
38
|
|
152
|
use warnings; |
|
|
38
|
|
|
|
|
56
|
|
|
|
38
|
|
|
|
|
733
|
|
|
4
|
38
|
|
|
38
|
|
140
|
use Carp; |
|
|
38
|
|
|
|
|
58
|
|
|
|
38
|
|
|
|
|
1624
|
|
|
5
|
38
|
|
|
38
|
|
3677
|
use Search::Tools; # XS stuff |
|
|
38
|
|
|
|
|
70
|
|
|
|
38
|
|
|
|
|
820
|
|
|
6
|
38
|
|
|
38
|
|
13064
|
use Encode; |
|
|
38
|
|
|
|
|
246328
|
|
|
|
38
|
|
|
|
|
2283
|
|
|
7
|
38
|
|
|
38
|
|
14968
|
use charnames ':full'; |
|
|
38
|
|
|
|
|
939019
|
|
|
|
38
|
|
|
|
|
213
|
|
|
8
|
38
|
|
|
38
|
|
10125
|
use Data::Dump qw( dump ); |
|
|
38
|
|
|
|
|
39814
|
|
|
|
38
|
|
|
|
|
1910
|
|
|
9
|
38
|
|
|
38
|
|
198
|
use base qw( Exporter ); |
|
|
38
|
|
|
|
|
65
|
|
|
|
38
|
|
|
|
|
47624
|
|
|
10
|
|
|
|
|
|
|
our @EXPORT = qw( |
|
11
|
|
|
|
|
|
|
to_utf8 |
|
12
|
|
|
|
|
|
|
is_valid_utf8 |
|
13
|
|
|
|
|
|
|
is_flagged_utf8 |
|
14
|
|
|
|
|
|
|
is_perl_utf8_string |
|
15
|
|
|
|
|
|
|
is_ascii |
|
16
|
|
|
|
|
|
|
is_latin1 |
|
17
|
|
|
|
|
|
|
is_sane_utf8 |
|
18
|
|
|
|
|
|
|
find_bad_utf8 |
|
19
|
|
|
|
|
|
|
find_bad_ascii |
|
20
|
|
|
|
|
|
|
find_bad_latin1 |
|
21
|
|
|
|
|
|
|
find_bad_latin1_report |
|
22
|
|
|
|
|
|
|
byte_length |
|
23
|
|
|
|
|
|
|
looks_like_cp1252 |
|
24
|
|
|
|
|
|
|
fix_cp1252_codepoints_in_utf8 |
|
25
|
|
|
|
|
|
|
debug_bytes |
|
26
|
|
|
|
|
|
|
); |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our $Debug = ( $ENV{PERL_DEBUG} && $ENV{PERL_DEBUG} > 2 ) ? 1 : 0; |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
sub to_utf8 { |
|
33
|
604
|
|
|
604
|
1
|
2158
|
my $str = shift; |
|
34
|
604
|
50
|
|
|
|
1068
|
Carp::cluck("\$str is undefined") unless defined $str; |
|
35
|
|
|
|
|
|
|
|
|
36
|
604
|
|
100
|
|
|
1163
|
my $charset = shift || 'iso-8859-1'; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# checks first |
|
39
|
604
|
100
|
|
|
|
965
|
if ( is_flagged_utf8($str) ) { |
|
40
|
393
|
50
|
|
|
|
617
|
$Debug and carp "string '$str' is flagged utf8 already"; |
|
41
|
393
|
|
|
|
|
1159
|
return $str; |
|
42
|
|
|
|
|
|
|
} |
|
43
|
211
|
100
|
|
|
|
1153
|
if ( is_ascii($str) ) { |
|
44
|
193
|
|
|
|
|
660
|
Encode::_utf8_on($str); |
|
45
|
193
|
50
|
|
|
|
398
|
$Debug and carp "string '$str' is ascii; utf8 flag turned on"; |
|
46
|
193
|
|
|
|
|
1196
|
return $str; |
|
47
|
|
|
|
|
|
|
} |
|
48
|
18
|
100
|
|
|
|
53
|
if ( is_valid_utf8($str) ) { |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# we got here only because the flag was off and it wasn't ascii. |
|
51
|
|
|
|
|
|
|
# however, is_valid_utf8() claims that it is valid internal UTF8, |
|
52
|
|
|
|
|
|
|
# so just turn the flag on. |
|
53
|
6
|
|
|
|
|
19
|
Encode::_utf8_on($str); |
|
54
|
6
|
50
|
|
|
|
14
|
$Debug and carp "string '$str' is valid utf8; utf8 flag turned on"; |
|
55
|
6
|
|
|
|
|
18
|
return $str; |
|
56
|
|
|
|
|
|
|
} |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
$Debug |
|
59
|
12
|
50
|
|
|
|
33
|
and carp "converting $str from $charset -> utf8"; |
|
60
|
12
|
|
|
|
|
73
|
my $c = Encode::decode( $charset, $str ); |
|
61
|
12
|
50
|
|
|
|
1292
|
$Debug and carp "converted $c"; |
|
62
|
|
|
|
|
|
|
|
|
63
|
12
|
50
|
|
|
|
43
|
unless ( is_sane_utf8( $c, 1 ) ) { |
|
64
|
0
|
|
|
|
|
0
|
carp "not sane: $c"; |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
|
|
67
|
12
|
|
|
|
|
153
|
return $c; |
|
68
|
|
|
|
|
|
|
} |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub is_flagged_utf8 { |
|
71
|
611
|
|
|
611
|
1
|
1849
|
return Encode::is_utf8( $_[0] ); |
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my $re_bit = join "|", |
|
75
|
|
|
|
|
|
|
map { Encode::encode( "utf8", chr($_) ) } ( 127 .. 255 ); |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
#binmode STDERR, ":utf8"; |
|
78
|
|
|
|
|
|
|
#print STDERR $re_bit; |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub is_sane_utf8 { |
|
81
|
15
|
|
|
15
|
1
|
46
|
my $string = shift; |
|
82
|
15
|
|
50
|
|
|
52
|
my $warnings = shift || $Debug || 0; |
|
83
|
|
|
|
|
|
|
|
|
84
|
15
|
|
|
|
|
25
|
my $is_insane = 0; |
|
85
|
15
|
|
|
|
|
3154
|
while ( $string =~ /($re_bit)/go ) { |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# work out what the double encoded string was |
|
88
|
0
|
|
|
|
|
0
|
my $bytes = $1; |
|
89
|
|
|
|
|
|
|
|
|
90
|
0
|
|
|
|
|
0
|
my $index = $+[0] - length($bytes); |
|
91
|
0
|
|
|
|
|
0
|
my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, |
|
|
0
|
|
|
|
|
0
|
|
|
92
|
|
|
|
|
|
|
$bytes; |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# what character does that represent? |
|
95
|
0
|
|
|
|
|
0
|
my $char = Encode::decode( "utf8", $bytes ); |
|
96
|
0
|
|
|
|
|
0
|
my $ord = ord($char); |
|
97
|
0
|
|
|
|
|
0
|
my $hex = sprintf '%00x', $ord; |
|
98
|
0
|
|
|
|
|
0
|
$char = charnames::viacode($ord); |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# print out diagnostic messages |
|
101
|
0
|
0
|
|
|
|
0
|
if ($warnings) { |
|
102
|
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
0
|
warn(qq{Found dodgy chars "$codes" at char $index\n}); |
|
104
|
0
|
0
|
|
|
|
0
|
if ( Encode::is_utf8($string) ) { |
|
105
|
0
|
|
|
|
|
0
|
warn("Chars in utf8 string look like utf8 byte sequence."); |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
else { |
|
108
|
0
|
|
|
|
|
0
|
warn("String not flagged as utf8...was it meant to be?\n"); |
|
109
|
|
|
|
|
|
|
} |
|
110
|
0
|
|
|
|
|
0
|
warn( |
|
111
|
|
|
|
|
|
|
"Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n" |
|
112
|
|
|
|
|
|
|
); |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
} |
|
115
|
0
|
|
|
|
|
0
|
$is_insane++; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
15
|
50
|
|
|
|
122
|
return $is_insane ? 0 : 1; |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub is_valid_utf8 { |
|
122
|
39
|
100
|
100
|
39
|
1
|
4963
|
if ( is_latin1( $_[0] ) |
|
|
|
|
100
|
|
|
|
|
|
123
|
|
|
|
|
|
|
&& !is_ascii( $_[0] ) |
|
124
|
|
|
|
|
|
|
&& !is_perl_utf8_string( $_[0] ) ) |
|
125
|
|
|
|
|
|
|
{ |
|
126
|
8
|
|
|
|
|
30
|
return 0; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
31
|
|
|
|
|
155
|
return is_perl_utf8_string( $_[0] ); |
|
129
|
|
|
|
|
|
|
} |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub find_bad_latin1_report { |
|
132
|
2
|
|
|
2
|
1
|
223
|
my $bad = find_bad_latin1(@_); |
|
133
|
2
|
50
|
|
|
|
5
|
if ($bad) { |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# explain why we failed |
|
136
|
2
|
|
|
|
|
5
|
my $char = substr( $_[0], $bad, 1 ); |
|
137
|
2
|
|
|
|
|
3
|
my $dec = ord($char); |
|
138
|
2
|
|
|
|
|
7
|
my $hex = sprintf '%x', $dec; |
|
139
|
2
|
|
|
|
|
277
|
carp("byte $bad ($char) is not Latin1 (it's $dec dec / $hex hex)"); |
|
140
|
|
|
|
|
|
|
} |
|
141
|
2
|
|
|
|
|
16
|
return $bad; |
|
142
|
|
|
|
|
|
|
} |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub looks_like_cp1252 { |
|
145
|
6
|
100
|
33
|
6
|
1
|
59
|
if ( !is_latin1( $_[0] ) |
|
|
|
|
66
|
|
|
|
|
|
146
|
|
|
|
|
|
|
&& !is_ascii( $_[0] ) |
|
147
|
|
|
|
|
|
|
&& $_[0] =~ m/[\x80-\x9f]/ ) |
|
148
|
|
|
|
|
|
|
{ |
|
149
|
5
|
|
|
|
|
22
|
return 1; |
|
150
|
|
|
|
|
|
|
} |
|
151
|
1
|
|
|
|
|
5
|
return 0; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
my %win1252 = ( |
|
155
|
|
|
|
|
|
|
"\x80" => "\x{20AC}", #EURO SIGN |
|
156
|
|
|
|
|
|
|
"\x81" => '', #UNDEFINED |
|
157
|
|
|
|
|
|
|
"\x82" => "\x{201A}", #SINGLE LOW-9 QUOTATION MARK |
|
158
|
|
|
|
|
|
|
"\x83" => "\x{0192}", #LATIN SMALL LETTER F WITH HOOK |
|
159
|
|
|
|
|
|
|
"\x84" => "\x{201E}", #DOUBLE LOW-9 QUOTATION MARK |
|
160
|
|
|
|
|
|
|
"\x85" => "\x{2026}", #HORIZONTAL ELLIPSIS |
|
161
|
|
|
|
|
|
|
"\x86" => "\x{2020}", #DAGGER |
|
162
|
|
|
|
|
|
|
"\x87" => "\x{2021}", #DOUBLE DAGGER |
|
163
|
|
|
|
|
|
|
"\x88" => "\x{02C6}", #MODIFIER LETTER CIRCUMFLEX ACCENT |
|
164
|
|
|
|
|
|
|
"\x89" => "\x{2030}", #PER MILLE SIGN |
|
165
|
|
|
|
|
|
|
"\x8A" => "\x{0160}", #LATIN CAPITAL LETTER S WITH CARON |
|
166
|
|
|
|
|
|
|
"\x8B" => "\x{2039}", #SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
167
|
|
|
|
|
|
|
"\x8C" => "\x{0152}", #LATIN CAPITAL LIGATURE OE |
|
168
|
|
|
|
|
|
|
"\x8D" => '', #UNDEFINED |
|
169
|
|
|
|
|
|
|
"\x8E" => "\x{017D}", #LATIN CAPITAL LETTER Z WITH CARON |
|
170
|
|
|
|
|
|
|
"\x8F" => '', #UNDEFINED |
|
171
|
|
|
|
|
|
|
"\x90" => '', #UNDEFINED |
|
172
|
|
|
|
|
|
|
"\x91" => "\x{2018}", #LEFT SINGLE QUOTATION MARK |
|
173
|
|
|
|
|
|
|
"\x92" => "\x{2019}", #RIGHT SINGLE QUOTATION MARK |
|
174
|
|
|
|
|
|
|
"\x93" => "\x{201C}", #LEFT DOUBLE QUOTATION MARK |
|
175
|
|
|
|
|
|
|
"\x94" => "\x{201D}", #RIGHT DOUBLE QUOTATION MARK |
|
176
|
|
|
|
|
|
|
"\x95" => "\x{2022}", #BULLET |
|
177
|
|
|
|
|
|
|
"\x96" => "\x{2013}", #EN DASH |
|
178
|
|
|
|
|
|
|
"\x97" => "\x{2014}", #EM DASH |
|
179
|
|
|
|
|
|
|
"\x98" => "\x{02DC}", #SMALL TILDE |
|
180
|
|
|
|
|
|
|
"\x99" => "\x{2122}", #TRADE MARK SIGN |
|
181
|
|
|
|
|
|
|
"\x9A" => "\x{0161}", #LATIN SMALL LETTER S WITH CARON |
|
182
|
|
|
|
|
|
|
"\x9B" => "\x{203A}", #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
183
|
|
|
|
|
|
|
"\x9C" => "\x{0153}", #LATIN SMALL LIGATURE OE |
|
184
|
|
|
|
|
|
|
"\x9D" => '', #UNDEFINED |
|
185
|
|
|
|
|
|
|
"\x9E" => "\x{017E}", #LATIN SMALL LETTER Z WITH CARON |
|
186
|
|
|
|
|
|
|
"\x9F" => "\x{0178}", #LATIN CAPITAL LETTER Y WITH DIAERESIS |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
); |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# fix_latin (used in Transliterate) lacks the check for the |
|
191
|
|
|
|
|
|
|
# prefixed \xc2 byte, but the UTF-8 encoding for these |
|
192
|
|
|
|
|
|
|
# Windows codepoints has the leading \xc2 byte. |
|
193
|
|
|
|
|
|
|
sub fix_cp1252_codepoints_in_utf8 { |
|
194
|
1
|
|
|
1
|
1
|
217
|
my $buf = shift; |
|
195
|
1
|
50
|
|
|
|
2
|
unless ( is_valid_utf8($buf) ) { |
|
196
|
0
|
|
|
|
|
0
|
my $badbyte = find_bad_utf8($buf); |
|
197
|
0
|
|
|
|
|
0
|
croak "bad UTF-8 byte(s) at $badbyte [ " . dump($buf) . " ]"; |
|
198
|
|
|
|
|
|
|
} |
|
199
|
1
|
50
|
|
|
|
3
|
$Debug and warn "converting $buf\n"; |
|
200
|
1
|
|
|
|
|
2
|
my $bytes = Encode::encode_utf8( to_utf8($buf) ); |
|
201
|
1
|
|
|
|
|
14
|
$bytes =~ s/\xc2([\x80-\x9f])/$win1252{$1}/g; |
|
202
|
1
|
|
|
|
|
2
|
return to_utf8($bytes); |
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
1; |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
__END__ |