line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Encoding::FixLatin; |
2
|
|
|
|
|
|
|
{ |
3
|
|
|
|
|
|
|
$Encoding::FixLatin::VERSION = '1.04'; |
4
|
|
|
|
|
|
|
} |
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
120617
|
use warnings; |
|
4
|
|
|
|
|
12
|
|
|
4
|
|
|
|
|
140
|
|
7
|
4
|
|
|
4
|
|
29
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
186
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require 5.008; |
10
|
|
|
|
|
|
|
|
11
|
4
|
|
|
4
|
|
26
|
use Carp qw(croak); |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
283
|
|
12
|
4
|
|
|
4
|
|
25
|
use Exporter qw(import); |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
183
|
|
13
|
4
|
|
|
4
|
|
12787
|
use Encode qw(is_utf8 encode_utf8); |
|
4
|
|
|
|
|
91020
|
|
|
4
|
|
|
|
|
9272
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw(fix_latin); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $xs_loaded = undef; # no attempt to load yet |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
my $byte_map; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my $ascii_str = qr{\A([\x00-\x7F]+)(.*)\z}s; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $cont_byte = '[\x80-\xBF]'; |
25
|
|
|
|
|
|
|
my $utf8_2 = qr{\A([\xC0-\xDF])($cont_byte)(.*)\z}s; |
26
|
|
|
|
|
|
|
my $utf8_3 = qr{\A([\xE0-\xEF])($cont_byte)($cont_byte)(.*)\z}s; |
27
|
|
|
|
|
|
|
my $utf8_4 = qr{\A([\xF0-\xF7])($cont_byte)($cont_byte)($cont_byte)(.*)\z}s; |
28
|
|
|
|
|
|
|
my $utf8_5 = qr{\A([\xF8-\xFB])($cont_byte)($cont_byte)($cont_byte)($cont_byte)(.*)\z}s; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my %known_opt = map { $_ => 1 } qw(bytes_only ascii_hex overlong_fatal use_xs); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
my %non_1252 = ( |
33
|
|
|
|
|
|
|
"\x81" => '%81', |
34
|
|
|
|
|
|
|
"\x8D" => '%8D', |
35
|
|
|
|
|
|
|
"\x8F" => '%8F', |
36
|
|
|
|
|
|
|
"\x90" => '%90', |
37
|
|
|
|
|
|
|
"\x9D" => '%9D', |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub fix_latin { |
41
|
36
|
|
|
36
|
1
|
6874
|
my $input = shift; |
42
|
36
|
|
|
|
|
232
|
my %opt = ( |
43
|
|
|
|
|
|
|
ascii_hex => 1, |
44
|
|
|
|
|
|
|
bytes_only => 0, |
45
|
|
|
|
|
|
|
overlong_fatal => 0, |
46
|
|
|
|
|
|
|
use_xs => 'auto', |
47
|
|
|
|
|
|
|
@_ |
48
|
|
|
|
|
|
|
); |
49
|
|
|
|
|
|
|
|
50
|
36
|
|
|
|
|
1927
|
foreach (keys %opt) { |
51
|
143
|
100
|
|
|
|
1557
|
croak "Unknown option '$_'" unless $known_opt{$_}; |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
35
|
100
|
|
|
|
135
|
return unless defined($input); |
55
|
33
|
100
|
|
|
|
120
|
_init_byte_map(\%opt) unless $byte_map; |
56
|
33
|
|
|
|
|
109
|
_init_xs($opt{use_xs}); |
57
|
|
|
|
|
|
|
|
58
|
33
|
100
|
|
|
|
154
|
if(is_utf8($input)) { # input string already has utf8 flag set |
59
|
2
|
100
|
|
|
|
14
|
if($opt{bytes_only}) { |
60
|
1
|
|
|
|
|
4
|
return encode_utf8($input); |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
else { |
63
|
1
|
|
|
|
|
7
|
return $input; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
31
|
100
|
66
|
|
|
113
|
if($xs_loaded and $opt{use_xs} ne 'never') { |
68
|
2
|
50
|
|
|
|
10
|
my $olf = $opt{overlong_fatal} ? 1 : 0; |
69
|
2
|
50
|
|
|
|
8
|
my $asx = $opt{ascii_hex} ? 1 : 0; |
70
|
2
|
|
|
|
|
4
|
local($@); |
71
|
2
|
|
|
|
|
27
|
$input = eval { # assign back to $input to avoid copying if all ASCII |
72
|
2
|
|
|
|
|
14
|
Encoding::FixLatin::XS::_fix_latin_xs($input, $olf, $asx); |
73
|
|
|
|
|
|
|
}; |
74
|
2
|
50
|
|
|
|
11
|
if(my $msg = $@) { |
75
|
0
|
|
|
|
|
0
|
chomp($msg); |
76
|
0
|
|
|
|
|
0
|
croak $msg; |
77
|
|
|
|
|
|
|
}; |
78
|
2
|
50
|
|
|
|
10
|
if($opt{bytes_only}) { |
79
|
0
|
|
|
|
|
0
|
return encode_utf8($input); |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
else { |
82
|
2
|
|
|
|
|
19
|
return $input; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
} |
85
|
29
|
|
|
|
|
80
|
return _fix_latin_pp($input, \%opt); |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub _fix_latin_pp { |
90
|
29
|
|
|
29
|
|
62
|
my($input, $opt) = @_; |
91
|
|
|
|
|
|
|
|
92
|
29
|
|
|
|
|
45
|
my $output = ''; |
93
|
29
|
|
|
|
|
39
|
my $char = ''; |
94
|
29
|
|
|
|
|
36
|
my $rest = ''; |
95
|
29
|
|
|
|
|
49
|
my $olf = $opt->{overlong_fatal}; |
96
|
29
|
|
|
|
|
121
|
while(length($input) > 0) { |
97
|
57
|
100
|
|
|
|
907
|
if($input =~ $ascii_str) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
98
|
18
|
|
|
|
|
52
|
$output .= $1; |
99
|
18
|
|
|
|
|
43
|
$rest = $2; |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
elsif($input =~ $utf8_2) { |
102
|
8
|
|
|
|
|
35
|
$output .= _decode_utf8($olf, ord($1) & 0x1F, $1, $2); |
103
|
8
|
|
|
|
|
20
|
$rest = $3; |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
elsif($input =~ $utf8_3) { |
106
|
6
|
|
|
|
|
38
|
$output .= _decode_utf8($olf, ord($1) & 0x0F, $1, $2, $3); |
107
|
5
|
|
|
|
|
19
|
$rest = $4; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
elsif($input =~ $utf8_4) { |
110
|
2
|
|
|
|
|
15
|
$output .= _decode_utf8($olf, ord($1) & 0x07, $1, $2, $3, $4); |
111
|
2
|
|
|
|
|
9
|
$rest = $5; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
elsif($input =~ $utf8_5) { |
114
|
1
|
|
|
|
|
7
|
$output .= _decode_utf8($olf, ord($1) & 0x03, $1, $2, $3, $4, $5); |
115
|
1
|
|
|
|
|
4
|
$rest = $6; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
else { |
118
|
22
|
|
|
|
|
128
|
($char, $rest) = $input =~ /^(.)(.*)$/s; |
119
|
22
|
100
|
100
|
|
|
131
|
if($opt->{ascii_hex} && exists $non_1252{$char}) { |
120
|
6
|
|
|
|
|
17
|
$output .= $non_1252{$char}; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
else { |
123
|
16
|
|
|
|
|
42
|
$output .= $byte_map->{$char}; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
} |
126
|
56
|
|
|
|
|
576
|
$input = $rest; |
127
|
|
|
|
|
|
|
} |
128
|
28
|
100
|
|
|
|
5163
|
utf8::decode($output) unless $opt->{bytes_only}; |
129
|
28
|
|
|
|
|
230
|
return $output; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _decode_utf8 { |
134
|
17
|
|
|
17
|
|
31
|
my $overlong_fatal = shift; |
135
|
17
|
|
|
|
|
28
|
my $c = shift; |
136
|
17
|
|
|
|
|
27
|
my $byte_count = @_; |
137
|
17
|
|
|
|
|
781
|
foreach my $i (1..$#_) { |
138
|
30
|
|
|
|
|
102
|
$c = ($c << 6) + (ord($_[$i]) & 0x3F); |
139
|
|
|
|
|
|
|
} |
140
|
17
|
|
|
|
|
93
|
my $bytes = encode_utf8(chr($c)); |
141
|
17
|
100
|
66
|
|
|
172
|
if($overlong_fatal and $byte_count > length($bytes)) { |
142
|
1
|
|
|
|
|
3
|
my $hex_bytes= join ' ', map { sprintf('%02X', ord($_)) } @_; |
|
3
|
|
|
|
|
17
|
|
143
|
1
|
|
|
|
|
247
|
croak "Over-long UTF-8 byte sequence: $hex_bytes"; |
144
|
|
|
|
|
|
|
} |
145
|
16
|
|
|
|
|
45
|
return $bytes; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _init_byte_map { |
150
|
4
|
|
|
4
|
|
20
|
foreach my $i (0x80..0xFF) { |
151
|
512
|
|
|
|
|
664
|
my $utf_char = chr($i); |
152
|
512
|
|
|
|
|
10180
|
utf8::encode($utf_char); |
153
|
512
|
|
|
|
|
1662
|
$byte_map->{pack('C', $i)} = $utf_char; |
154
|
|
|
|
|
|
|
} |
155
|
4
|
|
|
|
|
23
|
_add_cp1252_mappings(); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
sub _init_xs { |
160
|
33
|
|
|
33
|
|
67
|
my($use_xs) = @_; |
161
|
|
|
|
|
|
|
|
162
|
33
|
100
|
66
|
|
|
132
|
if($use_xs eq 'never' or $xs_loaded) { |
163
|
31
|
|
|
|
|
68
|
return; |
164
|
|
|
|
|
|
|
} |
165
|
2
|
50
|
|
|
|
9
|
if(!defined($xs_loaded)) { |
166
|
2
|
|
|
|
|
5
|
local($@); |
167
|
2
|
50
|
|
|
|
4
|
$xs_loaded = eval { require 'Encoding/FixLatin/XS.pm' } ? 1 : 0; |
|
2
|
|
|
|
|
12505
|
|
168
|
|
|
|
|
|
|
} |
169
|
2
|
50
|
33
|
|
|
3577
|
if(!$xs_loaded and $use_xs eq 'always') { |
170
|
0
|
|
|
|
|
0
|
croak "Failed to load Encoding::FixLatin::XS"; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
sub _add_cp1252_mappings { |
176
|
|
|
|
|
|
|
# From http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT |
177
|
4
|
|
|
4
|
|
73
|
my %ms_map = ( |
178
|
|
|
|
|
|
|
"\x80" => "\xE2\x82\xAC", # EURO SIGN |
179
|
|
|
|
|
|
|
"\x82" => "\xE2\x80\x9A", # SINGLE LOW-9 QUOTATION MARK |
180
|
|
|
|
|
|
|
"\x83" => "\xC6\x92", # LATIN SMALL LETTER F WITH HOOK |
181
|
|
|
|
|
|
|
"\x84" => "\xE2\x80\x9E", # DOUBLE LOW-9 QUOTATION MARK |
182
|
|
|
|
|
|
|
"\x85" => "\xE2\x80\xA6", # HORIZONTAL ELLIPSIS |
183
|
|
|
|
|
|
|
"\x86" => "\xE2\x80\xA0", # DAGGER |
184
|
|
|
|
|
|
|
"\x87" => "\xE2\x80\xA1", # DOUBLE DAGGER |
185
|
|
|
|
|
|
|
"\x88" => "\xCB\x86", # MODIFIER LETTER CIRCUMFLEX ACCENT |
186
|
|
|
|
|
|
|
"\x89" => "\xE2\x80\xB0", # PER MILLE SIGN |
187
|
|
|
|
|
|
|
"\x8A" => "\xC5\xA0", # LATIN CAPITAL LETTER S WITH CARON |
188
|
|
|
|
|
|
|
"\x8B" => "\xE2\x80\xB9", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
189
|
|
|
|
|
|
|
"\x8C" => "\xC5\x92", # LATIN CAPITAL LIGATURE OE |
190
|
|
|
|
|
|
|
"\x8E" => "\xC5\xBD", # LATIN CAPITAL LETTER Z WITH CARON |
191
|
|
|
|
|
|
|
"\x91" => "\xE2\x80\x98", # LEFT SINGLE QUOTATION MARK |
192
|
|
|
|
|
|
|
"\x92" => "\xE2\x80\x99", # RIGHT SINGLE QUOTATION MARK |
193
|
|
|
|
|
|
|
"\x93" => "\xE2\x80\x9C", # LEFT DOUBLE QUOTATION MARK |
194
|
|
|
|
|
|
|
"\x94" => "\xE2\x80\x9D", # RIGHT DOUBLE QUOTATION MARK |
195
|
|
|
|
|
|
|
"\x95" => "\xE2\x80\xA2", # BULLET |
196
|
|
|
|
|
|
|
"\x96" => "\xE2\x80\x93", # EN DASH |
197
|
|
|
|
|
|
|
"\x97" => "\xE2\x80\x94", # EM DASH |
198
|
|
|
|
|
|
|
"\x98" => "\xCB\x9C", # SMALL TILDE |
199
|
|
|
|
|
|
|
"\x99" => "\xE2\x84\xA2", # TRADE MARK SIGN |
200
|
|
|
|
|
|
|
"\x9A" => "\xC5\xA1", # LATIN SMALL LETTER S WITH CARON |
201
|
|
|
|
|
|
|
"\x9B" => "\xE2\x80\xBA", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
202
|
|
|
|
|
|
|
"\x9C" => "\xC5\x93", # LATIN SMALL LIGATURE OE |
203
|
|
|
|
|
|
|
"\x9E" => "\xC5\xBE", # LATIN SMALL LETTER Z WITH CARON |
204
|
|
|
|
|
|
|
"\x9F" => "\xC5\xB8", # LATIN CAPITAL LETTER Y WITH DIAERESIS |
205
|
|
|
|
|
|
|
); |
206
|
4
|
|
|
|
|
37
|
while(my($k, $v) = each %ms_map) { |
207
|
108
|
|
|
|
|
326
|
$byte_map->{$k} = $v; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
1; |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
__END__ |