| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Encoding::FixLatin; |
|
2
|
|
|
|
|
|
|
{ |
|
3
|
|
|
|
|
|
|
$Encoding::FixLatin::VERSION = '1.04'; |
|
4
|
|
|
|
|
|
|
} |
|
5
|
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
120617
|
use warnings; |
|
|
4
|
|
|
|
|
12
|
|
|
|
4
|
|
|
|
|
140
|
|
|
7
|
4
|
|
|
4
|
|
29
|
use strict; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
186
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require 5.008; |
|
10
|
|
|
|
|
|
|
|
|
11
|
4
|
|
|
4
|
|
26
|
use Carp qw(croak); |
|
|
4
|
|
|
|
|
14
|
|
|
|
4
|
|
|
|
|
283
|
|
|
12
|
4
|
|
|
4
|
|
25
|
use Exporter qw(import); |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
183
|
|
|
13
|
4
|
|
|
4
|
|
12787
|
use Encode qw(is_utf8 encode_utf8); |
|
|
4
|
|
|
|
|
91020
|
|
|
|
4
|
|
|
|
|
9272
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw(fix_latin); |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $xs_loaded = undef; # no attempt to load yet |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
my $byte_map; |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my $ascii_str = qr{\A([\x00-\x7F]+)(.*)\z}s; |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $cont_byte = '[\x80-\xBF]'; |
|
25
|
|
|
|
|
|
|
my $utf8_2 = qr{\A([\xC0-\xDF])($cont_byte)(.*)\z}s; |
|
26
|
|
|
|
|
|
|
my $utf8_3 = qr{\A([\xE0-\xEF])($cont_byte)($cont_byte)(.*)\z}s; |
|
27
|
|
|
|
|
|
|
my $utf8_4 = qr{\A([\xF0-\xF7])($cont_byte)($cont_byte)($cont_byte)(.*)\z}s; |
|
28
|
|
|
|
|
|
|
my $utf8_5 = qr{\A([\xF8-\xFB])($cont_byte)($cont_byte)($cont_byte)($cont_byte)(.*)\z}s; |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my %known_opt = map { $_ => 1 } qw(bytes_only ascii_hex overlong_fatal use_xs); |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
my %non_1252 = ( |
|
33
|
|
|
|
|
|
|
"\x81" => '%81', |
|
34
|
|
|
|
|
|
|
"\x8D" => '%8D', |
|
35
|
|
|
|
|
|
|
"\x8F" => '%8F', |
|
36
|
|
|
|
|
|
|
"\x90" => '%90', |
|
37
|
|
|
|
|
|
|
"\x9D" => '%9D', |
|
38
|
|
|
|
|
|
|
); |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub fix_latin { |
|
41
|
36
|
|
|
36
|
1
|
6874
|
my $input = shift; |
|
42
|
36
|
|
|
|
|
232
|
my %opt = ( |
|
43
|
|
|
|
|
|
|
ascii_hex => 1, |
|
44
|
|
|
|
|
|
|
bytes_only => 0, |
|
45
|
|
|
|
|
|
|
overlong_fatal => 0, |
|
46
|
|
|
|
|
|
|
use_xs => 'auto', |
|
47
|
|
|
|
|
|
|
@_ |
|
48
|
|
|
|
|
|
|
); |
|
49
|
|
|
|
|
|
|
|
|
50
|
36
|
|
|
|
|
1927
|
foreach (keys %opt) { |
|
51
|
143
|
100
|
|
|
|
1557
|
croak "Unknown option '$_'" unless $known_opt{$_}; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
35
|
100
|
|
|
|
135
|
return unless defined($input); |
|
55
|
33
|
100
|
|
|
|
120
|
_init_byte_map(\%opt) unless $byte_map; |
|
56
|
33
|
|
|
|
|
109
|
_init_xs($opt{use_xs}); |
|
57
|
|
|
|
|
|
|
|
|
58
|
33
|
100
|
|
|
|
154
|
if(is_utf8($input)) { # input string already has utf8 flag set |
|
59
|
2
|
100
|
|
|
|
14
|
if($opt{bytes_only}) { |
|
60
|
1
|
|
|
|
|
4
|
return encode_utf8($input); |
|
61
|
|
|
|
|
|
|
} |
|
62
|
|
|
|
|
|
|
else { |
|
63
|
1
|
|
|
|
|
7
|
return $input; |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
|
|
67
|
31
|
100
|
66
|
|
|
113
|
if($xs_loaded and $opt{use_xs} ne 'never') { |
|
68
|
2
|
50
|
|
|
|
10
|
my $olf = $opt{overlong_fatal} ? 1 : 0; |
|
69
|
2
|
50
|
|
|
|
8
|
my $asx = $opt{ascii_hex} ? 1 : 0; |
|
70
|
2
|
|
|
|
|
4
|
local($@); |
|
71
|
2
|
|
|
|
|
27
|
$input = eval { # assign back to $input to avoid copying if all ASCII |
|
72
|
2
|
|
|
|
|
14
|
Encoding::FixLatin::XS::_fix_latin_xs($input, $olf, $asx); |
|
73
|
|
|
|
|
|
|
}; |
|
74
|
2
|
50
|
|
|
|
11
|
if(my $msg = $@) { |
|
75
|
0
|
|
|
|
|
0
|
chomp($msg); |
|
76
|
0
|
|
|
|
|
0
|
croak $msg; |
|
77
|
|
|
|
|
|
|
}; |
|
78
|
2
|
50
|
|
|
|
10
|
if($opt{bytes_only}) { |
|
79
|
0
|
|
|
|
|
0
|
return encode_utf8($input); |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
else { |
|
82
|
2
|
|
|
|
|
19
|
return $input; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
} |
|
85
|
29
|
|
|
|
|
80
|
return _fix_latin_pp($input, \%opt); |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub _fix_latin_pp { |
|
90
|
29
|
|
|
29
|
|
62
|
my($input, $opt) = @_; |
|
91
|
|
|
|
|
|
|
|
|
92
|
29
|
|
|
|
|
45
|
my $output = ''; |
|
93
|
29
|
|
|
|
|
39
|
my $char = ''; |
|
94
|
29
|
|
|
|
|
36
|
my $rest = ''; |
|
95
|
29
|
|
|
|
|
49
|
my $olf = $opt->{overlong_fatal}; |
|
96
|
29
|
|
|
|
|
121
|
while(length($input) > 0) { |
|
97
|
57
|
100
|
|
|
|
907
|
if($input =~ $ascii_str) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
98
|
18
|
|
|
|
|
52
|
$output .= $1; |
|
99
|
18
|
|
|
|
|
43
|
$rest = $2; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
elsif($input =~ $utf8_2) { |
|
102
|
8
|
|
|
|
|
35
|
$output .= _decode_utf8($olf, ord($1) & 0x1F, $1, $2); |
|
103
|
8
|
|
|
|
|
20
|
$rest = $3; |
|
104
|
|
|
|
|
|
|
} |
|
105
|
|
|
|
|
|
|
elsif($input =~ $utf8_3) { |
|
106
|
6
|
|
|
|
|
38
|
$output .= _decode_utf8($olf, ord($1) & 0x0F, $1, $2, $3); |
|
107
|
5
|
|
|
|
|
19
|
$rest = $4; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
elsif($input =~ $utf8_4) { |
|
110
|
2
|
|
|
|
|
15
|
$output .= _decode_utf8($olf, ord($1) & 0x07, $1, $2, $3, $4); |
|
111
|
2
|
|
|
|
|
9
|
$rest = $5; |
|
112
|
|
|
|
|
|
|
} |
|
113
|
|
|
|
|
|
|
elsif($input =~ $utf8_5) { |
|
114
|
1
|
|
|
|
|
7
|
$output .= _decode_utf8($olf, ord($1) & 0x03, $1, $2, $3, $4, $5); |
|
115
|
1
|
|
|
|
|
4
|
$rest = $6; |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
else { |
|
118
|
22
|
|
|
|
|
128
|
($char, $rest) = $input =~ /^(.)(.*)$/s; |
|
119
|
22
|
100
|
100
|
|
|
131
|
if($opt->{ascii_hex} && exists $non_1252{$char}) { |
|
120
|
6
|
|
|
|
|
17
|
$output .= $non_1252{$char}; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
else { |
|
123
|
16
|
|
|
|
|
42
|
$output .= $byte_map->{$char}; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
} |
|
126
|
56
|
|
|
|
|
576
|
$input = $rest; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
28
|
100
|
|
|
|
5163
|
utf8::decode($output) unless $opt->{bytes_only}; |
|
129
|
28
|
|
|
|
|
230
|
return $output; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _decode_utf8 { |
|
134
|
17
|
|
|
17
|
|
31
|
my $overlong_fatal = shift; |
|
135
|
17
|
|
|
|
|
28
|
my $c = shift; |
|
136
|
17
|
|
|
|
|
27
|
my $byte_count = @_; |
|
137
|
17
|
|
|
|
|
781
|
foreach my $i (1..$#_) { |
|
138
|
30
|
|
|
|
|
102
|
$c = ($c << 6) + (ord($_[$i]) & 0x3F); |
|
139
|
|
|
|
|
|
|
} |
|
140
|
17
|
|
|
|
|
93
|
my $bytes = encode_utf8(chr($c)); |
|
141
|
17
|
100
|
66
|
|
|
172
|
if($overlong_fatal and $byte_count > length($bytes)) { |
|
142
|
1
|
|
|
|
|
3
|
my $hex_bytes= join ' ', map { sprintf('%02X', ord($_)) } @_; |
|
|
3
|
|
|
|
|
17
|
|
|
143
|
1
|
|
|
|
|
247
|
croak "Over-long UTF-8 byte sequence: $hex_bytes"; |
|
144
|
|
|
|
|
|
|
} |
|
145
|
16
|
|
|
|
|
45
|
return $bytes; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _init_byte_map { |
|
150
|
4
|
|
|
4
|
|
20
|
foreach my $i (0x80..0xFF) { |
|
151
|
512
|
|
|
|
|
664
|
my $utf_char = chr($i); |
|
152
|
512
|
|
|
|
|
10180
|
utf8::encode($utf_char); |
|
153
|
512
|
|
|
|
|
1662
|
$byte_map->{pack('C', $i)} = $utf_char; |
|
154
|
|
|
|
|
|
|
} |
|
155
|
4
|
|
|
|
|
23
|
_add_cp1252_mappings(); |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
sub _init_xs { |
|
160
|
33
|
|
|
33
|
|
67
|
my($use_xs) = @_; |
|
161
|
|
|
|
|
|
|
|
|
162
|
33
|
100
|
66
|
|
|
132
|
if($use_xs eq 'never' or $xs_loaded) { |
|
163
|
31
|
|
|
|
|
68
|
return; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
2
|
50
|
|
|
|
9
|
if(!defined($xs_loaded)) { |
|
166
|
2
|
|
|
|
|
5
|
local($@); |
|
167
|
2
|
50
|
|
|
|
4
|
$xs_loaded = eval { require 'Encoding/FixLatin/XS.pm' } ? 1 : 0; |
|
|
2
|
|
|
|
|
12505
|
|
|
168
|
|
|
|
|
|
|
} |
|
169
|
2
|
50
|
33
|
|
|
3577
|
if(!$xs_loaded and $use_xs eq 'always') { |
|
170
|
0
|
|
|
|
|
0
|
croak "Failed to load Encoding::FixLatin::XS"; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
sub _add_cp1252_mappings { |
|
176
|
|
|
|
|
|
|
# From http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT |
|
177
|
4
|
|
|
4
|
|
73
|
my %ms_map = ( |
|
178
|
|
|
|
|
|
|
"\x80" => "\xE2\x82\xAC", # EURO SIGN |
|
179
|
|
|
|
|
|
|
"\x82" => "\xE2\x80\x9A", # SINGLE LOW-9 QUOTATION MARK |
|
180
|
|
|
|
|
|
|
"\x83" => "\xC6\x92", # LATIN SMALL LETTER F WITH HOOK |
|
181
|
|
|
|
|
|
|
"\x84" => "\xE2\x80\x9E", # DOUBLE LOW-9 QUOTATION MARK |
|
182
|
|
|
|
|
|
|
"\x85" => "\xE2\x80\xA6", # HORIZONTAL ELLIPSIS |
|
183
|
|
|
|
|
|
|
"\x86" => "\xE2\x80\xA0", # DAGGER |
|
184
|
|
|
|
|
|
|
"\x87" => "\xE2\x80\xA1", # DOUBLE DAGGER |
|
185
|
|
|
|
|
|
|
"\x88" => "\xCB\x86", # MODIFIER LETTER CIRCUMFLEX ACCENT |
|
186
|
|
|
|
|
|
|
"\x89" => "\xE2\x80\xB0", # PER MILLE SIGN |
|
187
|
|
|
|
|
|
|
"\x8A" => "\xC5\xA0", # LATIN CAPITAL LETTER S WITH CARON |
|
188
|
|
|
|
|
|
|
"\x8B" => "\xE2\x80\xB9", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
189
|
|
|
|
|
|
|
"\x8C" => "\xC5\x92", # LATIN CAPITAL LIGATURE OE |
|
190
|
|
|
|
|
|
|
"\x8E" => "\xC5\xBD", # LATIN CAPITAL LETTER Z WITH CARON |
|
191
|
|
|
|
|
|
|
"\x91" => "\xE2\x80\x98", # LEFT SINGLE QUOTATION MARK |
|
192
|
|
|
|
|
|
|
"\x92" => "\xE2\x80\x99", # RIGHT SINGLE QUOTATION MARK |
|
193
|
|
|
|
|
|
|
"\x93" => "\xE2\x80\x9C", # LEFT DOUBLE QUOTATION MARK |
|
194
|
|
|
|
|
|
|
"\x94" => "\xE2\x80\x9D", # RIGHT DOUBLE QUOTATION MARK |
|
195
|
|
|
|
|
|
|
"\x95" => "\xE2\x80\xA2", # BULLET |
|
196
|
|
|
|
|
|
|
"\x96" => "\xE2\x80\x93", # EN DASH |
|
197
|
|
|
|
|
|
|
"\x97" => "\xE2\x80\x94", # EM DASH |
|
198
|
|
|
|
|
|
|
"\x98" => "\xCB\x9C", # SMALL TILDE |
|
199
|
|
|
|
|
|
|
"\x99" => "\xE2\x84\xA2", # TRADE MARK SIGN |
|
200
|
|
|
|
|
|
|
"\x9A" => "\xC5\xA1", # LATIN SMALL LETTER S WITH CARON |
|
201
|
|
|
|
|
|
|
"\x9B" => "\xE2\x80\xBA", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
202
|
|
|
|
|
|
|
"\x9C" => "\xC5\x93", # LATIN SMALL LIGATURE OE |
|
203
|
|
|
|
|
|
|
"\x9E" => "\xC5\xBE", # LATIN SMALL LETTER Z WITH CARON |
|
204
|
|
|
|
|
|
|
"\x9F" => "\xC5\xB8", # LATIN CAPITAL LETTER Y WITH DIAERESIS |
|
205
|
|
|
|
|
|
|
); |
|
206
|
4
|
|
|
|
|
37
|
while(my($k, $v) = each %ms_map) { |
|
207
|
108
|
|
|
|
|
326
|
$byte_map->{$k} = $v; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
1; |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
__END__ |