line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Encode::Guess; |
2
|
1
|
|
|
1
|
|
13785
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
3
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
4
|
1
|
|
|
1
|
|
4
|
use Encode qw(:fallbacks find_encoding); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
172
|
|
5
|
|
|
|
|
|
|
our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
my $Canon = 'Guess'; |
8
|
1
|
|
|
1
|
|
6
|
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
110
|
|
9
|
|
|
|
|
|
|
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); |
10
|
|
|
|
|
|
|
my $obj = bless { |
11
|
|
|
|
|
|
|
Name => $Canon, |
12
|
|
|
|
|
|
|
Suspects => {%DEF_SUSPECTS}, |
13
|
|
|
|
|
|
|
} => __PACKAGE__; |
14
|
|
|
|
|
|
|
Encode::define_encoding($obj, $Canon); |
15
|
|
|
|
|
|
|
|
16
|
1
|
|
|
1
|
|
5
|
use parent qw(Encode::Encoding); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
6
|
|
17
|
0
|
|
|
0
|
1
|
0
|
sub needs_lines { 1 } |
18
|
0
|
|
|
0
|
1
|
0
|
sub perlio_ok { 0 } |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our @EXPORT = qw(guess_encoding); |
21
|
|
|
|
|
|
|
our $NoUTFAutoGuess = 0; |
22
|
|
|
|
|
|
|
our $UTF8_BOM = pack( "C3", 0xef, 0xbb, 0xbf ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
sub import { # Exporter not used so we do it on our own |
25
|
1
|
|
|
1
|
|
7
|
my $callpkg = caller; |
26
|
1
|
|
|
|
|
3
|
for my $item (@EXPORT) { |
27
|
1
|
|
|
1
|
|
131
|
no strict 'refs'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
446
|
|
28
|
1
|
|
|
|
|
1
|
*{"$callpkg\::$item"} = \&{"$item"}; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
4
|
|
29
|
|
|
|
|
|
|
} |
30
|
1
|
|
|
|
|
3
|
set_suspects(@_); |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub set_suspects { |
34
|
4
|
|
|
4
|
1
|
1619
|
my $class = shift; |
35
|
4
|
50
|
|
|
|
15
|
my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
36
|
4
|
|
|
|
|
24
|
$self->{Suspects} = {%DEF_SUSPECTS}; |
37
|
4
|
|
|
|
|
18
|
$self->add_suspects(@_); |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub add_suspects { |
41
|
4
|
|
|
4
|
1
|
9
|
my $class = shift; |
42
|
4
|
50
|
|
|
|
11
|
my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
43
|
4
|
|
|
|
|
21
|
for my $c (@_) { |
44
|
7
|
50
|
|
|
|
18
|
my $e = find_encoding($c) or die "Unknown encoding: $c"; |
45
|
7
|
|
|
|
|
37
|
$self->{Suspects}{ $e->name } = $e; |
46
|
7
|
|
|
|
|
18
|
DEBUG and warn "Added: ", $e->name; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub decode($$;$) { |
51
|
2
|
|
|
2
|
1
|
8
|
my ( $obj, $octet, $chk ) = @_; |
52
|
2
|
|
|
|
|
6
|
my $guessed = guess( $obj, $octet ); |
53
|
2
|
50
|
|
|
|
7
|
unless ( ref($guessed) ) { |
54
|
0
|
|
|
|
|
0
|
require Carp; |
55
|
0
|
|
|
|
|
0
|
Carp::croak($guessed); |
56
|
|
|
|
|
|
|
} |
57
|
2
|
|
50
|
|
|
2521
|
my $utf8 = $guessed->decode( $octet, $chk || 0 ); |
58
|
2
|
50
|
|
|
|
9
|
$_[1] = $octet if $chk; |
59
|
2
|
|
|
|
|
29
|
return $utf8; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub guess_encoding { |
63
|
27
|
|
|
27
|
1
|
1131
|
guess( $Encode::Encoding{$Canon}, @_ ); |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub guess { |
67
|
29
|
|
|
29
|
1
|
50
|
my $class = shift; |
68
|
29
|
50
|
|
|
|
79
|
my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; |
69
|
29
|
|
|
|
|
51
|
my $octet = shift; |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# sanity check |
72
|
29
|
100
|
66
|
|
|
148
|
return "Empty string, empty guess" unless defined $octet and length $octet; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# cheat 0: utf8 flag; |
75
|
28
|
100
|
|
|
|
103
|
if ( Encode::is_utf8($octet) ) { |
76
|
1
|
50
|
|
|
|
5
|
return find_encoding('utf8') unless $NoUTFAutoGuess; |
77
|
0
|
|
|
|
|
0
|
Encode::_utf8_off($octet); |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# cheat 1: BOM |
81
|
1
|
|
|
1
|
|
308
|
use Encode::Unicode; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
355
|
|
82
|
27
|
50
|
|
|
|
66
|
unless ($NoUTFAutoGuess) { |
83
|
27
|
|
|
|
|
115
|
my $BOM = pack( 'C3', unpack( "C3", $octet ) ); |
84
|
27
|
50
|
33
|
|
|
121
|
return find_encoding('utf8') |
85
|
|
|
|
|
|
|
if ( defined $BOM and $BOM eq $UTF8_BOM ); |
86
|
27
|
|
|
|
|
63
|
$BOM = unpack( 'N', $octet ); |
87
|
27
|
100
|
66
|
|
|
163
|
return find_encoding('UTF-32') |
|
|
|
33
|
|
|
|
|
88
|
|
|
|
|
|
|
if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) ); |
89
|
26
|
|
|
|
|
48
|
$BOM = unpack( 'n', $octet ); |
90
|
26
|
100
|
66
|
|
|
136
|
return find_encoding('UTF-16') |
|
|
|
33
|
|
|
|
|
91
|
|
|
|
|
|
|
if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) ); |
92
|
25
|
100
|
|
|
|
88
|
if ( $octet =~ /\x00/o ) |
93
|
|
|
|
|
|
|
{ # if \x00 found, we assume UTF-(16|32)(BE|LE) |
94
|
4
|
|
|
|
|
6
|
my $utf; |
95
|
4
|
|
|
|
|
9
|
my ( $be, $le ) = ( 0, 0 ); |
96
|
4
|
100
|
|
|
|
14
|
if ( $octet =~ /\x00\x00/o ) { # UTF-32(BE|LE) assumed |
97
|
2
|
|
|
|
|
5
|
$utf = "UTF-32"; |
98
|
2
|
|
|
|
|
7
|
for my $char ( unpack( 'N*', $octet ) ) { |
99
|
100
|
100
|
|
|
|
170
|
$char & 0x0000ffff and $be++; |
100
|
100
|
100
|
|
|
|
181
|
$char & 0xffff0000 and $le++; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
else { # UTF-16(BE|LE) assumed |
104
|
2
|
|
|
|
|
4
|
$utf = "UTF-16"; |
105
|
2
|
|
|
|
|
8
|
for my $char ( unpack( 'n*', $octet ) ) { |
106
|
100
|
100
|
|
|
|
174
|
$char & 0x00ff and $be++; |
107
|
100
|
100
|
|
|
|
178
|
$char & 0xff00 and $le++; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
} |
110
|
4
|
|
|
|
|
9
|
DEBUG and warn "$utf, be == $be, le == $le"; |
111
|
4
|
50
|
|
|
|
9
|
$be == $le |
112
|
|
|
|
|
|
|
and return |
113
|
|
|
|
|
|
|
"Encodings ambiguous between $utf BE and LE ($be, $le)"; |
114
|
4
|
100
|
|
|
|
10
|
$utf .= ( $be > $le ) ? 'BE' : 'LE'; |
115
|
4
|
|
|
|
|
11
|
return find_encoding($utf); |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
} |
118
|
21
|
|
|
|
|
34
|
my %try = %{ $obj->{Suspects} }; |
|
21
|
|
|
|
|
108
|
|
119
|
21
|
|
|
|
|
58
|
for my $c (@_) { |
120
|
7
|
50
|
|
|
|
20
|
my $e = find_encoding($c) or die "Unknown encoding: $c"; |
121
|
7
|
|
|
|
|
26
|
$try{ $e->name } = $e; |
122
|
7
|
|
|
|
|
13
|
DEBUG and warn "Added: ", $e->name; |
123
|
|
|
|
|
|
|
} |
124
|
21
|
|
|
|
|
35
|
my $nline = 1; |
125
|
21
|
|
|
|
|
1686
|
for my $line ( split /\r\n?|\n/, $octet ) { |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# cheat 2 -- \e in the string |
128
|
69
|
100
|
|
|
|
176
|
if ( $line =~ /\e/o ) { |
129
|
2
|
|
|
|
|
7
|
my @keys = keys %try; |
130
|
2
|
|
|
|
|
7
|
delete @try{qw/utf8 ascii/}; |
131
|
2
|
|
|
|
|
4
|
for my $k (@keys) { |
132
|
8
|
100
|
|
|
|
25
|
ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k}; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
} |
135
|
69
|
|
|
|
|
171
|
my %ok = %try; |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# warn join(",", keys %try); |
138
|
69
|
|
|
|
|
151
|
for my $k ( keys %try ) { |
139
|
211
|
|
|
|
|
310
|
my $scratch = $line; |
140
|
211
|
|
|
|
|
1052
|
$try{$k}->decode( $scratch, FB_QUIET ); |
141
|
211
|
100
|
|
|
|
409
|
if ( $scratch eq '' ) { |
142
|
154
|
|
|
|
|
213
|
DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k ); |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
else { |
145
|
1
|
|
|
1
|
|
7
|
use bytes (); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
151
|
|
146
|
57
|
|
|
|
|
109
|
DEBUG |
147
|
|
|
|
|
|
|
and warn sprintf( "%4d:%-24s not ok; %d bytes left\n", |
148
|
|
|
|
|
|
|
$nline, $k, bytes::length($scratch) ); |
149
|
57
|
|
|
|
|
113
|
delete $ok{$k}; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
} |
152
|
69
|
100
|
|
|
|
167
|
%ok or return "No appropriate encodings found!"; |
153
|
66
|
100
|
|
|
|
137
|
if ( scalar( keys(%ok) ) == 1 ) { |
154
|
14
|
|
|
|
|
25
|
my ($retval) = values(%ok); |
155
|
14
|
|
|
|
|
199
|
return $retval; |
156
|
|
|
|
|
|
|
} |
157
|
52
|
|
|
|
|
180
|
%try = %ok; |
158
|
52
|
|
|
|
|
117
|
$nline++; |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
$try{ascii} |
161
|
4
|
50
|
|
|
|
13
|
or return "Encodings too ambiguous: ", join( " or ", keys %try ); |
162
|
4
|
|
|
|
|
29
|
return $try{ascii}; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
1; |
166
|
|
|
|
|
|
|
__END__ |