line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Encoding;
|
2
|
2
|
|
|
2
|
|
110542
|
use strict;
|
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
78
|
|
3
|
2
|
|
|
2
|
|
11
|
use warnings;
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
59
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
1848
|
use HTML::Parser qw();
|
|
2
|
|
|
|
|
12786
|
|
|
2
|
|
|
|
|
90
|
|
6
|
2
|
|
|
2
|
|
2049
|
use HTTP::Headers::Util qw(split_header_words);
|
|
2
|
|
|
|
|
1848
|
|
|
2
|
|
|
|
|
179
|
|
7
|
2
|
|
|
2
|
|
2227
|
use Encode qw();
|
|
2
|
|
|
|
|
31735
|
|
|
2
|
|
|
|
|
54
|
|
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
18
|
use base qw(Exporter);
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
7341
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.61';
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
our @EXPORT_OK =
|
14
|
|
|
|
|
|
|
qw/
|
15
|
|
|
|
|
|
|
&encoding_from_meta_element
|
16
|
|
|
|
|
|
|
&xml_declaration_from_octets
|
17
|
|
|
|
|
|
|
&encoding_from_first_chars
|
18
|
|
|
|
|
|
|
&encoding_from_xml_declaration
|
19
|
|
|
|
|
|
|
&encoding_from_byte_order_mark
|
20
|
|
|
|
|
|
|
&encoding_from_content_type
|
21
|
|
|
|
|
|
|
&encoding_from_xml_document
|
22
|
|
|
|
|
|
|
&encoding_from_html_document
|
23
|
|
|
|
|
|
|
&encoding_from_http_message
|
24
|
|
|
|
|
|
|
/;
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our $DEFAULT_ENCODINGS = [qw/
|
27
|
|
|
|
|
|
|
ISO-8859-1
|
28
|
|
|
|
|
|
|
UTF-16LE
|
29
|
|
|
|
|
|
|
UTF-16BE
|
30
|
|
|
|
|
|
|
UTF-32LE
|
31
|
|
|
|
|
|
|
UTF-32BE
|
32
|
|
|
|
|
|
|
UTF-8
|
33
|
|
|
|
|
|
|
/];
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our %MAP =
|
36
|
|
|
|
|
|
|
(
|
37
|
|
|
|
|
|
|
BM => "\x{FEFF}",
|
38
|
|
|
|
|
|
|
CR => "\x{000D}",
|
39
|
|
|
|
|
|
|
LF => "\x{000A}",
|
40
|
|
|
|
|
|
|
SP => "\x{0020}",
|
41
|
|
|
|
|
|
|
TB => "\x{0009}",
|
42
|
|
|
|
|
|
|
QS => "\x{003F}",
|
43
|
|
|
|
|
|
|
NL => "\x{0085}",
|
44
|
|
|
|
|
|
|
LS => "\x{2028}",
|
45
|
|
|
|
|
|
|
LT => "<", # fixme
|
46
|
|
|
|
|
|
|
GT => ">", # fixme
|
47
|
|
|
|
|
|
|
);
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub _my_encode
|
50
|
|
|
|
|
|
|
{
|
51
|
0
|
|
|
0
|
|
0
|
my $seq;
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
eval
|
54
|
0
|
|
|
|
|
0
|
{
|
55
|
0
|
|
|
|
|
0
|
$seq = Encode::encode($_[0],
|
56
|
|
|
|
|
|
|
$_[1],
|
57
|
|
|
|
|
|
|
$_[2]);
|
58
|
|
|
|
|
|
|
};
|
59
|
|
|
|
|
|
|
|
60
|
0
|
0
|
|
|
|
0
|
return $seq unless $@;
|
61
|
0
|
|
|
|
|
0
|
return;
|
62
|
|
|
|
|
|
|
}
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
sub _my_decode
|
65
|
|
|
|
|
|
|
{
|
66
|
0
|
|
|
0
|
|
0
|
my $str;
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
eval
|
69
|
0
|
|
|
|
|
0
|
{
|
70
|
0
|
|
|
|
|
0
|
$str = Encode::decode($_[0],
|
71
|
|
|
|
|
|
|
$_[1],
|
72
|
|
|
|
|
|
|
$_[2]);
|
73
|
|
|
|
|
|
|
};
|
74
|
|
|
|
|
|
|
|
75
|
0
|
0
|
|
|
|
0
|
return $str unless $@;
|
76
|
0
|
|
|
|
|
0
|
return;
|
77
|
|
|
|
|
|
|
}
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub _make_character_map
|
80
|
|
|
|
|
|
|
{
|
81
|
0
|
|
|
0
|
|
0
|
my $encoding = shift;
|
82
|
0
|
|
|
|
|
0
|
my %data;
|
83
|
|
|
|
|
|
|
|
84
|
0
|
|
|
|
|
0
|
foreach my $sym (keys %MAP)
|
85
|
|
|
|
|
|
|
{
|
86
|
0
|
|
|
|
|
0
|
my $seq = _my_encode($encoding, "$MAP{$sym}", Encode::FB_CROAK);
|
87
|
0
|
0
|
|
|
|
0
|
$data{$sym} = $seq if defined $seq;
|
88
|
|
|
|
|
|
|
}
|
89
|
|
|
|
|
|
|
|
90
|
0
|
|
|
|
|
0
|
\%data;
|
91
|
|
|
|
|
|
|
}
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# cache for U+XXXX octet sequences
|
94
|
|
|
|
|
|
|
our %CHARACTER_MAP_CACHE = ();
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub _get_character_map
|
97
|
|
|
|
|
|
|
{
|
98
|
0
|
|
|
0
|
|
0
|
my $encoding = shift;
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# read from cache
|
101
|
0
|
0
|
|
|
|
0
|
return $CHARACTER_MAP_CACHE{$encoding}
|
102
|
|
|
|
|
|
|
if exists $CHARACTER_MAP_CACHE{$encoding};
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# new cache entry
|
105
|
0
|
|
|
|
|
0
|
my $map = _make_character_map($encoding);
|
106
|
0
|
|
|
|
|
0
|
$CHARACTER_MAP_CACHE{$encoding} = $map;
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# return new entry
|
109
|
0
|
|
|
|
|
0
|
return $map;
|
110
|
|
|
|
|
|
|
}
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub encoding_from_meta_element
|
113
|
|
|
|
|
|
|
{
|
114
|
0
|
|
|
0
|
1
|
0
|
my $text = shift;
|
115
|
0
|
|
|
|
|
0
|
my $enco = shift;
|
116
|
|
|
|
|
|
|
|
117
|
0
|
0
|
|
|
|
0
|
return unless defined $text;
|
118
|
0
|
0
|
|
|
|
0
|
return unless length $text;
|
119
|
|
|
|
|
|
|
|
120
|
0
|
0
|
|
|
|
0
|
return unless defined $enco;
|
121
|
0
|
0
|
|
|
|
0
|
return unless length $enco;
|
122
|
|
|
|
|
|
|
|
123
|
0
|
|
|
|
|
0
|
my $pars = HTML::Parser->new
|
124
|
|
|
|
|
|
|
(
|
125
|
|
|
|
|
|
|
api_version => 3,
|
126
|
|
|
|
|
|
|
@_
|
127
|
|
|
|
|
|
|
);
|
128
|
|
|
|
|
|
|
|
129
|
0
|
|
|
|
|
0
|
my $meta = [];
|
130
|
0
|
|
|
|
|
0
|
my $leng = length $text;
|
131
|
0
|
|
|
|
|
0
|
my $size = 8192;
|
132
|
0
|
|
|
|
|
0
|
my $data = '';
|
133
|
0
|
|
|
|
|
0
|
my $utf8 = '';
|
134
|
0
|
|
|
|
|
0
|
my $i = 0;
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# todo: should finish when or logically body//*
|
137
|
|
|
|
|
|
|
|
138
|
0
|
|
|
|
|
0
|
$pars->report_tags(qw/meta head/);
|
139
|
0
|
|
|
|
|
0
|
$pars->handler(start => $meta, "tagname,attr");
|
140
|
|
|
|
|
|
|
$pars->handler
|
141
|
|
|
|
|
|
|
(
|
142
|
0
|
0
|
|
0
|
|
0
|
end => sub { $_[0]->eof if $_[1] eq "head" },
|
143
|
0
|
|
|
|
|
0
|
"self,tagname"
|
144
|
|
|
|
|
|
|
);
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
$pars->parse(sub
|
147
|
|
|
|
|
|
|
{
|
148
|
0
|
0
|
|
0
|
|
0
|
return if $i > $leng;
|
149
|
0
|
|
|
|
|
0
|
$data .= substr $text, $i, $size;
|
150
|
0
|
|
|
|
|
0
|
$i += $size;
|
151
|
0
|
|
|
|
|
0
|
_my_decode($enco, $data, Encode::FB_QUIET);
|
152
|
0
|
|
|
|
|
0
|
});
|
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
0
|
my @resu;
|
155
|
|
|
|
|
|
|
|
156
|
0
|
|
|
|
|
0
|
foreach (grep { $_->[0] eq "meta" } @$meta)
|
|
0
|
|
|
|
|
0
|
|
157
|
|
|
|
|
|
|
{
|
158
|
0
|
|
|
|
|
0
|
my %hash = %{$_->[1]};
|
|
0
|
|
|
|
|
0
|
|
159
|
0
|
0
|
|
|
|
0
|
next unless defined $hash{'content'};
|
160
|
0
|
0
|
|
|
|
0
|
next unless exists $hash{'http-equiv'};
|
161
|
0
|
0
|
|
|
|
0
|
next unless lc $hash{'http-equiv'} eq "content-type";
|
162
|
0
|
|
|
|
|
0
|
my $char = encoding_from_content_type($hash{'content'});
|
163
|
0
|
0
|
0
|
|
|
0
|
push @resu, $char if defined $char and length $char;
|
164
|
|
|
|
|
|
|
}
|
165
|
|
|
|
|
|
|
|
166
|
0
|
0
|
|
|
|
0
|
return unless @resu;
|
167
|
0
|
0
|
|
|
|
0
|
return wantarray ? @resu : $resu[0];
|
168
|
|
|
|
|
|
|
}
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub xml_declaration_from_octets
|
171
|
|
|
|
|
|
|
{
|
172
|
0
|
|
|
0
|
1
|
0
|
my $text = shift;
|
173
|
0
|
|
|
|
|
0
|
my %o = @_;
|
174
|
0
|
|
0
|
|
|
0
|
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
|
175
|
0
|
|
|
|
|
0
|
my %resu;
|
176
|
|
|
|
|
|
|
|
177
|
0
|
0
|
|
|
|
0
|
return unless defined $text;
|
178
|
0
|
0
|
|
|
|
0
|
return unless length $text;
|
179
|
|
|
|
|
|
|
|
180
|
0
|
|
|
|
|
0
|
foreach my $e (@$encodings)
|
181
|
|
|
|
|
|
|
{
|
182
|
0
|
|
|
|
|
0
|
my $map = _get_character_map($e);
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# search for >
|
185
|
0
|
|
|
|
|
0
|
my $end = index $text, $map->{GT};
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# search for
|
188
|
0
|
|
|
|
|
0
|
my $str = index $text, $map->{LT} . $map->{QS};
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# skip this encoding unless ...
|
191
|
0
|
0
|
0
|
|
|
0
|
next unless $end > 0 and $str >= 0 and $end > $str;
|
|
|
|
0
|
|
|
|
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# extract tentative XML declaration
|
194
|
0
|
|
|
|
|
0
|
my $decl = substr $text, $str, $end - $str + 1;
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# decode XML declaration
|
197
|
0
|
|
|
|
|
0
|
my $deco = _my_decode($e, $decl, Encode::FB_CROAK);
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# skip encoding if decoding failed
|
200
|
0
|
0
|
|
|
|
0
|
next unless defined $deco;
|
201
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
0
|
$resu{$deco}++;
|
203
|
|
|
|
|
|
|
}
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# No XML declarations found
|
206
|
0
|
0
|
|
|
|
0
|
return unless keys %resu;
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# sort by number of matches, most match first
|
209
|
0
|
|
|
|
|
0
|
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
|
|
0
|
|
|
|
|
0
|
|
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# in array context return all encodings,
|
212
|
|
|
|
|
|
|
# in scalar context return best match.
|
213
|
0
|
0
|
|
|
|
0
|
return wantarray ? @sort : $sort[0];
|
214
|
|
|
|
|
|
|
}
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub encoding_from_first_chars
|
217
|
|
|
|
|
|
|
{
|
218
|
0
|
|
|
0
|
1
|
0
|
my $text = shift;
|
219
|
0
|
|
|
|
|
0
|
my %o = @_;
|
220
|
0
|
|
0
|
|
|
0
|
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
|
221
|
0
|
|
0
|
|
|
0
|
my $whitespace = $o{whitespace} || [qw/CR LF TB SP/];
|
222
|
|
|
|
|
|
|
|
223
|
0
|
0
|
|
|
|
0
|
return unless defined $text;
|
224
|
0
|
0
|
|
|
|
0
|
return unless length $text;
|
225
|
|
|
|
|
|
|
|
226
|
0
|
|
|
|
|
0
|
my %resu;
|
227
|
0
|
|
|
|
|
0
|
foreach my $e (@$encodings)
|
228
|
|
|
|
|
|
|
{
|
229
|
0
|
|
|
|
|
0
|
my $m = _get_character_map($e);
|
230
|
0
|
|
|
|
|
0
|
my $i = index $text, $m->{LT};
|
231
|
0
|
0
|
|
|
|
0
|
next unless $i >= 0;
|
232
|
0
|
|
|
|
|
0
|
my $t = substr $text, 0, $i;
|
233
|
|
|
|
|
|
|
|
234
|
0
|
|
|
|
|
0
|
my @y;
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# construct \xXX\xXX string from octets, might make sense to
|
237
|
|
|
|
|
|
|
# have this in the map construction process
|
238
|
0
|
|
|
|
|
0
|
push@y,"(?:".join("",map{sprintf"\\x%02x",ord}split//,$m->{$_}).")"
|
239
|
0
|
|
|
|
|
0
|
foreach grep defined, @$whitespace;
|
240
|
|
|
|
|
|
|
|
241
|
0
|
|
|
|
|
0
|
my $x = join "|", @y;
|
242
|
0
|
|
|
|
|
0
|
$t =~ s/^($x)+//g;
|
243
|
|
|
|
|
|
|
|
244
|
0
|
0
|
|
|
|
0
|
$resu{$e} = $i + length $m->{LT} unless length $t;
|
245
|
|
|
|
|
|
|
}
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# ...
|
248
|
0
|
0
|
|
|
|
0
|
return unless keys %resu;
|
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# sort by match length, longest match first
|
251
|
0
|
|
|
|
|
0
|
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
|
|
0
|
|
|
|
|
0
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
# in array context return all encodings,
|
254
|
|
|
|
|
|
|
# in scalar context return best match.
|
255
|
0
|
0
|
|
|
|
0
|
return wantarray ? @sort : $sort[0];
|
256
|
|
|
|
|
|
|
}
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub encoding_from_xml_declaration
|
259
|
|
|
|
|
|
|
{
|
260
|
0
|
|
|
0
|
1
|
0
|
my $decl = shift;
|
261
|
|
|
|
|
|
|
|
262
|
0
|
0
|
|
|
|
0
|
return unless defined $decl;
|
263
|
0
|
0
|
|
|
|
0
|
return unless length $decl;
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# todo: move this to some better place...
|
266
|
0
|
|
|
|
|
0
|
my $ws = qr/[\x09\x85\x20\x0d\x0a\x{2028}]*/;
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# skip if not an XML declaration
|
269
|
0
|
0
|
|
|
|
0
|
return unless $decl =~ /^<\?xml$ws/i;
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# attempt to extract encoding pseudo attribute
|
272
|
0
|
0
|
0
|
|
|
0
|
return unless $decl =~ /encoding$ws=$ws'([^']+)'/i or
|
273
|
|
|
|
|
|
|
$decl =~ /encoding$ws=$ws"([^"]+)"/i;
|
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# no encoding pseudo-attribute
|
276
|
0
|
0
|
|
|
|
0
|
return unless defined $1;
|
277
|
0
|
|
|
|
|
0
|
my $enco = $1;
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# strip leading/trailing whitespace/quotes
|
280
|
0
|
|
|
|
|
0
|
$enco =~ s/^[\s'"]+|[\s'"]+$//g;
|
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# collapse white-space
|
283
|
0
|
|
|
|
|
0
|
$enco =~ s/\s+/ /g;
|
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# treat empty charset as if it were unspecified
|
286
|
0
|
0
|
|
|
|
0
|
return unless length $enco;
|
287
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
0
|
return $enco;
|
289
|
|
|
|
|
|
|
}
|
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
sub encoding_from_byte_order_mark
|
292
|
|
|
|
|
|
|
{
|
293
|
0
|
|
|
0
|
1
|
0
|
my $text = shift;
|
294
|
0
|
|
|
|
|
0
|
my %o = @_;
|
295
|
0
|
|
0
|
|
|
0
|
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
|
296
|
0
|
|
|
|
|
0
|
my %resu;
|
297
|
|
|
|
|
|
|
|
298
|
0
|
0
|
|
|
|
0
|
return unless defined $text;
|
299
|
0
|
0
|
|
|
|
0
|
return unless length $text;
|
300
|
|
|
|
|
|
|
|
301
|
0
|
|
|
|
|
0
|
foreach my $e (@$encodings)
|
302
|
|
|
|
|
|
|
{
|
303
|
0
|
|
|
|
|
0
|
my $map = _get_character_map($e);
|
304
|
0
|
|
|
|
|
0
|
my $bom = $map->{BM};
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# encoding cannot encode U+FEFF
|
307
|
0
|
0
|
|
|
|
0
|
next unless defined $bom;
|
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# remember match length
|
310
|
0
|
0
|
|
|
|
0
|
$resu{$e} = length $bom if $text =~ /^(\Q$bom\E)/;
|
311
|
|
|
|
|
|
|
}
|
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# does not start with BOM
|
314
|
0
|
0
|
|
|
|
0
|
return unless keys %resu;
|
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# sort by match length, longest match first
|
317
|
0
|
|
|
|
|
0
|
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
|
|
0
|
|
|
|
|
0
|
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# in array context return all encodings,
|
320
|
|
|
|
|
|
|
# in scalar context return best match.
|
321
|
0
|
0
|
|
|
|
0
|
return wantarray ? @sort : $sort[0];
|
322
|
|
|
|
|
|
|
}
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
sub encoding_from_content_type
|
325
|
|
|
|
|
|
|
{
|
326
|
42
|
|
|
42
|
1
|
46
|
my $text = shift;
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# nothing to do...
|
329
|
42
|
100
|
66
|
|
|
151
|
return unless defined $text and length $text;
|
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# downgrade Unicode strings
|
332
|
41
|
50
|
|
|
|
109
|
$text = Encode::encode_utf8($text) if Encode::is_utf8($text);
|
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# split parameters, only look at the first set
|
335
|
41
|
|
|
|
|
37
|
my %data = @{(split_header_words($text))[0]};
|
|
41
|
|
|
|
|
98
|
|
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# extract first charset parameter if any
|
338
|
41
|
|
|
|
|
2255
|
my $char;
|
339
|
41
|
|
|
|
|
87
|
foreach my $param (keys %data) {
|
340
|
64
|
100
|
50
|
|
|
178
|
$char = $data{$param} and last if 'charset' eq lc $param;
|
341
|
|
|
|
|
|
|
}
|
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# no charset parameter
|
344
|
41
|
100
|
|
|
|
92
|
return unless defined $char;
|
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# there are no special escapes so just remove \s
|
347
|
39
|
|
|
|
|
50
|
$char =~ tr/\\//d;
|
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# strip leading/trailing whitespace/quotes
|
350
|
39
|
|
|
|
|
146
|
$char =~ s/^[\s'"]+|[\s'"]+$//g;
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# collapse white-space
|
353
|
39
|
|
|
|
|
49
|
$char =~ s/\s+/ /g;
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# treat empty charset as if it were unspecified
|
356
|
39
|
50
|
|
|
|
68
|
return unless length $char;
|
357
|
|
|
|
|
|
|
|
358
|
39
|
|
|
|
|
103
|
return $char
|
359
|
|
|
|
|
|
|
}
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
sub encoding_from_xml_document
|
362
|
|
|
|
|
|
|
{
|
363
|
0
|
|
|
0
|
1
|
0
|
my $text = shift;
|
364
|
0
|
|
|
|
|
0
|
my %o = @_;
|
365
|
0
|
|
0
|
|
|
0
|
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
|
366
|
0
|
|
|
|
|
0
|
my %resu;
|
367
|
|
|
|
|
|
|
|
368
|
0
|
0
|
|
|
|
0
|
return unless defined $text;
|
369
|
0
|
0
|
|
|
|
0
|
return unless length $text;
|
370
|
|
|
|
|
|
|
|
371
|
0
|
|
|
|
|
0
|
my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);
|
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# BOM determines encoding
|
374
|
0
|
0
|
|
|
|
0
|
return wantarray ? (bom => \@boms) : $boms[0] if @boms;
|
|
|
0
|
|
|
|
|
|
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# no BOM
|
377
|
0
|
|
|
|
|
0
|
my @decls = xml_declaration_from_octets($text, encodings => $encodings);
|
378
|
0
|
|
|
|
|
0
|
foreach my $decl (@decls)
|
379
|
|
|
|
|
|
|
{
|
380
|
0
|
|
|
|
|
0
|
my $enco = encoding_from_xml_declaration($decl);
|
381
|
0
|
0
|
0
|
|
|
0
|
$resu{$enco}++ if defined $enco and length $enco;
|
382
|
|
|
|
|
|
|
}
|
383
|
|
|
|
|
|
|
|
384
|
0
|
0
|
|
|
|
0
|
return unless keys %resu;
|
385
|
0
|
|
|
|
|
0
|
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
|
|
0
|
|
|
|
|
0
|
|
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# in array context return all encodings,
|
388
|
|
|
|
|
|
|
# in scalar context return best match.
|
389
|
0
|
0
|
|
|
|
0
|
return wantarray ? (xml => \@sort) : $sort[0];
|
390
|
|
|
|
|
|
|
}
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
sub encoding_from_html_document
|
393
|
|
|
|
|
|
|
{
|
394
|
2
|
|
|
2
|
1
|
25
|
my $text = shift;
|
395
|
2
|
|
|
|
|
6
|
my %o = @_;
|
396
|
2
|
|
33
|
|
|
25
|
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
|
397
|
2
|
|
50
|
|
|
11
|
my $popts = $o{parser_options} || {};
|
398
|
2
|
50
|
|
|
|
7
|
my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1;
|
399
|
|
|
|
|
|
|
|
400
|
2
|
50
|
|
|
|
7
|
return unless defined $text;
|
401
|
2
|
50
|
|
|
|
8
|
return unless length $text;
|
402
|
|
|
|
|
|
|
|
403
|
0
|
0
|
|
|
|
0
|
if ($xhtml)
|
404
|
|
|
|
|
|
|
{
|
405
|
0
|
0
|
|
|
|
0
|
my @xml = wantarray
|
406
|
|
|
|
|
|
|
? encoding_from_xml_document($text, encodings => $encodings)
|
407
|
|
|
|
|
|
|
: scalar encoding_from_xml_document($text, encodings => $encodings);
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
return wantarray
|
410
|
|
|
|
|
|
|
? @xml
|
411
|
0
|
0
|
0
|
|
|
0
|
: $xml[0]
|
|
|
0
|
|
|
|
|
|
412
|
|
|
|
|
|
|
if @xml and defined $xml[0];
|
413
|
|
|
|
|
|
|
}
|
414
|
|
|
|
|
|
|
else
|
415
|
|
|
|
|
|
|
{
|
416
|
0
|
|
|
|
|
0
|
my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);
|
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
# BOM determines encoding
|
419
|
0
|
0
|
|
|
|
0
|
return wantarray ? (bom => \@boms) : $boms[0] if @boms;
|
|
|
0
|
|
|
|
|
|
420
|
|
|
|
|
|
|
}
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# no BOM
|
423
|
0
|
|
|
|
|
0
|
my @resu;
|
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# sanity check to exclude e.g. UTF-32
|
426
|
0
|
|
|
|
|
0
|
my @first = encoding_from_first_chars($text, encodings => $encodings);
|
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# fall back to provided encoding list
|
429
|
0
|
0
|
|
|
|
0
|
@first = @$encodings unless @first;
|
430
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
0
|
foreach my $try (@first)
|
432
|
|
|
|
|
|
|
{
|
433
|
0
|
|
|
|
|
0
|
push @resu, encoding_from_meta_element($text, $try, %$popts);
|
434
|
|
|
|
|
|
|
}
|
435
|
|
|
|
|
|
|
|
436
|
0
|
0
|
|
|
|
0
|
return unless @resu;
|
437
|
0
|
0
|
|
|
|
0
|
return wantarray ? (meta => \@resu) : $resu[0];
|
438
|
|
|
|
|
|
|
}
|
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
sub encoding_from_http_message
|
441
|
|
|
|
|
|
|
{
|
442
|
42
|
|
|
42
|
1
|
35665
|
my $mess = shift;
|
443
|
42
|
|
|
|
|
65
|
my %o = @_;
|
444
|
|
|
|
|
|
|
|
445
|
42
|
|
33
|
|
|
114
|
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
|
446
|
42
|
|
33
|
|
|
216
|
my $is_html = $o{is_html} || qr{^text/html$}i;
|
447
|
42
|
|
33
|
|
|
185
|
my $is_xml = $o{is_xml} || qr{^.+/(?:.+\+)?xml$}i;
|
448
|
42
|
|
33
|
|
|
163
|
my $is_t_xml = $o{is_text_xml} || qr{^text/(?:.+\+)?xml$}i;
|
449
|
42
|
|
50
|
|
|
124
|
my $html_d = $o{html_default} || "ISO-8859-1";
|
450
|
42
|
|
50
|
|
|
107
|
my $xml_d = $o{xml_default} || "UTF-8";
|
451
|
42
|
|
|
|
|
46
|
my $txml = $o{text_xml_default};
|
452
|
|
|
|
|
|
|
|
453
|
42
|
50
|
|
|
|
69
|
my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1;
|
454
|
42
|
50
|
|
|
|
64
|
my $default = exists $o{default} ? $o{default} : 1;
|
455
|
|
|
|
|
|
|
|
456
|
42
|
|
|
|
|
107
|
my $type = $mess->header('Content-Type');
|
457
|
42
|
|
|
|
|
1291
|
my $charset = encoding_from_content_type($type);
|
458
|
|
|
|
|
|
|
|
459
|
42
|
50
|
|
|
|
113
|
if ($mess->content_type =~ $is_xml)
|
460
|
|
|
|
|
|
|
{
|
461
|
0
|
0
|
|
|
|
0
|
return wantarray ? (protocol => $charset) : $charset
|
|
|
0
|
|
|
|
|
|
462
|
|
|
|
|
|
|
if defined $charset;
|
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# special case for text/xml at user option
|
465
|
0
|
0
|
0
|
|
|
0
|
return wantarray ? (protocol_default => $txml) : $txml
|
|
|
0
|
|
|
|
|
|
466
|
|
|
|
|
|
|
if defined $txml and $mess->content_type =~ $is_t_xml;
|
467
|
|
|
|
|
|
|
|
468
|
0
|
0
|
|
|
|
0
|
if (wantarray)
|
469
|
|
|
|
|
|
|
{
|
470
|
0
|
|
|
|
|
0
|
my @xml = encoding_from_xml_document($mess->content, encodings => $encodings);
|
471
|
0
|
0
|
|
|
|
0
|
return @xml if @xml;
|
472
|
|
|
|
|
|
|
}
|
473
|
|
|
|
|
|
|
else
|
474
|
|
|
|
|
|
|
{
|
475
|
0
|
|
|
|
|
0
|
my $xml = scalar encoding_from_xml_document($mess->content, encodings => $encodings);
|
476
|
0
|
0
|
|
|
|
0
|
return $xml if defined $xml;
|
477
|
|
|
|
|
|
|
}
|
478
|
|
|
|
|
|
|
|
479
|
0
|
0
|
|
|
|
0
|
return wantarray ? (default => $xml_d) : $xml_d if defined $default;
|
|
|
0
|
|
|
|
|
|
480
|
|
|
|
|
|
|
}
|
481
|
|
|
|
|
|
|
|
482
|
42
|
100
|
|
|
|
1272
|
if ($mess->content_type =~ $is_html)
|
483
|
|
|
|
|
|
|
{
|
484
|
41
|
50
|
|
|
|
1072
|
return wantarray ? (protocol => $charset) : $charset
|
|
|
100
|
|
|
|
|
|
485
|
|
|
|
|
|
|
if defined $charset;
|
486
|
|
|
|
|
|
|
|
487
|
2
|
50
|
|
|
|
7
|
if (wantarray)
|
488
|
|
|
|
|
|
|
{
|
489
|
0
|
|
|
|
|
0
|
my @html = encoding_from_html_document($mess->content, encodings => $encodings, xhtml => $xhtml);
|
490
|
0
|
0
|
|
|
|
0
|
return @html if @html;
|
491
|
|
|
|
|
|
|
}
|
492
|
|
|
|
|
|
|
else
|
493
|
|
|
|
|
|
|
{
|
494
|
2
|
|
|
|
|
13
|
my $html = scalar encoding_from_html_document($mess->content, encodings => $encodings, xhtml => $xhtml);
|
495
|
2
|
50
|
|
|
|
7
|
return $html if defined $html;
|
496
|
|
|
|
|
|
|
}
|
497
|
|
|
|
|
|
|
|
498
|
2
|
50
|
|
|
|
29
|
return wantarray ? (default => $html_d) : $html_d if defined $default;
|
|
|
50
|
|
|
|
|
|
499
|
|
|
|
|
|
|
}
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
return
|
502
|
1
|
|
|
|
|
18
|
}
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
1;
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
__END__
|