line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
2
|
|
|
|
|
|
|
package IO::HTML; |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Copyright 2020 Christopher J. Madsen |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Author: Christopher J. Madsen |
7
|
|
|
|
|
|
|
# Created: 14 Jan 2012 |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
10
|
|
|
|
|
|
|
# it under the same terms as Perl itself. |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
13
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
14
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the |
15
|
|
|
|
|
|
|
# GNU General Public License or the Artistic License for more details. |
16
|
|
|
|
|
|
|
# |
17
|
|
|
|
|
|
|
# ABSTRACT: Open an HTML file with automatic charset detection |
18
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
19
|
|
|
|
|
|
|
|
20
|
4
|
|
|
4
|
|
273882
|
use 5.008; |
|
4
|
|
|
|
|
47
|
|
21
|
4
|
|
|
4
|
|
22
|
use strict; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
113
|
|
22
|
4
|
|
|
4
|
|
37
|
use warnings; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
148
|
|
23
|
|
|
|
|
|
|
|
24
|
4
|
|
|
4
|
|
25
|
use Carp 'croak'; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
284
|
|
25
|
4
|
|
|
4
|
|
2292
|
use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding |
|
4
|
|
|
|
|
41273
|
|
|
4
|
|
|
|
|
323
|
|
26
|
4
|
|
|
4
|
|
32
|
use Exporter 5.57 'import'; |
|
4
|
|
|
|
|
52
|
|
|
4
|
|
|
|
|
7321
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our $VERSION = '1.004'; |
29
|
|
|
|
|
|
|
# This file is part of IO-HTML 1.004 (September 26, 2020) |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
our $bytes_to_check ||= 1024; |
33
|
|
|
|
|
|
|
our $default_encoding ||= 'cp1252'; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our @EXPORT = qw(html_file); |
36
|
|
|
|
|
|
|
our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile |
37
|
|
|
|
|
|
|
sniff_encoding); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
40
|
|
|
|
|
|
|
rw => [qw( html_file html_file_and_encoding html_outfile )], |
41
|
|
|
|
|
|
|
all => [ @EXPORT, @EXPORT_OK ], |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
#===================================================================== |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub html_file |
48
|
|
|
|
|
|
|
{ |
49
|
18
|
|
|
18
|
1
|
22584
|
(&html_file_and_encoding)[0]; # return just the filehandle |
50
|
|
|
|
|
|
|
} # end html_file |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Note: I made html_file and html_file_and_encoding separate functions |
54
|
|
|
|
|
|
|
# (instead of making html_file context-sensitive) because I wanted to |
55
|
|
|
|
|
|
|
# use html_file in function calls (i.e. list context) without having |
56
|
|
|
|
|
|
|
# to write "scalar html_file" all the time. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub html_file_and_encoding |
59
|
|
|
|
|
|
|
{ |
60
|
36
|
|
|
36
|
1
|
28818
|
my ($filename, $options) = @_; |
61
|
|
|
|
|
|
|
|
62
|
36
|
|
100
|
|
|
172
|
$options ||= {}; |
63
|
|
|
|
|
|
|
|
64
|
36
|
50
|
|
|
|
1232
|
open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!"; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
36
|
|
|
|
|
131
|
my ($encoding, $bom) = sniff_encoding($in, $filename, $options); |
68
|
|
|
|
|
|
|
|
69
|
36
|
100
|
|
|
|
76
|
if (not defined $encoding) { |
70
|
8
|
50
|
|
|
|
27
|
croak "No default encoding specified" |
71
|
|
|
|
|
|
|
unless defined($encoding = $default_encoding); |
72
|
8
|
50
|
|
|
|
20
|
$encoding = find_encoding($encoding) if $options->{encoding}; |
73
|
|
|
|
|
|
|
} # end if we didn't find an encoding |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
binmode $in, sprintf(":encoding(%s):crlf", |
76
|
36
|
100
|
|
2
|
|
559
|
$options->{encoding} ? $encoding->name : $encoding); |
|
2
|
|
|
|
|
13
|
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
15
|
|
77
|
|
|
|
|
|
|
|
78
|
36
|
|
|
|
|
2348
|
return ($in, $encoding, $bom); |
79
|
|
|
|
|
|
|
} # end html_file_and_encoding |
80
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub html_outfile |
84
|
|
|
|
|
|
|
{ |
85
|
6
|
|
|
6
|
1
|
23188
|
my ($filename, $encoding, $bom) = @_; |
86
|
|
|
|
|
|
|
|
87
|
6
|
50
|
|
|
|
28
|
if (not defined $encoding) { |
|
|
100
|
|
|
|
|
|
88
|
0
|
0
|
|
|
|
0
|
croak "No default encoding specified" |
89
|
|
|
|
|
|
|
unless defined($encoding = $default_encoding); |
90
|
|
|
|
|
|
|
} # end if we didn't find an encoding |
91
|
|
|
|
|
|
|
elsif (ref $encoding) { |
92
|
1
|
|
|
|
|
5
|
$encoding = $encoding->name; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
6
|
50
|
|
|
|
277
|
open(my $out, ">:encoding($encoding)", $filename) |
96
|
|
|
|
|
|
|
or croak "Failed to open $filename: $!"; |
97
|
|
|
|
|
|
|
|
98
|
6
|
100
|
|
|
|
4342
|
print $out "\x{FeFF}" if $bom; |
99
|
|
|
|
|
|
|
|
100
|
6
|
|
|
|
|
25
|
return $out; |
101
|
|
|
|
|
|
|
} # end html_outfile |
102
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub sniff_encoding |
106
|
|
|
|
|
|
|
{ |
107
|
72
|
|
|
72
|
1
|
19964
|
my ($in, $filename, $options) = @_; |
108
|
|
|
|
|
|
|
|
109
|
72
|
100
|
|
|
|
179
|
$filename = 'file' unless defined $filename; |
110
|
72
|
|
100
|
|
|
186
|
$options ||= {}; |
111
|
|
|
|
|
|
|
|
112
|
72
|
|
|
|
|
167
|
my $pos = tell $in; |
113
|
72
|
50
|
|
|
|
161
|
croak "Could not seek $filename: $!" if $pos < 0; |
114
|
|
|
|
|
|
|
|
115
|
72
|
50
|
|
|
|
702
|
croak "Could not read $filename: $!" |
116
|
|
|
|
|
|
|
unless defined read $in, my($buf), $bytes_to_check; |
117
|
|
|
|
|
|
|
|
118
|
72
|
50
|
|
|
|
378
|
seek $in, $pos, 0 or croak "Could not seek $filename: $!"; |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Check for BOM: |
122
|
72
|
|
|
|
|
114
|
my $bom; |
123
|
72
|
|
|
|
|
86
|
my $encoding = do { |
124
|
72
|
100
|
|
|
|
289
|
if ($buf =~ /^\xFe\xFF/) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
125
|
4
|
|
|
|
|
7
|
$bom = 2; |
126
|
4
|
|
|
|
|
9
|
'UTF-16BE'; |
127
|
|
|
|
|
|
|
} elsif ($buf =~ /^\xFF\xFe/) { |
128
|
8
|
|
|
|
|
11
|
$bom = 2; |
129
|
8
|
|
|
|
|
16
|
'UTF-16LE'; |
130
|
|
|
|
|
|
|
} elsif ($buf =~ /^\xEF\xBB\xBF/) { |
131
|
4
|
|
|
|
|
8
|
$bom = 3; |
132
|
4
|
|
|
|
|
9
|
'utf-8-strict'; |
133
|
|
|
|
|
|
|
} else { |
134
|
56
|
|
|
|
|
131
|
find_charset_in($buf, $options); # check for |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
}; # end $encoding |
137
|
|
|
|
|
|
|
|
138
|
72
|
100
|
|
|
|
214
|
if ($bom) { |
|
|
100
|
|
|
|
|
|
139
|
16
|
50
|
|
|
|
71
|
seek $in, $bom, 1 or croak "Could not seek $filename: $!"; |
140
|
16
|
|
|
|
|
25
|
$bom = 1; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
elsif (not defined $encoding) { # try decoding as UTF-8 |
143
|
28
|
|
|
|
|
82
|
my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET); |
144
|
28
|
100
|
100
|
|
|
1467
|
if ($buf =~ /^(?: # nothing left over |
145
|
|
|
|
|
|
|
| [\xC2-\xDF] # incomplete 2-byte char |
146
|
|
|
|
|
|
|
| [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char |
147
|
|
|
|
|
|
|
| [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char |
148
|
|
|
|
|
|
|
)\z/x and $test =~ /[^\x00-\x7F]/) { |
149
|
12
|
|
|
|
|
23
|
$encoding = 'utf-8-strict'; |
150
|
|
|
|
|
|
|
} # end if valid UTF-8 with at least one multi-byte character: |
151
|
|
|
|
|
|
|
} # end if testing for UTF-8 |
152
|
|
|
|
|
|
|
|
153
|
72
|
100
|
100
|
|
|
284
|
if (defined $encoding and $options->{encoding} and not ref $encoding) { |
|
|
|
100
|
|
|
|
|
154
|
9
|
|
|
|
|
28
|
$encoding = find_encoding($encoding); |
155
|
|
|
|
|
|
|
} # end if $encoding is a string and we want an object |
156
|
|
|
|
|
|
|
|
157
|
72
|
50
|
|
|
|
397
|
return wantarray ? ($encoding, $bom) : $encoding; |
158
|
|
|
|
|
|
|
} # end sniff_encoding |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
#===================================================================== |
161
|
|
|
|
|
|
|
# Based on HTML5 8.2.2.2 Determining the character encoding: |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# Get attribute from current position of $_ |
164
|
|
|
|
|
|
|
sub _get_attribute |
165
|
|
|
|
|
|
|
{ |
166
|
286
|
|
|
286
|
|
502
|
m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or / |
167
|
|
|
|
|
|
|
|
168
|
286
|
100
|
100
|
|
|
1070
|
return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc; |
169
|
|
|
|
|
|
|
|
170
|
94
|
|
|
|
|
321
|
my ($name, $value) = (lc $1, ''); |
171
|
|
|
|
|
|
|
|
172
|
94
|
100
|
|
|
|
330
|
if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc) { |
173
|
84
|
100
|
|
|
|
186
|
if (/\G"/gc) { |
|
|
50
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Double-quoted attribute value |
175
|
72
|
|
|
|
|
176
|
/\G([^"]*)("?)/gc; |
176
|
72
|
100
|
|
|
|
184
|
return unless $2; # Incomplete attribute (missing closing quote) |
177
|
63
|
|
|
|
|
135
|
$value = lc $1; |
178
|
|
|
|
|
|
|
} elsif (/\G'/gc) { |
179
|
|
|
|
|
|
|
# Single-quoted attribute value |
180
|
0
|
|
|
|
|
0
|
/\G([^']*)('?)/gc; |
181
|
0
|
0
|
|
|
|
0
|
return unless $2; # Incomplete attribute (missing closing quote) |
182
|
0
|
|
|
|
|
0
|
$value = lc $1; |
183
|
|
|
|
|
|
|
} else { |
184
|
|
|
|
|
|
|
# Unquoted attribute value |
185
|
12
|
|
|
|
|
25
|
/\G([^\x09\x0A\x0C\x0D >]*)/gc; |
186
|
12
|
|
|
|
|
24
|
$value = lc $1; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
} # end if attribute has value |
189
|
|
|
|
|
|
|
|
190
|
85
|
50
|
|
|
|
301
|
return wantarray ? ($name, $value) : 1; |
191
|
|
|
|
|
|
|
} # end _get_attribute |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Examine a meta value for a charset: |
194
|
|
|
|
|
|
|
sub _get_charset_from_meta |
195
|
|
|
|
|
|
|
{ |
196
|
19
|
|
|
19
|
|
37
|
for (shift) { |
197
|
19
|
|
|
|
|
84
|
while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) { |
198
|
16
|
50
|
33
|
|
|
171
|
return $1 if (/\G"([^"]*)"/gc or |
|
|
|
33
|
|
|
|
|
199
|
|
|
|
|
|
|
/\G'([^']*)'/gc or |
200
|
|
|
|
|
|
|
/\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc); |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
} # end for value |
203
|
|
|
|
|
|
|
|
204
|
3
|
|
|
|
|
12
|
return undef; |
205
|
|
|
|
|
|
|
} # end _get_charset_from_meta |
206
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub find_charset_in |
210
|
|
|
|
|
|
|
{ |
211
|
81
|
|
|
81
|
1
|
14156
|
for (shift) { |
212
|
81
|
|
100
|
|
|
228
|
my $options = shift || {}; |
213
|
|
|
|
|
|
|
# search only the first $bytes_to_check bytes (default 1024) |
214
|
81
|
100
|
|
|
|
182
|
my $stop = length > $bytes_to_check ? $bytes_to_check : length; |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
my $expect_pragma = (defined $options->{need_pragma} |
217
|
81
|
100
|
|
|
|
186
|
? $options->{need_pragma} : 1); |
218
|
|
|
|
|
|
|
|
219
|
81
|
|
|
|
|
230
|
pos() = 0; |
220
|
81
|
|
|
|
|
234
|
while (pos() < $stop) { |
221
|
211
|
100
|
|
|
|
909
|
if (/\G |