line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
2
|
|
|
|
|
|
|
package IO::HTML; |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Copyright 2015 Christopher J. Madsen |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Author: Christopher J. Madsen |
7
|
|
|
|
|
|
|
# Created: 14 Jan 2012 |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
10
|
|
|
|
|
|
|
# it under the same terms as Perl itself. |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
13
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
14
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the |
15
|
|
|
|
|
|
|
# GNU General Public License or the Artistic License for more details. |
16
|
|
|
|
|
|
|
# |
17
|
|
|
|
|
|
|
# ABSTRACT: Open an HTML file with automatic charset detection |
18
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
19
|
|
|
|
|
|
|
|
20
|
4
|
|
|
4
|
|
80346
|
use 5.008; |
|
4
|
|
|
|
|
16
|
|
21
|
4
|
|
|
4
|
|
19
|
use strict; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
84
|
|
22
|
4
|
|
|
4
|
|
20
|
use warnings; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
130
|
|
23
|
|
|
|
|
|
|
|
24
|
4
|
|
|
4
|
|
21
|
use Carp 'croak'; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
309
|
|
25
|
4
|
|
|
4
|
|
3250
|
use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding |
|
4
|
|
|
|
|
42665
|
|
|
4
|
|
|
|
|
340
|
|
26
|
4
|
|
|
4
|
|
25
|
use Exporter 5.57 'import'; |
|
4
|
|
|
|
|
49
|
|
|
4
|
|
|
|
|
6086
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our $VERSION = '1.002'; # TRIAL VERSION |
29
|
|
|
|
|
|
|
# This file is part of IO-HTML 1.002 (September 19, 2015) |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
our $bytes_to_check ||= 1024; |
33
|
|
|
|
|
|
|
our $default_encoding ||= 'cp1252'; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our @EXPORT = qw(html_file); |
36
|
|
|
|
|
|
|
our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile |
37
|
|
|
|
|
|
|
sniff_encoding); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
40
|
|
|
|
|
|
|
rw => [qw( html_file html_file_and_encoding html_outfile )], |
41
|
|
|
|
|
|
|
all => [ @EXPORT, @EXPORT_OK ], |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
#===================================================================== |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub html_file |
48
|
|
|
|
|
|
|
{ |
49
|
14
|
|
|
14
|
1
|
19161
|
(&html_file_and_encoding)[0]; # return just the filehandle |
50
|
|
|
|
|
|
|
} # end html_file |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Note: I made html_file and html_file_and_encoding separate functions |
54
|
|
|
|
|
|
|
# (instead of making html_file context-sensitive) because I wanted to |
55
|
|
|
|
|
|
|
# use html_file in function calls (i.e. list context) without having |
56
|
|
|
|
|
|
|
# to write "scalar html_file" all the time. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub html_file_and_encoding |
59
|
|
|
|
|
|
|
{ |
60
|
28
|
|
|
28
|
1
|
1324767
|
my ($filename, $options) = @_; |
61
|
|
|
|
|
|
|
|
62
|
28
|
|
100
|
|
|
161
|
$options ||= {}; |
63
|
|
|
|
|
|
|
|
64
|
28
|
50
|
|
|
|
1263
|
open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!"; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
28
|
|
|
|
|
84
|
my ($encoding, $bom) = sniff_encoding($in, $filename, $options); |
68
|
|
|
|
|
|
|
|
69
|
28
|
100
|
|
|
|
77
|
if (not defined $encoding) { |
70
|
4
|
50
|
|
|
|
14
|
croak "No default encoding specified" |
71
|
|
|
|
|
|
|
unless defined($encoding = $default_encoding); |
72
|
4
|
50
|
|
|
|
14
|
$encoding = find_encoding($encoding) if $options->{encoding}; |
73
|
|
|
|
|
|
|
} # end if we didn't find an encoding |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
binmode $in, sprintf(":encoding(%s):crlf", |
76
|
28
|
100
|
|
2
|
|
328
|
$options->{encoding} ? $encoding->name : $encoding); |
|
2
|
|
|
|
|
14
|
|
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
11
|
|
77
|
|
|
|
|
|
|
|
78
|
28
|
|
|
|
|
5032
|
return ($in, $encoding, $bom); |
79
|
|
|
|
|
|
|
} # end html_file_and_encoding |
80
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub html_outfile |
84
|
|
|
|
|
|
|
{ |
85
|
6
|
|
|
6
|
1
|
6161
|
my ($filename, $encoding, $bom) = @_; |
86
|
|
|
|
|
|
|
|
87
|
6
|
50
|
|
|
|
27
|
if (not defined $encoding) { |
|
|
100
|
|
|
|
|
|
88
|
0
|
0
|
|
|
|
0
|
croak "No default encoding specified" |
89
|
|
|
|
|
|
|
unless defined($encoding = $default_encoding); |
90
|
|
|
|
|
|
|
} # end if we didn't find an encoding |
91
|
|
|
|
|
|
|
elsif (ref $encoding) { |
92
|
1
|
|
|
|
|
5
|
$encoding = $encoding->name; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
6
|
50
|
|
|
|
235
|
open(my $out, ">:encoding($encoding)", $filename) |
96
|
|
|
|
|
|
|
or croak "Failed to open $filename: $!"; |
97
|
|
|
|
|
|
|
|
98
|
6
|
100
|
|
|
|
7264
|
print $out "\x{FeFF}" if $bom; |
99
|
|
|
|
|
|
|
|
100
|
6
|
|
|
|
|
21
|
return $out; |
101
|
|
|
|
|
|
|
} # end html_outfile |
102
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub sniff_encoding |
106
|
|
|
|
|
|
|
{ |
107
|
56
|
|
|
56
|
1
|
16168
|
my ($in, $filename, $options) = @_; |
108
|
|
|
|
|
|
|
|
109
|
56
|
100
|
|
|
|
145
|
$filename = 'file' unless defined $filename; |
110
|
56
|
|
100
|
|
|
147
|
$options ||= {}; |
111
|
|
|
|
|
|
|
|
112
|
56
|
|
|
|
|
110
|
my $pos = tell $in; |
113
|
56
|
50
|
|
|
|
117
|
croak "Could not seek $filename: $!" if $pos < 0; |
114
|
|
|
|
|
|
|
|
115
|
56
|
50
|
|
|
|
484
|
croak "Could not read $filename: $!" |
116
|
|
|
|
|
|
|
unless defined read $in, my($buf), $bytes_to_check; |
117
|
|
|
|
|
|
|
|
118
|
56
|
50
|
|
|
|
177
|
seek $in, $pos, 0 or croak "Could not seek $filename: $!"; |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Check for BOM: |
122
|
56
|
|
|
|
|
64
|
my $bom; |
123
|
56
|
|
|
|
|
58
|
my $encoding = do { |
124
|
56
|
100
|
|
|
|
261
|
if ($buf =~ /^\xFe\xFF/) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
125
|
4
|
|
|
|
|
7
|
$bom = 2; |
126
|
4
|
|
|
|
|
10
|
'UTF-16BE'; |
127
|
|
|
|
|
|
|
} elsif ($buf =~ /^\xFF\xFe/) { |
128
|
8
|
|
|
|
|
10
|
$bom = 2; |
129
|
8
|
|
|
|
|
19
|
'UTF-16LE'; |
130
|
|
|
|
|
|
|
} elsif ($buf =~ /^\xEF\xBB\xBF/) { |
131
|
4
|
|
|
|
|
7
|
$bom = 3; |
132
|
4
|
|
|
|
|
8
|
'utf-8-strict'; |
133
|
|
|
|
|
|
|
} else { |
134
|
40
|
|
|
|
|
163
|
find_charset_in($buf, $options); # check for |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
}; # end $encoding |
137
|
|
|
|
|
|
|
|
138
|
56
|
100
|
|
|
|
204
|
if ($bom) { |
|
|
100
|
|
|
|
|
|
139
|
16
|
50
|
|
|
|
57
|
seek $in, $bom, 1 or croak "Could not seek $filename: $!"; |
140
|
16
|
|
|
|
|
24
|
$bom = 1; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
elsif (not defined $encoding) { # try decoding as UTF-8 |
143
|
20
|
|
|
|
|
106
|
my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET); |
144
|
20
|
100
|
100
|
|
|
1193
|
if ($buf =~ /^(?: # nothing left over |
145
|
|
|
|
|
|
|
| [\xC2-\xDF] # incomplete 2-byte char |
146
|
|
|
|
|
|
|
| [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char |
147
|
|
|
|
|
|
|
| [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char |
148
|
|
|
|
|
|
|
)\z/x and $test =~ /[^\x00-\x7F]/) { |
149
|
12
|
|
|
|
|
27
|
$encoding = 'utf-8-strict'; |
150
|
|
|
|
|
|
|
} # end if valid UTF-8 with at least one multi-byte character: |
151
|
|
|
|
|
|
|
} # end if testing for UTF-8 |
152
|
|
|
|
|
|
|
|
153
|
56
|
100
|
66
|
|
|
268
|
if (defined $encoding and $options->{encoding} and not ref $encoding) { |
|
|
|
100
|
|
|
|
|
154
|
9
|
|
|
|
|
34
|
$encoding = find_encoding($encoding); |
155
|
|
|
|
|
|
|
} # end if $encoding is a string and we want an object |
156
|
|
|
|
|
|
|
|
157
|
56
|
50
|
|
|
|
323
|
return wantarray ? ($encoding, $bom) : $encoding; |
158
|
|
|
|
|
|
|
} # end sniff_encoding |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
#===================================================================== |
161
|
|
|
|
|
|
|
# Based on HTML5 8.2.2.2 Determining the character encoding: |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# Get attribute from current position of $_ |
164
|
|
|
|
|
|
|
sub _get_attribute |
165
|
|
|
|
|
|
|
{ |
166
|
220
|
|
|
220
|
|
446
|
m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or / |
167
|
|
|
|
|
|
|
|
168
|
220
|
100
|
100
|
|
|
1523
|
return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc; |
169
|
|
|
|
|
|
|
|
170
|
69
|
|
|
|
|
224
|
my ($name, $value) = (lc $1, ''); |
171
|
|
|
|
|
|
|
|
172
|
69
|
50
|
66
|
|
|
632
|
if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc |
|
|
|
66
|
|
|
|
|
173
|
|
|
|
|
|
|
and (/\G"([^"]*)"?/gc or |
174
|
|
|
|
|
|
|
/\G'([^']*)'?/gc or |
175
|
|
|
|
|
|
|
/\G([^\x09\x0A\x0C\x0D >]*)/gc)) { |
176
|
66
|
|
|
|
|
198
|
$value = lc $1; |
177
|
|
|
|
|
|
|
} # end if attribute has value |
178
|
|
|
|
|
|
|
|
179
|
69
|
100
|
|
|
|
343
|
return wantarray ? ($name, $value) : 1; |
180
|
|
|
|
|
|
|
} # end _get_attribute |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Examine a meta value for a charset: |
183
|
|
|
|
|
|
|
sub _get_charset_from_meta |
184
|
|
|
|
|
|
|
{ |
185
|
19
|
|
|
19
|
|
47
|
for (shift) { |
186
|
19
|
|
|
|
|
105
|
while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) { |
187
|
16
|
50
|
33
|
|
|
284
|
return $1 if (/\G"([^"]*)"/gc or |
|
|
|
33
|
|
|
|
|
188
|
|
|
|
|
|
|
/\G'([^']*)'/gc or |
189
|
|
|
|
|
|
|
/\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc); |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
} # end for value |
192
|
|
|
|
|
|
|
|
193
|
3
|
|
|
|
|
20
|
return undef; |
194
|
|
|
|
|
|
|
} # end _get_charset_from_meta |
195
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub find_charset_in |
199
|
|
|
|
|
|
|
{ |
200
|
62
|
|
|
62
|
1
|
16238
|
for (shift) { |
201
|
62
|
|
100
|
|
|
228
|
my $options = shift || {}; |
202
|
|
|
|
|
|
|
# search only the first $bytes_to_check bytes (default 1024) |
203
|
62
|
50
|
|
|
|
167
|
my $stop = length > $bytes_to_check ? $bytes_to_check : length; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
my $expect_pragma = (defined $options->{need_pragma} |
206
|
62
|
100
|
|
|
|
147
|
? $options->{need_pragma} : 1); |
207
|
|
|
|
|
|
|
|
208
|
62
|
|
|
|
|
200
|
pos() = 0; |
209
|
62
|
|
|
|
|
212
|
while (pos() < $stop) { |
210
|
158
|
100
|
|
|
|
877
|
if (/\G |