File Coverage

blib/lib/IO/HTML.pm

Criterion	Covered	Total	%
statement	99	103	96.1
branch	75	94	79.7
condition	29	36	80.5
subroutine	14	14	100.0
pod	5	5	100.0
total	222	252	88.1

line	stmt	bran	cond	sub	pod	time	code
1							#---------------------------------------------------------------------
2							package IO::HTML;
3							#
4							# Copyright 2015 Christopher J. Madsen
5							#
6							# Author: Christopher J. Madsen
7							# Created: 14 Jan 2012
8							#
9							# This program is free software; you can redistribute it and/or modify
10							# it under the same terms as Perl itself.
11							#
12							# This program is distributed in the hope that it will be useful,
13							# but WITHOUT ANY WARRANTY; without even the implied warranty of
14							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
15							# GNU General Public License or the Artistic License for more details.
16							#
17							# ABSTRACT: Open an HTML file with automatic charset detection
18							#---------------------------------------------------------------------
19
20	4			4		77457	use 5.008;
	4					14
21	4			4		21	use strict;
	4					8
	4					81
22	4			4		16	use warnings;
	4					7
	4					126
23
24	4			4		15	use Carp 'croak';
	4					8
	4					289
25	4			4		3078	use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
	4					41569
	4					330
26	4			4		24	use Exporter 5.57 'import';
	4					48
	4					6702
27
28							our $VERSION = '1.003'; # TRIAL VERSION
29							# This file is part of IO-HTML 1.003 (September 26, 2015)
30
31
32							our $bytes_to_check \|\|= 1024;
33							our $default_encoding \|\|= 'cp1252';
34
35							our @EXPORT = qw(html_file);
36							our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
37							sniff_encoding);
38
39							our %EXPORT_TAGS = (
40							rw => [qw( html_file html_file_and_encoding html_outfile )],
41							all => [ @EXPORT, @EXPORT_OK ],
42							);
43
44							#=====================================================================
45
46
47							sub html_file
48							{
49	18			18	1	15325	(&html_file_and_encoding)[0]; # return just the filehandle
50							} # end html_file
51
52
53							# Note: I made html_file and html_file_and_encoding separate functions
54							# (instead of making html_file context-sensitive) because I wanted to
55							# use html_file in function calls (i.e. list context) without having
56							# to write "scalar html_file" all the time.
57
58							sub html_file_and_encoding
59							{
60	36			36	1	69001	my ($filename, $options) = @_;
61
62	36		100			160	$options \|\|= {};
63
64	36	50				1199	open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
65
66
67	36					80	my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
68
69	36	100				89	if (not defined $encoding) {
70	8	50				23	croak "No default encoding specified"
71							unless defined($encoding = $default_encoding);
72	8	50				23	$encoding = find_encoding($encoding) if $options->{encoding};
73							} # end if we didn't find an encoding
74
75							binmode $in, sprintf(":encoding(%s):crlf",
76	36	100		2		417	$options->{encoding} ? $encoding->name : $encoding);
	2					21
	2					5
	2					20
77
78	36					5958	return ($in, $encoding, $bom);
79							} # end html_file_and_encoding
80							#---------------------------------------------------------------------
81
82
83							sub html_outfile
84							{
85	6			6	1	9989	my ($filename, $encoding, $bom) = @_;
86
87	6	50				40	if (not defined $encoding) {
		100
88	0	0				0	croak "No default encoding specified"
89							unless defined($encoding = $default_encoding);
90							} # end if we didn't find an encoding
91							elsif (ref $encoding) {
92	1					4	$encoding = $encoding->name;
93							}
94
95	6	50				384	open(my $out, ">:encoding($encoding)", $filename)
96							or croak "Failed to open $filename: $!";
97
98	6	100				36530	print $out "\x{FeFF}" if $bom;
99
100	6					34	return $out;
101							} # end html_outfile
102							#---------------------------------------------------------------------
103
104
105							sub sniff_encoding
106							{
107	72			72	1	13666	my ($in, $filename, $options) = @_;
108
109	72	100				168	$filename = 'file' unless defined $filename;
110	72		100			173	$options \|\|= {};
111
112	72					122	my $pos = tell $in;
113	72	50				139	croak "Could not seek $filename: $!" if $pos < 0;
114
115	72	50				556	croak "Could not read $filename: $!"
116							unless defined read $in, my($buf), $bytes_to_check;
117
118	72	50				201	seek $in, $pos, 0 or croak "Could not seek $filename: $!";
119
120
121							# Check for BOM:
122	72					77	my $bom;
123	72					132	my $encoding = do {
124	72	100				290	if ($buf =~ /^\xFe\xFF/) {
		100
		100
125	4					12	$bom = 2;
126	4					12	'UTF-16BE';
127							} elsif ($buf =~ /^\xFF\xFe/) {
128	8					9	$bom = 2;
129	8					13	'UTF-16LE';
130							} elsif ($buf =~ /^\xEF\xBB\xBF/) {
131	4					5	$bom = 3;
132	4					8	'utf-8-strict';
133							} else {
134	56					106	find_charset_in($buf, $options); # check for
135							}
136							}; # end $encoding
137
138	72	100				219	if ($bom) {
		100
139	16	50				46	seek $in, $bom, 1 or croak "Could not seek $filename: $!";
140	16					21	$bom = 1;
141							}
142							elsif (not defined $encoding) { # try decoding as UTF-8
143	28					105	my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
144	28	100	100			1505	if ($buf =~ /^(?: # nothing left over
145							\| [\xC2-\xDF] # incomplete 2-byte char
146							\| [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char
147							\| [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
148							)\z/x and $test =~ /[^\x00-\x7F]/) {
149	12					23	$encoding = 'utf-8-strict';
150							} # end if valid UTF-8 with at least one multi-byte character:
151							} # end if testing for UTF-8
152
153	72	100	66			273	if (defined $encoding and $options->{encoding} and not ref $encoding) {
			100
154	9					24	$encoding = find_encoding($encoding);
155							} # end if $encoding is a string and we want an object
156
157	72	50				329	return wantarray ? ($encoding, $bom) : $encoding;
158							} # end sniff_encoding
159
160							#=====================================================================
161							# Based on HTML5 8.2.2.2 Determining the character encoding:
162
163							# Get attribute from current position of $_
164							sub _get_attribute
165							{
166	286			286		509	m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
167
168	286	100	100			1447	return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
169
170	94					276	my ($name, $value) = (lc $1, '');
171
172	94	100				341	if (/\G[\x09\x0A\x0C\x0D ]=[\x09\x0A\x0C\x0D ]/gc) {
173	84	100				279	if (/\G"/gc) {
		50
174							# Double-quoted attribute value
175	72					188	/\G([^"]*)("?)/gc;
176	72	100				215	return unless $2; # Incomplete attribute (missing closing quote)
177	63					153	$value = lc $1;
178							} elsif (/\G'/gc) {
179							# Single-quoted attribute value
180	0					0	/\G([^']*)('?)/gc;
181	0	0				0	return unless $2; # Incomplete attribute (missing closing quote)
182	0					0	$value = lc $1;
183							} else {
184							# Unquoted attribute value
185	12					31	/\G([^\x09\x0A\x0C\x0D >]*)/gc;
186	12					29	$value = lc $1;
187							}
188							} # end if attribute has value
189
190	85	50				388	return wantarray ? ($name, $value) : 1;
191							} # end _get_attribute
192
193							# Examine a meta value for a charset:
194							sub _get_charset_from_meta
195							{
196	19			19		36	for (shift) {
197	19					81	while (/charset[\x09\x0A\x0C\x0D ]=[\x09\x0A\x0C\x0D ]/ig) {
198	16	50	33			197	return $1 if (/\G"([^"]*)"/gc or
			33
199							/\G'([^']*)'/gc or
200							/\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
201							}
202							} # end for value
203
204	3					14	return undef;
205							} # end _get_charset_from_meta
206							#---------------------------------------------------------------------
207
208
209							sub find_charset_in
210							{
211	81			81	1	17215	for (shift) {
212	81		100			272	my $options = shift \|\| {};
213							# search only the first $bytes_to_check bytes (default 1024)
214	81	100				196	my $stop = length > $bytes_to_check ? $bytes_to_check : length;
215
216							my $expect_pragma = (defined $options->{need_pragma}
217	81	100				188	? $options->{need_pragma} : 1);
218
219	81					217	pos() = 0;
220	81					244	while (pos() < $stop) {
221	211	100				975	if (/\G