File Coverage

blib/lib/IO/HTML.pm

Criterion	Covered	Total	%
statement	94	95	98.9
branch	66	86	76.7
condition	33	42	78.5
subroutine	14	14	100.0
pod	5	5	100.0
total	212	242	87.6

line	stmt	bran	cond	sub	pod	time	code
1							#---------------------------------------------------------------------
2							package IO::HTML;
3							#
4							# Copyright 2015 Christopher J. Madsen
5							#
6							# Author: Christopher J. Madsen
7							# Created: 14 Jan 2012
8							#
9							# This program is free software; you can redistribute it and/or modify
10							# it under the same terms as Perl itself.
11							#
12							# This program is distributed in the hope that it will be useful,
13							# but WITHOUT ANY WARRANTY; without even the implied warranty of
14							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
15							# GNU General Public License or the Artistic License for more details.
16							#
17							# ABSTRACT: Open an HTML file with automatic charset detection
18							#---------------------------------------------------------------------
19
20	4			4		80346	use 5.008;
	4					16
21	4			4		19	use strict;
	4					6
	4					84
22	4			4		20	use warnings;
	4					8
	4					130
23
24	4			4		21	use Carp 'croak';
	4					7
	4					309
25	4			4		3250	use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
	4					42665
	4					340
26	4			4		25	use Exporter 5.57 'import';
	4					49
	4					6086
27
28							our $VERSION = '1.002'; # TRIAL VERSION
29							# This file is part of IO-HTML 1.002 (September 19, 2015)
30
31
32							our $bytes_to_check \|\|= 1024;
33							our $default_encoding \|\|= 'cp1252';
34
35							our @EXPORT = qw(html_file);
36							our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
37							sniff_encoding);
38
39							our %EXPORT_TAGS = (
40							rw => [qw( html_file html_file_and_encoding html_outfile )],
41							all => [ @EXPORT, @EXPORT_OK ],
42							);
43
44							#=====================================================================
45
46
47							sub html_file
48							{
49	14			14	1	19161	(&html_file_and_encoding)[0]; # return just the filehandle
50							} # end html_file
51
52
53							# Note: I made html_file and html_file_and_encoding separate functions
54							# (instead of making html_file context-sensitive) because I wanted to
55							# use html_file in function calls (i.e. list context) without having
56							# to write "scalar html_file" all the time.
57
58							sub html_file_and_encoding
59							{
60	28			28	1	1324767	my ($filename, $options) = @_;
61
62	28		100			161	$options \|\|= {};
63
64	28	50				1263	open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
65
66
67	28					84	my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
68
69	28	100				77	if (not defined $encoding) {
70	4	50				14	croak "No default encoding specified"
71							unless defined($encoding = $default_encoding);
72	4	50				14	$encoding = find_encoding($encoding) if $options->{encoding};
73							} # end if we didn't find an encoding
74
75							binmode $in, sprintf(":encoding(%s):crlf",
76	28	100		2		328	$options->{encoding} ? $encoding->name : $encoding);
	2					14
	2					3
	2					11
77
78	28					5032	return ($in, $encoding, $bom);
79							} # end html_file_and_encoding
80							#---------------------------------------------------------------------
81
82
83							sub html_outfile
84							{
85	6			6	1	6161	my ($filename, $encoding, $bom) = @_;
86
87	6	50				27	if (not defined $encoding) {
		100
88	0	0				0	croak "No default encoding specified"
89							unless defined($encoding = $default_encoding);
90							} # end if we didn't find an encoding
91							elsif (ref $encoding) {
92	1					5	$encoding = $encoding->name;
93							}
94
95	6	50				235	open(my $out, ">:encoding($encoding)", $filename)
96							or croak "Failed to open $filename: $!";
97
98	6	100				7264	print $out "\x{FeFF}" if $bom;
99
100	6					21	return $out;
101							} # end html_outfile
102							#---------------------------------------------------------------------
103
104
105							sub sniff_encoding
106							{
107	56			56	1	16168	my ($in, $filename, $options) = @_;
108
109	56	100				145	$filename = 'file' unless defined $filename;
110	56		100			147	$options \|\|= {};
111
112	56					110	my $pos = tell $in;
113	56	50				117	croak "Could not seek $filename: $!" if $pos < 0;
114
115	56	50				484	croak "Could not read $filename: $!"
116							unless defined read $in, my($buf), $bytes_to_check;
117
118	56	50				177	seek $in, $pos, 0 or croak "Could not seek $filename: $!";
119
120
121							# Check for BOM:
122	56					64	my $bom;
123	56					58	my $encoding = do {
124	56	100				261	if ($buf =~ /^\xFe\xFF/) {
		100
		100
125	4					7	$bom = 2;
126	4					10	'UTF-16BE';
127							} elsif ($buf =~ /^\xFF\xFe/) {
128	8					10	$bom = 2;
129	8					19	'UTF-16LE';
130							} elsif ($buf =~ /^\xEF\xBB\xBF/) {
131	4					7	$bom = 3;
132	4					8	'utf-8-strict';
133							} else {
134	40					163	find_charset_in($buf, $options); # check for
135							}
136							}; # end $encoding
137
138	56	100				204	if ($bom) {
		100
139	16	50				57	seek $in, $bom, 1 or croak "Could not seek $filename: $!";
140	16					24	$bom = 1;
141							}
142							elsif (not defined $encoding) { # try decoding as UTF-8
143	20					106	my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
144	20	100	100			1193	if ($buf =~ /^(?: # nothing left over
145							\| [\xC2-\xDF] # incomplete 2-byte char
146							\| [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char
147							\| [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
148							)\z/x and $test =~ /[^\x00-\x7F]/) {
149	12					27	$encoding = 'utf-8-strict';
150							} # end if valid UTF-8 with at least one multi-byte character:
151							} # end if testing for UTF-8
152
153	56	100	66			268	if (defined $encoding and $options->{encoding} and not ref $encoding) {
			100
154	9					34	$encoding = find_encoding($encoding);
155							} # end if $encoding is a string and we want an object
156
157	56	50				323	return wantarray ? ($encoding, $bom) : $encoding;
158							} # end sniff_encoding
159
160							#=====================================================================
161							# Based on HTML5 8.2.2.2 Determining the character encoding:
162
163							# Get attribute from current position of $_
164							sub _get_attribute
165							{
166	220			220		446	m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
167
168	220	100	100			1523	return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
169
170	69					224	my ($name, $value) = (lc $1, '');
171
172	69	50	66			632	if (/\G[\x09\x0A\x0C\x0D ]=[\x09\x0A\x0C\x0D ]/gc
			66
173							and (/\G"([^"]*)"?/gc or
174							/\G'([^']*)'?/gc or
175							/\G([^\x09\x0A\x0C\x0D >]*)/gc)) {
176	66					198	$value = lc $1;
177							} # end if attribute has value
178
179	69	100				343	return wantarray ? ($name, $value) : 1;
180							} # end _get_attribute
181
182							# Examine a meta value for a charset:
183							sub _get_charset_from_meta
184							{
185	19			19		47	for (shift) {
186	19					105	while (/charset[\x09\x0A\x0C\x0D ]=[\x09\x0A\x0C\x0D ]/ig) {
187	16	50	33			284	return $1 if (/\G"([^"]*)"/gc or
			33
188							/\G'([^']*)'/gc or
189							/\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
190							}
191							} # end for value
192
193	3					20	return undef;
194							} # end _get_charset_from_meta
195							#---------------------------------------------------------------------
196
197
198							sub find_charset_in
199							{
200	62			62	1	16238	for (shift) {
201	62		100			228	my $options = shift \|\| {};
202							# search only the first $bytes_to_check bytes (default 1024)
203	62	50				167	my $stop = length > $bytes_to_check ? $bytes_to_check : length;
204
205							my $expect_pragma = (defined $options->{need_pragma}
206	62	100				147	? $options->{need_pragma} : 1);
207
208	62					200	pos() = 0;
209	62					212	while (pos() < $stop) {
210	158	100				877	if (/\G