File Coverage

lib/HTML/Encapsulate.pm

Criterion	Covered	Total	%
statement	43	45	95.5
branch			n/a
condition			n/a
subroutine	15	15	100.0
pod			n/a
total	58	60	96.6

line	stmt	sub	time	code
1				package HTML::Encapsulate;
2	2	2	16026	use warnings;
	2		5
	2		63
3	2	2	10	use strict;
	2		2
	2		52
4	2	2	28	use Carp;
	2		4
	2		171
5	2	2	11	use PerlIO;
	2		2
	2		15
6	2	2	62	use File::Path qw(mkpath);
	2		4
	2		95
7	2	2	11	use File::Spec;
	2		3
	2		54
8	2	2	10	use File::Spec::Unix;
	2		3
	2		66
9	2	2	9	use Carp qw(croak carp cluck confess);
	2		4
	2		92
10	2	2	16	use Exporter;
	2		3
	2		75
11	2	2	1124	use LWP::UserAgent;
	2		45277
	2		54
12	2	2	1827	use HTML::TreeBuilder::XPath;
	2		149136
	2		27
13	2	2	85	use Scalar::Util qw(blessed);
	2		6
	2		174
14	2	2	11	use URI;
	2		4
	2		57
15	2	2	11	use HTML::Entities qw(decode_entities encode_entities);
	2		3
	2		126
16	2	2	902	use HTML::Tidy;
	0
	0
17				use HTTP::Response::Encoding;
18				use HTML::HeadParser;
19				use HTTP::Headers::Util;
20
21				use version; our $VERSION = qv('0.3');
22
23
24				# We don't want to inherit Exporter, we can't always import the import
25				# method, so this is a workaround.
26				sub import { goto &Exporter::import }
27
28
29
30				our @EXPORT_OK = qw(download);
31
32				our %TIDY_OPTIONS = (lower_literals => 1,
33				show_errors => 0,
34				show_warnings => 0,
35				tidy_mark => 0);
36
37
38				# url(blah) or url( 'blah' ) etc.
39				my $QUOTED_STR = qr/ " ([^"]) " \| ' ([^']) ' /x;
40
41				my $URL_RE = qr/ url \s* \(
42				\s* (?: $QUOTED_STR \| (.?) ) \s
43				\)
44				/ix;
45
46				my $IMPORT_RE = qr/
47				\@import (?:
48				\s+ $URL_RE \| # @import url(blah) with optional quotes
49				\s* $QUOTED_STR \| # @import "blah" or @import 'blah'
50				\s+ (\S+) # @import blah
51				)
52				/xi;
53
54				sub _inner_html
55				{
56				my $node = shift;
57				join "", map { ref $_? $_->as_HTML : $_ } $node->content_list;
58				}
59
60				sub _slurp
61				{
62				my $path = shift;
63				my $encoding = defined $_[0]?
64				"encoding($_[0])" : "";
65
66				local $/;
67				confess "failed to open file '$path': $!"
68				unless open my $fh, "<$encoding", $path;
69				my $content = <$fh>;
70				close $fh;
71				return $content;
72				}
73
74				sub _spit
75				{
76				my $path = shift;
77				my $content = shift;
78				confess "failed to open file '$path': $!" unless open my $fh, ">", $path;
79				print $fh $content;
80				close $fh;
81				}
82
83				# This parses the charset from a HTML doc's HEAD section, if present,
84				#
85				# The code here is adapted from Tatsuhiko Miyagawa's here:
86				# http://svn.bulknews.net/repos/public/HTTP-Response-Charset/trunk/lib/HTTP/Response/Charset.pm
87				#
88				# See also http://use.perl.org/~miyagawa/journal/31250
89				# HTTP::Response::Charset seems not to be on CPAN, however.
90				{
91
92				my $boms = [
93				'UTF-8' => "\x{ef}\x{bb}\x{bf}",
94				'UTF-32BE' => "\x{0}\x{0}\x{fe}\x{ff}",
95				'UTF-32LE' => "\x{ff}\x{fe}\x{0}\x{0}",
96				'UTF-16BE' => "\x{fe}\x{ff}",
97				'UTF-16LE' => "\x{ff}\x{fe}",
98				];
99
100
101				sub _detect_encoding
102				{
103				my $filename = shift;
104
105				# 1) We assume the content has been identified as HTML,
106				# and the Content-Type header already checked.
107
108				# Read in a max 4k chunk from the content;
109				my $chunk;
110				{
111				open my $fh, "<", $filename
112				or Carp::confess "Failed to read file '$filename': $!";
113				read $fh, $chunk, 4096; # read up to 4k
114				close $fh;
115				}
116
117				# 2) Look for META head tags
118				{
119				my $head_parser = HTML::HeadParser->new;
120				$head_parser->parse($chunk);
121				$head_parser->eof;
122
123				my $content_type = $head_parser->header('Content-Type');
124				return unless $content_type;
125				my ($words) = HTTP::Headers::Util::split_header_words($content_type);
126				my %param = @$words;
127				return $param{charset};
128				}
129
130				# 3) If there's a UTF BOM set, look for it
131				my $count = 0;
132				while (my ($enc, $bom) = $boms->[$count++, $count++])
133				{
134				return $enc
135				if $bom eq substr($chunk, 0, length $bom);
136				}
137
138				# 4) If it looks like an XML document, look for XML declaration
139				if ($chunk =~ m!^<\?xml\s+version="1.0"\s+encoding="([\w\-]+)"\?>!) {
140				return $1;
141				}
142
143				# 5) If there's Encode::Detect module installed, try it
144				if ( eval "use Encode::Detect::Detector" ) {
145				my $charset = Encode::Detect::Detector::detect($chunk);
146				return $charset if $charset;
147				}
148
149				return;
150				}
151				}
152
153
154				# Constructor
155
156				sub new
157				{
158				my $class = shift;
159				croak "You must supply a matched set of key => value paramters"
160				if @_ % 2;
161
162				my %options = @_;
163
164				unless (defined $options{ua})
165				{
166				# the default user agent should follow redirects
167				my $ua = LWP::UserAgent->new(
168				requests_redirectable => [qw(GET POST HEAD)]
169				);
170				$options{ua} = $ua;
171				}
172
173				my $self = bless \%options, $class;
174
175				return $self;
176				}
177
178				sub ua { @_>1 ? shift->{ua} = shift : shift->{ua} }
179
180				our $DEFAULT_INSTANCE; # lazily assigned within download
181
182				sub download
183				{
184				my $self = shift;
185
186				# An URI or HTTP::Request for the page we want
187				my $request = shift;
188
189				# Where to save things. A directory - the main file will be called
190				# 'index.html'
191				my $content_dir = shift;
192
193				# A specialised UserAgent to use
194				my $ua = shift;
195
196				if (!blessed($self)
197				\|\| !$self->isa(__PACKAGE__))
198				{ # we're a function, readjust the paramters accordingly
199				($self, $request, $content_dir, $ua)
200				= ( ($DEFAULT_INSTANCE \|\|= __PACKAGE__->new), $self, $request, $content_dir, shift);
201				}
202
203
204				# If no user agent supplied, use the instance's
205				$ua \|\|= $self->{ua};
206
207				croak "please supply an URL or HTTP::Request to download"
208				unless $request;
209
210				$request = HTTP::Request->new(GET => $request)
211				if (blessed $request && $request->isa('URI'))
212				\|\| ref \$request eq 'SCALAR';
213
214				croak "first argument must be an URL or HTTP::Request instance"
215				unless blessed $request and $request->isa('HTTP::Request');
216
217				croak "please supply a directory to copy into"
218				unless $content_dir;
219
220				carp "warning, path '$content_dir' already exists, we may overwrite content"
221				if -e $content_dir;
222
223				# All seems in order, now proceed....
224				mkpath $content_dir;
225
226
227				# First get the main document
228				my $file = File::Spec->catdir($content_dir, "index.html");
229				my $response = $ua->request($request, $file);
230
231				unless ($response and $response->is_success)
232				{
233				croak "HTTP request failed: ". $response->status_line;
234				}
235
236				# If it's not HTML, we can't understand it, so just leave it
237				# unchanged.
238				return unless $response->content_type =~ /html$/;
239
240
241				# Otherwise, "localise" it....
242
243				# This will parse the HTML so we can get the links
244				my $parser = HTML::TreeBuilder::XPath->new;
245
246				# Get the encoding, if we can
247				my $encoding = $response->encoding \|\| _detect_encoding($file);
248
249				# HTML::Tidy does a better job of interpreting bad html than
250				# HTML::TreeBuilder alone, so we pass it through that first. If
251				# we don't, the resulting HTML obtained after HTML::TreeBuilder
252				# has parsed it can be broken.
253				{
254				my $tidy = HTML::Tidy->new(\%TIDY_OPTIONS);
255				$tidy->ignore( text => qr/./ );
256
257
258				my $content = _slurp($file, $encoding);
259
260				{
261				no warnings 'redefine';
262
263				# HTML::Tidy insists on calling this function.... silence
264				# it, locally
265				local *Carp::carp = sub {};
266
267				$content = $tidy->clean($content);
268				}
269
270				$parser->parse($content);
271				}
272
273				my %seen; # We store URLs we've already processed in here
274
275				# This will both download an URL's target and rewrite the URL to
276				# point to the downloaded copy - here we refer to that process as
277				# "localising" an url.
278				my $localise_url = sub
279				{
280				my $url = shift \|\| croak "no url parameter supplied";
281				$url = URI->new_abs(decode_entities($url), $response->base)
282				unless blessed $url;
283
284				my $local_url = $seen{$url};
285
286				unless ($local_url)
287				{
288				# FIXME check for inline URL images? (i.e. data:// urls)
289				my ($ext) = $url->path =~ m![.]([^./]+)$!;
290				my $index = keys(%seen)+1;
291				my $filename = $index;
292				$filename .= ".$ext"
293				if defined $ext;
294				my $file = File::Spec->catfile($content_dir, $filename);
295
296
297				# clean up things like '/../foo' which will cause an error
298				# if passed to $ua->get
299				my $url_path = File::Spec::Unix->canonpath($url->path);
300				$url->path($url_path);
301
302				$local_url = $seen{$url} = $filename;
303
304				# print "downloading $url -> $file\n"; DEBUG
305				my $response2 = $ua->get($url, ':content_file' => $file);
306
307				carp "failed to download $url: ". $response2->status_line
308				unless $response2->is_success
309				&& -f $file;
310				}
311
312				return $local_url;
313
314				};
315
316				# This will localise URLs in tag attributes
317				my $process_attr = sub
318				{
319				my ($attr) = @_;
320
321				my $url = $attr->getValue;
322				return unless $url ne "";
323
324				my $local_url = $localise_url->($url);
325				# warn "url $url -> $local_url"; # DEBUG
326				# rewrite the attribute
327				$attr->getParentNode->attr($attr->getName, $local_url);
328				};
329
330				# This will localise a stylesheet link
331				my $localise_style_url = sub
332				{
333				# note, CSS defines URLs to be relative to the stylesheet.
334				my $base = shift \|\| croak "you must supply a base url";
335				my $url = URI->new_abs(shift, $base);
336
337				my $local_url = $localise_url->($url);
338
339				$local_url = encode_entities($local_url);
340
341				# warn "localising $url-> $local_url\n"; # DEBUG
342				return "url($local_url)";
343				};
344
345				my $process_stylesheet; # defined later
346
347				# This will localise a stylesheet @import link
348				my $localise_import = sub
349				{
350				my $base = shift;
351				my $url = shift;
352
353				my $local_url = $localise_url->($url);
354				my $stylesheet_file = File::Spec->catdir($content_dir, $local_url);
355
356				my $content = _slurp $stylesheet_file;
357				$process_stylesheet->($base, $content);
358				_spit $stylesheet_file, $content;
359
360				# Note, we don't convert the url, since that will be done later
361				return "\@import url($url)";
362				};
363
364				# This function will localise an entire stylesheet's links. It
365				# returns the number of things downloaded.
366				$process_stylesheet = sub
367				{
368				my $base = shift \|\| croak "you must supply a base url";
369
370				# First, convert all '@import' statements to the '@import url()' form,
371				# then localise all url() references. Return true if either has been applied.
372				my @stylesheets = $_[0] =~ s/$IMPORT_RE/$localise_import->($base, $+)/ige;
373				my @urls = $_[0] =~ s/$URL_RE/$localise_style_url->($base, $+)/ige;
374
375				return @stylesheets + @urls;
376				};
377
378				# This localises a