File Coverage

lib/HTML/Encapsulate.pm

Criterion	Covered	Total	%
statement	43	45	95.5
branch			n/a
condition			n/a
subroutine	15	15	100.0
pod			n/a
total	58	60	96.6

line	stmt	sub	time	code
1				package HTML::Encapsulate;
2	2	2	18629	use warnings;
	2		4
	2		60
3	2	2	10	use strict;
	2		4
	2		44
4	2	2	9	use Carp;
	2		12
	2		123
5	2	2	10	use PerlIO;
	2		4
	2		14
6	2	2	52	use File::Path qw(mkpath);
	2		4
	2		100
7	2	2	9	use File::Spec;
	2		3
	2		41
8	2	2	10	use File::Spec::Unix;
	2		3
	2		58
9	2	2	8	use Carp qw(croak carp cluck confess);
	2		3
	2		97
10	2	2	9	use Exporter;
	2		2
	2		58
11	2	2	949	use LWP::UserAgent;
	2		47361
	2		67
12	2	2	1668	use HTML::TreeBuilder::XPath;
	2		145033
	2		22
13	2	2	89	use Scalar::Util qw(blessed);
	2		4
	2		120
14	2	2	12	use URI;
	2		4
	2		58
15	2	2	9	use HTML::Entities qw(decode_entities encode_entities);
	2		5
	2		182
16	2	2	861	use HTML::Tidy;
	0
	0
17				use HTTP::Response::Encoding;
18				use HTML::HeadParser;
19				use HTTP::Headers::Util;
20
21				use version; our $VERSION = qv('0.3');
22
23
24				our @EXPORT_OK = qw(download);
25
26				# We don't want to inherit Exporter, we can't always import the import
27				# method, so this is a workaround.
28				sub import { goto &Exporter::import }
29
30
31				our %TIDY_OPTIONS = (lower_literals => 1,
32				show_errors => 0,
33				show_warnings => 0,
34				tidy_mark => 0);
35
36
37				# url(blah) or url( 'blah' ) etc.
38				my $QUOTED_STR = qr/ " ([^"]) " \| ' ([^']) ' /x;
39
40				my $URL_RE = qr/ url \s* \(
41				\s* (?: $QUOTED_STR \| (.?) ) \s
42				\)
43				/ix;
44
45				my $IMPORT_RE = qr/
46				\@import (?:
47				\s+ $URL_RE \| # @import url(blah) with optional quotes
48				\s* $QUOTED_STR \| # @import "blah" or @import 'blah'
49				\s+ (\S+) # @import blah
50				)
51				/xi;
52
53				sub _inner_html
54				{
55				my $node = shift;
56				join "", map { ref $_? $_->as_HTML : $_ } $node->content_list;
57				}
58
59				sub _slurp
60				{
61				my $path = shift;
62				my $encoding = defined $_[0]?
63				"encoding($_[0])" : "";
64
65				local $/;
66				confess "failed to open file '$path': $!"
67				unless open my $fh, "<$encoding", $path;
68				my $content = <$fh>;
69				close $fh;
70				return $content;
71				}
72
73				sub _spit
74				{
75				my $path = shift;
76				my $content = shift;
77				confess "failed to open file '$path': $!" unless open my $fh, ">", $path;
78				print $fh $content;
79				close $fh;
80				}
81
82				# This parses the charset from a HTML doc's HEAD section, if present,
83				#
84				# The code here is adapted from Tatsuhiko Miyagawa's here:
85				# http://svn.bulknews.net/repos/public/HTTP-Response-Charset/trunk/lib/HTTP/Response/Charset.pm
86				#
87				# See also http://use.perl.org/~miyagawa/journal/31250
88				# HTTP::Response::Charset seems not to be on CPAN, however.
89				{
90
91				my $boms = [
92				'UTF-8' => "\x{ef}\x{bb}\x{bf}",
93				'UTF-32BE' => "\x{0}\x{0}\x{fe}\x{ff}",
94				'UTF-32LE' => "\x{ff}\x{fe}\x{0}\x{0}",
95				'UTF-16BE' => "\x{fe}\x{ff}",
96				'UTF-16LE' => "\x{ff}\x{fe}",
97				];
98
99
100				sub _detect_encoding
101				{
102				my $filename = shift;
103
104				# 1) We assume the content has been identified as HTML,
105				# and the Content-Type header already checked.
106
107				# Read in a max 4k chunk from the content;
108				my $chunk;
109				{
110				open my $fh, "<", $filename
111				or Carp::confess "Failed to read file '$filename': $!";
112				read $fh, $chunk, 4096; # read up to 4k
113				close $fh;
114				}
115
116				# 2) Look for META head tags
117				{
118				my $head_parser = HTML::HeadParser->new;
119				$head_parser->parse($chunk);
120				$head_parser->eof;
121
122				my $content_type = $head_parser->header('Content-Type');
123				return unless $content_type;
124				my ($words) = HTTP::Headers::Util::split_header_words($content_type);
125				my %param = @$words;
126				return $param{charset};
127				}
128
129				# 3) If there's a UTF BOM set, look for it
130				my $count = 0;
131				while (my ($enc, $bom) = $boms->[$count++, $count++])
132				{
133				return $enc
134				if $bom eq substr($chunk, 0, length $bom);
135				}
136
137				# 4) If it looks like an XML document, look for XML declaration
138				if ($chunk =~ m!^<\?xml\s+version="1.0"\s+encoding="([\w\-]+)"\?>!) {
139				return $1;
140				}
141
142				# 5) If there's Encode::Detect module installed, try it
143				if ( eval "use Encode::Detect::Detector" ) {
144				my $charset = Encode::Detect::Detector::detect($chunk);
145				return $charset if $charset;
146				}
147
148				return;
149				}
150				}
151
152
153				# Constructor
154
155				sub new
156				{
157				my $class = shift;
158				croak "You must supply a matched set of key => value paramters"
159				if @_ % 2;
160
161				my %options = @_;
162
163				unless (defined $options{ua})
164				{
165				# the default user agent should follow redirects
166				my $ua = LWP::UserAgent->new(
167				requests_redirectable => [qw(GET POST HEAD)]
168				);
169				$options{ua} = $ua;
170				}
171
172				my $self = bless \%options, $class;
173
174				return $self;
175				}
176
177				sub ua { @_>1 ? shift->{ua} = shift : shift->{ua} }
178
179				our $DEFAULT_INSTANCE; # lazily assigned within download
180
181				sub download
182				{
183				my $self = shift;
184
185				# An URI or HTTP::Request for the page we want
186				my $request = shift;
187
188				# Where to save things. A directory - the main file will be called
189				# 'index.html'
190				my $content_dir = shift;
191
192				# A specialised UserAgent to use
193				my $ua = shift;
194
195				if (!blessed($self)
196				\|\| !$self->isa(__PACKAGE__))
197				{ # we're a function, readjust the paramters accordingly
198				($self, $request, $content_dir, $ua)
199				= ( ($DEFAULT_INSTANCE \|\|= __PACKAGE__->new), $self, $request, $content_dir, shift);
200				}
201
202
203				# If no user agent supplied, use the instance's
204				$ua \|\|= $self->{ua};
205
206				croak "please supply an URL or HTTP::Request to download"
207				unless $request;
208
209				$request = HTTP::Request->new(GET => $request)
210				if (blessed $request && $request->isa('URI'))
211				\|\| ref \$request eq 'SCALAR';
212
213				croak "first argument must be an URL or HTTP::Request instance"
214				unless blessed $request and $request->isa('HTTP::Request');
215
216				croak "please supply a directory to copy into"
217				unless $content_dir;
218
219				carp "warning, path '$content_dir' already exists, we may overwrite content"
220				if -e $content_dir;
221
222				# All seems in order, now proceed....
223				mkpath $content_dir;
224
225
226				# First get the main document
227				my $file = File::Spec->catdir($content_dir, "index.html");
228				my $response = $ua->request($request, $file);
229
230				unless ($response and $response->is_success)
231				{
232				croak "HTTP request failed: ". $response->status_line;
233				}
234
235				# If it's not HTML, we can't understand it, so just leave it
236				# unchanged.
237				return unless $response->content_type =~ /html$/;
238
239
240				# Otherwise, "localise" it....
241
242				# This will parse the HTML so we can get the links
243				my $parser = HTML::TreeBuilder::XPath->new;
244
245				# Get the encoding, if we can
246				my $encoding = $response->encoding \|\| _detect_encoding($file);
247
248				# HTML::Tidy does a better job of interpreting bad html than
249				# HTML::TreeBuilder alone, so we pass it through that first. If
250				# we don't, the resulting HTML obtained after HTML::TreeBuilder
251				# has parsed it can be broken.
252				{
253				my $tidy = HTML::Tidy->new(\%TIDY_OPTIONS);
254				$tidy->ignore( text => qr/./ );
255
256
257				my $content = _slurp($file, $encoding);
258
259				{
260				no warnings 'redefine';
261
262				# HTML::Tidy insists on calling this function.... silence
263				# it, locally
264				local *Carp::carp = sub {};
265
266				$content = $tidy->clean($content);
267				}
268
269				$parser->parse($content);
270				}
271
272				my %seen; # We store URLs we've already processed in here
273
274				# This will both download an URL's target and rewrite the URL to
275				# point to the downloaded copy - here we refer to that process as
276				# "localising" an url.
277				my $localise_url = sub
278				{
279				my $url = shift \|\| croak "no url parameter supplied";
280				$url = URI->new_abs(decode_entities($url), $response->base)
281				unless blessed $url;
282
283				my $local_url = $seen{$url};
284
285				unless ($local_url)
286				{
287				# FIXME check for inline URL images? (i.e. data:// urls)
288				my ($ext) = $url->path =~ m![.]([^./]+)$!;
289				my $index = keys(%seen)+1;
290				my $filename = $index;
291				$filename .= ".$ext"
292				if defined $ext;
293				my $file = File::Spec->catfile($content_dir, $filename);
294
295
296				# clean up things like '/../foo' which will cause an error
297				# if passed to $ua->get
298				my $url_path = File::Spec::Unix->canonpath($url->path);
299				$url->path($url_path);
300
301				$local_url = $seen{$url} = $filename;
302
303				# print "downloading $url -> $file\n"; DEBUG
304				my $response2 = $ua->get($url, ':content_file' => $file);
305
306				carp "failed to download $url: ". $response2->status_line
307				unless $response2->is_success
308				&& -f $file;
309				}
310
311				return $local_url;
312
313				};
314
315				# This will localise URLs in tag attributes
316				my $process_attr = sub
317				{
318				my ($attr) = @_;
319
320				my $url = $attr->getValue;
321				return unless $url ne "";
322
323				my $local_url = $localise_url->($url);
324				# warn "url $url -> $local_url"; # DEBUG
325				# rewrite the attribute
326				$attr->getParentNode->attr($attr->getName, $local_url);
327				};
328
329				# This will localise a stylesheet link
330				my $localise_style_url = sub
331				{
332				# note, CSS defines URLs to be relative to the stylesheet.
333				my $base = shift \|\| croak "you must supply a base url";
334				my $url = URI->new_abs(shift, $base);
335
336				my $local_url = $localise_url->($url);
337
338				$local_url = encode_entities($local_url);
339
340				# warn "localising $url-> $local_url\n"; # DEBUG
341				return "url($local_url)";
342				};
343
344				my $process_stylesheet; # defined later
345
346				# This will localise a stylesheet @import link
347				my $localise_import = sub
348				{
349				my $base = shift;
350				my $url = shift;
351
352				my $local_url = $localise_url->($url);
353				my $stylesheet_file = File::Spec->catdir($content_dir, $local_url);
354
355				my $content = _slurp $stylesheet_file;
356				$process_stylesheet->($base, $content);
357				_spit $stylesheet_file, $content;
358
359				# Note, we don't convert the url, since that will be done later
360				return "\@import url($url)";
361				};
362
363				# This function will localise an entire stylesheet's links. It
364				# returns the number of things downloaded.
365				$process_stylesheet = sub
366				{
367				my $base = shift \|\| croak "you must supply a base url";
368
369				# First, convert all '@import' statements to the '@import url()' form,
370				# then localise all url() references. Return true if either has been applied.
371				my @stylesheets = $_[0] =~ s/$IMPORT_RE/$localise_import->($base, $+)/ige;
372				my @urls = $_[0] =~ s/$URL_RE/$localise_style_url->($base, $+)/ige;
373
374				return @stylesheets + @urls;
375				};
376
377				# This localises a