File Coverage

blib/lib/App/FetchwareX/HTMLPageSync.pm

Criterion	Covered	Total	%
statement	57	188	30.3
branch	3	28	10.7
condition	0	3	0.0
subroutine	19	31	61.2
pod	13	13	100.0
total	92	263	34.9

line	stmt	bran	cond	sub	pod	time	code
1							package App::FetchwareX::HTMLPageSync;
2							our $VERSION = '1.016'; # VERSION: generated by DZP::OurPkgVersion
3							# ABSTRACT: An App::Fetchware extension that downloads files based on an HTML page.
4	1			1		7703	use strict;
	1					1
	1					23
5	1			1		4	use warnings;
	1					1
	1					19
6
7							# Enable Perl 6 knockoffs, and use 5.10.1, because smartmatching and other
8							# things in 5.10 were changed in 5.10.1+.
9	1			1		17	use 5.010001;
	1					3
10
11							# Use fetchware's API's to help us out.
12	1			1		3	use App::Fetchware::Util ':UTIL';
	1					1
	1					162
13	1			1		5	use App::Fetchware::Config ':CONFIG';
	1					1
	1					90
14	1			1		411	use App::Fetchware::Fetchwarefile;
	1					2
	1					28
15	1					145	use App::Fetchware qw(
16							:OVERRIDE_NEW
17							:OVERRIDE_NEW_INSTALL
18							:OVERRIDE_CHECK_SYNTAX
19	1			1		5	);
	1					0
20
21							# Local imports.
22	1			1		4	use File::Copy 'cp';
	1					2
	1					36
23	1			1		4	use File::Path 'remove_tree';
	1					1
	1					40
24	1			1		3	use URI::Split 'uri_split';
	1					1
	1					41
25	1			1		4	use File::Spec 'splitpath';
	1					0
	1					14
26	1			1		3	use Data::Dumper;
	1					1
	1					32
27	1			1		4	use Scalar::Util 'blessed';
	1					1
	1					45
28
29							# Use App::Fetchware::ExportAPI to specify which App::Fetchware API subroutines
30							# we are going to "KEEP", import from App::Fetchware, and which API subs we are
31							# going to "OVERRIDE", implement here in this package.
32							#
33							# ExportAPI takes care of the grunt work for us by setting our package's @EXPORT
34							# appropriately, and even importing Exporter's import() method into our package
35							# for us, so that our App::Fetchware API subroutines and configuration options
36							# specified below can be import()ed properly.
37							use App::Fetchware::ExportAPI
38							# KEEP or "inherit" new_install, because I want my new_install to just call
39							# ask_to_install_now_to_test_fetchwarefile(), and App::Fetchware's does that
40							# already for me. And start() and end() are to create and manage the
41							# temporary directory for me, so I don't have to worry about polluting the
42							# current working directory with temporary files.
43	1					7	KEEP => [qw(new_install start end)],
44							# OVERRIDE everything else.
45							OVERRIDE =>
46							[qw(new check_syntax lookup download verify unarchive build install
47							uninstall upgrade)]
48	1			1		403	;
	1					2
49
50
51							# Use App::Fetchware::CreateConfigOptions to build our App::Fetchware
52							# configuration options for us. These are subroutines with correct prototypes to
53							# turn a perl code file into something that resembles a configuration file.
54							use App::Fetchware::CreateConfigOptions
55	1					6	ONE => [qw(
56							page_name
57							html_page_url
58							destination_directory
59							user_agent
60							html_treebuilder_callback
61							download_links_callback
62							)],
63							BOOLEAN => [qw(keep_destination_directory)]
64	1			1		5	;
	1					2
65
66
67	1			1		4	use Exporter 'import';
	1					1
	1					1719
68							our %EXPORT_TAGS = (
69							TESTING => [qw(
70							get_html_page_url
71							get_destination_directory
72							ask_about_keep_destination_directory
73							new
74							new_install
75							)]
76							);
77							our @EXPORT_OK = map {@{$_}} values %EXPORT_TAGS;
78
79
80
81
82
83							sub new {
84	0			0	1	0	my ($term, $page_name) = @_;
85
86							# Instantiate a new Fetchwarefile object for managing and generating a
87							# Fetchwarefile, which we'll write to a file for the user or use to
88							# build an associated Fetchware package.
89	0					0	my $now = localtime;
90	0					0	my $fetchwarefile = App::Fetchware::Fetchwarefile->new(
91							header => <
92							use App::FetchwareX::HTMLPageSync;
93							# Auto generated $now by HTMLPageSync's fetchware new command.
94							# However, feel free to edit this file if HTMLPageSync's new command's
95							# autoconfiguration is not enough.
96							#
97							# Please look up HTMLPageSync's documentation of its configuration file syntax at
98							# perldoc App::FetchwareX::HTMLPageSync, and only if its configuration file
99							# syntax is not malleable enough for your application should you resort to
100							# customizing fetchware's behavior. For extra flexible customization see perldoc
101							# App::FetchwareX::HTMLPageSync.
102							EOF
103							descriptions => {
104
105							page_name => <
106							page_name simply names the HTML page the Fetchwarefile is responsible for
107							downloading, analyzing via optional callbacks, and copying to your
108							destination_directory.
109							EOA
110							html_page_url => <
111							html_page_url is HTMLPageSync's lookup_url equivalent. It specifies a HTTP url
112							that returns a page of HTML that can be easily parsed of links to later
113							download.
114							EOA
115							destination_directory => <
116							destination_directory is the directory on your computer where you want the files
117							that you configure HTMLPageSync to parse to be copied to.
118							EOA
119							user_agent => <
120							user_agent, if specified, will be passed to HTTP::Tiny, the Perl HTTP library
121							Fetchware uses, where the library will lie to the Web server you are Web
122							scraping from to hopefully prevent the Web server from banning you, or updating
123							the page you want to scrape to use too much Javascript, which would prevent the
124							simple parser HTMLPageSync uses from working on the specified html_page_url.
125							EOA
126							html_treebuilder_callback => <
127							html_treebuilder_callback allows you to specify a perl CODEREF that HTMLPageSync
128							will execute instead of its default callback that just looks for images.
129
130							It receives one parameter, which is an HTML::Element at the first C<a>,
131							anchor/link tag.
132
133							It must [return 'True';] to indicate that that link should be included in the
134							list of download links, or return false, [return undef], to indicate that that
135							link should not be included in the list of download links.
136							EOA
137							download_links_callback => <
138							download_links_callback specifies an optional callback that will allow you to do
139							post processing of the list of downloaded urls. This is needed, because the
140							results of the html_treebuilder_callback are still HTML::Element objects that
141							need to be converted to just string download urls. That is what the default
142							C<download_links_callback> does.
143
144							It receives a list of all of the download HTML::Elements that
145							C<html_treebuilder_callback> returned true on. It is called only once, and
146							should return a list of string download links for download later by
147							HTMLPageSync.
148							EOA
149							keep_destination_directory => <
150							keep_destination_directory is a boolean true or false configuration option that
151							when true prevents HTMLPageSync from deleting your destination_directory when
152							you run fetchware uninstall.
153							EOA
154							}
155							);
156
157	0					0	extension_name(__PACKAGE__);
158
159	0					0	opening_message(<
160							HTMLPageSync's new command is not as sophisticated as Fetchware's. Unless you
161							only want to download images, you will have to get your hands dirty, and code up
162							some custom Perl callbacks to customize HTMLPageSync's behavior. However, it
163							will ask you quite nicely the basic options, so if those are all you need, then
164							this command will successfully generate a HTMLPageSync Fetchwarefile for you.
165
166							After it lets you choose the easy options of page_name, html_page_url,
167							and destination_directory, it will give you an opportunity to modify the
168							user_agent string HTMLPageSync uses to avoid getting banned or having your
169							scraping stick out like a sore thumb in the target Web server's logs. Then,
170							you'll be asked about the advanced options. If you want them it will add generic
171							ones to the Fetchwarefile that you can then fill in later on when HTMLPageSync
172							asks you if you want to edit the generated Fetchwarefile manually. Finally,
173							after your Fetchwarefile is generated HTMLPageSync will ask you if you would
174							like to install your generated Fetchwarefile to test it out.
175							EOM
176
177							# Ask the user for the basic configuration options.
178	0					0	$page_name = fetchwarefile_name(page_name => $page_name);
179	0					0	vmsg "Determined your page_name option to be [$page_name]";
180
181	0					0	$fetchwarefile->config_options(page_name => $page_name);
182	0					0	vmsg "Appended page_name [$page_name] configuration option to Fetchwarefile";
183
184	0					0	my $html_page_url = get_html_page_url($term);
185	0					0	vmsg "Asked user for html_page_url [$html_page_url] from user.";
186
187	0					0	$fetchwarefile->config_options(html_page_url => $html_page_url);
188	0					0	vmsg "Appended html_page_url [$html_page_url] configuration option to Fetchwarefile";
189
190	0					0	my $destination_directory = get_destination_directory($term);
191	0					0	vmsg "Asked user for destination_directory [$destination_directory] from user.";
192
193	0					0	$fetchwarefile->config_options(destination_directory => $destination_directory);
194	0					0	vmsg <
195							Appended destination_directory [$destination_directory] configuration option to
196							your Fetchwarefile.
197							EOM
198
199							# Asks and sets the keep_destination_directory configuration option if the
200							# user wants to set it.
201	0					0	ask_about_keep_destination_directory($term, $fetchwarefile);
202
203	0					0	vmsg 'Prompting for other options that may be needed.';
204	0					0	my $other_options_hashref = prompt_for_other_options($term,
205							user_agent => {
206							prompt => <
207							What user_agent configuration option would you like?
208							EOP
209							print_me => <
210							user_agent, if specified, will be passed to HTTP::Tiny, the Perl HTTP library
211							Fetchware uses, where the library will lie to the Web server you are Web
212							scraping from to hopefully prevent the Web server from banning you, or updating
213							the page you want to scrape to use too much Javascript, which would prevent the
214							simple parser HTMLPageSync uses from working on the specified html_page_url.
215							EOP
216							},
217							html_treebuilder_callback => {
218							prompt => <
219							What html_treebuilder_callback configuration option would you like?
220							EOP
221							print_me => <
222							html_treebuilder_callback allows you to specify a perl CODEREF that HTMLPageSync
223							will execute instead of its default callback that just looks for images.
224
225							It receives one parameter, which is an HTML::Element at the first C<a>,
226							anchor/link tag.
227
228							It must [return 'True';] to indicate that that link should be included in the
229							list of download links, or return false, [return undef], to indicate that that
230							link should not be included in the list of download links.
231
232							Because Term::UI's input is limited to just one line, please just press enter,
233							and a dummy value will go into your Fetchwarefile, where you can then replace
234							that dummy value with a proper Perl callback next, when Fetchware gives you the
235							option to edit your Fetchwarefile manually.
236							EOP
237							default => 'sub { my $h = shift; die "Dummy placeholder fill me in."; }',
238							},
239							download_links_callback => {
240							prompt => <
241							What download_links_callback configuration option would you like?
242							EOP
243							print_me => <
244							download_links_callback specifies an optional callback that will allow you to do
245							post processing of the list of downloaded urls. This is needed, because the
246							results of the html_treebuilder_callback are still HTML::Element objects that
247							need to be converted to just string download urls. That is what the default
248							C<download_links_callback> does.
249
250							It receives a list of all of the download HTML::Elements that
251							C<html_treebuilder_callback> returned true on. It is called only once, and
252							should return a list of string download links for download later by
253							HTMLPageSync.
254
255							Because Term::UI's input is limited to just one line, please just press enter,
256							and a dummy value will go into your Fetchwarefile, where you can then replace
257							that dummy value with a proper Perl callback next, when Fetchware gives you the
258							option to edit your Fetchwarefile manually.
259							EOP
260							default => 'sub { my @download_urls = @_; die "Dummy placeholder fill me in."; }',
261							},
262							);
263	0					0	vmsg 'User entered the following options.';
264	0					0	vmsg Dumper($other_options_hashref);
265
266							# Append all other options to the Fetchwarefile.
267	0					0	$fetchwarefile->config_options(%$other_options_hashref);
268	0					0	vmsg 'Appended all other options listed above to Fetchwarefile.';
269
270	0					0	my $edited_fetchwarefile = edit_manually($term, $fetchwarefile);
271	0					0	vmsg <
272							Asked user if they would like to edit their generated Fetchwarefile manually.
273							EOM
274							# Generate Fetchwarefile.
275							# If edit_manually() did not modify the Fetchwarefile, then generate it.
276	0	0	0			0	if (blessed($edited_fetchwarefile)
277							and
278							$edited_fetchwarefile->isa('App::Fetchware::Fetchwarefile')) {
279	0					0	$fetchwarefile = $fetchwarefile->generate();
280							# If edit_manually() modified the Fetchwarefile, then do not generate it,
281							# and replace the Fetchwarefile object with the new string that represents
282							# the user's edited Fetchwarefile.
283							} else {
284	0					0	$fetchwarefile = $edited_fetchwarefile;
285							}
286
287							# Whatever variables the new() API subroutine returns are written via a pipe
288							# back to the parent, and then the parent reads the variables back, and
289							# makes them available to new_install(), back in the parent, as arguments.
290	0					0	return $page_name, $fetchwarefile;
291							}
292
293
294
295							sub get_html_page_url {
296	0			0	1	0	my $term = shift;
297
298
299							# prompt for html_page_url.
300	0					0	my $html_page_url = $term->get_reply(
301							print_me => <
302							Fetchware's heart and soul is its html_page_url. This is the configuration option
303							that tells fetchware where to check if any new links have been added to the
304							specified Web page that match your criteria for download.
305
306							How to determine your application's html_page_url:
307							1. Simply specify the URL that of the Web page that has the images that you
308							would like to have Fetchware download for you.
309							EOP
310							prompt => q{What is your Web page's html_page_url? },
311							allow => qr!(ftp\|http\|file)://!);
312
313	0					0	return $html_page_url;
314							}
315
316
317
318							sub get_destination_directory {
319	0			0	1	0	my $term = shift;
320
321							# prompt for destination_directory.
322	0					0	my $destination_directory = $term->get_reply(
323							print_me => <
324							destination_directory is the directory on your computer where you want the files
325							that you configure HTMLPageSync to parse to be copied to.
326							EOP
327							prompt => q{What is your destination_directory? });
328
329	0					0	return $destination_directory;
330							}
331
332
333
334							sub ask_about_keep_destination_directory {
335	0			0	1	0	my ($term, $fetchwarefile) = @_;
336
337	0	0				0	if (
338							$term->ask_yn(
339							print_me => <
340							By default, HTMLPageSync deletes your destination_directory when you uninstall
341							that destination_directory's associated Fetchware package or Fetchwarefile. This
342							is done, because you're deleting the Fetchware package, so it makes sense to
343							delete that package's associated data.
344
345							If you wish to keep your destination_directory after you uninstall this
346							HTMLPageSync Fetchware package, then answer N below.
347							EOP
348							prompt => 'Is deleting your destination_directory on uninstall OK? ',
349							default => 'y',
350							)
351							) {
352	0					0	vmsg <
353							User wants [keep_destination_directory 'True';] added to their Fetchwarefile.
354							EOM
355
356	0					0	$fetchwarefile->config_options(keep_destination_directory => 'True');
357	0					0	vmsg <
358							Appended [keep_destination_directory 'True';] to user's Fetchwarefile.
359							EOM
360							}
361							}
362
363
364
365
366
367							sub check_syntax {
368
369							# Use check_config_options() to run config() a bunch of times to check the
370							# already parsed Fetchwarefile.
371	0			0	1	0	return check_config_options(
372							Mandatory => [ 'page_name', <
373							App-Fetchware: Your Fetchwarefile must specify a page_name configuration
374							option. Please add one, and try again.
375							EOM
376							Mandatory => [ 'html_page_url', <
377							App-Fetchware: Your Fetchwarefile must specify a html_page_url configuration
378							option. Please add one, and try again.
379							EOM
380							Mandatory => [ 'destination_directory', <
381							App-Fetchware: Your Fetchwarefile must specify a destination_directory
382							configuration option. Please add one, and try again.
383							EOM
384							);
385							}
386
387
388
389
390
391							###BUGALERT### lookup() returns all files each time it is run; therefore, it
392							#breaks the way Fetchware is supposed to work! lookup() is supposed to return
393							#"the latest version." And in HTMLPageSync's case, it should not include files
394							#already downloaded, because it should only return "new files" by comparing the
395							#"available list of files" to the "already downloaded one."
396							sub lookup {
397	0			0	1	0	msg
398	0					0	"Looking up download urls using html_page_url [@{[config('html_page_url')]}]";
399							###BUGALERT### Create a user changeable version of lookup_check_args??(), so
400							#that App::Fetchware 'subclasses' can use it.
401							# Download the url the user specified.
402	0					0	my $filename = do {
403	0	0				0	if (defined config('user_agent')) {
404	0					0	download_http_url(config('html_page_url'),
405							user_agent => config('user_agent'));
406							} else {
407	0					0	download_http_url(config('html_page_url'));
408							}
409							};
410	0					0	vmsg "Downloaded html_page_url to local file [$filename].";
411
412							# Create a HTML::TreeBuilder object for the now downloaded file.
413	0					0	my $tree = HTML::TreeBuilder->new();
414							# Parse $filename into a HTML::Element tree.
415	0					0	$tree->parse_file($filename);
416	0					0	vmsg 'Created HTML::TreeBuilder object to parse downloaded html file.';
417
418	0					0	my $tree_callback = do {
419	0	0				0	if (config('html_treebuilder_callback')) {
420	0					0	vmsg <
421							Using user supplied html_treebuilder_callback to parse downloaded HTML file:
422							[
423	0					0	@{[config('html_treebuilder_callback')]}
424							]
425							EOM
426	0					0	config('html_treebuilder_callback');
427							} else {
428	0					0	vmsg <
429							Using built-in default html_treebuilder_callback that only wants images.
430							EOM
431							sub {
432	0			0		0	my $tag = shift;
433	0					0	my $link = $tag->attr('href');
434	0	0				0	if (defined $link) {
435							# If the anchor tag is an image...
436	0	0				0	if ($link =~ /\.(jpg\|jpeg\|png\|bmp\|tiff?\|gif)$/) {
437							# ...return true...
438	0					0	return 'True';
439							} else {
440							# ...if not return false.
441	0					0	return undef; #false
442							}
443							}
444	0					0	};
445							}
446							};
447
448							# Find the links that match our default callback or the user specified one
449							# if the user specified one.
450	0					0	my @download_urls = $tree->look_down(
451							_tag => 'a',
452							$tree_callback
453							);
454	0					0	vmsg <
455							Determined download urls to be:
456							@download_urls
457							EOM
458
459							# Sort through the list of HTML::Element tags to finalize the list to
460							# download.
461	0					0	my $links_callback = do {
462	0	0				0	if (config('download_links_callback')) {
463	0					0	vmsg <
464							Determined download_links_callback to be user specified:
465							[
466	0					0	@{[config('download_links_callback')]}
467							]
468							EOM
469	0					0	config('download_links_callback');
470							} else {
471							# Strip off HTML::Element crap by default.
472							sub {
473	0			0		0	vmsg <
474							Using built-in default download_links_callback that turns HTML::Elements into
475							download urls.
476							EOM
477	0					0	my @download_urls = @_;
478
479	0					0	for my $link (@download_urls) {
480	0					0	$link = $link->attr('href');
481							}
482
483							# Must return them, because this coderef was called by value not
484							# by reference.
485	0					0	return @download_urls;
486	0					0	};
487							}
488							};
489
490							# Call download_links_callback or call default one to strip off
491							# HTML::Element crap.
492	0					0	@download_urls = $links_callback->(@download_urls);
493	0					0	vmsg <
494							Determined download urls to be:
495							[
496	0					0	@{[@download_urls]}
497							]
498							EOM
499
500							# The download_urls may be relative links instead of absolute links.
501							# Relative ones could just be filenames without any knowledge of what the
502							# actual server or path or even scheme is. Fix this by prepending
503							# html_page_url to each link if there is no scheme.
504	0					0	for my $download_url (@download_urls) {
505	0	0				0	if ($download_url !~ m!^(ftp\|http\|file)://!) {
506	0					0	$download_url = config('html_page_url') . '/' . $download_url;
507							}
508							}
509
510							# Return a ref to the array of download urls, because lookup()'s API only
511							# allows it to return a single value, but that single value does not have to
512							# be a scalar. It can be an array ref, which is used here. This works, because
513							# what is returned here by lookup() is passed unchanged to download(), which
514							# is also part of this API, so I can use what I return here as I please
515							# inside download().
516	0					0	return \@download_urls;
517							}
518
519
520
521							sub download {
522	0			0	1	0	my ($temp_dir, $download_url) = @_;
523
524	0					0	msg 'Downloading the download urls lookup() determined.';
525
526	0					0	my @download_file_paths;
527							# Loop over @$download_url to download all user specified URLs to temp_dir.
528	0					0	for my $url (@$download_url) {
529							# Use user specified agent if they asked for it.
530	0	0				0	if (defined config('user_agent')) {
531	0					0	vmsg <
532							Downloading url
533							[$url]
534							using the user specified user_agent
535	0					0	[@{[config('user_agent')]}]
536							EOM
537	0					0	my $downloaded_file =
538							download_http_url($url, agent => config('user_agent'));
539	0					0	push @download_file_paths, $downloaded_file;
540							} else {
541	0					0	vmsg "Downloading url [$url].";
542	0					0	my $downloaded_file = download_http_url($url);
543	0					0	push @download_file_paths, $downloaded_file;
544							}
545							}
546
547	0					0	local $" = "\n"; # print each @download_file_paths on its own line.
548	0					0	vmsg <
549							Downloaded specified urls to the following paths:
550							[
551	0					0	@{[@download_file_paths]}
552							]
553							EOM
554
555							# AKA $package_path.
556	0					0	return \@download_file_paths;
557							}
558
559
560
561							sub verify {
562	2			2	1	1389	vmsg <
563							Skipping verify subroutine, because HTMLPageSync does not need to verify anything
564							EOM
565	2					7	do_nothing();
566							}
567
568
569
570							sub unarchive {
571	1			1	1	835	vmsg <
572							Skipping unarchive subroutine, because HTMLPageSync does not need to unarchive
573							anything
574							EOM
575	1					5	do_nothing();
576							}
577
578
579
580							sub build {
581	0			0	1	0	vmsg <
582							Skipping build subroutine, because HTMLPageSync does not need to build anything
583							EOM
584	0					0	do_nothing();
585							}
586
587
588
589							sub install {
590							# AKA $package_path.
591	0			0	1	0	my $download_file_paths = shift;
592
593	0					0	msg <
594							Copying files downloaded to a local temp directory to final destination directory.
595							EOM
596
597							# Copy over the files that have been returned by download().
598	0					0	for my $file_path (@$download_file_paths) {
599	0					0	vmsg <
600	0					0	Copying [$file_path] -> [@{[config('destination_directory')]}].
601							EOM
602							###BUGALERT### Should this die and all the rest be croaks instead???
603	0	0				0	cp($file_path, config('destination_directory')) or die <
604							App-FetchwareX-HTMLPageSync: run-time error. Fetchware failed to copy the file [$file_path] to the
605	0					0	destination directory [@{[config('destination_directory')]}].
606							The OS error was [$!].
607							EOD
608							}
609
610	0					0	vmsg 'Successfully copied files to destination directory.';
611
612	0					0	return 'True indicating success!';
613							}
614
615
616
617
618
619							sub uninstall {
620	1			1	1	2823	my $build_path = shift;
621
622							# Only delete destination_directory if keep_destination_directory is false.
623	1	50				11	unless (config('keep_destination_directory')) {
624
625	1					9	msg <
626							Uninstalling this HTMLPageSync package by deleting your destination directory.
627							EOM
628
629							###BUGALERT### Before release go through all of Fetchware's API, and subify
630							#each main component like lookup and download were, the later ones were not
631							#done this way. That way I can put say chdir_to_build_path() here instead of
632							#basically copying and pasting the code like I do below. Also
633							#chdir_to_build_path() can be put in :OVERRIDE_UNINSTALL!!! Which I can use
634							#here.
635	1	50				18	chdir $build_path or die <
636							App-FetchwareX-HTMLPageSync: Failed to uninstall the specified package and specifically to change
637							working directory to [$build_path] before running make uninstall or the
638							uninstall_commands provided in the package's Fetchwarefile. Os error [$!].
639							EOD
640
641	1	50				5	if ( defined config('destination_directory')) {
642							# Use File::Path's remove_tree() to delete the destination_directory
643							# thereby "uninstalling" this package. Will throw an exception that I'll
644							# let the main eval in bin/fetchware catch, print, and exit 1.
645	0					0	vmsg <
646	0					0	Deleting entire destination directory [@{[config('destination_directory')]}].
647							EOM
648	0					0	remove_tree(config('destination_directory'));
649							} else {
650	1					10	die <
651							App-FetchwareX-HTMLPageSync: Failed to uninstall the specified App::FetchwareX::HTMLPageSync
652							package, because no destination_directory is specified in its Fetchwarefile.
653							This configuration option is required and must be specified.
654							EOD
655							}
656							# keep_destination_directory was set, so don't delete destination directory.
657							} else {
658	0						msg <
659							Uninstalling this HTMLPageSync package but keeping your destination directory.
660							EOM
661
662							}
663
664	0						return 'True for success.';
665							}
666
667
668
669
670							sub upgrade {
671	0			0	1		my $download_path = shift; # $fetchware_package_path is not used in HTMLPageSync.
672
673							# Get the listing of already downloaded file names.
674	0						my @installed_downloads = glob(config('destination_directory'));
675
676							# Preprocess both @$download_path and @installed_downloads to ensure that
677							# URL crap or differing full paths won't screw up the "comparisons". The
678							# clever delete hashslice does the "comparisons" if you will.
679	0						my @download_path_filenames = map { ( uri_split($_) )[2] } @$download_path;
	0
680	0						my @installed_downloads_filenames = map { ( splitpath($_) ) [2] }
	0
681							@installed_downloads;
682
683							# Determine what files are in @$download_path, but not in
684							# @installed_downloads.
685							# Algo based on code from Perl Cookbook pg. 126.
686	0						my %seen;
687	0						@seen{@$download_path} = ();
688	0						delete @seen{@installed_downloads};
689
690	0						my @new_urls_to_download = keys %seen;
691
692	0	0					if (@new_urls_to_download > 0) {
693							# Alter $download_path to only list @new_urls_to_download. That way
694							# download() only downloads the new URLs not the already downloaded ones
695							# again.
696	0						$download_path = [@new_urls_to_download];
697
698	0						return 'New URLs Found.';
699							} else {
700	0						return;
701							}
702							}
703
704
705							1;
706
707							=pod
708
709							=head1 NAME
710
711							App::FetchwareX::HTMLPageSync - An App::Fetchware extension that downloads files based on an HTML page.
712
713							=head1 VERSION
714
715							version 1.016
716
717							=head1 SYNOPSIS
718
719							=head2 Example App::FetchwareX::HTMLPageSync Fetchwarefile.
720
721							page_name 'Cool Wallpapers';
722
723							html_page_url 'http://some-html-page-with-cool.urls';
724
725							destination_directory 'wallpapers';
726
727							# pretend to be firefox
728							user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1';
729
730							# Customize the callbacks.
731							html_treebuilder_callback sub {
732							# Get one HTML::Element.
733							my $h = shift;
734
735							# Return true or false to indicate if this HTML::Element should be a
736							# download link.
737							if (something) {
738							return 'True';
739							} else {
740							return undef;
741							}
742							};
743
744							download_links_callback sub {
745							my @download_urls = @_;
746
747							my @wanted_download_urls;
748							for my $link (@download_urls) {
749							# Pick ones to keep.
750							push @wanted_download_urls, $link;
751							}
752
753							return @wanted_download_urls;
754							};
755
756							=head2 App::FetchwareX::HTMLPageSync App::Fetchware-like API.
757
758							my $temp_dir = start();
759
760							my $download_url = lookup();
761
762							download($temp_dir, $download_url);
763
764							verify($download_url, $package_path);
765
766							unarchive($package_path);
767
768							build($build_path);
769
770							install();
771
772							uninstall($build_path);
773
774							=head1 MOTIVATION
775
776							I want to automatically parse a Web page with links to wall papers that I want
777							to download. Only I want software to do it for me. That's where this
778							App::Fetchware extension comes in.
779
780							=head1 DESCRIPTION
781
782							App::FetchwareX::HTMLPageSync is an example App::Fetchware extension. It's not
783							a large extension, but instead is a simple one meant to show how easy it is
784							to extend App::Fetchware.
785
786							App::FetchwareX::HTMLPageSync parses the Web page you specify to create a list of
787							download links. Then it downloads those links, and installs them to your
788							C<destination_directory>.
789
790							In order to use App::FetchwareX::HTMLPageSync to help you mirror the download
791							links on a HTML page you need to create a App::FetchwareX::HTMLPageSync
792							Fetchwarefile, you can do this easily by just running C<fetchware new>, and
793							typing in C<HTMLPageSync> when it asks you what extension of Fetchwarefile you
794							want to create.
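795							See the section on manually creating a App::FetchwareX::HTMLPageSync Fetchwarefile below.
796							Then you'll need to
797							L<use that Fetchwarefile with fetchware|/"USING YOUR App::FetchwareX::HTMLPageSync FETCHWAREFILE WITH FETCHWARE">.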
798
799							=head1 App::FetchwareX::HTMLPageSync API SUBROUTINES
800
801							This is App::FetchwareX::HTMLPageSync's API that fetchware uses to execute any
802							Fetchwarefile's that make use of App::FetchwareX::HTMLPageSync. This API is the
803							same that regular old App::Fetchware uses for most standard FOSS software, and
804							this internal documentation is only needed when debugging HTMLPageSync's code or
805							when studying it to create your own fetchware extension.
806
807							=head2 new()
808
809							my ($program_name, $fetchwarefile) = new($term, $program_name);
810
811							# Or in an extension, you can return whatever list of variables you want,
812							# and then cmd_new() will provide them as arguments to new_install() except
813							# a $term Term::ReadLine object will precede the others.
814							my ($term, $program_name, $fetchwarefile, $custom_argument1, $custom_argument2)
815							= new($term, $program_name);
816
817							new() is App::Fetchware's API subroutine that implements fetchware's new
818							command. It simply uses Term::UI to ask the user some questions that determine
819							what configuration options will be added to the generated Fetchwarefile. new()
820							takes a $term, Term::UI/Term::Readline object, and the optional name of the
821							program or Website in this case that HTMLPageSync is page syncing.
822
823							Whatever scalars (not references just regular strings) that new() returns will
824							be shared with new()'s sister API subroutine new_install() that is called after
825							new() is called by cmd_new(), which implements fetchware's new command.
826							new_install() is called in the parent process, so it does have root permissions,
827							so be sure to test it as root as well.
828
829							=over
830
831							=item drop_privs() NOTES
832
833							This section notes whatever problems you might come across implementing and
834							debugging your Fetchware extension due to fetchware's drop_privs mechanism.
835
836							See L<App::Fetchware::Util/drop_privs()>.
837
838							=over
839
840							=item *
841
842							This subroutine is B<not> run as root; instead, it is run as a regular user
843							unless the C<stay_root> configuration option has been set to true.
844
845							=back
846
847							=back
848
849							=head3 get_html_page_url()
850
851							my $html_page_url = get_html_page_url($term);
852
853							Uses $term argument as a L<Term::ReadLine>/L<Term::UI> object to interactively
854							explain what a C<html_page_url> is, and to ask the user to provide one and press
855							enter.
856
857							=head3 get_destination_directory()
858
859							my $destination_directory = get_destination_directory($term);
860
861							Uses $term argument as a L<Term::ReadLine>/L<Term::UI> object to interactively
862							explain what a C<destination_directory> is, and to ask the user to provide one
863							and press enter.
864
865							=head3 ask_about_keep_destination_directory()
866
867							ask_about_keep_destination_directory($term, $fetchwarefile);
868
869							ask_about_keep_destination_directory() does just that: it asks the user if they
870							would like to enable the C<keep_destination_directory> configuration option to
871							preserve their C<destination_directory> when they uninstall the associated
872							Fetchware package or Fetchwarefile. If they answer Y,
873							C<keep_destination_directory 'True';> is added to their Fetchwarefile, and if not
874							nothing is added, because deleting their C<destination_directory> is the
875							default that will happen even if the C<keep_destination_directory> is not even
876							in the Fetchwarefile.
877
878							=head2 new_install()
879
880							my $fetchware_package_path = new_install($page_name, $fetchwarefile);
881
882							new_install() asks the user if they would like to install the previously
883							generated Fetchwarefile that new() created. If they answer yes, then that
884							program associated with that Fetchwarefile is installed. In our case, that means
885							that whatever files are configured for download will be downloaded. If they
886							answer no, then the path to the generated Fetchwarefile will be printed.
887
888							new_install() is imported by L<App::Fetchware::ExportAPI> from App::Fetchware,
889							and also exported by App::FetchwareX::HTMLPageSync. This is how
890							App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
891
892							=head2 check_syntax()
893
894							'Syntax Ok' = check_syntax()
895
896							=over
897
898							=item Configuration subroutines used:
899
900							=over
901
902							=item none
903
904							=back
905
906							=back
907
908							Calls check_config_options() to check for the following syntax errors in
909							Fetchwarefiles. Note by the time check_syntax() has been called
910							parse_fetchwarefile() has already parsed the Fetchwarefile, and any syntax
911							errors in the user's Fetchwarefile will have already been reported by Perl.
912
913							This may seem like a bug, but it's not. Do you really want to try to use regexes
914							or something to try to parse the Fetchwarefile reliably, and then report errors
915							to users? Or add PPI of all insane Perl modules as a dependency just to write
916							syntax checking code that most of the time says the syntax is Ok anyway, and
917							therefore a complete waste of time and effort? I don't want to deal with any of
918							that insanity.
919
920							Instead, check_syntax() uses config() to examine the already parsed
921							Fetchwarefile for "higher-level" or "Fetchware-level" syntax errors. Syntax
922							errors that are B<Fetchware> syntax errors instead of just Perl syntax errors.
923
924							For yours and my own convenience I created check_config_options() helper
925							subroutine. It's data driven, and will check Fetchwarefile's for three different
926							types of common syntax errors that occur in App::Fetchware's Fetchwarefile
927							syntax. These errors are more at the level of I<logic errors> than actual syntax
928							errors. See its POD below for additional details.
929
930							Below briefly lists what App::Fetchware's implementation of check_syntax()
931							checks.
932
933							=over
934
935							=item * Mandatory configuration options
936
937							=over
938
939							=item * page_name, html_page_url, and destination_directory are required for all Fetchwarefiles.
940
941							=back
942
943							=back
944
945							=over
946
947							=item drop_privs() NOTES
948
949							This section notes whatever problems you might come across implementing and
950							debugging your Fetchware extension due to fetchware's drop_privs mechanism.
951
952							See L<App::Fetchware::Util/drop_privs()>.
953
954							=over
955
956							=item *
957
958							check_syntax() is run in the parent process before even start() has run, so no
959							temporary directory is available for use.
960
961							=back
962
963							=back
964
965							=head2 start()
966
967							my $temp_file = start();
968
969							start() creates a temp dir, chmod 700's it, and chdir()'s to it just like the one
970							in App::Fetchware does. App::FetchwareX::HTMLPageSync simply reuses it.
971
972							start() is imported using L<App::Fetchware::ExportAPI> from App::Fetchware,
973							and also exported by App::FetchwareX::HTMLPageSync. This is how
974							App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
975
976							=head2 lookup()
977
978							my $download_url = lookup();
979
980							lookup() downloads the user specified C<html_page_url>, parses it using
981							HTML::TreeBuilder, and uses C<html_treebuilder_callback> and
982							C<download_links_callback> if specified to manipulate the tree to determine what
983							download urls the user wants.
984
985							This list of download urls is returned as an array reference, $download_url.
986
987							=head2 download()
988
989							download($temp_dir, $download_url);
990
991							download() uses App::Fetchware's utility function download_http_url() to
992							download all of the urls that lookup() returned. If the user specified a
993							C<user_agent> configuration option, then that option is passed along to
994							download_http_url()'s call to HTTP::Tiny.
995
996							=head2 verify()
997
998							verify($download_url, $package_path);
999
1000							verify() simply calls App::Fetchware's :UTIL subroutine do_nothing(), which as
1001							you can tell from its name does nothing, but return. The reason for the useless
1002							do_nothing() call is simply for better documentation, and standardizing how to
1003							override a App::Fetchware API subroutine in order for it to do nothing at all,
1004							so that you can prevent the original App::Fetchware subroutine from doing what
1005							it normally does.
1006
1007							=head2 unarchive()
1008
1009							unarchive();
1010
1011							unarchive() does nothing by calling App::Fetchware's :UTIL subroutine
1012							do_nothing(), which does nothing.
1013
1014							=head2 build()
1015
1016							build($build_path);
1017
1018							build() does the same thing as verify(), and that is nothing by calling
1019							App::Fetchware's do_nothing() subroutine to better document the fact
1020							that it does nothing.
1021
1022							=head2 install()
1023
1024							install($package_path);
1025
1026							install() takes the $package_path, which is really an array ref of the paths
1027							of the files that download() copied, and copies them to the user specified
1028							destination directory, C<destination_directory>.
1029
1030							=head2 end()
1031
1032							end();
1033
1034							end() chdir()s back to the original directory, and cleans up the temp directory
1035							just like the one in App::Fetchware does. App::FetchwareX::HTMLPageSync simply reuses it.
1036
1037							end() is imported using L<App::Fetchware::ExportAPI> from App::Fetchware,
1038							and also exported by App::FetchwareX::HTMLPageSync. This is how
1039							App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
1040
1041							=head2 uninstall()
1042
1043							uninstall($build_path);
1044
1045							Uninstalls App::FetchwareX::HTMLPageSync by recursively deleting the
1046							C<destination_directory> where it stores the wallpapers or whatever you
1047							specified it to download for you. If you would like to keep your
1048							C<destination_directory>, then set the C<keep_destination_directory> to true in
1049							your Fetchwarefile, and Fetchware will I<not> delete your
1050							C<destination_directory>, when you uninstall your Fetchware package.
1051
1052							=head2 upgrade()
1053
1054							my $upgrade = upgrade($download_path, $fetchware_package_path)
1055
1056							if ($upgrade) {
1057							...
1058							}
1059
1060							=over
1061
1062							=item Configuration subroutines used:
1063
1064							=over
1065
1066							=item none
1067
1068							=back
1069
1070							=back
1071
1072							Uses $download_path, an arrayref of URLs to download in HTMLPageSync, and
1073							compares it against the list of files that has already been downloaded by
1074							glob()ing C<destination_directory>. And then comparing the file names of the
1075							specified files.
1076
1077							Returns true if $download_path has any URLs that have not already been
1078							downloaded into C<destination_directory>. Note: HEAD HTTP queries are B<not>
1079							used to check if any already downloaded files are I<newer> than the files in
1080							the C<destination_directory>.
1081
1082							Returns false if $download_path is the same as C<destination_directory>.
1083
1084							=over
1085
1086							=item drop_privs() NOTES
1087
1088							This section notes whatever problems you might come across implementing and
1089							debugging your Fetchware extension due to fetchware's drop_privs mechanism.
1090
1091							See L<App::Fetchware::Util/drop_privs()>.
1092
1093							=over
1094
1095							=item *
1096
1097							upgrade() is run in the B<child> process as nobody or C<user>, because the child
1098							needs to know if it should actually bother running the rest of fetchware's API
1099							subroutines.
1100
1101							=back
1102
1103							=back
1104
1105							=head1 MANUALLY CREATING A App::FetchwareX::HTMLPageSync FETCHWAREFILE
1106
1107							In order to use App::FetchwareX::HTMLPageSync you must first create a
1108							Fetchwarefile to use it. You can use C<fetchware new> as explained above, or
1109							create one manually in your text editor.
1110
1111							=over
1112
1113							=item B<1. Name it>
1114
1115							Use your text editor to create a file with a C<.Fetchwarefile> file extension.
1116							Use of this convention is not required, but it makes it obvious what type of
1117							file it is. Then, just copy and paste the example text below, and replace
1118							C<[page_name]> with what you choose your C<page_name> to be. C<page_name> is
1119							simply a configuration option that simply names your Fetchwarefile. It is not
1120							actually used for anything other than to name your Fetchwarefile to document
1121							what program or behavior this Fetchwarefile manages.
1122
1123							use App::FetchwareX::HTMLPageSync;
1124
1125							page_name '[page_name]';
1126
1127							Fetchwarefiles are actually small, well structured, Perl programs that can
1128							contain arbitrary perl code to customize fetchware's behavior, or, in most
1129							cases, simply specify a number of fetchware or a fetchware extension's (as in
1130							this case) configuration options. Below is my filled in example
1131							App::FetchwareX::HTMLPageSync fetchwarefile.
1132
1133							use App::FetchwareX::HTMLPageSync;
1134
1135							page_name 'Cool Wallpapers';
1136
1137							Notice the C<use App::FetchwareX::HTMLPageSync;> line at the top. That line is
1138							absolutely critical for this Fetchwarefile to work properly, because it is what
1139							allows fetchware to use Perl's own syntax as a nice easy to use syntax for
1140							Fetchwarefiles. If you do not use the matching C<use App::FetchwareX::HTMLPageSync;> line,
1141							then fetchware will spit out crazy errors from Perl's own compiler listing all
1142							of the syntax errors you have. If you ever receive that error, just ensure you
1143							have the correct C<use App::FetchwareX::HTMLPageSync;> line at the top of your
1144							Fetchwarefile.
1145
1146							=item B<2. Determine your html_page_url>
1147
1148							At the heart of App::FetchwareX::HTMLPageSync is its C<html_page_url>, which is
1149							the URL to the HTML page you want HTMLPageSync to download and parse out links
1150							to wallpaper or whatever else you'd like to automate downloading. To figure this
1151							out just use your browser to find the HTML page you want to use, and then copy
1152							and paste the url between the single quotes C<'> as shown in the example below.
1153
1154							html_page_url '';
1155
1156							And then after you copy the url.
1157
1158							html_page_url 'http://some.url/something.html';
1159
1160							=item B<3. Determine your destination_directory>
1161
1162							HTMLPageSync also needs to know your C<destination_directory>. This is the
1163							directory that HTMLPageSync will copy your downloaded files to. This directory
1164							will also be deleted when you uninstall this HTMLPageSync fetchware package just
1165							like a standard App::Fetchware package would uninstall any installed software
1166							when it is uninstalled. Just copy and paste the example below, and fill in the
1167							space between the single quotes C<'>.
1168
1169							destination_directory '';
1170
1171							After pasting it should look like.
1172
1173							destination_directory '~/wallpapers';
1174
1175							Furthermore, if you want to keep your C<destination_directory> after you
1176							uninstall your HTMLPageSync fetchware package, just set the
1177							C<keep_destination_directory> configuration option to true:
1178
1179							keep_destination_directory 'True';
1180
1181							If this is set in your HTMLPageSync Fetchwarefile, HTMLPageSync will not delete
1182							your C<destination_directory> when your HTMLPageSync fetchware package is
1183							uninstalled.
1184
1185							=item B<4. Specify other options>
1186
1187							That's all there is to it unless you need to further customize HTMLPageSync's
1188							behavior to get just the links you need to download.
1189
1190							At this point you can install your new Fetchwarefile with:
1191
1192							fetchware install [path to your new fetchwarefile]
1193
1194							Or you can further customize it as shown next.
1195
1196							=item B<5. Specify an optional user_agent>
1197
1198							Many sites don't like bots downloading stuff from them wasting their bandwidth,
1199							and will even limit what you can do based on your user agent, which is the HTTP
1200							standard's name for your browser. This option allows you to pretend to be
1201							something other than HTMLPageSync's underlying library, L<HTTP::Tiny>. Just copy
1202							and paste the example below, and paste what you want your user agent to be between
1203							the single quotes C<'> as before.
1204
1205							user_agent '';
1206
1207							And after pasting.
1208
1209							user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1';
1210
1211							=item B<6. Specify an optional html_treebuilder_callback>
1212
1213							C<html_treebuilder_callback> specifies an optional anonymous Perl subroutine
1214							reference that will replace the default one that HTMLPageSync uses. The default
1215							one limits the download to only image format links, which is flexible enough for
1216							downloading wallpapers.
1217
1218							If you want to download something different, then paste the example below in
1219							your Fetchwarefile.
1220
1221							html_treebuilder_callback sub {
1222							# Get one HTML::Element.
1223							my $h = shift;
1224
1225							# Return true or false to indicate if this HTML::Element should be a
1226							# download link.
1227							if (something) {
1228							return 'True';
1229							} else {
1230							return undef;
1231							}
1232							};
1233
1234							And create a Perl anonymous subroutine C<sub {...}> that will
1235							be executed instead of the default one. This requires knowledge of the Perl
1236							programming language. The one below limits itself to only pdfs and MS word
1237							documents.
1238
1239							# Download pdfs and word documents only.
1240							html_treebuilder_callback sub {
1241							my $tag = shift;
1242							my $link = $tag->attr('href');
1243							if (defined $link) {
1244							# If the link points to a pdf or word document...
1245							if ($link =~ /\.(pdf|doc|docx)$/) {
1246							# ...return true...
1247							return 'True';
1248							} else {
1249							# ...if not return false.
1250							return undef; #false
1251							}
1252							}
1253							};
1254
1255							=item B<7. Specify an optional download_links_callback>
1256
1257							C<download_links_callback> specifies an optional anonymous Perl subroutine
1258							reference that will replace the default one that HTMLPageSync uses. The default
1259							one removes the HTML::Element skin each download link is wrapped in, because of
1260							the use of L<HTML::TreeBuilder>. This simply strips off the object-oriented crap
1261							it's wrapped in, and turns it into a simple string scalar.
1262
1263							If you want to post process the download link in some other way, then just copy
1264							and paste the code below into your Fetchwarefile, and add whatever other Perl
1265							code you may need. This requires knowledge of the Perl programming language.
1266
1267							download_links_callback sub {
1268							my @download_urls = @_;
1269
1270							my @wanted_download_urls;
1271							for my $link (@download_urls) {
1272							# Pick ones to keep.
1273							push @wanted_download_urls, $link;
1274							}
1275
1276							return @wanted_download_urls;
1277							};
1278
1279							=back
1280
1281							=head1 USING YOUR App::FetchwareX::HTMLPageSync FETCHWAREFILE WITH FETCHWARE
1282
1283							After you have
1284							created your App::FetchwareX::HTMLPageSync Fetchwarefile
1285							as shown above you need to actually use the fetchware command line program to
1286							install, upgrade, and uninstall your App::FetchwareX::HTMLPageSync Fetchwarefile.
1287
1288							Take note how fetchware's package management metaphor does not quite line up
1289							with what App::FetchwareX::HTMLPageSync does. Why would a HTML page mirroring
1290							script be installed, upgraded, or uninstalled? Well HTMLPageSync simply adapts
1291							fetchware's package management metaphor to its own environment performing the
1292							likely action for when one of fetchware's behaviors is executed.
1293
1294							=over
1295
1296							=item B<new>
1297
1298							A C<fetchware new> will cause HTMLPageSync to ask the user a bunch of questions,
1299							and help them create a new HTMLPageSync Fetchwarefile.
1300
1301							=item B<install>
1302
1303							A C<fetchware install> while using a HTMLPageSync Fetchwarefile causes fetchware
1304							to download your C<html_page_url>, parse it, download any matching links, and
1305							then copy them to your C<destination_directory> as you specify in your
1306							Fetchwarefile.
1307
1308							=item B<upgrade>
1309
1310							A C<fetchware upgrade> will redownload the C<html_page_url>, parse it, and
1311							compare the corresponding list of files to the list of files already downloaded,
1312							and if any new files have been added, then they will be downloaded. New versions
1313							of existing files are not supported. No timestamp checking is implemented
1314							currently.
1315
1316							=item B<uninstall>
1317
1318							A C<fetchware uninstall> will cause fetchware to delete this fetchware package
1319							from its database as well as recursively deleting everything inside your
1320							C<destination_directory> as well as that directory itself. So when you uninstall
1321							a HTMLPageSync fetchware package ensure that you really want to, because it will
1322							delete whatever files it downloaded for you in the first place.
1323
1324							However, if you would like fetchware to preserve your C<destination_directory>,
1325							you can set the boolean C<keep_destination_directory> configuration option to
1326							true, like C<keep_destination_directory 'True';>, to keep HTMLPageSync from
1327							deleting your destination directory.
1328
1329							=back
1330
1331							=head1 HOW App::FetchwareX::HTMLPageSync OVERRIDES App::Fetchware
1332
1333							This section documents how App::FetchwareX::HTMLPageSync overrides
1334							App::Fetchware's API, and is only interesting if you're debugging
1335							App::FetchwareX::HTMLPageSync, or you're writing your own App::Fetchware
1336							extension. If not, you don't need to know these details.
1337
1338							=head2 App::Fetchware API Subroutines
1339
1340							=head3 new()
1341
1342							HTMLPageSync overrides new(), and implements its own Q&A wizard interface
1343							helping users create HTMLPageSync Fetchwarefiles.
1344
1345							=head3 new_install()
1346
1347							HTMLPageSync just inherits App::Fetchware's new_install(), which just asks the
1348							user if they would like Fetchware to install the already generated
1349							Fetchwarefile.
1350
1351							=head3 check_syntax()
1352
1353							check_syntax() is also overridden to check HTMLPageSync's own Fetchware-level
1354							syntax.
1355
1356							=head3 start() and end()
1357
1358							HTMLPageSync just imports start() and end() from App::Fetchware to take
1359							advantage of their ability to manage a temporary directory.
1360
1361							=head3 lookup()
1362
1363							lookup() is overridden, and downloads the C<html_page_url>, which is the main
1364							configuration option that HTMLPageSync uses. Then lookup() parses that
1365							C<html_page_url>, and determines what the download urls should be. If the
1366							C<html_treebuilder_callback> and C<download_links_callback> exist, then they are
1367							called to customize lookup()'s default behavior. See their descriptions below.
1368
1369							=head3 download()
1370
1371							download() downloads the array ref of download links that lookup() returns.
1372
1373							=head3 verify()
1374
1375							verify() is overridden to do nothing.
1376
1377							=head3 unarchive()
1378
1379							unarchive() is overridden to do nothing.
1380
1381							=head3 build()
1382
1383							build() is overridden to do nothing.
1384
1385							=head3 install()
1386
1387							install() takes its argument, which is an arrayref of the paths of the
1388							files that were downloaded to the tempdir created by start(), and copies them to
1389							the user's provided C<destination_directory>.
1390
1391							=head3 end() and start()
1392
1393							HTMLPageSync just imports end() and start() from App::Fetchware to take
1394							advantage of their ability to manage a temporary directory.
1395
1396							=head3 uninstall()
1397
1398							uninstall() recursively deletes your C<destination_directory> where it stores
1399							whatever links you choose to download unless of course the
1400							C<keep_destination_directory> configuration option is set to true.
1401
1402							=head3 upgrade()
1403
1404							Determines if any looked up URLs have not been downloaded yet, and returns true
1405							if that is the case.
1406
1407							=head2 App::FetchwareX::HTMLPageSync's Configuration Subroutines
1408
1409							Because HTMLPageSync is a App::Fetchware extension, it can not just use the same
1410							configuration subroutines that App::Fetchware uses. Instead, it must create its
1411							own configuration subroutines with App::Fetchware::CreateConfigOptions. These
1412							configuration subroutines are the configuration options that you use in your
1413							App::Fetchware or App::Fetchware extension.
1414
1415							=head3 page_name [MANDATORY]
1416
1417							HTMLPageSync's equivalent to App::Fetchware's C<program>. It's simply the
1418							name of the page or what you want to download on that page.
1419
1420							=head3 html_page_url [MANDATORY]
1421
1422							HTMLPageSync's equivalent to App::Fetchware's C<lookup_url>, and is just as
1423							mandatory. This is the url of the HTML page that will be downloaded and
1424							processed.
1425
1426							=head3 destination_directory [MANDATORY]
1427
1428							This option is also mandatory, and it specifies the directory where the files
1429							that you want to download are downloaded to.
1430
1431							=head3 user_agent [OPTIONAL]
1432
1433							This option is optional, and it allows you to have HTTP::Tiny pretend to be a
1434							Web browser or perhaps bot if you want to.
1435
1436							=head3 html_treebuilder_callback [OPTIONAL]
1437
1438							This optional option allows you to specify a perl C<CODEREF> that lookup() will
1439							execute instead of its default callback that just looks for images.
1440
1441							It receives one parameter, which is an HTML::Element at the first C<a>,
1442							anchor/link tag.
1443
1444							It must C<return 'True';> to indicate that that link should be included in the
1445							list of download links, or return false, C<return undef>, to indicate that that
1446							link should not be included in the list of download links.
1447
1448							=head3 download_links_callback [OPTIONAL]
1449
1450							This optional option specifies an optional callback that will allow you to do
1451							post processing of the list of downloaded urls. This is needed, because the
1452							results of the C<html_treebuilder_callback> are still HTML::Element objects that
1453							need to be converted to just string download urls. That is what the default
1454							C<download_links_callback> does.
1455
1456							It receives a list of all of the download HTML::Elements that
1457							C<html_treebuilder_callback> returned true on. It is called only once, and
1458							should return a list of string download links for download later by HTTP::Tiny
1459							in download().
1460
1461							=head3 keep_destination_directory [OPTIONAL]
1462
1463							This optional option is a boolean true or false configuration option that
1464							when true prevents HTMLPageSync from deleting your destination_directory when
1465							you run fetchware uninstall.
1466
1467							Its default is false, so by default HTMLPageSync B<will> delete your files from
1468							your C<destination_directory> unless you set this to true.
1469
1470							=head1 ERRORS
1471
1472							As with the rest of App::Fetchware, App::Fetchware::Config does not return any
1473							error codes; instead, all errors are die()'d if it's App::Fetchware::Config's
1474							error, or croak()'d if it's the caller's fault. These exceptions are simple
1475							strings, and are listed in the L</DIAGNOSTICS> section below.
1476
1477							=head1 CAVEATS
1478
1479							Certain features of App::FetchwareX::HTMLPageSync require knowledge of the Perl
1480							programming language in order for you to make use of them. However, this is
1481							limited to optional callbacks that are not needed for most uses. These features
1482							are the C<html_treebuilder_callback> and C<download_links_callback> callbacks.
1483
1484							=head1 AUTHOR
1485
1486							David Yingling
1487
1488							=head1 COPYRIGHT AND LICENSE
1489
1490							This software is copyright (c) 2016 by David Yingling.
1491
1492							This is free software; you can redistribute it and/or modify it under
1493							the same terms as the Perl 5 programming language system itself.
1494
1495							=cut
1496
1497							__END__