File Coverage

blib/lib/App/FetchwareX/HTMLPageSync.pm
Criterion Covered Total %
statement 57 188 30.3
branch 3 28 10.7
condition 0 3 0.0
subroutine 19 31 61.2
pod 13 13 100.0
total 92 263 34.9


line stmt bran cond sub pod time code
1             package App::FetchwareX::HTMLPageSync;
2             our $VERSION = '1.016'; # VERSION: generated by DZP::OurPkgVersion
3             # ABSTRACT: An App::Fetchware extension that downloads files based on an HTML page.
4 1     1   7703 use strict;
  1         1  
  1         23  
5 1     1   4 use warnings;
  1         1  
  1         19  
6              
7             # Enable Perl 6 knockoffs, and use 5.10.1, because smartmatching and other
8             # things in 5.10 were changed in 5.10.1+.
9 1     1   17 use 5.010001;
  1         3  
10              
11             # Use fetchware's API's to help us out.
12 1     1   3 use App::Fetchware::Util ':UTIL';
  1         1  
  1         162  
13 1     1   5 use App::Fetchware::Config ':CONFIG';
  1         1  
  1         90  
14 1     1   411 use App::Fetchware::Fetchwarefile;
  1         2  
  1         28  
15 1         145 use App::Fetchware qw(
16             :OVERRIDE_NEW
17             :OVERRIDE_NEW_INSTALL
18             :OVERRIDE_CHECK_SYNTAX
19 1     1   5 );
  1         0  
20              
21             # Local imports.
22 1     1   4 use File::Copy 'cp';
  1         2  
  1         36  
23 1     1   4 use File::Path 'remove_tree';
  1         1  
  1         40  
24 1     1   3 use URI::Split 'uri_split';
  1         1  
  1         41  
25 1     1   4 use File::Spec 'splitpath';
  1         0  
  1         14  
26 1     1   3 use Data::Dumper;
  1         1  
  1         32  
27 1     1   4 use Scalar::Util 'blessed';
  1         1  
  1         45  
28              
29             # Use App::Fetchware::ExportAPI to specify which App::Fetchware API subroutines
30             # we are going to "KEEP", import from App::Fetchware, and which API subs we are
31             # going to "OVERRIDE", implemented here in this package.
32             #
33             # ExportAPI takes care of the grunt work for us by setting our package's @EXPORT
34             # appropriately, and even importing Exporter's import() method into our package
35             # for us, so that our App::Fetchware API subroutines and configuration options
36             # specified below can be import()ed properly.
37             use App::Fetchware::ExportAPI
38             # KEEP or "inherit" new_install, because I want my new_install to just call
39             # ask_to_install_now_to_test_fetchwarefile(), and App::Fetchware's does that
40             # already for me. And start() and end() are to create and manage the
41             # temporary directory for me, so I don't have to worry about polluting the
42             # current working directory with temporary files.
43 1         7 KEEP => [qw(new_install start end)],
44             # OVERRIDE everything else.
45             OVERRIDE =>
46             [qw(new check_syntax lookup download verify unarchive build install
47             uninstall upgrade)]
48 1     1   403 ;
  1         2  
49              
50              
51             # Use App::Fetchware::CreateConfigOptions to build our App::Fetchware
52             # configuration options for us. These are subroutines with correct prototypes to
53             # turn a perl code file into something that resembles a configuration file.
54             use App::Fetchware::CreateConfigOptions
55 1         6 ONE => [qw(
56             page_name
57             html_page_url
58             destination_directory
59             user_agent
60             html_treebuilder_callback
61             download_links_callback
62             )],
63             BOOLEAN => [qw(keep_destination_directory)]
64 1     1   5 ;
  1         2  
65              
66              
67 1     1   4 use Exporter 'import';
  1         1  
  1         1719  
68             our %EXPORT_TAGS = (
69             TESTING => [qw(
70             get_html_page_url
71             get_destination_directory
72             ask_about_keep_destination_directory
73             new
74             new_install
75             )]
76             );
77             our @EXPORT_OK = map {@{$_}} values %EXPORT_TAGS;
78              
79              
80              
81              
82              
# new($term, $page_name)
#
# Implements HTMLPageSync's "new" command. Interactively asks the user (through
# the $term object) for the configuration options, records them in an
# App::Fetchware::Fetchwarefile object, and offers to let the user edit the
# result by hand.
#
# Parameters:
#   $term      - terminal object handed on to the prompting helpers.
#   $page_name - optional page name; passed to fetchwarefile_name() to settle
#                the final name.
#
# Returns the list ($page_name, $fetchwarefile), where $fetchwarefile is
# either the generated Fetchwarefile or whatever edit_manually() returned when
# it did not give the Fetchwarefile object back.
sub new {
    my ($term, $page_name) = @_;

    # Help text for each configuration option. It is used both as the
    # Fetchwarefile's per-option description and as the explanation printed
    # before prompting for the optional options, so it is defined only once.
    my %description_for = (
        page_name => <<EOA,
page_name simply names the HTML page the Fetchwarefile is responsible for
downloading, analyzing via optional callbacks, and copying to your
destination_directory.
EOA
        html_page_url => <<EOA,
html_page_url is HTMLPageSync's lookup_url equivalent. It specifies a HTTP url
that returns a page of HTML that can be easily parsed of links to later
download.
EOA
        destination_directory => <<EOA,
destination_directory is the directory on your computer where you want the files
that you configure HTMLPageSync to parse to be copied to.
EOA
        user_agent => <<EOA,
user_agent, if specified, will be passed to HTTP::Tiny, the Perl HTTP library
Fetchware uses, where the library will lie to the Web server you are Web
scraping from to hopefully prevent the Web sever from banning you, or updating
the page you want to scrap to use too much Javascript, which would prevent the
simple parser HTMLPageSync uses from working on the specified html_page_url.
EOA
        html_treebuilder_callback => <<EOA,
html_treebuilder_callback allows you to specify a perl CODEREF that HTMLPageSync
will execute instead of its default callback that just looks for images.

It receives one parameter, which is an HTML::Element at the first <a>,
anchor/link tag.

It must [return 'True';] to indicate that that link should be included in the
list of download links, or return false, [return undef], to indicate that that
link should not be included in the list of download links.
EOA
        download_links_callback => <<EOA,
download_links_callback specifies an optional callback that will allow you to do
post processing of the list of downloaded urls. This is needed, because the
results of the html_treebuilder_callback are still HTML::Element objects that
need to be converted to just string download urls. That is what the default
download_links_callback does.

It receives a list of all of the download HTML::Elements that
html_treebuilder_callback returned true on. It is called only once, and
should return a list of string download links for download later by
HTMLPageSync.
EOA
        keep_destination_directory => <<EOA,
keep_destination_directory is a boolean true or false configuration option that
when true prevents HTMLPageSync from deleting your destination_directory when
you run fetchware uninstall.
EOA
    );

    # Extra paragraph appended to the callback prompts: a callback cannot be
    # typed on a single input line, so a placeholder default is offered.
    my $one_line_note = <<EOP;

Because Term::UI's imput is limited to just one line, please just press enter,
and a dummy value will go into your Fetchwarefile, where you can then replace
that dummy value with a proper Perl callback next, when Fetchware gives you the
option to edit your Fetchwarefile manually.
EOP

    # Instantiate a new Fetchwarefile object for managing and generating a
    # Fetchwarefile, which we'll write to a file for the user or use to
    # build an associated Fetchware package.
    my $now = localtime;
    my $fetchwarefile = App::Fetchware::Fetchwarefile->new(
        header => <<EOF,
use App::FetchwareX::HTMLPageSync;
# Auto generated $now by HTMLPageSync's fetchware new command.
# However, feel free to edit this file if HTMLPageSync's new command's
# autoconfiguration is not enough.
#
# Please look up HTMLPageSync's documentation of its configuration file syntax at
# perldoc App::FetchwareX::HTMLPageSync, and only if its configuration file
# syntax is not malleable enough for your application should you resort to
# customizing fetchware's behavior. For extra flexible customization see perldoc
# App::FetchwareX::HTMLPageSync.
EOF
        descriptions => { %description_for },
    );

    extension_name(__PACKAGE__);

    opening_message(<<EOM);
HTMLPageSync's new command is not as sophistocated as Fetchware's. Unless you
only want to download images, you will have to get your hands dirty, and code up
some custom Perl callbacks to customize HTMLPageSync's behavior. However, it
will ask you quite nicely the basic options, so if those are all you need, then
this command will successfully generate a HTMLPageSync Fetchwarefile for you.

After it lets you choose the easy options of page_name, html_page_url,
and destination_directory, it will give you an opportunity to modify the
user_agent string HTMLPageSync uses to avoid betting banned or having your
scraping stick out like a sore thumb in the target Web server's logs. Then,
you'll be asked about the advanced options. If you want them it will add generic
ones to the Fetchwarefile that you can then fill in later on when HTMLPageSync
asks you if you want to edit the generated Fetchwarefile manually. Finally,
after your Fetchwarefile is generated HTMLPageSync will ask you if you would
like to install your generated Fetchwarefile to test it out.
EOM

    # Ask the user for the basic configuration options.
    $page_name = fetchwarefile_name(page_name => $page_name);
    vmsg("Determined your page_name option to be [$page_name]");

    $fetchwarefile->config_options(page_name => $page_name);
    vmsg("Appended page_name [$page_name] configuration option to Fetchwarefile");

    my $html_page_url = get_html_page_url($term);
    vmsg("Asked user for html_page_url [$html_page_url] from user.");

    $fetchwarefile->config_options(html_page_url => $html_page_url);
    vmsg("Appended html_page_url [$html_page_url] configuration option to Fetchwarefile");

    my $destination_directory = get_destination_directory($term);
    vmsg("Asked user for destination_directory [$destination_directory] from user.");

    $fetchwarefile->config_options(destination_directory => $destination_directory);
    vmsg(<<EOM);
Appended destination_directory [$destination_directory] configuration option to
your Fetchwarefile.
EOM

    # Asks and sets the keep_destination_directory configuration option if the
    # user wants to set it.
    ask_about_keep_destination_directory($term, $fetchwarefile);

    vmsg('Prompting for other options that may be needed.');
    my $other_options_hashref = prompt_for_other_options($term,
        user_agent => {
            prompt   => "What user_agent configuration option would you like?\n",
            print_me => $description_for{user_agent},
        },
        html_treebuilder_callback => {
            prompt   => "What html_treebuilder_callback configuration option would you like?\n",
            print_me => $description_for{html_treebuilder_callback} . $one_line_note,
            default  => 'sub { my $h = shift; die "Dummy placeholder fill me in."; }',
        },
        download_links_callback => {
            prompt   => "What download_links_callback configuration option would you like?\n",
            print_me => $description_for{download_links_callback} . $one_line_note,
            default  => 'sub { my @download_urls = @_; die "Dummy placeholder fill me in."; }',
        },
    );
    vmsg('User entered the following options.');
    vmsg(Dumper($other_options_hashref));

    # Append all other options to the Fetchwarefile.
    $fetchwarefile->config_options(%$other_options_hashref);
    vmsg('Appended all other options listed above to Fetchwarefile.');

    my $edited_fetchwarefile = edit_manually($term, $fetchwarefile);
    vmsg(<<EOM);
Asked user if they would like to edit their generated Fetchwarefile manually.
EOM

    # If edit_manually() handed the Fetchwarefile object back, the user did
    # not edit it, so generate the Fetchwarefile from the object.
    if (blessed($edited_fetchwarefile)
        and $edited_fetchwarefile->isa('App::Fetchware::Fetchwarefile')) {
        $fetchwarefile = $fetchwarefile->generate();
    }
    # Otherwise edit_manually() returned the user's edited Fetchwarefile, so
    # use that in place of the object.
    else {
        $fetchwarefile = $edited_fetchwarefile;
    }

    # Whatever variables the new() API subroutine returns are written via a pipe
    # back to the parent, and then the parent reads the variables back, and
    # makes them available to new_install(), back in the parent, as arguments.
    return $page_name, $fetchwarefile;
}
292              
293              
294              
# get_html_page_url($term)
#
# Prompts the user, via $term->get_reply(), for the html_page_url
# configuration option and returns the reply.
#
# The accepted answer must *start* with a supported scheme. The pattern is
# anchored so that arbitrary text that merely contains a URL is rejected, and
# https is accepted alongside http.
sub get_html_page_url {
    my $term = shift;

    # Prompt for html_page_url.
    my $html_page_url = $term->get_reply(
        print_me => <<EOP,
Fetchware's heart and soul is its html_page_url. This is the configuration option
that tells fetchware where to check if any new links have been added to the
specified Web page that match your criteria for download.

How to determine your application's html_page_url:
1. Simply specify the URL that of the Web page that has the images that you
would like to have Fetchware download for you.
EOP
        prompt => q{What is your Web page's html_page_url? },
        allow  => qr!\A(?:ftp|https?|file)://!,
    );

    return $html_page_url;
}
315              
316              
317              
# get_destination_directory($term)
#
# Prompts the user, via $term->get_reply(), for the destination_directory
# configuration option and returns the reply unchanged.
sub get_destination_directory {
    my $term = shift;

    # Prompt for destination_directory.
    my $destination_directory = $term->get_reply(
        print_me => <<EOP,
destination_directory is the directory on your computer where you want the files
that you configure HTMLPageSync to parse to be copied to.
EOP
        prompt => q{What is your destination_directory? },
    );

    return $destination_directory;
}
331              
332              
333              
# ask_about_keep_destination_directory($term, $fetchwarefile)
#
# Asks the user (via $term->ask_yn()) whether deleting destination_directory
# on uninstall is acceptable. When the user answers no, appends
# keep_destination_directory 'True' to $fetchwarefile; when the user answers
# yes (the default), nothing is added.
#
# The question is phrased as "Is deleting ... OK?", so a true answer means
# "go ahead and delete". The option must therefore only be set on a false
# answer, which is why the condition is negated.
sub ask_about_keep_destination_directory {
    my ($term, $fetchwarefile) = @_;

    my $deleting_is_ok = $term->ask_yn(
        print_me => <<EOP,
By default, HTMLPageSync deletes your destination_directory when you uninstall
that destination_directory's assocated Fetchware package or Fetchwarefile. This
is done, because your deleting the Fetchware package, so it makes sense to
delete that package's associated data.

If you wish to keep your destination_directory after you uninstall this
HTMLPageSync Fetchware package, then answer N below.
EOP
        prompt  => 'Is deleting your destination_directory on uninstall OK? ',
        default => 'y',
    );

    # Deleting is fine, so the default behaviour stands and nothing is added.
    return if $deleting_is_ok;

    vmsg(<<EOM);
User wants [keep_destination_directory 'True';] added to their Fetchwarefile.
EOM

    $fetchwarefile->config_options(keep_destination_directory => 'True');
    vmsg(<<EOM);
Appended [keep_destination_directory 'True';] to user's Fetchwarefile.
EOM

    return;
}
362              
363              
364              
365              
366              
# check_syntax()
#
# Validates the already parsed Fetchwarefile by handing check_config_options()
# the list of mandatory options, each paired with the error message to use
# when that option is missing. Returns whatever check_config_options()
# returns.
sub check_syntax {

    # Use check_config_options() to run config() a bunch of times to check the
    # already parsed Fetchwarefile.
    return check_config_options(
        Mandatory => [ 'page_name', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a page_name configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'html_page_url', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a html_page_url configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'destination_directory', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a destination_directory
configuration option. Please add one, and try again.
EOM
    );
}
386              
387              
388              
389              
390              
###BUGALERT### lookup() returns all files each time it is run; therefore, it
#breaks the way Fetchware is supposed to work! lookup() is supposed to return
#"the latest version." And in HTMLPageSync's case, it should not include files
#already downloaded, because it should only return "new files" by comparing the
#"available list of files" to the "already downloaded one."
#
# lookup()
#
# Downloads the page named by the html_page_url configuration option, parses
# it with HTML::TreeBuilder, and selects <a> elements using either the user's
# html_treebuilder_callback or a built-in callback that accepts links ending
# in an image extension. The selected elements are converted to URL strings by
# the user's download_links_callback or by a built-in callback that takes each
# element's href attribute. Links that do not start with a URL scheme get
# html_page_url and a '/' prepended.
#
# Returns an array reference of download URL strings.
sub lookup {
    msg
    "Looking up download urls using html_page_url [@{[config('html_page_url')]}]";

    my $html_page_url = config('html_page_url');
    my $user_agent    = config('user_agent');

    ###BUGALERT### Create a user changeable version of lookup_check_args??(), so
    #that App::Fetchware 'subclasses' can use it.
    # Download the url the user specified. The option is passed as 'agent',
    # the same name download() uses for it.
    my $filename = defined $user_agent
        ? download_http_url($html_page_url, agent => $user_agent)
        : download_http_url($html_page_url);
    vmsg("Downloaded html_page_url to local file [$filename].");

    # This file never loads HTML::TreeBuilder itself, so load it here rather
    # than relying on some other module having done so.
    require HTML::TreeBuilder;

    # Create a HTML::TreeBuilder object for the now downloaded file, and parse
    # $filename into a HTML::Element tree.
    my $tree = HTML::TreeBuilder->new();
    $tree->parse_file($filename);
    vmsg('Created HTML::TreeBuilder object to parse downloaded html file.');

    my $tree_callback = config('html_treebuilder_callback');
    if ($tree_callback) {
        vmsg(<<EOM);
Using user supplied html_treebuilder_callback to parse downloaded HTML file:
[
$tree_callback
]
EOM
    }
    else {
        vmsg(<<EOM);
Using built-in default html_treebuilder_callback that only wants images.
EOM
        # Accept only anchors whose href ends in an image extension.
        $tree_callback = sub {
            my $tag  = shift;
            my $link = $tag->attr('href');
            return undef unless defined $link;
            return $link =~ /\.(jpg|jpeg|png|bmp|tiff?|gif)$/ ? 'True' : undef;
        };
    }

    # Find the links that match our default callback or the user specified one
    # if the user specified one.
    my @download_urls = $tree->look_down(
        _tag => 'a',
        $tree_callback
    );
    vmsg(<<EOM);
Determined download urls to be:
@download_urls
EOM

    # Sort through the list of HTML::Element tags to finalize the list to
    # download.
    my $links_callback = config('download_links_callback');
    if ($links_callback) {
        vmsg(<<EOM);
Determined download_links_callback to be user specified:
[
$links_callback
]
EOM
    }
    else {
        # Strip off the HTML::Element wrapper by default, leaving just hrefs.
        $links_callback = sub {
            vmsg(<<EOM);
Using built-in default download_links_callback that turns HTML::Elements into
download urls.
EOM
            # Must return them, because this coderef was called by value not
            # by reference.
            return map { $_->attr('href') } @_;
        };
    }

    # Call download_links_callback or call default one to strip off
    # the HTML::Element wrapper.
    @download_urls = $links_callback->(@download_urls);
    vmsg(<<EOM);
Determined download urls to be:
[
@download_urls
]
EOM

    # The download_urls may be relative links instead of absolute links.
    # Relative ones could just be filenames without any knowledge of what the
    # actual server or path or even scheme is. Fix this by prepending
    # html_page_url to each link if there is no scheme. Any "scheme://" prefix
    # counts as absolute, so https links are left untouched too.
    for my $download_url (@download_urls) {
        next if $download_url =~ m{\A[A-Za-z][A-Za-z0-9+.-]*://};
        $download_url = $html_page_url . '/' . $download_url;
    }

    # Return a ref to the array of download urls, because lookup()'s API only
    # allows it to return a single value, but that single value does not have to
    # be a scalar. It can be an array ref, which is used here. This works, because
    # what is returned here by lookup() is passed unchanged to download(), which
    # is also part of this API, so I can use what I return here as I please
    # inside download().
    return \@download_urls;
}
518              
519              
520              
# download($temp_dir, $download_url)
#
# Downloads every URL in the array reference $download_url (as returned by
# lookup()) with download_http_url(), passing the user_agent configuration
# option along as 'agent' when it is defined.
#
# Parameters:
#   $temp_dir     - accepted as part of the API, but not used here.
#   $download_url - array reference of URL strings to download.
#
# Returns an array reference of the paths download_http_url() returned, in
# the same order as the URLs.
sub download {
    my ($temp_dir, $download_url) = @_;

    msg('Downloading the download urls lookup() determined.');

    # The option does not change while looping, so look it up only once.
    my $user_agent = config('user_agent');

    my @download_file_paths;
    # Loop over @$download_url to download all user specified URLs to temp_dir.
    for my $url (@$download_url) {
        my $downloaded_file;
        # Use user specified agent if they asked for it.
        if (defined $user_agent) {
            vmsg(<<EOM);
Downloadig url
[$url]
using the user specified user_agent
[$user_agent]
EOM
            $downloaded_file = download_http_url($url, agent => $user_agent);
        }
        else {
            vmsg("Downloading url [$url].");
            $downloaded_file = download_http_url($url);
        }
        push @download_file_paths, $downloaded_file;
    }

    local $" = "\n"; # print each @download_file_paths on its own line.
    vmsg(<<EOM);
Downloaded specified urls to the following paths:
[
@download_file_paths
]
EOM

    # AKA $package_path.
    return \@download_file_paths;
}
558              
559              
560              
# verify()
#
# HTMLPageSync has nothing to verify, so this only logs that the step is
# skipped and returns whatever do_nothing() returns.
sub verify {
    vmsg(<<EOM);
Skipping verify subroutine, because HTMLPageSync does not need to verify anything
EOM
    return do_nothing();
}
567              
568              
569              
# unarchive()
#
# HTMLPageSync has nothing to unarchive, so this only logs that the step is
# skipped and returns whatever do_nothing() returns.
sub unarchive {
    vmsg(<<EOM);
Skipping unarchive subroutine, because HTMLPageSync does not need to unarchive
anything
EOM
    return do_nothing();
}
577              
578              
579              
# build()
#
# HTMLPageSync has nothing to build, so this only logs that the step is
# skipped and returns whatever do_nothing() returns.
sub build {
    vmsg(<<EOM);
Skipping build subroutine, because HTMLPageSync does not need to build anything
EOM
    return do_nothing();
}
586              
587              
588              
# install($download_file_paths)
#
# Copies every file listed in the array reference $download_file_paths (the
# value download() returned) into the destination_directory configuration
# option using File::Copy's cp().
#
# Dies if destination_directory is not an existing directory, or if any copy
# fails. Returns a true string on success.
sub install {
    # AKA $package_path.
    my $download_file_paths = shift;

    msg(<<EOM);
Copying files downloaded to a local temp directory to final destination directory.
EOM

    my $destination_directory = config('destination_directory');

    # cp() treats a target that is not an existing directory as a file name,
    # so every download would be copied onto that same path, each overwriting
    # the last. Refuse to continue instead.
    unless (defined $destination_directory and -d $destination_directory) {
        die <<EOD;
App-FetchwareX-HTMLPageSync: run-time error. The destination directory
[@{[ defined $destination_directory ? $destination_directory : '' ]}] does not
exist or is not a directory. Please create it, and try again.
EOD
    }

    # Copy over the files that have been returned by download().
    for my $file_path (@$download_file_paths) {
        vmsg(<<EOM);
Copying [$file_path] -> [$destination_directory].
EOM
        ###BUGALERT### Should this die and all the rest be croaks instead???
        cp($file_path, $destination_directory) or die <<EOD;
App-FetchwareX-HTMLPageSync: run-time error. Fetchware failed to copy the file [$file_path] to the
destination directory [$destination_directory].
The OS error was [$!].
EOD
    }

    vmsg('Successfully copied files to destination directory.');

    return 'True indicating success!';
}
614              
615              
616              
617              
618              
619             sub uninstall {
620 1     1 1 2823 my $build_path = shift;
621              
622             # Only delete destination_directory if keep_destination_directory is false.
623 1 50       11 unless (config('keep_destination_directory')) {
624              
625 1         9 msg <
626             Uninstalling this HTMLPageSync package by deleting your destination directory.
627             EOM
628              
629             ###BUGALERT### Before release go though all of Fetchware's API, and subifiy
630             #each main component like lookup and download were, the later ones were not
631             #done this way. That way I can put say chdir_to_build_path() here instead of
632             #basicaly copying and pasting the code like I do below. Also
633             #chdir_to_build_path() can be put in :OVERRIDE_UNINSTALL!!! Which I can use
634             #here.
635 1 50       18 chdir $build_path or die <
636             App-FetchwareX-HTMLPageSync: Failed to uninstall the specified package and specifically to change
637             working directory to [$build_path] before running make uninstall or the
638             uninstall_commands provided in the package's Fetchwarefile. Os error [$!].
639             EOD
640              
641 1 50       5 if ( defined config('destination_directory')) {
642             # Use File::Path's remove_tree() to delete the destination_directory
643             # thereby "uninstalling" this package. Will throw an exception that I'll
644             # let the main eval in bin/fetchware catch, print, and exit 1.
645 0         0 vmsg <
646 0         0 Deleting entire destination directory [@{[config('destination_directory')]}].
647             EOM
648 0         0 remove_tree(config('destination_directory'));
649             } else {
650 1         10 die <<EOD;
651             App-FetchwareX-HTMLPageSync: Failed to uninstall the specified App::FetchwareX::HTMLPageSync
652             package, because no destination_directory is specified in its Fetchwarefile.
653             This configuration option is required and must be specified.
654             EOD
655             }
656             # keep_destination_directory was set, so don't delete destination directory.
657             } else {
658 0           msg <<EOM;
659             Uninstalling this HTMLPageSync package but keeping your destination directory.
660             EOM
661              
662             }
663              
664 0           return 'True for success.';
665             }
666              
667              
668              
669              
670             sub upgrade {
671 0     0 1   my $download_path = shift; # $fetchware_package_path is not used in HTMLPageSync.
672              
673             # Get the listing of already downloaded file names.
674 0           my @installed_downloads = glob(config('destination_directory'));
675              
676             # Preprocess both @$download_path and @installed_downloads to ensure that
677             # URL crap or differing full paths won't screw up the "comparisons". The
678             # clever delete hashslice does the "comparisons" if you will.
679 0           my @download_path_filenames = map { ( uri_split($_) )[2] } @$download_path;
  0            
680 0           my @installed_downloads_filenames = map { ( splitpath($_) ) [2] }
  0            
681             @installed_downloads;
682              
683             # Determine what files are in @$download_path, but not in
684             # @installed_downloads.
685             # Algo based on code from Perl Cookbook pg. 126.
686 0           my %seen;
687 0           @seen{@$download_path} = ();
688 0           delete @seen{@installed_downloads};
689              
690 0           my @new_urls_to_download = keys %seen;
691              
692 0 0         if (@new_urls_to_download > 0) {
693             # Alter $download_path to only list @new_urls_to_download. That way
694             # download() only downloads the new URLs not the already downloaded ones
695             # again.
696 0           $download_path = [@new_urls_to_download];
697              
698 0           return 'New URLs Found.';
699             } else {
700 0           return;
701             }
702             }
703              
704              
705             1;
706              
707             =pod
708              
709             =head1 NAME
710              
711             App::FetchwareX::HTMLPageSync - An App::Fetchware extension that downloads files based on an HTML page.
712              
713             =head1 VERSION
714              
715             version 1.016
716              
717             =head1 SYNOPSIS
718              
719             =head2 Example App::FetchwareX::HTMLPageSync Fetchwarefile.
720              
721             page_name 'Cool Wallpapers';
722              
723             html_page_url 'http://some-html-page-with-cool.urls';
724              
725             destination_directory 'wallpapers';
726              
727             # pretend to be firefox
728             user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1';
729              
730             # Customize the callbacks.
731             html_treebuilder_callback sub {
732             # Get one HTML::Element.
733             my $h = shift;
734              
735             # Return true or false to indicate if this HTML::Element should be a
736             # download link.
737             if (something) {
738             return 'True';
739             } else {
740             return undef;
741             }
742             };
743              
744             download_links_callback sub {
745             my @download_urls = @_;
746              
747             my @wanted_download_urls;
748             for my $link (@download_urls) {
749             # Pick ones to keep.
750             push @wanted_download_urls, $link;
751             }
752              
753             return @wanted_download_urls;
754             };
755              
756             =head2 App::FetchwareX::HTMLPageSync App::Fetchware-like API.
757              
758             my $temp_file = start();
759              
760             my $download_url = lookup();
761              
762             download($temp_dir, $download_url);
763              
764             verify($download_url, $package_path);
765              
766             unarchive($package_path);
767              
768             build($build_path);
769              
770             install();
771              
772             uninstall($build_path);
773              
774             =head1 MOTIVATION
775              
776             I want to automatically parse a Web page with links to wall papers that I want
777             to download. Only I want software to do it for me. That's where this
778             App::Fetchware extension comes in.
779              
780             =head1 DESCRIPTION
781              
782             App::FetchwareX::HTMLPageSync is an example App::Fetchware extension. It's not
783             a large extension, but instead is a simple one meant to show how easy it is
784             to extend App::Fetchware.
785              
786             App::FetchwareX::HTMLPageSync parses the Web page you specify to create a list of
787             download links. Then it downloads those links, and installs them to your
788             C<destination_directory>.
789              
790             In order to use App::FetchwareX::HTMLPageSync to help you mirror the download
791             links on a HTML page you need to create a App::FetchwareX::HTMLPageSync
792             Fetchwarefile, you can do this easily by just running C<fetchware new>, and
793             typing in C<HTMLPageSync> when it asks you what extension of Fetchwarefile you
794             want to create.
795             L
796             Then you'll need to
797             L
798              
799             =head1 App::FetchwareX::HTMLPageSync API SUBROUTINES
800              
801             This is App::FetchwareX::HTMLPageSync's API that fetchware uses to execute any
802             Fetchwarefile's that make use of App::FetchwareX::HTMLPageSync. This API is the
803             same that regular old App::Fetchware uses for most standard FOSS software, and
804             this internal documentation is only needed when debugging HTMLPageSync's code or
805             when studying it to create your own fetchware extension.
806              
807             =head2 new()
808              
809             my ($program_name, $fetchwarefile) = new($term, $program_name);
810              
811             # Or in an extension, you can return whatever list of variables you want,
812             # and then cmd_new() will provide them as arguments to new_install() except
813             # a $term Term::ReadLine object will precede the others.
814             my ($term, $program_name, $fetchwarefile, $custom_argument1, $custom_argument2)
815             = new($term, $program_name);
816              
817             new() is App::Fetchware's API subroutine that implements fetchware's new
818             command. It simply uses Term::UI to ask the user some questions that determine
819             what configuration options will be added to the generated Fetchwarefile. new()
820             takes a $term, Term::UI/Term::Readline object, and the optional name of the
821             program or Website in this case that HTMLPageSync is page syncing.
822              
823             Whatever scalars (not references just regular strings) that new() returns will
824             be shared with new()'s sister API subroutine new_install() that is called after
825             new() is called by cmd_new(), which implements fetchware's new command.
826             new_install() is called in the parent process, so it does have root permissions,
827             so be sure to test it as root as well.
828              
829             =over
830              
831             =item drop_privs() NOTES
832              
833             This section notes whatever problems you might come across implementing and
834             debugging your Fetchware extension due to fetchware's drop_privs mechanism.
835              
836             See L.
837              
838             =over
839              
840             =item *
841              
842             This subroutine is B<not> run as root; instead, it is run as a regular user
843             unless the C<stay_root> configuration option has been set to true.
844              
845             =back
846              
847             =back
848              
849             =head3 get_html_page_url()
850              
851             my $html_page_url = get_html_page_url($term);
852              
853             Uses $term argument as a L/L object to interactively
854             explain what a L is, and to ask the user to provide one and press
855             enter.
856              
857             =head3 get_destination_directory()
858              
859             my $destination_directory = get_destination_directory($term);
860              
861             Uses $term argument as a L/L object to interactively
862             explain what a C is, and to ask the user to provide one
863             and press enter.
864              
865             =head3 ask_about_keep_destination_directory()
866              
867             ask_about_keep_destination_directory($term, $fetchwarefile);
868              
869             ask_about_keep_destination_directory() does just that: it asks the user if they
870             would like to enable the C<keep_destination_directory> configuration option to
871             preserve their C<destination_directory> when they uninstall the associated
872             Fetchware package or Fetchwarefile. If they answer Y,
873             C<keep_destination_directory> is added to their Fetchwarefile, and if not
874             nothing is added, because deleting their C<destination_directory> is the
875             default that will happen even if the C<keep_destination_directory> is not even
876             in the Fetchwarefile.
877              
878             =head2 new_install()
879              
880             my $fetchware_package_path = new_install($page_name, $fetchwarefile);
881              
882             new_install() asks the user if they would like to install the previously
883             generated Fetchwarefile that new() created. If they answer yes, then that
884             program associated with that Fetchwarefile is installed. In our case, that means
885             that whatever files are configured for download will be downloaded. If they
886             answer no, then the path to the generated Fetchwarefile will be printed.
887              
888             new_install() is imported by L from App::Fetchware,
889             and also exported by App::FetchwareX::HTMLPageSync. This is how
890             App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
891              
892             =head2 check_syntax()
893              
894             'Syntax Ok' = check_syntax()
895              
896             =over
897              
898             =item Configuration subroutines used:
899              
900             =over
901              
902             =item none
903              
904             =back
905              
906             =back
907              
908             Calls check_config_options() to check for the following syntax errors in
909             Fetchwarefiles. Note by the time check_syntax() has been called
910             parse_fetchwarefile() has already parsed the Fetchwarefile, and any syntax
911             errors in the user's Fetchwarefile will have already been reported by Perl.
912              
913             This may seem like a bug, but it's not. Do you really want to try to use regexes
914             or something to try to parse the Fetchwarefile reliably, and then report errors
915             to users? Or add PPI of all insane Perl modules as a dependency just to write
916             syntax checking code that most of the time says the syntax is Ok anyway, and
917             therefore a complete waste of time and effort? I don't want to deal with any of
918             that insanity.
919              
920             Instead, check_syntax() uses config() to examine the already parsed
921             Fetchwarefile for "higher-level" or "Fetchware-level" syntax errors. Syntax
922             errors that are B syntax errors instead of just Perl syntax errors.
923              
924             For yours and my own convenience I created check_config_options() helper
925             subroutine. It's data driven, and will check Fetchwarefiles for three different
926             types of common syntax errors that occur in App::Fetchware's Fetchwarefile
927             syntax. These errors are more at the level of I than actual syntax
928             errors. See its POD below for additional details.
929              
930             Below briefly lists what App::Fetchware's implementation of check_syntax()
931             checks.
932              
933             =over
934              
935             =item * Mandatory configuration options
936              
937             =over
938              
939             =item * page_name, html_page_url, and destination_directory are required for all Fetchwarefiles.
940              
941             =back
942              
943             =back
944              
945             =over
946              
947             =item drop_privs() NOTES
948              
949             This section notes whatever problems you might come across implementing and
950             debugging your Fetchware extension due to fetchware's drop_privs mechanism.
951              
952             See L.
953              
954             =over
955              
956             =item *
957              
958             check_syntax() is run in the parent process before even start() has run, so no
959             temporary directory is available for use.
960              
961             =back
962              
963             =back
964              
965             =head2 start()
966              
967             my $temp_file = start();
968              
969             start() creates a temp dir, chmod 700's it, and chdir()'s to it just like the one
970             in App::Fetchware does. App::FetchwareX::HTMLPageSync
971              
972             start() is imported use L from App::Fetchware,
973             and also exported by App::FetchwareX::HTMLPageSync. This is how
974             App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
975              
976             =head2 lookup()
977              
978             my $download_url = lookup();
979              
980             lookup() downloads the user specified C<html_page_url>, parses it using
981             HTML::TreeBuilder, and uses C<html_treebuilder_callback> and
982             C<download_links_callback> if specified to manipulate the tree to determine what
983             download urls the user wants.
984              
985             This list of download urls is returned as an array reference, $download_url.
986              
987             =head2 download()
988              
989             download($temp_dir, $download_url);
990              
991             download() uses App::Fetchware's utility function download_http_url() to
992             download all of the urls that lookup() returned. If the user specified a
993             C<user_agent> configuration option, then that option is passed along to
994             download_http_url()'s call to HTTP::Tiny.
995              
996             =head2 verify()
997              
998             verify($download_url, $package_path);
999              
1000             verify() simply calls App::Fetchware's :UTIL subroutine do_nothing(), which as
1001             you can tell from its name does nothing, but return. The reason for the useless
1002             do_nothing() call is simply for better documentation, and standardizing how to
1003             override a App::Fetchware API subroutine in order for it to do nothing at all,
1004             so that you can prevent the original App::Fetchware subroutine from doing what
1005             it normally does.
1006              
1007             =head2 unarchive()
1008              
1009             unarchive();
1010              
1011             unarchive() does nothing by calling App::Fetchware's :UTIL subroutine
1012             do_nothing(), which does nothing.
1013              
1014             =head2 build()
1015              
1016             build($build_path);
1017              
1018             build() does the same thing as verify(), and that is nothing by calling
1019             App::Fetchware's do_nothing() subroutine to better document the fact
1020             that it does nothing.
1021              
1022             =head2 install()
1023              
1024             install($package_path);
1025              
1026             install() takes the $package_path, which is really an array ref of the paths
1027             of the files that download() copied, and copies them to the user specified
1028             destination directory, C<destination_directory>.
1029              
1030             =head2 end()
1031              
1032             end();
1033              
1034             end() chdir()s back to the original directory, and cleans up the temp directory
1035             just like the one in App::Fetchware does. App::FetchwareX::HTMLPageSync
1036              
1037             end() is imported use L from App::Fetchware,
1038             and also exported by App::FetchwareX::HTMLPageSync. This is how
1039             App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware.
1040              
1041             =head2 uninstall()
1042              
1043             uninstall($build_path);
1044              
1045             Uninstalls App::FetchwareX::HTMLPageSync by recursively deleting the
1046             C<destination_directory> where it stores the wallpapers or whatever you
1047             specified it to download for you. If you would like to keep your
1048             C<destination_directory>, then set the C<keep_destination_directory> to true in
1049             your Fetchwarefile, and Fetchware will I<not> delete your
1050             C<destination_directory>, when you uninstall your Fetchware package.
1051              
1052             =head2 upgrade()
1053              
1054             my $upgrade = upgrade($download_path, $fetchware_package_path)
1055              
1056             if ($upgrade) {
1057             ...
1058             }
1059              
1060             =over
1061              
1062             =item Configuration subroutines used:
1063              
1064             =over
1065              
1066             =item none
1067              
1068             =back
1069              
1070             =back
1071              
1072             Uses $download_path, an arrayref of URLs to download in HTMLPageSync, and
1073             compares it against the list of files that has already been downloaded by
1074             glob()ing C. And then comparing the file names of the
1075             specified files.
1076              
1077             Returns true if $download_path has any URLs that have not already been
1078             downloaded into C<destination_directory>. Note: HEAD HTTP queries are B<not>
1079             used to check if any already downloaded files are I<newer> than the files in
1080             the C<destination_directory>.
1081              
1082             Returns false if $download_path is the same as C.
1083              
1084             =over
1085              
1086             =item drop_privs() NOTES
1087              
1088             This section notes whatever problems you might come across implementing and
1089             debugging your Fetchware extension due to fetchware's drop_privs mechanism.
1090              
1091             See L.
1092              
1093             =over
1094              
1095             =item *
1096              
1097             upgrade() is run in the B<child> process as nobody or C<user>, because the child
1098             needs to know if it should actually bother running the rest of fetchware's API
1099             subroutines.
1100              
1101             =back
1102              
1103             =back
1104              
1105             =head1 MANUALLY CREATING A App::FetchwareX::HTMLPageSync FETCHWAREFILE
1106              
1107             In order to use App::FetchwareX::HTMLPageSync you must first create a
1108             Fetchwarefile to use it. You can use C<fetchware new> as explained above, or
1109             create one manually in your text editor.
1110              
1111             =over
1112              
1113             =item B<1. Name it>
1114              
1115             Use your text editor to create a file with a C<.Fetchwarefile> file extension.
1116             Use of this convention is not required, but it makes it obvious what type of
1117             file it is. Then, just copy and paste the example text below, and replace
1118             C<[page_name]> with what you choose your C to be. C is
1119             simply a configuration option that simply names your Fetchwarefile. It is not
1120             actually used for anything other than to name your Fetchwarefile to document
1121             what program or behavior this Fetchwarefile manages.
1122              
1123             use App::FetchwareX::HTMLPageSync;
1124              
1125             page_name '[page_name]';
1126              
1127             Fetchwarefiles are actually small, well structured, Perl programs that can
1128             contain arbitrary perl code to customize fetchware's behavior, or, in most
1129             cases, simply specify a number of fetchware or a fetchware extension's (as in
1130             this case) configuration options. Below is my filled in example
1131             App::FetchwareX::HTMLPageSync fetchwarefile.
1132              
1133             use App::FetchwareX::HTMLPageSync;
1134              
1135             page_name 'Cool Wallpapers';
1136              
1137             Notice the C line at the top. That line is
1138             absolutely critical for this Fetchwarefile to work properly, because it is what
1139             allows fetchware to use Perl's own syntax as a nice easy to use syntax for
1140             Fetchwarefiles. If you do not use the matching C line,
1141             then fetchware will spit out crazy errors from Perl's own compiler listing all
1142             of the syntax errors you have. If you ever receive that error, just ensure you
1143             have the correct C line at the top of your
1144             Fetchwarefile.
1145              
1146             =item B<2. Determine your html_page_url>
1147              
1148             At the heart of App::FetchwareX::HTMLPageSync is its C, which is
1149             the URL to the HTML page you want HTMLPageSync to download and parse out links
1150             to wallpaper or whatever else you'd like to automate downloading. To figure this
1151             out just use your browser to find the HTML page you want to use, and then copy
1152             and paste the url between the single quotes C<'> as shown in the example below.
1153              
1154             html_page_url '';
1155              
1156             And then after you copy the url.
1157              
1158             html_page_url 'http://some.url/something.html';
1159              
1160             =item B<3. Determine your destination_directory>
1161              
1162             HTMLPageSync also needs to know your C. This is the
1163             directory that HTMLPageSync will copy your downloaded files to. This directory
1164             will also be deleted when you uninstall this HTMLPageSync fetchware package just
1165             like a standard App::Fetchware package would uninstall any installed software
1166             when it is uninstalled. Just copy and paste the example below, and fill in the
1167             space between the single quotes C<'>.
1168              
1169             destination_directory '';
1170              
1171             After pasting it should look like.
1172              
1173             destination_directory '~/wallpapers';
1174              
1175             Furthermore, if you want to keep your C after you
1176             uninstall your HTMLPageSync fetchware package, just set the
1177             C configuration option to true:
1178              
1179             keep_destination_directory 'True';
1180              
1181             If this is set in your HTMLPageSync Fetchwarefile, HTMLPageSync will not delete
1182             your C when your HTMLPageSync fetchware package is
1183             uninstalled.
1184              
1185             =item B<4. Specify other options>
1186              
1187             That's all there is to it unless you need to further customize HTMLPageSync's
1188             behavior to get just the links you need to download.
1189              
1190             At this point you can install your new Fetchwarefile with:
1191              
1192             fetchware install [path to your new fetchwarefile]
1193              
1194             Or you can further customize it as shown next.
1195              
1196             =item B<5. Specify an optional user_agent>
1197              
1198             Many sites don't like bots downloading stuff from them wasting their bandwidth,
1199             and will even limit what you can do based on your user agent, which is the HTTP
1200             standard's name for your browser. This option allows you to pretend to be
1201             something other than HTMLPageSync's underlying library, L<HTTP::Tiny>. Just copy
1202             and paste the example below, and paste what you want your user agent to be between
1203             the single quotes C<'> as before.
1204              
1205             user_agent '';
1206              
1207             And after pasting.
1208              
1209             user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1';
1210              
1211             =item B<6. Specify an optional html_treebuilder_callback>
1212              
1213             C specifies an optional anonymous Perl subroutine
1214             reference that will replace the default one that HTMLPageSync uses. The default
1215             one limits the download to only image format links, which is flexible enough for
1216             downloading wallpapers.
1217              
1218             If you want to download something different, then paste the example below in
1219             your Fetchwarefile.
1220              
1221             html_treebuilder_callback sub {
1222             # Get one HTML::Element.
1223             my $h = shift;
1224              
1225             # Return true or false to indicate if this HTML::Element should be a
1226             # download link.
1227             if (something) {
1228             return 'True';
1229             } else {
1230             return undef;
1231             }
1232             };
1233              
1234             And create a Perl anonymous subroutine C that will
1235             be executed instead of the default one. This requires knowledge of the Perl
1236             programming language. The one below limits itself to only pdfs and MS word
1237             documents.
1238              
1239             # Download pdfs and word documents only.
1240             html_treebuilder_callback sub {
1241             my $tag = shift;
1242             my $link = $tag->attr('href');
1243             if (defined $link) {
1244             # If the anchor tag is an image...
1245             if ($link =~ /\.(pdf|doc|docx)$/) {
1246             # ...return true...
1247             return 'True';
1248             } else {
1249             # ...if not return false.
1250             return undef; #false
1251             }
1252             }
1253             };
1254              
1255             =item B<7. Specify an optional download_links_callback>
1256              
1257             C specifies an optional anonymous Perl subroutine
1258             reference that will replace the default one that HTMLPageSync uses. The default
1259             one removes the HTML::Element skin each download link is wrapped in, because of
1260             the use of L<HTML::TreeBuilder>. This simply strips off the object-oriented crap
1261             it's wrapped in, and turns it into a simple string scalar.
1262              
1263             If you want to post process the download link in some other way, then just copy
1264             and paste the code below into your Fetchwarefile, and add whatever other Perl
1265             code you may need. This requires knowledge of the Perl programming language.
1266              
1267             download_links_callback sub {
1268             my @download_urls = @_;
1269              
1270             my @wanted_download_urls;
1271             for my $link (@download_urls) {
1272             # Pick ones to keep.
1273             push @wanted_download_urls, $link;
1274             }
1275              
1276             return @wanted_download_urls;
1277             };
1278              
1279             =back
1280              
1281             =head1 USING YOUR App::FetchwareX::HTMLPageSync FETCHWAREFILE WITH FETCHWARE
1282              
1283             After you have
1284             L
1285             as shown above you need to actually use the fetchware command line program to
1286             install, upgrade, and uninstall your App::FetchwareX::HTMLPageSync Fetchwarefile.
1287              
1288             Take note how fetchware's package management metaphor does not quite line up
1289             with what App::FetchwareX::HTMLPageSync does. Why would a HTML page mirroring
1290             script be installed, upgraded, or uninstalled? Well HTMLPageSync simply adapts
1291             fetchware's package management metaphor to its own environment performing the
1292             likely action for when one of fetchware's behaviors are executed.
1293              
1294             =over
1295              
1296             =item B
1297              
1298             A C will cause HTMLPageSync to ask the user a bunch of questions,
1299             and help them create a new HTMLPageSync Fetchwarefile.
1300              
1301             =item B
1302              
1303             A C while using a HTMLPageSync Fetchwarefile causes fetchware
1304             to download your C, parse it, download any matching links, and
1305             then copy them to your C as you specify in your
1306             Fetchwarefile.
1307              
1308             =item B
1309              
1310             A C will redownload the C, parse it, and
1311             compare the corresponding list of files to the list of files already downloaded,
1312             and if any new files have been added, then they will be downloaded. New versions
1313             of existing files are not supported. No timestamp checking is implemented
1314             currently.
1315              
1316             =item B
1317              
1318             A C will cause fetchware to delete this fetchware package
1319             from its database as well as recursively deleting everything inside your
1320             C as well as that directory itself. So when you uninstall
1321             a HTMLPageSync fetchware package ensure that you really want to, because it will
1322             delete whatever files it downloaded for you in the first place.
1323              
1324             However, if you would like fetchware to preserve your C<destination_directory>,
1325             you can set the boolean C<keep_destination_directory> configuration option to
1326             true, like C<keep_destination_directory 'True';>, to keep HTMLPageSync from
1327             deleting your destination directory.
1328              
1329             =back
1330              
1331             =head1 HOW App::FetchwareX::HTMLPageSync OVERRIDES App::Fetchware
1332              
1333             This section documents how App::FetchwareX::HTMLPageSync overrides
1334             App::Fetchware's API, and is only interesting if you're debugging
1335             App::FetchwareX::HTMLPageSync, or you're writing your own App::Fetchware
1336             extension. If not, you don't need to know these details.
1337              
1338             =head2 App::Fetchware API Subroutines
1339              
1340             =head3 new()
1341              
1342             HTMLPageSync overrides new(), and implements its own Q&A wizard interface
1343             helping users create HTMLPageSync Fetchwarefiles.
1344              
1345             =head3 new_install()
1346              
1347             HTMLPageSync just inherits App::Fetchware's new_install(), which just asks the
1348             user if they would like Fetchware to install the already generated
1349             Fetchwarefile.
1350              
1351             =head3 check_syntax()
1352              
1353             check_syntax() is also overridden to check HTMLPageSync's own Fetchware-level
1354             syntax.
1355              
1356             =head3 start() and end()
1357              
1358             HTMLPageSync just imports start() and end() from App::Fetchware to take
1359             advantage of their ability to manage a temporary directory.
1360              
1361             =head3 lookup()
1362              
1363             lookup() is overridden, and downloads the C<html_page_url>, which is the main
1364             configuration option that HTMLPageSync uses. Then lookup() parses that
1365             C<html_page_url>, and determines what the download urls should be. If the
1366             C<html_treebuilder_callback> and C<download_links_callback> exist, then they are
1367             called to customize lookup()'s default behavior. See their descriptions below.
1368              
1369             =head3 download()
1370              
1371             download() downloads the array ref of download links that lookup() returns.
1372              
1373             =head3 verify()
1374              
1375             verify() is overridden to do nothing.
1376              
1377             =head3 unarchive()
1378              
1379             unarchive() is overridden to do nothing.
1380              
1381             =head3 build()
1382              
1383             build() is overridden to do nothing.
1384              
1385             =head3 install()
1386              
1387             install() takes its argument, which is an arrayref of the paths of the
1388             files that were downloaded to the tempdir created by start(), and copies them to
1389             the user's provided C<destination_directory>.
1390              
1391             =head3 end() and start()
1392              
1393             HTMLPageSync just imports end() and start() from App::Fetchware to take
1394             advantage of their ability to manage a temporary directory.
1395              
1396             =head3 uninstall()
1397              
1398             uninstall() recursively deletes your C<destination_directory> where it stores
1399             whatever links you choose to download unless of course the
1400             C<keep_destination_directory> configuration option is set to true.
1401              
1402             =head3 upgrade()
1403              
1404             Determines if any looked up URLs have not been downloaded yet, and returns true
1405             if that is the case.
1406              
1407             =head2 App::FetchwareX::HTMLPageSync's Configuration Subroutines
1408              
1409             Because HTMLPageSync is an App::Fetchware extension, it cannot just use the same
1410             configuration subroutines that App::Fetchware uses. Instead, it must create its
1411             own configuration subroutines with App::Fetchware::CreateConfigOptions. These
1412             configuration subroutines are the configuration options that you use in your
1413             App::Fetchware or App::Fetchware extension.
1414              
1415             =head3 page_name [MANDATORY]
1416              
1417             HTMLPageSync's equivalent to App::Fetchware's C<program>. It's simply the
1418             name of the page or what you want to download on that page.
1419              
1420             =head3 html_page_url [MANDATORY]
1421              
1422             HTMLPageSync's equivalent to App::Fetchware's C<lookup_url>, and is just as
1423             mandatory. This is the url of the HTML page that will be downloaded and
1424             processed.
1425              
1426             =head3 destination_directory [MANDATORY]
1427              
1428             This option is also mandatory, and it specifies the directory where the files
1429             that you want to download are downloaded to.
1430              
1431             =head3 user_agent [OPTIONAL]
1432              
1433             This option is optional, and it allows you to have HTTP::Tiny pretend to be a
1434             Web browser or perhaps bot if you want to.
1435              
1436             =head3 html_treebuilder_callback [OPTIONAL]
1437              
1438             This optional option allows you to specify a perl C<CODEREF> that lookup() will
1439             execute instead of its default callback that just looks for images.
1440              
1441             It receives one parameter, which is an HTML::Element at the first C<a>,
1442             anchor/link tag.
1443              
1444             It must C<return 'True';> to indicate that that link should be included in the
1445             list of download links, or return false, C<return undef>, to indicate that that
1446             link should not be included in the list of download links.
1447              
1448             =head3 download_links_callback [OPTIONAL]
1449              
1450             This optional option specifies an optional callback that will allow you to do
1451             post processing of the list of downloaded urls. This is needed, because the
1452             results of the C<html_treebuilder_callback> are still HTML::Element objects that
1453             need to be converted to just string download urls. That is what the default
1454             C<download_links_callback> does.
1455              
1456             It receives a list of all of the download HTML::Elements that
1457             C<html_treebuilder_callback> returned true on. It is called only once, and
1458             should return a list of string download links for download later by HTTP::Tiny
1459             in download().
1460              
1461             =head3 keep_destination_directory [OPTIONAL]
1462              
1463             This optional option is a boolean true or false configuration option that
1464             when true prevents HTMLPageSync from deleting your destination_directory when
1465             you run fetchware uninstall.
1466              
1467             Its default is false, so by default HTMLPageSync B<will> delete your files from
1468             your C<destination_directory> unless you set this to true.
1469              
1470             =head1 ERRORS
1471              
1472             As with the rest of App::Fetchware, App::FetchwareX::HTMLPageSync does not return any
1473             error codes; instead, all errors are die()'d if it's App::FetchwareX::HTMLPageSync's
1474             error, or croak()'d if it's the caller's fault. These exceptions are simple
1475             strings, and are listed in the L</DIAGNOSTICS> section below.
1476              
1477             =head1 CAVEATS
1478              
1479             Certain features of App::FetchwareX::HTMLPageSync require knowledge of the Perl
1480             programming language in order for you to make use of them. However, this is
1481             limited to optional callbacks that are not needed for most uses. These features
1482             are the C<html_treebuilder_callback> and C<download_links_callback> callbacks.
1483              
1484             =head1 AUTHOR
1485              
1486             David Yingling
1487              
1488             =head1 COPYRIGHT AND LICENSE
1489              
1490             This software is copyright (c) 2016 by David Yingling.
1491              
1492             This is free software; you can redistribute it and/or modify it under
1493             the same terms as the Perl 5 programming language system itself.
1494              
1495             =cut
1496              
1497             __END__