line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::FetchwareX::HTMLPageSync; |
2
|
|
|
|
|
|
|
our $VERSION = '1.016'; # VERSION: generated by DZP::OurPkgVersion |
3
|
|
|
|
|
|
|
# ABSTRACT: An App::Fetchware extension that downloads files based on an HTML page. |
4
|
1
|
|
|
1
|
|
7703
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
23
|
|
5
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
19
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Enable Perl 6 knockoffs, and use 5.10.1, because smartmatching and other |
8
|
|
|
|
|
|
|
# things in 5.10 were changed in 5.10.1+. |
9
|
1
|
|
|
1
|
|
17
|
use 5.010001; |
|
1
|
|
|
|
|
3
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Use fetchware's API's to help us out. |
12
|
1
|
|
|
1
|
|
3
|
use App::Fetchware::Util ':UTIL'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
162
|
|
13
|
1
|
|
|
1
|
|
5
|
use App::Fetchware::Config ':CONFIG'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
90
|
|
14
|
1
|
|
|
1
|
|
411
|
use App::Fetchware::Fetchwarefile; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
15
|
1
|
|
|
|
|
145
|
use App::Fetchware qw( |
16
|
|
|
|
|
|
|
:OVERRIDE_NEW |
17
|
|
|
|
|
|
|
:OVERRIDE_NEW_INSTALL |
18
|
|
|
|
|
|
|
:OVERRIDE_CHECK_SYNTAX |
19
|
1
|
|
|
1
|
|
5
|
); |
|
1
|
|
|
|
|
0
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Local imports. |
22
|
1
|
|
|
1
|
|
4
|
use File::Copy 'cp'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
23
|
1
|
|
|
1
|
|
4
|
use File::Path 'remove_tree'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
40
|
|
24
|
1
|
|
|
1
|
|
3
|
use URI::Split 'uri_split'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
41
|
|
25
|
1
|
|
|
1
|
|
4
|
use File::Spec 'splitpath'; |
|
1
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
14
|
|
26
|
1
|
|
|
1
|
|
3
|
use Data::Dumper; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
32
|
|
27
|
1
|
|
|
1
|
|
4
|
use Scalar::Util 'blessed'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
45
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# Use App::Fetchware::ExportAPI to specify which App::Fetchware API subroutines |
30
|
|
|
|
|
|
|
# we are going to "KEEP", import from App::Fetchware, and which API subs we are |
31
|
|
|
|
|
|
|
# going to "OVERRIDE", that is, implement here in this package. |
32
|
|
|
|
|
|
|
# |
33
|
|
|
|
|
|
|
# ExportAPI takes care of the grunt work for us by setting our packages @EXPORT |
34
|
|
|
|
|
|
|
# appropriately, and even importing Exporter's import() method into our package |
35
|
|
|
|
|
|
|
# for us, so that our App::Fetchware API subroutines and configuration options |
36
|
|
|
|
|
|
|
# specified below can be import()ed properly. |
37
|
|
|
|
|
|
|
use App::Fetchware::ExportAPI |
38
|
|
|
|
|
|
|
# KEEP or "inherit" new_install, because I want my new_install to just call |
39
|
|
|
|
|
|
|
# ask_to_install_now_to_test_fetchwarefile(), and App::Fetchware's does that |
40
|
|
|
|
|
|
|
# already for me. And start() and end() are to create and manage the |
41
|
|
|
|
|
|
|
# temporary directory for me, so I don't have to worry about polluting the |
42
|
|
|
|
|
|
|
# current working directory with temporary files. |
43
|
1
|
|
|
|
|
7
|
KEEP => [qw(new_install start end)], |
44
|
|
|
|
|
|
|
# OVERRIDE everything else. |
45
|
|
|
|
|
|
|
OVERRIDE => |
46
|
|
|
|
|
|
|
[qw(new check_syntax lookup download verify unarchive build install |
47
|
|
|
|
|
|
|
uninstall upgrade)] |
48
|
1
|
|
|
1
|
|
403
|
; |
|
1
|
|
|
|
|
2
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Use App::Fetchware::CreateConfigOptions to build our App::Fetchware |
52
|
|
|
|
|
|
|
# configuration options for us. These are subroutines with correct prototypes to |
53
|
|
|
|
|
|
|
# turn a perl code file into something that resembles a configuration file. |
54
|
|
|
|
|
|
|
use App::Fetchware::CreateConfigOptions |
55
|
1
|
|
|
|
|
6
|
ONE => [qw( |
56
|
|
|
|
|
|
|
page_name |
57
|
|
|
|
|
|
|
html_page_url |
58
|
|
|
|
|
|
|
destination_directory |
59
|
|
|
|
|
|
|
user_agent |
60
|
|
|
|
|
|
|
html_treebuilder_callback |
61
|
|
|
|
|
|
|
download_links_callback |
62
|
|
|
|
|
|
|
)], |
63
|
|
|
|
|
|
|
BOOLEAN => [qw(keep_destination_directory)] |
64
|
1
|
|
|
1
|
|
5
|
; |
|
1
|
|
|
|
|
2
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
67
|
1
|
|
|
1
|
|
4
|
use Exporter 'import'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1719
|
|
68
|
|
|
|
|
|
|
# Subroutines that are exported only on request. The :TESTING tag groups the
# helpers (and the new()/new_install() API subroutines) so they can be imported
# together.
our %EXPORT_TAGS = (
    TESTING => [
        'get_html_page_url',
        'get_destination_directory',
        'ask_about_keep_destination_directory',
        'new',
        'new_install',
    ],
);

# @EXPORT_OK is simply the flattened union of every tag's member list.
our @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# new($term, $page_name)
#
# Implements HTMLPageSync's "fetchware new" command: interactively builds a
# Fetchwarefile for a new HTMLPageSync package.
#
# Parameters:
#   $term      - terminal object that is handed on to every prompting helper
#                (get_html_page_url(), prompt_for_other_options(), ...).
#   $page_name - name for the page; passed through fetchwarefile_name() before
#                being used.
#
# Returns the list ($page_name, $fetchwarefile), where $fetchwarefile is the
# Fetchwarefile as a string: either generated from the Fetchwarefile object, or
# whatever edit_manually() handed back when that is not a Fetchwarefile object.
#
# NOTE(review): the heredoc operators and the C<...> names in the callback
# descriptions were reconstructed from a rendering that had stripped them;
# confirm them against the canonical module source.
sub new {
    my ($term, $page_name) = @_;

    # Instantiate a new Fetchwarefile object for managing and generating a
    # Fetchwarefile, which we'll write to a file for the user or use to
    # build an associated Fetchware package.
    my $now = localtime;
    my $fetchwarefile = App::Fetchware::Fetchwarefile->new(
        header => <<EOF,
use App::FetchwareX::HTMLPageSync;
# Auto generated $now by HTMLPageSync's fetchware new command.
# However, feel free to edit this file if HTMLPageSync's new command's
# autoconfiguration is not enough.
#
# Please look up HTMLPageSync's documentation of its configuration file syntax at
# perldoc App::FetchwareX::HTMLPageSync, and only if its configuration file
# syntax is not malleable enough for your application should you resort to
# customizing fetchware's behavior. For extra flexible customization see perldoc
# App::FetchwareX::HTMLPageSync.
EOF
        descriptions => {

            page_name => <<EOA,
page_name simply names the HTML page the Fetchwarefile is responsible for
downloading, analyzing via optional callbacks, and copying to your
destination_directory.
EOA
            html_page_url => <<EOA,
html_page_url is HTMLPageSync's lookup_url equivalent. It specifies a HTTP url
that returns a page of HTML that can be easily parsed of links to later
download.
EOA
            destination_directory => <<EOA,
destination_directory is the directory on your computer where you want the files
that you configure HTMLPageSync to parse to be copied to.
EOA
            user_agent => <<EOA,
user_agent, if specified, will be passed to HTTP::Tiny, the Perl HTTP library
Fetchware uses, where the library will lie to the Web server you are Web
scraping from to hopefully prevent the Web sever from banning you, or updating
the page you want to scrap to use too much Javascript, which would prevent the
simple parser HTMLPageSync uses from working on the specified html_page_url.
EOA
            html_treebuilder_callback => <<EOA,
html_treebuilder_callback allows you to specify a perl CODEREF that HTMLPageSync
will execute instead of its default callback that just looks for images.

It receives one parameter, which is an HTML::Element at the first C<a>,
anchor/link tag.

It must [return 'True';] to indicate that that link should be included in the
list of download links, or return false, [return undef], to indicate that that
link should not be included in the list of download links.
EOA
            download_links_callback => <<EOA,
download_links_callback specifies an optional callback that will allow you to do
post processing of the list of downloaded urls. This is needed, because the
results of the html_treebuilder_callback are still HTML::Element objects that
need to be converted to just string download urls. That is what the default
C<download_links_callback> does.

It receives a list of all of the download HTML::Elements that
C<html_treebuilder_callback> returned true on. It is called only once, and
should return a list of string download links for download later by
HTMLPageSync.
EOA
            keep_destination_directory => <<EOA,
keep_destination_directory is a boolean true or false configuration option that
when true prevents HTMLPageSync from deleting your destination_directory when
you run fetchware uninstall.
EOA
        }
    );

    extension_name(__PACKAGE__);

    opening_message(<<EOM);
HTMLPageSync's new command is not as sophistocated as Fetchware's. Unless you
only want to download images, you will have to get your hands dirty, and code up
some custom Perl callbacks to customize HTMLPageSync's behavior. However, it
will ask you quite nicely the basic options, so if those are all you need, then
this command will successfully generate a HTMLPageSync Fetchwarefile for you.

After it lets you choose the easy options of page_name, html_page_url,
and destination_directory, it will give you an opportunity to modify the
user_agent string HTMLPageSync uses to avoid betting banned or having your
scraping stick out like a sore thumb in the target Web server's logs. Then,
you'll be asked about the advanced options. If you want them it will add generic
ones to the Fetchwarefile that you can then fill in later on when HTMLPageSync
asks you if you want to edit the generated Fetchwarefile manually. Finally,
after your Fetchwarefile is generated HTMLPageSync will ask you if you would
like to install your generated Fetchwarefile to test it out.
EOM

    # Ask the user for the basic configuration options.
    $page_name = fetchwarefile_name(page_name => $page_name);
    vmsg "Determined your page_name option to be [$page_name]";

    $fetchwarefile->config_options(page_name => $page_name);
    vmsg "Appended page_name [$page_name] configuration option to Fetchwarefile";

    my $html_page_url = get_html_page_url($term);
    vmsg "Asked user for html_page_url [$html_page_url] from user.";

    $fetchwarefile->config_options(html_page_url => $html_page_url);
    vmsg "Appended html_page_url [$html_page_url] configuration option to Fetchwarefile";

    my $destination_directory = get_destination_directory($term);
    vmsg "Asked user for destination_directory [$destination_directory] from user.";

    $fetchwarefile->config_options(destination_directory => $destination_directory);
    vmsg <<EOM;
Appended destination_directory [$destination_directory] configuration option to
your Fetchwarefile.
EOM

    # Asks and sets the keep_destination_directory configuration option if the
    # user wants to set it.
    ask_about_keep_destination_directory($term, $fetchwarefile);

    vmsg 'Prompting for other options that may be needed.';
    my $other_options_hashref = prompt_for_other_options($term,
        user_agent => {
            prompt => <<EOP,
What user_agent configuration option would you like?
EOP
            print_me => <<EOP,
user_agent, if specified, will be passed to HTTP::Tiny, the Perl HTTP library
Fetchware uses, where the library will lie to the Web server you are Web
scraping from to hopefully prevent the Web sever from banning you, or updating
the page you want to scrap to use too much Javascript, which would prevent the
simple parser HTMLPageSync uses from working on the specified html_page_url.
EOP
        },
        html_treebuilder_callback => {
            prompt => <<EOP,
What html_treebuilder_callback configuration option would you like?
EOP
            print_me => <<EOP,
html_treebuilder_callback allows you to specify a perl CODEREF that HTMLPageSync
will execute instead of its default callback that just looks for images.

It receives one parameter, which is an HTML::Element at the first C<a>,
anchor/link tag.

It must [return 'True';] to indicate that that link should be included in the
list of download links, or return false, [return undef], to indicate that that
link should not be included in the list of download links.

Because Term::UI's imput is limited to just one line, please just press enter,
and a dummy value will go into your Fetchwarefile, where you can then replace
that dummy value with a proper Perl callback next, when Fetchware gives you the
option to edit your Fetchwarefile manually.
EOP
            default => 'sub { my $h = shift; die "Dummy placeholder fill me in."; }',
        },
        download_links_callback => {
            prompt => <<EOP,
What download_links_callback configuration option would you like?
EOP
            print_me => <<EOP,
download_links_callback specifies an optional callback that will allow you to do
post processing of the list of downloaded urls. This is needed, because the
results of the html_treebuilder_callback are still HTML::Element objects that
need to be converted to just string download urls. That is what the default
C<download_links_callback> does.

It receives a list of all of the download HTML::Elements that
C<html_treebuilder_callback> returned true on. It is called only once, and
should return a list of string download links for download later by
HTMLPageSync.

Because Term::UI's imput is limited to just one line, please just press enter,
and a dummy value will go into your Fetchwarefile, where you can then replace
that dummy value with a proper Perl callback next, when Fetchware gives you the
option to edit your Fetchwarefile manually.
EOP
            default => 'sub { my @download_urls = @_; die "Dummy placeholder fill me in."; }',
        },
    );
    vmsg 'User entered the following options.';
    vmsg Dumper($other_options_hashref);

    # Append all other options to the Fetchwarefile.
    $fetchwarefile->config_options(%$other_options_hashref);
    vmsg 'Appended all other options listed above to Fetchwarefile.';

    my $edited_fetchwarefile = edit_manually($term, $fetchwarefile);
    vmsg <<EOM;
Asked user if they would like to edit their generated Fetchwarefile manually.
EOM

    # Generate Fetchwarefile.
    # If edit_manually() did not modify the Fetchwarefile (it handed back a
    # Fetchwarefile object), then generate it.
    if (blessed($edited_fetchwarefile)
        and
        $edited_fetchwarefile->isa('App::Fetchware::Fetchwarefile')) {
        $fetchwarefile = $fetchwarefile->generate();
    # If edit_manually() modified the Fetchwarefile, then do not generate it,
    # and replace the Fetchwarefile object with the new string that represents
    # the user's edited Fetchwarefile.
    } else {
        $fetchwarefile = $edited_fetchwarefile;
    }

    # Whatever variables the new() API subroutine returns are written via a pipe
    # back to the parent, and then the parent reads the variables back, and
    # makes them available to new_install(), back in the parent, as arguments.
    return $page_name, $fetchwarefile;
}
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
# get_html_page_url($term)
#
# Asks the user, through $term->get_reply(), for the html_page_url
# configuration option and returns the reply.
#
# The reply must start with one of the schemes ftp, http, https or file
# followed by "://". The check is anchored at the start of the string so text
# that merely contains a URL somewhere is not accepted, and https is allowed
# alongside http.
sub get_html_page_url {
    my $term = shift;

    # Prompt for html_page_url.
    my $html_page_url = $term->get_reply(
        print_me => <<EOP,
Fetchware's heart and soul is its html_page_url. This is the configuration option
that tells fetchware where to check if any new links have been added to the
specified Web page that match your criteria for download.

How to determine your application's html_page_url:
    1. Simply specify the URL that of the Web page that has the images that you
    would like to have Fetchware download for you.
EOP
        prompt => q{What is your Web page's html_page_url? },
        allow => qr!\A(?:ftp|https?|file)://!);

    return $html_page_url;
}
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# get_destination_directory($term)
#
# Asks the user, through $term->get_reply(), for the destination_directory
# configuration option and returns the reply unchanged. No validation is
# applied to the answer here.
sub get_destination_directory {
    my $term = shift;

    # Prompt for destination_directory.
    my $destination_directory = $term->get_reply(
        print_me => <<EOP,
destination_directory is the directory on your computer where you want the files
that you configure HTMLPageSync to parse to be copied to.
EOP
        prompt => q{What is your destination_directory? });

    return $destination_directory;
}
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# ask_about_keep_destination_directory($term, $fetchwarefile)
#
# Asks the user, through $term->ask_yn(), whether deleting the
# destination_directory on uninstall is OK (default: yes). When the user
# answers no, [keep_destination_directory 'True';] is added to $fetchwarefile
# via its config_options() method; otherwise the Fetchwarefile is left alone.
#
# The question is phrased as "is deleting OK?", so the option must be added on
# a *negative* answer, exactly as the printed instructions tell the user
# ("answer N below" to keep the directory).
sub ask_about_keep_destination_directory {
    my ($term, $fetchwarefile) = @_;

    my $deleting_is_ok = $term->ask_yn(
        print_me => <<EOP,
By default, HTMLPageSync deletes your destination_directory when you uninstall
that destination_directory's assocated Fetchware package or Fetchwarefile. This
is done, because your deleting the Fetchware package, so it makes sense to
delete that package's associated data.

If you wish to keep your destination_directory after you uninstall this
HTMLPageSync Fetchware package, then answer N below.
EOP
        prompt => 'Is deleting your destination_directory on uninstall OK? ',
        default => 'y',
    );

    # Only a "no" means the user wants the directory kept.
    unless ($deleting_is_ok) {
        vmsg <<EOM;
User wants [keep_destination_directory 'True';] added to their Fetchwarefile.
EOM

        $fetchwarefile->config_options(keep_destination_directory => 'True');
        vmsg <<EOM;
Appended [keep_destination_directory 'True';] to user's Fetchwarefile.
EOM
    }
}
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# check_syntax()
#
# Verifies that the already parsed Fetchwarefile specifies the three options
# HTMLPageSync cannot work without: page_name, html_page_url and
# destination_directory. The checking itself is delegated to
# check_config_options(), whose return value is returned unchanged; each
# Mandatory entry pairs an option name with the error message to use when that
# option is missing.
sub check_syntax {

    # Use check_config_options() to run config() a bunch of times to check the
    # already parsed Fetchwarefile.
    return check_config_options(
        Mandatory => [ 'page_name', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a page_name configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'html_page_url', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a html_page_url configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'destination_directory', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a destination_directory
configuration option. Please add one, and try again.
EOM
    );
}
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
###BUGALERT### lookup() returns all files each time it is run; therefore, it |
392
|
|
|
|
|
|
|
#breaks the way Fetchware is supposed to work! lookup() is supposed to return |
393
|
|
|
|
|
|
|
#"the latest version." And in HTMLPageSync's case, it should not include files |
394
|
|
|
|
|
|
|
#already downloaded, because it should only return "new files" by comparing the |
395
|
|
|
|
|
|
|
#"available list of files" to the "already downloaded one." |
396
|
|
|
|
|
|
|
# lookup()
#
# Downloads the page named by the html_page_url configuration option, parses
# it with HTML::TreeBuilder, and works out which links should be downloaded.
#
# Two optional configuration callbacks customize the process:
#   html_treebuilder_callback - passed to look_down() together with
#       _tag => 'a'; decides, per anchor element, whether the link is wanted.
#       The built-in default accepts links whose href ends in an image
#       extension.
#   download_links_callback   - receives the selected HTML::Element objects
#       and returns the final list of url strings. The built-in default
#       returns each element's href attribute.
#
# Links that do not start with a URL scheme are treated as relative and get
# html_page_url plus '/' prepended.
#
# Returns an array reference of download urls. An array reference is used,
# because lookup()'s API only allows a single return value, and that value is
# passed unchanged to download().
sub lookup {
    my $html_page_url = config('html_page_url');

    msg
    "Looking up download urls using html_page_url [$html_page_url]";

    # Download the url the user specified. The user agent is passed under the
    # key 'agent', the same key download() uses, so both requests identify
    # themselves the same way.
    my $user_agent = config('user_agent');
    my @http_options = defined $user_agent ? (agent => $user_agent) : ();
    my $filename = download_http_url($html_page_url, @http_options);
    vmsg "Downloaded html_page_url to local file [$filename].";

    # This file never loads HTML::TreeBuilder itself, so make sure it is
    # available instead of relying on another module having loaded it.
    require HTML::TreeBuilder;

    # Create a HTML::TreeBuilder object for the now downloaded file.
    my $tree = HTML::TreeBuilder->new();
    # Parse $filename into a HTML::Element tree.
    $tree->parse_file($filename);
    vmsg 'Created HTML::TreeBuilder object to parse downloaded html file.';

    my $tree_callback = config('html_treebuilder_callback');
    if ($tree_callback) {
        vmsg <<EOM;
Using user supplied html_treebuilder_callback to parse downloaded HTML file:
[
$tree_callback
]
EOM
    } else {
        vmsg <<EOM;
Using built-in default html_treebuilder_callback that only wants images.
EOM
        $tree_callback = sub {
            my $tag = shift;
            my $link = $tag->attr('href');

            # Anchors without an href can never be download links.
            return unless defined $link;

            # If the anchor tag is an image return true, if not return false.
            return 'True' if $link =~ /\.(jpg|jpeg|png|bmp|tiff?|gif)$/;
            return;
        };
    }

    # Find the links that match our default callback or the user specified one
    # if the user specified one.
    my @download_urls = $tree->look_down(
        _tag => 'a',
        $tree_callback
    );
    vmsg <<EOM;
Determined download urls to be:
@download_urls
EOM

    # Sort through the list of HTML::Element tags to finalize the list to
    # download.
    my $links_callback = config('download_links_callback');
    if ($links_callback) {
        vmsg <<EOM;
Determined download_links_callback to be user specified:
[
$links_callback
]
EOM
    } else {
        # Strip off the HTML::Element wrapping by default.
        $links_callback = sub {
            vmsg <<EOM;
Using built-in default download_links_callback that turns HTML::Elements into
download urls.
EOM
            # scalar() keeps exactly one entry per element.
            return map { scalar $_->attr('href') } @_;
        };
    }

    # Call download_links_callback or call default one to turn the
    # HTML::Element objects into plain url strings.
    @download_urls = $links_callback->(@download_urls);
    vmsg <<EOM;
Determined download urls to be:
[
@{[@download_urls]}
]
EOM

    # The download_urls may be relative links instead of absolute links.
    # Relative ones could just be filenames without any knowledge of what the
    # actual server or path or even scheme is. Fix this by prepending
    # html_page_url to each link if there is no scheme. Any scheme counts
    # (https included), so absolute links are never mangled into
    # "html_page_url/https://...".
    for my $download_url (@download_urls) {
        if ($download_url !~ m!\A[A-Za-z][A-Za-z0-9+.\-]*://!) {
            $download_url = $html_page_url . '/' . $download_url;
        }
    }

    # Return a ref to the array of download urls; see the header comment.
    return \@download_urls;
}
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
# download($temp_dir, $download_url)
#
# Downloads every url in the array reference $download_url (the value that
# lookup() returned) with download_http_url(). When the user_agent
# configuration option is defined it is passed along as agent => ... .
#
# Parameters:
#   $temp_dir     - accepted as part of the API, but not used here.
#   $download_url - array reference of urls to download.
#
# Returns an array reference holding whatever download_http_url() returned
# for each url, in order (AKA $package_path).
sub download {
    my ($temp_dir, $download_url) = @_;

    msg 'Downloading the download urls lookup() determined.';

    # The user agent does not change between urls, so look it up only once.
    my $user_agent = config('user_agent');

    my @download_file_paths;
    # Loop over @$download_url to download all user specified URLs.
    for my $url (@$download_url) {
        my $downloaded_file;

        # Use user specified agent if they asked for it.
        if (defined $user_agent) {
            vmsg <<EOM;
Downloadig url
[$url]
using the user specified user_agent
[$user_agent]
EOM
            $downloaded_file = download_http_url($url, agent => $user_agent);
        } else {
            vmsg "Downloading url [$url].";
            $downloaded_file = download_http_url($url);
        }

        push @download_file_paths, $downloaded_file;
    }

    local $" = "\n"; # print each @download_file_paths on its own line.
    vmsg <<EOM;
Downloaded specified urls to the following paths:
[
@{[@download_file_paths]}
]
EOM

    # AKA $package_path.
    return \@download_file_paths;
}
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# verify()
#
# HTMLPageSync has nothing to verify, so this API subroutine only logs that
# fact and returns whatever do_nothing() returns.
sub verify {
    vmsg <<EOM;
Skipping verify subroutine, because HTMLPageSync does not need to verify anything
EOM
    return do_nothing();
}
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
# unarchive()
#
# HTMLPageSync has nothing to unarchive, so this API subroutine only logs that
# fact and returns whatever do_nothing() returns.
sub unarchive {
    vmsg <<EOM;
Skipping unarchive subroutine, because HTMLPageSync does not need to unarchive
anything
EOM
    return do_nothing();
}
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# build()
#
# HTMLPageSync has nothing to build, so this API subroutine only logs that
# fact and returns whatever do_nothing() returns.
sub build {
    vmsg <<EOM;
Skipping build subroutine, because HTMLPageSync does not need to build anything
EOM
    return do_nothing();
}
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
# install($download_file_paths)
#
# Copies every file listed in the array reference $download_file_paths (the
# value download() returned, AKA $package_path) into the directory named by
# the destination_directory configuration option.
#
# The destination directory is created first when it does not exist yet.
# Without that, File::Copy's cp() would treat the missing path as a file name
# and every download would overwrite the same plain file.
#
# Returns a true string on success; dies when a file cannot be copied.
sub install {
    # AKA $package_path.
    my $download_file_paths = shift;

    msg <<EOM;
Copying files downloaded to a local temp directory to final destination directory.
EOM

    my $destination_directory = config('destination_directory');

    unless (-d $destination_directory) {
        vmsg "Creating destination directory [$destination_directory].";
        # Called fully qualified, because only remove_tree() is imported from
        # File::Path by this file.
        File::Path::make_path($destination_directory);
    }

    # Copy over the files that have been returned by download().
    for my $file_path (@$download_file_paths) {
        vmsg <<EOM;
Copying [$file_path] -> [$destination_directory].
EOM
        ###BUGALERT### Should this die and all the rest be croaks instead???
        cp($file_path, $destination_directory) or die <<EOD;
App-FetchwareX-HTMLPageSync: run-time error. Fetchware failed to copy the file [$file_path] to the
destination directory [$destination_directory].
The OS error was [$!].
EOD
    }

    vmsg 'Successfully copied files to destination directory.';

    return 'True indicating success!';
}
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
# uninstall($build_path)
#
# "Uninstalls" an HTMLPageSync package. When the keep_destination_directory
# configuration option is true, only a message is printed. Otherwise the
# working directory is changed to $build_path and the directory named by
# config('destination_directory') is removed with File::Path's remove_tree().
# Dies if the chdir fails or if no destination_directory is configured.
# Returns a true string on success.
sub uninstall {
    my ($build_path) = @_;

    # keep_destination_directory was set, so don't delete destination directory.
    if (config('keep_destination_directory')) {
        msg "Uninstalling this HTMLPageSync package but keeping your destination directory.\n";

        return 'True for success.';
    }

    # Only reached when keep_destination_directory is false.
    msg "Uninstalling this HTMLPageSync package by deleting your destination directory.\n";

    ###BUGALERT### Before release go though all of Fetchware's API, and subifiy
    #each main component like lookup and download were, the later ones were not
    #done this way. That way I can put say chdir_to_build_path() here instead of
    #basicaly copying and pasting the code like I do below. Also
    #chdir_to_build_path() can be put in :OVERRIDE_UNINSTALL!!! Which I can use
    #here.
    unless (chdir $build_path) {
        die "App-FetchwareX-HTMLPageSync: Failed to uninstall the specified package and specifically to change\n"
          . "working directory to [$build_path] before running make uninstall or the\n"
          . "uninstall_commands provided in the package's Fetchwarefile. Os error [$!].\n";
    }

    # Without a destination_directory there is nothing that could be deleted.
    unless (defined config('destination_directory')) {
        die "App-FetchwareX-HTMLPageSync: Failed to uninstall the specified App::FetchwareX::HTMLPageSync\n"
          . "package, because no destination_directory is specified in its Fetchwarefile.\n"
          . "This configuration option is required and must be specified.\n";
    }

    # Use File::Path's remove_tree() to delete the destination_directory
    # thereby "uninstalling" this package. Will throw an exception that I'll
    # let the main eval in bin/fetchware catch, print, and exit 1.
    vmsg "Deleting entire destination directory [@{[config('destination_directory')]}].\n";
    remove_tree(config('destination_directory'));

    return 'True for success.';
}
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
# upgrade($download_path)
#
# Decides whether there is anything new to download. $download_path is an
# array reference of URLs. The file name (last path segment) of each URL is
# compared against the names of the entries currently found in the directory
# given by config('destination_directory').
#
# When at least one URL has no matching entry, the array that $download_path
# refers to is narrowed IN PLACE to just those new URLs (original order kept,
# duplicates dropped) and the true string 'New URLs Found.' is returned.
# Otherwise the array is left untouched and an empty return (false) is given.
#
# NOTE(review): file names are compared verbatim; this assumes download()
# stores each file under the unescaped last path segment of its URL -- confirm.
sub upgrade {
    my $download_path = shift; # $fetchware_package_path is not used in HTMLPageSync.

    # Get the listing of already downloaded file names. readdir() is used
    # instead of glob(), because glob() on the bare directory name returns the
    # directory itself rather than its contents. A missing or unreadable
    # directory simply means that nothing has been downloaded yet.
    my %is_installed;
    my $destination_directory = config('destination_directory');
    if (defined $destination_directory
        and opendir(my $dir_handle, $destination_directory)) {
        for my $entry (readdir $dir_handle) {
            next if $entry eq '.' or $entry eq '..';
            $is_installed{$entry} = 1;
        }
        closedir $dir_handle;
    }

    # Determine what files are in @$download_path, but not in
    # destination_directory. Only file names are compared, so that URL crap
    # or differing full paths won't screw up the comparisons.
    my @new_urls_to_download;
    my %already_listed;
    for my $url (@$download_path) {
        # Skip URLs that were listed more than once.
        next if $already_listed{$url}++;

        # uri_split() returns (scheme, authority, path, query, fragment).
        my $url_path = ( uri_split($url) )[2];
        my ($filename) = defined $url_path ? $url_path =~ m{([^/]+)\z} : ();

        # A URL without a usable file name cannot match an installed file, so
        # it is treated as new.
        next if defined $filename and $is_installed{$filename};

        push @new_urls_to_download, $url;
    }

    # Nothing new was found, so there is nothing to upgrade.
    return unless @new_urls_to_download;

    # Alter the caller's list to only hold @new_urls_to_download. That way
    # download() only downloads the new URLs not the already downloaded ones
    # again. Assigning through the reference is what makes the change visible
    # to the caller; rebinding the local $download_path would not.
    @$download_path = @new_urls_to_download;

    return 'New URLs Found.';
}
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
1; |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
=pod |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=head1 NAME |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync - An App::Fetchware extension that downloads files based on an HTML page. |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
=head1 VERSION |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
version 1.016 |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=head1 SYNOPSIS |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
=head2 Example App::FetchwareX::HTMLPageSync Fetchwarefile. |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
page_name 'Cool Wallpapers'; |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
html_page_url 'http://some-html-page-with-cool.urls'; |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
destination_directory 'wallpapers'; |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# pretend to be firefox |
728
|
|
|
|
|
|
|
user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'; |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# Customize the callbacks. |
731
|
|
|
|
|
|
|
html_treebuilder_callback sub { |
732
|
|
|
|
|
|
|
# Get one HTML::Element. |
733
|
|
|
|
|
|
|
my $h = shift; |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
# Return true or false to indicate if this HTML::Element should be a |
736
|
|
|
|
|
|
|
# download link. |
737
|
|
|
|
|
|
|
if (something) { |
738
|
|
|
|
|
|
|
return 'True'; |
739
|
|
|
|
|
|
|
} else { |
740
|
|
|
|
|
|
|
return undef; |
741
|
|
|
|
|
|
|
} |
742
|
|
|
|
|
|
|
}; |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
download_links_callback sub { |
745
|
|
|
|
|
|
|
my @download_urls = @_; |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
my @wanted_download_urls; |
748
|
|
|
|
|
|
|
for my $link (@download_urls) { |
749
|
|
|
|
|
|
|
# Pick ones to keep. |
750
|
|
|
|
|
|
|
push @wanted_download_urls, $link; |
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
return @wanted_download_urls; |
754
|
|
|
|
|
|
|
}; |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
=head2 App::FetchwareX::HTMLPageSync App::Fetchware-like API. |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
my $temp_file = start(); |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
my $download_url = lookup(); |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
download($temp_dir, $download_url); |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
verify($download_url, $package_path); |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
unarchive($package_path); |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
build($build_path); |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
install(); |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
uninstall($build_path); |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=head1 MOTIVATION |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
I want to automatically parse a Web page with links to wall papers that I want |
777
|
|
|
|
|
|
|
to download. Only I want software to do it for me. That's where this |
778
|
|
|
|
|
|
|
App::Fetchware extension comes in. |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
=head1 DESCRIPTION |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync is an example App::Fetchware extension. It's not |
783
|
|
|
|
|
|
|
a large extension, but instead is a simple one meant to show how easy it is |
784
|
|
|
|
|
|
|
to extend App::Fetchware. |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync parses the Web page you specify to create a list of |
787
|
|
|
|
|
|
|
download links. Then it downloads those links, and installs them to your |
788
|
|
|
|
|
|
|
C<destination_directory>. |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
In order to use App::FetchwareX::HTMLPageSync to help you mirror the download |
791
|
|
|
|
|
|
|
links on a HTML page you need to create a App::FetchwareX::HTMLPageSync |
792
|
|
|
|
|
|
|
Fetchwarefile, you can do this easily by just running C, and |
793
|
|
|
|
|
|
|
typing in C when it asks you what extension of Fetchwarefile you |
794
|
|
|
|
|
|
|
want to create. |
795
|
|
|
|
|
|
|
L |
796
|
|
|
|
|
|
|
Then you'll need to |
797
|
|
|
|
|
|
|
L |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
=head1 App::FetchwareX::HTMLPageSync API SUBROUTINES |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
This is App::FetchwareX::HTMLPageSync's API that fetchware uses to execute any |
802
|
|
|
|
|
|
|
Fetchwarefile's that make use of App::FetchwareX::HTMLPageSync. This API is the |
803
|
|
|
|
|
|
|
same that regular old App::Fetchware uses for most standard FOSS software, and |
804
|
|
|
|
|
|
|
this internal documentation is only needed when debugging HTMLPageSync's code or |
805
|
|
|
|
|
|
|
when studying it to create your own fetchware extension. |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
=head2 new() |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
my ($program_name, $fetchwarefile) = new($term, $program_name); |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
# Or in an extension, you can return whatever list of variables you want, |
812
|
|
|
|
|
|
|
# and then cmd_new() will provide them as arguments to new_install() except |
813
|
|
|
|
|
|
|
# a $term Term::ReadLine object will precede the others. |
814
|
|
|
|
|
|
|
my ($term, $program_name, $fetchwarefile, $custom_argument1, $custom_argument2) |
815
|
|
|
|
|
|
|
= new($term, $program_name); |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
new() is App::Fetchware's API subroutine that implements fetchware's new |
818
|
|
|
|
|
|
|
command. It simply uses Term::UI to ask the user some questions that determine |
819
|
|
|
|
|
|
|
what configuration options will be added to the generated Fetchwarefile. new() |
820
|
|
|
|
|
|
|
takes a $term, Term::UI/Term::Readline object, and the optional name of the |
821
|
|
|
|
|
|
|
program or Website in this case that HTMLPageSync is page syncing. |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
Whatever scalars (not references just regular strings) that new() returns will |
824
|
|
|
|
|
|
|
be shared with new()'s sister API subroutine new_install() that is called after |
825
|
|
|
|
|
|
|
new() is called by cmd_new(), which implements fetchware's new command. |
826
|
|
|
|
|
|
|
new_install() is called in the parent process, so it does have root permissions, |
827
|
|
|
|
|
|
|
so be sure to test it as root as well. |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
=over |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=item drop_privs() NOTES |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
This section notes whatever problems you might come across implementing and |
834
|
|
|
|
|
|
|
debugging your Fetchware extension due to fetchware's drop_privs mechanism. |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
See L. |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
=over |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=item * |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
This subroutine is B<not> run as root; instead, it is run as a regular user |
843
|
|
|
|
|
|
|
unless the C configuration option has been set to true. |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
=back |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
=back |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=head3 get_html_page_url() |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
my $html_page_url = get_html_page_url($term); |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
Uses $term argument as a L/L object to interactively |
854
|
|
|
|
|
|
|
explain what a L is, and to ask the user to provide one and press |
855
|
|
|
|
|
|
|
enter. |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
=head3 get_destination_directory() |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
my $destination_directory = get_destination_directory($term); |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
Uses $term argument as a L/L object to interactively |
862
|
|
|
|
|
|
|
explain what a C is, and to ask the user to provide one |
863
|
|
|
|
|
|
|
and press enter. |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head3 ask_about_keep_destination_directory() |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
ask_about_keep_destination_directory($term, $fetchwarefile); |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
ask_about_keep_destination_directory() does just that it asks the user if they |
870
|
|
|
|
|
|
|
would like to enable the C configuration option to |
871
|
|
|
|
|
|
|
preserve their C<destination_directory> when they uninstall the associated |
872
|
|
|
|
|
|
|
Fetchware package or Fetchwarefile. If they answer Y, |
873
|
|
|
|
|
|
|
C is added to their Fetchwarefile, and if not |
874
|
|
|
|
|
|
|
nothing is added, because deleting their C<destination_directory> is the |
875
|
|
|
|
|
|
|
default that will happen even if the C is not even |
876
|
|
|
|
|
|
|
in the Fetchwarefile. |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=head2 new_install() |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
my $fetchware_package_path = new_install($page_name, $fetchwarefile); |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
new_install() asks the user if they would like to install the previously |
883
|
|
|
|
|
|
|
generated Fetchwarefile that new() created. If they answer yes, then that |
884
|
|
|
|
|
|
|
program associated with that Fetchwarefile is installed. In our case, that means |
885
|
|
|
|
|
|
|
that whatever files are configured for download will be downloaded. If they |
886
|
|
|
|
|
|
|
answer no, then the path to the generated Fetchwarefile will be printed. |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
new_install() is imported by L from App::Fetchware, |
889
|
|
|
|
|
|
|
and also exported by App::FetchwareX::HTMLPageSync. This is how |
890
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware. |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
=head2 check_syntax() |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
'Syntax Ok' = check_syntax() |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=over |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
=item Configuration subroutines used: |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
=over |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
=item none |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
=back |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
=back |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
Calls check_config_options() to check for the following syntax errors in |
909
|
|
|
|
|
|
|
Fetchwarefiles. Note by the time check_syntax() has been called |
910
|
|
|
|
|
|
|
parse_fetchwarefile() has already parsed the Fetchwarefile, and any syntax |
911
|
|
|
|
|
|
|
errors in the user's Fetchwarefile will have already been reported by Perl. |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
This may seem like a bug, but it's not. Do you really want to try to use regexes |
914
|
|
|
|
|
|
|
or something to try to parse the Fetchwarefile reliably, and then report errors |
915
|
|
|
|
|
|
|
to users? Or add PPI of all insane Perl modules as a dependency just to write |
916
|
|
|
|
|
|
|
syntax checking code that most of the time says the syntax is Ok anyway, and |
917
|
|
|
|
|
|
|
therefore a complete waste of time and effort? I don't want to deal with any of |
918
|
|
|
|
|
|
|
that insanity. |
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
Instead, check_syntax() uses config() to examine the already parsed |
921
|
|
|
|
|
|
|
Fetchwarefile for "higher-level" or "Fetchware-level" syntax errors. Syntax |
922
|
|
|
|
|
|
|
errors that are B syntax errors instead of just Perl syntax errors. |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
For yours and my own convenience I created check_config_options() helper |
925
|
|
|
|
|
|
|
subroutine. It's data driven, and will check Fetchwarefile's for three different |
926
|
|
|
|
|
|
|
types of common syntax errors that occur in App::Fetchware's Fetchwarefile |
927
|
|
|
|
|
|
|
syntax. These errors are more at the level of I than actual syntax |
928
|
|
|
|
|
|
|
errors. See its POD below for additional details. |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
Below briefly lists what App::Fetchware's implementation of check_syntax() |
931
|
|
|
|
|
|
|
checks. |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
=over |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
=item * Mandatory configuration options |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
=over |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
=item * page_name, html_page_url, and destination_directory are required for all Fetchwarefiles. |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=back |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=back |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
=over |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=item drop_privs() NOTES |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
This section notes whatever problems you might come across implementing and |
950
|
|
|
|
|
|
|
debugging your Fetchware extension due to fetchware's drop_privs mechanism. |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
See L. |
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
=over |
955
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
=item * |
957
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
check_syntax() is run in the parent process before even start() has run, so no |
959
|
|
|
|
|
|
|
temporary directory is available for use. |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
=back |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
=back |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
=head2 start() |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
my $temp_file = start(); |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
start() creates a temp dir, chmod 700's it, and chdir()'s to it just like the one |
970
|
|
|
|
|
|
|
in App::Fetchware does. App::FetchwareX::HTMLPageSync |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
start() is imported use L from App::Fetchware, |
973
|
|
|
|
|
|
|
and also exported by App::FetchwareX::HTMLPageSync. This is how |
974
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware. |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
=head2 lookup() |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
my $download_url = lookup(); |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
lookup() downloads the user specified C, parses it using |
981
|
|
|
|
|
|
|
HTML::TreeBuilder, and uses C and |
982
|
|
|
|
|
|
|
C if specified to manipulate the tree to determine what |
983
|
|
|
|
|
|
|
download urls the user wants. |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
This list of download urls is returned as an array reference, $download_url. |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
=head2 download() |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
download($temp_dir, $download_url); |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
download() uses App::Fetchware's utility function download_http_url() to |
992
|
|
|
|
|
|
|
download all of the urls that lookup() returned. If the user specified a |
993
|
|
|
|
|
|
|
C configuration option, then that option is passed along to |
994
|
|
|
|
|
|
|
download_http_url()'s call to HTTP::Tiny. |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
=head2 verify() |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
verify($download_url, $package_path); |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
verify() simply calls App::Fetchware's :UTIL subroutine do_nothing(), which as |
1001
|
|
|
|
|
|
|
you can tell from its name does nothing, but return. The reason for the useless |
1002
|
|
|
|
|
|
|
do_nothing() call is simply for better documentation, and standardizing how to |
1003
|
|
|
|
|
|
|
override a App::Fetchware API subroutine in order for it to do nothing at all, |
1004
|
|
|
|
|
|
|
so that you can prevent the original App::Fetchware subroutine from doing what |
1005
|
|
|
|
|
|
|
it normally does. |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=head2 unarchive() |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
unarchive(); |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
unarchive() does nothing by calling App::Fetchware's :UTIL subroutine |
1012
|
|
|
|
|
|
|
do_nothing(), which does nothing. |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
=head2 build() |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
build($build_path); |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
build() does the same thing as verify(), and that is nothing by calling |
1019
|
|
|
|
|
|
|
App::Fetchware's do_nothing() subroutine to better document the fact |
1020
|
|
|
|
|
|
|
that it does nothing. |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
=head2 install() |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
install($package_path); |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
install() takes the $package_path, which is really an array ref of the paths |
1027
|
|
|
|
|
|
|
of the files that download() copied, and copies them to the user specified |
1028
|
|
|
|
|
|
|
destination directory, C<destination_directory>. |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
=head2 end() |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
end(); |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
end() chdir()s back to the original directory, and cleans up the temp directory |
1035
|
|
|
|
|
|
|
just like the one in App::Fetchware does. App::FetchwareX::HTMLPageSync |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
end() is imported use L from App::Fetchware, |
1038
|
|
|
|
|
|
|
and also exported by App::FetchwareX::HTMLPageSync. This is how |
1039
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
=head2 uninstall() |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
uninstall($build_path); |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Uninstalls App::FetchwareX::HTMLPageSync by recursively deleting the |
1046
|
|
|
|
|
|
|
C where it stores the wallpapers or whatever you |
1047
|
|
|
|
|
|
|
specified it to download for you. If you would like to keep your |
1048
|
|
|
|
|
|
|
C, then set the C to true in |
1049
|
|
|
|
|
|
|
your Fetchwarefile, and Fetchware will I<not> delete your |
1050
|
|
|
|
|
|
|
C, when you uninstall your Fetchware package. |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
=head2 upgrade() |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
my $upgrade = upgrade($download_path, $fetchware_package_path) |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
if ($upgrade) { |
1057
|
|
|
|
|
|
|
... |
1058
|
|
|
|
|
|
|
} |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
=over |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
=item Configuration subroutines used: |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
=over |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=item none |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=back |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
=back |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
Uses $download_path, an arrayref of URLs to download in HTMLPageSync, and |
1073
|
|
|
|
|
|
|
compares it against the list of files that has already been downloaded by |
1074
|
|
|
|
|
|
|
glob()ing C. And then comparing the file names of the |
1075
|
|
|
|
|
|
|
specified files. |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
Returns true if $download_path has any URLs that have not already been |
1078
|
|
|
|
|
|
|
downloaded into C<destination_directory>. Note: HEAD HTTP queries are B<not> |
1079
|
|
|
|
|
|
|
used to check if any already downloaded files are I than the files in |
1080
|
|
|
|
|
|
|
the C. |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
Returns false if $download_path is the same as C. |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
=over |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
=item drop_privs() NOTES |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
This section notes whatever problems you might come across implementing and |
1089
|
|
|
|
|
|
|
debugging your Fetchware extension due to fetchware's drop_privs mechanism. |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
See L. |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
=over |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
=item * |
1096
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
upgrade() is run in the B process as nobody or C, because the child |
1098
|
|
|
|
|
|
|
needs to know if it should actually bother running the rest of fetchware's API |
1099
|
|
|
|
|
|
|
subroutines. |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
=back |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=back |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
=head1 MANUALLY CREATING A App::FetchwareX::HTMLPageSync FETCHWAREFILE |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
In order to use App::FetchwareX::HTMLPageSync you must first create a |
1108
|
|
|
|
|
|
|
Fetchwarefile to use it. You can use C<fetchware new> as explained above, or |
1109
|
|
|
|
|
|
|
create one manually in your text editor. |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
=over |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=item B<1. Name it> |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
Use your text editor to create a file with a C<.Fetchwarefile> file extension. |
1116
|
|
|
|
|
|
|
Use of this convention is not required, but it makes it obvious what type of |
1117
|
|
|
|
|
|
|
file it is. Then, just copy and paste the example text below, and replace |
1118
|
|
|
|
|
|
|
C<[page_name]> with what you choose your C to be. C is |
1119
|
|
|
|
|
|
|
simply a configuration option that simply names your Fetchwarefile. It is not |
1120
|
|
|
|
|
|
|
actually used for anything other than to name your Fetchwarefile to document |
1121
|
|
|
|
|
|
|
what program or behavior this Fetchwarefile manages. |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
use App::FetchwareX::HTMLPageSync; |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
page_name '[page_name]'; |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
Fetchwarefiles are actually small, well structured, Perl programs that can |
1128
|
|
|
|
|
|
|
contain arbitrary perl code to customize fetchware's behavior, or, in most |
1129
|
|
|
|
|
|
|
cases, simply specify a number of fetchware or a fetchware extension's (as in |
1130
|
|
|
|
|
|
|
this case) configuration options. Below is my filled in example |
1131
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync fetchwarefile. |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
use App::FetchwareX::HTMLPageSync; |
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
page_name 'Cool Wallpapers'; |
1136
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
Notice the C |
1138
|
|
|
|
|
|
|
absolutely critical for this Fetchwarefile to work properly, because it is what |
1139
|
|
|
|
|
|
|
allows fetchware to use Perl's own syntax as a nice easy to use syntax for |
1140
|
|
|
|
|
|
|
Fetchwarefiles. If you do not use the matching C |
1141
|
|
|
|
|
|
|
then fetchware will spit out crazy errors from Perl's own compiler listing all |
1142
|
|
|
|
|
|
|
of the syntax errors you have. If you ever receive that error, just ensure you |
1143
|
|
|
|
|
|
|
have the correct C |
1144
|
|
|
|
|
|
|
Fetchwarefile. |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
=item B<2. Determine your html_page_url> |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
At the heart of App::FetchwareX::HTMLPageSync is its C, which is |
1149
|
|
|
|
|
|
|
the URL to the HTML page you want HTMLPageSync to download and parse out links |
1150
|
|
|
|
|
|
|
to wallpaper or whatever else you'd like to automate downloading. To figure this |
1151
|
|
|
|
|
|
|
out just use your browser to find the HTML page you want to use, and then copy |
1152
|
|
|
|
|
|
|
and paste the url between the single quotes C<'> as shown in the example below. |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
html_page_url ''; |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
And then after you copy the url. |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
html_page_url 'http://some.url/something.html'; |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
=item B<3. Determine your destination_directory> |
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
HTMLPageSync also needs to know your C. This is the |
1163
|
|
|
|
|
|
|
directory that HTMLPageSync will copy your downloaded files to. This directory |
1164
|
|
|
|
|
|
|
will also be deleted when you uninstall this HTMLPageSync fetchware package just |
1165
|
|
|
|
|
|
|
like a standard App::Fetchware package would uninstall any installed software |
1166
|
|
|
|
|
|
|
when it is uninstalled. Just copy and paste the example below, and fill in the |
1167
|
|
|
|
|
|
|
space between the single quotes C<'>. |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
destination_directory ''; |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
After pasting it should look like. |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
destination_directory '~/wallpapers'; |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
Furthermore, if you want to keep your C after you |
1176
|
|
|
|
|
|
|
uninstall your HTMLPageSync fetchware package, just set the |
1177
|
|
|
|
|
|
|
C configuration option to true: |
1178
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
keep_destination_directory 'True'; |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
If this is set in your HTMLPageSync Fetchwarefile, HTMLPageSync will not delete |
1182
|
|
|
|
|
|
|
your C when your HTMLPageSync fetchware package is |
1183
|
|
|
|
|
|
|
uninstalled. |
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
=item B<4. Specify other options> |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
That's all there is to it unless you need to further customize HTMLPageSync's |
1188
|
|
|
|
|
|
|
behavior to get just the links you need to download. |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
At this point you can install your new Fetchwarefile with: |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
fetchware install [path to your new fetchwarefile] |
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
Or you can further customize it as shown next. |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
=item B<5. Specify an optional user_agent> |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
Many sites don't like bots downloading stuff from them wasting their bandwidth, |
1199
|
|
|
|
|
|
|
and will even limit what you can do based on your user agent, which is the HTTP |
1200
|
|
|
|
|
|
|
standard's name for your browser. This option allows you to pretend to be |
1201
|
|
|
|
|
|
|
something other than HTMLPageSync's underlying library, L<HTTP::Tiny>. Just copy |
1202
|
|
|
|
|
|
|
and paste the example below, and paste what you want your user agent to be between |
1203
|
|
|
|
|
|
|
the single quotes C<'> as before. |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
user_agent ''; |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
And after pasting. |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'; |
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
=item B<6. Specify an optional html_treebuilder_callback> |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
C specifies an optional anonymous Perl subroutine |
1214
|
|
|
|
|
|
|
reference that will replace the default one that HTMLPageSync uses. The default |
1215
|
|
|
|
|
|
|
one limits the download to only image format links, which is flexible enough for |
1216
|
|
|
|
|
|
|
downloading wallpapers. |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
If you want to download something different, then paste the example below in |
1219
|
|
|
|
|
|
|
your Fetchwarefile. |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
html_treebuilder_callback sub { |
1222
|
|
|
|
|
|
|
# Get one HTML::Element. |
1223
|
|
|
|
|
|
|
my $h = shift; |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
# Return true or false to indicate if this HTML::Element should be a |
1226
|
|
|
|
|
|
|
# download link. |
1227
|
|
|
|
|
|
|
if (something) { |
1228
|
|
|
|
|
|
|
return 'True'; |
1229
|
|
|
|
|
|
|
} else { |
1230
|
|
|
|
|
|
|
return undef; |
1231
|
|
|
|
|
|
|
} |
1232
|
|
|
|
|
|
|
}; |
1233
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
And create a Perl anonymous subroutine C that will |
1235
|
|
|
|
|
|
|
be executed instead of the default one. This requires knowledge of the Perl |
1236
|
|
|
|
|
|
|
programming language. The one below limits itself to only pdfs and MS word |
1237
|
|
|
|
|
|
|
documents. |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
# Download pdfs and word documents only. |
1240
|
|
|
|
|
|
|
html_treebuilder_callback sub { |
1241
|
|
|
|
|
|
|
my $tag = shift; |
1242
|
|
|
|
|
|
|
my $link = $tag->attr('href'); |
1243
|
|
|
|
|
|
|
if (defined $link) { |
1244
|
|
|
|
|
|
|
# If the anchor tag links to a pdf or word document... |
1245
|
|
|
|
|
|
|
if ($link =~ /\.(pdf|doc|docx)$/) { |
1246
|
|
|
|
|
|
|
# ...return true... |
1247
|
|
|
|
|
|
|
return 'True'; |
1248
|
|
|
|
|
|
|
} else { |
1249
|
|
|
|
|
|
|
# ...if not return false. |
1250
|
|
|
|
|
|
|
return undef; #false |
1251
|
|
|
|
|
|
|
} |
1252
|
|
|
|
|
|
|
} |
1253
|
|
|
|
|
|
|
}; |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
=item B<7. Specify an optional download_links_callback> |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
C specifies an optional anonymous Perl subroutine |
1258
|
|
|
|
|
|
|
reference that will replace the default one that HTMLPageSync uses. The default |
1259
|
|
|
|
|
|
|
one removes the HTML::Element skin each download link is wrapped in, because of |
1260
|
|
|
|
|
|
|
the use of L. This simply strips off the object-oriented crap |
1261
|
|
|
|
|
|
|
it's wrapped in, and turns it into a simple string scalar. |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
If you want to post process the download link in some other way, then just copy |
1264
|
|
|
|
|
|
|
and paste the code below into your Fetchwarefile, and add whatever other Perl |
1265
|
|
|
|
|
|
|
code you may need. This requires knowledge of the Perl programming language. |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
download_links_callback sub { |
1268
|
|
|
|
|
|
|
my @download_urls = @_; |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
my @wanted_download_urls; |
1271
|
|
|
|
|
|
|
for my $link (@download_urls) { |
1272
|
|
|
|
|
|
|
# Pick ones to keep. |
1273
|
|
|
|
|
|
|
puse @wanted_download_urls, $link; |
1274
|
|
|
|
|
|
|
} |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
return @wanted_download_urls; |
1277
|
|
|
|
|
|
|
}; |
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
=back |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
=head1 USING YOUR App::FetchwareX::HTMLPageSync FETCHWAREFILE WITH FETCHWARE |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
After you have |
1284
|
|
|
|
|
|
|
L |
1285
|
|
|
|
|
|
|
as shown above you need to actually use the fetchware command line program to |
1286
|
|
|
|
|
|
|
install, upgrade, and uninstall your App::FetchwareX::HTMLPageSync Fetchwarefile. |
1287
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
Take note how fetchware's package management metaphor does not quite line up |
1289
|
|
|
|
|
|
|
with what App::FetchwareX::HTMLPageSync does. Why would a HTML page mirroring |
1290
|
|
|
|
|
|
|
script be installed, upgraded, or uninstalled? Well HTMLPageSync simply adapts |
1291
|
|
|
|
|
|
|
fetchware's package management metaphor to its own environment performing the |
1292
|
|
|
|
|
|
|
likely action for when one of fetchware's behaviors are executed. |
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
=over |
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
=item B |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
A C<fetchware new> will cause HTMLPageSync to ask the user a bunch of questions, |
1299
|
|
|
|
|
|
|
and help them create a new HTMLPageSync Fetchwarefile. |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
=item B |
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
A C while using a HTMLPageSync Fetchwarefile causes fetchware |
1304
|
|
|
|
|
|
|
to download your C, parse it, download any matching links, and |
1305
|
|
|
|
|
|
|
then copy them to your C as you specify in your |
1306
|
|
|
|
|
|
|
Fetchwarefile. |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
=item B |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
A C will redownload the C, parse it, and |
1311
|
|
|
|
|
|
|
compare the corresponding list of files to the list of files already downloaded, |
1312
|
|
|
|
|
|
|
and if any new files have been added, then they will be downloaded. New versions |
1313
|
|
|
|
|
|
|
of existing files are not supported. No timestamp checking is implemented |
1314
|
|
|
|
|
|
|
currently. |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
=item B |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
A C will cause fetchware to delete this fetchware package |
1319
|
|
|
|
|
|
|
from its database as well as recursively deleting everything inside your |
1320
|
|
|
|
|
|
|
C as well as that directory itself. So when you uninstall |
1321
|
|
|
|
|
|
|
a HTMLPageSync fetchware package ensure that you really want to, because it will |
1322
|
|
|
|
|
|
|
delete whatever files it downloaded for you in the first place. |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
However, if you would like fetchware to preserve your C, |
1325
|
|
|
|
|
|
|
you can set the boolean C configuration option to |
1326
|
|
|
|
|
|
|
true, like C, to keep HTMLPageSync from |
1327
|
|
|
|
|
|
|
deleting your destination directory. |
1328
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
=back |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
=head1 HOW App::FetchwareX::HTMLPageSync OVERRIDES App::Fetchware |
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
This section documents how App::FetchwareX::HTMLPageSync overrides |
1334
|
|
|
|
|
|
|
App::Fetchware's API, and is only interesting if you're debugging |
1335
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync, or you're writing your own App::Fetchware |
1336
|
|
|
|
|
|
|
extension. If not, you don't need to know these details. |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
=head2 App::Fetchware API Subroutines |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
=head3 new() |
1341
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
HTMLPageSync overrides new(), and implements its own Q&A wizard interface |
1343
|
|
|
|
|
|
|
helping users create HTMLPageSync Fetchwarefiles. |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
=head3 new_install() |
1346
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
HTMLPageSync just inherits App::Fetchware's new_install(), which just asks the |
1348
|
|
|
|
|
|
|
user if they would like Fetchware to install the already generated |
1349
|
|
|
|
|
|
|
Fetchwarefile. |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
=head3 check_syntax() |
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
check_syntax() is also overridden to check HTMLPageSync's own Fetchware-level |
1354
|
|
|
|
|
|
|
syntax. |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
=head3 start() and end() |
1357
|
|
|
|
|
|
|
|
1358
|
|
|
|
|
|
|
HTMLPageSync just imports start() and end() from App::Fetchware to take |
1359
|
|
|
|
|
|
|
advantage of their ability to manage a temporary directory. |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
=head3 lookup() |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
lookup() is overridden, and downloads the C, which is the main |
1364
|
|
|
|
|
|
|
configuration option that HTMLPageSync uses. Then lookup() parses that |
1365
|
|
|
|
|
|
|
C, and determines what the download urls should be. If the |
1366
|
|
|
|
|
|
|
C and C exist, then they are |
1367
|
|
|
|
|
|
|
called to customize lookup()'s default behavior. See their descriptions below. |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
=head3 download() |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
download() downloads the array ref of download links that lookup() returns. |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=head3 verify() |
1374
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
verify() is overridden to do nothing. |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
=head3 unarchive() |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
unarchive() is overridden to do nothing. |
1380
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
=head3 build() |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
build() is overridden to do nothing. |
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
=head3 install() |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
install() takes its argument, which is an arrayref of the paths of the |
1388
|
|
|
|
|
|
|
files that were downloaded to the tempdir created by start(), and copies them to |
1389
|
|
|
|
|
|
|
the user's provided C. |
1390
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
=head3 end() and start() |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
HTMLPageSync just imports end() and start() from App::Fetchware to take |
1394
|
|
|
|
|
|
|
advantage of their ability to manage a temporary directory. |
1395
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
=head3 uninstall() |
1397
|
|
|
|
|
|
|
|
1398
|
|
|
|
|
|
|
uninstall() recursively deletes your C where it stores |
1399
|
|
|
|
|
|
|
whatever links you choose to download unless of course the |
1400
|
|
|
|
|
|
|
C configuration option is set to true. |
1401
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
=head3 upgrade() |
1403
|
|
|
|
|
|
|
|
1404
|
|
|
|
|
|
|
Determines if any looked up URLs have not been downloaded yet, and returns true |
1405
|
|
|
|
|
|
|
if that is the case. |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
=head2 App::FetchwareX::HTMLPageSync's Configuration Subroutines |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
Because HTMLPageSync is an App::Fetchware extension, it can not just use the same |
1410
|
|
|
|
|
|
|
configuration subroutines that App::Fetchware uses. Instead, it must create its |
1411
|
|
|
|
|
|
|
own configuration subroutines with App::Fetchware::CreateConfigOptions. These |
1412
|
|
|
|
|
|
|
configuration subroutines are the configuration options that you use in your |
1413
|
|
|
|
|
|
|
App::Fetchware or App::Fetchware extension. |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
=head3 page_name [MANDATORY] |
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
HTMLPageSync's equivalent to App::Fetchware's C. It's simply the |
1418
|
|
|
|
|
|
|
name of the page or what you want to download on that page. |
1419
|
|
|
|
|
|
|
|
1420
|
|
|
|
|
|
|
=head3 html_page_url [MANDATORY] |
1421
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
HTMLPageSync's equivalent to App::Fetchware's C, and is just as |
1423
|
|
|
|
|
|
|
mandatory. This is the url of the HTML page that will be downloaded and |
1424
|
|
|
|
|
|
|
processed. |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
=head3 destination_directory [MANDATORY] |
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
This option is also mandatory, and it specifies the directory where the files |
1429
|
|
|
|
|
|
|
that you want to download are downloaded to. |
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
=head3 user_agent [OPTIONAL] |
1432
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
This option is optional, and it allows you to have HTTP::Tiny pretend to be a |
1434
|
|
|
|
|
|
|
Web browser or perhaps bot if you want to. |
1435
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
=head3 html_treebuilder_callback [OPTIONAL] |
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
This optional option allows you to specify a perl C that lookup() will |
1439
|
|
|
|
|
|
|
execute instead of its default callback that just looks for images. |
1440
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
It receives one parameter, which is an HTML::Element at the first C, |
1442
|
|
|
|
|
|
|
anchor/link tag. |
1443
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
It must C to indicate that that link should be included in the |
1445
|
|
|
|
|
|
|
list of download links, or return false, C, to indicate that that |
1446
|
|
|
|
|
|
|
link should not be included in the list of download links. |
1447
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
=head3 download_links_callback [OPTIONAL] |
1449
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
This optional option specifies an optional callback that will allow you to do |
1451
|
|
|
|
|
|
|
post processing of the list of downloaded urls. This is needed, because the |
1452
|
|
|
|
|
|
|
results of the C are still HTML::Element objects that |
1453
|
|
|
|
|
|
|
need to be converted to just string download urls. That is what the default |
1454
|
|
|
|
|
|
|
C does. |
1455
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
It receives a list of all of the download HTML::Elements that |
1457
|
|
|
|
|
|
|
C returned true on. It is called only once, and |
1458
|
|
|
|
|
|
|
should return a list of string download links for download later by HTTP::Tiny |
1459
|
|
|
|
|
|
|
in download(). |
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
=head3 keep_destination_directory [OPTIONAL] |
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
This optional option is a boolean true or false configuration option that |
1464
|
|
|
|
|
|
|
when true prevents HTMLPageSync from deleting your destination_directory when |
1465
|
|
|
|
|
|
|
you run fetchware uninstall. |
1466
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
Its default is false, so by default HTMLPageSync B delete your files from |
1468
|
|
|
|
|
|
|
your C unless you set this to true. |
1469
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
=head1 ERRORS |
1471
|
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
As with the rest of App::Fetchware, App::FetchwareX::HTMLPageSync does not return any |
1473
|
|
|
|
|
|
|
error codes; instead, all errors are die()'d if it's App::FetchwareX::HTMLPageSync's |
1474
|
|
|
|
|
|
|
error, or croak()'d if it's the caller's fault. These exceptions are simple |
1475
|
|
|
|
|
|
|
strings, and are listed in the L section below. |
1476
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
=head1 CAVEATS |
1478
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
Certain features of App::FetchwareX::HTMLPageSync require knowledge of the Perl |
1480
|
|
|
|
|
|
|
programming language in order for you to make use of them. However, this is |
1481
|
|
|
|
|
|
|
limited to optional callbacks that are not needed for most uses. These features |
1482
|
|
|
|
|
|
|
are the C and C callbacks. |
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
=head1 AUTHOR |
1485
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
David Yingling |
1487
|
|
|
|
|
|
|
|
1488
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1489
|
|
|
|
|
|
|
|
1490
|
|
|
|
|
|
|
This software is copyright (c) 2016 by David Yingling. |
1491
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
1493
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
1494
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
=cut |
1496
|
|
|
|
|
|
|
|
1497
|
|
|
|
|
|
|
__END__ |