| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package App::FetchwareX::HTMLPageSync; |
|
2
|
|
|
|
|
|
|
our $VERSION = '1.016'; # VERSION: generated by DZP::OurPkgVersion |
|
3
|
|
|
|
|
|
|
# ABSTRACT: An App::Fetchware extension that downloads files based on an HTML page. |
|
4
|
1
|
|
|
1
|
|
7703
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
23
|
|
|
5
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
19
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# Enable Perl 6 knockoffs, and use 5.10.1, because smartmatching and other |
|
8
|
|
|
|
|
|
|
# things in 5.10 were changed in 5.10.1+. |
|
9
|
1
|
|
|
1
|
|
17
|
use 5.010001; |
|
|
1
|
|
|
|
|
3
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Use fetchware's API's to help us out. |
|
12
|
1
|
|
|
1
|
|
3
|
use App::Fetchware::Util ':UTIL'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
162
|
|
|
13
|
1
|
|
|
1
|
|
5
|
use App::Fetchware::Config ':CONFIG'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
90
|
|
|
14
|
1
|
|
|
1
|
|
411
|
use App::Fetchware::Fetchwarefile; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
28
|
|
|
15
|
1
|
|
|
|
|
145
|
use App::Fetchware qw( |
|
16
|
|
|
|
|
|
|
:OVERRIDE_NEW |
|
17
|
|
|
|
|
|
|
:OVERRIDE_NEW_INSTALL |
|
18
|
|
|
|
|
|
|
:OVERRIDE_CHECK_SYNTAX |
|
19
|
1
|
|
|
1
|
|
5
|
); |
|
|
1
|
|
|
|
|
0
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Local imports. |
|
22
|
1
|
|
|
1
|
|
4
|
use File::Copy 'cp'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
23
|
1
|
|
|
1
|
|
4
|
use File::Path 'remove_tree'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
40
|
|
|
24
|
1
|
|
|
1
|
|
3
|
use URI::Split 'uri_split'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
41
|
|
|
25
|
1
|
|
|
1
|
|
4
|
use File::Spec 'splitpath'; |
|
|
1
|
|
|
|
|
0
|
|
|
|
1
|
|
|
|
|
14
|
|
|
26
|
1
|
|
|
1
|
|
3
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
32
|
|
|
27
|
1
|
|
|
1
|
|
4
|
use Scalar::Util 'blessed'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
45
|
|
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# Use App::Fetchware::ExportAPI to specify which App::Fetchware API subroutines |
|
30
|
|
|
|
|
|
|
# we are going to "KEEP", import from App::Fetchware, and which API subs we are |
|
31
|
|
|
|
|
|
|
# going to "OVERRIDE", i.e. implement here in this package. |
|
32
|
|
|
|
|
|
|
# |
|
33
|
|
|
|
|
|
|
# ExportAPI takes care of the grunt work for us by setting our packages @EXPORT |
|
34
|
|
|
|
|
|
|
# appropriately, and even importing Exporter's import() method into our package |
|
35
|
|
|
|
|
|
|
# for us, so that our App::Fetchware API subroutines and configuration options |
|
36
|
|
|
|
|
|
|
# specified below can be import()ed properly. |
|
37
|
|
|
|
|
|
|
use App::Fetchware::ExportAPI |
|
38
|
|
|
|
|
|
|
# KEEP or "inherit" new_install, because I want my new_install to just call |
|
39
|
|
|
|
|
|
|
# ask_to_install_now_to_test_fetchwarefile(), and App::Fetchware's does that |
|
40
|
|
|
|
|
|
|
# already for me. And start() and end() are to create and manage the |
|
41
|
|
|
|
|
|
|
# temporary directory for me, so I don't have to worry about polluting the |
|
42
|
|
|
|
|
|
|
# current working directory with temporary files. |
|
43
|
1
|
|
|
|
|
7
|
KEEP => [qw(new_install start end)], |
|
44
|
|
|
|
|
|
|
# OVERRIDE everything else. |
|
45
|
|
|
|
|
|
|
OVERRIDE => |
|
46
|
|
|
|
|
|
|
[qw(new check_syntax lookup download verify unarchive build install |
|
47
|
|
|
|
|
|
|
uninstall upgrade)] |
|
48
|
1
|
|
|
1
|
|
403
|
; |
|
|
1
|
|
|
|
|
2
|
|
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Use App::Fetchware::CreateconfigOptions to build our App::Fetchware |
|
52
|
|
|
|
|
|
|
# configuration options for us. These are subroutines with correct prototypes to |
|
53
|
|
|
|
|
|
|
# turn a perl code file into something that resembles a configuration file. |
|
54
|
|
|
|
|
|
|
use App::Fetchware::CreateConfigOptions |
|
55
|
1
|
|
|
|
|
6
|
ONE => [qw( |
|
56
|
|
|
|
|
|
|
page_name |
|
57
|
|
|
|
|
|
|
html_page_url |
|
58
|
|
|
|
|
|
|
destination_directory |
|
59
|
|
|
|
|
|
|
user_agent |
|
60
|
|
|
|
|
|
|
html_treebuilder_callback |
|
61
|
|
|
|
|
|
|
download_links_callback |
|
62
|
|
|
|
|
|
|
)], |
|
63
|
|
|
|
|
|
|
BOOLEAN => [qw(keep_destination_directory)] |
|
64
|
1
|
|
|
1
|
|
5
|
; |
|
|
1
|
|
|
|
|
2
|
|
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
67
|
1
|
|
|
1
|
|
4
|
use Exporter 'import'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
1719
|
|
|
68
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
|
69
|
|
|
|
|
|
|
TESTING => [qw( |
|
70
|
|
|
|
|
|
|
get_html_page_url |
|
71
|
|
|
|
|
|
|
get_destination_directory |
|
72
|
|
|
|
|
|
|
ask_about_keep_destination_directory |
|
73
|
|
|
|
|
|
|
new |
|
74
|
|
|
|
|
|
|
new_install |
|
75
|
|
|
|
|
|
|
)] |
|
76
|
|
|
|
|
|
|
); |
|
77
|
|
|
|
|
|
|
our @EXPORT_OK = map {@{$_}} values %EXPORT_TAGS; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# new($term, $page_name)
#
# Implements HTMLPageSync's "new" command: walks the user through the
# configuration options and builds a Fetchwarefile from the answers.
#
# Parameters:
#   $term      - terminal object that is handed to every prompting helper
#                (get_html_page_url(), get_destination_directory(),
#                ask_about_keep_destination_directory(),
#                prompt_for_other_options() and edit_manually()).
#   $page_name - page name passed to fetchwarefile_name(); that helper's return
#                value is the page_name actually used.
#
# Returns the list ($page_name, $fetchwarefile). $fetchwarefile is the output
# of the Fetchwarefile object's generate() when edit_manually() handed the
# object back, otherwise it is whatever edit_manually() returned (the user's
# hand edited Fetchwarefile).
sub new {
    my ($term, $page_name) = @_;

    # Help text for each configuration option. It is used both as the comment
    # written above the option in the generated Fetchwarefile, and as the
    # explanation printed before the matching prompt below, so the two can
    # never drift apart.
    my %description_for = (
        page_name => <<EOA,
page_name simply names the HTML page the Fetchwarefile is responsible for
downloading, analyzing via optional callbacks, and copying to your
destination_directory.
EOA
        html_page_url => <<EOA,
html_page_url is HTMLPageSync's lookup_url equivalent. It specifies a HTTP url
that returns a page of HTML that can be easily parsed of links to later
download.
EOA
        destination_directory => <<EOA,
destination_directory is the directory on your computer where you want the files
that you configure HTMLPageSync to parse to be copied to.
EOA
        user_agent => <<EOA,
user_agent, if specified, will be passed to HTTP::Tiny, the Perl HTTP library
Fetchware uses, where the library will lie to the Web server you are Web
scraping from to hopefully prevent the Web sever from banning you, or updating
the page you want to scrap to use too much Javascript, which would prevent the
simple parser HTMLPageSync uses from working on the specified html_page_url.
EOA
        html_treebuilder_callback => <<EOA,
html_treebuilder_callback allows you to specify a perl CODEREF that HTMLPageSync
will execute instead of its default callback that just looks for images.

It receives one parameter, which is an HTML::Element at the first C<a>,
anchor/link tag.

It must [return 'True';] to indicate that that link should be included in the
list of download links, or return false, [return undef], to indicate that that
link should not be included in the list of download links.
EOA
        download_links_callback => <<EOA,
download_links_callback specifies an optional callback that will allow you to do
post processing of the list of downloaded urls. This is needed, because the
results of the html_treebuilder_callback are still HTML::Element objects that
need to be converted to just string download urls. That is what the default
C<download_links_callback> does.

It receives a list of all of the download HTML::Elements that
C<html_treebuilder_callback> returned true on. It is called only once, and
should return a list of string download links for download later by
HTMLPageSync.
EOA
        keep_destination_directory => <<EOA,
keep_destination_directory is a boolean true or false configuration option that
when true prevents HTMLPageSync from deleting your destination_directory when
you run fetchware uninstall.
EOA
    );

    # Extra paragraph shown when prompting for the two callback options.
    my $callback_note = <<EOP;
Because Term::UI's imput is limited to just one line, please just press enter,
and a dummy value will go into your Fetchwarefile, where you can then replace
that dummy value with a proper Perl callback next, when Fetchware gives you the
option to edit your Fetchwarefile manually.
EOP

    # Instantiate a new Fetchwarefile object for managing and generating a
    # Fetchwarefile, which we'll write to a file for the user or use to
    # build a associated Fetchware package.
    my $now = localtime;
    my $fetchwarefile = App::Fetchware::Fetchwarefile->new(
        header => <<EOF,
use App::FetchwareX::HTMLPageSync;
# Auto generated $now by HTMLPageSync's fetchware new command.
# However, feel free to edit this file if HTMLPageSync's new command's
# autoconfiguration is not enough.
#
# Please look up HTMLPageSync's documentation of its configuration file syntax at
# perldoc App::FetchwareX::HTMLPageSync, and only if its configuration file
# syntax is not malleable enough for your application should you resort to
# customizing fetchware's behavior. For extra flexible customization see perldoc
# App::FetchwareX::HTMLPageSync.
EOF
        descriptions => { %description_for },
    );

    extension_name(__PACKAGE__);

    opening_message(<<EOM);
HTMLPageSync's new command is not as sophistocated as Fetchware's. Unless you
only want to download images, you will have to get your hands dirty, and code up
some custom Perl callbacks to customize HTMLPageSync's behavior. However, it
will ask you quite nicely the basic options, so if those are all you need, then
this command will successfully generate a HTMLPageSync Fetchwarefile for you.

After it lets you choose the easy options of page_name, html_page_url,
and destination_directory, it will give you an opportunity to modify the
user_agent string HTMLPageSync uses to avoid betting banned or having your
scraping stick out like a sore thumb in the target Web server's logs. Then,
you'll be asked about the advanced options. If you want them it will add generic
ones to the Fetchwarefile that you can then fill in later on when HTMLPageSync
asks you if you want to edit the generated Fetchwarefile manually. Finally,
after your Fetchwarefile is generated HTMLPageSync will ask you if you would
like to install your generated Fetchwarefile to test it out.
EOM

    # Ask the user for the basic configuration options.
    $page_name = fetchwarefile_name(page_name => $page_name);
    vmsg("Determined your page_name option to be [$page_name]");

    $fetchwarefile->config_options(page_name => $page_name);
    vmsg("Appended page_name [$page_name] configuration option to Fetchwarefile");

    my $html_page_url = get_html_page_url($term);
    vmsg("Asked user for html_page_url [$html_page_url] from user.");

    $fetchwarefile->config_options(html_page_url => $html_page_url);
    vmsg("Appended html_page_url [$html_page_url] configuration option to Fetchwarefile");

    my $destination_directory = get_destination_directory($term);
    vmsg("Asked user for destination_directory [$destination_directory] from user.");

    $fetchwarefile->config_options(destination_directory => $destination_directory);
    vmsg(<<EOM);
Appended destination_directory [$destination_directory] configuration option to
your Fetchwarefile
EOM

    # Asks and sets the keep_destination_directory configuration option if the
    # user wants to set it.
    ask_about_keep_destination_directory($term, $fetchwarefile);

    vmsg('Prompting for other options that may be needed.');
    my $other_options_hashref = prompt_for_other_options($term,
        user_agent => {
            prompt => <<EOP,
What user_agent configuration option would you like?
EOP
            print_me => $description_for{user_agent},
        },
        html_treebuilder_callback => {
            prompt => <<EOP,
What html_treebuilder_callback configuration option would you like?
EOP
            print_me => $description_for{html_treebuilder_callback}
                . "\n"
                . $callback_note,
            default => 'sub { my $h = shift; die "Dummy placeholder fill me in."; }',
        },
        download_links_callback => {
            prompt => <<EOP,
What download_links_callback configuration option would you like?
EOP
            print_me => $description_for{download_links_callback}
                . "\n"
                . $callback_note,
            default => 'sub { my @download_urls = @_; die "Dummy placeholder fill me in."; }',
        },
    );
    vmsg('User entered the following options.');
    vmsg(Dumper($other_options_hashref));

    # Append all other options to the Fetchwarefile.
    $fetchwarefile->config_options(%$other_options_hashref);
    vmsg('Appended all other options listed above to Fetchwarefile.');

    my $edited_fetchwarefile = edit_manually($term, $fetchwarefile);
    vmsg(<<EOM);
Asked user if they would like to edit their generated Fetchwarefile manually.
EOM

    # Generate Fetchwarefile.
    # If edit_manually() did not modify the Fetchwarefile (we got the object
    # back), then generate it.
    if (blessed($edited_fetchwarefile)
        and
        $edited_fetchwarefile->isa('App::Fetchware::Fetchwarefile')) {
        $fetchwarefile = $fetchwarefile->generate();
    # If edit_manually() modified the Fetchwarefile, then do not generate it,
    # and replace the Fetchwarefile object with the new string that represents
    # the user's edited Fetchwarefile.
    } else {
        $fetchwarefile = $edited_fetchwarefile;
    }

    # Whatever variables the new() API subroutine returns are written via a pipe
    # back to the parent, and then the parent reads the variables back, and
    # makes them available to new_install(), back in the parent, as arguments.
    return $page_name, $fetchwarefile;
}
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
# get_html_page_url($term)
#
# Explains what html_page_url is for and asks the user to enter one through
# $term->get_reply(). The reply is only accepted when it matches the
# ftp://, http:// or file:// pattern given as the 'allow' argument.
#
# Returns the user's reply.
sub get_html_page_url {
    my ($term) = @_;

    my $explanation = <<'EOP';
Fetchware's heart and soul is its html_page_url. This is the configuration option
that tells fetchware where to check if any new links have been added to the
specified Web page that match your criteria for download.

How to determine your application's html_page_url:
1. Simply specify the URL that of the Web page that has the images that you
would like to have Fetchware download for you.
EOP

    # Kept as an ordered list so the arguments reach get_reply() in a fixed
    # order.
    my @question = (
        print_me => $explanation,
        prompt   => q{What is your Web page's html_page_url? },
        allow    => qr!(ftp|http|file)://!,
    );

    return scalar $term->get_reply(@question);
}
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# get_destination_directory($term)
#
# Explains what destination_directory means and asks the user for one through
# $term->get_reply(). No 'allow' pattern is supplied for this prompt.
#
# Returns the user's reply.
sub get_destination_directory {
    my ($term) = @_;

    my $explanation = <<'EOP';
destination_directory is the directory on your computer where you want the files
that you configure HTMLPageSync to parse to be copied to.
EOP

    my @question = (
        print_me => $explanation,
        prompt   => q{What is your destination_directory? },
    );

    return scalar $term->get_reply(@question);
}
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# ask_about_keep_destination_directory($term, $fetchwarefile)
#
# Asks the user, through $term->ask_yn(), whether deleting the
# destination_directory on uninstall is OK (default answer: yes).
#
# Parameters:
#   $term          - terminal object providing ask_yn().
#   $fetchwarefile - Fetchwarefile object; its config_options() method is
#                    called to record the option.
#
# Only when the user answers "no" (they want to keep the directory) is
# [keep_destination_directory 'True';] appended to the Fetchwarefile. A "yes"
# answer leaves the Fetchwarefile untouched, so the default behavior described
# in the prompt (delete on uninstall) applies.
sub ask_about_keep_destination_directory {
    my ($term, $fetchwarefile) = @_;

    my $deleting_is_ok = $term->ask_yn(
        print_me => <<EOP,
By default, HTMLPageSync deletes your destination_directory when you uninstall
that destination_directory's assocated Fetchware package or Fetchwarefile. This
is done, because your deleting the Fetchware package, so it makes sense to
delete that package's associated data.

If you wish to keep your destination_directory after you uninstall this
HTMLPageSync Fetchware package, then answer N below.
EOP
        prompt  => 'Is deleting your destination_directory on uninstall OK? ',
        default => 'y',
    );

    # The question is phrased as "is deleting OK?", so it is the *negative*
    # answer that means "keep my destination_directory".
    if (not $deleting_is_ok) {
        vmsg(<<EOM);
User wants [keep_destination_directory 'True';] added to their Fetchwarefile.
EOM

        $fetchwarefile->config_options(keep_destination_directory => 'True');
        vmsg(<<EOM);
Appended [keep_destination_directory 'True';] to user's Fetchwarefile.
EOM
    }
}
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# check_syntax()
#
# Checks the already parsed Fetchwarefile by handing check_config_options() one
# 'Mandatory' rule, each with its own error message, for page_name,
# html_page_url and destination_directory.
#
# Returns whatever check_config_options() returns.
sub check_syntax {

    # One Mandatory => [ option name, error message ] pair per required option.
    my @mandatory_rules = (
        Mandatory => [ 'page_name', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a page_name configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'html_page_url', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a html_page_url configuration
option. Please add one, and try again.
EOM
        Mandatory => [ 'destination_directory', <<EOM ],
App-Fetchware: Your Fetchwarefile must specify a destination_directory
configuration option. Please add one, and try again.
EOM
    );

    return check_config_options(@mandatory_rules);
}
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
###BUGALERT### lookup() returns all files each time it is run; therefore, it |
|
392
|
|
|
|
|
|
|
#breaks the way Fetchware is supposed to work! lookup() is supposed to return |
|
393
|
|
|
|
|
|
|
#"the latest version." And in HTMLPageSync's case, it should not include files |
|
394
|
|
|
|
|
|
|
#already downloaded, because it should only return "new files" by comparing the |
|
395
|
|
|
|
|
|
|
#"available list of files" to the "already downloaded one." |
|
396
|
|
|
|
|
|
|
# lookup()
#
# Downloads the page named by the html_page_url configuration option, parses
# it with HTML::TreeBuilder, and works out which links to download.
#
# Steps:
#   1. Download html_page_url, sending the configured user_agent if there is
#      one.
#   2. Select <a> elements with the html_treebuilder_callback option, or with
#      a built-in callback that accepts hrefs ending in an image extension.
#   3. Turn the selected elements into url strings with the
#      download_links_callback option, or with a built-in callback that
#      returns each element's href attribute.
#   4. Prefix html_page_url to every url that does not already start with an
#      ftp://, http://, https:// or file:// scheme.
#
# Returns an array reference of download urls; download() receives it as its
# $download_url argument.
#
# Dies if the downloaded page cannot be opened for parsing.
sub lookup {
    msg(
    "Looking up download urls using html_page_url [@{[config('html_page_url')]}]");
    ###BUGALERT### Create a user changeable version of lookup_check_args??(), so
    #that App::Fetchware 'subclasses' can use it.

    # Download the url the user specified. The option is named 'agent', the
    # same name download() passes, so both requests use the same user agent.
    my @http_options = defined config('user_agent')
        ? (agent => config('user_agent'))
        : ();
    my $filename = download_http_url(config('html_page_url'), @http_options);
    vmsg("Downloaded html_page_url to local file [$filename].");

    # Create a HTML::TreeBuilder object for the now downloaded file.
    my $tree = HTML::TreeBuilder->new();
    # Parse $filename into a HTML::Element tree. parse_file() returns undef
    # when the file cannot be opened; without this check that failure would
    # silently look like a page without any links.
    defined $tree->parse_file($filename) or die <<EOD;
App-FetchwareX-HTMLPageSync: Failed to open the downloaded html_page_url file
[$filename] to parse it. OS error [$!].
EOD
    vmsg('Created HTML::TreeBuilder object to parse downloaded html file.');

    my $tree_callback = do {
        if (config('html_treebuilder_callback')) {
            vmsg(<<EOM);
Using user supplied html_treebuilder_callback to parse downloaded HTML file:
[
@{[config('html_treebuilder_callback')]}
]
EOM
            config('html_treebuilder_callback');
        } else {
            vmsg(<<EOM);
Using built-in default html_treebuilder_callback that only wants images.
EOM
            sub {
                my $tag = shift;
                my $link = $tag->attr('href');

                # An anchor without an href can never be a download link.
                return undef unless defined $link;

                # True only if the anchor tag points at an image.
                return $link =~ /\.(jpg|jpeg|png|bmp|tiff?|gif)$/
                    ? 'True'
                    : undef;
            };
        }
    };

    # Find the links that match our default callback or the user specified one
    # if the user specified one.
    my @download_urls = $tree->look_down(
        _tag => 'a',
        $tree_callback
    );
    vmsg(<<EOM);
Determined download urls to be:
@download_urls
EOM

    # Sort through the list of HTML::Element tags to finalize the list to
    # download.
    my $links_callback = do {
        if (config('download_links_callback')) {
            vmsg(<<EOM);
Determined download_links_callback to be user specified:
[
@{[config('download_links_callback')]}
]
EOM
            config('download_links_callback');
        } else {
            # Strip off HTML::Element crap by default.
            sub {
                vmsg(<<EOM);
Using built-in default download_links_callback that turns HTML::Elements into
download urls.
EOM
                # One href string (or undef) per element handed in.
                return map { scalar $_->attr('href') } @_;
            };
        }
    };

    # Call download_links_callback or call default one to strip off
    # HTML::Element crap.
    @download_urls = $links_callback->(@download_urls);
    vmsg(<<EOM);
Determined download urls to be:
[
@{[@download_urls]}
]
EOM

    # The download_urls may be relative links instead of absolute links.
    # Relative ones could just be filenames without any knowledge of what the
    # actual server or path or even scheme is. Fix this by prepending
    # html_page_url to each link if there is no scheme. https:// counts as a
    # scheme too; otherwise an absolute https link would have html_page_url
    # glued in front of it.
    for my $download_url (@download_urls) {
        if ($download_url !~ m!^(ftp|https?|file)://!) {
            $download_url = config('html_page_url') . '/' . $download_url;
        }
    }

    # Return a ref to the array of download urls, because lookup()'s API only
    # allows it to return a single value, but that single value does not have to
    # be a scalar. It can be an array ref, which is used here. This works,
    # because what is returned here by lookup() is passed unchanged to
    # download(), which is also part of this API, so I can use what I return
    # here as I please inside download().
    return \@download_urls;
}
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
# download($temp_dir, $download_url)
#
# Downloads every url in the array reference $download_url with
# download_http_url(), passing the configured user_agent as the 'agent'
# option when one is set.
#
# Parameters:
#   $temp_dir     - accepted but not used by this subroutine.
#   $download_url - array reference of urls to download.
#
# Returns an array reference holding what download_http_url() returned for
# each url, in the same order as the urls.
sub download {
    my ($temp_dir, $download_url) = @_;

    msg('Downloading the download urls lookup() determined.');

    my @download_file_paths;
    # Loop over @$download_url to download all user specified URLs to temp_dir.
    foreach my $url (@{$download_url}) {
        my @agent_option;

        # Use user specified agent if they asked for it.
        if (defined config('user_agent')) {
            vmsg("Downloadig url\n"
                . "[$url]\n"
                . "using the user specified user_agent\n"
                . "[@{[config('user_agent')]}]\n");
            @agent_option = (agent => config('user_agent'));
        }
        else {
            vmsg("Downloading url [$url].");
        }

        my $downloaded_file = download_http_url($url, @agent_option);
        push @download_file_paths, $downloaded_file;
    }

    local $" = "\n"; # print each @download_file_paths on its own line.
    vmsg("Downloaded specified urls to the following paths:\n"
        . "[\n"
        . "@{[@download_file_paths]}\n"
        . "]\n");

    # AKA $package_path.
    return \@download_file_paths;
}
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
sub verify { |
|
562
|
2
|
|
|
2
|
1
|
1389
|
vmsg <
|
|
563
|
|
|
|
|
|
|
Skipping verify subroutine, because HTMLPageSync does not need to verify anything |
|
564
|
|
|
|
|
|
|
EOM |
|
565
|
2
|
|
|
|
|
7
|
do_nothing(); |
|
566
|
|
|
|
|
|
|
} |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
sub unarchive { |
|
571
|
1
|
|
|
1
|
1
|
835
|
vmsg <
|
|
572
|
|
|
|
|
|
|
Skipping unarchive subroutine, because HTMLPageSync does not need to unarchive |
|
573
|
|
|
|
|
|
|
anything |
|
574
|
|
|
|
|
|
|
EOM |
|
575
|
1
|
|
|
|
|
5
|
do_nothing(); |
|
576
|
|
|
|
|
|
|
} |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
sub build { |
|
581
|
0
|
|
|
0
|
1
|
0
|
vmsg <
|
|
582
|
|
|
|
|
|
|
Skipping build subroutine, because HTMLPageSync does not need to build anything |
|
583
|
|
|
|
|
|
|
EOM |
|
584
|
0
|
|
|
|
|
0
|
do_nothing(); |
|
585
|
|
|
|
|
|
|
} |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
sub install { |
|
590
|
|
|
|
|
|
|
# AKA $package_path. |
|
591
|
0
|
|
|
0
|
1
|
0
|
my $download_file_paths = shift; |
|
592
|
|
|
|
|
|
|
|
|
593
|
0
|
|
|
|
|
0
|
msg <
|
|
594
|
|
|
|
|
|
|
Copying files downloaded to a local temp directory to final destination directory. |
|
595
|
|
|
|
|
|
|
EOM |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# Copy over the files that have been returned by download(). |
|
598
|
0
|
|
|
|
|
0
|
for my $file_path (@$download_file_paths) { |
|
599
|
0
|
|
|
|
|
0
|
vmsg <
|
|
600
|
0
|
|
|
|
|
0
|
Copying [$file_path] -> [@{[config('destination_directory')]}]. |
|
601
|
|
|
|
|
|
|
EOM |
|
602
|
|
|
|
|
|
|
###BUGALERT### Should this die and all the rest be croaks instead??? |
|
603
|
0
|
0
|
|
|
|
0
|
cp($file_path, config('destination_directory')) or die <
|
|
604
|
|
|
|
|
|
|
App-FetchwareX-HTMLPageSync: run-time error. Fetchware failed to copy the file [$file_path] to the |
|
605
|
0
|
|
|
|
|
0
|
destination directory [@{[config('destination_directory')]}]. |
|
606
|
|
|
|
|
|
|
The OS error was [$!]. |
|
607
|
|
|
|
|
|
|
EOD |
|
608
|
|
|
|
|
|
|
} |
|
609
|
|
|
|
|
|
|
|
|
610
|
0
|
|
|
|
|
0
|
vmsg 'Successfully copied files to destination directory.'; |
|
611
|
|
|
|
|
|
|
|
|
612
|
0
|
|
|
|
|
0
|
return 'True indicating success!'; |
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
sub uninstall { |
|
620
|
1
|
|
|
1
|
1
|
2823
|
my $build_path = shift; |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# Only delete destination_directory if keep_destination_directory is false. |
|
623
|
1
|
50
|
|
|
|
11
|
unless (config('keep_destination_directory')) { |
|
624
|
|
|
|
|
|
|
|
|
625
|
1
|
|
|
|
|
9
|
msg <
|
|
626
|
|
|
|
|
|
|
Uninstalling this HTMLPageSync package by deleting your destination directory. |
|
627
|
|
|
|
|
|
|
EOM |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
###BUGALERT### Before release go through all of Fetchware's API, and subify |
|
630
|
|
|
|
|
|
|
#each main component like lookup and download were, the later ones were not |
|
631
|
|
|
|
|
|
|
#done this way. That way I can put say chdir_to_build_path() here instead of |
|
632
|
|
|
|
|
|
|
#basically copying and pasting the code like I do below. Also |
|
633
|
|
|
|
|
|
|
#chdir_to_build_path() can be put in :OVERRIDE_UNINSTALL!!! Which I can use |
|
634
|
|
|
|
|
|
|
#here. |
|
635
|
1
|
50
|
|
|
|
18
|
chdir $build_path or die <
|
|
636
|
|
|
|
|
|
|
App-FetchwareX-HTMLPageSync: Failed to uninstall the specified package and specifically to change |
|
637
|
|
|
|
|
|
|
working directory to [$build_path] before running make uninstall or the |
|
638
|
|
|
|
|
|
|
uninstall_commands provided in the package's Fetchwarefile. Os error [$!]. |
|
639
|
|
|
|
|
|
|
EOD |
|
640
|
|
|
|
|
|
|
|
|
641
|
1
|
50
|
|
|
|
5
|
if ( defined config('destination_directory')) { |
|
642
|
|
|
|
|
|
|
# Use File::Path's remove_tree() to delete the destination_directory |
|
643
|
|
|
|
|
|
|
# thereby "uninstalling" this package. Will throw an exception that I'll |
|
644
|
|
|
|
|
|
|
# let the main eval in bin/fetchware catch, print, and exit 1. |
|
645
|
0
|
|
|
|
|
0
|
vmsg <
|
|
646
|
0
|
|
|
|
|
0
|
Deleting entire destination directory [@{[config('destination_directory')]}]. |
|
647
|
|
|
|
|
|
|
EOM |
|
648
|
0
|
|
|
|
|
0
|
remove_tree(config('destination_directory')); |
|
649
|
|
|
|
|
|
|
} else { |
|
650
|
1
|
|
|
|
|
10
|
die <
|
|
651
|
|
|
|
|
|
|
App-FetchwareX-HTMLPageSync: Failed to uninstall the specified App::FetchwareX::HTMLPageSync |
|
652
|
|
|
|
|
|
|
package, because no destination_directory is specified in its Fetchwarefile. |
|
653
|
|
|
|
|
|
|
This configuration option is required and must be specified. |
|
654
|
|
|
|
|
|
|
EOD |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
# keep_destination_directory was set, so don't delete destination directory. |
|
657
|
|
|
|
|
|
|
} else { |
|
658
|
0
|
|
|
|
|
|
msg <
|
|
659
|
|
|
|
|
|
|
Uninstalling this HTMLPageSync package but keeping your destination directory. |
|
660
|
|
|
|
|
|
|
EOM |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
} |
|
663
|
|
|
|
|
|
|
|
|
664
|
0
|
|
|
|
|
|
return 'True for success.'; |
|
665
|
|
|
|
|
|
|
} |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
sub upgrade { |
|
671
|
0
|
|
|
0
|
1
|
|
my $download_path = shift; # $fetchware_package_path is not used in HTMLPageSync. |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# Get the listing of already downloaded file names. |
|
674
|
0
|
|
|
|
|
|
my @installed_downloads = glob(config('destination_directory')); |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
# Preprocess both @$download_path and @installed_downloads to ensure that |
|
677
|
|
|
|
|
|
|
# URL crap or differing full paths won't screw up the "comparisons". The |
|
678
|
|
|
|
|
|
|
# clever delete hashslice does the "comparisons" if you will. |
|
679
|
0
|
|
|
|
|
|
my @download_path_filenames = map { ( uri_split($_) )[2] } @$download_path; |
|
|
0
|
|
|
|
|
|
|
|
680
|
0
|
|
|
|
|
|
my @installed_downloads_filenames = map { ( splitpath($_) ) [2] } |
|
|
0
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
@installed_downloads; |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
# Determine what files are in @$download_path, but not in |
|
684
|
|
|
|
|
|
|
# @installed_downloads. |
|
685
|
|
|
|
|
|
|
# Algo based on code from Perl Cookbook pg. 126. |
|
686
|
0
|
|
|
|
|
|
my %seen; |
|
687
|
0
|
|
|
|
|
|
@seen{@$download_path} = (); |
|
688
|
0
|
|
|
|
|
|
delete @seen{@installed_downloads}; |
|
689
|
|
|
|
|
|
|
|
|
690
|
0
|
|
|
|
|
|
my @new_urls_to_download = keys %seen; |
|
691
|
|
|
|
|
|
|
|
|
692
|
0
|
0
|
|
|
|
|
if (@new_urls_to_download > 0) { |
|
693
|
|
|
|
|
|
|
# Alter $download_path to only list @new_urls_to_download. That way |
|
694
|
|
|
|
|
|
|
# download() only downloads the new URLs not the already downloaded ones |
|
695
|
|
|
|
|
|
|
# again. |
|
696
|
0
|
|
|
|
|
|
$download_path = [@new_urls_to_download]; |
|
697
|
|
|
|
|
|
|
|
|
698
|
0
|
|
|
|
|
|
return 'New URLs Found.'; |
|
699
|
|
|
|
|
|
|
} else { |
|
700
|
0
|
|
|
|
|
|
return; |
|
701
|
|
|
|
|
|
|
} |
|
702
|
|
|
|
|
|
|
} |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
1; |
|
706
|
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
=pod |
|
708
|
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=head1 NAME |
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync - An App::Fetchware extension that downloads files based on an HTML page. |
|
712
|
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
=head1 VERSION |
|
714
|
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
version 1.016 |
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
=head2 Example App::FetchwareX::HTMLPageSync Fetchwarefile. |
|
720
|
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
page_name 'Cool Wallpapers'; |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
html_page_url 'http://some-html-page-with-cool.urls'; |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
destination_directory 'wallpapers'; |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# pretend to be firefox |
|
728
|
|
|
|
|
|
|
user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'; |
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# Customize the callbacks. |
|
731
|
|
|
|
|
|
|
html_treebuilder_callback sub { |
|
732
|
|
|
|
|
|
|
# Get one HTML::Element. |
|
733
|
|
|
|
|
|
|
my $h = shift; |
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
# Return true or false to indicate if this HTML::Element should be a |
|
736
|
|
|
|
|
|
|
# download link. |
|
737
|
|
|
|
|
|
|
if (something) { |
|
738
|
|
|
|
|
|
|
return 'True'; |
|
739
|
|
|
|
|
|
|
} else { |
|
740
|
|
|
|
|
|
|
return undef; |
|
741
|
|
|
|
|
|
|
} |
|
742
|
|
|
|
|
|
|
}; |
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
download_links_callback sub { |
|
745
|
|
|
|
|
|
|
my @download_urls = @_; |
|
746
|
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
my @wanted_download_urls; |
|
748
|
|
|
|
|
|
|
for my $link (@download_urls) { |
|
749
|
|
|
|
|
|
|
# Pick ones to keep. |
|
750
|
|
|
|
|
|
|
push @wanted_download_urls, $link; |
|
751
|
|
|
|
|
|
|
} |
|
752
|
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
return @wanted_download_urls; |
|
754
|
|
|
|
|
|
|
}; |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
=head2 App::FetchwareX::HTMLPageSync App::Fetchware-like API. |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
my $temp_dir = start(); |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
my $download_url = lookup(); |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
download($temp_dir, $download_url); |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
verify($download_url, $package_path); |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
unarchive($package_path); |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
build($build_path); |
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
install(); |
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
uninstall($build_path); |
|
773
|
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=head1 MOTIVATION |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
I want to automatically parse a Web page with links to wall papers that I want |
|
777
|
|
|
|
|
|
|
to download. Only I want software to do it for me. That's where this |
|
778
|
|
|
|
|
|
|
App::Fetchware extension comes in. |
|
779
|
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync is an example App::Fetchware extension. It's not |
|
783
|
|
|
|
|
|
|
a large extension, but instead is a simple one meant to show how easy it is |
|
784
|
|
|
|
|
|
|
to extend App::Fetchware. |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync parses the Web page you specify to create a list of |
|
787
|
|
|
|
|
|
|
download links. Then it downloads those links, and installs them to your |
|
788
|
|
|
|
|
|
|
C<destination_directory>. |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
In order to use App::FetchwareX::HTMLPageSync to help you mirror the download |
|
791
|
|
|
|
|
|
|
links on a HTML page you need to create a App::FetchwareX::HTMLPageSync |
|
792
|
|
|
|
|
|
|
Fetchwarefile. You can do this easily by just running C<fetchware new>, and |
|
793
|
|
|
|
|
|
|
typing in C<HTMLPageSync> when it asks you what extension of Fetchwarefile you |
|
794
|
|
|
|
|
|
|
want to create. |
|
795
|
|
|
|
|
|
|
L |
|
796
|
|
|
|
|
|
|
Then you'll need to |
|
797
|
|
|
|
|
|
|
L |
|
798
|
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
=head1 App::FetchwareX::HTMLPageSync API SUBROUTINES |
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
This is App::FetchwareX::HTMLPageSync's API that fetchware uses to execute any |
|
802
|
|
|
|
|
|
|
Fetchwarefile's that make use of App::FetchwareX::HTMLPageSync. This API is the |
|
803
|
|
|
|
|
|
|
same that regular old App::Fetchware uses for most standard FOSS software, and |
|
804
|
|
|
|
|
|
|
this internal documentation is only needed when debugging HTMLPageSync's code or |
|
805
|
|
|
|
|
|
|
when studying it to create your own fetchware extension. |
|
806
|
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
=head2 new() |
|
808
|
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
my ($program_name, $fetchwarefile) = new($term, $program_name); |
|
810
|
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
# Or in an extension, you can return whatever list of variables you want, |
|
812
|
|
|
|
|
|
|
# and then cmd_new() will provide them as arguments to new_install() except |
|
813
|
|
|
|
|
|
|
# a $term Term::ReadLine object will precede the others. |
|
814
|
|
|
|
|
|
|
my ($term, $program_name, $fetchwarefile, $custom_argument1, $custom_argument2) |
|
815
|
|
|
|
|
|
|
= new($term, $program_name); |
|
816
|
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
new() is App::Fetchware's API subroutine that implements fetchware's new |
|
818
|
|
|
|
|
|
|
command. It simply uses Term::UI to ask the user some questions that determine |
|
819
|
|
|
|
|
|
|
what configuration options will be added to the generated Fetchwarefile. new() |
|
820
|
|
|
|
|
|
|
takes a $term, Term::UI/Term::Readline object, and the optional name of the |
|
821
|
|
|
|
|
|
|
program or Website in this case that HTMLPageSync is page syncing. |
|
822
|
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
Whatever scalars (not references just regular strings) that new() returns will |
|
824
|
|
|
|
|
|
|
be shared with new()'s sister API subroutine new_install() that is called after |
|
825
|
|
|
|
|
|
|
new() is called by cmd_install(), which implements fetchware's new command. |
|
826
|
|
|
|
|
|
|
new_install() is called in the parent process, so it does have root permissions, |
|
827
|
|
|
|
|
|
|
so be sure to test it as root as well. |
|
828
|
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
=over |
|
830
|
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=item drop_privs() NOTES |
|
832
|
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
This section notes whatever problems you might come across implementing and |
|
834
|
|
|
|
|
|
|
debugging your Fetchware extension due to fetchware's drop_privs mechanism. |
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
See L. |
|
837
|
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
=over |
|
839
|
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=item * |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
This subroutine is B<not> run as root; instead, it is run as a regular user |
|
843
|
|
|
|
|
|
|
unless the C<stay_root> configuration option has been set to true. |
|
844
|
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
=back |
|
846
|
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
=back |
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=head3 get_html_page_url() |
|
850
|
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
my $html_page_url = get_html_page_url($term); |
|
852
|
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
Uses $term argument as a L<Term::ReadLine>/L<Term::UI> object to interactively |
|
854
|
|
|
|
|
|
|
explain what a C<html_page_url> is, and to ask the user to provide one and press |
|
855
|
|
|
|
|
|
|
enter. |
|
856
|
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
=head3 get_destination_directory() |
|
858
|
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
my $destination_directory = get_destination_directory($term); |
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
Uses $term argument as a L<Term::ReadLine>/L<Term::UI> object to interactively |
|
862
|
|
|
|
|
|
|
explain what a C<destination_directory> is, and to ask the user to provide one |
|
863
|
|
|
|
|
|
|
and press enter. |
|
864
|
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head3 ask_about_keep_destination_directory() |
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
ask_about_keep_destination_directory($term, $fetchwarefile); |
|
868
|
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
ask_about_keep_destination_directory() does just that it asks the user if they |
|
870
|
|
|
|
|
|
|
would like to enable the C<keep_destination_directory> configuration option to |
|
871
|
|
|
|
|
|
|
preserve their C<destination_directory> when they uninstall the associated |
|
872
|
|
|
|
|
|
|
Fetchware package or Fetchwarefile. If they answer Y, |
|
873
|
|
|
|
|
|
|
C<keep_destination_directory> is added to their Fetchwarefile, and if not |
|
874
|
|
|
|
|
|
|
nothing is added, because deleting their C<destination_directory> is the |
|
875
|
|
|
|
|
|
|
default that will happen even if the C<keep_destination_directory> is not even |
|
876
|
|
|
|
|
|
|
in the Fetchwarefile. |
|
877
|
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=head2 new_install() |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
my $fetchware_package_path = new_install($page_name, $fetchwarefile); |
|
881
|
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
new_install() asks the user if they would like to install the previously |
|
883
|
|
|
|
|
|
|
generated Fetchwarefile that new() created. If they answer yes, then that |
|
884
|
|
|
|
|
|
|
program associated with that Fetchwarefile is installed. In our case, that means |
|
885
|
|
|
|
|
|
|
that whatever files are configured for download will be downloaded. If they |
|
886
|
|
|
|
|
|
|
answer no, then the path to the generated Fetchwarefile will be printed. |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
new_install() is imported by L from App::Fetchware, |
|
889
|
|
|
|
|
|
|
and also exported by App::FetchwareX::HTMLPageSync. This is how |
|
890
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware. |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
=head2 check_syntax() |
|
893
|
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
'Syntax Ok' = check_syntax() |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=over |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
=item Configuration subroutines used: |
|
899
|
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
=over |
|
901
|
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
=item none |
|
903
|
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
=back |
|
905
|
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
=back |
|
907
|
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
Calls check_config_options() to check for the following syntax errors in |
|
909
|
|
|
|
|
|
|
Fetchwarefiles. Note by the time check_syntax() has been called |
|
910
|
|
|
|
|
|
|
parse_fetchwarefile() has already parsed the Fetchwarefile, and any syntax |
|
911
|
|
|
|
|
|
|
errors in the user's Fetchwarefile will have already been reported by Perl. |
|
912
|
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
This may seem like a bug, but it's not. Do you really want to try to use regexes |
|
914
|
|
|
|
|
|
|
or something to try to parse the Fetchwarefile reliably, and then report errors |
|
915
|
|
|
|
|
|
|
to users? Or add PPI of all insane Perl modules as a dependency just to write |
|
916
|
|
|
|
|
|
|
syntax checking code that most of the time says the syntax is Ok anyway, and |
|
917
|
|
|
|
|
|
|
therefore a complete waste of time and effort? I don't want to deal with any of |
|
918
|
|
|
|
|
|
|
that insanity. |
|
919
|
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
Instead, check_syntax() uses config() to examine the already parsed |
|
921
|
|
|
|
|
|
|
Fetchwarefile for "higher-level" or "Fetchware-level" syntax errors. Syntax |
|
922
|
|
|
|
|
|
|
errors that are B<Fetchware> syntax errors instead of just Perl syntax errors. |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
For your and my own convenience I created the check_config_options() helper |
|
925
|
|
|
|
|
|
|
subroutine. It's data driven, and will check Fetchwarefiles for three different |
|
926
|
|
|
|
|
|
|
types of common syntax errors that occur in App::Fetchware's Fetchwarefile |
|
927
|
|
|
|
|
|
|
syntax. These errors are more at the level of I<logic> than actual syntax |
|
928
|
|
|
|
|
|
|
errors. See its POD below for additional details. |
|
929
|
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
Below briefly lists what App::Fetchware's implementation of check_syntax() |
|
931
|
|
|
|
|
|
|
checks. |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
=over |
|
934
|
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
=item * Mandatory configuration options |
|
936
|
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
=over |
|
938
|
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
=item * page_name, html_page_url, and destination_directory are required for all Fetchwarefiles. |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=back |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=back |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
=over |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=item drop_privs() NOTES |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
This section notes whatever problems you might come across implementing and |
|
950
|
|
|
|
|
|
|
debugging your Fetchware extension due to fetchware's drop_privs mechanism. |
|
951
|
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
See L. |
|
953
|
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
=over |
|
955
|
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
=item * |
|
957
|
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
check_syntax() is run in the parent process before even start() has run, so no |
|
959
|
|
|
|
|
|
|
temporary directory is available for use. |
|
960
|
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
=back |
|
962
|
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
=back |
|
964
|
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
=head2 start() |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
my $temp_dir = start(); |
|
968
|
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
start() creates a temp dir, chmod 700's it, and chdir()'s to it just like the one |
|
970
|
|
|
|
|
|
|
in App::Fetchware does. App::FetchwareX::HTMLPageSync |
|
971
|
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
start() is imported use L from App::Fetchware, |
|
973
|
|
|
|
|
|
|
and also exported by App::FetchwareX::HTMLPageSync. This is how |
|
974
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware. |
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
=head2 lookup() |
|
977
|
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
my $download_url = lookup(); |
|
979
|
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
lookup() downloads the user specified C<html_page_url>, parses it using |
|
981
|
|
|
|
|
|
|
HTML::TreeBuilder, and uses C<html_treebuilder_callback> and |
|
982
|
|
|
|
|
|
|
C<download_links_callback> if specified to manipulate the tree to determine what |
|
983
|
|
|
|
|
|
|
download urls the user wants. |
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
This list of download urls is returned as an array reference, $download_url. |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
=head2 download() |
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
download($temp_dir, $download_url); |
|
990
|
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
download() uses App::Fetchware's utility function download_http_url() to |
|
992
|
|
|
|
|
|
|
download all of the urls that lookup() returned. If the user specified a |
|
993
|
|
|
|
|
|
|
C<user_agent> configuration option, then that option is passed along to |
|
994
|
|
|
|
|
|
|
download_http_url()'s call to HTTP::Tiny. |
|
995
|
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
=head2 verify() |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
verify($download_url, $package_path); |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
verify() simply calls App::Fetchware's :UTIL subroutine do_nothing(), which as |
|
1001
|
|
|
|
|
|
|
you can tell from its name does nothing, but return. The reason for the useless |
|
1002
|
|
|
|
|
|
|
do_nothing() call is simply for better documentation, and standardizing how to |
|
1003
|
|
|
|
|
|
|
override a App::Fetchware API subroutine in order for it to do nothing at all, |
|
1004
|
|
|
|
|
|
|
so that you can prevent the original App::Fetchware subroutine from doing what |
|
1005
|
|
|
|
|
|
|
it normally does. |
|
1006
|
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=head2 unarchive() |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
unarchive(); |
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
unarchive() does nothing by calling App::Fetchware's :UTIL subroutine |
|
1012
|
|
|
|
|
|
|
do_nothing(), which does nothing. |
|
1013
|
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
=head2 build() |
|
1015
|
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
build($build_path); |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
build() does the same thing as verify(), and that is nothing by calling |
|
1019
|
|
|
|
|
|
|
App::Fetchware's do_nothing() subroutine to better document the fact |
|
1020
|
|
|
|
|
|
|
that it does nothing. |
|
1021
|
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
=head2 install() |
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
install($package_path); |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
install() takes the $package_path, which is really an array ref of the paths |
|
1027
|
|
|
|
|
|
|
of the files that download() copied, and copies them to the user specified |
|
1028
|
|
|
|
|
|
|
destination directory, C<destination_directory>. |
|
1029
|
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
=head2 end() |
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
end(); |
|
1033
|
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
end() chdir()s back to the original directory, and cleans up the temp directory |
|
1035
|
|
|
|
|
|
|
just like the one in App::Fetchware does. App::FetchwareX::HTMLPageSync |
|
1036
|
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
end() is imported use L from App::Fetchware, |
|
1038
|
|
|
|
|
|
|
and also exported by App::FetchwareX::HTMLPageSync. This is how |
|
1039
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync "subclasses" App::Fetchware. |
|
1040
|
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
=head2 uninstall() |
|
1042
|
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
uninstall($build_path); |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Uninstalls App::FetchwareX::HTMLPageSync by recursively deleting the |
|
1046
|
|
|
|
|
|
|
C<destination_directory> where it stores the wallpapers or whatever you |
|
1047
|
|
|
|
|
|
|
specified it to download for you. If you would like to keep your |
|
1048
|
|
|
|
|
|
|
C<destination_directory>, then set the C<keep_destination_directory> to true in |
|
1049
|
|
|
|
|
|
|
your Fetchwarefile, and Fetchware will I<not> delete your |
|
1050
|
|
|
|
|
|
|
C<destination_directory>, when you uninstall your Fetchware package. |
|
1051
|
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
=head2 upgrade() |
|
1053
|
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
my $upgrade = upgrade($download_path, $fetchware_package_path) |
|
1055
|
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
if ($upgrade) { |
|
1057
|
|
|
|
|
|
|
... |
|
1058
|
|
|
|
|
|
|
} |
|
1059
|
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
=over |
|
1061
|
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
=item Configuration subroutines used: |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
=over |
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=item none |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=back |
|
1069
|
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
=back |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
Uses $download_path, an arrayref of URLs to download in HTMLPageSync, and |
|
1073
|
|
|
|
|
|
|
compares it against the list of files that has already been downloaded by |
|
1074
|
|
|
|
|
|
|
glob()ing C<destination_directory>, and then comparing the file names of the |
|
1075
|
|
|
|
|
|
|
specified files. |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
Returns true if $download_path has any URLs that have not already been |
|
1078
|
|
|
|
|
|
|
downloaded into C<destination_directory>. Note: HEAD HTTP queries are B<not> |
|
1079
|
|
|
|
|
|
|
used to check if any already downloaded files are I<newer> than the files in |
|
1080
|
|
|
|
|
|
|
the C<destination_directory>. |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
Returns false if $download_path is the same as C<destination_directory>. |
|
1083
|
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
=over |
|
1085
|
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
=item drop_privs() NOTES |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
This section notes whatever problems you might come across implementing and |
|
1089
|
|
|
|
|
|
|
debugging your Fetchware extension due to fetchware's drop_privs mechanism. |
|
1090
|
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
See L. |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
=over |
|
1094
|
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
=item * |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
upgrade() is run in the B<child> process as nobody or C<user>, because the child |
|
1098
|
|
|
|
|
|
|
needs to know if it should actually bother running the rest of fetchware's API |
|
1099
|
|
|
|
|
|
|
subroutines. |
|
1100
|
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
=back |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=back |
|
1104
|
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
=head1 MANUALLY CREATING A App::FetchwareX::HTMLPageSync FETCHWAREFILE |
|
1106
|
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
In order to use App::FetchwareX::HTMLPageSync you must first create a |
|
1108
|
|
|
|
|
|
|
Fetchwarefile to use it. You can use C<fetchware new> as explained above, or |
|
1109
|
|
|
|
|
|
|
create one manually in your text editor. |
|
1110
|
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
=over |
|
1112
|
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=item B<1. Name it> |
|
1114
|
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
Use your text editor to create a file with a C<.Fetchwarefile> file extension. |
|
1116
|
|
|
|
|
|
|
Use of this convention is not required, but it makes it obvious what type of |
|
1117
|
|
|
|
|
|
|
file it is. Then, just copy and paste the example text below, and replace |
|
1118
|
|
|
|
|
|
|
C<[page_name]> with what you choose your C<page_name> to be. C<page_name> is |
|
1119
|
|
|
|
|
|
|
simply a configuration option that simply names your Fetchwarefile. It is not |
|
1120
|
|
|
|
|
|
|
actually used for anything other than to name your Fetchwarefile to document |
|
1121
|
|
|
|
|
|
|
what program or behavior this Fetchwarefile manages. |
|
1122
|
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
use App::FetchwareX::HTMLPageSync; |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
page_name '[page_name]'; |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
Fetchwarefiles are actually small, well structured, Perl programs that can |
|
1128
|
|
|
|
|
|
|
contain arbitrary perl code to customize fetchware's behavior, or, in most |
|
1129
|
|
|
|
|
|
|
cases, simply specify a number of fetchware or a fetchware extension's (as in |
|
1130
|
|
|
|
|
|
|
this case) configuration options. Below is my filled in example |
|
1131
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync fetchwarefile. |
|
1132
|
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
use App::FetchwareX::HTMLPageSync; |
|
1134
|
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
page_name 'Cool Wallpapers'; |
|
1136
|
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
Notice the C<use App::FetchwareX::HTMLPageSync;> line at the top. That line is |
|
1138
|
|
|
|
|
|
|
absolutely critical for this Fetchwarefile to work properly, because it is what |
|
1139
|
|
|
|
|
|
|
allows fetchware to use Perl's own syntax as a nice easy to use syntax for |
|
1140
|
|
|
|
|
|
|
Fetchwarefiles. If you do not use the matching C<use App::FetchwareX::HTMLPageSync;> line, |
|
1141
|
|
|
|
|
|
|
then fetchware will spit out crazy errors from Perl's own compiler listing all |
|
1142
|
|
|
|
|
|
|
of the syntax errors you have. If you ever receive that error, just ensure you |
|
1143
|
|
|
|
|
|
|
have the correct C<use App::FetchwareX::HTMLPageSync;> line at the top of your |
|
1144
|
|
|
|
|
|
|
Fetchwarefile. |
|
1145
|
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
=item B<2. Determine your html_page_url> |
|
1147
|
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
At the heart of App::FetchwareX::HTMLPageSync is its C<html_page_url>, which is |
|
1149
|
|
|
|
|
|
|
the URL to the HTML page you want HTMLPageSync to download and parse out links |
|
1150
|
|
|
|
|
|
|
to wallpaper or whatever else you'd like to automate downloading. To figure this |
|
1151
|
|
|
|
|
|
|
out just use your browser to find the HTML page you want to use, and then copy |
|
1152
|
|
|
|
|
|
|
and paste the url between the single quotes C<'> as shown in the example below. |
|
1153
|
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
html_page_url ''; |
|
1155
|
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
And then after you copy the url. |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
html_page_url 'http://some.url/something.html'; |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
=item B<3. Determine your destination_directory> |
|
1161
|
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
HTMLPageSync also needs to know your C<destination_directory>. This is the |
|
1163
|
|
|
|
|
|
|
directory that HTMLPageSync will copy your downloaded files to. This directory |
|
1164
|
|
|
|
|
|
|
will also be deleted when you uninstall this HTMLPageSync fetchware package just |
|
1165
|
|
|
|
|
|
|
like a standard App::Fetchware package would uninstall any installed software |
|
1166
|
|
|
|
|
|
|
when it is uninstalled. Just copy and paste the example below, and fill in the |
|
1167
|
|
|
|
|
|
|
space between the single quotes C<'>. |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
destination_directory ''; |
|
1170
|
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
After pasting it should look like. |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
destination_directory '~/wallpapers'; |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
Furthermore, if you want to keep your C<destination_directory> after you |
|
1176
|
|
|
|
|
|
|
uninstall your HTMLPageSync fetchware package, just set the |
|
1177
|
|
|
|
|
|
|
C<keep_destination_directory> configuration option to true: |
|
1178
|
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
keep_destination_directory 'True'; |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
If this is set in your HTMLPageSync Fetchwarefile, HTMLPageSync will not delete |
|
1182
|
|
|
|
|
|
|
your C<destination_directory> when your HTMLPageSync fetchware package is |
|
1183
|
|
|
|
|
|
|
uninstalled. |
|
1184
|
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
=item B<4. Specify other options> |
|
1186
|
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
That's all there is to it unless you need to further customize HTMLPageSync's |
|
1188
|
|
|
|
|
|
|
behavior to get just the links you need to download. |
|
1189
|
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
At this point you can install your new Fetchwarefile with: |
|
1191
|
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
fetchware install [path to your new fetchwarefile] |
|
1193
|
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
Or you can further customize it as shown next. |
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
=item B<5. Specify an optional user_agent> |
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
Many sites don't like bots downloading stuff from them wasting their bandwidth, |
|
1199
|
|
|
|
|
|
|
and will even limit what you can do based on your user agent, which is the HTTP |
|
1200
|
|
|
|
|
|
|
standard's name for your browser. This option allows you to pretend to be |
|
1201
|
|
|
|
|
|
|
something other than HTMLPageSync's underlying library, L<HTTP::Tiny>. Just copy |
|
1202
|
|
|
|
|
|
|
and paste the example below, and paste what you want your user agent to be between |
|
1203
|
|
|
|
|
|
|
the single quotes C<'> as before. |
|
1204
|
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
user_agent ''; |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
And after pasting. |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
user_agent 'Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'; |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
=item B<6. Specify an optional html_treebuilder_callback> |
|
1212
|
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
C<html_treebuilder_callback> specifies an optional anonymous Perl subroutine |
|
1214
|
|
|
|
|
|
|
reference that will replace the default one that HTMLPageSync uses. The default |
|
1215
|
|
|
|
|
|
|
one limits the download to only image format links, which is flexible enough for |
|
1216
|
|
|
|
|
|
|
downloading wallpapers. |
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
If you want to download something different, then paste the example below in |
|
1219
|
|
|
|
|
|
|
your Fetchwarefile. |
|
1220
|
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
html_treebuilder_callback sub { |
|
1222
|
|
|
|
|
|
|
# Get one HTML::Element. |
|
1223
|
|
|
|
|
|
|
my $h = shift; |
|
1224
|
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
# Return true or false to indicate if this HTML::Element should be a |
|
1226
|
|
|
|
|
|
|
# download link. |
|
1227
|
|
|
|
|
|
|
if (something) { |
|
1228
|
|
|
|
|
|
|
return 'True'; |
|
1229
|
|
|
|
|
|
|
} else { |
|
1230
|
|
|
|
|
|
|
return undef; |
|
1231
|
|
|
|
|
|
|
} |
|
1232
|
|
|
|
|
|
|
}; |
|
1233
|
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
And create a Perl anonymous subroutine C<coderef> that will |
|
1235
|
|
|
|
|
|
|
be executed instead of the default one. This requires knowledge of the Perl |
|
1236
|
|
|
|
|
|
|
programming language. The one below limits itself to only pdfs and MS word |
|
1237
|
|
|
|
|
|
|
documents. |
|
1238
|
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
# Download pdfs and word documents only. |
|
1240
|
|
|
|
|
|
|
html_treebuilder_callback sub { |
|
1241
|
|
|
|
|
|
|
my $tag = shift; |
|
1242
|
|
|
|
|
|
|
my $link = $tag->attr('href'); |
|
1243
|
|
|
|
|
|
|
if (defined $link) { |
|
1244
|
|
|
|
|
|
|
# If the anchor tag links to a pdf or word document... |
|
1245
|
|
|
|
|
|
|
if ($link =~ /\.(pdf|doc|docx)$/) { |
|
1246
|
|
|
|
|
|
|
# ...return true... |
|
1247
|
|
|
|
|
|
|
return 'True'; |
|
1248
|
|
|
|
|
|
|
} else { |
|
1249
|
|
|
|
|
|
|
# ...if not return false. |
|
1250
|
|
|
|
|
|
|
return undef; #false |
|
1251
|
|
|
|
|
|
|
} |
|
1252
|
|
|
|
|
|
|
} |
|
1253
|
|
|
|
|
|
|
}; |
|
1254
|
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
=item B<7. Specify an optional download_links_callback> |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
C<download_links_callback> specifies an optional anonymous Perl subroutine |
|
1258
|
|
|
|
|
|
|
reference that will replace the default one that HTMLPageSync uses. The default |
|
1259
|
|
|
|
|
|
|
one removes the HTML::Element skin each download link is wrapped in, because of |
|
1260
|
|
|
|
|
|
|
the use of L<HTML::TreeBuilder>. This simply strips off the object-oriented crap |
|
1261
|
|
|
|
|
|
|
it's wrapped in, and turns it into a simple string scalar. |
|
1262
|
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
If you want to post process the download link in some other way, then just copy |
|
1264
|
|
|
|
|
|
|
and paste the code below into your Fetchwarefile, and add whatever other Perl |
|
1265
|
|
|
|
|
|
|
code you may need. This requires knowledge of the Perl programming language. |
|
1266
|
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
download_links_callback sub { |
|
1268
|
|
|
|
|
|
|
my @download_urls = @_; |
|
1269
|
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
my @wanted_download_urls; |
|
1271
|
|
|
|
|
|
|
for my $link (@download_urls) { |
|
1272
|
|
|
|
|
|
|
# Pick ones to keep. |
|
1273
|
|
|
|
|
|
|
push @wanted_download_urls, $link; |
|
1274
|
|
|
|
|
|
|
} |
|
1275
|
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
return @wanted_download_urls; |
|
1277
|
|
|
|
|
|
|
}; |
|
1278
|
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
=back |
|
1280
|
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
=head1 USING YOUR App::FetchwareX::HTMLPageSync FETCHWAREFILE WITH FETCHWARE |
|
1282
|
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
After you have |
|
1284
|
|
|
|
|
|
|
created your Fetchwarefile |
|
1285
|
|
|
|
|
|
|
as shown above you need to actually use the fetchware command line program to |
|
1286
|
|
|
|
|
|
|
install, upgrade, and uninstall your App::FetchwareX::HTMLPageSync Fetchwarefile. |
|
1287
|
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
Take note how fetchware's package management metaphor does not quite line up |
|
1289
|
|
|
|
|
|
|
with what App::FetchwareX::HTMLPageSync does. Why would a HTML page mirroring |
|
1290
|
|
|
|
|
|
|
script be installed, upgraded, or uninstalled? Well HTMLPageSync simply adapts |
|
1291
|
|
|
|
|
|
|
fetchware's package management metaphor to its own environment performing the |
|
1292
|
|
|
|
|
|
|
likely action for when one of fetchware's behaviors is executed. |
|
1293
|
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
=over |
|
1295
|
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
=item B<fetchware new> |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
A C<fetchware new> will cause HTMLPageSync to ask the user a bunch of questions, |
|
1299
|
|
|
|
|
|
|
and help them create a new HTMLPageSync Fetchwarefile. |
|
1300
|
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
=item B<fetchware install> |
|
1302
|
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
A C<fetchware install> while using a HTMLPageSync Fetchwarefile causes fetchware |
|
1304
|
|
|
|
|
|
|
to download your C<html_page_url>, parse it, download any matching links, and |
|
1305
|
|
|
|
|
|
|
then copy them to your C<destination_directory> as you specify in your |
|
1306
|
|
|
|
|
|
|
Fetchwarefile. |
|
1307
|
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
=item B<fetchware upgrade> |
|
1309
|
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
A C<fetchware upgrade> will redownload the C<html_page_url>, parse it, and |
|
1311
|
|
|
|
|
|
|
compare the corresponding list of files to the list of files already downloaded, |
|
1312
|
|
|
|
|
|
|
and if any new files have been added, then they will be downloaded. New versions |
|
1313
|
|
|
|
|
|
|
of existing files are not supported. No timestamp checking is implemented |
|
1314
|
|
|
|
|
|
|
currently. |
|
1315
|
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
=item B<fetchware uninstall> |
|
1317
|
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
A C<fetchware uninstall> will cause fetchware to delete this fetchware package |
|
1319
|
|
|
|
|
|
|
from its database as well as recursively deleting everything inside your |
|
1320
|
|
|
|
|
|
|
C<destination_directory> as well as that directory itself. So when you uninstall |
|
1321
|
|
|
|
|
|
|
a HTMLPageSync fetchware package ensure that you really want to, because it will |
|
1322
|
|
|
|
|
|
|
delete whatever files it downloaded for you in the first place. |
|
1323
|
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
However, if you would like fetchware to preserve your C<destination_directory>, |
|
1325
|
|
|
|
|
|
|
you can set the boolean C<keep_destination_directory> configuration option to |
|
1326
|
|
|
|
|
|
|
true, like C<keep_destination_directory 'True';>, to keep HTMLPageSync from |
|
1327
|
|
|
|
|
|
|
deleting your destination directory. |
|
1328
|
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
=back |
|
1330
|
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
=head1 HOW App::FetchwareX::HTMLPageSync OVERRIDES App::Fetchware |
|
1332
|
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
This section documents how App::FetchwareX::HTMLPageSync overrides |
|
1334
|
|
|
|
|
|
|
App::Fetchware's API, and is only interesting if you're debugging |
|
1335
|
|
|
|
|
|
|
App::FetchwareX::HTMLPageSync, or you're writing your own App::Fetchware |
|
1336
|
|
|
|
|
|
|
extension. If not, you don't need to know these details. |
|
1337
|
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
=head2 App::Fetchware API Subroutines |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
=head3 new() |
|
1341
|
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
HTMLPageSync overrides new(), and implements its own Q&A wizard interface |
|
1343
|
|
|
|
|
|
|
helping users create HTMLPageSync Fetchwarefiles. |
|
1344
|
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
=head3 new_install() |
|
1346
|
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
HTMLPageSync just inherits App::Fetchware's new_install(), which just asks the |
|
1348
|
|
|
|
|
|
|
user if they would like Fetchware to install the already generated |
|
1349
|
|
|
|
|
|
|
Fetchwarefile. |
|
1350
|
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
=head3 check_syntax() |
|
1352
|
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
check_syntax() is also overridden to check HTMLPageSync's own Fetchware-level |
|
1354
|
|
|
|
|
|
|
syntax. |
|
1355
|
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
=head3 start() and end() |
|
1357
|
|
|
|
|
|
|
|
|
1358
|
|
|
|
|
|
|
HTMLPageSync just imports start() and end() from App::Fetchware to take |
|
1359
|
|
|
|
|
|
|
advantage of their ability to manage a temporary directory. |
|
1360
|
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
=head3 lookup() |
|
1362
|
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
lookup() is overridden, and downloads the C<html_page_url>, which is the main |
|
1364
|
|
|
|
|
|
|
configuration option that HTMLPageSync uses. Then lookup() parses that |
|
1365
|
|
|
|
|
|
|
C<html_page_url>, and determines what the download urls should be. If the |
|
1366
|
|
|
|
|
|
|
C<html_treebuilder_callback> and C<download_links_callback> exist, then they are |
|
1367
|
|
|
|
|
|
|
called to customize lookup()'s default behavior. See their descriptions below. |
|
1368
|
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
=head3 download() |
|
1370
|
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
download() downloads the array ref of download links that lookup() returns. |
|
1372
|
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=head3 verify() |
|
1374
|
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
verify() is overridden to do nothing. |
|
1376
|
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
=head3 unarchive() |
|
1378
|
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
unarchive() is overridden to do nothing. |
|
1380
|
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
=head3 build() |
|
1382
|
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
build() is overridden to do nothing. |
|
1384
|
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
=head3 install() |
|
1386
|
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
install() takes its argument, which is an arrayref of the paths of the |
|
1388
|
|
|
|
|
|
|
files that were downloaded to the tempdir created by start(), and copies them to |
|
1389
|
|
|
|
|
|
|
the user's provided C<destination_directory>. |
|
1390
|
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
=head3 end() and start() |
|
1392
|
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
HTMLPageSync just imports end() and start() from App::Fetchware to take |
|
1394
|
|
|
|
|
|
|
advantage of their ability to manage a temporary directory. |
|
1395
|
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
=head3 uninstall() |
|
1397
|
|
|
|
|
|
|
|
|
1398
|
|
|
|
|
|
|
uninstall() recursively deletes your C<destination_directory> where it stores |
|
1399
|
|
|
|
|
|
|
whatever links you choose to download unless of course the |
|
1400
|
|
|
|
|
|
|
C<keep_destination_directory> configuration option is set to true. |
|
1401
|
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
=head3 upgrade() |
|
1403
|
|
|
|
|
|
|
|
|
1404
|
|
|
|
|
|
|
Determines if any looked up URLs have not been downloaded yet, and returns true |
|
1405
|
|
|
|
|
|
|
if that is the case. |
|
1406
|
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
=head2 App::FetchwareX::HTMLPageSync's Configuration Subroutines |
|
1408
|
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
Because HTMLPageSync is a App::Fetchware extension, it can not just use the same |
|
1410
|
|
|
|
|
|
|
configuration subroutines that App::Fetchware uses. Instead, it must create its |
|
1411
|
|
|
|
|
|
|
own configuration subroutines with App::Fetchware::CreateConfigOptions. These |
|
1412
|
|
|
|
|
|
|
configuration subroutines are the configuration options that you use in your |
|
1413
|
|
|
|
|
|
|
App::Fetchware or App::Fetchware extension. |
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
=head3 page_name [MANDATORY] |
|
1416
|
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
HTMLPageSync's equivalent to App::Fetchware's C<program>. It's simply the |
|
1418
|
|
|
|
|
|
|
name of the page or what you want to download on that page. |
|
1419
|
|
|
|
|
|
|
|
|
1420
|
|
|
|
|
|
|
=head3 html_page_url [MANDATORY] |
|
1421
|
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
HTMLPageSync's equivalent to App::Fetchware's C<lookup_url>, and is just as |
|
1423
|
|
|
|
|
|
|
mandatory. This is the url of the HTML page that will be downloaded and |
|
1424
|
|
|
|
|
|
|
processed. |
|
1425
|
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
=head3 destination_directory [MANDATORY] |
|
1427
|
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
This option is also mandatory, and it specifies the directory where the files |
|
1429
|
|
|
|
|
|
|
that you want to download are downloaded to. |
|
1430
|
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
=head3 user_agent [OPTIONAL] |
|
1432
|
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
This option is optional, and it allows you to have HTTP::Tiny pretend to be a |
|
1434
|
|
|
|
|
|
|
Web browser or perhaps bot if you want to. |
|
1435
|
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
=head3 html_treebuilder_callback [OPTIONAL] |
|
1437
|
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
This optional option allows you to specify a perl C<coderef> that lookup() will |
|
1439
|
|
|
|
|
|
|
execute instead of its default callback that just looks for images. |
|
1440
|
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
It receives one parameter, which is an HTML::Element at the first C<a>, |
|
1442
|
|
|
|
|
|
|
anchor/link tag. |
|
1443
|
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
It must C<return 'True';> to indicate that that link should be included in the |
|
1445
|
|
|
|
|
|
|
list of download links, or return false, C<return undef;>, to indicate that that |
|
1446
|
|
|
|
|
|
|
link should not be included in the list of download links. |
|
1447
|
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
=head3 download_links_callback [OPTIONAL] |
|
1449
|
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
This optional option specifies an optional callback that will allow you to do |
|
1451
|
|
|
|
|
|
|
post processing of the list of downloaded urls. This is needed, because the |
|
1452
|
|
|
|
|
|
|
results of the C<html_treebuilder_callback> are still HTML::Element objects that |
|
1453
|
|
|
|
|
|
|
need to be converted to just string download urls. That is what the default |
|
1454
|
|
|
|
|
|
|
C<download_links_callback> does. |
|
1455
|
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
It receives a list of all of the download HTML::Elements that |
|
1457
|
|
|
|
|
|
|
C<html_treebuilder_callback> returned true on. It is called only once, and |
|
1458
|
|
|
|
|
|
|
should return a list of string download links for download later by HTTP::Tiny |
|
1459
|
|
|
|
|
|
|
in download(). |
|
1460
|
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
=head3 keep_destination_directory [OPTIONAL] |
|
1462
|
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
This optional option is a boolean true or false configuration option that |
|
1464
|
|
|
|
|
|
|
when true prevents HTMLPageSync from deleting your destination_directory when |
|
1465
|
|
|
|
|
|
|
you run fetchware uninstall. |
|
1466
|
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
Its default is false, so by default HTMLPageSync B<will> delete your files from |
|
1468
|
|
|
|
|
|
|
your C<destination_directory> unless you set this to true. |
|
1469
|
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
=head1 ERRORS |
|
1471
|
|
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
As with the rest of App::Fetchware, App::Fetchware::Config does not return any |
|
1473
|
|
|
|
|
|
|
error codes; instead, all errors are die()'d if it's App::Fetchware::Config's |
|
1474
|
|
|
|
|
|
|
error, or croak()'d if it's the caller's fault. These exceptions are simple |
|
1475
|
|
|
|
|
|
|
strings, and are listed in the L section below. |
|
1476
|
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
=head1 CAVEATS |
|
1478
|
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
Certain features of App::FetchwareX::HTMLPageSync require knowledge of the Perl |
|
1480
|
|
|
|
|
|
|
programming language in order for you to make use of them. However, this is |
|
1481
|
|
|
|
|
|
|
limited to optional callbacks that are not needed for most uses. These features |
|
1482
|
|
|
|
|
|
|
are the C<html_treebuilder_callback> and C<download_links_callback> callbacks. |
|
1483
|
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
=head1 AUTHOR |
|
1485
|
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
David Yingling |
|
1487
|
|
|
|
|
|
|
|
|
1488
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
1489
|
|
|
|
|
|
|
|
|
1490
|
|
|
|
|
|
|
This software is copyright (c) 2016 by David Yingling. |
|
1491
|
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
|
1493
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
|
1494
|
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
=cut |
|
1496
|
|
|
|
|
|
|
|
|
1497
|
|
|
|
|
|
|
__END__ |