line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTTP::GetImages;
|
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
781
|
use vars qw /$EXTENSIONS_RE $EXTENSIONS_BAD $VERSION/;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
92
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
$VERSION=0.343;
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTTP::GetImages - Spider to recover and store images from web pages.
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use HTTP::GetImages;
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
$_ = new HTTP::GetImages (
|
16
|
|
|
|
|
|
|
dir => '.',
|
17
|
|
|
|
|
|
|
todo => ['http://www.google.com/',],
|
18
|
|
|
|
|
|
|
dont => ['http://www.somewhere/ignorethis.html','http://and.this.html'],
|
19
|
|
|
|
|
|
|
chat => 1,
|
20
|
|
|
|
|
|
|
);
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
$_->print_imgs;
|
23
|
|
|
|
|
|
|
$_->print_done;
|
24
|
|
|
|
|
|
|
$_->print_failed;
|
25
|
|
|
|
|
|
|
$_->print_ignored;
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $hash = $_->imgs_as_hash;
|
28
|
|
|
|
|
|
|
foreach (keys %{$hash}){
|
29
|
|
|
|
|
|
|
warn "$_ = ",$hash->{$_},"\n";
|
30
|
|
|
|
|
|
|
}
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
exit;
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
This module allow syou to automate the searching, recovery and local storage
|
37
|
|
|
|
|
|
|
of images from the web, including those linked by anchor (C), mage (C)
|
38
|
|
|
|
|
|
|
and image map (C) elements.
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Supply a URI or list of URIs to process, and C will recurse
|
41
|
|
|
|
|
|
|
over every link it finds, searching for images.
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
By supplying a list of URIs, you can restrict the search to certain webservers
|
44
|
|
|
|
|
|
|
and directories, or exclude it from certain webservers and directories.
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
You can also decide to reject images that are too small or too large.
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
LWP::UserAgent;
|
51
|
|
|
|
|
|
|
HTTP::Request;
|
52
|
|
|
|
|
|
|
HTML::TokeParser;
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=cut
|
55
|
|
|
|
|
|
|
|
56
|
1
|
|
|
1
|
|
852314
|
use LWP::UserAgent;
|
|
1
|
|
|
|
|
1427552
|
|
|
1
|
|
|
|
|
44
|
|
57
|
1
|
|
|
1
|
|
12
|
use HTTP::Request;
|
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
26
|
|
58
|
1
|
|
|
1
|
|
907
|
use HTML::TokeParser;
|
|
1
|
|
|
|
|
13062
|
|
|
1
|
|
|
|
|
34
|
|
59
|
1
|
|
|
1
|
|
57
|
use Carp;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
102
|
|
60
|
1
|
|
|
1
|
|
5
|
use strict;
|
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
28
|
|
61
|
1
|
|
|
1
|
|
6
|
use warnings;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
62
|
1
|
|
|
1
|
|
5
|
no strict 'refs';
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
3101
|
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 PACKAGE GLOBAL VARIABLE
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 $CHAT
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Set to above zero if you'd like a real-time report to C.
|
69
|
|
|
|
|
|
|
Defaults to off.
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=cut
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
my $CHAT;
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Default values to apply to $self->{ext_ok}
|
76
|
|
|
|
|
|
|
$EXTENSIONS_RE = '(jpg|jpeg|bmp|gif|png|xbm|xmp)';
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# Default values for $self->{ext_bad}
|
79
|
|
|
|
|
|
|
$EXTENSIONS_BAD = '(wmv|avi|rm|mpg|asf|ram|asx|mpeg|mp3)';
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=head1 CONSTRUCTOR METHOD new
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Besides the class reference, accepts name=>value pairs:
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=over 4
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item max_attempts
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The maximum attempts the agent should make to access the site. Default is three.
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=item dir
|
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
the path to the directory in which to store images (no trailing oblique necessary);
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=item rename
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Default value is 0, which allows images to be saved with their original names.
|
99
|
|
|
|
|
|
|
If set with a value of 1, images will be given new names based on the time
|
100
|
|
|
|
|
|
|
they were saved at. If set to 2, images will be given filenames according to their
|
101
|
|
|
|
|
|
|
source location.
|
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=item todo
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
one or more URL to process: can be an anonymous array, array reference, or scalar.
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=item dont
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
As C, above, but URLs should be ignored.
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
If one of these is C, then will ignore all B documents
|
112
|
|
|
|
|
|
|
that do not match exactly those in the C array of URLs to process.
|
113
|
|
|
|
|
|
|
If one of these is C, will ignore no documents.
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=item ext_ok
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
A regular expression 'or' list of image extensions to match.
|
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Will be applied at the end of a filename, after a point, and is insensitive to case.
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Defaults to C<(jpg|jpeg|bmp|gif|png|xbm|xmp)>.
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item ext_bad
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
As C (above), but default value is:C<(wmv|avi|rm|mpg|asf|ram|asx|mpeg|mp3)>
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=item match_url
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
The minimum path a URL must contain. This can be a scalar or an array reference.
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=item min_size.
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
The minimum size an image can be if it is to be saved.
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=item max_size
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
The maximum size an image can be if it is to be saved.
|
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=back
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
The object has several private variables, which
|
142
|
|
|
|
|
|
|
you can access for the results when the job is done.
|
143
|
|
|
|
|
|
|
However, do check out the public methods for accessing
|
144
|
|
|
|
|
|
|
these.
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=over 4
|
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=item DONE
|
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
a hash keys of which are the original URLs of the images, value being are the local filenames.
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=item FAILED
|
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
a hash, keys of which are the failed URLs, values being short reasons.
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=cut
|
157
|
|
|
|
|
|
|
|
158
|
0
|
|
|
0
|
0
|
|
sub new { my ($class) = (shift);
|
159
|
0
|
0
|
|
|
|
|
warn "Making new ",__PACKAGE__ if $CHAT;
|
160
|
0
|
0
|
|
|
|
|
unless (defined $class) {
|
161
|
0
|
|
|
|
|
|
carp "Usage: ".__PACKAGE__."->new( {key=>value} )\n";
|
162
|
0
|
|
|
|
|
|
return undef;
|
163
|
|
|
|
|
|
|
}
|
164
|
0
|
|
|
|
|
|
my %args;
|
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Take parameters and place in object slots/set as instance variables
|
167
|
0
|
0
|
|
|
|
|
if (ref $_[0] eq 'HASH'){ %args = %{$_[0]} }
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
|
elsif (not ref $_[0]){ %args = @_ }
|
169
|
|
|
|
|
|
|
else {
|
170
|
0
|
|
|
|
|
|
carp "Usage: $class->new( { key=>values, } )";
|
171
|
0
|
|
|
|
|
|
return undef;
|
172
|
|
|
|
|
|
|
}
|
173
|
0
|
|
|
|
|
|
my $self = bless {}, $class;
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# Slots that have default values:
|
176
|
|
|
|
|
|
|
# $self->{min_size};
|
177
|
|
|
|
|
|
|
# $self->{match_url}
|
178
|
|
|
|
|
|
|
# $self->{dir},
|
179
|
|
|
|
|
|
|
# $todo,= []
|
180
|
0
|
|
|
|
|
|
$self->{dont} = [];
|
181
|
|
|
|
|
|
|
# $MINIMGSIZE
|
182
|
0
|
|
|
|
|
|
$self->{ext_ok} = $EXTENSIONS_RE; # Defualt extensions to use
|
183
|
0
|
|
|
|
|
|
$self->{ext_bad} = $EXTENSIONS_BAD; # Ditto for ignore.
|
184
|
0
|
|
|
|
|
|
$self->{rename} = 0;
|
185
|
0
|
|
|
|
|
|
$self->{max_attempts} = 3;
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Set/overwrite public slots with user's values
|
188
|
0
|
|
|
|
|
|
foreach (keys %args) {
|
189
|
0
|
|
|
|
|
|
$self->{lc $_} = $args{$_};
|
190
|
0
|
0
|
|
|
|
|
warn "$_ -> $self->{$_}\n" if $CHAT;
|
191
|
|
|
|
|
|
|
}
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Catch parameter errors
|
194
|
0
|
0
|
0
|
|
|
|
if (not exists $self->{dir} or not defined $self->{dir}){
|
195
|
0
|
|
|
|
|
|
croak "No 'dir' slot defined";
|
196
|
|
|
|
|
|
|
}
|
197
|
0
|
0
|
|
|
|
|
if (!-d $self->{dir}){
|
198
|
0
|
|
|
|
|
|
croak "The dir to save to <$self->{dir}> could not be found or is not a directory";
|
199
|
|
|
|
|
|
|
}
|
200
|
0
|
0
|
|
|
|
|
if (not exists $self->{todo}){
|
201
|
0
|
|
|
|
|
|
croak "The 'todo' slot is not defined";
|
202
|
|
|
|
|
|
|
}
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# React to user slots
|
205
|
0
|
0
|
0
|
|
|
|
if (exists $self->{chat} and defined $self->{chat}){
|
206
|
0
|
|
|
|
|
|
$CHAT = 1;
|
207
|
0
|
|
|
|
|
|
warn "Chat mode on";
|
208
|
0
|
|
|
|
|
|
} else { undef $CHAT }
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Turn scalars into arrays for later use
|
211
|
0
|
0
|
0
|
|
|
|
if (exists $self->{match_url} and not ref $self->{match_url}){
|
212
|
0
|
|
|
|
|
|
$self->{match_url} = [$self->{match_url}];
|
213
|
|
|
|
|
|
|
}
|
214
|
0
|
0
|
0
|
|
|
|
if (exists $self->{todo} and not ref $self->{todo}){
|
215
|
0
|
|
|
|
|
|
$self->{todo} = [$self->{todo}];
|
216
|
|
|
|
|
|
|
}
|
217
|
0
|
0
|
0
|
|
|
|
if (exists $self->{dont} and not ref $self->{dont}){
|
218
|
0
|
|
|
|
|
|
$self->{dont} = [$self->{dont}];
|
219
|
|
|
|
|
|
|
}
|
220
|
0
|
|
|
|
|
|
@_ = @{$self->{todo}};
|
|
0
|
|
|
|
|
|
|
221
|
0
|
|
|
|
|
|
$self->{todo} = {};
|
222
|
0
|
|
|
|
|
|
foreach (@_){ $self->{todo}->{$_} = 1 }
|
|
0
|
|
|
|
|
|
|
223
|
0
|
0
|
|
|
|
|
if ($self->{dont}){
|
224
|
0
|
|
|
|
|
|
@_ = @{$self->{dont}};
|
|
0
|
|
|
|
|
|
|
225
|
0
|
|
|
|
|
|
$self->{dont} = {};
|
226
|
0
|
|
|
|
|
|
foreach (@_){ $self->{dont}->{$_} = 1 }
|
|
0
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
}
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Slots that are not adjustable by user:
|
230
|
0
|
|
|
|
|
|
$self->{DONE} = {};
|
231
|
0
|
|
|
|
|
|
$self->{FAILED} = {};
|
232
|
|
|
|
|
|
|
|
233
|
0
|
|
|
|
|
|
DOC:
|
234
|
0
|
|
|
|
|
|
while (keys %{$self->{todo}} ){
|
235
|
0
|
|
|
|
|
|
@_ = keys %{$self->{todo}};
|
|
0
|
|
|
|
|
|
|
236
|
0
|
|
|
|
|
|
my $doc_url = shift @_;
|
237
|
0
|
0
|
|
|
|
|
warn "-"x60,"\n" if $CHAT;
|
238
|
0
|
|
|
|
|
|
my ($doc,$p);
|
239
|
|
|
|
|
|
|
# If using match_url feature: ignore doc if not match start of one string
|
240
|
0
|
0
|
|
|
|
|
if (exists $self->{match_url}){
|
241
|
0
|
|
|
|
|
|
foreach (@{$self->{match_url}}){
|
|
0
|
|
|
|
|
|
|
242
|
0
|
0
|
|
|
|
|
if ($doc_url !~ /^$_/){
|
243
|
0
|
0
|
|
|
|
|
warn "URL out of scope: $doc_url $_\n" if $CHAT;
|
244
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
245
|
0
|
|
|
|
|
|
next DOC;
|
246
|
|
|
|
|
|
|
} else {
|
247
|
0
|
0
|
|
|
|
|
warn "URL ok by $_\n" if $CHAT;
|
248
|
|
|
|
|
|
|
}
|
249
|
|
|
|
|
|
|
}
|
250
|
|
|
|
|
|
|
}
|
251
|
|
|
|
|
|
|
|
252
|
0
|
0
|
0
|
|
|
|
if (exists $self->{FAILED}->{$doc_url} or exists $self->{DONE}->{$doc_url}){
|
253
|
0
|
0
|
|
|
|
|
warn "Already done $doc_url.\n" if $CHAT;
|
254
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
255
|
0
|
|
|
|
|
|
next DOC;
|
256
|
|
|
|
|
|
|
}
|
257
|
|
|
|
|
|
|
|
258
|
0
|
0
|
|
|
|
|
if (exists $self->{dont}->{$doc_url}){
|
259
|
0
|
0
|
|
|
|
|
warn "In IGNORE list: $doc_url.\n" if $CHAT;
|
260
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
261
|
0
|
|
|
|
|
|
next DOC;
|
262
|
|
|
|
|
|
|
}
|
263
|
|
|
|
|
|
|
|
264
|
0
|
0
|
0
|
|
|
|
if (exists $self->{dont}->{ALL} and not $self->{todo}->{$doc_url}){
|
265
|
0
|
0
|
|
|
|
|
warn "Not in TODO list: $doc_url.\n" if $CHAT;
|
266
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
267
|
0
|
|
|
|
|
|
next DOC;
|
268
|
|
|
|
|
|
|
}
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# Not in do list, not an image, not run with IGNORE NONE option
|
271
|
0
|
0
|
0
|
|
|
|
if (not exists $self->{todo}->{$doc_url} and $doc_url !~ m|(\.$self->{ext_ok})$|i
|
|
|
|
0
|
|
|
|
|
272
|
|
|
|
|
|
|
and not exists $self->{dont}->{NONE}){
|
273
|
0
|
0
|
|
|
|
|
warn "Not in DO list - ignoring $doc_url .\n" if $CHAT;
|
274
|
0
|
|
|
|
|
|
$self->{dont}->{$doc_url} = "Ignoring";
|
275
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
276
|
0
|
|
|
|
|
|
next DOC;
|
277
|
|
|
|
|
|
|
}
|
278
|
|
|
|
|
|
|
|
279
|
0
|
0
|
|
|
|
|
unless ($doc = $self->get_document($doc_url)){
|
280
|
0
|
0
|
|
|
|
|
warn "Agent could not open $doc_url" if $CHAT;
|
281
|
0
|
|
|
|
|
|
$self->{FAILED}->{$doc_url} = "Agent couldn't open document";
|
282
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
283
|
0
|
|
|
|
|
|
next DOC;
|
284
|
|
|
|
|
|
|
}
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# If an image, save it
|
287
|
0
|
0
|
|
|
|
|
if ($doc_url =~ m|(\.$self->{ext_ok})$|i) {
|
288
|
0
|
|
|
|
|
|
$self->{DONE}->{$doc_url} = $self->_save_img($doc_url,$doc);
|
289
|
0
|
0
|
|
|
|
|
warn "OK: $doc_url" if $CHAT;
|
290
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
291
|
0
|
|
|
|
|
|
next DOC;
|
292
|
|
|
|
|
|
|
} else {
|
293
|
0
|
|
|
|
|
|
$self->{DONE}->{$doc_url} = "Did HTML.";
|
294
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
295
|
|
|
|
|
|
|
}
|
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Otherwise try to parse it
|
298
|
0
|
0
|
|
|
|
|
unless ($p = new HTML::TokeParser( \$doc )){
|
299
|
0
|
0
|
|
|
|
|
warn "* Couldn't create parser from \$doc\n" if $CHAT;
|
300
|
0
|
|
|
|
|
|
$self->{FAILED}->{$doc_url} = "Couldn't create agent parser";
|
301
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
302
|
0
|
|
|
|
|
|
next DOC;
|
303
|
|
|
|
|
|
|
}
|
304
|
0
|
0
|
|
|
|
|
warn "OK - parsing document $doc_url ...\n" if $CHAT;
|
305
|
|
|
|
|
|
|
|
306
|
0
|
|
|
|
|
|
while (my $token = $p->get_token){
|
307
|
|
|
|
|
|
|
|
308
|
0
|
0
|
0
|
|
|
|
if (@$token[1] eq 'img' and exists @$token[2]->{src}){
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
309
|
0
|
0
|
|
|
|
|
warn "*** Found image: @$token[2]->{src}\n" if $CHAT;
|
310
|
0
|
|
|
|
|
|
my $uri = &abs_url( $doc_url, @$token[2]->{src} );
|
311
|
0
|
0
|
0
|
|
|
|
if ($uri and not exists $self->{IGNORE0}->{$uri} and not exists $self->{DONE}->{$uri} and not exists $self->{FAILED}->{$uri}
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
312
|
|
|
|
|
|
|
){
|
313
|
0
|
|
|
|
|
|
$self->{todo}->{$uri} = 1;
|
314
|
|
|
|
|
|
|
} else {
|
315
|
0
|
0
|
|
|
|
|
warn "\t ignoring that img.\n" if $CHAT;
|
316
|
|
|
|
|
|
|
}
|
317
|
|
|
|
|
|
|
}
|
318
|
|
|
|
|
|
|
elsif (@$token[1] =~ /^(area|a)$/ and exists @$token[2]->{href} and @$token[0] eq 'S'){
|
319
|
0
|
0
|
|
|
|
|
warn "*** Found link: @$token[2]->{href}\n" if $CHAT;
|
320
|
0
|
|
|
|
|
|
my $uri = &abs_url( $doc_url, @$token[2]->{href} );
|
321
|
0
|
0
|
0
|
|
|
|
if ($uri and not exists $self->{dont}->{$uri} and not exists $self->{DONE}->{$uri} and not exists $self->{FAILED}->{$uri}
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
322
|
|
|
|
|
|
|
and not (exists $self->{dont}->{ALL} and not exists $self->{todo}->{$uri})
|
323
|
|
|
|
|
|
|
){
|
324
|
0
|
|
|
|
|
|
$self->{todo}->{$uri} = 1;
|
325
|
|
|
|
|
|
|
} else {
|
326
|
0
|
0
|
|
|
|
|
warn "\t ignoring that link.\n" if $CHAT;
|
327
|
|
|
|
|
|
|
}
|
328
|
|
|
|
|
|
|
}
|
329
|
|
|
|
|
|
|
elsif (@$token[1] eq 'frame' and exists(@$token[2]->{src})){ # This block (DL)
|
330
|
0
|
0
|
|
|
|
|
warn "*** Found frame: @$token[2]->{src}\n" if $CHAT;
|
331
|
0
|
|
|
|
|
|
my $uri = &abs_url( $doc_url, @$token[2]->{src} );
|
332
|
0
|
0
|
0
|
|
|
|
if ($uri and not exists $self->{IGNORE0}->{$uri} and not exists $self->{DONE}->{$uri} and not exists $self->{FAILED}->{$uri}
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
333
|
|
|
|
|
|
|
and not (exists $self->{dont}->{ALL} and not exists $self->{todo}->{$uri}) ){
|
334
|
0
|
|
|
|
|
|
$self->{todo}->{$uri} = 1;
|
335
|
|
|
|
|
|
|
} else {
|
336
|
0
|
0
|
|
|
|
|
warn "\t ignoring that frame.\n" if $CHAT;
|
337
|
|
|
|
|
|
|
}
|
338
|
|
|
|
|
|
|
}
|
339
|
|
|
|
|
|
|
} # Next token
|
340
|
0
|
|
|
|
|
|
delete $self->{todo}->{$doc_url};
|
341
|
|
|
|
|
|
|
} # Next DOC
|
342
|
|
|
|
|
|
|
|
343
|
0
|
|
|
|
|
|
return $self;
|
344
|
|
|
|
|
|
|
} # End sub new
|
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
#
|
351
|
|
|
|
|
|
|
# SUB get_document
|
352
|
|
|
|
|
|
|
# Accepts a URL, returns the source of the document at the URL
|
353
|
|
|
|
|
|
|
# or undef on failure
|
354
|
|
|
|
|
|
|
#
|
355
|
0
|
|
|
0
|
0
|
|
sub get_document { my ($self,$url) = (shift,shift); # Recieve as argument the URL to access
|
356
|
0
|
0
|
|
|
|
|
if ($url =~ m|(\.$self->{ext_bad})$|i) { # (DL)
|
357
|
0
|
0
|
|
|
|
|
warn "Ignoring - extension on the 'bad' list" if $CHAT;
|
358
|
0
|
|
|
|
|
|
return undef;
|
359
|
|
|
|
|
|
|
}
|
360
|
0
|
|
|
|
|
|
my ($req,$res);
|
361
|
0
|
|
|
|
|
|
my $ua = LWP::UserAgent->new; # Create a new UserAgent
|
362
|
0
|
|
|
|
|
|
for my $attempt (1..$self->{max_attempts}){
|
363
|
0
|
0
|
0
|
|
|
|
if ($attempt!=1 and $attempt-1 == $self->{max_attempts}){
|
364
|
0
|
|
|
|
|
|
$ua->agent('MSIE Internet Explorer 6.0 (Mozilla compatible'); # Naughty?
|
365
|
|
|
|
|
|
|
} else {
|
366
|
0
|
|
|
|
|
|
$ua->agent('Perl::'.__PACKAGE__.' v'.$VERSION); # Give it a type name
|
367
|
|
|
|
|
|
|
}
|
368
|
0
|
0
|
|
|
|
|
warn "Attempt ($attempt) to access <$url>...\n" if $CHAT;
|
369
|
0
|
|
|
|
|
|
$req = new HTTP::Request('GET', $url); # Format URL request
|
370
|
0
|
0
|
|
|
|
|
next if not defined $req;
|
371
|
0
|
|
|
|
|
|
$res = $ua->request($req); # $res is the object UA returned
|
372
|
0
|
0
|
|
|
|
|
last if $res->is_success(); # If not successful
|
373
|
|
|
|
|
|
|
}
|
374
|
0
|
0
|
|
|
|
|
if (not defined $req){
|
375
|
0
|
0
|
|
|
|
|
warn "...could not GET.\n" if $CHAT;
|
376
|
0
|
|
|
|
|
|
return undef;
|
377
|
|
|
|
|
|
|
}
|
378
|
0
|
0
|
|
|
|
|
if (not $res->is_success()) { # If not successful
|
379
|
0
|
0
|
|
|
|
|
warn"...failed.\n" if $CHAT;
|
380
|
|
|
|
|
|
|
return undef
|
381
|
0
|
|
|
|
|
|
}
|
382
|
|
|
|
|
|
|
|
383
|
0
|
0
|
|
|
|
|
warn "...ok.\n" if $CHAT;
|
384
|
|
|
|
|
|
|
# Test size
|
385
|
0
|
0
|
0
|
|
|
|
if ((exists $self->{max_size} or exists $self->{min_size})
|
|
|
|
0
|
|
|
|
|
386
|
|
|
|
|
|
|
and $url =~ m|(\.$self->{ext_ok})$|i) {
|
387
|
0
|
|
|
|
|
|
$_ = length ($res->content);
|
388
|
0
|
0
|
0
|
|
|
|
if (defined $_ and $self->{min_size} and $_ < $self->{min_size}){
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
389
|
0
|
0
|
|
|
|
|
warn "Image size too small, ignoring.\n" if $CHAT;
|
390
|
0
|
|
|
|
|
|
$self->{dont}->{$url} = "Size $_ bytes is too small.";
|
391
|
0
|
|
|
|
|
|
return undef;
|
392
|
|
|
|
|
|
|
}
|
393
|
|
|
|
|
|
|
elsif (defined $_ and $self->{max_size} and $_ > $self->{max_size}){
|
394
|
0
|
0
|
|
|
|
|
warn "Image size too large, ignoring.\n" if $CHAT;
|
395
|
0
|
|
|
|
|
|
$self->{dont}->{$url} = "Size $_ bytes is too large.";
|
396
|
0
|
|
|
|
|
|
return undef;
|
397
|
|
|
|
|
|
|
}
|
398
|
|
|
|
|
|
|
}
|
399
|
0
|
|
|
|
|
|
return $res->content; # $res->content is the HTML the UA returned from the URL
|
400
|
|
|
|
|
|
|
}
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# PRIVATE METHOD _save_img
|
405
|
|
|
|
|
|
|
#
|
406
|
|
|
|
|
|
|
# Accepts and the actual image source.
|
407
|
|
|
|
|
|
|
# Won't store same image twice.
|
408
|
|
|
|
|
|
|
#
|
409
|
|
|
|
|
|
|
# Returns the path the image was saved at.
|
410
|
|
|
|
|
|
|
|
411
|
0
|
|
|
0
|
|
|
sub _save_img { my ($self,$url,$img) = (shift,shift,shift,shift);
|
412
|
0
|
|
|
|
|
|
local *OUT;
|
413
|
0
|
|
|
|
|
|
my $filename;
|
414
|
|
|
|
|
|
|
# Remvoe any file path from the $url
|
415
|
0
|
0
|
0
|
|
|
|
if (exists $self->{DONE}->{$url} or exists $self->{FAILED}->{$url}){
|
416
|
0
|
0
|
|
|
|
|
warn "Already got this one ($url), not saving.\n" if $CHAT;
|
417
|
0
|
|
|
|
|
|
return undef;
|
418
|
|
|
|
|
|
|
}
|
419
|
0
|
|
|
|
|
|
$url =~ m|/([^./]+)(\.$self->{ext_ok})$|i;
|
420
|
0
|
0
|
|
|
|
|
if ($self->{rename}){
|
|
|
0
|
|
|
|
|
|
421
|
0
|
|
|
|
|
|
$filename = $self->{dir}.'/'.(join'',localtime).$2;
|
422
|
|
|
|
|
|
|
} elsif ($self->{rename} == 2){ # )
|
423
|
0
|
|
|
|
|
|
$filename = $url; # } DL
|
424
|
0
|
|
|
|
|
|
$filename =~ s/\/|\:|\~|\?/_/g; # )
|
425
|
0
|
|
|
|
|
|
$filename = $self->{dir}.'\\'.$filename; # )
|
426
|
|
|
|
|
|
|
} else {
|
427
|
0
|
|
|
|
|
|
$filename = "$self->{dir}/$1$2";
|
428
|
|
|
|
|
|
|
}
|
429
|
0
|
0
|
|
|
|
|
warn "Saving image as <$filename>...\n" if $CHAT;
|
430
|
0
|
0
|
0
|
|
|
|
open OUT,">$filename" or warn "Couldn't open to save <$filename>!" and return "Failed to save.";
|
431
|
0
|
|
|
|
|
|
binmode OUT;
|
432
|
0
|
|
|
|
|
|
print OUT $img;
|
433
|
0
|
|
|
|
|
|
close OUT;
|
434
|
0
|
0
|
|
|
|
|
warn "...ok.\n" if $CHAT;
|
435
|
0
|
|
|
|
|
|
return $filename;
|
436
|
|
|
|
|
|
|
}
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
#
|
440
|
|
|
|
|
|
|
# SUB abs_url returns an absolute URL for a $child_url linked from $parent_url
|
441
|
|
|
|
|
|
|
#
|
442
|
|
|
|
|
|
|
# DOC http://www.netverifier.com/pin/nicolette/jezfuzchr001.html
|
443
|
|
|
|
|
|
|
# SRC /pin/nicolette/jezfuzchr001.jpg
|
444
|
|
|
|
|
|
|
#
|
445
|
0
|
|
|
0
|
1
|
|
sub abs_url { my ($parent_url,$child_url) = (shift,shift);
|
446
|
0
|
0
|
|
|
|
|
if ($child_url =~/^#/){
|
447
|
0
|
|
|
|
|
|
return undef;
|
448
|
|
|
|
|
|
|
}
|
449
|
0
|
|
|
|
|
|
my $hack;
|
450
|
0
|
0
|
|
|
|
|
if ($child_url =~ m|^/|) {
|
451
|
0
|
|
|
|
|
|
$parent_url =~ s|^(http://[\w.]+)?/.*$|$1|i;
|
452
|
0
|
|
|
|
|
|
return $parent_url.$child_url;
|
453
|
|
|
|
|
|
|
}
|
454
|
0
|
0
|
|
|
|
|
if ($child_url =~ m|^\.\.\/|i){
|
|
|
0
|
|
|
|
|
|
455
|
0
|
|
|
|
|
|
$parent_url =~ s/\/[^\/|^~]+$//; # Strip filename (fix: DL)
|
456
|
0
|
0
|
|
|
|
|
if ($parent_url =~ /\/$/){$parent_url =~ s/\/$//;} # (DL)
|
|
0
|
|
|
|
|
|
|
457
|
0
|
0
|
|
|
|
|
if ($child_url =~ /^\.\//){$child_url =~ s/^\.\///;} # (DL)
|
|
0
|
|
|
|
|
|
|
458
|
0
|
|
|
|
|
|
while ($child_url=~s/^\.\.\///gs ){
|
459
|
0
|
|
|
|
|
|
$parent_url =~s/[^\/]+\/?$//;
|
460
|
|
|
|
|
|
|
}
|
461
|
0
|
|
|
|
|
|
$child_url = $parent_url.$child_url;
|
462
|
|
|
|
|
|
|
} elsif ($child_url !~ m/^http:\/\//i){
|
463
|
|
|
|
|
|
|
# Assume relative path needs dir
|
464
|
0
|
|
|
|
|
|
$parent_url =~ s/\/[^\/]+$//; # Strip filename
|
465
|
0
|
0
|
|
|
|
|
if ($parent_url =~ /\/$/){ chop $parent_url }
|
|
0
|
|
|
|
|
|
|
466
|
0
|
|
|
|
|
|
$child_url = $parent_url .'/'.$child_url;
|
467
|
|
|
|
|
|
|
}
|
468
|
0
|
|
|
|
|
|
return $child_url;
|
469
|
|
|
|
|
|
|
}
|
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=head2 METHOD print_imgs
|
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
Print a list of the images saved.
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
=cut
|
477
|
|
|
|
|
|
|
|
478
|
0
|
|
|
0
|
1
|
|
sub print_imgs { my $self=shift;
|
479
|
0
|
|
|
|
|
|
foreach (keys %{$self->{DONE}}){
|
|
0
|
|
|
|
|
|
|
480
|
0
|
0
|
|
|
|
|
next if $_!~$self->{ext_ok}; # hack hack
|
481
|
0
|
|
|
|
|
|
print "From $_\n\t$self->{DONE}->{$_}\n";
|
482
|
|
|
|
|
|
|
}
|
483
|
|
|
|
|
|
|
}
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=head2 METHOD imgs_as_hash
|
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
Returns a reference to a hash of images saved,
|
488
|
|
|
|
|
|
|
where keys are new image locations, values are original locations.
|
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=cut
|
491
|
|
|
|
|
|
|
|
492
|
0
|
|
|
0
|
1
|
|
sub imgs_as_hash { my $self=shift;
|
493
|
0
|
|
|
|
|
|
my $n = {};;
|
494
|
0
|
|
|
|
|
|
foreach (keys %{$self->{DONE}}){
|
|
0
|
|
|
|
|
|
|
495
|
0
|
0
|
|
|
|
|
next if $_!~$self->{ext_ok}; # hack hack
|
496
|
0
|
|
|
|
|
|
$n->{$self->{DONE}->{$_}} = $_;
|
497
|
|
|
|
|
|
|
}
|
498
|
0
|
|
|
|
|
|
return $n;
|
499
|
|
|
|
|
|
|
}
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=head2 METHOD print_done
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Print a list of the URLs accessed
|
504
|
|
|
|
|
|
|
and return a reference to a hash of the same.
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=cut
|
507
|
|
|
|
|
|
|
|
508
|
0
|
|
|
0
|
1
|
|
sub print_done { my $self=shift;
|
509
|
0
|
|
|
|
|
|
foreach (keys %{$self->{DONE}}){
|
|
0
|
|
|
|
|
|
|
510
|
0
|
|
|
|
|
|
print "At $_\n\t$self->{DONE}->{$_}\n";
|
511
|
|
|
|
|
|
|
}
|
512
|
0
|
|
|
|
|
|
return \$self->{DONE};
|
513
|
|
|
|
|
|
|
}
|
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=head2 METHOD print_failed
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
Print a list of the URLs failed, and reasons
|
518
|
|
|
|
|
|
|
and return a reference to a hash of the same.
|
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=cut
|
521
|
|
|
|
|
|
|
|
522
|
0
|
|
|
0
|
1
|
|
sub print_failed { my $self=shift;
|
523
|
0
|
|
|
|
|
|
foreach (keys %{$self->{FAILED}}){
|
|
0
|
|
|
|
|
|
|
524
|
0
|
|
|
|
|
|
print "At $_\n\t$self->{FAILED}->{$_}\n";
|
525
|
|
|
|
|
|
|
}
|
526
|
0
|
|
|
|
|
|
return \$self->{FAILED};
|
527
|
|
|
|
|
|
|
}
|
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=head2 METHOD print_ignored
|
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
Print a list of the URLs ignored
|
532
|
|
|
|
|
|
|
and return a reference to a hash of the same.
|
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=cut
|
535
|
|
|
|
|
|
|
|
536
|
0
|
|
|
0
|
1
|
|
sub print_ignored { my $self=shift;
|
537
|
0
|
|
|
|
|
|
foreach (keys %{$self->{IGNORED}}){
|
|
0
|
|
|
|
|
|
|
538
|
0
|
|
|
|
|
|
print "At $_\n\t$self->{IGNORED}->{$_}\n";
|
539
|
|
|
|
|
|
|
}
|
540
|
0
|
|
|
|
|
|
return \$self->{IGNORED};
|
541
|
|
|
|
|
|
|
}
|
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
1; # Return a true value for 'use'
|
548
|
|
|
|
|
|
|
__END__
|