File Coverage

blib/lib/HTTP/GetImages.pm
Criterion Covered Total %
statement 24 243 9.8
branch 0 154 0.0
condition 0 104 0.0
subroutine 8 17 47.0
pod 6 8 75.0
total 38 526 7.2


line stmt bran cond sub pod time code
1             package HTTP::GetImages;
2            
3 1     1   781 use vars qw /$EXTENSIONS_RE $EXTENSIONS_BAD $VERSION/;
  1         2  
  1         92  
4            
5             $VERSION=0.343;
6            
7             =head1 NAME
8            
9             HTTP::GetImages - Spider to recover and store images from web pages.
10            
11             =head1 SYNOPSIS
12            
13             use HTTP::GetImages;
14            
15             $_ = new HTTP::GetImages (
16             dir => '.',
17             todo => ['http://www.google.com/',],
18             dont => ['http://www.somewhere/ignorethis.html','http://and.this.html'],
19             chat => 1,
20             );
21            
22             $_->print_imgs;
23             $_->print_done;
24             $_->print_failed;
25             $_->print_ignored;
26            
27             my $hash = $_->imgs_as_hash;
28             foreach (keys %{$hash}){
29             warn "$_ = ",$hash->{$_},"\n";
30             }
31            
32             exit;
33            
34             =head1 DESCRIPTION
35            
36             This module allow syou to automate the searching, recovery and local storage
37             of images from the web, including those linked by anchor (C), mage (C)
38             and image map (C) elements.
39            
40             Supply a URI or list of URIs to process, and C will recurse
41             over every link it finds, searching for images.
42            
43             By supplying a list of URIs, you can restrict the search to certain webservers
44             and directories, or exclude it from certain webservers and directories.
45            
46             You can also decide to reject images that are too small or too large.
47            
48             =head1 DEPENDENCIES
49            
50             LWP::UserAgent;
51             HTTP::Request;
52             HTML::TokeParser;
53            
54             =cut
55            
56 1     1   852314 use LWP::UserAgent;
  1         1427552  
  1         44  
57 1     1   12 use HTTP::Request;
  1         7  
  1         26  
58 1     1   907 use HTML::TokeParser;
  1         13062  
  1         34  
59 1     1   57 use Carp;
  1         2  
  1         102  
60 1     1   5 use strict;
  1         9  
  1         28  
61 1     1   6 use warnings;
  1         2  
  1         36  
62 1     1   5 no strict 'refs';
  1         1  
  1         3101  
63            
64             =head1 PACKAGE GLOBAL VARIABLE
65            
66             =head2 $CHAT
67            
68             Set to above zero if you'd like a real-time report to C.
69             Defaults to off.
70            
71             =cut
72            
73             my $CHAT;
74            
75             # Default values to apply to $self->{ext_ok}
76             $EXTENSIONS_RE = '(jpg|jpeg|bmp|gif|png|xbm|xmp)';
77            
78             # Default values for $self->{ext_bad}
79             $EXTENSIONS_BAD = '(wmv|avi|rm|mpg|asf|ram|asx|mpeg|mp3)';
80            
81            
82             =head1 CONSTRUCTOR METHOD new
83            
84             Besides the class reference, accepts name=>value pairs:
85            
86             =over 4
87            
88             =item max_attempts
89            
90             The maximum attempts the agent should make to access the site. Default is three.
91            
92             =item dir
93            
94             the path to the directory in which to store images (no trailing oblique necessary);
95            
96             =item rename
97            
98             Default value is 0, which allows images to be saved with their original names.
99             If set with a value of 1, images will be given new names based on the time
100             they were saved at. If set to 2, images will be given filenames according to their
101             source location.
102            
103             =item todo
104            
105             one or more URL to process: can be an anonymous array, array reference, or scalar.
106            
107             =item dont
108            
109             As C, above, but URLs should be ignored.
110            
111             If one of these is C, then will ignore all B documents
112             that do not match exactly those in the C array of URLs to process.
113             If one of these is C, will ignore no documents.
114            
115             =item ext_ok
116            
117             A regular expression 'or' list of image extensions to match.
118            
119             Will be applied at the end of a filename, after a point, and is insensitive to case.
120            
121             Defaults to C<(jpg|jpeg|bmp|gif|png|xbm|xmp)>.
122            
123             =item ext_bad
124            
125             As C (above), but default value is:C<(wmv|avi|rm|mpg|asf|ram|asx|mpeg|mp3)>
126            
127             =item match_url
128            
129             The minimum path a URL must contain. This can be a scalar or an array reference.
130            
131             =item min_size.
132            
133             The minimum size an image can be if it is to be saved.
134            
135             =item max_size
136            
137             The maximum size an image can be if it is to be saved.
138            
139             =back
140            
141             The object has several private variables, which
142             you can access for the results when the job is done.
143             However, do check out the public methods for accessing
144             these.
145            
146             =over 4
147            
148             =item DONE
149            
150             a hash keys of which are the original URLs of the images, value being are the local filenames.
151            
152             =item FAILED
153            
154             a hash, keys of which are the failed URLs, values being short reasons.
155            
156             =cut
157            
158 0     0 0   sub new { my ($class) = (shift);
159 0 0         warn "Making new ",__PACKAGE__ if $CHAT;
160 0 0         unless (defined $class) {
161 0           carp "Usage: ".__PACKAGE__."->new( {key=>value} )\n";
162 0           return undef;
163             }
164 0           my %args;
165            
166             # Take parameters and place in object slots/set as instance variables
167 0 0         if (ref $_[0] eq 'HASH'){ %args = %{$_[0]} }
  0 0          
  0            
168 0           elsif (not ref $_[0]){ %args = @_ }
169             else {
170 0           carp "Usage: $class->new( { key=>values, } )";
171 0           return undef;
172             }
173 0           my $self = bless {}, $class;
174            
175             # Slots that have default values:
176             # $self->{min_size};
177             # $self->{match_url}
178             # $self->{dir},
179             # $todo,= []
180 0           $self->{dont} = [];
181             # $MINIMGSIZE
182 0           $self->{ext_ok} = $EXTENSIONS_RE; # Defualt extensions to use
183 0           $self->{ext_bad} = $EXTENSIONS_BAD; # Ditto for ignore.
184 0           $self->{rename} = 0;
185 0           $self->{max_attempts} = 3;
186            
187             # Set/overwrite public slots with user's values
188 0           foreach (keys %args) {
189 0           $self->{lc $_} = $args{$_};
190 0 0         warn "$_ -> $self->{$_}\n" if $CHAT;
191             }
192            
193             # Catch parameter errors
194 0 0 0       if (not exists $self->{dir} or not defined $self->{dir}){
195 0           croak "No 'dir' slot defined";
196             }
197 0 0         if (!-d $self->{dir}){
198 0           croak "The dir to save to <$self->{dir}> could not be found or is not a directory";
199             }
200 0 0         if (not exists $self->{todo}){
201 0           croak "The 'todo' slot is not defined";
202             }
203            
204             # React to user slots
205 0 0 0       if (exists $self->{chat} and defined $self->{chat}){
206 0           $CHAT = 1;
207 0           warn "Chat mode on";
208 0           } else { undef $CHAT }
209            
210             # Turn scalars into arrays for later use
211 0 0 0       if (exists $self->{match_url} and not ref $self->{match_url}){
212 0           $self->{match_url} = [$self->{match_url}];
213             }
214 0 0 0       if (exists $self->{todo} and not ref $self->{todo}){
215 0           $self->{todo} = [$self->{todo}];
216             }
217 0 0 0       if (exists $self->{dont} and not ref $self->{dont}){
218 0           $self->{dont} = [$self->{dont}];
219             }
220 0           @_ = @{$self->{todo}};
  0            
221 0           $self->{todo} = {};
222 0           foreach (@_){ $self->{todo}->{$_} = 1 }
  0            
223 0 0         if ($self->{dont}){
224 0           @_ = @{$self->{dont}};
  0            
225 0           $self->{dont} = {};
226 0           foreach (@_){ $self->{dont}->{$_} = 1 }
  0            
227             }
228            
229             # Slots that are not adjustable by user:
230 0           $self->{DONE} = {};
231 0           $self->{FAILED} = {};
232            
233 0           DOC:
234 0           while (keys %{$self->{todo}} ){
235 0           @_ = keys %{$self->{todo}};
  0            
236 0           my $doc_url = shift @_;
237 0 0         warn "-"x60,"\n" if $CHAT;
238 0           my ($doc,$p);
239             # If using match_url feature: ignore doc if not match start of one string
240 0 0         if (exists $self->{match_url}){
241 0           foreach (@{$self->{match_url}}){
  0            
242 0 0         if ($doc_url !~ /^$_/){
243 0 0         warn "URL out of scope: $doc_url $_\n" if $CHAT;
244 0           delete $self->{todo}->{$doc_url};
245 0           next DOC;
246             } else {
247 0 0         warn "URL ok by $_\n" if $CHAT;
248             }
249             }
250             }
251            
252 0 0 0       if (exists $self->{FAILED}->{$doc_url} or exists $self->{DONE}->{$doc_url}){
253 0 0         warn "Already done $doc_url.\n" if $CHAT;
254 0           delete $self->{todo}->{$doc_url};
255 0           next DOC;
256             }
257            
258 0 0         if (exists $self->{dont}->{$doc_url}){
259 0 0         warn "In IGNORE list: $doc_url.\n" if $CHAT;
260 0           delete $self->{todo}->{$doc_url};
261 0           next DOC;
262             }
263            
264 0 0 0       if (exists $self->{dont}->{ALL} and not $self->{todo}->{$doc_url}){
265 0 0         warn "Not in TODO list: $doc_url.\n" if $CHAT;
266 0           delete $self->{todo}->{$doc_url};
267 0           next DOC;
268             }
269            
270             # Not in do list, not an image, not run with IGNORE NONE option
271 0 0 0       if (not exists $self->{todo}->{$doc_url} and $doc_url !~ m|(\.$self->{ext_ok})$|i
      0        
272             and not exists $self->{dont}->{NONE}){
273 0 0         warn "Not in DO list - ignoring $doc_url .\n" if $CHAT;
274 0           $self->{dont}->{$doc_url} = "Ignoring";
275 0           delete $self->{todo}->{$doc_url};
276 0           next DOC;
277             }
278            
279 0 0         unless ($doc = $self->get_document($doc_url)){
280 0 0         warn "Agent could not open $doc_url" if $CHAT;
281 0           $self->{FAILED}->{$doc_url} = "Agent couldn't open document";
282 0           delete $self->{todo}->{$doc_url};
283 0           next DOC;
284             }
285            
286             # If an image, save it
287 0 0         if ($doc_url =~ m|(\.$self->{ext_ok})$|i) {
288 0           $self->{DONE}->{$doc_url} = $self->_save_img($doc_url,$doc);
289 0 0         warn "OK: $doc_url" if $CHAT;
290 0           delete $self->{todo}->{$doc_url};
291 0           next DOC;
292             } else {
293 0           $self->{DONE}->{$doc_url} = "Did HTML.";
294 0           delete $self->{todo}->{$doc_url};
295             }
296            
297             # Otherwise try to parse it
298 0 0         unless ($p = new HTML::TokeParser( \$doc )){
299 0 0         warn "* Couldn't create parser from \$doc\n" if $CHAT;
300 0           $self->{FAILED}->{$doc_url} = "Couldn't create agent parser";
301 0           delete $self->{todo}->{$doc_url};
302 0           next DOC;
303             }
304 0 0         warn "OK - parsing document $doc_url ...\n" if $CHAT;
305            
306 0           while (my $token = $p->get_token){
307            
308 0 0 0       if (@$token[1] eq 'img' and exists @$token[2]->{src}){
    0 0        
    0 0        
      0        
309 0 0         warn "*** Found image: @$token[2]->{src}\n" if $CHAT;
310 0           my $uri = &abs_url( $doc_url, @$token[2]->{src} );
311 0 0 0       if ($uri and not exists $self->{IGNORE0}->{$uri} and not exists $self->{DONE}->{$uri} and not exists $self->{FAILED}->{$uri}
      0        
      0        
312             ){
313 0           $self->{todo}->{$uri} = 1;
314             } else {
315 0 0         warn "\t ignoring that img.\n" if $CHAT;
316             }
317             }
318             elsif (@$token[1] =~ /^(area|a)$/ and exists @$token[2]->{href} and @$token[0] eq 'S'){
319 0 0         warn "*** Found link: @$token[2]->{href}\n" if $CHAT;
320 0           my $uri = &abs_url( $doc_url, @$token[2]->{href} );
321 0 0 0       if ($uri and not exists $self->{dont}->{$uri} and not exists $self->{DONE}->{$uri} and not exists $self->{FAILED}->{$uri}
      0        
      0        
      0        
      0        
322             and not (exists $self->{dont}->{ALL} and not exists $self->{todo}->{$uri})
323             ){
324 0           $self->{todo}->{$uri} = 1;
325             } else {
326 0 0         warn "\t ignoring that link.\n" if $CHAT;
327             }
328             }
329             elsif (@$token[1] eq 'frame' and exists(@$token[2]->{src})){ # This block (DL)
330 0 0         warn "*** Found frame: @$token[2]->{src}\n" if $CHAT;
331 0           my $uri = &abs_url( $doc_url, @$token[2]->{src} );
332 0 0 0       if ($uri and not exists $self->{IGNORE0}->{$uri} and not exists $self->{DONE}->{$uri} and not exists $self->{FAILED}->{$uri}
      0        
      0        
      0        
      0        
333             and not (exists $self->{dont}->{ALL} and not exists $self->{todo}->{$uri}) ){
334 0           $self->{todo}->{$uri} = 1;
335             } else {
336 0 0         warn "\t ignoring that frame.\n" if $CHAT;
337             }
338             }
339             } # Next token
340 0           delete $self->{todo}->{$doc_url};
341             } # Next DOC
342            
343 0           return $self;
344             } # End sub new
345            
346            
347            
348            
349            
350             #
351             # SUB get_document
352             # Accepts a URL, returns the source of the document at the URL
353             # or undef on failure
354             #
355 0     0 0   sub get_document { my ($self,$url) = (shift,shift); # Recieve as argument the URL to access
356 0 0         if ($url =~ m|(\.$self->{ext_bad})$|i) { # (DL)
357 0 0         warn "Ignoring - extension on the 'bad' list" if $CHAT;
358 0           return undef;
359             }
360 0           my ($req,$res);
361 0           my $ua = LWP::UserAgent->new; # Create a new UserAgent
362 0           for my $attempt (1..$self->{max_attempts}){
363 0 0 0       if ($attempt!=1 and $attempt-1 == $self->{max_attempts}){
364 0           $ua->agent('MSIE Internet Explorer 6.0 (Mozilla compatible'); # Naughty?
365             } else {
366 0           $ua->agent('Perl::'.__PACKAGE__.' v'.$VERSION); # Give it a type name
367             }
368 0 0         warn "Attempt ($attempt) to access <$url>...\n" if $CHAT;
369 0           $req = new HTTP::Request('GET', $url); # Format URL request
370 0 0         next if not defined $req;
371 0           $res = $ua->request($req); # $res is the object UA returned
372 0 0         last if $res->is_success(); # If not successful
373             }
374 0 0         if (not defined $req){
375 0 0         warn "...could not GET.\n" if $CHAT;
376 0           return undef;
377             }
378 0 0         if (not $res->is_success()) { # If not successful
379 0 0         warn"...failed.\n" if $CHAT;
380             return undef
381 0           }
382            
383 0 0         warn "...ok.\n" if $CHAT;
384             # Test size
385 0 0 0       if ((exists $self->{max_size} or exists $self->{min_size})
      0        
386             and $url =~ m|(\.$self->{ext_ok})$|i) {
387 0           $_ = length ($res->content);
388 0 0 0       if (defined $_ and $self->{min_size} and $_ < $self->{min_size}){
    0 0        
      0        
      0        
389 0 0         warn "Image size too small, ignoring.\n" if $CHAT;
390 0           $self->{dont}->{$url} = "Size $_ bytes is too small.";
391 0           return undef;
392             }
393             elsif (defined $_ and $self->{max_size} and $_ > $self->{max_size}){
394 0 0         warn "Image size too large, ignoring.\n" if $CHAT;
395 0           $self->{dont}->{$url} = "Size $_ bytes is too large.";
396 0           return undef;
397             }
398             }
399 0           return $res->content; # $res->content is the HTML the UA returned from the URL
400             }
401            
402            
403            
404             # PRIVATE METHOD _save_img
405             #
406             # Accepts and the actual image source.
407             # Won't store same image twice.
408             #
409             # Returns the path the image was saved at.
410            
411 0     0     sub _save_img { my ($self,$url,$img) = (shift,shift,shift,shift);
412 0           local *OUT;
413 0           my $filename;
414             # Remvoe any file path from the $url
415 0 0 0       if (exists $self->{DONE}->{$url} or exists $self->{FAILED}->{$url}){
416 0 0         warn "Already got this one ($url), not saving.\n" if $CHAT;
417 0           return undef;
418             }
419 0           $url =~ m|/([^./]+)(\.$self->{ext_ok})$|i;
420 0 0         if ($self->{rename}){
    0          
421 0           $filename = $self->{dir}.'/'.(join'',localtime).$2;
422             } elsif ($self->{rename} == 2){ # )
423 0           $filename = $url; # } DL
424 0           $filename =~ s/\/|\:|\~|\?/_/g; # )
425 0           $filename = $self->{dir}.'\\'.$filename; # )
426             } else {
427 0           $filename = "$self->{dir}/$1$2";
428             }
429 0 0         warn "Saving image as <$filename>...\n" if $CHAT;
430 0 0 0       open OUT,">$filename" or warn "Couldn't open to save <$filename>!" and return "Failed to save.";
431 0           binmode OUT;
432 0           print OUT $img;
433 0           close OUT;
434 0 0         warn "...ok.\n" if $CHAT;
435 0           return $filename;
436             }
437            
438            
439             #
440             # SUB abs_url returns an absolute URL for a $child_url linked from $parent_url
441             #
442             # DOC http://www.netverifier.com/pin/nicolette/jezfuzchr001.html
443             # SRC /pin/nicolette/jezfuzchr001.jpg
444             #
445 0     0 1   sub abs_url { my ($parent_url,$child_url) = (shift,shift);
446 0 0         if ($child_url =~/^#/){
447 0           return undef;
448             }
449 0           my $hack;
450 0 0         if ($child_url =~ m|^/|) {
451 0           $parent_url =~ s|^(http://[\w.]+)?/.*$|$1|i;
452 0           return $parent_url.$child_url;
453             }
454 0 0         if ($child_url =~ m|^\.\.\/|i){
    0          
455 0           $parent_url =~ s/\/[^\/|^~]+$//; # Strip filename (fix: DL)
456 0 0         if ($parent_url =~ /\/$/){$parent_url =~ s/\/$//;} # (DL)
  0            
457 0 0         if ($child_url =~ /^\.\//){$child_url =~ s/^\.\///;} # (DL)
  0            
458 0           while ($child_url=~s/^\.\.\///gs ){
459 0           $parent_url =~s/[^\/]+\/?$//;
460             }
461 0           $child_url = $parent_url.$child_url;
462             } elsif ($child_url !~ m/^http:\/\//i){
463             # Assume relative path needs dir
464 0           $parent_url =~ s/\/[^\/]+$//; # Strip filename
465 0 0         if ($parent_url =~ /\/$/){ chop $parent_url }
  0            
466 0           $child_url = $parent_url .'/'.$child_url;
467             }
468 0           return $child_url;
469             }
470            
471            
472             =head2 METHOD print_imgs
473            
474             Print a list of the images saved.
475            
476             =cut
477            
478 0     0 1   sub print_imgs { my $self=shift;
479 0           foreach (keys %{$self->{DONE}}){
  0            
480 0 0         next if $_!~$self->{ext_ok}; # hack hack
481 0           print "From $_\n\t$self->{DONE}->{$_}\n";
482             }
483             }
484            
485             =head2 METHOD imgs_as_hash
486            
487             Returns a reference to a hash of images saved,
488             where keys are new image locations, values are original locations.
489            
490             =cut
491            
492 0     0 1   sub imgs_as_hash { my $self=shift;
493 0           my $n = {};;
494 0           foreach (keys %{$self->{DONE}}){
  0            
495 0 0         next if $_!~$self->{ext_ok}; # hack hack
496 0           $n->{$self->{DONE}->{$_}} = $_;
497             }
498 0           return $n;
499             }
500            
501             =head2 METHOD print_done
502            
503             Print a list of the URLs accessed
504             and return a reference to a hash of the same.
505            
506             =cut
507            
508 0     0 1   sub print_done { my $self=shift;
509 0           foreach (keys %{$self->{DONE}}){
  0            
510 0           print "At $_\n\t$self->{DONE}->{$_}\n";
511             }
512 0           return \$self->{DONE};
513             }
514            
515             =head2 METHOD print_failed
516            
517             Print a list of the URLs failed, and reasons
518             and return a reference to a hash of the same.
519            
520             =cut
521            
522 0     0 1   sub print_failed { my $self=shift;
523 0           foreach (keys %{$self->{FAILED}}){
  0            
524 0           print "At $_\n\t$self->{FAILED}->{$_}\n";
525             }
526 0           return \$self->{FAILED};
527             }
528            
529             =head2 METHOD print_ignored
530            
531             Print a list of the URLs ignored
532             and return a reference to a hash of the same.
533            
534             =cut
535            
536 0     0 1   sub print_ignored { my $self=shift;
537 0           foreach (keys %{$self->{IGNORED}}){
  0            
538 0           print "At $_\n\t$self->{IGNORED}->{$_}\n";
539             }
540 0           return \$self->{IGNORED};
541             }
542            
543            
544            
545            
546            
547             1; # Return a true value for 'use'
548             __END__