File Coverage

blib/lib/Dezi/Aggregator/Spider.pm
Criterion Covered Total %
statement 22 24 91.6
branch n/a
condition n/a
subroutine 8 8 100.0
pod n/a
total 30 32 93.7


line stmt bran cond sub pod time code
1             package Dezi::Aggregator::Spider;
2 1     1   1936 use Moose;
  1         449773  
  1         6  
3             extends 'Dezi::Aggregator';
4 1     1   6776 use Carp;
  1         2  
  1         108  
5 1     1   5 use Scalar::Util qw( blessed );
  1         2  
  1         47  
6 1     1   1171 use URI;
  1         4923  
  1         30  
7 1     1   10787 use HTTP::Cookies;
  1         13226  
  1         40  
8 1     1   902 use Types::Standard qw( InstanceOf Maybe Int CodeRef Str Bool ArrayRef );
  1         68768  
  1         23  
9 1     1   2022 use Dezi::Types qw( DeziFileRules DeziEpoch );
  1         3  
  1         11  
10 1     1   1220 use Dezi::Utils;
  0            
  0            
11             use Dezi::Queue;
12             use Dezi::Cache;
13             use Dezi::Aggregator::Spider::UA;
14             use Search::Tools::UTF8;
15             use XML::Feed;
16             use WWW::Sitemap::XML;
17             use File::Rules;
18             use Class::Load;
19              
20             #
21             # TODO tests for cookies, non-text urls needing filters
22             #
23             #
24              
25             has 'agent' => (
26             is => 'rw',
27             isa => Str,
28             default => sub {'dezi-spider http://dezi.org/'},
29             );
30             has 'authn_callback' => ( is => 'rw', isa => CodeRef );
31             has 'credential_timeout' => ( is => 'rw', isa => Int, default => sub {30} );
32             has 'credentials' => ( is => 'rw', isa => Str );
33             has 'delay' => ( is => 'rw', isa => Int, default => sub {5} );
34             has 'email' => (
35             is => 'rw',
36             isa => Str,
37             default => sub {'dezi@user.failed.to.set.email.invalid'},
38             );
39             has 'file_rules' => ( is => 'rw', isa => DeziFileRules, coerce => 1, );
40             has 'follow_redirects' => ( is => 'rw', isa => Bool, default => sub {1} );
41             has 'keep_alive' => ( is => 'rw', isa => Bool, default => sub {0} );
42              
43             # whitelist which HTML tags we consider "links"
44             # should be subset of what HTML::LinkExtor considers links
45             has 'link_tags' => (
46             is => 'rw',
47             isa => ArrayRef,
48             default => sub { [ 'a', 'frame', 'iframe' ] }
49             );
50              
51             has 'max_depth' => ( is => 'rw', isa => Maybe [Int] );
52             has 'max_files' => ( is => 'rw', isa => Int, default => sub {0} );
53             has 'max_size' => ( is => 'rw', isa => Int, default => sub {5_000_000} );
54             has 'max_time' => ( is => 'rw', isa => Int, ); # TODO
55             has 'md5_cache' => (
56             is => 'rw',
57             isa => InstanceOf ['Dezi::Cache'],
58             default => sub { Dezi::Cache->new }
59             );
60             has 'modified_since' => ( is => 'rw', isa => DeziEpoch, coerce => 1, );
61             has 'queue' => (
62             is => 'rw',
63             isa => InstanceOf ['Dezi::Queue'],
64             default => sub { Dezi::Queue->new }
65             );
66             has 'remove_leading_dots' => ( is => 'rw', isa => Bool, default => sub {1} );
67             has 'same_hosts' => ( is => 'rw', isa => ArrayRef, default => sub { [] } );
68             has 'timeout' => ( is => 'rw', isa => Int, default => sub {30} );
69             has 'ua' => ( is => 'rw', isa => InstanceOf ['LWP::UserAgent'] );
70             has 'uri_cache' => (
71             is => 'rw',
72             isa => InstanceOf ['Dezi::Cache'],
73             default => sub { Dezi::Cache->new },
74             );
75             has 'use_md5' => ( is => 'rw', isa => Bool, default => sub {0} );
76             has 'use_cookies' => ( is => 'rw', isa => Bool, default => sub {1} );
77              
78             #use LWP::Debug qw(+);
79              
80             our $VERSION = '0.014';
81              
82             # shortcut
83             my $UTILS = 'Dezi::Utils';
84              
85             =pod
86              
87             =head1 NAME
88              
89             Dezi::Aggregator::Spider - web aggregator
90              
91             =head1 SYNOPSIS
92              
93             use Dezi::Aggregator::Spider;
94             my $spider = Dezi::Aggregator::Spider->new(
95             indexer => Dezi::Indexer->new
96             );
97            
98             $spider->indexer->start;
99             $spider->crawl( 'http://swish-e.org/' );
100             $spider->indexer->finish;
101              
102             =head1 DESCRIPTION
103              
104             Dezi::Aggregator::Spider is a web crawler similar to
105             the spider.pl script in the Swish-e 2.4 distribution. Internally,
106             Dezi::Aggregator::Spider uses LWP::RobotUA to do the hard work.
107             See L<Dezi::Aggregator::Spider::UA>.
108              
109             =head1 METHODS
110              
111             See L<Dezi::Aggregator>.
112              
113             =head2 new( I<params> )
114              
115             All I<params> have their own get/set methods too. They include:
116              
117             =over
118              
119             =item agent I<string>
120              
121             Get/set the user-agent string reported by the user agent.
122              
123             =item email I<string>
124              
125             Get/set the email string reported by the user agent.
126              
127             =item use_md5 I<1|0>
128              
129             Flag as to whether each URI's content should be fingerprinted
130             and compared. Useful if the same content is available under multiple
131             URIs and you only want to index it once.
132              
133             =item uri_cache I<cache_object>
134              
135             Get/set the Dezi::Cache-derived object used to track which URIs have
136             been fetched already.
137              
138             =item md5_cache I<cache_object>
139              
140             If use_md5() is true, this Dezi::Cache-derived object tracks
141             the URI fingerprints.
142              
143             =item file_rules I<File_Rules_or_ARRAY>
144              
145             Apply L<File::Rules> object in uri_ok(). I<File_Rules_or_ARRAY> should
146             be a L<File::Rules> object or an array of strings suitable to passing
147             to File::Rules->new().
148              
149             =item queue I<queue_object>
150              
151             Get/set the Dezi::Queue-derived object for tracking which URIs still
152             need to be fetched.
153              
154             =item ua I<lwp_useragent>
155              
156             Get/set the Dezi::Aggregagor::Spider::UA object.
157              
158             =item max_depth I<n>
159              
160             How many levels of links to follow. B<NOTE:> This value describes the number
161             of links from the first argument passed to I<crawl>.
162              
163             Default is unlimited depth.
164              
165             =item max_time I<n>
166              
167             This optional key will set the max minutes to spider. Spidering
168             for this host will stop after C<max_time> seconds, and move on to the
169             next server, if any. The default is to not limit by time.
170              
171             =item max_files I<n>
172              
173             This optional key sets the max number of files to spider before aborting.
174             The default is to not limit by number of files. This is the number of requests
175             made to the remote server, not the total number of files to index (see C<max_indexed>).
176             This count is displayted at the end of indexing as C<Unique URLs>.
177              
178             This feature can (and perhaps should) be use when spidering a web site where dynamic
179             content may generate unique URLs to prevent run-away spidering.
180              
181             =item max_size I<n>
182              
183             This optional key sets the max size of a file read from the web server.
184             This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is
185             truncated per LWP::UserAgent.
186              
187             Set max_size to zero for unlimited size.
188              
189             =item modified_since I<date>
190              
191             This optional parameter will skip any URIs that do not report having
192             been modified since I<date>. The C<Last-Modified> HTTP header is used to
193             determine modification time.
194              
195             =item keep_alive I<1|0>
196              
197             This optional parameter will enable keep alive requests. This can dramatically speed
198             up spidering and reduce the load on server being spidered. The default is to not use
199             keep alives, although enabling it will probably be the right thing to do.
200              
201             To get the most out of keep alives, you may want to set up your web server to
202             allow a lot of requests per single connection (i.e MaxKeepAliveRequests on Apache).
203             Apache's default is 100, which should be good.
204              
205             When a connection is not closed the spider does not wait the "delay"
206             time when making the next request. In other words, there is no delay in
207             requesting documents while the connection is open.
208              
209             Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
210              
211             =item delay I<n>
212              
213             Get/set the number of seconds to wait between making requests. Default is
214             5 seconds (a very friendly delay).
215              
216             =item timeout I<n>
217              
218             Get/set the number of seconds to wait before considering the remote
219             server unresponsive. The default is 10.
220              
221             =item authn_callback I<code_ref>
222              
223             CODE reference to fetch username/password credentials when necessary. See also
224             C<credentials>.
225              
226             =item credential_timeout I<n>
227              
228             Number of seconds to wait before skipping manual prompt for username/password.
229              
230             =item credentials I<user:pass>
231              
232             String with C<username>:C<password> pair to be used when prompted by
233             the server.
234              
235             =item follow_redirects I<1|0>
236              
237             By default, 3xx responses from the server will be followed when
238             they are on the same hostname. Set to false (0) to not follow
239             redirects.
240              
241             =item link_tags
242              
243             TODO
244              
245             =item remove_leading_dots I<1|0>
246              
247             Microsoft server hack.
248              
249             =item same_hosts I<array_ref>
250              
251             ARRAY ref of hostnames to be treated as identical to the original
252             host being spidered. By default the spider will not follow
253             links to different hosts.
254              
255             =back
256              
257             =head2 BUILD
258              
259             Initializes a new spider object. Called by new().
260              
261             =cut
262              
263             sub BUILD {
264             my $self = shift;
265              
266             $self->{_auth_cache} = Dezi::Cache->new; # ALWAYS inmemory cache
267              
268             $self->{ua}
269             ||= Dezi::Aggregator::Spider::UA->new( $self->agent, $self->email, );
270              
271             $self->{ua}
272             ->set_link_tags( { map { lc($_) => 1 } @{ $self->{link_tags} } } );
273              
274             # we handle our own delay
275             $self->{ua}->delay(0);
276              
277             $self->{ua}->timeout( $self->timeout );
278              
279             # TODO we test this using HEAD request. Set here too?
280             #$self->{ua}->max_size( $self->{max_size} ) if $self->{max_size};
281              
282             if ( $self->use_cookies ) {
283             $self->{ua}->cookie_jar( HTTP::Cookies->new() );
284             }
285             if ( $self->keep_alive ) {
286             if ( $self->{ua}->can('conn_cache') ) {
287             $self->{ua}
288             ->conn_cache( { total_capacity => $self->keep_alive } );
289             }
290             else {
291             warn
292             "can't use keep-alive: conn_cache() method not available on ua "
293             . ref( $self->{ua} );
294             }
295             }
296              
297             $self->{_current_depth} = 1;
298              
299             $self->{same_host_lookup} = { map { $_ => 1 } @{ $self->{same_hosts} } };
300              
301             if ( $self->use_md5 ) {
302             Class::Load::load_class('Digest::MD5');
303             }
304              
305             # if Dezi::Indexer::Config defined, use that for some items
306             if ( $self->indexer and $self->indexer->config ) {
307             if ( $self->indexer->config->FileRules && !$self->file_rules ) {
308             $self->file_rules(
309             File::Rules->new( $self->indexer->config->FileRules ) );
310             }
311             }
312              
313             # from spider.pl. not sure if we need it or not.
314             # Lame Microsoft
315             $URI::ABS_REMOTE_LEADING_DOTS = $self->remove_leading_dots;
316              
317             return $self;
318             }
319              
320             =head2 uri_ok( I<uri> )
321              
322             Returns true if I<uri> is acceptable for including in an index.
323             The 'ok-ness' of the I<uri> is based on its base, robot rules,
324             and the spider configuration.
325              
326             =cut
327              
328             sub uri_ok {
329             my $self = shift;
330             my $uri = shift or croak "URI required";
331             my $str = $uri->canonical->as_string;
332             $str =~ s/#.*//; # target anchors create noise
333              
334             if ( $self->verbose > 1 || $self->debug ) {
335             $self->write_log_line();
336             $self->write_log(
337             uri => $uri,
338             msg => "checking if ok",
339             );
340             }
341              
342             if ( $uri->scheme !~ m,^http, ) {
343             $self->debug and $self->write_log(
344             uri => $uri,
345             msg => "skipping, unsupported scheme"
346             );
347             return 0;
348             }
349              
350             # check if we're on the same host.
351             if ( $uri->rel( $self->{_base} ) eq $uri ) {
352              
353             # not on this host. check our aliases
354             if ( !exists $self->{same_host_lookup}
355             ->{ $uri->canonical->authority || '' } )
356             {
357             my $host = $uri->canonical->authority;
358             $self->debug
359             and $self->write_log(
360             uri => $uri,
361             msg => "skipping, different host $host",
362             );
363             return 0;
364             }
365              
366             # in same host lookup, so proceed.
367             }
368              
369             my $path = $uri->path;
370             my $swish3 = $self->indexer ? $self->indexer->swish3 : undef;
371             my $mime = $UTILS->get_mime( $path, $swish3 );
372              
373             if ( !$UTILS->get_parser_for_mime( $mime, $swish3 ) ) {
374             $self->debug and $self->write_log(
375             uri => $uri,
376             msg => "skipping, no parser for $mime",
377             );
378             return 0;
379             }
380              
381             # check regex
382             if ( $self->file_rules ) {
383              
384             if ( $self->_apply_file_rules( $uri->path_query, $self->file_rules )
385             && !$self->_apply_file_match( $uri->path_query,
386             $self->file_rules ) )
387             {
388             $self->debug and $self->write_log(
389             uri => $uri,
390             msg => "skipping, matched file_rules",
391             );
392             return 0;
393             }
394             }
395              
396             # head request to check max_size and modified_since
397             if ( $self->max_size or $self->modified_since ) {
398             my %head_args = (
399             uri => $uri,
400             delay => 0, # assume each get() applies the delay
401             debug => $self->debug,
402             verbose => $self->verbose,
403             );
404              
405             if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) {
406             $head_args{user} = $user;
407             $head_args{pass} = $pass;
408             }
409             my $resp = $self->ua->head(%head_args);
410              
411             # early abort if resource doesn't exist
412             if ( $resp->status == 404 ) {
413             $self->debug
414             and $self->write_log(
415             uri => $uri,
416             msg => "skipping, 404 not found",
417             );
418             return 0;
419             }
420              
421             # redirect? assume ok now and _make_request will check on it later.
422             if ( $resp->is_redirect ) {
423             $self->debug
424             and $self->write_log(
425             uri => $uri,
426             msg => "deferring, is_redirect",
427             );
428             return 1;
429             }
430              
431             my $last_mod = $resp->last_modified;
432             if ( $last_mod
433             and $self->modified_since
434             and $self->modified_since > $last_mod )
435             {
436             $self->debug
437             and $self->write_log(
438             uri => $uri,
439             msg => sprintf(
440             "skipping, last modified %s (%s < %s)",
441             $resp->header('last-modified'), $last_mod,
442             $self->modified_since
443             ),
444             );
445             return 0;
446             }
447              
448             if ( $resp->content_length and $self->max_size ) {
449             if ( $resp->content_length > $self->max_size ) {
450             $self->debug
451             and $self->write_log(
452             uri => $uri,
453             msg => sprintf( "skipping, %s > max_size",
454             $resp->content_length ),
455             );
456             return 0;
457             }
458             }
459              
460             }
461              
462             ( $self->verbose > 1 || $self->debug ) and $self->write_log(
463             uri => $uri,
464             msg => "ok",
465             );
466             return 1;
467             }
468              
469             sub _add_links {
470             my ( $self, $parent, @links ) = @_;
471              
472             # calc depth
473             if ( !$self->{_parent} || $self->{_parent} ne $parent ) {
474             $self->{_current_depth}++;
475             }
476              
477             $self->{_parent} ||= $parent; # first time.
478              
479             $self->debug and $self->write_log(
480             uri => $parent,
481             msg => sprintf( 'evaluating %s links', scalar(@links) ),
482             );
483              
484             for my $l (@links) {
485             my $uri = $l->abs( $self->{_base} ) or next;
486             $uri = $uri->canonical; # normalize
487             if ( $self->uri_cache->has("$uri") ) {
488             $self->debug and $self->write_log(
489             uri => $uri,
490             msg => "skipping, already checked",
491             );
492             next;
493             }
494             $self->uri_cache->add( "$uri" => $self->{_current_depth} );
495              
496             if ( $self->uri_ok($uri) ) {
497             $self->add_to_queue($uri);
498             }
499             }
500             }
501              
502             # ported from spider.pl
503             # Do we need to authorize? If so, ask for password and request again.
504             # First we try using any cached value
505             # Then we try using the get_password callback
506             # Then we ask.
507              
508             sub _authorize {
509             my ( $self, $uri, $response ) = @_;
510              
511             delete $self->{last_auth}; # since we know that doesn't work
512              
513             if ( $response->header('WWW-Authenticate')
514             && $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i )
515             {
516             my $realm = $1;
517             my $user_pass;
518              
519             # Do we have a cached user/pass for this realm?
520             # only each URI only once
521             unless ( $self->{_request}->{auth}->{$uri}++ ) {
522             my $key = $uri->canonical->host_port . ':' . $realm;
523              
524             if ( $user_pass = $self->{_auth_cache}->get($key) ) {
525              
526             # If we didn't just try it, try again
527             unless ( $uri->userinfo && $user_pass eq $uri->userinfo ) {
528              
529             # add the user/pass to the URI
530             $uri->userinfo($user_pass);
531              
532             #warn " >> set userinfo via _auth_cache\n" if $self->debug;
533             return 1;
534             }
535             else {
536             # we've tried this before
537             #warn "tried $user_pass before";
538             return 0;
539             }
540             }
541             }
542              
543             # now check for a callback password (if $user_pass not set)
544             unless ( $user_pass || $self->{_request}->{auth}->{callback}++ ) {
545              
546             # Check for a callback function
547             if ( $self->{authn_callback}
548             and ref $self->{authn_callback} eq 'CODE' )
549             {
550             $user_pass = $self->{authn_callback}
551             ->( $self, $uri, $response, $realm );
552             $uri->userinfo($user_pass);
553              
554             #warn " >> set userinfo via authn_callback\n" if $self->debug;
555             return 1;
556             }
557             }
558              
559             # otherwise, prompt (over and over)
560             if ( !$user_pass ) {
561             $user_pass = $self->_get_basic_credentials( $uri, $realm );
562             }
563              
564             if ($user_pass) {
565             $uri->userinfo($user_pass);
566             $self->{cur_realm} = $realm; # save so we can cache if it's valid
567             return 1;
568             }
569             }
570              
571             return 0;
572              
573             }
574              
575             # From spider.pl
576             sub _get_basic_credentials {
577             my ( $self, $uri, $realm ) = @_;
578              
579             # Exists but undefined means don't ask.
580             return
581             if exists $self->{credential_timeout}
582             && !defined $self->{credential_timeout};
583              
584             my $netloc = $uri->canonical->host_port;
585              
586             my ( $user, $password );
587              
588             eval {
589             local $SIG{ALRM} = sub { die "timed out\n" };
590              
591             # a zero timeout means don't time out
592             alarm( $self->{credential_timeout} ) unless $^O =~ /Win32/i;
593              
594             if ( $uri->userinfo ) {
595             print STDERR "\nSorry: invalid username/password\n";
596             $uri->userinfo(undef);
597             }
598              
599             print STDERR
600             "Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: ";
601             $user = <STDIN>;
602             chomp($user) if $user;
603             die "No Username specified\n" unless length $user;
604              
605             alarm( $self->{credential_timeout} ) unless $^O =~ /Win32/i;
606              
607             print STDERR "Password: ";
608             system("stty -echo");
609             $password = <STDIN>;
610             system("stty echo");
611             print STDERR "\n"; # because we disabled echo
612             chomp($password);
613             alarm(0) unless $^O =~ /Win32/i;
614             };
615              
616             alarm(0) unless $^O =~ /Win32/i;
617              
618             return if $@;
619              
620             return join ':', $user, $password;
621              
622             }
623              
624             =head2 add_to_queue( I<uri> )
625              
626             Add I<uri> to the queue.
627              
628             =cut
629              
630             sub add_to_queue {
631             my $self = shift;
632             my $uri = shift or croak "uri required";
633             return $self->queue->put($uri);
634             }
635              
636             =head2 next_from_queue
637              
638             Return next I<uri> from queue.
639              
640             =cut
641              
642             sub next_from_queue {
643             my $self = shift;
644             return $self->queue->get();
645             }
646              
647             =head2 left_in_queue
648              
649             Returns queue()->size().
650              
651             =cut
652              
653             sub left_in_queue {
654             return shift->queue->size();
655             }
656              
657             =head2 remove_from_queue( I<uri> )
658              
659             Calls queue()->remove(I<uri>).
660              
661             =cut
662              
663             sub remove_from_queue {
664             my $self = shift;
665             my $uri = shift or croak "uri required";
666             return $self->queue->remove($uri);
667             }
668              
669             =head2 get_doc
670              
671             Returns the next URI from the queue() as a Dezi::Indexer::Doc object,
672             or the error message if there was one.
673              
674             Returns undef if the queue is empty or max_depth() has been reached.
675              
676             =cut
677              
678             sub get_doc {
679             my $self = shift;
680              
681             # return unless we have something in the queue
682             return unless $self->left_in_queue();
683              
684             # pop the queue and make it a URI
685             my $uri = $self->next_from_queue();
686             my $depth = $self->uri_cache->get("$uri");
687              
688             $self->debug
689             and $self->write_log(
690             uri => $uri,
691             msg => sprintf(
692             "depth:%d max_depth:%s",
693             $depth, ( $self->max_depth || 'undef' )
694             ),
695             );
696              
697             return if defined $self->max_depth && $depth > $self->max_depth;
698              
699             $self->{_cur_depth} = $depth;
700              
701             my $doc = $self->_make_request($uri);
702              
703             if ($doc) {
704             $self->remove_from_queue($uri);
705             }
706              
707             return $doc;
708             }
709              
710             =head2 get_authorized_doc( I<uri>, I<response> )
711              
712             Called internally when the server returns a 401 or 403 response.
713             Will attempt to determine the correct credentials for I<uri>
714             based on the previous attempt in I<response> and what you
715             have configured in B<credentials>, B<authn_callback> or when
716             manually prompted.
717              
718             =cut
719              
720             sub get_authorized_doc {
721             my $self = shift;
722             my $uri = shift or croak "uri required";
723             my $response = shift or croak "response required";
724              
725             # set up credentials
726             $self->_authorize( $uri, $response->http_response ) or return;
727              
728             return $self->_make_request($uri);
729             }
730              
731             sub _make_request {
732             my ( $self, $uri ) = @_;
733              
734             # get our useragent
735             my $ua = $self->ua;
736             my $delay = 0;
737             if ( $self->{keep_alive} ) {
738             $delay = 0;
739             }
740             elsif ( !$self->{delay} or !$self->{_last_response_time} ) {
741             $delay = 0;
742             }
743             else {
744             my $elapsed = time() - $self->{_last_response_time};
745             $delay = $self->{delay} - $elapsed;
746             $delay = 0 if $delay < 0;
747             $self->debug
748             and $self->write_log(
749             uri => $uri,
750             msg => "elapsed:$elapsed delay:$delay",
751             );
752             }
753              
754             $self->write_log(
755             uri => $uri,
756             msg => "GET delay:$delay",
757             ) if $self->verbose;
758              
759             my %get_args = (
760             uri => $uri,
761             delay => $delay,
762             debug => $self->debug,
763             verbose => $self->verbose,
764             );
765              
766             if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) {
767             $get_args{user} = $user;
768             $get_args{pass} = $pass;
769             }
770              
771             # fetch the uri. $ua handles delay internally.
772             my $response = $ua->get(%get_args);
773             my $http_response = $response->http_response;
774              
775             # flag current time for next delay calc.
776             $self->{_last_response_time} = time();
777              
778             # redirect? follow, conditionally.
779             if ( $response->is_redirect ) {
780             my $location = $response->header('location');
781             if ( !$location ) {
782             $self->write_log(
783             uri => $uri,
784             msg => "skipping, redirect without a Location header",
785             );
786             return $response->status;
787             }
788             $self->debug
789             and $self->write_log(
790             uri => $uri,
791             msg => "redirect: $location",
792             );
793             if ( $self->follow_redirects ) {
794             $self->_add_links( $uri,
795             URI->new_abs( $location, $http_response->base ) );
796             }
797             return $response->status;
798             }
799              
800             if ( $response->ct ) {
801             $self->debug and $self->write_log(
802             uri => $uri,
803             msg => 'content-type: ' . $response->ct,
804             );
805             }
806              
807             # add its links to the queue.
808             # If the resource looks like an XML feed of some kind,
809             # glean its links differently than if it is an HTML response.
810             if ( my $feed = $self->looks_like_feed($http_response) ) {
811             $self->debug and $self->write_log(
812             uri => $uri,
813             msg => 'looks like feed'
814             );
815             my @links;
816             for my $entry ( $feed->entries ) {
817             push @links, URI->new( $entry->link );
818             }
819             $self->_add_links( $uri, @links );
820              
821             # we don't want the feed content, we want the links.
822             # TODO make this optional
823             return $response->status;
824             }
825             elsif ( my $sitemap = $self->looks_like_sitemap($http_response) ) {
826             $self->debug and $self->write_log(
827             uri => $uri,
828             msg => 'looks like sitemap',
829             );
830             my @links;
831             for my $url ( $sitemap->urls ) {
832             push @links, URI->new( $url->loc );
833             }
834             $self->_add_links( $uri, @links );
835              
836             # we don't want the feed content, we want the links.
837             # TODO make this optional
838             return $response->status;
839             }
840             else {
841             $self->_add_links( $uri, $response->links );
842             }
843              
844             # return $uri as a Doc object
845             my $use_uri = $response->success ? $ua->uri : $uri;
846             my $meta = {
847             org_uri => $uri,
848             ret_uri => ( $use_uri || $uri ),
849             depth => delete $self->{_cur_depth},
850             status => $response->status,
851             success => $response->success,
852             is_html => $response->is_html,
853             title => (
854             $response->success
855             ? ( $response->is_html
856             ? ( $response->title || "No title: $use_uri" )
857             : $use_uri
858             )
859             : "Failed: $use_uri"
860             ),
861             ct => ( $response->success ? $response->ct : "Unknown" ),
862             };
863              
864             my $headers = $http_response->headers;
865             my $buf = $response->content;
866              
867             if ( $self->{use_md5} ) {
868             my $fingerprint = $response->header('Content-MD5')
869             || Digest::MD5::md5_base64($buf);
870             if ( $self->md5_cache->has($fingerprint) ) {
871             return "duplicate content for "
872             . $self->md5_cache->get($fingerprint);
873             }
874             $self->md5_cache->add( $fingerprint => $uri );
875             }
876              
877             if ( $response->success ) {
878              
879             my $content_type = $meta->{ct};
880             my $swish3 = $self->indexer ? $self->indexer->swish3 : undef;
881             if ( !$UTILS->get_parser_for_mime( $content_type, $swish3 ) ) {
882             $self->write_log(
883             uri => $uri,
884             msg => "no parser for $content_type",
885             );
886             }
887             my $charset = $headers->content_type;
888             $charset =~ s/;?$meta->{ct};?//;
889             my $encoding = $headers->content_encoding || $charset;
890             my %doc = (
891             url => $meta->{org_uri},
892             modtime => ( $headers->last_modified || $headers->date ),
893             type => $meta->{ct},
894             content => ( $encoding =~ m/utf-8/i ? to_utf8($buf) : $buf ),
895             size => $headers->content_length || length( pack 'C0a*', $buf ),
896             charset => $encoding,
897             );
898              
899             # cache whatever credentials were used so we can re-use
900             if ( $self->{cur_realm} and $uri->userinfo ) {
901             my $key = $uri->canonical->host_port . ':' . $self->{cur_realm};
902             $self->{_auth_cache}->add( $key => $uri->userinfo );
903              
904             # not too sure of the best logic here
905             my $path = $uri->path;
906             $path =~ s!/[^/]*$!!;
907             $self->{last_auth} = {
908             path => $path,
909             auth => $uri->userinfo,
910             };
911             }
912              
913             # return doc
914             return $self->doc_class->new(%doc);
915              
916             }
917             elsif ( $response->status == 401 ) {
918              
919             # authorize and try again
920             $self->write_log(
921             uri => $uri,
922             msg => sprintf( "authn denied, retrying, %s",
923             $response->status_line ),
924             );
925             return $self->get_authorized_doc( $uri, $response )
926             || $response->status;
927             }
928             elsif ($response->status == 403
929             && $http_response->status_line =~ m/robots.txt/ )
930             {
931              
932             # ignore
933             $self->write_log(
934             uri => $uri,
935             msg => sprintf( "skipped, %s", $http_response->status_line ),
936             );
937             return $self->get_authorized_doc( $uri, $response )
938             || $response->status;
939             }
940             elsif ( $response->status == 403 ) {
941              
942             # authorize and try again
943             $self->write_log(
944             uri => $uri,
945             msg => sprintf( "retrying, %s", $http_response->status_line ),
946             );
947             return $self->get_authorized_doc( $uri, $response );
948             }
949             else {
950              
951             $self->write_log(
952             uri => $uri,
953             msg => $http_response->status_line,
954             );
955             return $response->status;
956             }
957              
958             return; # never get here.
959             }
960              
961             sub _get_user_pass {
962             my $self = shift;
963             my $uri = shift;
964              
965             # Set basic auth if defined - use URI specific first, then credentials.
966             # this doesn't track what should have authorization
967             my $last_auth;
968             if ( $self->{last_auth} ) {
969             my $path = $uri->path;
970             $path =~ s!/[^/]*$!!;
971             $last_auth = $self->{last_auth}->{auth}
972             if $self->{last_auth}->{path} eq $path;
973             }
974              
975             my ( $user, $pass ) = split /:/,
976             ( $last_auth || $uri->userinfo || $self->credentials || '' );
977              
978             return ( $user, $pass );
979             }
980              
981             =head2 looks_like_feed( I<http_response> )
982              
983             Called internally to perform naive heuristics on I<http_response>
984             to determine whether it looks like an XML feed of some kind,
985             rather than a HTML page.
986              
987             =cut
988              
989             sub looks_like_feed {
990             my $self = shift;
991             my $response = shift or croak "response required";
992             my $headers = $response->headers;
993             my $ct = $headers->content_type;
994             if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) {
995             return 0;
996             }
997             if ( $ct eq 'text/xml'
998             or $ct eq 'application/rss+xml'
999             or $ct eq 'application/rdf+xml'
1000             or $ct eq 'application/atom+xml' )
1001             {
1002             my $xml = $response->decoded_content; # TODO or content()
1003             return XML::Feed->parse( \$xml );
1004             }
1005              
1006             return 0;
1007             }
1008              
1009             =head2 looks_like_sitemap( I<http_response> )
1010              
1011             Called internally to perform naive heuristics on I<http_response>
1012             to determine whether it looks like a XML sitemap feed,
1013             rather than a HTML page.
1014              
1015             =cut
1016              
1017             sub looks_like_sitemap {
1018             my $self = shift;
1019             my $response = shift or croak "response required";
1020             my $headers = $response->headers;
1021             my $ct = $headers->content_type;
1022             if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) {
1023             return 0;
1024             }
1025             if ( $ct eq 'text/xml'
1026             or $ct eq 'application/xml' )
1027             {
1028             my $xml = $response->decoded_content; # TODO or content()
1029             my $sitemap = WWW::Sitemap::XML->new();
1030             eval { $sitemap->load( string => $xml ); };
1031             if ($@) {
1032             return 0;
1033             }
1034             return $sitemap;
1035             }
1036              
1037             return 0;
1038             }
1039              
1040             =head2 crawl( I<uri> )
1041              
1042             Implements the required crawl() method. Recursively fetches I<uri>
1043             and its child links to a depth set in max_depth().
1044              
1045             Will quit after max_files() unless max_files==0.
1046              
1047             Will quit after max_time() seconds unless max_time==0.
1048              
1049             =cut
1050              
1051             sub crawl {
1052             my $self = shift;
1053             my @urls = @_;
1054              
1055             my $indexer = $self->indexer; # may be undef
1056              
1057             for my $url (@urls) {
1058             my $started = time();
1059             $self->debug and $self->write_log(
1060             uri => $url,
1061             msg => "crawling",
1062             );
1063              
1064             my $uri = URI->new($url)->canonical;
1065             $self->uri_cache->add( "$uri" => 1 );
1066             $self->add_to_queue($uri);
1067             $self->{_base} = $uri->as_string;
1068             while ( my $doc = $self->get_doc ) {
1069             $self->debug and $self->write_log_line();
1070             next unless blessed($doc);
1071              
1072             # indexer not required
1073             $indexer->process($doc) if $indexer;
1074              
1075             $self->_increment_count;
1076              
1077             # abort if we've met any max_* conditions
1078             last if $self->max_files and $self->count >= $self->max_files;
1079             last
1080             if $self->max_time
1081             and ( time() - $started ) > $self->max_time;
1082             }
1083             }
1084              
1085             return $self->count;
1086             }
1087              
1088             =head2 write_log( I<args> )
1089              
1090             Passes I<args> to Dezi::Utils::write_log().
1091              
1092             =cut
1093              
1094             sub write_log {
1095             Dezi::Utils::write_log(@_);
1096             }
1097              
1098             =head2 write_log_line([I<char>, I<width>])
1099              
1100             Pass through to Dezi::Utils::write_log_line().
1101              
1102             =cut
1103              
1104             sub write_log_line {
1105             Dezi::Utils::write_log_line(@_);
1106             }
1107              
1108             __PACKAGE__->meta->make_immutable;
1109              
1110             1;
1111              
1112             __END__
1113              
1114             =head1 AUTHOR
1115              
1116             Peter Karman, E<lt>perl@peknet.comE<gt>
1117              
1118             =head1 BUGS
1119              
1120             Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through
1121             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
1122             I will be notified, and then you'll
1123             automatically be notified of progress on your bug as I make changes.
1124              
1125             =head1 SUPPORT
1126              
1127             You can find documentation for this module with the perldoc command.
1128              
1129             perldoc Dezi
1130              
1131              
1132             You can also look for information at:
1133              
1134             =over 4
1135              
1136             =item * Mailing list
1137              
1138             L<http://lists.swish-e.org/listinfo/users>
1139              
1140             =item * RT: CPAN's request tracker
1141              
1142             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>
1143              
1144             =item * AnnoCPAN: Annotated CPAN documentation
1145              
1146             L<http://annocpan.org/dist/Dezi-App>
1147              
1148             =item * CPAN Ratings
1149              
1150             L<http://cpanratings.perl.org/d/Dezi-App>
1151              
1152             =item * Search CPAN
1153              
1154             L<http://search.cpan.org/dist/Dezi-App/>
1155              
1156             =back
1157              
1158             =head1 COPYRIGHT AND LICENSE
1159              
1160             Copyright 2008-2009 by Peter Karman
1161              
1162             This library is free software; you can redistribute it and/or modify
1163             it under the same terms as Perl itself.
1164              
1165             =head1 SEE ALSO
1166              
1167             L<http://swish-e.org/>