File Coverage

blib/lib/Dezi/Aggregator/Spider.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Dezi::Aggregator::Spider;
2	1	1	1936	use Moose;
	1		449773
	1		6
3				extends 'Dezi::Aggregator';
4	1	1	6776	use Carp;
	1		2
	1		108
5	1	1	5	use Scalar::Util qw( blessed );
	1		2
	1		47
6	1	1	1171	use URI;
	1		4923
	1		30
7	1	1	10787	use HTTP::Cookies;
	1		13226
	1		40
8	1	1	902	use Types::Standard qw( InstanceOf Maybe Int CodeRef Str Bool ArrayRef );
	1		68768
	1		23
9	1	1	2022	use Dezi::Types qw( DeziFileRules DeziEpoch );
	1		3
	1		11
10	1	1	1220	use Dezi::Utils;
	0
	0
11				use Dezi::Queue;
12				use Dezi::Cache;
13				use Dezi::Aggregator::Spider::UA;
14				use Search::Tools::UTF8;
15				use XML::Feed;
16				use WWW::Sitemap::XML;
17				use File::Rules;
18				use Class::Load;
19
20				#
21				# TODO tests for cookies, non-text urls needing filters
22				#
23				#
24
25				has 'agent' => (
26				is => 'rw',
27				isa => Str,
28				default => sub {'dezi-spider http://dezi.org/'},
29				);
30				has 'authn_callback' => ( is => 'rw', isa => CodeRef );
31				has 'credential_timeout' => ( is => 'rw', isa => Int, default => sub {30} );
32				has 'credentials' => ( is => 'rw', isa => Str );
33				has 'delay' => ( is => 'rw', isa => Int, default => sub {5} );
34				has 'email' => (
35				is => 'rw',
36				isa => Str,
37				default => sub {'dezi@user.failed.to.set.email.invalid'},
38				);
39				has 'file_rules' => ( is => 'rw', isa => DeziFileRules, coerce => 1, );
40				has 'follow_redirects' => ( is => 'rw', isa => Bool, default => sub {1} );
41				has 'keep_alive' => ( is => 'rw', isa => Bool, default => sub {0} );
42
43				# whitelist which HTML tags we consider "links"
44				# should be subset of what HTML::LinkExtor considers links
45				has 'link_tags' => (
46				is => 'rw',
47				isa => ArrayRef,
48				default => sub { [ 'a', 'frame', 'iframe' ] }
49				);
50
51				has 'max_depth' => ( is => 'rw', isa => Maybe [Int] );
52				has 'max_files' => ( is => 'rw', isa => Int, default => sub {0} );
53				has 'max_size' => ( is => 'rw', isa => Int, default => sub {5_000_000} );
54				has 'max_time' => ( is => 'rw', isa => Int, ); # TODO
55				has 'md5_cache' => (
56				is => 'rw',
57				isa => InstanceOf ['Dezi::Cache'],
58				default => sub { Dezi::Cache->new }
59				);
60				has 'modified_since' => ( is => 'rw', isa => DeziEpoch, coerce => 1, );
61				has 'queue' => (
62				is => 'rw',
63				isa => InstanceOf ['Dezi::Queue'],
64				default => sub { Dezi::Queue->new }
65				);
66				has 'remove_leading_dots' => ( is => 'rw', isa => Bool, default => sub {1} );
67				has 'same_hosts' => ( is => 'rw', isa => ArrayRef, default => sub { [] } );
68				has 'timeout' => ( is => 'rw', isa => Int, default => sub {30} );
69				has 'ua' => ( is => 'rw', isa => InstanceOf ['LWP::UserAgent'] );
70				has 'uri_cache' => (
71				is => 'rw',
72				isa => InstanceOf ['Dezi::Cache'],
73				default => sub { Dezi::Cache->new },
74				);
75				has 'use_md5' => ( is => 'rw', isa => Bool, default => sub {0} );
76				has 'use_cookies' => ( is => 'rw', isa => Bool, default => sub {1} );
77
78				#use LWP::Debug qw(+);
79
80				our $VERSION = '0.014';
81
82				# shortcut
83				my $UTILS = 'Dezi::Utils';
84
85				=pod
86
87				=head1 NAME
88
89				Dezi::Aggregator::Spider - web aggregator
90
91				=head1 SYNOPSIS
92
93				use Dezi::Aggregator::Spider;
94				my $spider = Dezi::Aggregator::Spider->new(
95				indexer => Dezi::Indexer->new
96				);
97
98				$spider->indexer->start;
99				$spider->crawl( 'http://swish-e.org/' );
100				$spider->indexer->finish;
101
102				=head1 DESCRIPTION
103
104				Dezi::Aggregator::Spider is a web crawler similar to
105				the spider.pl script in the Swish-e 2.4 distribution. Internally,
106				Dezi::Aggregator::Spider uses LWP::RobotUA to do the hard work.
107				See L<Dezi::Aggregator::Spider::UA>.
108
109				=head1 METHODS
110
111				See L<Dezi::Aggregator>.
112
113				=head2 new( I<params> )
114
115				All I<params> have their own get/set methods too. They include:
116
117				=over
118
119				=item agent I<string>
120
121				Get/set the user-agent string reported by the user agent.
122
123				=item email I<string>
124
125				Get/set the email string reported by the user agent.
126
127				=item use_md5 I<1\|0>
128
129				Flag as to whether each URI's content should be fingerprinted
130				and compared. Useful if the same content is available under multiple
131				URIs and you only want to index it once.
132
133				=item uri_cache I<cache_object>
134
135				Get/set the Dezi::Cache-derived object used to track which URIs have
136				been fetched already.
137
138				=item md5_cache I<cache_object>
139
140				If use_md5() is true, this Dezi::Cache-derived object tracks
141				the URI fingerprints.
142
143				=item file_rules I<File_Rules_or_ARRAY>
144
145				Apply L<File::Rules> object in uri_ok(). I<File_Rules_or_ARRAY> should
146				be a L<File::Rules> object or an array of strings suitable to passing
147				to File::Rules->new().
148
149				=item queue I<queue_object>
150
151				Get/set the Dezi::Queue-derived object for tracking which URIs still
152				need to be fetched.
153
154				=item ua I<lwp_useragent>
155
156				Get/set the Dezi::Aggregagor::Spider::UA object.
157
158				=item max_depth I<n>
159
160				How many levels of links to follow. B<NOTE:> This value describes the number
161				of links from the first argument passed to I<crawl>.
162
163				Default is unlimited depth.
164
165				=item max_time I<n>
166
167				This optional key will set the max minutes to spider. Spidering
168				for this host will stop after C<max_time> seconds, and move on to the
169				next server, if any. The default is to not limit by time.
170
171				=item max_files I<n>
172
173				This optional key sets the max number of files to spider before aborting.
174				The default is to not limit by number of files. This is the number of requests
175				made to the remote server, not the total number of files to index (see C<max_indexed>).
176				This count is displayted at the end of indexing as C<Unique URLs>.
177
178				This feature can (and perhaps should) be use when spidering a web site where dynamic
179				content may generate unique URLs to prevent run-away spidering.
180
181				=item max_size I<n>
182
183				This optional key sets the max size of a file read from the web server.
184				This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is
185				truncated per LWP::UserAgent.
186
187				Set max_size to zero for unlimited size.
188
189				=item modified_since I<date>
190
191				This optional parameter will skip any URIs that do not report having
192				been modified since I<date>. The C<Last-Modified> HTTP header is used to
193				determine modification time.
194
195				=item keep_alive I<1\|0>
196
197				This optional parameter will enable keep alive requests. This can dramatically speed
198				up spidering and reduce the load on server being spidered. The default is to not use
199				keep alives, although enabling it will probably be the right thing to do.
200
201				To get the most out of keep alives, you may want to set up your web server to
202				allow a lot of requests per single connection (i.e MaxKeepAliveRequests on Apache).
203				Apache's default is 100, which should be good.
204
205				When a connection is not closed the spider does not wait the "delay"
206				time when making the next request. In other words, there is no delay in
207				requesting documents while the connection is open.
208
209				Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
210
211				=item delay I<n>
212
213				Get/set the number of seconds to wait between making requests. Default is
214				5 seconds (a very friendly delay).
215
216				=item timeout I<n>
217
218				Get/set the number of seconds to wait before considering the remote
219				server unresponsive. The default is 10.
220
221				=item authn_callback I<code_ref>
222
223				CODE reference to fetch username/password credentials when necessary. See also
224				C<credentials>.
225
226				=item credential_timeout I<n>
227
228				Number of seconds to wait before skipping manual prompt for username/password.
229
230				=item credentials I<user:pass>
231
232				String with C<username>:C<password> pair to be used when prompted by
233				the server.
234
235				=item follow_redirects I<1\|0>
236
237				By default, 3xx responses from the server will be followed when
238				they are on the same hostname. Set to false (0) to not follow
239				redirects.
240
241				=item link_tags
242
243				TODO
244
245				=item remove_leading_dots I<1\|0>
246
247				Microsoft server hack.
248
249				=item same_hosts I<array_ref>
250
251				ARRAY ref of hostnames to be treated as identical to the original
252				host being spidered. By default the spider will not follow
253				links to different hosts.
254
255				=back
256
257				=head2 BUILD
258
259				Initializes a new spider object. Called by new().
260
261				=cut
262
263				sub BUILD {
264				my $self = shift;
265
266				$self->{_auth_cache} = Dezi::Cache->new; # ALWAYS inmemory cache
267
268				$self->{ua}
269				\|\|= Dezi::Aggregator::Spider::UA->new( $self->agent, $self->email, );
270
271				$self->{ua}
272				->set_link_tags( { map { lc($_) => 1 } @{ $self->{link_tags} } } );
273
274				# we handle our own delay
275				$self->{ua}->delay(0);
276
277				$self->{ua}->timeout( $self->timeout );
278
279				# TODO we test this using HEAD request. Set here too?
280				#$self->{ua}->max_size( $self->{max_size} ) if $self->{max_size};
281
282				if ( $self->use_cookies ) {
283				$self->{ua}->cookie_jar( HTTP::Cookies->new() );
284				}
285				if ( $self->keep_alive ) {
286				if ( $self->{ua}->can('conn_cache') ) {
287				$self->{ua}
288				->conn_cache( { total_capacity => $self->keep_alive } );
289				}
290				else {
291				warn
292				"can't use keep-alive: conn_cache() method not available on ua "
293				. ref( $self->{ua} );
294				}
295				}
296
297				$self->{_current_depth} = 1;
298
299				$self->{same_host_lookup} = { map { $_ => 1 } @{ $self->{same_hosts} } };
300
301				if ( $self->use_md5 ) {
302				Class::Load::load_class('Digest::MD5');
303				}
304
305				# if Dezi::Indexer::Config defined, use that for some items
306				if ( $self->indexer and $self->indexer->config ) {
307				if ( $self->indexer->config->FileRules && !$self->file_rules ) {
308				$self->file_rules(
309				File::Rules->new( $self->indexer->config->FileRules ) );
310				}
311				}
312
313				# from spider.pl. not sure if we need it or not.
314				# Lame Microsoft
315				$URI::ABS_REMOTE_LEADING_DOTS = $self->remove_leading_dots;
316
317				return $self;
318				}
319
320				=head2 uri_ok( I<uri> )
321
322				Returns true if I<uri> is acceptable for including in an index.
323				The 'ok-ness' of the I<uri> is based on its base, robot rules,
324				and the spider configuration.
325
326				=cut
327
328				sub uri_ok {
329				my $self = shift;
330				my $uri = shift or croak "URI required";
331				my $str = $uri->canonical->as_string;
332				$str =~ s/#.*//; # target anchors create noise
333
334				if ( $self->verbose > 1 \|\| $self->debug ) {
335				$self->write_log_line();
336				$self->write_log(
337				uri => $uri,
338				msg => "checking if ok",
339				);
340				}
341
342				if ( $uri->scheme !~ m,^http, ) {
343				$self->debug and $self->write_log(
344				uri => $uri,
345				msg => "skipping, unsupported scheme"
346				);
347				return 0;
348				}
349
350				# check if we're on the same host.
351				if ( $uri->rel( $self->{_base} ) eq $uri ) {
352
353				# not on this host. check our aliases
354				if ( !exists $self->{same_host_lookup}
355				->{ $uri->canonical->authority \|\| '' } )
356				{
357				my $host = $uri->canonical->authority;
358				$self->debug
359				and $self->write_log(
360				uri => $uri,
361				msg => "skipping, different host $host",
362				);
363				return 0;
364				}
365
366				# in same host lookup, so proceed.
367				}
368
369				my $path = $uri->path;
370				my $swish3 = $self->indexer ? $self->indexer->swish3 : undef;
371				my $mime = $UTILS->get_mime( $path, $swish3 );
372
373				if ( !$UTILS->get_parser_for_mime( $mime, $swish3 ) ) {
374				$self->debug and $self->write_log(
375				uri => $uri,
376				msg => "skipping, no parser for $mime",
377				);
378				return 0;
379				}
380
381				# check regex
382				if ( $self->file_rules ) {
383
384				if ( $self->_apply_file_rules( $uri->path_query, $self->file_rules )
385				&& !$self->_apply_file_match( $uri->path_query,
386				$self->file_rules ) )
387				{
388				$self->debug and $self->write_log(
389				uri => $uri,
390				msg => "skipping, matched file_rules",
391				);
392				return 0;
393				}
394				}
395
396				# head request to check max_size and modified_since
397				if ( $self->max_size or $self->modified_since ) {
398				my %head_args = (
399				uri => $uri,
400				delay => 0, # assume each get() applies the delay
401				debug => $self->debug,
402				verbose => $self->verbose,
403				);
404
405				if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) {
406				$head_args{user} = $user;
407				$head_args{pass} = $pass;
408				}
409				my $resp = $self->ua->head(%head_args);
410
411				# early abort if resource doesn't exist
412				if ( $resp->status == 404 ) {
413				$self->debug
414				and $self->write_log(
415				uri => $uri,
416				msg => "skipping, 404 not found",
417				);
418				return 0;
419				}
420
421				# redirect? assume ok now and _make_request will check on it later.
422				if ( $resp->is_redirect ) {
423				$self->debug
424				and $self->write_log(
425				uri => $uri,
426				msg => "deferring, is_redirect",
427				);
428				return 1;
429				}
430
431				my $last_mod = $resp->last_modified;
432				if ( $last_mod
433				and $self->modified_since
434				and $self->modified_since > $last_mod )
435				{
436				$self->debug
437				and $self->write_log(
438				uri => $uri,
439				msg => sprintf(
440				"skipping, last modified %s (%s < %s)",
441				$resp->header('last-modified'), $last_mod,
442				$self->modified_since
443				),
444				);
445				return 0;
446				}
447
448				if ( $resp->content_length and $self->max_size ) {
449				if ( $resp->content_length > $self->max_size ) {
450				$self->debug
451				and $self->write_log(
452				uri => $uri,
453				msg => sprintf( "skipping, %s > max_size",
454				$resp->content_length ),
455				);
456				return 0;
457				}
458				}
459
460				}
461
462				( $self->verbose > 1 \|\| $self->debug ) and $self->write_log(
463				uri => $uri,
464				msg => "ok",
465				);
466				return 1;
467				}
468
469				sub _add_links {
470				my ( $self, $parent, @links ) = @_;
471
472				# calc depth
473				if ( !$self->{_parent} \|\| $self->{_parent} ne $parent ) {
474				$self->{_current_depth}++;
475				}
476
477				$self->{_parent} \|\|= $parent; # first time.
478
479				$self->debug and $self->write_log(
480				uri => $parent,
481				msg => sprintf( 'evaluating %s links', scalar(@links) ),
482				);
483
484				for my $l (@links) {
485				my $uri = $l->abs( $self->{_base} ) or next;
486				$uri = $uri->canonical; # normalize
487				if ( $self->uri_cache->has("$uri") ) {
488				$self->debug and $self->write_log(
489				uri => $uri,
490				msg => "skipping, already checked",
491				);
492				next;
493				}
494				$self->uri_cache->add( "$uri" => $self->{_current_depth} );
495
496				if ( $self->uri_ok($uri) ) {
497				$self->add_to_queue($uri);
498				}
499				}
500				}
501
502				# ported from spider.pl
503				# Do we need to authorize? If so, ask for password and request again.
504				# First we try using any cached value
505				# Then we try using the get_password callback
506				# Then we ask.
507
508				sub _authorize {
509				my ( $self, $uri, $response ) = @_;
510
511				delete $self->{last_auth}; # since we know that doesn't work
512
513				if ( $response->header('WWW-Authenticate')
514				&& $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i )
515				{
516				my $realm = $1;
517				my $user_pass;
518
519				# Do we have a cached user/pass for this realm?
520				# only each URI only once
521				unless ( $self->{_request}->{auth}->{$uri}++ ) {
522				my $key = $uri->canonical->host_port . ':' . $realm;
523
524				if ( $user_pass = $self->{_auth_cache}->get($key) ) {
525
526				# If we didn't just try it, try again
527				unless ( $uri->userinfo && $user_pass eq $uri->userinfo ) {
528
529				# add the user/pass to the URI
530				$uri->userinfo($user_pass);
531
532				#warn " >> set userinfo via _auth_cache\n" if $self->debug;
533				return 1;
534				}
535				else {
536				# we've tried this before
537				#warn "tried $user_pass before";
538				return 0;
539				}
540				}
541				}
542
543				# now check for a callback password (if $user_pass not set)
544				unless ( $user_pass \|\| $self->{_request}->{auth}->{callback}++ ) {
545
546				# Check for a callback function
547				if ( $self->{authn_callback}
548				and ref $self->{authn_callback} eq 'CODE' )
549				{
550				$user_pass = $self->{authn_callback}
551				->( $self, $uri, $response, $realm );
552				$uri->userinfo($user_pass);
553
554				#warn " >> set userinfo via authn_callback\n" if $self->debug;
555				return 1;
556				}
557				}
558
559				# otherwise, prompt (over and over)
560				if ( !$user_pass ) {
561				$user_pass = $self->_get_basic_credentials( $uri, $realm );
562				}
563
564				if ($user_pass) {
565				$uri->userinfo($user_pass);
566				$self->{cur_realm} = $realm; # save so we can cache if it's valid
567				return 1;
568				}
569				}
570
571				return 0;
572
573				}
574
575				# From spider.pl
576				sub _get_basic_credentials {
577				my ( $self, $uri, $realm ) = @_;
578
579				# Exists but undefined means don't ask.
580				return
581				if exists $self->{credential_timeout}
582				&& !defined $self->{credential_timeout};
583
584				my $netloc = $uri->canonical->host_port;
585
586				my ( $user, $password );
587
588				eval {
589				local $SIG{ALRM} = sub { die "timed out\n" };
590
591				# a zero timeout means don't time out
592				alarm( $self->{credential_timeout} ) unless $^O =~ /Win32/i;
593
594				if ( $uri->userinfo ) {
595				print STDERR "\nSorry: invalid username/password\n";
596				$uri->userinfo(undef);
597				}
598
599				print STDERR
600				"Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: ";
601				$user = <STDIN>;
602				chomp($user) if $user;
603				die "No Username specified\n" unless length $user;
604
605				alarm( $self->{credential_timeout} ) unless $^O =~ /Win32/i;
606
607				print STDERR "Password: ";
608				system("stty -echo");
609				$password = <STDIN>;
610				system("stty echo");
611				print STDERR "\n"; # because we disabled echo
612				chomp($password);
613				alarm(0) unless $^O =~ /Win32/i;
614				};
615
616				alarm(0) unless $^O =~ /Win32/i;
617
618				return if $@;
619
620				return join ':', $user, $password;
621
622				}
623
624				=head2 add_to_queue( I<uri> )
625
626				Add I<uri> to the queue.
627
628				=cut
629
630				sub add_to_queue {
631				my $self = shift;
632				my $uri = shift or croak "uri required";
633				return $self->queue->put($uri);
634				}
635
636				=head2 next_from_queue
637
638				Return next I<uri> from queue.
639
640				=cut
641
642				sub next_from_queue {
643				my $self = shift;
644				return $self->queue->get();
645				}
646
647				=head2 left_in_queue
648
649				Returns queue()->size().
650
651				=cut
652
653				sub left_in_queue {
654				return shift->queue->size();
655				}
656
657				=head2 remove_from_queue( I<uri> )
658
659				Calls queue()->remove(I<uri>).
660
661				=cut
662
663				sub remove_from_queue {
664				my $self = shift;
665				my $uri = shift or croak "uri required";
666				return $self->queue->remove($uri);
667				}
668
669				=head2 get_doc
670
671				Returns the next URI from the queue() as a Dezi::Indexer::Doc object,
672				or the error message if there was one.
673
674				Returns undef if the queue is empty or max_depth() has been reached.
675
676				=cut
677
678				sub get_doc {
679				my $self = shift;
680
681				# return unless we have something in the queue
682				return unless $self->left_in_queue();
683
684				# pop the queue and make it a URI
685				my $uri = $self->next_from_queue();
686				my $depth = $self->uri_cache->get("$uri");
687
688				$self->debug
689				and $self->write_log(
690				uri => $uri,
691				msg => sprintf(
692				"depth:%d max_depth:%s",
693				$depth, ( $self->max_depth \|\| 'undef' )
694				),
695				);
696
697				return if defined $self->max_depth && $depth > $self->max_depth;
698
699				$self->{_cur_depth} = $depth;
700
701				my $doc = $self->_make_request($uri);
702
703				if ($doc) {
704				$self->remove_from_queue($uri);
705				}
706
707				return $doc;
708				}
709
710				=head2 get_authorized_doc( I<uri>, I<response> )
711
712				Called internally when the server returns a 401 or 403 response.
713				Will attempt to determine the correct credentials for I<uri>
714				based on the previous attempt in I<response> and what you
715				have configured in B<credentials>, B<authn_callback> or when
716				manually prompted.
717
718				=cut
719
720				sub get_authorized_doc {
721				my $self = shift;
722				my $uri = shift or croak "uri required";
723				my $response = shift or croak "response required";
724
725				# set up credentials
726				$self->_authorize( $uri, $response->http_response ) or return;
727
728				return $self->_make_request($uri);
729				}
730
731				sub _make_request {
732				my ( $self, $uri ) = @_;
733
734				# get our useragent
735				my $ua = $self->ua;
736				my $delay = 0;
737				if ( $self->{keep_alive} ) {
738				$delay = 0;
739				}
740				elsif ( !$self->{delay} or !$self->{_last_response_time} ) {
741				$delay = 0;
742				}
743				else {
744				my $elapsed = time() - $self->{_last_response_time};
745				$delay = $self->{delay} - $elapsed;
746				$delay = 0 if $delay < 0;
747				$self->debug
748				and $self->write_log(
749				uri => $uri,
750				msg => "elapsed:$elapsed delay:$delay",
751				);
752				}
753
754				$self->write_log(
755				uri => $uri,
756				msg => "GET delay:$delay",
757				) if $self->verbose;
758
759				my %get_args = (
760				uri => $uri,
761				delay => $delay,
762				debug => $self->debug,
763				verbose => $self->verbose,
764				);
765
766				if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) {
767				$get_args{user} = $user;
768				$get_args{pass} = $pass;
769				}
770
771				# fetch the uri. $ua handles delay internally.
772				my $response = $ua->get(%get_args);
773				my $http_response = $response->http_response;
774
775				# flag current time for next delay calc.
776				$self->{_last_response_time} = time();
777
778				# redirect? follow, conditionally.
779				if ( $response->is_redirect ) {
780				my $location = $response->header('location');
781				if ( !$location ) {
782				$self->write_log(
783				uri => $uri,
784				msg => "skipping, redirect without a Location header",
785				);
786				return $response->status;
787				}
788				$self->debug
789				and $self->write_log(
790				uri => $uri,
791				msg => "redirect: $location",
792				);
793				if ( $self->follow_redirects ) {
794				$self->_add_links( $uri,
795				URI->new_abs( $location, $http_response->base ) );
796				}
797				return $response->status;
798				}
799
800				if ( $response->ct ) {
801				$self->debug and $self->write_log(
802				uri => $uri,
803				msg => 'content-type: ' . $response->ct,
804				);
805				}
806
807				# add its links to the queue.
808				# If the resource looks like an XML feed of some kind,
809				# glean its links differently than if it is an HTML response.
810				if ( my $feed = $self->looks_like_feed($http_response) ) {
811				$self->debug and $self->write_log(
812				uri => $uri,
813				msg => 'looks like feed'
814				);
815				my @links;
816				for my $entry ( $feed->entries ) {
817				push @links, URI->new( $entry->link );
818				}
819				$self->_add_links( $uri, @links );
820
821				# we don't want the feed content, we want the links.
822				# TODO make this optional
823				return $response->status;
824				}
825				elsif ( my $sitemap = $self->looks_like_sitemap($http_response) ) {
826				$self->debug and $self->write_log(
827				uri => $uri,
828				msg => 'looks like sitemap',
829				);
830				my @links;
831				for my $url ( $sitemap->urls ) {
832				push @links, URI->new( $url->loc );
833				}
834				$self->_add_links( $uri, @links );
835
836				# we don't want the feed content, we want the links.
837				# TODO make this optional
838				return $response->status;
839				}
840				else {
841				$self->_add_links( $uri, $response->links );
842				}
843
844				# return $uri as a Doc object
845				my $use_uri = $response->success ? $ua->uri : $uri;
846				my $meta = {
847				org_uri => $uri,
848				ret_uri => ( $use_uri \|\| $uri ),
849				depth => delete $self->{_cur_depth},
850				status => $response->status,
851				success => $response->success,
852				is_html => $response->is_html,
853				title => (
854				$response->success
855				? ( $response->is_html
856				? ( $response->title \|\| "No title: $use_uri" )
857				: $use_uri
858				)
859				: "Failed: $use_uri"
860				),
861				ct => ( $response->success ? $response->ct : "Unknown" ),
862				};
863
864				my $headers = $http_response->headers;
865				my $buf = $response->content;
866
867				if ( $self->{use_md5} ) {
868				my $fingerprint = $response->header('Content-MD5')
869				\|\| Digest::MD5::md5_base64($buf);
870				if ( $self->md5_cache->has($fingerprint) ) {
871				return "duplicate content for "
872				. $self->md5_cache->get($fingerprint);
873				}
874				$self->md5_cache->add( $fingerprint => $uri );
875				}
876
877				if ( $response->success ) {
878
879				my $content_type = $meta->{ct};
880				my $swish3 = $self->indexer ? $self->indexer->swish3 : undef;
881				if ( !$UTILS->get_parser_for_mime( $content_type, $swish3 ) ) {
882				$self->write_log(
883				uri => $uri,
884				msg => "no parser for $content_type",
885				);
886				}
887				my $charset = $headers->content_type;
888				$charset =~ s/;?$meta->{ct};?//;
889				my $encoding = $headers->content_encoding \|\| $charset;
890				my %doc = (
891				url => $meta->{org_uri},
892				modtime => ( $headers->last_modified \|\| $headers->date ),
893				type => $meta->{ct},
894				content => ( $encoding =~ m/utf-8/i ? to_utf8($buf) : $buf ),
895				size => $headers->content_length \|\| length( pack 'C0a*', $buf ),
896				charset => $encoding,
897				);
898
899				# cache whatever credentials were used so we can re-use
900				if ( $self->{cur_realm} and $uri->userinfo ) {
901				my $key = $uri->canonical->host_port . ':' . $self->{cur_realm};
902				$self->{_auth_cache}->add( $key => $uri->userinfo );
903
904				# not too sure of the best logic here
905				my $path = $uri->path;
906				$path =~ s!/[^/]*$!!;
907				$self->{last_auth} = {
908				path => $path,
909				auth => $uri->userinfo,
910				};
911				}
912
913				# return doc
914				return $self->doc_class->new(%doc);
915
916				}
917				elsif ( $response->status == 401 ) {
918
919				# authorize and try again
920				$self->write_log(
921				uri => $uri,
922				msg => sprintf( "authn denied, retrying, %s",
923				$response->status_line ),
924				);
925				return $self->get_authorized_doc( $uri, $response )
926				\|\| $response->status;
927				}
928				elsif ($response->status == 403
929				&& $http_response->status_line =~ m/robots.txt/ )
930				{
931
932				# ignore
933				$self->write_log(
934				uri => $uri,
935				msg => sprintf( "skipped, %s", $http_response->status_line ),
936				);
937				return $self->get_authorized_doc( $uri, $response )
938				\|\| $response->status;
939				}
940				elsif ( $response->status == 403 ) {
941
942				# authorize and try again
943				$self->write_log(
944				uri => $uri,
945				msg => sprintf( "retrying, %s", $http_response->status_line ),
946				);
947				return $self->get_authorized_doc( $uri, $response );
948				}
949				else {
950
951				$self->write_log(
952				uri => $uri,
953				msg => $http_response->status_line,
954				);
955				return $response->status;
956				}
957
958				return; # never get here.
959				}
960
961				sub _get_user_pass {
962				my $self = shift;
963				my $uri = shift;
964
965				# Set basic auth if defined - use URI specific first, then credentials.
966				# this doesn't track what should have authorization
967				my $last_auth;
968				if ( $self->{last_auth} ) {
969				my $path = $uri->path;
970				$path =~ s!/[^/]*$!!;
971				$last_auth = $self->{last_auth}->{auth}
972				if $self->{last_auth}->{path} eq $path;
973				}
974
975				my ( $user, $pass ) = split /:/,
976				( $last_auth \|\| $uri->userinfo \|\| $self->credentials \|\| '' );
977
978				return ( $user, $pass );
979				}
980
981				=head2 looks_like_feed( I<http_response> )
982
983				Called internally to perform naive heuristics on I<http_response>
984				to determine whether it looks like an XML feed of some kind,
985				rather than a HTML page.
986
987				=cut
988
989				sub looks_like_feed {
990				my $self = shift;
991				my $response = shift or croak "response required";
992				my $headers = $response->headers;
993				my $ct = $headers->content_type;
994				if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) {
995				return 0;
996				}
997				if ( $ct eq 'text/xml'
998				or $ct eq 'application/rss+xml'
999				or $ct eq 'application/rdf+xml'
1000				or $ct eq 'application/atom+xml' )
1001				{
1002				my $xml = $response->decoded_content; # TODO or content()
1003				return XML::Feed->parse( \$xml );
1004				}
1005
1006				return 0;
1007				}
1008
1009				=head2 looks_like_sitemap( I<http_response> )
1010
1011				Called internally to perform naive heuristics on I<http_response>
1012				to determine whether it looks like a XML sitemap feed,
1013				rather than a HTML page.
1014
1015				=cut
1016
1017				sub looks_like_sitemap {
1018				my $self = shift;
1019				my $response = shift or croak "response required";
1020				my $headers = $response->headers;
1021				my $ct = $headers->content_type;
1022				if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) {
1023				return 0;
1024				}
1025				if ( $ct eq 'text/xml'
1026				or $ct eq 'application/xml' )
1027				{
1028				my $xml = $response->decoded_content; # TODO or content()
1029				my $sitemap = WWW::Sitemap::XML->new();
1030				eval { $sitemap->load( string => $xml ); };
1031				if ($@) {
1032				return 0;
1033				}
1034				return $sitemap;
1035				}
1036
1037				return 0;
1038				}
1039
1040				=head2 crawl( I<uri> )
1041
1042				Implements the required crawl() method. Recursively fetches I<uri>
1043				and its child links to a depth set in max_depth().
1044
1045				Will quit after max_files() unless max_files==0.
1046
1047				Will quit after max_time() seconds unless max_time==0.
1048
1049				=cut
1050
1051				sub crawl {
1052				my $self = shift;
1053				my @urls = @_;
1054
1055				my $indexer = $self->indexer; # may be undef
1056
1057				for my $url (@urls) {
1058				my $started = time();
1059				$self->debug and $self->write_log(
1060				uri => $url,
1061				msg => "crawling",
1062				);
1063
1064				my $uri = URI->new($url)->canonical;
1065				$self->uri_cache->add( "$uri" => 1 );
1066				$self->add_to_queue($uri);
1067				$self->{_base} = $uri->as_string;
1068				while ( my $doc = $self->get_doc ) {
1069				$self->debug and $self->write_log_line();
1070				next unless blessed($doc);
1071
1072				# indexer not required
1073				$indexer->process($doc) if $indexer;
1074
1075				$self->_increment_count;
1076
1077				# abort if we've met any max_* conditions
1078				last if $self->max_files and $self->count >= $self->max_files;
1079				last
1080				if $self->max_time
1081				and ( time() - $started ) > $self->max_time;
1082				}
1083				}
1084
1085				return $self->count;
1086				}
1087
1088				=head2 write_log( I<args> )
1089
1090				Passes I<args> to Dezi::Utils::write_log().
1091
1092				=cut
1093
1094				sub write_log {
1095				Dezi::Utils::write_log(@_);
1096				}
1097
1098				=head2 write_log_line([I<char>, I<width>])
1099
1100				Pass through to Dezi::Utils::write_log_line().
1101
1102				=cut
1103
1104				sub write_log_line {
1105				Dezi::Utils::write_log_line(@_);
1106				}
1107
1108				__PACKAGE__->meta->make_immutable;
1109
1110				1;
1111
1112				__END__
1113
1114				=head1 AUTHOR
1115
1116				Peter Karman, E<lt>perl@peknet.comE<gt>
1117
1118				=head1 BUGS
1119
1120				Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through
1121				the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
1122				I will be notified, and then you'll
1123				automatically be notified of progress on your bug as I make changes.
1124
1125				=head1 SUPPORT
1126
1127				You can find documentation for this module with the perldoc command.
1128
1129				perldoc Dezi
1130
1131
1132				You can also look for information at:
1133
1134				=over 4
1135
1136				=item * Mailing list
1137
1138				L<http://lists.swish-e.org/listinfo/users>
1139
1140				=item * RT: CPAN's request tracker
1141
1142				L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>
1143
1144				=item * AnnoCPAN: Annotated CPAN documentation
1145
1146				L<http://annocpan.org/dist/Dezi-App>
1147
1148				=item * CPAN Ratings
1149
1150				L<http://cpanratings.perl.org/d/Dezi-App>
1151
1152				=item * Search CPAN
1153
1154				L<http://search.cpan.org/dist/Dezi-App/>
1155
1156				=back
1157
1158				=head1 COPYRIGHT AND LICENSE
1159
1160				Copyright 2008-2009 by Peter Karman
1161
1162				This library is free software; you can redistribute it and/or modify
1163				it under the same terms as Perl itself.
1164
1165				=head1 SEE ALSO
1166
1167				L<http://swish-e.org/>