| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Dezi::Aggregator::Spider; |
|
2
|
1
|
|
|
1
|
|
1936
|
use Moose; |
|
|
1
|
|
|
|
|
449773
|
|
|
|
1
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
|
|
extends 'Dezi::Aggregator'; |
|
4
|
1
|
|
|
1
|
|
6776
|
use Carp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
108
|
|
|
5
|
1
|
|
|
1
|
|
5
|
use Scalar::Util qw( blessed ); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
47
|
|
|
6
|
1
|
|
|
1
|
|
1171
|
use URI; |
|
|
1
|
|
|
|
|
4923
|
|
|
|
1
|
|
|
|
|
30
|
|
|
7
|
1
|
|
|
1
|
|
10787
|
use HTTP::Cookies; |
|
|
1
|
|
|
|
|
13226
|
|
|
|
1
|
|
|
|
|
40
|
|
|
8
|
1
|
|
|
1
|
|
902
|
use Types::Standard qw( InstanceOf Maybe Int CodeRef Str Bool ArrayRef ); |
|
|
1
|
|
|
|
|
68768
|
|
|
|
1
|
|
|
|
|
23
|
|
|
9
|
1
|
|
|
1
|
|
2022
|
use Dezi::Types qw( DeziFileRules DeziEpoch ); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
11
|
|
|
10
|
1
|
|
|
1
|
|
1220
|
use Dezi::Utils; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Dezi::Queue; |
|
12
|
|
|
|
|
|
|
use Dezi::Cache; |
|
13
|
|
|
|
|
|
|
use Dezi::Aggregator::Spider::UA; |
|
14
|
|
|
|
|
|
|
use Search::Tools::UTF8; |
|
15
|
|
|
|
|
|
|
use XML::Feed; |
|
16
|
|
|
|
|
|
|
use WWW::Sitemap::XML; |
|
17
|
|
|
|
|
|
|
use File::Rules; |
|
18
|
|
|
|
|
|
|
use Class::Load; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# |
|
21
|
|
|
|
|
|
|
# TODO tests for cookies, non-text urls needing filters |
|
22
|
|
|
|
|
|
|
# |
|
23
|
|
|
|
|
|
|
# |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Spider configuration. Every attribute below is a read/write accessor and
# may also be passed to new().

# User-Agent string reported by the user agent.
has agent => (
    is      => 'rw',
    isa     => Str,
    default => 'dezi-spider http://dezi.org/',
);

# CODE ref used to fetch username/password credentials when necessary.
has authn_callback => (
    is  => 'rw',
    isa => CodeRef,
);

# Seconds to wait before skipping the manual username/password prompt.
has credential_timeout => (
    is      => 'rw',
    isa     => Int,
    default => 30,
);

# "username:password" pair to be used when prompted by the server.
has credentials => (
    is  => 'rw',
    isa => Str,
);

# Seconds to wait between requests.
has delay => (
    is      => 'rw',
    isa     => Int,
    default => 5,
);

# Email string reported by the user agent.
has email => (
    is      => 'rw',
    isa     => Str,
    default => 'dezi@user.failed.to.set.email.invalid',
);

# File::Rules object (or something coercible to one) applied in uri_ok().
has file_rules => (
    is     => 'rw',
    isa    => DeziFileRules,
    coerce => 1,
);

has follow_redirects => (
    is      => 'rw',
    isa     => Bool,
    default => 1,
);

has keep_alive => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

# HTML tags whose targets are treated as "links".
# Should be a subset of what HTML::LinkExtor considers links.
has link_tags => (
    is      => 'rw',
    isa     => ArrayRef,
    default => sub { [qw( a frame iframe )] },
);

has max_depth => (
    is  => 'rw',
    isa => Maybe [Int],
);

has max_files => (
    is      => 'rw',
    isa     => Int,
    default => 0,
);

has max_size => (
    is      => 'rw',
    isa     => Int,
    default => 5_000_000,
);

# TODO
has max_time => (
    is  => 'rw',
    isa => Int,
);

# Tracks content fingerprints when use_md5 is true.
has md5_cache => (
    is      => 'rw',
    isa     => InstanceOf ['Dezi::Cache'],
    default => sub { Dezi::Cache->new },
);

has modified_since => (
    is     => 'rw',
    isa    => DeziEpoch,
    coerce => 1,
);

# URIs that still need to be fetched.
has queue => (
    is      => 'rw',
    isa     => InstanceOf ['Dezi::Queue'],
    default => sub { Dezi::Queue->new },
);

has remove_leading_dots => (
    is      => 'rw',
    isa     => Bool,
    default => 1,
);

# Hostnames treated as identical to the host being spidered.
has same_hosts => (
    is      => 'rw',
    isa     => ArrayRef,
    default => sub { [] },
);

has timeout => (
    is      => 'rw',
    isa     => Int,
    default => 30,
);

has ua => (
    is  => 'rw',
    isa => InstanceOf ['LWP::UserAgent'],
);

# URIs that have been seen already.
has uri_cache => (
    is      => 'rw',
    isa     => InstanceOf ['Dezi::Cache'],
    default => sub { Dezi::Cache->new },
);

has use_md5 => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

has use_cookies => (
    is      => 'rw',
    isa     => Bool,
    default => 1,
);

#use LWP::Debug qw(+);

our $VERSION = '0.014';

# shortcut: class name used to call Dezi::Utils class methods
my $UTILS = 'Dezi::Utils';
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=pod |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head1 NAME |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Dezi::Aggregator::Spider - web aggregator |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
use Dezi::Aggregator::Spider; |
|
94
|
|
|
|
|
|
|
my $spider = Dezi::Aggregator::Spider->new( |
|
95
|
|
|
|
|
|
|
indexer => Dezi::Indexer->new |
|
96
|
|
|
|
|
|
|
); |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
$spider->indexer->start; |
|
99
|
|
|
|
|
|
|
$spider->crawl( 'http://swish-e.org/' ); |
|
100
|
|
|
|
|
|
|
$spider->indexer->finish; |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Dezi::Aggregator::Spider is a web crawler similar to |
|
105
|
|
|
|
|
|
|
the spider.pl script in the Swish-e 2.4 distribution. Internally, |
|
106
|
|
|
|
|
|
|
Dezi::Aggregator::Spider uses LWP::RobotUA to do the hard work. |
|
107
|
|
|
|
|
|
|
See L<Dezi::Aggregator::Spider::UA>. |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 METHODS |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
See L<Dezi::Aggregator>. |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 new( I<params> ) |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
All I<params> have their own get/set methods too. They include: |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=over |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=item agent I<string> |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Get/set the user-agent string reported by the user agent. |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item email I<string> |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
Get/set the email string reported by the user agent. |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=item use_md5 I<1|0> |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Flag as to whether each URI's content should be fingerprinted |
|
130
|
|
|
|
|
|
|
and compared. Useful if the same content is available under multiple |
|
131
|
|
|
|
|
|
|
URIs and you only want to index it once. |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item uri_cache I<cache_object> |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Get/set the Dezi::Cache-derived object used to track which URIs have |
|
136
|
|
|
|
|
|
|
been fetched already. |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item md5_cache I<cache_object> |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
If use_md5() is true, this Dezi::Cache-derived object tracks |
|
141
|
|
|
|
|
|
|
the URI fingerprints. |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=item file_rules I<File_Rules_or_ARRAY> |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Apply L<File::Rules> object in uri_ok(). I<File_Rules_or_ARRAY> should |
|
146
|
|
|
|
|
|
|
be a L<File::Rules> object or an array of strings suitable for passing |
|
147
|
|
|
|
|
|
|
to File::Rules->new(). |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item queue I<queue_object> |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Get/set the Dezi::Queue-derived object for tracking which URIs still |
|
152
|
|
|
|
|
|
|
need to be fetched. |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item ua I<lwp_useragent> |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Get/set the Dezi::Aggregator::Spider::UA object. |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=item max_depth I<n> |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
How many levels of links to follow. B<NOTE:> This value describes the number |
|
161
|
|
|
|
|
|
|
of links from the first argument passed to I<crawl>. |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Default is unlimited depth. |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=item max_time I<n> |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
This optional key will set the max minutes to spider. Spidering |
|
168
|
|
|
|
|
|
|
for this host will stop after C<max_time> seconds, and move on to the |
|
169
|
|
|
|
|
|
|
next server, if any. The default is to not limit by time. |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item max_files I<n> |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
This optional key sets the max number of files to spider before aborting. |
|
174
|
|
|
|
|
|
|
The default is to not limit by number of files. This is the number of requests |
|
175
|
|
|
|
|
|
|
made to the remote server, not the total number of files to index (see C<max_indexed>). |
|
176
|
|
|
|
|
|
|
This count is displayed at the end of indexing as C<Unique URLs>. |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
This feature can (and perhaps should) be used when spidering a web site where dynamic |
|
179
|
|
|
|
|
|
|
content may generate unique URLs to prevent run-away spidering. |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=item max_size I<n> |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
This optional key sets the max size of a file read from the web server. |
|
184
|
|
|
|
|
|
|
This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is |
|
185
|
|
|
|
|
|
|
truncated per LWP::UserAgent. |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Set max_size to zero for unlimited size. |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=item modified_since I<date> |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
This optional parameter will skip any URIs that do not report having |
|
192
|
|
|
|
|
|
|
been modified since I<date>. The C<Last-Modified> HTTP header is used to |
|
193
|
|
|
|
|
|
|
determine modification time. |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=item keep_alive I<1|0> |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
This optional parameter will enable keep alive requests. This can dramatically speed |
|
198
|
|
|
|
|
|
|
up spidering and reduce the load on server being spidered. The default is to not use |
|
199
|
|
|
|
|
|
|
keep alives, although enabling it will probably be the right thing to do. |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
To get the most out of keep alives, you may want to set up your web server to |
|
202
|
|
|
|
|
|
|
allow a lot of requests per single connection (e.g. MaxKeepAliveRequests on Apache). |
|
203
|
|
|
|
|
|
|
Apache's default is 100, which should be good. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
When a connection is not closed the spider does not wait the "delay" |
|
206
|
|
|
|
|
|
|
time when making the next request. In other words, there is no delay in |
|
207
|
|
|
|
|
|
|
requesting documents while the connection is open. |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Note: you must have at least libwww-perl-5.53_90 installed to use this feature. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=item delay I<n> |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Get/set the number of seconds to wait between making requests. Default is |
|
214
|
|
|
|
|
|
|
5 seconds (a very friendly delay). |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item timeout I<n> |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Get/set the number of seconds to wait before considering the remote |
|
219
|
|
|
|
|
|
|
server unresponsive. The default is 30. |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item authn_callback I<code_ref> |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
CODE reference to fetch username/password credentials when necessary. See also |
|
224
|
|
|
|
|
|
|
C<credentials>. |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=item credential_timeout I<n> |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
Number of seconds to wait before skipping manual prompt for username/password. |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=item credentials I<user:pass> |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
String with C<username>:C<password> pair to be used when prompted by |
|
233
|
|
|
|
|
|
|
the server. |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=item follow_redirects I<1|0> |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
By default, 3xx responses from the server will be followed when |
|
238
|
|
|
|
|
|
|
they are on the same hostname. Set to false (0) to not follow |
|
239
|
|
|
|
|
|
|
redirects. |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=item link_tags |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
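Get/set the ARRAY ref of HTML tag names whose targets are treated as links. Default is C<a>, C<frame> and C<iframe>. |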
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=item remove_leading_dots I<1|0> |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Microsoft server hack. |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=item same_hosts I<array_ref> |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
ARRAY ref of hostnames to be treated as identical to the original |
|
252
|
|
|
|
|
|
|
host being spidered. By default the spider will not follow |
|
253
|
|
|
|
|
|
|
links to different hosts. |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=back |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head2 BUILD |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Initializes a new spider object. Called by new(). |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Post-construction setup: creates the in-memory auth cache, builds and
# configures the user agent (link tags, delay, timeout, cookies, keep-alive),
# resets the depth counter, builds the same-host lookup table, loads
# Digest::MD5 when use_md5 is set, and inherits FileRules from the indexer
# config when no file_rules were given. Returns the spider object.
sub BUILD {
    my ($self) = @_;

    # ALWAYS inmemory cache
    $self->{_auth_cache} = Dezi::Cache->new;

    # build a default user agent unless one was supplied
    my $ua = $self->{ua}
        ||= Dezi::Aggregator::Spider::UA->new( $self->agent, $self->email );

    # tag names are matched lower-cased
    my %is_link_tag = map { ( lc($_) => 1 ) } @{ $self->{link_tags} };
    $ua->set_link_tags( \%is_link_tag );

    # we handle our own delay
    $ua->delay(0);
    $ua->timeout( $self->timeout );

    # TODO we test this using HEAD request. Set here too?
    #$ua->max_size( $self->{max_size} ) if $self->{max_size};

    $ua->cookie_jar( HTTP::Cookies->new() ) if $self->use_cookies;

    if ( $self->keep_alive ) {
        if ( !$ua->can('conn_cache') ) {
            warn
                "can't use keep-alive: conn_cache() method not available on ua "
                . ref($ua);
        }
        else {
            $ua->conn_cache( { total_capacity => $self->keep_alive } );
        }
    }

    $self->{_current_depth} = 1;

    my %is_same_host = map { ( $_ => 1 ) } @{ $self->{same_hosts} };
    $self->{same_host_lookup} = \%is_same_host;

    Class::Load::load_class('Digest::MD5') if $self->use_md5;

    # if Dezi::Indexer::Config defined, use that for some items
    my $config = $self->indexer && $self->indexer->config;
    if ( $config && $config->FileRules && !$self->file_rules ) {
        $self->file_rules( File::Rules->new( $config->FileRules ) );
    }

    # from spider.pl. not sure if we need it or not.
    # Lame Microsoft
    $URI::ABS_REMOTE_LEADING_DOTS = $self->remove_leading_dots;

    return $self;
}
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=head2 uri_ok( I<uri> ) |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
Returns true if I<uri> is acceptable for including in an index. |
|
323
|
|
|
|
|
|
|
The 'ok-ness' of the I<uri> is based on its base, robot rules, |
|
324
|
|
|
|
|
|
|
and the spider configuration. |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=cut |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# Decide whether $uri should be queued for fetching/indexing.
#
# Checks, in order: the scheme is http or https; the URI is on the base host
# or one of the same_hosts aliases; a parser exists for the MIME type guessed
# from the path; the file_rules do not exclude it; and, when max_size or
# modified_since is set, a HEAD request for 404 / Last-Modified /
# Content-Length. A redirect answer to the HEAD request is accepted here and
# left for the later request to resolve.
#
# Croaks if no URI is passed. Returns 1 when acceptable, 0 otherwise.
sub uri_ok {
    my $self = shift;
    my $uri = shift or croak "URI required";

    if ( $self->verbose > 1 || $self->debug ) {
        $self->write_log_line();
        $self->write_log(
            uri => $uri,
            msg => "checking if ok",
        );
    }

    # Anchored on both ends so that only http and https pass; a URI with no
    # scheme at all is rejected without an "uninitialized" warning.
    my $scheme = $uri->scheme;
    if ( !defined $scheme or $scheme !~ m{\Ahttps?\z} ) {
        $self->debug and $self->write_log(
            uri => $uri,
            msg => "skipping, unsupported scheme"
        );
        return 0;
    }

    # check if we're on the same host.
    if ( $uri->rel( $self->{_base} ) eq $uri ) {

        # not on this host. check our aliases
        my $authority = $uri->canonical->authority;
        if ( !exists $self->{same_host_lookup}->{ $authority || '' } ) {

            # authority may be undefined; keep the log message warning-free
            my $host = defined $authority ? $authority : '';
            $self->debug
                and $self->write_log(
                uri => $uri,
                msg => "skipping, different host $host",
                );
            return 0;
        }

        # in same host lookup, so proceed.
    }

    my $path   = $uri->path;
    my $swish3 = $self->indexer ? $self->indexer->swish3 : undef;
    my $mime   = $UTILS->get_mime( $path, $swish3 );

    if ( !$UTILS->get_parser_for_mime( $mime, $swish3 ) ) {
        $self->debug and $self->write_log(
            uri => $uri,
            msg => "skipping, no parser for $mime",
        );
        return 0;
    }

    # check regex
    if ( $self->file_rules ) {

        if ( $self->_apply_file_rules( $uri->path_query, $self->file_rules )
            && !$self->_apply_file_match( $uri->path_query,
                $self->file_rules ) )
        {
            $self->debug and $self->write_log(
                uri => $uri,
                msg => "skipping, matched file_rules",
            );
            return 0;
        }
    }

    # head request to check max_size and modified_since
    if ( $self->max_size or $self->modified_since ) {
        my %head_args = (
            uri     => $uri,
            delay   => 0,                # assume each get() applies the delay
            debug   => $self->debug,
            verbose => $self->verbose,
        );

        if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) {
            $head_args{user} = $user;
            $head_args{pass} = $pass;
        }
        my $resp = $self->ua->head(%head_args);

        # early abort if resource doesn't exist
        if ( $resp->status == 404 ) {
            $self->debug
                and $self->write_log(
                uri => $uri,
                msg => "skipping, 404 not found",
                );
            return 0;
        }

        # redirect? assume ok now and _make_request will check on it later.
        if ( $resp->is_redirect ) {
            $self->debug
                and $self->write_log(
                uri => $uri,
                msg => "deferring, is_redirect",
                );
            return 1;
        }

        # skip resources not modified since the configured cut-off
        my $last_mod = $resp->last_modified;
        if (    $last_mod
            and $self->modified_since
            and $self->modified_since > $last_mod )
        {
            $self->debug
                and $self->write_log(
                uri => $uri,
                msg => sprintf(
                    "skipping, last modified %s (%s < %s)",
                    $resp->header('last-modified'), $last_mod,
                    $self->modified_since
                ),
                );
            return 0;
        }

        # skip resources the server reports as larger than max_size
        if ( $resp->content_length and $self->max_size ) {
            if ( $resp->content_length > $self->max_size ) {
                $self->debug
                    and $self->write_log(
                    uri => $uri,
                    msg => sprintf( "skipping, %s > max_size",
                        $resp->content_length ),
                    );
                return 0;
            }
        }

    }

    ( $self->verbose > 1 || $self->debug ) and $self->write_log(
        uri => $uri,
        msg => "ok",
    );
    return 1;
}
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
# Evaluate the links found on page $parent: each link is made absolute
# against the crawl base, canonicalized, recorded in uri_cache with the
# current depth, and queued when uri_ok() accepts it. Links already present
# in uri_cache are skipped.
sub _add_links {
    my $self   = shift;
    my $parent = shift;
    my @links  = @_;

    # calc depth: bump it unless $parent is the remembered parent
    my $same_parent = $self->{_parent} && $self->{_parent} eq $parent;
    if ( !$same_parent ) {
        $self->{_current_depth}++;
    }

    # the parent is remembered the first time only
    if ( !$self->{_parent} ) {
        $self->{_parent} = $parent;
    }

    if ( $self->debug ) {
        $self->write_log(
            uri => $parent,
            msg => sprintf( 'evaluating %s links', scalar(@links) ),
        );
    }

LINK:
    for my $link (@links) {
        my $uri = $link->abs( $self->{_base} ) or next LINK;
        $uri = $uri->canonical;    # normalize
        my $cache_key = "$uri";

        if ( $self->uri_cache->has($cache_key) ) {
            if ( $self->debug ) {
                $self->write_log(
                    uri => $uri,
                    msg => "skipping, already checked",
                );
            }
            next LINK;
        }

        # remember the URI even if it turns out not to be ok
        $self->uri_cache->add( $cache_key => $self->{_current_depth} );

        $self->add_to_queue($uri) if $self->uri_ok($uri);
    }
}
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# ported from spider.pl |
|
503
|
|
|
|
|
|
|
# Do we need to authorize? If so, ask for password and request again. |
|
504
|
|
|
|
|
|
|
# First we try using any cached value |
|
505
|
|
|
|
|
|
|
# Then we try using the get_password callback |
|
506
|
|
|
|
|
|
|
# Then we ask. |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
# Try to attach credentials to $uri after $response carried a
# WWW-Authenticate challenge with a realm.
#
# Sources are tried in this order: the per-host/realm auth cache (consulted
# once per URI), the authn_callback (invoked once per request), and finally
# the interactive prompt. Credentials are written into $uri via userinfo().
#
# Returns 1 when credentials were set and the request is worth retrying,
# 0 when there is no realm challenge or no new credentials could be found.
sub _authorize {
    my ( $self, $uri, $response ) = @_;

    delete $self->{last_auth};    # since we know that doesn't work

    my $challenge = $response->header('WWW-Authenticate');
    return 0 unless $challenge && $challenge =~ /realm="([^"]+)"/i;
    my $realm = $1;

    my $user_pass;

    # Do we have a cached user/pass for this realm?
    # Consult the cache only once for each URI.
    unless ( $self->{_request}->{auth}->{$uri}++ ) {
        my $key = $uri->canonical->host_port . ':' . $realm;

        if ( $user_pass = $self->{_auth_cache}->get($key) ) {

            # the cached pair is what was just rejected: give up
            return 0
                if $uri->userinfo && $user_pass eq $uri->userinfo;

            # add the user/pass to the URI
            $uri->userinfo($user_pass);

            #warn " >> set userinfo via _auth_cache\n" if $self->debug;
            return 1;
        }
    }

    # now check for a callback password; the callback is tried only once
    unless ( $self->{_request}->{auth}->{callback}++ ) {
        if ( $self->{authn_callback}
            and ref( $self->{authn_callback} ) eq 'CODE' )
        {
            $user_pass = $self->{authn_callback}
                ->( $self, $uri, $response, $realm );

            # Only report success when the callback actually supplied
            # credentials; otherwise fall through to the prompt instead of
            # triggering a retry that carries no credentials.
            if ($user_pass) {
                $uri->userinfo($user_pass);

                #warn " >> set userinfo via authn_callback\n" if $self->debug;
                return 1;
            }
        }
    }

    # otherwise, prompt (over and over)
    $user_pass = $self->_get_basic_credentials( $uri, $realm );

    if ($user_pass) {
        $uri->userinfo($user_pass);
        $self->{cur_realm} = $realm;    # save so we can cache if it's valid
        return 1;
    }

    return 0;
}
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# From spider.pl |
|
576
|
|
|
|
|
|
|
# Interactively prompt on STDERR/STDIN for a username and password for
# $uri in $realm.
#
# Returns the string "user:password" on success. Returns nothing (undef /
# empty list) when prompting is disabled, when no username is entered, or
# when the prompt times out.
#
# credential_timeout controls the prompt:
#   * key exists but is undef -> never prompt;
#   * 0                       -> prompt without a timeout;
#   * N                       -> give up after N seconds (not on Win32,
#                                where alarm() is not used).
#
# Side effect: if $uri already carries userinfo it is treated as a failed
# attempt and cleared before prompting.
sub _get_basic_credentials {
    my ( $self, $uri, $realm ) = @_;

    # Exists but undefined means don't ask.
    return
        if exists $self->{credential_timeout}
        && !defined $self->{credential_timeout};

    my $is_win32 = $^O =~ /Win32/i;
    my $timeout  = $self->{credential_timeout} || 0;

    my ( $user, $password );

    # Tracks whether terminal echo is currently disabled, so it can be
    # restored if the eval below is left early (timeout, die).
    my $echo_disabled = 0;

    eval {
        local $SIG{ALRM} = sub { die "timed out\n" };

        # a zero timeout means don't time out
        alarm($timeout) unless $is_win32;

        if ( $uri->userinfo ) {
            print STDERR "\nSorry: invalid username/password\n";
            $uri->userinfo(undef);
        }

        print STDERR
            "Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: ";
        $user = <STDIN>;
        chomp($user) if defined $user;
        die "No Username specified\n" unless defined $user && length $user;

        # restart the clock for the password prompt
        alarm($timeout) unless $is_win32;

        print STDERR "Password: ";
        unless ($is_win32) {
            system("stty -echo");
            $echo_disabled = 1;
        }
        $password = <STDIN>;
        unless ($is_win32) {
            system("stty echo");
            $echo_disabled = 0;
        }
        print STDERR "\n";    # because we disabled echo
        chomp($password) if defined $password;
        alarm(0) unless $is_win32;
    };
    my $error = $@;    # save before anything else can clobber it

    alarm(0) unless $is_win32;

    # Never leave the user's terminal in no-echo mode.
    if ($echo_disabled) {
        system("stty echo");
        print STDERR "\n";
    }

    return if $error;

    # EOF on the password prompt is treated as an empty password.
    $password = '' unless defined $password;

    return join ':', $user, $password;
}
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=head2 add_to_queue( I<uri> ) |
|
625
|
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
Add I<uri> to the queue. |
|
627
|
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
=cut |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# Hand $uri to the queue object and return whatever its put() returns.
# Croaks when no (true) uri is supplied.
sub add_to_queue {
    my ( $self, $uri ) = @_;
    croak "uri required" unless $uri;

    my $queue = $self->queue;
    return $queue->put($uri);
}
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=head2 next_from_queue |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
Return next I<uri> from queue. |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=cut |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# Ask the queue object for its next entry and return it unchanged.
sub next_from_queue {
    my ($self) = @_;
    my $queue = $self->queue;
    return $queue->get();
}
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
=head2 left_in_queue |
|
648
|
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
Returns queue()->size(). |
|
650
|
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=cut |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# Report the queue object's size().
sub left_in_queue {
    my ($self) = @_;
    return $self->queue->size();
}
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
=head2 remove_from_queue( I<uri> ) |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
Calls queue()->remove(I<uri>). |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=cut |
|
662
|
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
# Ask the queue object to remove $uri and return whatever its remove()
# returns. Croaks when no (true) uri is supplied.
sub remove_from_queue {
    my ( $self, $uri ) = @_;
    croak "uri required" unless $uri;

    my $queue = $self->queue;
    return $queue->remove($uri);
}
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=head2 get_doc |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
Returns the next URI from the queue() as a Dezi::Indexer::Doc object, |
|
672
|
|
|
|
|
|
|
or the error message if there was one. |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
Returns undef if the queue is empty or max_depth() has been reached. |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=cut |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# Take the next URI off the queue, fetch it and return the result of
# _make_request() (a doc object, or a non-object status/message).
#
# Returns nothing when the queue is empty, or when the depth recorded for
# the URI in uri_cache() is greater than a defined max_depth().
#
# On a true result the URI is removed from the queue.
sub get_doc {
    my $self = shift;

    # return unless we have something in the queue
    return unless $self->left_in_queue();

    # take the next URI and look up the depth recorded for it
    my $uri       = $self->next_from_queue();
    my $depth     = $self->uri_cache->get("$uri");
    my $max_depth = $self->max_depth;

    if ( $self->debug ) {

        # Use defined(), not truthiness, so a max_depth of 0 is logged as 0
        # (it is a real limit for the check below), not as 'undef'.
        $self->write_log(
            uri => $uri,
            msg => sprintf(
                "depth:%d max_depth:%s",
                $depth, ( defined $max_depth ? $max_depth : 'undef' )
            ),
        );
    }

    return if defined $max_depth && $depth > $max_depth;

    # remembered so _make_request() can pick it up
    $self->{_cur_depth} = $depth;

    my $doc = $self->_make_request($uri);

    if ($doc) {
        $self->remove_from_queue($uri);
    }

    return $doc;
}
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head2 get_authorized_doc( I<uri>, I<response> ) |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
Called internally when the server returns a 401 or 403 response. |
|
713
|
|
|
|
|
|
|
Will attempt to determine the correct credentials for I<uri> |
|
714
|
|
|
|
|
|
|
based on the previous attempt in I<response> and what you |
|
715
|
|
|
|
|
|
|
have configured in B<credentials>, B<authn_callback> or when |
|
716
|
|
|
|
|
|
|
manually prompted. |
|
717
|
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
=cut |
|
719
|
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
# Retry $uri after a failed, authorization-related $response.
#
# _authorize() is given the URI and the response's underlying
# http_response; when it reports success the request is made again and its
# result returned, otherwise nothing is returned. Croaks when either
# argument is missing.
sub get_authorized_doc {
    my ( $self, $uri, $response ) = @_;
    croak "uri required"      unless $uri;
    croak "response required" unless $response;

    # set up credentials; bail out if none could be worked out
    my $authorized = $self->_authorize( $uri, $response->http_response );
    return unless $authorized;

    return $self->_make_request($uri);
}
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
# Fetch $uri with the spider's user agent and turn the response into a
# value for get_doc().
#
# Returns one of:
#   * a doc_class() object, for a successful response;
#   * the response status, for redirects, feeds, sitemaps and failures
#     (including 401/403 when re-authorization does not produce a result);
#   * the string "duplicate content for <uri>" when use_md5 is set and the
#     body fingerprint is already in md5_cache().
#
# Side effects: records _last_response_time, passes discovered links to
# _add_links(), consumes $self->{_cur_depth}, and on success with a known
# realm caches the credentials in _auth_cache and last_auth.
sub _make_request {
    my ( $self, $uri ) = @_;

    # get our useragent
    my $ua = $self->ua;

    # Only throttle when keep-alive is off, a delay is configured and a
    # previous response time is known; then wait for what is left of the
    # delay window since the last response.
    my $delay = 0;
    if (   !$self->{keep_alive}
        and $self->{delay}
        and $self->{_last_response_time} )
    {
        my $elapsed = time() - $self->{_last_response_time};
        $delay = $self->{delay} - $elapsed;
        $delay = 0 if $delay < 0;
        $self->debug
            and $self->write_log(
            uri => $uri,
            msg => "elapsed:$elapsed delay:$delay",
            );
    }

    $self->write_log(
        uri => $uri,
        msg => "GET delay:$delay",
    ) if $self->verbose;

    my %get_args = (
        uri     => $uri,
        delay   => $delay,
        debug   => $self->debug,
        verbose => $self->verbose,
    );

    if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) {
        $get_args{user} = $user;
        $get_args{pass} = $pass;
    }

    # fetch the uri. $ua handles delay internally.
    my $response      = $ua->get(%get_args);
    my $http_response = $response->http_response;

    # flag current time for next delay calc.
    $self->{_last_response_time} = time();

    # redirect? follow, conditionally.
    if ( $response->is_redirect ) {
        my $location = $response->header('location');
        if ( !$location ) {
            $self->write_log(
                uri => $uri,
                msg => "skipping, redirect without a Location header",
            );
            return $response->status;
        }
        $self->debug
            and $self->write_log(
            uri => $uri,
            msg => "redirect: $location",
            );
        if ( $self->follow_redirects ) {
            $self->_add_links( $uri,
                URI->new_abs( $location, $http_response->base ) );
        }
        return $response->status;
    }

    if ( $response->ct ) {
        $self->debug and $self->write_log(
            uri => $uri,
            msg => 'content-type: ' . $response->ct,
        );
    }

    # add its links to the queue.
    # If the resource looks like an XML feed of some kind,
    # glean its links differently than if it is an HTML response.
    if ( my $feed = $self->looks_like_feed($http_response) ) {
        $self->debug and $self->write_log(
            uri => $uri,
            msg => 'looks like feed'
        );
        my @links;
        for my $entry ( $feed->entries ) {
            push @links, URI->new( $entry->link );
        }
        $self->_add_links( $uri, @links );

        # we don't want the feed content, we want the links.
        # TODO make this optional
        return $response->status;
    }
    elsif ( my $sitemap = $self->looks_like_sitemap($http_response) ) {
        $self->debug and $self->write_log(
            uri => $uri,
            msg => 'looks like sitemap',
        );
        my @links;
        for my $url ( $sitemap->urls ) {
            push @links, URI->new( $url->loc );
        }
        $self->_add_links( $uri, @links );

        # we don't want the feed content, we want the links.
        # TODO make this optional
        return $response->status;
    }
    else {
        $self->_add_links( $uri, $response->links );
    }

    # return $uri as a Doc object
    my $use_uri = $response->success ? $ua->uri : $uri;
    my $meta    = {
        org_uri => $uri,
        ret_uri => ( $use_uri || $uri ),
        depth   => delete $self->{_cur_depth},
        status  => $response->status,
        success => $response->success,
        is_html => $response->is_html,
        title   => (
            $response->success
            ? ( $response->is_html
                ? ( $response->title || "No title: $use_uri" )
                : $use_uri
                )
            : "Failed: $use_uri"
        ),
        ct => ( $response->success ? $response->ct : "Unknown" ),
    };

    my $headers = $http_response->headers;
    my $buf     = $response->content;

    if ( $self->{use_md5} ) {

        # loaded on demand; this sub is the only user of Digest::MD5 here
        require Digest::MD5;
        my $fingerprint = $response->header('Content-MD5')
            || Digest::MD5::md5_base64($buf);
        if ( $self->md5_cache->has($fingerprint) ) {
            return "duplicate content for "
                . $self->md5_cache->get($fingerprint);
        }
        $self->md5_cache->add( $fingerprint => $uri );
    }

    if ( $response->success ) {

        my $content_type = $meta->{ct};
        my $swish3 = $self->indexer ? $self->indexer->swish3 : undef;
        if ( !$UTILS->get_parser_for_mime( $content_type, $swish3 ) ) {
            $self->write_log(
                uri => $uri,
                msg => "no parser for $content_type",
            );
        }

        # Strip the media type from the header value to leave the charset
        # part. \Q..\E keeps characters such as '+' in
        # "application/xhtml+xml" from acting as regex metacharacters.
        my $charset = $headers->content_type;
        $charset =~ s/;?\Q$content_type\E;?//;
        my $encoding = $headers->content_encoding || $charset;
        my %doc      = (
            url     => $meta->{org_uri},
            modtime => ( $headers->last_modified || $headers->date ),
            type    => $meta->{ct},
            content => ( $encoding =~ m/utf-8/i ? to_utf8($buf) : $buf ),
            size => $headers->content_length || length( pack 'C0a*', $buf ),
            charset => $encoding,
        );

        # cache whatever credentials were used so we can re-use
        if ( $self->{cur_realm} and $uri->userinfo ) {
            my $key = $uri->canonical->host_port . ':' . $self->{cur_realm};
            $self->{_auth_cache}->add( $key => $uri->userinfo );

            # not too sure of the best logic here
            my $path = $uri->path;
            $path =~ s!/[^/]*$!!;
            $self->{last_auth} = {
                path => $path,
                auth => $uri->userinfo,
            };
        }

        # return doc
        return $self->doc_class->new(%doc);

    }
    elsif ( $response->status == 401 ) {

        # authorize and try again
        $self->write_log(
            uri => $uri,
            msg => sprintf( "authn denied, retrying, %s",
                $response->status_line ),
        );
        return $self->get_authorized_doc( $uri, $response )
            || $response->status;
    }
    elsif ($response->status == 403
        && $http_response->status_line =~ m/robots.txt/ )
    {

        # Blocked by robots.txt: credentials cannot help, so skip the URI
        # (as the log message says) instead of retrying with authorization.
        $self->write_log(
            uri => $uri,
            msg => sprintf( "skipped, %s", $http_response->status_line ),
        );
        return $response->status;
    }
    elsif ( $response->status == 403 ) {

        # authorize and try again; fall back to the status, as the 401
        # branch does, so a failed retry does not return a false value and
        # end the crawl() loop.
        $self->write_log(
            uri => $uri,
            msg => sprintf( "retrying, %s", $http_response->status_line ),
        );
        return $self->get_authorized_doc( $uri, $response )
            || $response->status;
    }
    else {

        $self->write_log(
            uri => $uri,
            msg => $http_response->status_line,
        );
        return $response->status;
    }

    return;    # never get here.
}
|
960
|
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
# Work out which basic-auth credentials to send for $uri.
#
# The first true value of these is used:
#   1. the last credentials recorded in $self->{last_auth}, when $uri lies
#      in the same directory path as the URI they were recorded for;
#   2. the userinfo embedded in $uri;
#   3. the credentials() attribute.
#
# Returns the two-element list ( $user, $pass ); both are undef when no
# credentials are available. The "user:pass" string is split at the FIRST
# colon only, so a password may contain colons and an empty password comes
# back as '' rather than undef.
#
# NOTE: this doesn't track what should have authorization.
sub _get_user_pass {
    my ( $self, $uri ) = @_;

    my $last_auth;
    if ( $self->{last_auth} ) {

        # compare directory parts: drop the final path segment
        my $path = $uri->path;
        $path =~ s!/[^/]*$!!;
        $last_auth = $self->{last_auth}->{auth}
            if $self->{last_auth}->{path} eq $path;
    }

    my $user_pass
        = $last_auth || $uri->userinfo || $self->credentials || '';

    # limit of 2: everything after the first colon is the password
    my ( $user, $pass ) = split /:/, $user_pass, 2;

    return ( $user, $pass );
}
|
980
|
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
=head2 looks_like_feed( I<http_response> ) |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
Called internally to perform naive heuristics on I<http_response> |
|
984
|
|
|
|
|
|
|
to determine whether it looks like an XML feed of some kind, |
|
985
|
|
|
|
|
|
|
rather than an HTML page. |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
=cut |
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
# Decide, from the Content-Type header, whether $response might be an
# RSS/RDF/Atom feed and, if so, try to parse it with XML::Feed.
#
# Returns the parsed feed object, or 0 when the content type is HTML, is
# not one of the feed types below, or the body cannot be parsed as a feed.
# Croaks when no response is given.
sub looks_like_feed {
    my $self     = shift;
    my $response = shift or croak "response required";
    my $ct       = $response->headers->content_type;

    return 0 if $ct eq 'text/html' or $ct eq 'application/xhtml+xml';

    my %is_feed_type = map { $_ => 1 } qw(
        text/xml
        application/rss+xml
        application/rdf+xml
        application/atom+xml
    );
    return 0 unless $is_feed_type{$ct};

    my $xml = $response->decoded_content;    # TODO or content()

    # Trap parser exceptions, as looks_like_sitemap() does, so that one
    # malformed document is reported as "not a feed" rather than
    # propagating out of the crawl.
    my $feed = eval { XML::Feed->parse( \$xml ) };
    return 0 if $@;

    return $feed || 0;
}
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
=head2 looks_like_sitemap( I<http_response> ) |
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
Called internally to perform naive heuristics on I<http_response> |
|
1012
|
|
|
|
|
|
|
to determine whether it looks like an XML sitemap feed, |
|
1013
|
|
|
|
|
|
|
rather than an HTML page. |
|
1014
|
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
=cut |
|
1016
|
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
# Decide, from the Content-Type header, whether $response might be an XML
# sitemap and, if so, try to load it with WWW::Sitemap::XML.
#
# Returns the loaded sitemap object, or 0 when the content type is HTML, is
# neither text/xml nor application/xml, or loading throws.
# Croaks when no response is given.
sub looks_like_sitemap {
    my ( $self, $response ) = @_;
    croak "response required" unless $response;

    my $ct = $response->headers->content_type;

    return 0 if $ct eq 'text/html' or $ct eq 'application/xhtml+xml';
    return 0 unless $ct eq 'text/xml' or $ct eq 'application/xml';

    my $xml     = $response->decoded_content;    # TODO or content()
    my $sitemap = WWW::Sitemap::XML->new();
    eval { $sitemap->load( string => $xml ); };

    return $@ ? 0 : $sitemap;
}
|
1039
|
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
=head2 crawl( I<uri> ) |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
Implements the required crawl() method. Recursively fetches I<uri> |
|
1043
|
|
|
|
|
|
|
and its child links to a depth set in max_depth(). |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Will quit after max_files() unless max_files==0. |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
Will quit after max_time() seconds unless max_time==0. |
|
1048
|
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=cut |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
# Crawl each of @urls in turn and return the running count().
#
# For every start URL: canonicalize it, record it in uri_cache() with
# depth 1, queue it, remember it as the base, then keep calling get_doc()
# until it returns a false value. Results that are not objects (status
# codes, messages) are skipped; objects are passed to the indexer, if any,
# and counted. The inner loop stops early once max_files() or max_time()
# (both ignored when false) has been reached.
sub crawl {
    my ( $self, @urls ) = @_;

    # indexer not required; may be undef
    my $indexer = $self->indexer;

    for my $url (@urls) {
        my $start_time = time();

        if ( $self->debug ) {
            $self->write_log(
                uri => $url,
                msg => "crawling",
            );
        }

        my $uri = URI->new($url)->canonical;
        $self->uri_cache->add( "$uri" => 1 );
        $self->add_to_queue($uri);
        $self->{_base} = $uri->as_string;

    DOC:
        while ( my $doc = $self->get_doc ) {
            $self->write_log_line() if $self->debug;

            # only real document objects are indexed and counted
            next DOC unless blessed($doc);

            if ($indexer) {
                $indexer->process($doc);
            }

            $self->_increment_count;

            # abort if we've met any max_* conditions
            if ( $self->max_files and $self->count >= $self->max_files ) {
                last DOC;
            }
            if ( $self->max_time
                and ( time() - $start_time ) > $self->max_time )
            {
                last DOC;
            }
        }
    }

    return $self->count;
}
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
=head2 write_log( I<args> ) |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
Passes I<args> to Dezi::Utils::write_log(). |
|
1091
|
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
=cut |
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
# Thin pass-through: forwards every argument (including the invocant) to
# Dezi::Utils::write_log() and returns its result.
sub write_log {
    return Dezi::Utils::write_log(@_);
}
|
1097
|
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
=head2 write_log_line([I<char>, I<width>]) |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
Pass through to Dezi::Utils::write_log_line(). |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=cut |
|
1103
|
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
# Thin pass-through: forwards every argument (including the invocant) to
# Dezi::Utils::write_log_line() and returns its result.
sub write_log_line {
    return Dezi::Utils::write_log_line(@_);
}
|
1107
|
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
|
1109
|
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
1; |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
__END__ |
|
1113
|
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
=head1 AUTHOR |
|
1115
|
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
Peter Karman, E<lt>perl@peknet.comE<gt> |
|
1117
|
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
=head1 BUGS |
|
1119
|
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through |
|
1121
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>. |
|
1122
|
|
|
|
|
|
|
I will be notified, and then you'll |
|
1123
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
=head1 SUPPORT |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
perldoc Dezi |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
You can also look for information at: |
|
1133
|
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
=over 4 |
|
1135
|
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=item * Mailing list |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
L<http://lists.swish-e.org/listinfo/users> |
|
1139
|
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
|
1141
|
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App> |
|
1143
|
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
|
1145
|
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
L<http://annocpan.org/dist/Dezi-App> |
|
1147
|
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
=item * CPAN Ratings |
|
1149
|
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/Dezi-App> |
|
1151
|
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=item * Search CPAN |
|
1153
|
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
L<http://search.cpan.org/dist/Dezi-App/> |
|
1155
|
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=back |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
Copyright 2008-2009 by Peter Karman |
|
1161
|
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
1163
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
1164
|
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
1166
|
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
L<http://swish-e.org/> |