line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Dezi::Aggregator::Spider; |
2
|
1
|
|
|
1
|
|
1936
|
use Moose; |
|
1
|
|
|
|
|
449773
|
|
|
1
|
|
|
|
|
6
|
|
3
|
|
|
|
|
|
|
extends 'Dezi::Aggregator'; |
4
|
1
|
|
|
1
|
|
6776
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
108
|
|
5
|
1
|
|
|
1
|
|
5
|
use Scalar::Util qw( blessed ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
47
|
|
6
|
1
|
|
|
1
|
|
1171
|
use URI; |
|
1
|
|
|
|
|
4923
|
|
|
1
|
|
|
|
|
30
|
|
7
|
1
|
|
|
1
|
|
10787
|
use HTTP::Cookies; |
|
1
|
|
|
|
|
13226
|
|
|
1
|
|
|
|
|
40
|
|
8
|
1
|
|
|
1
|
|
902
|
use Types::Standard qw( InstanceOf Maybe Int CodeRef Str Bool ArrayRef ); |
|
1
|
|
|
|
|
68768
|
|
|
1
|
|
|
|
|
23
|
|
9
|
1
|
|
|
1
|
|
2022
|
use Dezi::Types qw( DeziFileRules DeziEpoch ); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
11
|
|
10
|
1
|
|
|
1
|
|
1220
|
use Dezi::Utils; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Dezi::Queue; |
12
|
|
|
|
|
|
|
use Dezi::Cache; |
13
|
|
|
|
|
|
|
use Dezi::Aggregator::Spider::UA; |
14
|
|
|
|
|
|
|
use Search::Tools::UTF8; |
15
|
|
|
|
|
|
|
use XML::Feed; |
16
|
|
|
|
|
|
|
use WWW::Sitemap::XML; |
17
|
|
|
|
|
|
|
use File::Rules; |
18
|
|
|
|
|
|
|
use Class::Load; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# |
21
|
|
|
|
|
|
|
# TODO tests for cookies, non-text urls needing filters |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
has 'agent' => ( |
26
|
|
|
|
|
|
|
is => 'rw', |
27
|
|
|
|
|
|
|
isa => Str, |
28
|
|
|
|
|
|
|
default => sub {'dezi-spider http://dezi.org/'}, |
29
|
|
|
|
|
|
|
); |
30
|
|
|
|
|
|
|
has 'authn_callback' => ( is => 'rw', isa => CodeRef ); |
31
|
|
|
|
|
|
|
has 'credential_timeout' => ( is => 'rw', isa => Int, default => sub {30} ); |
32
|
|
|
|
|
|
|
has 'credentials' => ( is => 'rw', isa => Str ); |
33
|
|
|
|
|
|
|
has 'delay' => ( is => 'rw', isa => Int, default => sub {5} ); |
34
|
|
|
|
|
|
|
has 'email' => ( |
35
|
|
|
|
|
|
|
is => 'rw', |
36
|
|
|
|
|
|
|
isa => Str, |
37
|
|
|
|
|
|
|
default => sub {'dezi@user.failed.to.set.email.invalid'}, |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
has 'file_rules' => ( is => 'rw', isa => DeziFileRules, coerce => 1, ); |
40
|
|
|
|
|
|
|
has 'follow_redirects' => ( is => 'rw', isa => Bool, default => sub {1} ); |
41
|
|
|
|
|
|
|
has 'keep_alive' => ( is => 'rw', isa => Bool, default => sub {0} ); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# whitelist which HTML tags we consider "links" |
44
|
|
|
|
|
|
|
# should be subset of what HTML::LinkExtor considers links |
45
|
|
|
|
|
|
|
has 'link_tags' => ( |
46
|
|
|
|
|
|
|
is => 'rw', |
47
|
|
|
|
|
|
|
isa => ArrayRef, |
48
|
|
|
|
|
|
|
default => sub { [ 'a', 'frame', 'iframe' ] } |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
has 'max_depth' => ( is => 'rw', isa => Maybe [Int] ); |
52
|
|
|
|
|
|
|
has 'max_files' => ( is => 'rw', isa => Int, default => sub {0} ); |
53
|
|
|
|
|
|
|
has 'max_size' => ( is => 'rw', isa => Int, default => sub {5_000_000} ); |
54
|
|
|
|
|
|
|
has 'max_time' => ( is => 'rw', isa => Int, ); # TODO |
55
|
|
|
|
|
|
|
has 'md5_cache' => ( |
56
|
|
|
|
|
|
|
is => 'rw', |
57
|
|
|
|
|
|
|
isa => InstanceOf ['Dezi::Cache'], |
58
|
|
|
|
|
|
|
default => sub { Dezi::Cache->new } |
59
|
|
|
|
|
|
|
); |
60
|
|
|
|
|
|
|
has 'modified_since' => ( is => 'rw', isa => DeziEpoch, coerce => 1, ); |
61
|
|
|
|
|
|
|
has 'queue' => ( |
62
|
|
|
|
|
|
|
is => 'rw', |
63
|
|
|
|
|
|
|
isa => InstanceOf ['Dezi::Queue'], |
64
|
|
|
|
|
|
|
default => sub { Dezi::Queue->new } |
65
|
|
|
|
|
|
|
); |
66
|
|
|
|
|
|
|
has 'remove_leading_dots' => ( is => 'rw', isa => Bool, default => sub {1} ); |
67
|
|
|
|
|
|
|
has 'same_hosts' => ( is => 'rw', isa => ArrayRef, default => sub { [] } ); |
68
|
|
|
|
|
|
|
has 'timeout' => ( is => 'rw', isa => Int, default => sub {30} ); |
69
|
|
|
|
|
|
|
has 'ua' => ( is => 'rw', isa => InstanceOf ['LWP::UserAgent'] ); |
70
|
|
|
|
|
|
|
has 'uri_cache' => ( |
71
|
|
|
|
|
|
|
is => 'rw', |
72
|
|
|
|
|
|
|
isa => InstanceOf ['Dezi::Cache'], |
73
|
|
|
|
|
|
|
default => sub { Dezi::Cache->new }, |
74
|
|
|
|
|
|
|
); |
75
|
|
|
|
|
|
|
has 'use_md5' => ( is => 'rw', isa => Bool, default => sub {0} ); |
76
|
|
|
|
|
|
|
has 'use_cookies' => ( is => 'rw', isa => Bool, default => sub {1} ); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
#use LWP::Debug qw(+); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
our $VERSION = '0.014'; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# shortcut |
83
|
|
|
|
|
|
|
my $UTILS = 'Dezi::Utils'; |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=pod |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head1 NAME |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Dezi::Aggregator::Spider - web aggregator |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 SYNOPSIS |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
use Dezi::Aggregator::Spider; |
94
|
|
|
|
|
|
|
my $spider = Dezi::Aggregator::Spider->new( |
95
|
|
|
|
|
|
|
indexer => Dezi::Indexer->new |
96
|
|
|
|
|
|
|
); |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
$spider->indexer->start; |
99
|
|
|
|
|
|
|
$spider->crawl( 'http://swish-e.org/' ); |
100
|
|
|
|
|
|
|
$spider->indexer->finish; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head1 DESCRIPTION |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Dezi::Aggregator::Spider is a web crawler similar to |
105
|
|
|
|
|
|
|
the spider.pl script in the Swish-e 2.4 distribution. Internally, |
106
|
|
|
|
|
|
|
Dezi::Aggregator::Spider uses LWP::RobotUA to do the hard work. |
107
|
|
|
|
|
|
|
See L<Dezi::Aggregator::Spider::UA>. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 METHODS |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
See L<Dezi::Aggregator>. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 new( I<params> ) |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
All I<params> have their own get/set methods too. They include: |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=over |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=item agent I<string> |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Get/set the user-agent string reported by the user agent. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item email I<string> |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
Get/set the email string reported by the user agent. |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=item use_md5 I<1|0> |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Flag as to whether each URI's content should be fingerprinted |
130
|
|
|
|
|
|
|
and compared. Useful if the same content is available under multiple |
131
|
|
|
|
|
|
|
URIs and you only want to index it once. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item uri_cache I<cache_object> |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Get/set the Dezi::Cache-derived object used to track which URIs have |
136
|
|
|
|
|
|
|
been fetched already. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item md5_cache I<cache_object> |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
If use_md5() is true, this Dezi::Cache-derived object tracks |
141
|
|
|
|
|
|
|
the URI fingerprints. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=item file_rules I<File_Rules_or_ARRAY> |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Apply L<File::Rules> object in uri_ok(). I<File_Rules_or_ARRAY> should |
146
|
|
|
|
|
|
|
be a L<File::Rules> object or an array of strings suitable to passing |
147
|
|
|
|
|
|
|
to File::Rules->new(). |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item queue I<queue_object> |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Get/set the Dezi::Queue-derived object for tracking which URIs still |
152
|
|
|
|
|
|
|
need to be fetched. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item ua I<lwp_useragent> |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Get/set the Dezi::Aggregagor::Spider::UA object. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=item max_depth I<n> |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
How many levels of links to follow. B<NOTE:> This value describes the number |
161
|
|
|
|
|
|
|
of links from the first argument passed to I<crawl>. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Default is unlimited depth. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=item max_time I<n> |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
This optional key will set the max minutes to spider. Spidering |
168
|
|
|
|
|
|
|
for this host will stop after C<max_time> seconds, and move on to the |
169
|
|
|
|
|
|
|
next server, if any. The default is to not limit by time. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item max_files I<n> |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
This optional key sets the max number of files to spider before aborting. |
174
|
|
|
|
|
|
|
The default is to not limit by number of files. This is the number of requests |
175
|
|
|
|
|
|
|
made to the remote server, not the total number of files to index (see C<max_indexed>). |
176
|
|
|
|
|
|
|
This count is displayted at the end of indexing as C<Unique URLs>. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
This feature can (and perhaps should) be use when spidering a web site where dynamic |
179
|
|
|
|
|
|
|
content may generate unique URLs to prevent run-away spidering. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=item max_size I<n> |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
This optional key sets the max size of a file read from the web server. |
184
|
|
|
|
|
|
|
This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is |
185
|
|
|
|
|
|
|
truncated per LWP::UserAgent. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Set max_size to zero for unlimited size. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=item modified_since I<date> |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
This optional parameter will skip any URIs that do not report having |
192
|
|
|
|
|
|
|
been modified since I<date>. The C<Last-Modified> HTTP header is used to |
193
|
|
|
|
|
|
|
determine modification time. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=item keep_alive I<1|0> |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
This optional parameter will enable keep alive requests. This can dramatically speed |
198
|
|
|
|
|
|
|
up spidering and reduce the load on server being spidered. The default is to not use |
199
|
|
|
|
|
|
|
keep alives, although enabling it will probably be the right thing to do. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
To get the most out of keep alives, you may want to set up your web server to |
202
|
|
|
|
|
|
|
allow a lot of requests per single connection (i.e MaxKeepAliveRequests on Apache). |
203
|
|
|
|
|
|
|
Apache's default is 100, which should be good. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
When a connection is not closed the spider does not wait the "delay" |
206
|
|
|
|
|
|
|
time when making the next request. In other words, there is no delay in |
207
|
|
|
|
|
|
|
requesting documents while the connection is open. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Note: you must have at least libwww-perl-5.53_90 installed to use this feature. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=item delay I<n> |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Get/set the number of seconds to wait between making requests. Default is |
214
|
|
|
|
|
|
|
5 seconds (a very friendly delay). |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item timeout I<n> |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Get/set the number of seconds to wait before considering the remote |
219
|
|
|
|
|
|
|
server unresponsive. The default is 10. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item authn_callback I<code_ref> |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
CODE reference to fetch username/password credentials when necessary. See also |
224
|
|
|
|
|
|
|
C<credentials>. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=item credential_timeout I<n> |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
Number of seconds to wait before skipping manual prompt for username/password. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=item credentials I<user:pass> |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
String with C<username>:C<password> pair to be used when prompted by |
233
|
|
|
|
|
|
|
the server. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=item follow_redirects I<1|0> |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
By default, 3xx responses from the server will be followed when |
238
|
|
|
|
|
|
|
they are on the same hostname. Set to false (0) to not follow |
239
|
|
|
|
|
|
|
redirects. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=item link_tags |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
TODO |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=item remove_leading_dots I<1|0> |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Microsoft server hack. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=item same_hosts I<array_ref> |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
ARRAY ref of hostnames to be treated as identical to the original |
252
|
|
|
|
|
|
|
host being spidered. By default the spider will not follow |
253
|
|
|
|
|
|
|
links to different hosts. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=back |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head2 BUILD |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Initializes a new spider object. Called by new(). |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
sub BUILD { |
264
|
|
|
|
|
|
|
my $self = shift; |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
$self->{_auth_cache} = Dezi::Cache->new; # ALWAYS inmemory cache |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
$self->{ua} |
269
|
|
|
|
|
|
|
||= Dezi::Aggregator::Spider::UA->new( $self->agent, $self->email, ); |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
$self->{ua} |
272
|
|
|
|
|
|
|
->set_link_tags( { map { lc($_) => 1 } @{ $self->{link_tags} } } ); |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# we handle our own delay |
275
|
|
|
|
|
|
|
$self->{ua}->delay(0); |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
$self->{ua}->timeout( $self->timeout ); |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# TODO we test this using HEAD request. Set here too? |
280
|
|
|
|
|
|
|
#$self->{ua}->max_size( $self->{max_size} ) if $self->{max_size}; |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
if ( $self->use_cookies ) { |
283
|
|
|
|
|
|
|
$self->{ua}->cookie_jar( HTTP::Cookies->new() ); |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
if ( $self->keep_alive ) { |
286
|
|
|
|
|
|
|
if ( $self->{ua}->can('conn_cache') ) { |
287
|
|
|
|
|
|
|
$self->{ua} |
288
|
|
|
|
|
|
|
->conn_cache( { total_capacity => $self->keep_alive } ); |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
else { |
291
|
|
|
|
|
|
|
warn |
292
|
|
|
|
|
|
|
"can't use keep-alive: conn_cache() method not available on ua " |
293
|
|
|
|
|
|
|
. ref( $self->{ua} ); |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
$self->{_current_depth} = 1; |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
$self->{same_host_lookup} = { map { $_ => 1 } @{ $self->{same_hosts} } }; |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
if ( $self->use_md5 ) { |
302
|
|
|
|
|
|
|
Class::Load::load_class('Digest::MD5'); |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# if Dezi::Indexer::Config defined, use that for some items |
306
|
|
|
|
|
|
|
if ( $self->indexer and $self->indexer->config ) { |
307
|
|
|
|
|
|
|
if ( $self->indexer->config->FileRules && !$self->file_rules ) { |
308
|
|
|
|
|
|
|
$self->file_rules( |
309
|
|
|
|
|
|
|
File::Rules->new( $self->indexer->config->FileRules ) ); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# from spider.pl. not sure if we need it or not. |
314
|
|
|
|
|
|
|
# Lame Microsoft |
315
|
|
|
|
|
|
|
$URI::ABS_REMOTE_LEADING_DOTS = $self->remove_leading_dots; |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
return $self; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=head2 uri_ok( I<uri> ) |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
Returns true if I<uri> is acceptable for including in an index. |
323
|
|
|
|
|
|
|
The 'ok-ness' of the I<uri> is based on its base, robot rules, |
324
|
|
|
|
|
|
|
and the spider configuration. |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=cut |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
sub uri_ok { |
329
|
|
|
|
|
|
|
my $self = shift; |
330
|
|
|
|
|
|
|
my $uri = shift or croak "URI required"; |
331
|
|
|
|
|
|
|
my $str = $uri->canonical->as_string; |
332
|
|
|
|
|
|
|
$str =~ s/#.*//; # target anchors create noise |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
if ( $self->verbose > 1 || $self->debug ) { |
335
|
|
|
|
|
|
|
$self->write_log_line(); |
336
|
|
|
|
|
|
|
$self->write_log( |
337
|
|
|
|
|
|
|
uri => $uri, |
338
|
|
|
|
|
|
|
msg => "checking if ok", |
339
|
|
|
|
|
|
|
); |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
if ( $uri->scheme !~ m,^http, ) { |
343
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
344
|
|
|
|
|
|
|
uri => $uri, |
345
|
|
|
|
|
|
|
msg => "skipping, unsupported scheme" |
346
|
|
|
|
|
|
|
); |
347
|
|
|
|
|
|
|
return 0; |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
# check if we're on the same host. |
351
|
|
|
|
|
|
|
if ( $uri->rel( $self->{_base} ) eq $uri ) { |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# not on this host. check our aliases |
354
|
|
|
|
|
|
|
if ( !exists $self->{same_host_lookup} |
355
|
|
|
|
|
|
|
->{ $uri->canonical->authority || '' } ) |
356
|
|
|
|
|
|
|
{ |
357
|
|
|
|
|
|
|
my $host = $uri->canonical->authority; |
358
|
|
|
|
|
|
|
$self->debug |
359
|
|
|
|
|
|
|
and $self->write_log( |
360
|
|
|
|
|
|
|
uri => $uri, |
361
|
|
|
|
|
|
|
msg => "skipping, different host $host", |
362
|
|
|
|
|
|
|
); |
363
|
|
|
|
|
|
|
return 0; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
# in same host lookup, so proceed. |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
my $path = $uri->path; |
370
|
|
|
|
|
|
|
my $swish3 = $self->indexer ? $self->indexer->swish3 : undef; |
371
|
|
|
|
|
|
|
my $mime = $UTILS->get_mime( $path, $swish3 ); |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
if ( !$UTILS->get_parser_for_mime( $mime, $swish3 ) ) { |
374
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
375
|
|
|
|
|
|
|
uri => $uri, |
376
|
|
|
|
|
|
|
msg => "skipping, no parser for $mime", |
377
|
|
|
|
|
|
|
); |
378
|
|
|
|
|
|
|
return 0; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# check regex |
382
|
|
|
|
|
|
|
if ( $self->file_rules ) { |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
if ( $self->_apply_file_rules( $uri->path_query, $self->file_rules ) |
385
|
|
|
|
|
|
|
&& !$self->_apply_file_match( $uri->path_query, |
386
|
|
|
|
|
|
|
$self->file_rules ) ) |
387
|
|
|
|
|
|
|
{ |
388
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
389
|
|
|
|
|
|
|
uri => $uri, |
390
|
|
|
|
|
|
|
msg => "skipping, matched file_rules", |
391
|
|
|
|
|
|
|
); |
392
|
|
|
|
|
|
|
return 0; |
393
|
|
|
|
|
|
|
} |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# head request to check max_size and modified_since |
397
|
|
|
|
|
|
|
if ( $self->max_size or $self->modified_since ) { |
398
|
|
|
|
|
|
|
my %head_args = ( |
399
|
|
|
|
|
|
|
uri => $uri, |
400
|
|
|
|
|
|
|
delay => 0, # assume each get() applies the delay |
401
|
|
|
|
|
|
|
debug => $self->debug, |
402
|
|
|
|
|
|
|
verbose => $self->verbose, |
403
|
|
|
|
|
|
|
); |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) { |
406
|
|
|
|
|
|
|
$head_args{user} = $user; |
407
|
|
|
|
|
|
|
$head_args{pass} = $pass; |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
my $resp = $self->ua->head(%head_args); |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# early abort if resource doesn't exist |
412
|
|
|
|
|
|
|
if ( $resp->status == 404 ) { |
413
|
|
|
|
|
|
|
$self->debug |
414
|
|
|
|
|
|
|
and $self->write_log( |
415
|
|
|
|
|
|
|
uri => $uri, |
416
|
|
|
|
|
|
|
msg => "skipping, 404 not found", |
417
|
|
|
|
|
|
|
); |
418
|
|
|
|
|
|
|
return 0; |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
# redirect? assume ok now and _make_request will check on it later. |
422
|
|
|
|
|
|
|
if ( $resp->is_redirect ) { |
423
|
|
|
|
|
|
|
$self->debug |
424
|
|
|
|
|
|
|
and $self->write_log( |
425
|
|
|
|
|
|
|
uri => $uri, |
426
|
|
|
|
|
|
|
msg => "deferring, is_redirect", |
427
|
|
|
|
|
|
|
); |
428
|
|
|
|
|
|
|
return 1; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
my $last_mod = $resp->last_modified; |
432
|
|
|
|
|
|
|
if ( $last_mod |
433
|
|
|
|
|
|
|
and $self->modified_since |
434
|
|
|
|
|
|
|
and $self->modified_since > $last_mod ) |
435
|
|
|
|
|
|
|
{ |
436
|
|
|
|
|
|
|
$self->debug |
437
|
|
|
|
|
|
|
and $self->write_log( |
438
|
|
|
|
|
|
|
uri => $uri, |
439
|
|
|
|
|
|
|
msg => sprintf( |
440
|
|
|
|
|
|
|
"skipping, last modified %s (%s < %s)", |
441
|
|
|
|
|
|
|
$resp->header('last-modified'), $last_mod, |
442
|
|
|
|
|
|
|
$self->modified_since |
443
|
|
|
|
|
|
|
), |
444
|
|
|
|
|
|
|
); |
445
|
|
|
|
|
|
|
return 0; |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
if ( $resp->content_length and $self->max_size ) { |
449
|
|
|
|
|
|
|
if ( $resp->content_length > $self->max_size ) { |
450
|
|
|
|
|
|
|
$self->debug |
451
|
|
|
|
|
|
|
and $self->write_log( |
452
|
|
|
|
|
|
|
uri => $uri, |
453
|
|
|
|
|
|
|
msg => sprintf( "skipping, %s > max_size", |
454
|
|
|
|
|
|
|
$resp->content_length ), |
455
|
|
|
|
|
|
|
); |
456
|
|
|
|
|
|
|
return 0; |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
} |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
( $self->verbose > 1 || $self->debug ) and $self->write_log( |
463
|
|
|
|
|
|
|
uri => $uri, |
464
|
|
|
|
|
|
|
msg => "ok", |
465
|
|
|
|
|
|
|
); |
466
|
|
|
|
|
|
|
return 1; |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
sub _add_links { |
470
|
|
|
|
|
|
|
my ( $self, $parent, @links ) = @_; |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
# calc depth |
473
|
|
|
|
|
|
|
if ( !$self->{_parent} || $self->{_parent} ne $parent ) { |
474
|
|
|
|
|
|
|
$self->{_current_depth}++; |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
$self->{_parent} ||= $parent; # first time. |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
480
|
|
|
|
|
|
|
uri => $parent, |
481
|
|
|
|
|
|
|
msg => sprintf( 'evaluating %s links', scalar(@links) ), |
482
|
|
|
|
|
|
|
); |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
for my $l (@links) { |
485
|
|
|
|
|
|
|
my $uri = $l->abs( $self->{_base} ) or next; |
486
|
|
|
|
|
|
|
$uri = $uri->canonical; # normalize |
487
|
|
|
|
|
|
|
if ( $self->uri_cache->has("$uri") ) { |
488
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
489
|
|
|
|
|
|
|
uri => $uri, |
490
|
|
|
|
|
|
|
msg => "skipping, already checked", |
491
|
|
|
|
|
|
|
); |
492
|
|
|
|
|
|
|
next; |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
$self->uri_cache->add( "$uri" => $self->{_current_depth} ); |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
if ( $self->uri_ok($uri) ) { |
497
|
|
|
|
|
|
|
$self->add_to_queue($uri); |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# ported from spider.pl |
503
|
|
|
|
|
|
|
# Do we need to authorize? If so, ask for password and request again. |
504
|
|
|
|
|
|
|
# First we try using any cached value |
505
|
|
|
|
|
|
|
# Then we try using the get_password callback |
506
|
|
|
|
|
|
|
# Then we ask. |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
sub _authorize { |
509
|
|
|
|
|
|
|
my ( $self, $uri, $response ) = @_; |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
delete $self->{last_auth}; # since we know that doesn't work |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
if ( $response->header('WWW-Authenticate') |
514
|
|
|
|
|
|
|
&& $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i ) |
515
|
|
|
|
|
|
|
{ |
516
|
|
|
|
|
|
|
my $realm = $1; |
517
|
|
|
|
|
|
|
my $user_pass; |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
# Do we have a cached user/pass for this realm? |
520
|
|
|
|
|
|
|
# only each URI only once |
521
|
|
|
|
|
|
|
unless ( $self->{_request}->{auth}->{$uri}++ ) { |
522
|
|
|
|
|
|
|
my $key = $uri->canonical->host_port . ':' . $realm; |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
if ( $user_pass = $self->{_auth_cache}->get($key) ) { |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# If we didn't just try it, try again |
527
|
|
|
|
|
|
|
unless ( $uri->userinfo && $user_pass eq $uri->userinfo ) { |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
# add the user/pass to the URI |
530
|
|
|
|
|
|
|
$uri->userinfo($user_pass); |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
#warn " >> set userinfo via _auth_cache\n" if $self->debug; |
533
|
|
|
|
|
|
|
return 1; |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
else { |
536
|
|
|
|
|
|
|
# we've tried this before |
537
|
|
|
|
|
|
|
#warn "tried $user_pass before"; |
538
|
|
|
|
|
|
|
return 0; |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# now check for a callback password (if $user_pass not set) |
544
|
|
|
|
|
|
|
unless ( $user_pass || $self->{_request}->{auth}->{callback}++ ) { |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
# Check for a callback function |
547
|
|
|
|
|
|
|
if ( $self->{authn_callback} |
548
|
|
|
|
|
|
|
and ref $self->{authn_callback} eq 'CODE' ) |
549
|
|
|
|
|
|
|
{ |
550
|
|
|
|
|
|
|
$user_pass = $self->{authn_callback} |
551
|
|
|
|
|
|
|
->( $self, $uri, $response, $realm ); |
552
|
|
|
|
|
|
|
$uri->userinfo($user_pass); |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
#warn " >> set userinfo via authn_callback\n" if $self->debug; |
555
|
|
|
|
|
|
|
return 1; |
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# otherwise, prompt (over and over) |
560
|
|
|
|
|
|
|
if ( !$user_pass ) { |
561
|
|
|
|
|
|
|
$user_pass = $self->_get_basic_credentials( $uri, $realm ); |
562
|
|
|
|
|
|
|
} |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
if ($user_pass) { |
565
|
|
|
|
|
|
|
$uri->userinfo($user_pass); |
566
|
|
|
|
|
|
|
$self->{cur_realm} = $realm; # save so we can cache if it's valid |
567
|
|
|
|
|
|
|
return 1; |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
return 0; |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# From spider.pl |
576
|
|
|
|
|
|
|
sub _get_basic_credentials { |
577
|
|
|
|
|
|
|
my ( $self, $uri, $realm ) = @_; |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
# Exists but undefined means don't ask. |
580
|
|
|
|
|
|
|
return |
581
|
|
|
|
|
|
|
if exists $self->{credential_timeout} |
582
|
|
|
|
|
|
|
&& !defined $self->{credential_timeout}; |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
my $netloc = $uri->canonical->host_port; |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
my ( $user, $password ); |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
eval { |
589
|
|
|
|
|
|
|
local $SIG{ALRM} = sub { die "timed out\n" }; |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
# a zero timeout means don't time out |
592
|
|
|
|
|
|
|
alarm( $self->{credential_timeout} ) unless $^O =~ /Win32/i; |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
if ( $uri->userinfo ) { |
595
|
|
|
|
|
|
|
print STDERR "\nSorry: invalid username/password\n"; |
596
|
|
|
|
|
|
|
$uri->userinfo(undef); |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
print STDERR |
600
|
|
|
|
|
|
|
"Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: "; |
601
|
|
|
|
|
|
|
$user = <STDIN>; |
602
|
|
|
|
|
|
|
chomp($user) if $user; |
603
|
|
|
|
|
|
|
die "No Username specified\n" unless length $user; |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
alarm( $self->{credential_timeout} ) unless $^O =~ /Win32/i; |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
print STDERR "Password: "; |
608
|
|
|
|
|
|
|
system("stty -echo"); |
609
|
|
|
|
|
|
|
$password = <STDIN>; |
610
|
|
|
|
|
|
|
system("stty echo"); |
611
|
|
|
|
|
|
|
print STDERR "\n"; # because we disabled echo |
612
|
|
|
|
|
|
|
chomp($password); |
613
|
|
|
|
|
|
|
alarm(0) unless $^O =~ /Win32/i; |
614
|
|
|
|
|
|
|
}; |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
alarm(0) unless $^O =~ /Win32/i; |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
return if $@; |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
return join ':', $user, $password; |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
} |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=head2 add_to_queue( I<uri> ) |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
Add I<uri> to the queue. |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
=cut |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
sub add_to_queue { |
631
|
|
|
|
|
|
|
my $self = shift; |
632
|
|
|
|
|
|
|
my $uri = shift or croak "uri required"; |
633
|
|
|
|
|
|
|
return $self->queue->put($uri); |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=head2 next_from_queue |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
Return next I<uri> from queue. |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=cut |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub next_from_queue { |
643
|
|
|
|
|
|
|
my $self = shift; |
644
|
|
|
|
|
|
|
return $self->queue->get(); |
645
|
|
|
|
|
|
|
} |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
=head2 left_in_queue |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
Returns queue()->size(). |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=cut |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
sub left_in_queue { |
654
|
|
|
|
|
|
|
return shift->queue->size(); |
655
|
|
|
|
|
|
|
} |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
=head2 remove_from_queue( I<uri> ) |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
Calls queue()->remove(I<uri>). |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=cut |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
sub remove_from_queue { |
664
|
|
|
|
|
|
|
my $self = shift; |
665
|
|
|
|
|
|
|
my $uri = shift or croak "uri required"; |
666
|
|
|
|
|
|
|
return $self->queue->remove($uri); |
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=head2 get_doc |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
Returns the next URI from the queue() as a Dezi::Indexer::Doc object, |
672
|
|
|
|
|
|
|
or the error message if there was one. |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
Returns undef if the queue is empty or max_depth() has been reached. |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=cut |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
sub get_doc { |
679
|
|
|
|
|
|
|
my $self = shift; |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# return unless we have something in the queue |
682
|
|
|
|
|
|
|
return unless $self->left_in_queue(); |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
# pop the queue and make it a URI |
685
|
|
|
|
|
|
|
my $uri = $self->next_from_queue(); |
686
|
|
|
|
|
|
|
my $depth = $self->uri_cache->get("$uri"); |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
$self->debug |
689
|
|
|
|
|
|
|
and $self->write_log( |
690
|
|
|
|
|
|
|
uri => $uri, |
691
|
|
|
|
|
|
|
msg => sprintf( |
692
|
|
|
|
|
|
|
"depth:%d max_depth:%s", |
693
|
|
|
|
|
|
|
$depth, ( $self->max_depth || 'undef' ) |
694
|
|
|
|
|
|
|
), |
695
|
|
|
|
|
|
|
); |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
return if defined $self->max_depth && $depth > $self->max_depth; |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
$self->{_cur_depth} = $depth; |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
my $doc = $self->_make_request($uri); |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
if ($doc) { |
704
|
|
|
|
|
|
|
$self->remove_from_queue($uri); |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
return $doc; |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head2 get_authorized_doc( I<uri>, I<response> ) |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
Called internally when the server returns a 401 or 403 response. |
713
|
|
|
|
|
|
|
Will attempt to determine the correct credentials for I<uri> |
714
|
|
|
|
|
|
|
based on the previous attempt in I<response> and what you |
715
|
|
|
|
|
|
|
have configured in B<credentials>, B<authn_callback> or when |
716
|
|
|
|
|
|
|
manually prompted. |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
=cut |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
sub get_authorized_doc { |
721
|
|
|
|
|
|
|
my $self = shift; |
722
|
|
|
|
|
|
|
my $uri = shift or croak "uri required"; |
723
|
|
|
|
|
|
|
my $response = shift or croak "response required"; |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
# set up credentials |
726
|
|
|
|
|
|
|
$self->_authorize( $uri, $response->http_response ) or return; |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
return $self->_make_request($uri); |
729
|
|
|
|
|
|
|
} |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
sub _make_request { |
732
|
|
|
|
|
|
|
my ( $self, $uri ) = @_; |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
# get our useragent |
735
|
|
|
|
|
|
|
my $ua = $self->ua; |
736
|
|
|
|
|
|
|
my $delay = 0; |
737
|
|
|
|
|
|
|
if ( $self->{keep_alive} ) { |
738
|
|
|
|
|
|
|
$delay = 0; |
739
|
|
|
|
|
|
|
} |
740
|
|
|
|
|
|
|
elsif ( !$self->{delay} or !$self->{_last_response_time} ) { |
741
|
|
|
|
|
|
|
$delay = 0; |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
else { |
744
|
|
|
|
|
|
|
my $elapsed = time() - $self->{_last_response_time}; |
745
|
|
|
|
|
|
|
$delay = $self->{delay} - $elapsed; |
746
|
|
|
|
|
|
|
$delay = 0 if $delay < 0; |
747
|
|
|
|
|
|
|
$self->debug |
748
|
|
|
|
|
|
|
and $self->write_log( |
749
|
|
|
|
|
|
|
uri => $uri, |
750
|
|
|
|
|
|
|
msg => "elapsed:$elapsed delay:$delay", |
751
|
|
|
|
|
|
|
); |
752
|
|
|
|
|
|
|
} |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
$self->write_log( |
755
|
|
|
|
|
|
|
uri => $uri, |
756
|
|
|
|
|
|
|
msg => "GET delay:$delay", |
757
|
|
|
|
|
|
|
) if $self->verbose; |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
my %get_args = ( |
760
|
|
|
|
|
|
|
uri => $uri, |
761
|
|
|
|
|
|
|
delay => $delay, |
762
|
|
|
|
|
|
|
debug => $self->debug, |
763
|
|
|
|
|
|
|
verbose => $self->verbose, |
764
|
|
|
|
|
|
|
); |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
if ( my ( $user, $pass ) = $self->_get_user_pass($uri) ) { |
767
|
|
|
|
|
|
|
$get_args{user} = $user; |
768
|
|
|
|
|
|
|
$get_args{pass} = $pass; |
769
|
|
|
|
|
|
|
} |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
# fetch the uri. $ua handles delay internally. |
772
|
|
|
|
|
|
|
my $response = $ua->get(%get_args); |
773
|
|
|
|
|
|
|
my $http_response = $response->http_response; |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
# flag current time for next delay calc. |
776
|
|
|
|
|
|
|
$self->{_last_response_time} = time(); |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# redirect? follow, conditionally. |
779
|
|
|
|
|
|
|
if ( $response->is_redirect ) { |
780
|
|
|
|
|
|
|
my $location = $response->header('location'); |
781
|
|
|
|
|
|
|
if ( !$location ) { |
782
|
|
|
|
|
|
|
$self->write_log( |
783
|
|
|
|
|
|
|
uri => $uri, |
784
|
|
|
|
|
|
|
msg => "skipping, redirect without a Location header", |
785
|
|
|
|
|
|
|
); |
786
|
|
|
|
|
|
|
return $response->status; |
787
|
|
|
|
|
|
|
} |
788
|
|
|
|
|
|
|
$self->debug |
789
|
|
|
|
|
|
|
and $self->write_log( |
790
|
|
|
|
|
|
|
uri => $uri, |
791
|
|
|
|
|
|
|
msg => "redirect: $location", |
792
|
|
|
|
|
|
|
); |
793
|
|
|
|
|
|
|
if ( $self->follow_redirects ) { |
794
|
|
|
|
|
|
|
$self->_add_links( $uri, |
795
|
|
|
|
|
|
|
URI->new_abs( $location, $http_response->base ) ); |
796
|
|
|
|
|
|
|
} |
797
|
|
|
|
|
|
|
return $response->status; |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
if ( $response->ct ) { |
801
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
802
|
|
|
|
|
|
|
uri => $uri, |
803
|
|
|
|
|
|
|
msg => 'content-type: ' . $response->ct, |
804
|
|
|
|
|
|
|
); |
805
|
|
|
|
|
|
|
} |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
# add its links to the queue. |
808
|
|
|
|
|
|
|
# If the resource looks like an XML feed of some kind, |
809
|
|
|
|
|
|
|
# glean its links differently than if it is an HTML response. |
810
|
|
|
|
|
|
|
if ( my $feed = $self->looks_like_feed($http_response) ) { |
811
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
812
|
|
|
|
|
|
|
uri => $uri, |
813
|
|
|
|
|
|
|
msg => 'looks like feed' |
814
|
|
|
|
|
|
|
); |
815
|
|
|
|
|
|
|
my @links; |
816
|
|
|
|
|
|
|
for my $entry ( $feed->entries ) { |
817
|
|
|
|
|
|
|
push @links, URI->new( $entry->link ); |
818
|
|
|
|
|
|
|
} |
819
|
|
|
|
|
|
|
$self->_add_links( $uri, @links ); |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
# we don't want the feed content, we want the links. |
822
|
|
|
|
|
|
|
# TODO make this optional |
823
|
|
|
|
|
|
|
return $response->status; |
824
|
|
|
|
|
|
|
} |
825
|
|
|
|
|
|
|
elsif ( my $sitemap = $self->looks_like_sitemap($http_response) ) { |
826
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
827
|
|
|
|
|
|
|
uri => $uri, |
828
|
|
|
|
|
|
|
msg => 'looks like sitemap', |
829
|
|
|
|
|
|
|
); |
830
|
|
|
|
|
|
|
my @links; |
831
|
|
|
|
|
|
|
for my $url ( $sitemap->urls ) { |
832
|
|
|
|
|
|
|
push @links, URI->new( $url->loc ); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
$self->_add_links( $uri, @links ); |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
# we don't want the feed content, we want the links. |
837
|
|
|
|
|
|
|
# TODO make this optional |
838
|
|
|
|
|
|
|
return $response->status; |
839
|
|
|
|
|
|
|
} |
840
|
|
|
|
|
|
|
else { |
841
|
|
|
|
|
|
|
$self->_add_links( $uri, $response->links ); |
842
|
|
|
|
|
|
|
} |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
# return $uri as a Doc object |
845
|
|
|
|
|
|
|
my $use_uri = $response->success ? $ua->uri : $uri; |
846
|
|
|
|
|
|
|
my $meta = { |
847
|
|
|
|
|
|
|
org_uri => $uri, |
848
|
|
|
|
|
|
|
ret_uri => ( $use_uri || $uri ), |
849
|
|
|
|
|
|
|
depth => delete $self->{_cur_depth}, |
850
|
|
|
|
|
|
|
status => $response->status, |
851
|
|
|
|
|
|
|
success => $response->success, |
852
|
|
|
|
|
|
|
is_html => $response->is_html, |
853
|
|
|
|
|
|
|
title => ( |
854
|
|
|
|
|
|
|
$response->success |
855
|
|
|
|
|
|
|
? ( $response->is_html |
856
|
|
|
|
|
|
|
? ( $response->title || "No title: $use_uri" ) |
857
|
|
|
|
|
|
|
: $use_uri |
858
|
|
|
|
|
|
|
) |
859
|
|
|
|
|
|
|
: "Failed: $use_uri" |
860
|
|
|
|
|
|
|
), |
861
|
|
|
|
|
|
|
ct => ( $response->success ? $response->ct : "Unknown" ), |
862
|
|
|
|
|
|
|
}; |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
my $headers = $http_response->headers; |
865
|
|
|
|
|
|
|
my $buf = $response->content; |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
if ( $self->{use_md5} ) { |
868
|
|
|
|
|
|
|
my $fingerprint = $response->header('Content-MD5') |
869
|
|
|
|
|
|
|
|| Digest::MD5::md5_base64($buf); |
870
|
|
|
|
|
|
|
if ( $self->md5_cache->has($fingerprint) ) { |
871
|
|
|
|
|
|
|
return "duplicate content for " |
872
|
|
|
|
|
|
|
. $self->md5_cache->get($fingerprint); |
873
|
|
|
|
|
|
|
} |
874
|
|
|
|
|
|
|
$self->md5_cache->add( $fingerprint => $uri ); |
875
|
|
|
|
|
|
|
} |
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
if ( $response->success ) { |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
my $content_type = $meta->{ct}; |
880
|
|
|
|
|
|
|
my $swish3 = $self->indexer ? $self->indexer->swish3 : undef; |
881
|
|
|
|
|
|
|
if ( !$UTILS->get_parser_for_mime( $content_type, $swish3 ) ) { |
882
|
|
|
|
|
|
|
$self->write_log( |
883
|
|
|
|
|
|
|
uri => $uri, |
884
|
|
|
|
|
|
|
msg => "no parser for $content_type", |
885
|
|
|
|
|
|
|
); |
886
|
|
|
|
|
|
|
} |
887
|
|
|
|
|
|
|
my $charset = $headers->content_type; |
888
|
|
|
|
|
|
|
$charset =~ s/;?$meta->{ct};?//; |
889
|
|
|
|
|
|
|
my $encoding = $headers->content_encoding || $charset; |
890
|
|
|
|
|
|
|
my %doc = ( |
891
|
|
|
|
|
|
|
url => $meta->{org_uri}, |
892
|
|
|
|
|
|
|
modtime => ( $headers->last_modified || $headers->date ), |
893
|
|
|
|
|
|
|
type => $meta->{ct}, |
894
|
|
|
|
|
|
|
content => ( $encoding =~ m/utf-8/i ? to_utf8($buf) : $buf ), |
895
|
|
|
|
|
|
|
size => $headers->content_length || length( pack 'C0a*', $buf ), |
896
|
|
|
|
|
|
|
charset => $encoding, |
897
|
|
|
|
|
|
|
); |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
# cache whatever credentials were used so we can re-use |
900
|
|
|
|
|
|
|
if ( $self->{cur_realm} and $uri->userinfo ) { |
901
|
|
|
|
|
|
|
my $key = $uri->canonical->host_port . ':' . $self->{cur_realm}; |
902
|
|
|
|
|
|
|
$self->{_auth_cache}->add( $key => $uri->userinfo ); |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
# not too sure of the best logic here |
905
|
|
|
|
|
|
|
my $path = $uri->path; |
906
|
|
|
|
|
|
|
$path =~ s!/[^/]*$!!; |
907
|
|
|
|
|
|
|
$self->{last_auth} = { |
908
|
|
|
|
|
|
|
path => $path, |
909
|
|
|
|
|
|
|
auth => $uri->userinfo, |
910
|
|
|
|
|
|
|
}; |
911
|
|
|
|
|
|
|
} |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
# return doc |
914
|
|
|
|
|
|
|
return $self->doc_class->new(%doc); |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
} |
917
|
|
|
|
|
|
|
elsif ( $response->status == 401 ) { |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
# authorize and try again |
920
|
|
|
|
|
|
|
$self->write_log( |
921
|
|
|
|
|
|
|
uri => $uri, |
922
|
|
|
|
|
|
|
msg => sprintf( "authn denied, retrying, %s", |
923
|
|
|
|
|
|
|
$response->status_line ), |
924
|
|
|
|
|
|
|
); |
925
|
|
|
|
|
|
|
return $self->get_authorized_doc( $uri, $response ) |
926
|
|
|
|
|
|
|
|| $response->status; |
927
|
|
|
|
|
|
|
} |
928
|
|
|
|
|
|
|
elsif ($response->status == 403 |
929
|
|
|
|
|
|
|
&& $http_response->status_line =~ m/robots.txt/ ) |
930
|
|
|
|
|
|
|
{ |
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
# ignore |
933
|
|
|
|
|
|
|
$self->write_log( |
934
|
|
|
|
|
|
|
uri => $uri, |
935
|
|
|
|
|
|
|
msg => sprintf( "skipped, %s", $http_response->status_line ), |
936
|
|
|
|
|
|
|
); |
937
|
|
|
|
|
|
|
return $self->get_authorized_doc( $uri, $response ) |
938
|
|
|
|
|
|
|
|| $response->status; |
939
|
|
|
|
|
|
|
} |
940
|
|
|
|
|
|
|
elsif ( $response->status == 403 ) { |
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
# authorize and try again |
943
|
|
|
|
|
|
|
$self->write_log( |
944
|
|
|
|
|
|
|
uri => $uri, |
945
|
|
|
|
|
|
|
msg => sprintf( "retrying, %s", $http_response->status_line ), |
946
|
|
|
|
|
|
|
); |
947
|
|
|
|
|
|
|
return $self->get_authorized_doc( $uri, $response ); |
948
|
|
|
|
|
|
|
} |
949
|
|
|
|
|
|
|
else { |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
$self->write_log( |
952
|
|
|
|
|
|
|
uri => $uri, |
953
|
|
|
|
|
|
|
msg => $http_response->status_line, |
954
|
|
|
|
|
|
|
); |
955
|
|
|
|
|
|
|
return $response->status; |
956
|
|
|
|
|
|
|
} |
957
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
return; # never get here. |
959
|
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
sub _get_user_pass { |
962
|
|
|
|
|
|
|
my $self = shift; |
963
|
|
|
|
|
|
|
my $uri = shift; |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
# Set basic auth if defined - use URI specific first, then credentials. |
966
|
|
|
|
|
|
|
# this doesn't track what should have authorization |
967
|
|
|
|
|
|
|
my $last_auth; |
968
|
|
|
|
|
|
|
if ( $self->{last_auth} ) { |
969
|
|
|
|
|
|
|
my $path = $uri->path; |
970
|
|
|
|
|
|
|
$path =~ s!/[^/]*$!!; |
971
|
|
|
|
|
|
|
$last_auth = $self->{last_auth}->{auth} |
972
|
|
|
|
|
|
|
if $self->{last_auth}->{path} eq $path; |
973
|
|
|
|
|
|
|
} |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
my ( $user, $pass ) = split /:/, |
976
|
|
|
|
|
|
|
( $last_auth || $uri->userinfo || $self->credentials || '' ); |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
return ( $user, $pass ); |
979
|
|
|
|
|
|
|
} |
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
=head2 looks_like_feed( I<http_response> ) |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
Called internally to perform naive heuristics on I<http_response> |
984
|
|
|
|
|
|
|
to determine whether it looks like an XML feed of some kind, |
985
|
|
|
|
|
|
|
rather than a HTML page. |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
=cut |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
sub looks_like_feed { |
990
|
|
|
|
|
|
|
my $self = shift; |
991
|
|
|
|
|
|
|
my $response = shift or croak "response required"; |
992
|
|
|
|
|
|
|
my $headers = $response->headers; |
993
|
|
|
|
|
|
|
my $ct = $headers->content_type; |
994
|
|
|
|
|
|
|
if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) { |
995
|
|
|
|
|
|
|
return 0; |
996
|
|
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
if ( $ct eq 'text/xml' |
998
|
|
|
|
|
|
|
or $ct eq 'application/rss+xml' |
999
|
|
|
|
|
|
|
or $ct eq 'application/rdf+xml' |
1000
|
|
|
|
|
|
|
or $ct eq 'application/atom+xml' ) |
1001
|
|
|
|
|
|
|
{ |
1002
|
|
|
|
|
|
|
my $xml = $response->decoded_content; # TODO or content() |
1003
|
|
|
|
|
|
|
return XML::Feed->parse( \$xml ); |
1004
|
|
|
|
|
|
|
} |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
return 0; |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
=head2 looks_like_sitemap( I<http_response> ) |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
Called internally to perform naive heuristics on I<http_response> |
1012
|
|
|
|
|
|
|
to determine whether it looks like a XML sitemap feed, |
1013
|
|
|
|
|
|
|
rather than a HTML page. |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
=cut |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
sub looks_like_sitemap { |
1018
|
|
|
|
|
|
|
my $self = shift; |
1019
|
|
|
|
|
|
|
my $response = shift or croak "response required"; |
1020
|
|
|
|
|
|
|
my $headers = $response->headers; |
1021
|
|
|
|
|
|
|
my $ct = $headers->content_type; |
1022
|
|
|
|
|
|
|
if ( $ct eq 'text/html' or $ct eq 'application/xhtml+xml' ) { |
1023
|
|
|
|
|
|
|
return 0; |
1024
|
|
|
|
|
|
|
} |
1025
|
|
|
|
|
|
|
if ( $ct eq 'text/xml' |
1026
|
|
|
|
|
|
|
or $ct eq 'application/xml' ) |
1027
|
|
|
|
|
|
|
{ |
1028
|
|
|
|
|
|
|
my $xml = $response->decoded_content; # TODO or content() |
1029
|
|
|
|
|
|
|
my $sitemap = WWW::Sitemap::XML->new(); |
1030
|
|
|
|
|
|
|
eval { $sitemap->load( string => $xml ); }; |
1031
|
|
|
|
|
|
|
if ($@) { |
1032
|
|
|
|
|
|
|
return 0; |
1033
|
|
|
|
|
|
|
} |
1034
|
|
|
|
|
|
|
return $sitemap; |
1035
|
|
|
|
|
|
|
} |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
return 0; |
1038
|
|
|
|
|
|
|
} |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
=head2 crawl( I<uri> ) |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
Implements the required crawl() method. Recursively fetches I<uri> |
1043
|
|
|
|
|
|
|
and its child links to a depth set in max_depth(). |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Will quit after max_files() unless max_files==0. |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
Will quit after max_time() seconds unless max_time==0. |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=cut |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
sub crawl { |
1052
|
|
|
|
|
|
|
my $self = shift; |
1053
|
|
|
|
|
|
|
my @urls = @_; |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
my $indexer = $self->indexer; # may be undef |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
for my $url (@urls) { |
1058
|
|
|
|
|
|
|
my $started = time(); |
1059
|
|
|
|
|
|
|
$self->debug and $self->write_log( |
1060
|
|
|
|
|
|
|
uri => $url, |
1061
|
|
|
|
|
|
|
msg => "crawling", |
1062
|
|
|
|
|
|
|
); |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
my $uri = URI->new($url)->canonical; |
1065
|
|
|
|
|
|
|
$self->uri_cache->add( "$uri" => 1 ); |
1066
|
|
|
|
|
|
|
$self->add_to_queue($uri); |
1067
|
|
|
|
|
|
|
$self->{_base} = $uri->as_string; |
1068
|
|
|
|
|
|
|
while ( my $doc = $self->get_doc ) { |
1069
|
|
|
|
|
|
|
$self->debug and $self->write_log_line(); |
1070
|
|
|
|
|
|
|
next unless blessed($doc); |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
# indexer not required |
1073
|
|
|
|
|
|
|
$indexer->process($doc) if $indexer; |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
$self->_increment_count; |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
# abort if we've met any max_* conditions |
1078
|
|
|
|
|
|
|
last if $self->max_files and $self->count >= $self->max_files; |
1079
|
|
|
|
|
|
|
last |
1080
|
|
|
|
|
|
|
if $self->max_time |
1081
|
|
|
|
|
|
|
and ( time() - $started ) > $self->max_time; |
1082
|
|
|
|
|
|
|
} |
1083
|
|
|
|
|
|
|
} |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
return $self->count; |
1086
|
|
|
|
|
|
|
} |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
=head2 write_log( I<args> ) |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
Passes I<args> to Dezi::Utils::write_log(). |
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
=cut |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
sub write_log { |
1095
|
|
|
|
|
|
|
Dezi::Utils::write_log(@_); |
1096
|
|
|
|
|
|
|
} |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
=head2 write_log_line([I<char>, I<width>]) |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
Pass through to Dezi::Utils::write_log_line(). |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=cut |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
sub write_log_line { |
1105
|
|
|
|
|
|
|
Dezi::Utils::write_log_line(@_); |
1106
|
|
|
|
|
|
|
} |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
1109
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
1; |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
__END__ |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
=head1 AUTHOR |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
Peter Karman, E<lt>perl@peknet.comE<gt> |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
=head1 BUGS |
1119
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through |
1121
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>. |
1122
|
|
|
|
|
|
|
I will be notified, and then you'll |
1123
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
=head1 SUPPORT |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
perldoc Dezi |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
You can also look for information at: |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
=over 4 |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=item * Mailing list |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
L<http://lists.swish-e.org/listinfo/users> |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App> |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
L<http://annocpan.org/dist/Dezi-App> |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
=item * CPAN Ratings |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/Dezi-App> |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=item * Search CPAN |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
L<http://search.cpan.org/dist/Dezi-App/> |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=back |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
Copyright 2008-2009 by Peter Karman |
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
1163
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
=head1 SEE ALSO |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
L<http://swish-e.org/> |