line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package LWP::RobotUA; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
207099
|
use base qw(LWP::UserAgent); |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
796
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
our $VERSION = '6.29'; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
require WWW::RobotRules; |
8
|
|
|
|
|
|
|
require HTTP::Request; |
9
|
|
|
|
|
|
|
require HTTP::Response; |
10
|
|
|
|
|
|
|
|
11
|
2
|
|
|
2
|
|
12
|
use Carp (); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
23
|
|
12
|
2
|
|
|
2
|
|
7
|
use HTTP::Status (); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
27
|
|
13
|
2
|
|
|
2
|
|
8
|
use HTTP::Date qw(time2str); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
73
|
|
14
|
2
|
|
|
2
|
|
9
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
1523
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# Additional attributes in addition to those found in LWP::UserAgent: |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# $self->{'delay'} Required delay between request to the same |
21
|
|
|
|
|
|
|
# server in minutes. |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# $self->{'rules'} A WWW::RobotRules object |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub new |
27
|
|
|
|
|
|
|
{ |
28
|
2
|
|
|
2
|
1
|
491013
|
my $class = shift; |
29
|
2
|
|
|
|
|
9
|
my %cnf; |
30
|
2
|
50
|
|
|
|
12
|
if (@_ < 4) { |
31
|
|
|
|
|
|
|
# legacy args |
32
|
2
|
|
|
|
|
23
|
@cnf{qw(agent from rules)} = @_; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
else { |
35
|
0
|
|
|
|
|
0
|
%cnf = @_; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
2
|
50
|
|
|
|
15
|
Carp::croak('LWP::RobotUA agent required') unless $cnf{agent}; |
39
|
|
|
|
|
|
|
Carp::croak('LWP::RobotUA from address required') |
40
|
2
|
50
|
33
|
|
|
26
|
unless $cnf{from} && $cnf{from} =~ m/\@/; |
41
|
|
|
|
|
|
|
|
42
|
2
|
|
50
|
|
|
17
|
my $delay = delete $cnf{delay} || 1; |
43
|
2
|
|
|
|
|
8
|
my $use_sleep = delete $cnf{use_sleep}; |
44
|
2
|
50
|
|
|
|
11
|
$use_sleep = 1 unless defined($use_sleep); |
45
|
2
|
|
|
|
|
7
|
my $rules = delete $cnf{rules}; |
46
|
|
|
|
|
|
|
|
47
|
2
|
|
|
|
|
33
|
my $self = LWP::UserAgent->new(%cnf); |
48
|
2
|
|
|
|
|
12
|
$self = bless $self, $class; |
49
|
|
|
|
|
|
|
|
50
|
2
|
|
|
|
|
17
|
$self->{'delay'} = $delay; # minutes |
51
|
2
|
|
|
|
|
9
|
$self->{'use_sleep'} = $use_sleep; |
52
|
|
|
|
|
|
|
|
53
|
2
|
50
|
|
|
|
10
|
if ($rules) { |
54
|
0
|
|
|
|
|
0
|
$rules->agent($cnf{agent}); |
55
|
0
|
|
|
|
|
0
|
$self->{'rules'} = $rules; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
else { |
58
|
2
|
|
|
|
|
26
|
$self->{'rules'} = WWW::RobotRules->new($cnf{agent}); |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
2
|
|
|
|
|
118
|
$self; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
65
|
8
|
|
|
8
|
1
|
7548
|
sub delay { shift->_elem('delay', @_); } |
66
|
2
|
|
|
2
|
1
|
10
|
sub use_sleep { shift->_elem('use_sleep', @_); } |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub agent |
70
|
|
|
|
|
|
|
{ |
71
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
72
|
0
|
|
|
|
|
0
|
my $old = $self->SUPER::agent(@_); |
73
|
0
|
0
|
|
|
|
0
|
if (@_) { |
74
|
|
|
|
|
|
|
# Changing our name means to start fresh |
75
|
0
|
|
|
|
|
0
|
$self->{'rules'}->agent($self->{'agent'}); |
76
|
|
|
|
|
|
|
} |
77
|
0
|
|
|
|
|
0
|
$old; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub rules { |
82
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
83
|
0
|
|
|
|
|
0
|
my $old = $self->_elem('rules', @_); |
84
|
0
|
0
|
|
|
|
0
|
$self->{'rules'}->agent($self->{'agent'}) if @_; |
85
|
0
|
|
|
|
|
0
|
$old; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub no_visits |
90
|
|
|
|
|
|
|
{ |
91
|
2
|
|
|
2
|
1
|
51
|
my($self, $netloc) = @_; |
92
|
2
|
50
|
|
|
|
10
|
$self->{'rules'}->no_visits($netloc) || 0; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
*host_count = \&no_visits; # backwards compatibility with LWP-5.02 |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub host_wait |
99
|
|
|
|
|
|
|
{ |
100
|
13
|
|
|
13
|
1
|
81
|
my($self, $netloc) = @_; |
101
|
13
|
100
|
|
|
|
36
|
return undef unless defined $netloc; |
102
|
12
|
|
|
|
|
45
|
my $last = $self->{'rules'}->last_visit($netloc); |
103
|
12
|
100
|
|
|
|
96
|
if ($last) { |
104
|
10
|
|
|
|
|
34
|
my $wait = int($self->{'delay'} * 60 - (time - $last)); |
105
|
10
|
50
|
|
|
|
27
|
$wait = 0 if $wait < 0; |
106
|
10
|
|
|
|
|
29
|
return $wait; |
107
|
|
|
|
|
|
|
} |
108
|
2
|
|
|
|
|
8
|
return 0; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub simple_request |
113
|
|
|
|
|
|
|
{ |
114
|
13
|
|
|
13
|
1
|
31
|
my($self, $request, $arg, $size) = @_; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# Do we try to access a new server? |
117
|
13
|
|
|
|
|
37
|
my $allowed = $self->{'rules'}->allowed($request->uri); |
118
|
|
|
|
|
|
|
|
119
|
13
|
100
|
|
|
|
2299
|
if ($allowed < 0) { |
120
|
|
|
|
|
|
|
# Host is not visited before, or robots.txt expired; fetch "robots.txt" |
121
|
2
|
|
|
|
|
10
|
my $robot_url = $request->uri->clone; |
122
|
2
|
|
|
|
|
36
|
$robot_url->path("robots.txt"); |
123
|
2
|
|
|
|
|
135
|
$robot_url->query(undef); |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# make access to robot.txt legal since this will be a recursive call |
126
|
2
|
|
|
|
|
58
|
$self->{'rules'}->parse($robot_url, ""); |
127
|
|
|
|
|
|
|
|
128
|
2
|
|
|
|
|
406
|
my $robot_req = HTTP::Request->new('GET', $robot_url); |
129
|
2
|
|
|
|
|
157
|
my $parse_head = $self->parse_head(0); |
130
|
2
|
|
|
|
|
28
|
my $robot_res = $self->request($robot_req); |
131
|
2
|
|
|
|
|
14
|
$self->parse_head($parse_head); |
132
|
2
|
|
|
|
|
17
|
my $fresh_until = $robot_res->fresh_until; |
133
|
2
|
|
|
|
|
1661
|
my $content = ""; |
134
|
2
|
50
|
33
|
|
|
11
|
if ($robot_res->is_success && $robot_res->content_is_text) { |
135
|
2
|
|
|
|
|
218
|
$content = $robot_res->decoded_content; |
136
|
2
|
50
|
33
|
|
|
4724
|
$content = "" unless $content && $content =~ /^\s*Disallow\s*:/mi; |
137
|
|
|
|
|
|
|
} |
138
|
2
|
|
|
|
|
23
|
$self->{'rules'}->parse($robot_url, $content, $fresh_until); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# recalculate allowed... |
141
|
2
|
|
|
|
|
1708
|
$allowed = $self->{'rules'}->allowed($request->uri); |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# Check rules |
145
|
13
|
100
|
|
|
|
531
|
unless ($allowed) { |
146
|
2
|
|
|
|
|
9
|
my $res = HTTP::Response->new( |
147
|
|
|
|
|
|
|
HTTP::Status::RC_FORBIDDEN, 'Forbidden by robots.txt'); |
148
|
2
|
|
|
|
|
77
|
$res->request( $request ); # bind it to that request |
149
|
2
|
|
|
|
|
17
|
return $res; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
11
|
|
|
|
|
22
|
my $netloc = eval { local $SIG{__DIE__}; $request->uri->host_port; }; |
|
11
|
|
|
|
|
46
|
|
|
11
|
|
|
|
|
37
|
|
153
|
11
|
|
|
|
|
348
|
my $wait = $self->host_wait($netloc); |
154
|
|
|
|
|
|
|
|
155
|
11
|
100
|
|
|
|
30
|
if ($wait) { |
156
|
6
|
100
|
|
|
|
18
|
if ($self->{'use_sleep'}) { |
157
|
4
|
|
|
|
|
12000476
|
sleep($wait) |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
else { |
160
|
2
|
|
|
|
|
8
|
my $res = HTTP::Response->new( |
161
|
|
|
|
|
|
|
HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down'); |
162
|
2
|
|
|
|
|
78
|
$res->header('Retry-After', time2str(time + $wait)); |
163
|
2
|
|
|
|
|
127
|
$res->request( $request ); # bind it to that request |
164
|
2
|
|
|
|
|
18
|
return $res; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# Perform the request |
169
|
9
|
|
|
|
|
171
|
my $res = $self->SUPER::simple_request($request, $arg, $size); |
170
|
|
|
|
|
|
|
|
171
|
9
|
|
|
|
|
61
|
$self->{'rules'}->visit($netloc); |
172
|
|
|
|
|
|
|
|
173
|
9
|
|
|
|
|
174
|
$res; |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub as_string |
178
|
|
|
|
|
|
|
{ |
179
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
180
|
0
|
|
|
|
|
|
my @s; |
181
|
0
|
|
|
|
|
|
push(@s, "Robot: $self->{'agent'} operated by $self->{'from'} [$self]"); |
182
|
0
|
|
|
|
|
|
push(@s, " Minimum delay: " . int($self->{'delay'}*60) . "s"); |
183
|
0
|
0
|
|
|
|
|
push(@s, " Will sleep if too early") if $self->{'use_sleep'}; |
184
|
0
|
|
|
|
|
|
push(@s, " Rules = $self->{'rules'}"); |
185
|
0
|
|
|
|
|
|
join("\n", @s, ''); |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
1; |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
__END__ |