=head1 NAME

CrawlerCommons::RobotRulesParser - parser for robots.txt files

=head1 SYNOPSIS

 use CrawlerCommons::RobotRulesParser;

 my $rules_parser = CrawlerCommons::RobotRulesParser->new;

 my $content = "User-agent: *\r\nDisallow: *images";
 my $content_type = "text/plain";
 my $robot_names = "any-old-robot";
 my $url = "http://domain.com/";

 my $robot_rules =
   $rules_parser->parse_content($url, $content, $content_type, $robot_names);

 say "We're allowed to crawl the index :)"
   if $robot_rules->is_allowed( "https://www.domain.com/index.html");

 for ("http://www.domain.com/images/some_file.png",
      "http://www.domain.com/images/another_file.png") {
     say "Not allowed to crawl: $_" unless $robot_rules->is_allowed( $_ );
 }

=head1 DESCRIPTION

This module is a fairly close reproduction of the Crawler-Commons
L<SimpleRobotRulesParser|http://crawler-commons.github.io/crawler-commons/0.7/crawlercommons/robots/SimpleRobotRulesParser.html>.

From BaseRobotsParser javadoc:

Parse the robots.txt file in I<content>, and return rules appropriate
for processing paths by I<userAgent>. Note that multiple agent names
may be provided as comma-separated values; the order of these shouldn't
matter, as the file is parsed in order, and each agent name found in the
file will be compared to every agent name found in robotNames.
Also note that names are lower-cased before comparison, and that any
robot name you pass shouldn't contain commas or spaces; if the name has
spaces, it will be split into multiple names, each of which will be
compared against agent names in the robots.txt file. An agent name is
considered a match if it's a prefix match on the provided robot name. For
example, if you pass in "Mozilla Crawlerbot-super 1.0", this would match
"crawlerbot" as the agent name, because of splitting on spaces,
lower-casing, and the prefix match rule.
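
For instance (a hypothetical snippet; C<$rules_parser> and C<$url> are as in
the SYNOPSIS), the splitting and prefix-match rules play out like this:

 my $txt = "User-agent: crawlerbot\r\nDisallow: /private";

 # "Mozilla Crawlerbot-super 1.0" splits into ("mozilla",
 # "crawlerbot-super", "1.0"), and "crawlerbot-super" begins with
 # "crawlerbot", so the record above is matched.
 my $rules = $rules_parser->parse_content(
     $url, $txt, "text/plain", "Mozilla Crawlerbot-super 1.0" );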

The method failedFetch is not implemented.

=cut

###############################################################################
package CrawlerCommons::RobotRulesParser;


# MODULE IMPORTS
########################################
# Pragmas
#------------------#
use 5.10.1;
use strict;
use utf8;
use warnings;

# CPAN/Core
#------------------#
use Const::Fast;
use Encode qw(decode encode);
use Try::Tiny;
use URI::Escape;

# Moose Setup
#------------------#
use Moose;
use namespace::autoclean;

# Moose Pragmas
#------------------#
with 'MooseX::Log::Log4perl';

# Custom Modules
#------------------#
use CrawlerCommons::RobotDirective;
use CrawlerCommons::ParseState;
use CrawlerCommons::RobotRules;
use CrawlerCommons::RobotToken;


# VARIABLES/CONSTANTS
########################################
# Constants
#------------------#
const my $DEBUG => $ENV{DEBUG} // 0;
const my $TEST  => $ENV{TEST}  // 0;

const my $BLANK_DIRECTIVE_PATTERN => qr![ \t]+(.*)!o;
const my $COLON_DIRECTIVE_PATTERN => qr![ \t]*:[ \t]*(.*)!o;

const my $MAX_CRAWL_DELAY     => 300000;
const my $MAX_WARNINGS        => 5;
const my $SIMPLE_HTML_PATTERN => qr!<(?:html|head|body)\s*>!is;
const my $USER_AGENT_PATTERN  => qr!user-agent:!i;
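
# Note that $MAX_CRAWL_DELAY is expressed in milliseconds: Crawl-delay values
# are parsed as seconds and multiplied by 1000 in _handle_crawl_delay, so any
# delay over 300 seconds causes parse_content to return an ALLOW_NONE rule set.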

# Variables
#------------------#

# setup
BEGIN {
    require Log::Log4perl;
    Log::Log4perl->easy_init($Log::Log4perl::ERROR)
        unless $Log::Log4perl::Logger::INITIALIZED;
}

=head1 VERSION

Version 0.03

=cut

our $VERSION = '0.03';



# MOOSE ATTRIBUTES
########################################
# Class
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#

# Instance
#-----------------------------------------------------------------------------#
has 'num_warnings' => (
    default => 0,
    handles => {
        increment_warnings => 'inc',
    },
    is      => 'ro',
    isa     => 'Int',
    traits  => ['Counter']
);
#-----------------------------------------------------------------------------#


=head1 METHODS

=cut

# METHODS
########################################
# Construction
#------------------#
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#

# Class Methods
#------------------#
#-----------------------------------------------------------------------------#
#-----------------------------------------------------------------------------#

# Instance Methods
#------------------#
#-----------------------------------------------------------------------------#
=head2 C<< my $robot_rules = $rules_parser->parse_content($url, $content, $content_type, $robot_name) >>

Parses robots.txt data in C<$content> for the User-agent(s) specified in
C<$robot_name>, returning a C<CrawlerCommons::RobotRules> object corresponding
to the rules defined for C<$robot_name>.

=over

=item * C<$url>

URL string that is parsed into a URI object to provide the scheme, authority,
and path context for sitemap directive values. If a directive's value begins
with a '/', it overrides the path value provided by this URL context string.

=item * C<$content>

The text content of the robots.txt file to be parsed.

=item * C<$content_type>

The content-type of the robots.txt content to be parsed. Assumes text/plain by
default. If the type is text/html, the parser will attempt to strip out HTML
tags from the content.

=item * C<$robot_name>

A string signifying for which user-agent(s) the rules should be extracted.

=back
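
For example (an illustrative sketch; the HTML content below is hypothetical),
a robots.txt file served as text/html is still parsed, with its tags stripped,
as long as it contains a User-agent line:

 my $html = "<html><body>\nUser-agent: *\nDisallow: /private\n</body></html>";
 my $rules = $rules_parser->parse_content(
     "http://domain.com/robots.txt", $html, "text/html", "any-old-robot" );

 # tags are stripped line by line before parsing, so this is false:
 $rules->is_allowed( "http://domain.com/private" );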

=cut
sub parse_content {
    my ($self, $url, $content, $content_type, $robot_name) = @_;

    return CrawlerCommons::RobotRules->new(
        _mode => $CrawlerCommons::RobotRules::ALLOW_ALL)
            if ( ($content // '') eq '' );

    my $content_len = length( $content );
    my $offset = 0;

    # handle UTF-8, UTF-16LE, UTF-16BE content
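    # by checking for the standard byte-order marks:
    #   UTF-8:    0xEF 0xBB 0xBF
    #   UTF-16LE: 0xFF 0xFE
    #   UTF-16BE: 0xFE 0xFF
    # When one is found, the BOM is removed and the content decoded.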
    if ( ($content_len >= 3) && (substr($content, 0, 1) eq "\xEF") &&
         (substr($content, 1, 1) eq "\xBB") &&
         (substr($content, 2, 1) eq "\xBF") ) {
        $offset = 3;
        $content_len -= 3;
        $content = substr( $content, 3);
        $content = decode('UTF-8', $content);
    }
    elsif ( ($content_len >= 2) && (substr($content, 0, 1) eq "\xFF") &&
            (substr($content, 1, 1) eq "\xFE") ) {
        $offset = 2;
        $content_len -= 2;
        $content = substr( $content, 2);
        $content = decode('UTF-16LE', $content);
    }
    elsif ( ($content_len >= 2) && (substr($content, 0, 1) eq "\xFE") &&
            (substr($content, 1, 1) eq "\xFF") ) {
        $offset = 2;
        $content_len -= 2;
        $content = substr( $content, 2);
        $content = decode('UTF-16BE', $content);
    }

    # set flags that trigger the stripping of '<' and '>' from content
    my $is_html_type = ($content_type // '') ne '' &&
        lc( $content_type // '') =~ m!^text/html! ? 1 : 0;

    my $has_html = 0;
    if ( $is_html_type || ($content // '') =~ $SIMPLE_HTML_PATTERN ) {
        if ( ($content // '') !~ $USER_AGENT_PATTERN ) {
            $self->log->warn( "Found non-robots.txt HTML file: $url");

            return CrawlerCommons::RobotRules->new(
                _mode => $CrawlerCommons::RobotRules::ALLOW_ALL);
        }

        else {
            if ( $is_html_type ) {
                $self->log->info(
                    "HTML content type returned for robots.txt file: $url");
            }
            else {
                $self->log->warn("Found HTML in robots.txt file: $url");
            }

            $has_html = 1;
        }
    }

    my $parse_state =
        CrawlerCommons::ParseState->new(
            url => $url, target_name => lc($robot_name) );

    # DEBUG
    $self->log->trace(Data::Dumper->Dump([$parse_state],['parse_state1']));

    for my $line ( split( m!(?:\n|\r|\r\n|\x0085|\x2028|\x2029)!, $content) ) {
        $self->log->trace("Input Line: [$line]\n");

        # strip html tags
        $line =~ s!<[^>]+>!!g if $has_html;

        # trim comments (a '#' at position 0 comments out the whole line)
        my $hash_idx = index( $line, '#');
        $line = substr($line, 0, $hash_idx ) if $hash_idx >= 0;

        # trim whitespace
        $line =~ s!^\s+|\s+$!!g;
        next if length( $line ) == 0;

        my $robot_token = $self->_tokenize( $line );

        do {
            $self->_handle_user_agent( $parse_state, $robot_token );
            next;
        } if $robot_token->directive->is_user_agent;

        do {
            $self->_handle_disallow( $parse_state, $robot_token );
            next;
        } if $robot_token->directive->is_disallow;

        do {
            $self->_handle_allow( $parse_state, $robot_token );
            next;
        } if $robot_token->directive->is_allow;

        do {
            $self->_handle_crawl_delay( $parse_state, $robot_token );
            next;
        } if $robot_token->directive->is_crawl_delay;

        do {
            $self->_handle_sitemap( $parse_state, $robot_token );
            next;
        } if $robot_token->directive->is_sitemap;

        do {
            $self->_handle_http( $parse_state, $robot_token );
            next;
        } if $robot_token->directive->is_http;

        do {
            $self->_report_warning(
                sprintf(
                    "Unknown line in robots.txt file (size %d): %s",
                    length( $content ),
                    $line
                ),
                $url
            );
            $parse_state->is_finished_agent_fields( 1 );
            next;
        } if $robot_token->directive->is_missing;

        do {
            $self->_report_warning(
                sprintf(
                    "Unknown directive in robots.txt file: %s",
                    $line
                ),
                $url
            );
            $parse_state->is_finished_agent_fields( 1 );
            next;
        } if $robot_token->directive->is_unknown;
    }

    $self->log->trace(Data::Dumper->Dump([$parse_state],['parse_state2']));

    my $robot_rules = $parse_state->current_rules();
    if ( $robot_rules->crawl_delay > $MAX_CRAWL_DELAY ) {
        return CrawlerCommons::RobotRules->new(
            _mode => $CrawlerCommons::RobotRules::ALLOW_NONE );
    }
    else {
        $robot_rules->sort_rules;
        return $robot_rules;
    }
}
#-----------------------------------------------------------------------------#

# Private Methods
#------------------#
#-----------------------------------------------------------------------------#
sub _handle_allow_or_disallow {
    my ($self, $state, $token, $allow_or_disallow ) = @_;

    $self->log->trace(Data::Dumper->Dump([\@_],['_handle_allow_or_disallow']));

    return if $state->is_skip_agents;

    $state->is_finished_agent_fields( 1 );

    return unless $state->is_adding_rules;

    my $path = $token->data // '';
    try {
        $path = uri_unescape( $path );
        utf8::encode( $path );
        if ( length( $path ) == 0 ) {
            $state->clear_rules;
        }
        else {
            $state->add_rule( $path, $allow_or_disallow );
        }
    }
    catch {
        $self->_report_warning(
            "Error parsing robot rules - can't decode path: $path\n$_",
            $state->url
        );
    };
}
#-----------------------------------------------------------------------------#
sub _handle_allow { shift->_handle_allow_or_disallow( @_, 1 ); }
#-----------------------------------------------------------------------------#
sub _handle_crawl_delay {
    my ($self, $state, $token) = @_;

    $self->log->trace(Data::Dumper->Dump([$state, $token],['state','token']));

    return if $state->is_skip_agents;

    $state->is_finished_agent_fields( 1 );

    return unless $state->is_adding_rules;

    my $delay = $token->data;
    try {
        my $delay_ms = $delay * 1000;
        $state->set_crawl_delay( $delay_ms );
    }
    catch {
        $self->_report_warning(
            "Error parsing robot rules - can't decode crawl delay: $delay",
            $state->url
        );
    };
}
#-----------------------------------------------------------------------------#
sub _handle_disallow { shift->_handle_allow_or_disallow( @_, 0 ); }
#-----------------------------------------------------------------------------#
sub _handle_http {
    my ($self, $state, $token) = @_;
    my $url_fragment = $token->data;
    if ( index( $url_fragment, 'sitemap' ) >= 0 ) {
        my $fixed_token = CrawlerCommons::RobotToken->new(
            data => 'http:' . $url_fragment,
            directive =>
                CrawlerCommons::RobotDirective
                    ->get_directive('sitemap'),
        );
        $self->_handle_sitemap( $state, $fixed_token );
    }
    else {
        $self->_report_warning(
            "Found raw non-sitemap URL: http:$url_fragment", $state->url);
    }
}
#-----------------------------------------------------------------------------#
sub _handle_sitemap {
    my ($self, $state, $token) = @_;
    my $sitemap = $token->data;
    try {
        my $sitemap_url = URI->new_abs( $sitemap, URI->new( $state->url ) );
        my $host = $sitemap_url->host() // '';

        $self->log->trace(<<"DUMP");
# _handle_sitemap
###################
sitemap      $sitemap
sitemap_url  $sitemap_url
host         $host
url          ${\$state->url}
DUMP

        $state->add_sitemap( $sitemap_url->as_string ) if ( $host ne '' );
    }
    catch {
        $self->_report_warning( "Invalid URL with sitemap directive: $sitemap",
            $state->url );
    };
}
#-----------------------------------------------------------------------------#
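# Matching summary for the method below: the target name passed to
# parse_content is lower-cased and split on commas; each agent name on a
# User-agent line is split on spaces, tabs, and commas; a non-wildcard agent
# name matches when it is a prefix of one of the space-separated pieces of a
# target name. A real-name match clears rules collected so far (e.g. under a
# wildcard '*' record), and later User-agent records are skipped once the
# matching record's fields are complete.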
sub _handle_user_agent {
    my ($self, $state, $token) = @_;
    if ( $state->is_matched_real_name ) {
        $state->is_skip_agents( 1 ) if $state->is_finished_agent_fields;
        return;
    }

    if ( $state->is_finished_agent_fields ) {
        $state->is_finished_agent_fields( 0 );
        $state->is_adding_rules( 0 );
    }

    for my $target_name ( split(/,/, lc( $state->target_name ) ) ) {
        for my $agent_name ( split( m! |\t|,!, $token->data ) ) {
            ( $agent_name = lc( $agent_name // '' ) ) =~ s!^\s+|\s+$!!g;

            if ( $agent_name eq '*' && !$state->is_matched_wildcard ) {
                $state->is_matched_wildcard( 1 );
                $state->is_adding_rules( 1 );
            }
            elsif ($agent_name ne '') {
                for my $target_name_split ( split(/ /, $target_name) ) {
                    if (index( $target_name_split, $agent_name ) == 0 ) {
                        $state->is_matched_real_name( 1 );
                        $state->is_adding_rules( 1 );
                        $state->clear_rules;
                        last;
                    }
                }
            }
        }
    }
}
#-----------------------------------------------------------------------------#
sub _report_warning {
    my ($self, $msg, $url) = @_;
    $self->increment_warnings;

    my $warning_count = $self->num_warnings;
    $self->log->warn("Problem processing robots.txt for $url")
        if $warning_count == 1;

    $self->log->warn( $msg ) if $warning_count < $MAX_WARNINGS;
}
#-----------------------------------------------------------------------------#
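# Tokenizing summary for the method below: the leading run of characters that
# are neither whitespace nor ':' is taken as the directive name; the rest of
# the line is matched against $COLON_DIRECTIVE_PATTERN and then, failing that,
# $BLANK_DIRECTIVE_PATTERN, so both "Disallow: /path" and "Disallow /path"
# forms are accepted. Unrecognized directives tokenize as 'unknown' when the
# line contains a colon, otherwise as 'missing'.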
sub _tokenize {
    my ($self, $line) = @_;

    $self->log->trace("Parsing line: [$line]");

    my $lower_line = lc( $line );
    my ($directive) = ($lower_line =~ m!^([^:\s]+)!);
    $directive //= '';

    if ( $directive =~ m!^acap\-! ||
         CrawlerCommons::RobotDirective->directive_exists( $directive ) ){

        my $data_portion = substr($line, length( $directive ));
        ( my $data ) = ( $data_portion =~ m!$COLON_DIRECTIVE_PATTERN! );
        ( $data ) = ( $data_portion =~ m!$BLANK_DIRECTIVE_PATTERN! )
            unless defined $data;
        $data //= '';
        $data =~ s!^\s+|\s+$!!g;

        $self->log->trace(<<"DUMP");
# _tokenize dump
#################
line         [$line]
directive    [$directive]
data_portion [$data_portion]
data         [$data]
DUMP

        my $robot_directive =
            CrawlerCommons::RobotDirective->get_directive(
                $directive =~ m!^acap-!i ? 'acap-' : $directive );

        return CrawlerCommons::RobotToken->new(
            data => $data, directive => $robot_directive
        );
    }
    else {
        my $robot_directive =
            CrawlerCommons::RobotDirective->get_directive(
                $lower_line =~ m![ \t]*:[ \t]*(.*)! ? 'unknown' : 'missing' );

        return CrawlerCommons::RobotToken->new(
            data => $line, directive => $robot_directive
        );
    }
}
#-----------------------------------------------------------------------------#


###############################################################################

__PACKAGE__->meta->make_immutable;

###############################################################################

=head1 AUTHOR

Adam K Robinson <akrobinson74@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2017 by Adam K Robinson.

This is free software; you can redistribute it and/or modify it under the same
terms as the Perl 5 programming language system itself.

=cut

1;

__END__