|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 CrawlerCommons::RobotRulesParser - parser for robots.txt files  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  use CrawlerCommons::RobotRulesParser;  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $rules_parser = CrawlerCommons::RobotRulesParser->new;  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $content = "User-agent: *\r\nDisallow: *images";  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $content_type = "text/plain";  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $robot_names = "any-old-robot";  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $url = "http://domain.com/";  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $robot_rules =  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    $rules_parser->parse_content($url, $content, $content_type, $robot_names);  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  say "We're allowed to crawl the index :)"  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   if $robot_rules->is_allowed( "https://www.domain.com/index.html");  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  say "Not allowed to crawl: $_" unless $robot_rules->is_allowed( $_ )  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    for ("http://www.domain.com/images/some_file.png",  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         "http://www.domain.com/images/another_file.png");  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This module is a fairly close reproduction of the Crawler-Commons  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L<SimpleRobotRulesParser|http://crawler-commons.github.io/crawler-commons/0.7/crawlercommons/robots/SimpleRobotRulesParser.html>  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 From BaseRobotsParser javadoc:  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Parse the robots.txt file in <i>content</i>, and return rules appropriate  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  for processing paths by <i>userAgent</i>. Note that multiple agent names  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  may be provided as comma-separated values; the order of these shouldn't  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  matter, as the file is parsed in order, and each agent name found in the  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  file will be compared to every agent name found in robotNames.  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Also note that names are lower-cased before comparison, and that any  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  robot name you pass shouldn't contain commas or spaces; if the name has  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  spaces, it will be split into multiple names, each of which will be  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  compared against agent names in the robots.txt file. An agent name is  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  considered a match if it's a prefix match on the provided robot name. For  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  example, if you pass in "Mozilla Crawlerbot-super 1.0", this would match  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  "crawlerbot" as the agent name, because of splitting on spaces,  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  lower-casing, and the prefix match rule.  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The method failedFetch is not implemented.  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###############################################################################  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package CrawlerCommons::RobotRulesParser;  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # MODULE IMPORTS  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ########################################  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Pragmas  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
59
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
185873
 | 
 use 5.10.1;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
60
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
9
 | 
 use strict;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
    | 
| 
61
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
325
 | 
 use utf8;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
17
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
62
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
42
 | 
 use warnings;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
    | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # CPAN/Core  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
66
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
234
 | 
 use Const::Fast;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1657
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
67
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
756
 | 
 use Encode qw(decode encode);  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14573
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
117
 | 
    | 
| 
68
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
267
 | 
 use Try::Tiny;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1379
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
86
 | 
    | 
| 
69
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
481
 | 
 use URI::Escape;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2391
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
123
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Moose Setup  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
73
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
666
 | 
 use Moose;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
791673
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
    | 
| 
74
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
14198
 | 
 use namespace::autoclean;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12225
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Moose Pragmas  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 with 'MooseX::Log::Log4perl';  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Custom Modules  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
82
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
750
 | 
 use CrawlerCommons::RobotDirective;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
153
 | 
    | 
| 
83
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
1088
 | 
 use CrawlerCommons::ParseState;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
96
 | 
    | 
| 
84
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
21
 | 
 use CrawlerCommons::RobotRules;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
    | 
| 
85
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
939
 | 
 use CrawlerCommons::RobotToken;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
563
 | 
    | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # VARIABLES/CONSTANTS  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ########################################  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Constants  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $DEBUG                 => $ENV{DEBUG} // 0;  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $TEST                  => $ENV{TEST} // 0;  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $BLANK_DIRECTIVE_PATTERN=> qr![ \t]+(.*)!o;  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $COLON_DIRECTIVE_PATTERN=> qr![ \t]*:[ \t]*(.*)!o;  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $MAX_CRAWL_DELAY       => 300000;  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $MAX_WARNINGS          => 5;  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $SIMPLE_HTML_PATTERN   => qr!<(?:html|head|body)\s*>!is;  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 const my $USER_AGENT_PATTERN    => qr!user-agent:!i;  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Variables  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # setup   | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 BEGIN {  | 
| 
107
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
19
 | 
     require Log::Log4perl;  | 
| 
108
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3879
 | 
     Log::Log4perl->easy_init($Log::Log4perl::ERROR)  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       unless $Log::Log4perl::Logger::INITIALIZED;  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 VERSION  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Version 0.03  | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '0.03';  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # MOOSE ATTRIBUTES  | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ########################################  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Class  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Instance  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'num_warnings'              => (  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default                     => 0,  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     handles                     => {  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         increment_warnings      => 'inc',  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     },  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is                          => 'ro',  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa                         => 'Int',  | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     traits                      => ['Counter']  | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 METHODS  | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # METHODS  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ########################################  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Construction  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Class Methods  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
156
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
158
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Instance Methods  | 
| 
159
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 C<< my $robot_rules = $rules_parser->parse_content($url, $content, $content_type, $robot_name) >>  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Parsers robots.txt data in C<$content> for the User-agent(s) specified in  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<$robot_name> returning a C<CrawlerCommons::RobotRules> object corresponding  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to the rules defined for C<$robot_name>.  | 
| 
166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over  | 
| 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * C<$url>  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 URL string that's parsed in a URI object to provide scheme, authority, and path  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 for sitemap directive values.  If the directive's value begins with a '/', it  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 overrides the path value provided by this URL context string.  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * C<$content>  | 
| 
176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The text content of the robots.txt file to be parsed.  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * C<$content_type>  | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The content-type of the robots.txt content to be parsed.  Assumes text/plain by  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 default.  If type is text/html, the parser will attempt to strip-out html tags  | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 from the content.  | 
| 
184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item * C<$robot_name>  | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 A string signifying for which user-agent(s) the rules should be extracted.  | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub parse_content {  | 
| 
193
 | 
73
 | 
 
 | 
 
 | 
  
73
  
 | 
  
0
  
 | 
1196
 | 
     my ($self, $url, $content, $content_type, $robot_name) = @_;  | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
195
 | 
73
 | 
  
100
  
 | 
  
 50
  
 | 
 
 | 
 
 | 
516
 | 
     return CrawlerCommons::RobotRules->new(  | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       _mode => $CrawlerCommons::RobotRules::ALLOW_ALL)  | 
| 
197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( ($content // '') eq '' );  | 
| 
198
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
199
 | 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
447
 | 
     my $content_len = length( $content );  | 
| 
200
 | 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
191
 | 
     my $offset = 0;  | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # handle UTF-8, UTF-16LE, UTF-16BE content  | 
| 
203
 | 
71
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
1653
 | 
     if ( ($content_len >= 3) && (substr($content, 0, 1) eq "\xEF") &&  | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          (substr($content, 1, 1) eq "\xBB") &&  | 
| 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          (substr($content, 2, 1) eq "\xBF") ) {  | 
| 
206
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
         $offset = 3;  | 
| 
207
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         $content_len -= 3;  | 
| 
208
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         $content = substr( $content, 3);  | 
| 
209
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
         $content = decode('UTF-8', $content);  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ( ($content_len >= 2) && (substr($content, 0, 1) eq "\xFF") &&  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          (substr($content, 1, 1) eq "\xFE") ) {  | 
| 
213
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
         $offset = 2;  | 
| 
214
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         $content_len -= 2;  | 
| 
215
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
         $content = substr( $content, 2);  | 
| 
216
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
         $content = decode('UTF-16LE', $content);  | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ( ($content_len >= 2) && (substr($content, 0, 1) eq "\xFE") &&  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          (substr($content, 1, 1) eq "\xFF") ) {  | 
| 
220
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         $offset = 2;  | 
| 
221
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         $content_len -= 2;  | 
| 
222
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
         $content = substr( $content, 2);  | 
| 
223
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
         $content = decode('UTF-16BE', $content);  | 
| 
224
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
225
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
226
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # set flags that trigger the stripping of '<' and '>' from content  | 
| 
227
 | 
71
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
4115
 | 
     my $is_html_type = ($content_type // '') ne '' &&  | 
| 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       lc( $content_type // '') =~ m!^text/html! ? 1 : 0;  | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
230
 | 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
207
 | 
     my $has_html = 0;  | 
| 
231
 | 
71
 | 
  
100
  
 | 
  
 50
  
 | 
 
 | 
 
 | 
1090
 | 
     if ( $is_html_type || ($content // '') =~ $SIMPLE_HTML_PATTERN ) {  | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
232
 | 
3
 | 
  
100
  
 | 
  
 50
  
 | 
 
 | 
 
 | 
27
 | 
         if ( ($content // '') !~ $USER_AGENT_PATTERN ) {  | 
| 
233
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
             $self->log->warn( "Found non-robots.txt HTML file: $url");  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
235
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
126
 | 
             return CrawlerCommons::RobotRules->new(  | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               _mode => $CrawlerCommons::RobotRules::ALLOW_ALL);  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
240
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
8
 | 
             if ( $is_html_type ) {  | 
| 
241
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $self->log->info(  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                   "HTML content type returned for robots.txt file: $url");  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             else {  | 
| 
245
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
                 $self->log->warn("Found HTML in robots.txt file: $url");  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
248
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
186
 | 
             $has_html = 1;  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
252
 | 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2941
 | 
     my $parse_state =  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       CrawlerCommons::ParseState->new(  | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         url => $url, target_name => lc($robot_name) );  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # DEBUG  | 
| 
257
 | 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
483
 | 
     $self->log->trace(Data::Dumper->Dump([$parse_state],['parse_state1']));  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
259
 | 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19495
 | 
     for my $line ( split( m!(?:\n|\r|\r\n|\x0085|\x2028|\x2029)!, $content) ) {  | 
| 
260
 | 
1835
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6038
 | 
         $self->log->trace("Input Line: [$line]\n");  | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # strip html tags  | 
| 
263
 | 
1835
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
40097
 | 
         $line =~ s!<[^>]+>!!g if $has_html;  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # trim comments  | 
| 
266
 | 
1835
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
5515
 | 
         if (my $hash_idx = index( $line, '#') ) {  | 
| 
267
 | 
1799
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3955
 | 
             $line = substr($line, 0, $hash_idx ) if $hash_idx >= 0;  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # trim whitespace  | 
| 
271
 | 
1835
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10071
 | 
         $line =~ s!^\s+|\s+$!!;  | 
| 
272
 | 
1835
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
5313
 | 
         next if length( $line ) == 0;  | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
274
 | 
1561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3869
 | 
         my $robot_token = $self->_tokenize( $line );  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
276
 | 
1561
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
50018
 | 
         do {  | 
| 
277
 | 
923
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3163
 | 
             $self->_handle_user_agent( $parse_state, $robot_token );  | 
| 
278
 | 
923
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28695
 | 
             next;  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_user_agent;  | 
| 
280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
281
 | 
638
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
18557
 | 
         do {  | 
| 
282
 | 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1833
 | 
             $self->_handle_disallow( $parse_state, $robot_token );  | 
| 
283
 | 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20396
 | 
             next;  | 
| 
284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_disallow;  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
286
 | 
144
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4385
 | 
         do {  | 
| 
287
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
291
 | 
             $self->_handle_allow( $parse_state, $robot_token );  | 
| 
288
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2216
 | 
             next;  | 
| 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_allow;  | 
| 
290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
291
 | 
87
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2731
 | 
         do {  | 
| 
292
 | 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
76
 | 
             $self->_handle_crawl_delay( $parse_state, $robot_token );  | 
| 
293
 | 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
720
 | 
             next;  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_crawl_delay;  | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
296
 | 
70
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2123
 | 
         do {  | 
| 
297
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
104
 | 
             $self->_handle_sitemap( $parse_state, $robot_token );  | 
| 
298
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1094
 | 
             next;  | 
| 
299
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_sitemap;  | 
| 
300
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
301
 | 
51
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1554
 | 
         do {  | 
| 
302
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
             $self->_handle_http( $parse_state, $robot_token );  | 
| 
303
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
             next;  | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_http;  | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
306
 | 
50
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1491
 | 
         do {  | 
| 
307
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
166
 | 
             $self->_report_warning(  | 
| 
308
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               sprintf(  | 
| 
309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 "Unknown line in robots.txt file (size %d): %s",  | 
| 
310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 length( $content ),  | 
| 
311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $line  | 
| 
312
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               ),  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               $url  | 
| 
314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             );  | 
| 
315
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
784
 | 
             $parse_state->is_finished_agent_fields( 1 );  | 
| 
316
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
442
 | 
             next;  | 
| 
317
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_missing;  | 
| 
318
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
319
 | 
37
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1058
 | 
         do {  | 
| 
320
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
241
 | 
             $self->_report_warning(  | 
| 
321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               sprintf(  | 
| 
322
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 "Unknown directive in robots.txt file: %s",  | 
| 
323
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $line  | 
| 
324
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               ),  | 
| 
325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               $url  | 
| 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             );  | 
| 
327
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1302
 | 
             $parse_state->is_finished_agent_fields( 1 );  | 
| 
328
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
911
 | 
             next;  | 
| 
329
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } if $robot_token->directive->is_unknown;  | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
331
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
332
 | 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
820
 | 
     $self->log->trace(Data::Dumper->Dump([$parse_state],['parse_state2']));  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
334
 | 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14965
 | 
     my $robot_rules = $parse_state->current_rules();  | 
| 
335
 | 
70
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2474
 | 
     if ( $robot_rules->crawl_delay > $MAX_CRAWL_DELAY ) {  | 
| 
336
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
         return CrawlerCommons::RobotRules->new(  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           _mode => $CrawlerCommons::RobotRules::ALLOW_NONE );  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
340
 | 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
390
 | 
         $robot_rules->sort_rules;  | 
| 
341
 | 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2513
 | 
         return $robot_rules;  | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Private Methods  | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #------------------#  | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
349
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _handle_allow_or_disallow {  | 
| 
350
 | 
551
 | 
 
 | 
 
 | 
  
551
  
 | 
 
 | 
1425
 | 
     my ($self, $state, $token, $allow_or_disallow ) = @_;  | 
| 
351
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
352
 | 
551
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1634
 | 
     $self->log->trace(Data::Dumper->Dump([\@_],['_handle_allow_or_disallow']));  | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
354
 | 
551
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
253741
 | 
     return if $state->is_skip_agents;  | 
| 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
356
 | 
413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13669
 | 
     $state->is_finished_agent_fields( 1 );  | 
| 
357
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
358
 | 
413
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
12683
 | 
     return unless $state->is_adding_rules;  | 
| 
359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
360
 | 
265
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
7765
 | 
     my $path = $token->data // '';  | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     try {  | 
| 
362
 | 
265
 | 
 
 | 
 
 | 
  
265
  
 | 
 
 | 
21846
 | 
         $path = uri_unescape( $path );  | 
| 
363
 | 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3003
 | 
         utf8::encode( $path );  | 
| 
364
 | 
265
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
793
 | 
         if ( length( $path ) == 0 ) {  | 
| 
365
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
66
 | 
             $state->clear_rules;  | 
| 
366
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
367
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
368
 | 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1003
 | 
             $state->add_rule( $path, $allow_or_disallow );  | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
370
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
371
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     catch {  | 
| 
372
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
         $self->_report_warning(  | 
| 
373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           "Error parsing robot rules - can't decode path: $path\n$_",  | 
| 
374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $state->url  | 
| 
375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
376
 | 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2838
 | 
     };  | 
| 
377
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
379
 | 
57
 | 
 
 | 
 
 | 
  
57
  
 | 
 
 | 
218
 | 
 sub _handle_allow { shift->_handle_allow_or_disallow( @_, 1 ); }  | 
| 
380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
381
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _handle_crawl_delay {  | 
| 
382
 | 
17
 | 
 
 | 
 
 | 
  
17
  
 | 
 
 | 
48
 | 
     my ($self, $state, $token) = @_;  | 
| 
383
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
384
 | 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
69
 | 
     $self->log->trace(Data::Dumper->Dump([$state, $token],['state','token']));  | 
| 
385
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
386
 | 
17
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3341
 | 
     return if $state->is_skip_agents;  | 
| 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
388
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
432
 | 
     $state->is_finished_agent_fields( 1 );  | 
| 
389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
390
 | 
12
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
398
 | 
     return unless $state->is_adding_rules;  | 
| 
391
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
392
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
286
 | 
     my $delay = $token->data;  | 
| 
393
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     try {  | 
| 
394
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
782
 | 
         my $delay_ms = $delay * 1000;  | 
| 
395
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
         $state->set_crawl_delay( $delay_ms );  | 
| 
396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     catch {  | 
| 
398
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
         $self->_report_warning(  | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             "Error parsing robot rules - can't decode crawl delay: $delay",  | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $state->url  | 
| 
401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
402
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
110
 | 
     };  | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
405
 | 
494
 | 
 
 | 
 
 | 
  
494
  
 | 
 
 | 
1457
 | 
 sub _handle_disallow { shift->_handle_allow_or_disallow( @_, 0 ); }  | 
| 
406
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
407
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _handle_http {  | 
| 
408
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
5
 | 
     my ($self, $state, $token) = @_;  | 
| 
409
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
26
 | 
     my $url_fragment = $token->data;  | 
| 
410
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     if ( index( $url_fragment, 'sitemap' ) ) {  | 
| 
411
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
         my $fixed_token = CrawlerCommons::RobotToken->new(  | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             data        => 'http:' . $url_fragment,  | 
| 
413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             directive   =>  | 
| 
414
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             CrawlerCommons::RobotDirective  | 
| 
415
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
              ->get_directive('sitemap'),  | 
| 
416
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
417
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         $self->_handle_sitemap( $state, $fixed_token );  | 
| 
418
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
419
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
420
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $self->_report_warning(  | 
| 
421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           "Fournd raw non-sitemap URL: http:$url_fragment", $state->url);  | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
424
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
425
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
# Handle a "Sitemap:" directive: resolve the (possibly relative) sitemap
# value against the robots.txt URL and record the absolute URL on the
# parse state.  Invalid URLs are reported as warnings, not fatal errors.
sub _handle_sitemap {
    my ($self, $state, $token) = @_;
    my $sitemap = $token->data;
    try {
        # Resolve relative sitemap references against the robots.txt URL.
        my $sitemap_url = URI->new_abs( $sitemap, URI->new( $state->url ) );
        # NOTE(review): host() presumably dies for non-hierarchical URI
        # schemes, which the surrounding try/catch absorbs — confirm.
        my $host = $sitemap_url->host() // '';

        $self->log->trace(<<"DUMP");
# _handle_sitemap
###################
sitemap     $sitemap
sitemap_url $sitemap_url
host        $host
url         ${\$state->url}
DUMP

        # Only keep sitemaps whose resolution produced a real host.
        $state->add_sitemap( $sitemap_url->as_string ) if ( $host ne '' );
    }
    catch {
        $self->_report_warning( "Invalid URL with sitemap directive: $sitemap",
                                $state->url );
    };
}
| 
448
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
# Handle a "User-agent:" directive.  Decides whether the rules that
# follow apply to us by matching the directive's agent names against the
# crawler's target name(s).  A match on the real name takes precedence
# over a wildcard ('*') match; once the real name has matched, later
# agent blocks are skipped entirely.
sub _handle_user_agent {
    my ($self, $state, $token) = @_;
    # Already matched our real name: ignore further agent lines, and if a
    # rule section has started, skip all subsequent agent blocks.
    if ( $state->is_matched_real_name ) {
        $state->is_skip_agents( 1 ) if $state->is_finished_agent_fields;
        return;
    }

    # A fresh user-agent line after a rule section starts a new record:
    # reset the per-record flags.
    if ( $state->is_finished_agent_fields ) {
        $state->is_finished_agent_fields( 0 );
        $state->is_adding_rules( 0 );
    }

    # Target names may be a comma-separated list; agent names in the
    # directive may be separated by spaces, tabs, or commas.
    for my $target_name ( split(/,/, lc( $state->target_name ) ) ) {
         for my $agent_name ( split( m! |\t|,!, $token->data ) ) {
             # Normalize: lowercase and strip surrounding whitespace.
             ( $agent_name = lc( $agent_name // '' ) ) =~ s!^\s+|\s+$!!g;

            if ( $agent_name eq '*' && !$state->is_matched_wildcard ) {
                # First wildcard match: start collecting rules, but keep
                # looking for a real-name match, which overrides it.
                $state->is_matched_wildcard( 1 );
                $state->is_adding_rules( 1 );
            }
            elsif ($agent_name ne '') {
                for my $target_name_split ( split(/ /, $target_name) ) {
                    # Prefix match: agent name must start the target name.
                    if (index( $target_name_split, $agent_name ) == 0 ) {
                        $state->is_matched_real_name( 1 );
                        $state->is_adding_rules( 1 );
                        # Drop any rules collected under a wildcard block.
                        $state->clear_rules;
                        last;
                    }
                }
            }
         }
    }
}
| 
482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
483
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _report_warning {  | 
| 
484
 | 
39
 | 
 
 | 
 
 | 
  
39
  
 | 
 
 | 
160
 | 
     my ($self, $msg, $url) = @_;  | 
| 
485
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1971
 | 
     $self->increment_warnings;  | 
| 
486
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
487
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1266
 | 
     my $warning_count = $self->num_warnings;  | 
| 
488
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
187
 | 
     $self->log->warn("Problem processing robots.txt for $url")  | 
| 
489
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       if $warning_count == 1;  | 
| 
490
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
491
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
670
 | 
     $self->log->warn( $msg ) if $warning_count <  $MAX_WARNINGS;  | 
| 
492
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
493
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _tokenize {  | 
| 
495
 | 
1561
 | 
 
 | 
 
 | 
  
1561
  
 | 
 
 | 
3466
 | 
     my ($self, $line) = @_;  | 
| 
496
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
497
 | 
1561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3961
 | 
     $self->log->trace("Parsing line: [$line]");  | 
| 
498
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
499
 | 
1561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29918
 | 
     my $lower_line = lc( $line );  | 
| 
500
 | 
1561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6144
 | 
     my ($directive) = ($lower_line =~ m!^([^:\s]+)!);  | 
| 
501
 | 
1561
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
3625
 | 
     $directive //= '';  | 
| 
502
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
503
 | 
1561
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
66170
 | 
     if ( $directive =~ m!^acap\-! ||  | 
| 
504
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
          CrawlerCommons::RobotDirective->directive_exists( $directive ) ){  | 
| 
505
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
506
 | 
1522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5307
 | 
         my $data_portion = substr($line, length( $directive ));  | 
| 
507
 | 
1522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9689
 | 
         ( my $data ) = ( $data_portion =~ m!$COLON_DIRECTIVE_PATTERN! );  | 
| 
508
 | 
1522
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4370
 | 
         ( $data ) = ( $data_portion =~ m!$BLANK_DIRECTIVE_PATTERN! )  | 
| 
509
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           unless defined $data;  | 
| 
510
 | 
1522
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
3007
 | 
         $data //= '';  | 
| 
511
 | 
1522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5836
 | 
         $data =~ s!^\s+|\s+$!!;  | 
| 
512
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
513
 | 
1522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4839
 | 
         $self->log->trace(<<"DUMP");  | 
| 
514
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # _tokenize dump  | 
| 
515
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #################  | 
| 
516
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 line            [$line]  | 
| 
517
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 directive       [$directive]  | 
| 
518
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 data_portion    [$data_portion]  | 
| 
519
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 data            [$data]  | 
| 
520
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 DUMP  | 
| 
521
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
522
 | 
1522
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
91990
 | 
         my $robot_directive =  | 
| 
523
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           CrawlerCommons::RobotDirective->get_directive(  | 
| 
524
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $directive =~ m!^acap-!i ? 'acap-' : $directive );    | 
| 
525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
526
 | 
1522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45419
 | 
         return CrawlerCommons::RobotToken->new(  | 
| 
527
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           data => $data, directive => $robot_directive  | 
| 
528
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
529
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
530
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
531
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1864
 | 
         my $robot_directive =  | 
| 
532
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         CrawlerCommons::RobotDirective->get_directive(  | 
| 
533
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $lower_line =~ m![ \t]*:[ \t]*(.*)! ? 'unknown' : 'missing' );  | 
| 
534
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
535
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1339
 | 
         return CrawlerCommons::RobotToken->new(  | 
| 
536
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           data => $line, directive => $robot_directive  | 
| 
537
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );   | 
| 
538
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
539
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
540
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #-----------------------------------------------------------------------------#  | 
| 
541
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###############################################################################  | 
| 
543
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
544
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __PACKAGE__->meta->make_immutable;  | 
| 
545
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
546
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###############################################################################  | 
| 
547
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
548
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 AUTHOR  | 
| 
549
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
550
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Adam K Robinson <akrobinson74@gmail.com>  | 
| 
551
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
552
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 COPYRIGHT AND LICENSE  | 
| 
553
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
554
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This software is copyright (c) 2017 by Adam K Robinson.  | 
| 
555
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself.  | 
| 
557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
559
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
560
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
562
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |