| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | =head1 NAME | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | CrawlerCommons::RobotRules - the result of a parsed robots.txt | 
| 4 |  |  |  |  |  |  |  | 
| 5 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | use CrawlerCommons::RobotRules; | 
| 8 |  |  |  |  |  |  | use CrawlerCommons::RobotRulesParser; | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | my $rules_parser = CrawlerCommons::RobotRulesParser->new; | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | my $content = "User-agent: *\r\nDisallow: *images"; | 
| 13 |  |  |  |  |  |  | my $content_type = "text/plain"; | 
| 14 |  |  |  |  |  |  | my $robot_names = "any-old-robot"; | 
| 15 |  |  |  |  |  |  | my $url = "http://domain.com/"; | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | my $robot_rules = | 
| 18 |  |  |  |  |  |  | $rules_parser->parse_content($url, $content, $content_type, $robot_names); | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | # obtain the 'mode' of the robot rules object | 
| 21 |  |  |  |  |  |  | say "Anything Goes!!!!" if $robot_rules->is_allow_all; | 
| 22 |  |  |  |  |  |  | say "Nothing to see here!" if $robot_rules->is_allow_none; | 
| 23 |  |  |  |  |  |  | say "Default robot rules mode..." if $robot_rules->is_allow_some; | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | # are we allowed to crawl a URL (returns 1 if so, 0 if not) | 
| 26 |  |  |  |  |  |  | say "We're allowed to crawl the index :)" | 
| 27 |  |  |  |  |  |  | if $robot_rules->is_allowed( "https://www.domain.com/index.html"); | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | say "Not allowed to crawl: $_" unless $robot_rules->is_allowed( $_ ) | 
| 30 |  |  |  |  |  |  | for ("http://www.domain.com/images/some_file.png", | 
| 31 |  |  |  |  |  |  | "http://www.domain.com/images/another_file.png"); | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | This object is the result of parsing a single robots.txt file | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =cut | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | ############################################################################### | 
| 40 |  |  |  |  |  |  | package CrawlerCommons::RobotRules; | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | # MODULE IMPORTS | 
| 43 |  |  |  |  |  |  | ######################################## | 
| 44 |  |  |  |  |  |  | # Pragmas | 
| 45 |  |  |  |  |  |  | #------------------# | 
| 46 | 2 |  |  | 2 |  | 35 | use 5.10.1; | 
|  | 2 |  |  |  |  | 8 |  | 
| 47 | 2 |  |  | 2 |  | 12 | use strict; | 
|  | 2 |  |  |  |  | 6 |  | 
|  | 2 |  |  |  |  | 44 |  | 
| 48 | 2 |  |  | 2 |  | 15 | use utf8; | 
|  | 2 |  |  |  |  | 6 |  | 
|  | 2 |  |  |  |  | 15 |  | 
| 49 | 2 |  |  | 2 |  | 61 | use warnings; | 
|  | 2 |  |  |  |  | 6 |  | 
|  | 2 |  |  |  |  | 77 |  | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | # CPAN/Core | 
| 52 |  |  |  |  |  |  | #------------------# | 
| 53 | 2 |  |  | 2 |  | 13 | use Const::Fast; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 22 |  | 
| 54 | 2 |  |  | 2 |  | 141 | use Try::Tiny; | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 99 |  | 
| 55 | 2 |  |  | 2 |  | 969 | use URI; | 
|  | 2 |  |  |  |  | 5461 |  | 
|  | 2 |  |  |  |  | 65 |  | 
| 56 | 2 |  |  | 2 |  | 15 | use URI::Escape; | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 111 |  | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | # Moose Setup | 
| 59 |  |  |  |  |  |  | #------------------# | 
| 60 | 2 |  |  | 2 |  | 13 | use Moose; | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 17 |  | 
| 61 | 2 |  |  | 2 |  | 17516 | use namespace::autoclean; | 
|  | 2 |  |  |  |  | 6 |  | 
|  | 2 |  |  |  |  | 25 |  | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | # Moose Pragmas | 
| 64 |  |  |  |  |  |  | #------------------# | 
| 65 |  |  |  |  |  |  | with 'MooseX::Log::Log4perl'; | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | # Custom Modules | 
| 68 |  |  |  |  |  |  | #------------------# | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | # VARIABLES/CONSTANTS | 
| 72 |  |  |  |  |  |  | ######################################## | 
| 73 |  |  |  |  |  |  | # Debug Constants | 
| 74 |  |  |  |  |  |  | #------------------# | 
| 75 |  |  |  |  |  |  | const my $DEBUG                 => $ENV{DEBUG} // 0; | 
| 76 |  |  |  |  |  |  | const my $TEST                  => $ENV{TEST} // 1; | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | const our $ALLOW_ALL            => 'allow_all'; | 
| 79 |  |  |  |  |  |  | const our $ALLOW_NONE           => 'allow_none'; | 
| 80 |  |  |  |  |  |  | const our $ALLOW_SOME           => 'allow_some'; | 
| 81 |  |  |  |  |  |  | const my $ROBOT_RULES_MODES     => | 
| 82 |  |  |  |  |  |  | ["$ALLOW_ALL", "$ALLOW_NONE", "$ALLOW_SOME"]; | 
| 83 |  |  |  |  |  |  | const our $UNSET_CRAWL_DELAY    => 0xffffffff * -1; | 
| 84 |  |  |  |  |  |  |  | 
| 85 |  |  |  |  |  |  | # Constants | 
| 86 |  |  |  |  |  |  | #------------------# | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | # Variables | 
| 89 |  |  |  |  |  |  | #------------------# | 
| 90 |  |  |  |  |  |  | =head1 VERSION | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | Version 0.03 | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | =cut | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | our $VERSION = '0.03'; | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | # setup logging, if not present | 
| 99 |  |  |  |  |  |  | BEGIN { | 
| 100 | 2 |  |  | 2 |  | 1724 | require Log::Log4perl; | 
| 101 | 2 | 100 |  |  |  | 35037 | Log::Log4perl->easy_init($Log::Log4perl::ERROR) | 
| 102 |  |  |  |  |  |  | unless $Log::Log4perl::Logger::INITIALIZED; | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | # ATTRIBUTES | 
| 107 |  |  |  |  |  |  | ######################################## | 
| 108 |  |  |  |  |  |  | # Class | 
| 109 |  |  |  |  |  |  | #------------------# | 
| 110 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 111 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | # Instance | 
| 114 |  |  |  |  |  |  | #------------------# | 
| 115 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 116 |  |  |  |  |  |  | has 'crawl_delay'               => ( | 
| 117 |  |  |  |  |  |  | default                     => $UNSET_CRAWL_DELAY, | 
| 118 |  |  |  |  |  |  | is                          => 'rw', | 
| 119 |  |  |  |  |  |  | isa                         => 'Int', | 
| 120 |  |  |  |  |  |  | writer                      => 'set_crawl_delay', | 
| 121 |  |  |  |  |  |  | ); | 
| 122 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 123 |  |  |  |  |  |  | has '_defer_visits'             => ( | 
| 124 |  |  |  |  |  |  | default                     => 0, | 
| 125 |  |  |  |  |  |  | is                          => 'rw', | 
| 126 |  |  |  |  |  |  | isa                         => 'Bool', | 
| 127 |  |  |  |  |  |  | traits                      => ['Bool'], | 
| 128 |  |  |  |  |  |  | ); | 
| 129 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 130 |  |  |  |  |  |  | has '_mode'                     => ( | 
| 131 |  |  |  |  |  |  | enum                        => $ROBOT_RULES_MODES, | 
| 132 |  |  |  |  |  |  | handles                     => 1, | 
| 133 |  |  |  |  |  |  | is                          => 'ro', | 
| 134 |  |  |  |  |  |  | required                    => 1, | 
| 135 |  |  |  |  |  |  | traits                      => ['Enumeration'], | 
| 136 |  |  |  |  |  |  | ); | 
| 137 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 138 |  |  |  |  |  |  | has '_rules'                    => ( | 
| 139 |  |  |  |  |  |  | default                     => sub {[]}, | 
| 140 |  |  |  |  |  |  | handles                     => { | 
| 141 |  |  |  |  |  |  | '_add_rule'             => 'push', | 
| 142 |  |  |  |  |  |  | 'clear_rules'           => 'clear', | 
| 143 |  |  |  |  |  |  | '_get_rules'            => 'elements', | 
| 144 |  |  |  |  |  |  | }, | 
| 145 |  |  |  |  |  |  | is                          => 'ro', | 
| 146 |  |  |  |  |  |  | isa                         => 'ArrayRef[CrawlerCommons::RobotRule]', | 
| 147 |  |  |  |  |  |  | traits                      => ['Array'], | 
| 148 |  |  |  |  |  |  | writer                      => '_set_rules', | 
| 149 |  |  |  |  |  |  | ); | 
| 150 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 151 |  |  |  |  |  |  | has '_sitemaps'                 => ( | 
| 152 |  |  |  |  |  |  | default                     => sub {[]}, | 
| 153 |  |  |  |  |  |  | handles                     => { | 
| 154 |  |  |  |  |  |  | _add_sitemap            => 'push', | 
| 155 |  |  |  |  |  |  | get_sitemap             => 'get', | 
| 156 |  |  |  |  |  |  | get_sitemaps            => 'elements', | 
| 157 |  |  |  |  |  |  | sitemaps_size           => 'count', | 
| 158 |  |  |  |  |  |  | }, | 
| 159 |  |  |  |  |  |  | is                          => 'ro', | 
| 160 |  |  |  |  |  |  | isa                         => 'ArrayRef[Str]', | 
| 161 |  |  |  |  |  |  | traits                      => ['Array'], | 
| 162 |  |  |  |  |  |  | ); | 
| 163 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | =head1 METHODS | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | =cut | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | # METHODS | 
| 170 |  |  |  |  |  |  | ######################################## | 
| 171 |  |  |  |  |  |  | # Constructor | 
| 172 |  |  |  |  |  |  | #------------------# | 
| 173 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 174 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | # Class Methods | 
| 177 |  |  |  |  |  |  | #------------------# | 
| 178 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 179 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | # Instance Methods | 
| 182 |  |  |  |  |  |  | #------------------# | 
| 183 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 184 |  |  |  |  |  |  | sub add_rule { | 
| 185 | 254 |  |  | 254 | 0 | 643 | my ($self, $prefix, $allow) = @_; | 
| 186 | 254 | 50 | 66 |  |  | 1036 | $allow = 1 if !$allow && length($prefix) == 0; | 
| 187 | 254 |  |  |  |  | 7915 | $self->_add_rule( | 
| 188 |  |  |  |  |  |  | CrawlerCommons::RobotRule->new(  _prefix => $prefix, _allow => $allow ) | 
| 189 |  |  |  |  |  |  | ); | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 192 |  |  |  |  |  |  | sub add_sitemap { | 
| 193 | 20 |  |  | 20 | 0 | 63 | my ($self, $sitemap) = @_; | 
| 194 | 20 |  |  |  |  | 812 | $self->_add_sitemap( $sitemap ); | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 197 |  |  |  |  |  |  | =head2 C<< my $true_or_false = $robot_rules->is_allowed( $url ) >> | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | Returns 1 if we're allowed to crawl the URL represented by C<$url> and 0 | 
| 200 |  |  |  |  |  |  | otherwise.  Will return 1 if the method C<is_allow_all()> returns true, | 
| 201 |  |  |  |  |  |  | otherwise, if C<is_allow_none> is false, returns 1 if there is an allow rule or | 
| 202 |  |  |  |  |  |  | no disallow rule for this URL. | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | =over | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | =item * C<$url> | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | The URL whose path is used to search for a matching rule within the object for | 
| 209 |  |  |  |  |  |  | evaluation. | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | =back | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | =cut | 
| 214 |  |  |  |  |  |  |  | 
| 215 |  |  |  |  |  |  | sub is_allowed { | 
| 216 | 286 |  |  | 286 | 0 | 1312 | my ($self, $url) = @_; | 
| 217 | 286 | 100 |  |  |  | 11983 | return 0 if $self->is_allow_none; | 
| 218 | 285 | 100 |  |  |  | 10438 | return 1 if $self->is_allow_all; | 
| 219 | 282 |  |  |  |  | 1044 | my $path_with_query = $self->_get_path( $url, 1); | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | # always allow robots.txt | 
| 222 | 282 | 100 |  |  |  | 5165 | return 1 if $path_with_query eq '/robots.txt'; | 
| 223 |  |  |  |  |  |  |  | 
| 224 | 277 |  |  |  |  | 11832 | for my $rule ($self->_get_rules) { | 
| 225 | 733 | 100 |  |  |  | 21833 | return $rule->_allow | 
| 226 |  |  |  |  |  |  | if $self->_rule_matches( $path_with_query, $rule->_prefix ); | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  |  | 
| 229 | 156 |  |  |  |  | 885 | return 1; | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 232 |  |  |  |  |  |  | sub sort_rules { | 
| 233 | 69 |  |  | 69 | 0 | 183 | my $self = shift; | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | $self->_set_rules( | 
| 236 | 748 | 50 |  |  |  | 20138 | [ sort {length( $b->_prefix ) <=> length( $a->_prefix ) || | 
| 237 | 69 |  |  |  |  | 187 | $b->_allow <=> $a->_allow} @{ $self->_rules } | 
|  | 69 |  |  |  |  | 2271 |  | 
| 238 |  |  |  |  |  |  | ] | 
| 239 |  |  |  |  |  |  | ); | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | } | 
| 242 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | # Private Methods | 
| 245 |  |  |  |  |  |  | #------------------# | 
| 246 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 247 |  |  |  |  |  |  | sub _get_path() { | 
| 248 | 282 |  |  | 282 |  | 776 | my ($self, $url, $with_query) = @_; | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | try { | 
| 251 | 282 |  |  | 282 |  | 21511 | my $uri = URI->new( $url ); | 
| 252 | 282 |  |  |  |  | 34994 | my $path = $uri->path(); | 
| 253 | 282 |  | 50 |  |  | 3664 | my $path_query = $uri->path_query() // ''; | 
| 254 |  |  |  |  |  |  |  | 
| 255 | 282 | 50 | 33 |  |  | 4135 | $path = $path_query if ($with_query && $path_query ne ''); | 
| 256 |  |  |  |  |  |  |  | 
| 257 | 282 | 50 | 33 |  |  | 1196 | if (not(defined($path)) || $path eq '') { | 
| 258 | 0 |  |  |  |  | 0 | return '/'; | 
| 259 |  |  |  |  |  |  | } | 
| 260 |  |  |  |  |  |  | else { | 
| 261 | 282 |  |  |  |  | 1051 | $path = uri_unescape( $path ); | 
| 262 | 282 |  |  |  |  | 2976 | utf8::encode( $path ); | 
| 263 | 282 |  |  |  |  | 1037 | return $path; | 
| 264 |  |  |  |  |  |  | } | 
| 265 |  |  |  |  |  |  | } | 
| 266 |  |  |  |  |  |  | catch { | 
| 267 | 0 |  |  | 0 |  | 0 | return '/'; | 
| 268 | 282 |  |  |  |  | 2271 | }; | 
| 269 |  |  |  |  |  |  | } | 
| 270 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 271 |  |  |  |  |  |  | sub _rule_matches { | 
| 272 | 733 |  |  | 733 |  | 1642 | my ($self, $text, $pattern) = @_; | 
| 273 | 733 |  |  |  |  | 1180 | my $pattern_pos = my $text_pos = 0; | 
| 274 | 733 |  |  |  |  | 1297 | my $pattern_end = length( $pattern ); | 
| 275 | 733 |  |  |  |  | 1055 | my $text_end = length( $text ); | 
| 276 |  |  |  |  |  |  |  | 
| 277 | 733 | 100 |  |  |  | 1592 | my $contains_end_char = $pattern =~ m!\$! ? 1 : 0; | 
| 278 | 733 | 100 |  |  |  | 1338 | $pattern_end -= 1 if $contains_end_char; | 
| 279 |  |  |  |  |  |  |  | 
| 280 | 733 |  | 100 |  |  | 2566 | while ( ( $pattern_pos < $pattern_end ) && ( $text_pos < $text_end ) ) { | 
| 281 | 781 |  |  |  |  | 1551 | my $wildcard_pos = index( $pattern, '*', $pattern_pos ); | 
| 282 | 781 | 100 |  |  |  | 1760 | $wildcard_pos = $pattern_end if $wildcard_pos == -1; | 
| 283 |  |  |  |  |  |  |  | 
| 284 | 781 |  |  |  |  | 2120 | $self->log->trace( <<"DUMP" ); | 
| 285 |  |  |  |  |  |  | # _rule_matches wildcard... | 
| 286 |  |  |  |  |  |  | ############################ | 
| 287 |  |  |  |  |  |  | pattern         $pattern | 
| 288 |  |  |  |  |  |  | pattern_end     $pattern_end | 
| 289 |  |  |  |  |  |  | wildcard_pos    $wildcard_pos | 
| 290 |  |  |  |  |  |  | DUMP | 
| 291 |  |  |  |  |  |  |  | 
| 292 | 781 | 100 |  |  |  | 18987 | if ( $wildcard_pos == $pattern_pos ) { | 
| 293 | 31 |  |  |  |  | 70 | $pattern_pos += 1; | 
| 294 | 31 | 100 |  |  |  | 316 | return 1 if $pattern_pos >= $pattern_end; | 
| 295 |  |  |  |  |  |  |  | 
| 296 | 24 |  |  |  |  | 62 | my $pattern_piece_end = index( $pattern, '*', $pattern_pos); | 
| 297 | 24 | 100 |  |  |  | 74 | $pattern_piece_end = $pattern_end if $pattern_piece_end == -1; | 
| 298 |  |  |  |  |  |  |  | 
| 299 | 24 |  |  |  |  | 50 | my $matched = 0; | 
| 300 | 24 |  |  |  |  | 48 | my $pattern_piece_len = $pattern_piece_end - $pattern_pos; | 
| 301 | 24 |  | 100 |  |  | 119 | while ( ( $text_pos + $pattern_piece_len <=  $text_end ) | 
| 302 |  |  |  |  |  |  | && !$matched ) { | 
| 303 |  |  |  |  |  |  |  | 
| 304 | 199 |  |  |  |  | 303 | $matched = 1; | 
| 305 |  |  |  |  |  |  |  | 
| 306 | 199 |  | 100 |  |  | 608 | for ( my $i = 0; $i < $pattern_piece_len && $matched; $i++ ) { | 
| 307 | 261 | 100 |  |  |  | 1026 | $matched = 0 | 
| 308 |  |  |  |  |  |  | if substr( $text, $text_pos + $i, 1 ) ne | 
| 309 |  |  |  |  |  |  | substr( $pattern, $pattern_pos + $i, 1 ); | 
| 310 |  |  |  |  |  |  | } | 
| 311 |  |  |  |  |  |  |  | 
| 312 | 199 | 100 |  |  |  | 677 | $text_pos += 1 unless $matched; | 
| 313 |  |  |  |  |  |  | } | 
| 314 |  |  |  |  |  |  |  | 
| 315 | 24 | 100 |  |  |  | 114 | return 0 unless $matched; | 
| 316 |  |  |  |  |  |  | } | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | else { | 
| 319 | 750 |  | 100 |  |  | 2459 | while ( ( $pattern_pos < $wildcard_pos ) && | 
| 320 |  |  |  |  |  |  | ( $text_pos < $text_end ) ) { | 
| 321 |  |  |  |  |  |  |  | 
| 322 | 1992 |  |  |  |  | 4191 | $self->log->trace( <<"DUMP" ); | 
| 323 |  |  |  |  |  |  | # _rule_matches dump | 
| 324 |  |  |  |  |  |  | ##################### | 
| 325 |  |  |  |  |  |  | text        $text | 
| 326 |  |  |  |  |  |  | text_pos    $text_pos | 
| 327 |  |  |  |  |  |  | pattern     $pattern | 
| 328 |  |  |  |  |  |  | pattern_pos $pattern_pos | 
| 329 |  |  |  |  |  |  | DUMP | 
| 330 | 1992 | 100 |  |  |  | 38284 | return 0 if substr( $text, $text_pos++, 1) ne | 
| 331 |  |  |  |  |  |  | substr( $pattern, $pattern_pos++, 1); | 
| 332 |  |  |  |  |  |  | } | 
| 333 |  |  |  |  |  |  | } | 
| 334 |  |  |  |  |  |  | } | 
| 335 |  |  |  |  |  |  |  | 
| 336 | 214 |  | 100 |  |  | 758 | while ( ( $pattern_pos < $pattern_end ) && | 
| 337 |  |  |  |  |  |  | ( substr( $pattern, $pattern_pos, 1 ) eq '*' ) ) { | 
| 338 | 2 |  |  |  |  | 9 | $pattern_pos++; | 
| 339 |  |  |  |  |  |  | } | 
| 340 |  |  |  |  |  |  |  | 
| 341 | 214 | 100 | 100 |  |  | 4876 | return ( $pattern_pos == $pattern_end ) && | 
| 342 |  |  |  |  |  |  | ( ( $text_pos == $text_end ) || !$contains_end_char ) ? 1 : 0; | 
| 343 |  |  |  |  |  |  | } | 
| 344 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 345 |  |  |  |  |  |  | ############################################################################### | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | __PACKAGE__->meta->make_immutable; | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | ############################################################################### | 
| 350 |  |  |  |  |  |  |  | 
| 351 |  |  |  |  |  |  | =pod | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  | =cut | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | ############################################################################### | 
| 357 |  |  |  |  |  |  | package CrawlerCommons::RobotRule; | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  | # MODULE IMPORTS | 
| 360 |  |  |  |  |  |  | ######################################## | 
| 361 |  |  |  |  |  |  | # Pragmas | 
| 362 |  |  |  |  |  |  | #------------------# | 
| 363 | 2 |  |  | 2 |  | 2849 | use 5.10.1; | 
|  | 2 |  |  |  |  | 8 |  | 
| 364 | 2 |  |  | 2 |  | 13 | use strict; | 
|  | 2 |  |  |  |  | 6 |  | 
|  | 2 |  |  |  |  | 48 |  | 
| 365 | 2 |  |  | 2 |  | 12 | use utf8; | 
|  | 2 |  |  |  |  | 7 |  | 
|  | 2 |  |  |  |  | 14 |  | 
| 366 | 2 |  |  | 2 |  | 50 | use warnings; | 
|  | 2 |  |  |  |  | 36 |  | 
|  | 2 |  |  |  |  | 119 |  | 
| 367 |  |  |  |  |  |  |  | 
| 368 |  |  |  |  |  |  | # CPAN/Core | 
| 369 |  |  |  |  |  |  | #------------------# | 
| 370 | 2 |  |  | 2 |  | 16 | use Const::Fast; | 
|  | 2 |  |  |  |  | 11 |  | 
|  | 2 |  |  |  |  | 19 |  | 
| 371 | 2 |  |  | 2 |  | 161 | use Try::Tiny; | 
|  | 2 |  |  |  |  | 24 |  | 
|  | 2 |  |  |  |  | 99 |  | 
| 372 |  |  |  |  |  |  |  | 
| 373 |  |  |  |  |  |  | # Moose Setup | 
| 374 |  |  |  |  |  |  | #------------------# | 
| 375 | 2 |  |  | 2 |  | 13 | use Moose; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 13 |  | 
| 376 | 2 |  |  | 2 |  | 18284 | use namespace::autoclean; | 
|  | 2 |  |  |  |  | 30 |  | 
|  | 2 |  |  |  |  | 16 |  | 
| 377 |  |  |  |  |  |  |  | 
| 378 |  |  |  |  |  |  | # Moose Pragmas | 
| 379 |  |  |  |  |  |  | #------------------# | 
| 380 |  |  |  |  |  |  |  | 
| 381 |  |  |  |  |  |  | # Custom Modules | 
| 382 |  |  |  |  |  |  | #------------------# | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  |  | 
| 386 |  |  |  |  |  |  | # VARIABLES/CONSTANTS | 
| 387 |  |  |  |  |  |  | ######################################## | 
| 388 |  |  |  |  |  |  | # Debug Constants | 
| 389 |  |  |  |  |  |  | #------------------# | 
| 390 |  |  |  |  |  |  |  | 
| 391 |  |  |  |  |  |  | # Constants | 
| 392 |  |  |  |  |  |  | #------------------# | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | # Variables | 
| 395 |  |  |  |  |  |  | #------------------# | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | # ATTRIBUTES | 
| 398 |  |  |  |  |  |  | ######################################## | 
| 399 |  |  |  |  |  |  | # Class | 
| 400 |  |  |  |  |  |  | #------------------# | 
| 401 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 402 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 403 |  |  |  |  |  |  |  | 
| 404 |  |  |  |  |  |  | # Instance | 
| 405 |  |  |  |  |  |  | #------------------# | 
| 406 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 407 |  |  |  |  |  |  | has '_allow'                    => ( | 
| 408 |  |  |  |  |  |  | is                          => 'ro', | 
| 409 |  |  |  |  |  |  | isa                         => 'Bool', | 
| 410 |  |  |  |  |  |  | required                    => 1, | 
| 411 |  |  |  |  |  |  | ); | 
| 412 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 413 |  |  |  |  |  |  | has '_prefix'                   => ( | 
| 414 |  |  |  |  |  |  | is                          => 'ro', | 
| 415 |  |  |  |  |  |  | isa                         => 'Str', | 
| 416 |  |  |  |  |  |  | ); | 
| 417 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 418 |  |  |  |  |  |  |  | 
| 419 |  |  |  |  |  |  | # METHODS | 
| 420 |  |  |  |  |  |  | ######################################## | 
| 421 |  |  |  |  |  |  | # Constructor | 
| 422 |  |  |  |  |  |  | #------------------# | 
| 423 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 424 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 425 |  |  |  |  |  |  |  | 
| 426 |  |  |  |  |  |  | # Class Methods | 
| 427 |  |  |  |  |  |  | #------------------# | 
| 428 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 429 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | # Instance Methods | 
| 432 |  |  |  |  |  |  | #------------------# | 
| 433 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 434 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  | # Private Methods | 
| 437 |  |  |  |  |  |  | #------------------# | 
| 438 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 439 |  |  |  |  |  |  | #-----------------------------------------------------------------------------# | 
| 440 |  |  |  |  |  |  | ############################################################################### | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  | __PACKAGE__->meta->make_immutable; | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | ############################################################################### | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | =head1 AUTHOR | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | Adam Robinson <akrobinson74@gmail.com> | 
| 449 |  |  |  |  |  |  |  | 
| 450 |  |  |  |  |  |  | =cut | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | 1; | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | __END__ |