File Coverage

blib/lib/WWW/RobotRules/Extended.pm
Criterion Covered Total %
statement 6 135 4.4
branch 0 72 0.0
condition 0 15 0.0
subroutine 2 15 13.3
pod 3 3 100.0
total 11 240 4.5


line stmt bran cond sub pod time code
1             package WWW::RobotRules::Extended;
2              
3 1     1   37959 use URI;
  1         14033  
  1         1033  
4              
5             # The following methods must be provided by the subclass.
6             sub agent;
7             sub is_me;
8             sub visit;
9             sub no_visits;
10             sub last_visits;
11             sub fresh_until;
12             sub push_rules;
13             sub clear_rules;
14             sub rules;
15             sub dump;
16              
17              
18              
19             =head1 NAME
20              
21             WWW::RobotRules::Extended - database of robots.txt-derived permissions.
22             This is a fork of WWW::RobotRules
23              
24             You should use WWW::RobotRules::Extended if you want to act as
25             Googlebot does: Google accepts some improvements, such as "Allow"
26             directives and wildcards ("*") in rules.
27              
28              
29             =head1 VERSION
30              
31             Version 0.02
32              
33             =cut
34              
35             our $VERSION = '0.02';
36              
37              
38             =head1 SYNOPSIS
39              
40             Quick summary of what the module does.
41              
42              
43             use WWW::RobotRules::Extended;
44             use LWP::Simple qw(get);
45            
46             my $rules = WWW::RobotRules::Extended->new('MOMspider/1.0');
47              
48             {
49             my $url = "http://some.place/robots.txt";
50             my $robots_txt = get $url;
51             $rules->parse($url, $robots_txt) if defined $robots_txt;
52             }
53              
54             {
55             my $url = "http://some.other.place/robots.txt";
56             my $robots_txt = get $url;
57             $rules->parse($url, $robots_txt) if defined $robots_txt;
58             }
59              
60             # Now we can check if a URL is valid for those servers
61             # whose "robots.txt" files we've gotten and parsed:
62             if($rules->allowed($url)) {
63             $c = get $url;
64             ...
65             }
66              
67              
68             =head1 DESCRIPTION
69              
70             This module parses F<robots.txt> files as specified in
71             "A Standard for Robot Exclusion", at
72             L<http://www.robotstxt.org/wc/norobots.html>.
73              
74             It also parses rules that contain wildcards ('*') and "Allow"
75             directives, as Google does.
76              
77             Webmasters can use the F<robots.txt> file to forbid conforming
78             robots from accessing parts of their web site.
79              
80             The parsed files are kept in a WWW::RobotRules::Extended object, and this object
81             provides methods to check if access to a given URL is prohibited. The
82             same WWW::RobotRules::Extended object can be used for one or more parsed
83             F<robots.txt> files on any number of hosts.
84              
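                For instance, here is a minimal sketch (the host name and rules are
                invented for illustration) of parsing an extended rule set with a
                wildcard and checking a couple of URLs against it:

                    use WWW::RobotRules::Extended;

                    my $rules = WWW::RobotRules::Extended->new('MyBot/1.0');

                    # an extended rule set, in the style Google accepts
                    my $robots_txt = join "\n",
                        'User-agent: *',
                        'Disallow: /*&p=',
                        '';
                    $rules->parse("http://www.example.com/robots.txt", $robots_txt);

                    print $rules->allowed("http://www.example.com/list?page=2&p=3")
                        ? "allowed\n" : "blocked\n";   # blocked by the wildcard rule
                    print $rules->allowed("http://www.example.com/list?page=2")
                        ? "allowed\n" : "blocked\n";   # allowed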
85              
86             =head1 EXPORT
87              
88             Nothing is exported. WWW::RobotRules::Extended provides a purely
89             object-oriented interface.
90              
91             =head2 new
92              
93             This is the constructor for WWW::RobotRules::Extended objects; the first argument given to new() is the name of the robot.
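
                For example (the robot name here is arbitrary):

                    my $rules = WWW::RobotRules::Extended->new('MyRobot/1.0');
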
94             =cut
95              
96             sub new {
97 0     0 1   my($class, $ua) = @_;
98              
99             # This ugly hack is needed to ensure backwards compatibility.
100             # The "WWW::RobotRules::Extended" class is now really abstract.
101 0 0         $class = "WWW::RobotRules::Extended::InCore" if $class eq "WWW::RobotRules::Extended";
102              
103 0           my $self = bless { }, $class;
104 0           $self->agent($ua);
105 0           $self;
106             }
107              
108              
109             =head2 parse
110              
111                 $rules->parse($robot_txt_url, $content, $fresh_until)
112              
113             The parse() method takes as arguments the URL that was used to
114             retrieve the F<robots.txt> file, and the contents of the file. The optional
115             third argument is the time (in epoch seconds) until which the parsed rules stay fresh; it defaults to one year from now.
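
                For example, to have the rules considered stale after one day instead
                of the default one year (the URL is illustrative, and LWP::Simple is
                just one way to fetch the file):

                    use LWP::Simple qw(get);

                    my $url = "http://www.example.com/robots.txt";
                    my $robots_txt = get($url);
                    $rules->parse($url, $robots_txt, time + 24*3600) if defined $robots_txt;
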
116             =cut
117              
118             sub parse {
119 0     0 1   my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
120 0           $robot_txt_uri = URI->new("$robot_txt_uri");
121 0           my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
122              
123 0           $self->clear_rules($netloc);
124 0   0       $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
125              
126 0           my $ua;
127 0           my $is_me = 0; # 1 iff this record is for me
128 0           my $is_anon = 0; # 1 iff this record is for *
129 0           my @me_alloweddisallowed = (); # rules allowed or disallowed for me
130 0           my @anon_alloweddisallowed = (); # rules allowed or disallowed for *
131              
132             # blank lines are significant, so turn CRLF into LF to avoid generating
133             # false ones
134 0           $txt =~ s/\015\012/\012/g;
135              
136             # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
137 0           for(split(/[\012\015]/, $txt)) {
138              
139             # Lines containing only a comment are discarded completely, and
140             # therefore do not indicate a record boundary.
141 0 0         next if /^\s*\#/;
142              
143 0           s/\s*\#.*//; # remove comments at end-of-line
144              
145 0 0         if (/^\s*User-Agent\s*:\s*(.*)/i) {
    0          
    0          
146 0           $ua = $1;
147 0           $ua =~ s/\s+$//;
148              
149 0 0         if ($ua eq '*') { # if it's directive for all bots
    0          
150 0           $is_anon = 1;
151             }
152             elsif($self->is_me($ua)) { #if it's directives for this bot
153 0           $is_me = 1;
154             }
155             else {
156 0           $is_me = 0;
157 0           $is_anon = 0;
158             }
159             }
160             elsif (/^\s*(Disallow|Allow)\s*:\s*(.*)/i) {
161 0 0         unless (defined $ua) {
162 0 0         warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
163 0           $is_anon = 1; # assume that User-agent: * was intended
164             }
165 0           my $verb = $1;
166 0           my $allowdisallow = $2;
167 0           $allowdisallow =~ s/\s+$//;
168 0 0         if (length $allowdisallow) {
169 0           my $ignore;
170 0           eval {
171 0           my $u = URI->new_abs($allowdisallow, $robot_txt_uri);
172 0 0         $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
173 0 0         $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
174 0 0         $ignore++ if $u->port ne $robot_txt_uri->port;
175 0           $allowdisallow = $u->path_query;
176 0 0         $allowdisallow = "/" unless length $allowdisallow;
177             };
178 0 0         next if $@;
179 0 0         next if $ignore;
180             }
181              
182             # transform rules into a regexp
183             # for instance: /shared/* => ^\/shared\/.*
184 0           my $rule = "^".$allowdisallow;
185 0           $rule=~ s/\//\\\//g;    # escape slashes
186 0           $rule=~ s/\./\\./g;     # escape literal dots before expanding '*'
187 0           $rule=~ s/\[/\\[/g;
188 0           $rule=~ s/\]/\\]/g;
189 0           $rule=~ s/\?/\\?/g;
190 0           $rule=~ s/\*/.*/g;      # expand '*' last so the '.' it adds is not escaped
191              
192 0 0         if (length $allowdisallow) {
193 0 0         if ($is_me) {
    0          
194 0           push(@me_alloweddisallowed, $verb." ".$rule);
195             }
196             elsif ($is_anon) {
197 0           push(@anon_alloweddisallowed, $verb." ".$rule);
198             }
199             }
200             }
201             elsif (/\S\s*:/) {
202             # ignore
203             }
204             else {
205 0 0         warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
206             }
207             }
208              
209 0 0         if ($is_me) {
210 0           $self->push_rules($netloc, @me_alloweddisallowed);
211             }
212             else {
213 0           $self->push_rules($netloc, @anon_alloweddisallowed);
214             }
215             }
216              
217             =head2 is_me
218              
219             Returns TRUE if the given User-Agent value from a robots.txt file refers to this robot.
220              
221              
222             =head2 allowed
223              
224             Returns TRUE if this robot is allowed to retrieve this URL.
225              
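                allowed() only applies to "http" and "https" URLs; URLs with any other
                scheme are always reported as allowed. It returns -1 (which is still
                true in boolean context) when no fresh robots.txt rules are known for
                the URL's host. For example (illustrative URL):

                    if ($rules->allowed("http://www.example.com/shared/doc.html")) {
                        # this robot may fetch the document
                    }
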
226             =cut
227              
228             sub allowed {
229 0     0 1   my($self, $uri) = @_;
230 0           $uri = URI->new("$uri");
231              
232 0 0 0       return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
233             # Robots.txt applies to only those schemes.
234              
235 0           my $netloc = $uri->host . ":" . $uri->port;
236              
237 0           my $fresh_until = $self->fresh_until($netloc);
238 0 0 0       return -1 if !defined($fresh_until) || $fresh_until < time;
239              
240 0           my $str = $uri->path_query;
241 0           my $rule;
242             my $verb;
243 0           my $rline;
244              
245 0           my $result=1; # by default, all is allowed
246 0           for $rline ($self->rules($netloc)) {
247 0 0         if ($rline =~ /^(Disallow|Allow)\s*(.*)/i) {
248 0           $verb=lc($1);
249 0           $rule=$2;
250 0 0         if ($str =~ /$rule/) {
251 0 0         if ($verb eq "allow") { # here, the rule allows, so i return now
252 0           return 1;
253             };
254 0 0         if ($verb eq "disallow") { # here, the rule is disallowed, but we need to verify further
255             # if another "allow" rule is present for this url
256 0           $result=0;
257             }
258             }
259 0           $rule="";
260             }
261             }
262 0           return $result; # the rules have all been verified, if there is a matching disallow rule, $result should be 0
263             }
264              
265              
266             =head1 SUBROUTINES/METHODS
267              
268              
269              
270             =head1 AUTHOR
271              
272             Yannick Simon
273              
274             =head1 BUGS
275              
276             Please report any bugs or feature requests to C<bug-www-robotrules-extended at rt.cpan.org>, or through
277             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WWW-RobotRules-Extended>. I will be notified, and then you'll
278             automatically be notified of progress on your bug as I make changes.
279              
280              
281              
282              
283             =head1 SUPPORT
284              
285             You can find documentation for this module with the perldoc command.
286              
287             perldoc WWW::RobotRules::Extended
288              
289              
290             You can also look for information at:
291              
292             =over 4
293              
294             =item * RT: CPAN's request tracker (report bugs here)
295              
296             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=WWW-RobotRules-Extended>
297              
298             =item * AnnoCPAN: Annotated CPAN documentation
299              
300             L<http://annocpan.org/dist/WWW-RobotRules-Extended>
301              
302             =item * CPAN Ratings
303              
304             L<http://cpanratings.perl.org/d/WWW-RobotRules-Extended>
305              
306             =item * Search CPAN
307              
308             L<http://search.cpan.org/dist/WWW-RobotRules-Extended/>
309              
310             =back
311              
312             =head1 Extended ROBOTS.TXT EXAMPLES
313              
314             The following example "/robots.txt" file specifies that no robots
315             should visit any URL starting with "/cyberworld/map/" or "/tmp/":
316              
317             User-agent: *
318             Disallow: /cyberworld/map/ # This is an infinite virtual URL space
319             Disallow: /tmp/ # these will soon disappear
320              
321             This example "/robots.txt" file specifies that no robots should visit
322             any URL starting with "/cyberworld/map/", except the robot called
323             "cybermapper":
324              
325             User-agent: *
326             Disallow: /cyberworld/map/ # This is an infinite virtual URL space
327              
328             # Cybermapper knows where to go.
329             User-agent: cybermapper
330             Disallow:
331              
332             This example indicates that no robots should visit this site further:
333              
334             # go away
335             User-agent: *
336             Disallow: /
337              
338             This is an example of a malformed robots.txt file.
339              
340             # robots.txt for ancientcastle.example.com
341             # I've locked myself away.
342             User-agent: *
343             Disallow: /
344             # The castle is your home now, so you can go anywhere you like.
345             User-agent: Belle
346             Disallow: /west-wing/ # except the west wing!
347             # It's good to be the Prince...
348             User-agent: Beast
349             Disallow:
350              
351             This file is missing the required blank lines between records.
352             However, the intention is clear.
353              
354              
355             This is an example of an extended robots.txt file;
356             you can see a real example of this kind of rule at http://www.google.com/robots.txt:
357              
358             # Block every url that contains &p=
359             User-agent: *
360             Disallow: /*&p=
361            
362             This is another example of an extended robots.txt file, this time using an "Allow" directive.
363              
364             # Block every url but the ones that begin with /shared
365             User-agent: *
366             Disallow: /
367             Allow: /shared/
368              
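                Given the last rule set above ("Disallow: /" plus "Allow: /shared/"),
                a quick sketch of what this module returns (the host name is invented
                for illustration):

                    # assuming the rules above were parsed for http://www.example.com/robots.txt
                    $rules->allowed("http://www.example.com/shared/img/logo.png");  # TRUE  (Allow wins)
                    $rules->allowed("http://www.example.com/cgi-bin/search");       # FALSE (Disallow: /)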
369              
370              
371             =head1 SEE ALSO
372              
373             L<WWW::RobotRules>, L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>
374              
375              
376             =head1 ACKNOWLEDGEMENTS
377              
378              
379             =head1 LICENSE AND COPYRIGHT
380              
381             Copyright 2011, Yannick Simon
382             Copyright 1995-2009, Gisle Aas
383             Copyright 1995, Martijn Koster
384              
385             This program is free software; you can redistribute it and/or modify it
386             under the terms of either: the GNU General Public License as published
387             by the Free Software Foundation; or the Artistic License.
388              
389             See http://dev.perl.org/licenses/ for more information.
390              
391              
392             =cut
393              
394             1; # End of WWW::RobotRules::Extended
395              
396              
397              
398              
399             package WWW::RobotRules::Extended::InCore;
400              
401 1     1   8 use vars qw(@ISA);
  1         2  
  1         620  
402             @ISA = qw(WWW::RobotRules::Extended);
403              
404              
405             sub is_me {
406 0     0     my($self, $ua_line) = @_;
407 0           my $me = $self->agent;
408              
409             # See whether my short-name is a substring of the
410             # "User-Agent: ..." line that we were passed:
411              
412 0 0         if(index(lc($ua_line), lc($me)) >= 0) {
413 0           return 1;
414             }
415             else {
416 0           return '';
417             }
418             }
419              
420              
421             sub agent {
422 0     0     my ($self, $name) = @_;
423 0           my $old = $self->{'ua'};
424 0 0         if ($name) {
425             # Strip it so that it's just the short name.
426             # I.e., "FooBot" => "FooBot"
427             # "FooBot/1.2" => "FooBot"
428             # "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
429              
430 0 0         $name = $1 if $name =~ m/(\S+)/; # get first word
431 0           $name =~ s!/.*!!; # get rid of version
432 0 0 0       unless ($old && $old eq $name) {
433 0           delete $self->{'loc'}; # all old info is now stale
434 0           $self->{'ua'} = $name;
435             }
436             }
437 0           $old;
438             }
439              
440              
441             sub visit {
442 0     0     my($self, $netloc, $time) = @_;
443 0 0         return unless $netloc;
444 0   0       $time ||= time;
445 0           $self->{'loc'}{$netloc}{'last'} = $time;
446 0           my $count = \$self->{'loc'}{$netloc}{'count'};
447 0 0         if (!defined $$count) {
448 0           $$count = 1;
449             }
450             else {
451 0           $$count++;
452             }
453             }
454              
455              
456             sub no_visits {
457 0     0     my ($self, $netloc) = @_;
458 0           $self->{'loc'}{$netloc}{'count'};
459             }
460              
461              
462             sub last_visit {
463 0     0     my ($self, $netloc) = @_;
464 0           $self->{'loc'}{$netloc}{'last'};
465             }
466              
467              
468             sub fresh_until {
469 0     0     my ($self, $netloc, $fresh_until) = @_;
470 0           my $old = $self->{'loc'}{$netloc}{'fresh'};
471 0 0         if (defined $fresh_until) {
472 0           $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
473             }
474 0           $old;
475             }
476              
477              
478             sub push_rules {
479 0     0     my($self, $netloc, @rules) = @_;
480 0           push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
  0            
481             }
482              
483              
484             sub clear_rules {
485 0     0     my($self, $netloc) = @_;
486 0           delete $self->{'loc'}{$netloc}{'rules'};
487             }
488              
489              
490             sub rules {
491 0     0     my($self, $netloc) = @_;
492 0 0         if (defined $self->{'loc'}{$netloc}{'rules'}) {
493 0           return @{$self->{'loc'}{$netloc}{'rules'}};
  0            
494             }
495             else {
496 0           return ();
497             }
498             }
499              
500              
501             sub dump
502             {
503 0     0     my $self = shift;
504 0           for (keys %$self) {
505 0 0         next if $_ eq 'loc';
506 0           print "$_ = $self->{$_}\n";
507             }
508 0           for (keys %{$self->{'loc'}}) {
  0            
509 0           my @rules = $self->rules($_);
510 0           print "$_: ", join("; ", @rules), "\n";
511             }
512             }
513