File Coverage

blib/lib/WWW/RobotRules/Extended.pm
Criterion Covered Total %
statement 6 135 4.4
branch 0 72 0.0
condition 0 15 0.0
subroutine 2 15 13.3
pod 3 3 100.0
total 11 240 4.5


line stmt bran cond sub pod time code
1             package WWW::RobotRules::Extended;
2              
3 1     1   37959 use URI;
  1         14033  
  1         1033  
4              
5             # The following methods must be provided by the subclass.
6             sub agent;
7             sub is_me;
8             sub visit;
9             sub no_visits;
10             sub last_visits;
11             sub fresh_until;
12             sub push_rules;
13             sub clear_rules;
14             sub rules;
15             sub dump;
16              
17              
18              
19             =head1 NAME
20              
21             WWW::RobotRules::Extended - database of robots.txt-derived permissions.
22             This is a fork of WWW::RobotRules
23              
24             You should use WWW::RobotRules::Extended if you want to act as
25             Googlebot does: Google accepts some improvements, such as "Allow"
26             directives and wildcards ("*") in rules.
27              
28              
29             =head1 VERSION
30              
31             Version 0.02
32              
33             =cut
34              
35             our $VERSION = '0.02';
36              
37              
38             =head1 SYNOPSIS
39              
40             Quick summary of what the module does.
41              
42              
43             use WWW::RobotRules::Extended;
44             use LWP::Simple qw(get);
45            
46             my $rules = WWW::RobotRules::Extended->new('MOMspider/1.0');
47              
48             {
49             my $url = "http://some.place/robots.txt";
50             my $robots_txt = get $url;
51             $rules->parse($url, $robots_txt) if defined $robots_txt;
52             }
53              
54             {
55             my $url = "http://some.other.place/robots.txt";
56             my $robots_txt = get $url;
57             $rules->parse($url, $robots_txt) if defined $robots_txt;
58             }
59              
60             # Now we can check if a URL is valid for those servers
61             # whose "robots.txt" files we've gotten and parsed:
62             if($rules->allowed($url)) {
63             $c = get $url;
64             ...
65             }
66              
67              
68             =head1 DESCRIPTION
69              
70             This module parses F<robots.txt> files as specified in
71             "A Standard for Robot Exclusion", at
72             L<http://www.robotstxt.org/wc/norobots.html>.
73              
74             It also parses rules that contain wildcards ('*') and "Allow"
75             directives, as Google does.
76              
77             Webmasters can use the F<robots.txt> file to forbid conforming
78             robots from accessing parts of their web site.
79              
80             The parsed files are kept in a WWW::RobotRules::Extended object, and this object
81             provides methods to check if access to a given URL is prohibited. The
82             same WWW::RobotRules::Extended object can be used for one or more parsed
83             F<robots.txt> files on any number of hosts.
84              
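                For instance, here is a minimal sketch (the host name and rules are
                invented for illustration) of parsing an extended rule set with a
                wildcard and checking a couple of URLs against it:

                    use WWW::RobotRules::Extended;

                    my $rules = WWW::RobotRules::Extended->new('MyBot/1.0');

                    # an extended rule set, in the style Google accepts
                    my $robots_txt = join "\n",
                        'User-agent: *',
                        'Disallow: /*&p=',
                        '';
                    $rules->parse("http://www.example.com/robots.txt", $robots_txt);

                    print $rules->allowed("http://www.example.com/list?page=2&p=3")
                        ? "allowed\n" : "blocked\n";   # blocked by the wildcard rule
                    print $rules->allowed("http://www.example.com/list?page=2")
                        ? "allowed\n" : "blocked\n";   # allowed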
85              
86             =head1 EXPORT
87              
88             Nothing is exported. WWW::RobotRules::Extended provides a purely
89             object-oriented interface.
90              
91             =head2 new
92              
93             This is the constructor for WWW::RobotRules::Extended objects; the first argument given to new() is the name of the robot.
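
                For example (the robot name here is arbitrary):

                    my $rules = WWW::RobotRules::Extended->new('MyRobot/1.0');
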
94             =cut
95              
96             sub new {
97 0     0 1   my($class, $ua) = @_;
98              
99             # This ugly hack is needed to ensure backwards compatibility.
100             # The "WWW::RobotRules::Extended" class is now really abstract.
101 0 0         $class = "WWW::RobotRules::Extended::InCore" if $class eq "WWW::RobotRules::Extended";
102              
103 0           my $self = bless { }, $class;
104 0           $self->agent($ua);
105 0           $self;
106             }
107              
108              
109             =head2 parse
110              
111                 $rules->parse($robot_txt_url, $content, $fresh_until)
112              
113             The parse() method takes as arguments the URL that was used to
114             retrieve the F<robots.txt> file, and the contents of the file. The optional
115             third argument is the time (in epoch seconds) until which the parsed rules stay fresh; it defaults to one year from now.
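
                For example, to have the rules considered stale after one day instead
                of the default one year (the URL is illustrative, and LWP::Simple is
                just one way to fetch the file):

                    use LWP::Simple qw(get);

                    my $url = "http://www.example.com/robots.txt";
                    my $robots_txt = get($url);
                    $rules->parse($url, $robots_txt, time + 24*3600) if defined $robots_txt;
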
116             =cut
117              
118             sub parse {
119 0     0 1   my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
120 0           $robot_txt_uri = URI->new("$robot_txt_uri");
121 0           my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;
122              
123 0           $self->clear_rules($netloc);
124 0   0       $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));
125              
126 0           my $ua;
127 0           my $is_me = 0; # 1 iff this record is for me
128 0           my $is_anon = 0; # 1 iff this record is for *
129 0           my @me_alloweddisallowed = (); # rules allowed or disallowed for me
130 0           my @anon_alloweddisallowed = (); # rules allowed or disallowed for *
131              
132             # blank lines are significant, so turn CRLF into LF to avoid generating
133             # false ones
134 0           $txt =~ s/\015\012/\012/g;
135              
136             # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
137 0           for(split(/[\012\015]/, $txt)) {
138              
139             # Lines containing only a comment are discarded completely, and
140             # therefore do not indicate a record boundary.
141 0 0         next if /^\s*\#/;
142              
143 0           s/\s*\#.*//; # remove comments at end-of-line
144              
145 0 0         if (/^\s*User-Agent\s*:\s*(.*)/i) {
    0          
    0          
146 0           $ua = $1;
147 0           $ua =~ s/\s+$//;
148              
149 0 0         if ($ua eq '*') { # if it's directive for all bots
    0          
150 0           $is_anon = 1;
151             }
152             elsif($self->is_me($ua)) { #if it's directives for this bot
153 0           $is_me = 1;
154             }
155             else {
156 0           $is_me = 0;
157 0           $is_anon = 0;
158             }
159             }
160             elsif (/^\s*(Disallow|Allow)\s*:\s*(.*)/i) {
161 0 0         unless (defined $ua) {
162 0 0         warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
163 0           $is_anon = 1; # assume that User-agent: * was intended
164             }
165 0           my $verb = $1;
166 0           my $allowdisallow = $2;
167 0           $allowdisallow =~ s/\s+$//;
168 0 0         if (length $allowdisallow) {
169 0           my $ignore;
170 0           eval {
171 0           my $u = URI->new_abs($allowdisallow, $robot_txt_uri);
172 0 0         $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
173 0 0         $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
174 0 0         $ignore++ if $u->port ne $robot_txt_uri->port;
175 0           $allowdisallow = $u->path_query;
176 0 0         $allowdisallow = "/" unless length $allowdisallow;
177             };
178 0 0         next if $@;
179 0 0         next if $ignore;
180             }
181              
182             # transform rules into a regexp
183             # for instance: /shared/* => ^\/shared\/.*
184 0           my $rule = "^".$allowdisallow;
185 0           $rule=~ s/\//\\\//g;    # escape slashes
186 0           $rule=~ s/\./\\./g;     # escape literal dots before expanding '*'
187 0           $rule=~ s/\[/\\[/g;
188 0           $rule=~ s/\]/\\]/g;
189 0           $rule=~ s/\?/\\?/g;
190 0           $rule=~ s/\*/.*/g;      # expand '*' last so the '.' it adds is not escaped
191              
192 0 0         if (length $allowdisallow) {
193 0 0         if ($is_me) {
    0          
194 0           push(@me_alloweddisallowed, $verb." ".$rule);
195             }
196             elsif ($is_anon) {
197 0           push(@anon_alloweddisallowed, $verb." ".$rule);
198             }
199             }
200             }
201             elsif (/\S\s*:/) {
202             # ignore
203             }
204             else {
205 0 0         warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
206             }
207             }
208              
209 0 0         if ($is_me) {
210 0           $self->push_rules($netloc, @me_alloweddisallowed);
211             }
212             else {
213 0           $self->push_rules($netloc, @anon_alloweddisallowed);
214             }
215             }
216              
217             =head2 is_me
218              
219             Returns TRUE if the given User-Agent value from a robots.txt file refers to this robot.
220              
221              
222             =head2 allowed
223              
224             Returns TRUE if this robot is allowed to retrieve this URL.
225              
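                allowed() only applies to "http" and "https" URLs; URLs with any other
                scheme are always reported as allowed. It returns -1 (which is still
                true in boolean context) when no fresh robots.txt rules are known for
                the URL's host. For example (illustrative URL):

                    if ($rules->allowed("http://www.example.com/shared/doc.html")) {
                        # this robot may fetch the document
                    }
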
226             =cut
227              
228             sub allowed {
229 0     0 1   my($self, $uri) = @_;
230 0           $uri = URI->new("$uri");
231              
232 0 0 0       return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
233             # Robots.txt applies to only those schemes.
234              
235 0           my $netloc = $uri->host . ":" . $uri->port;
236              
237 0           my $fresh_until = $self->fresh_until($netloc);
238 0 0 0       return -1 if !defined($fresh_until) || $fresh_until < time;
239              
240 0           my $str = $uri->path_query;
241 0           my $rule;
242             my $verb;
243 0           my $rline;
244              
245 0           my $result=1; # by default, all is allowed
246 0           for $rline ($self->rules($netloc)) {
247 0 0         if ($rline =~ /^(Disallow|Allow)\s*(.*)/i) {
248 0           $verb=lc($1);
249 0           $rule=$2;
250 0 0         if ($str =~ /$rule/) {
251 0 0         if ($verb eq "allow") { # here, the rule allows, so i return now
252 0           return 1;
253             };
254 0 0         if ($verb eq "disallow") { # here, the rule is disallowed, but we need to verify further
255             # if another "allow" rule is present for this url
256 0           $result=0;
257             }
258             }
259 0           $rule="";
260             }
261             }
262 0           return $result; # the rules have all been verified, if there is a matching disallow rule, $result should be 0
263             }
264              
265              
266             =head1 SUBROUTINES/METHODS
267              
268              
269              
270             =head1 AUTHOR
271              
272             Yannick Simon
273              
274             =head1 BUGS
275              
276             Please report any bugs or feature requests to C<bug-www-robotrules-extended at rt.cpan.org>, or through
277             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WWW-RobotRules-Extended>. I will be notified, and then you'll
278             automatically be notified of progress on your bug as I make changes.
279              
280              
281              
282              
283             =head1 SUPPORT
284              
285             You can find documentation for this module with the perldoc command.
286              
287             perldoc WWW::RobotRules::Extended
288              
289              
290             You can also look for information at:
291              
292             =over 4
293              
294             =item * RT: CPAN's request tracker (report bugs here)
295              
296             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=WWW-RobotRules-Extended>
297              
298             =item * AnnoCPAN: Annotated CPAN documentation
299              
300             L<http://annocpan.org/dist/WWW-RobotRules-Extended>
301              
302             =item * CPAN Ratings
303              
304             L<http://cpanratings.perl.org/d/WWW-RobotRules-Extended>
305              
306             =item * Search CPAN
307              
308             L<http://search.cpan.org/dist/WWW-RobotRules-Extended/>
309              
310             =back
311              
312             =head1 Extended ROBOTS.TXT EXAMPLES
313              
314             The following example "/robots.txt" file specifies that no robots
315             should visit any URL starting with "/cyberworld/map/" or "/tmp/":
316              
317             User-agent: *
318             Disallow: /cyberworld/map/ # This is an infinite virtual URL space
319             Disallow: /tmp/ # these will soon disappear
320              
321             This example "/robots.txt" file specifies that no robots should visit
322             any URL starting with "/cyberworld/map/", except the robot called
323             "cybermapper":
324              
325             User-agent: *
326             Disallow: /cyberworld/map/ # This is an infinite virtual URL space
327              
328             # Cybermapper knows where to go.
329             User-agent: cybermapper
330             Disallow:
331              
332             This example indicates that no robots should visit this site further:
333              
334             # go away
335             User-agent: *
336             Disallow: /
337              
338             This is an example of a malformed robots.txt file.
339              
340             # robots.txt for ancientcastle.example.com
341             # I've locked myself away.
342             User-agent: *
343             Disallow: /
344             # The castle is your home now, so you can go anywhere you like.
345             User-agent: Belle
346             Disallow: /west-wing/ # except the west wing!
347             # It's good to be the Prince...
348             User-agent: Beast
349             Disallow:
350              
351             This file is missing the required blank lines between records.
352             However, the intention is clear.
353              
354              
355             This is an example of an extended robots.txt file;
356             you can see a real example of this kind of rule at http://www.google.com/robots.txt:
357              
358             # Block every url that contains &p=
359             User-agent: *
360             Disallow: /*&p=
361            
362             This is another example of an extended robots.txt file, this time using an "Allow" directive.
363              
364             # Block every url but the ones that begin with /shared
365             User-agent: *
366             Disallow: /
367             Allow: /shared/
368              
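                Given the last rule set above ("Disallow: /" plus "Allow: /shared/"),
                a quick sketch of what this module returns (the host name is invented
                for illustration):

                    # assuming the rules above were parsed for http://www.example.com/robots.txt
                    $rules->allowed("http://www.example.com/shared/img/logo.png");  # TRUE  (Allow wins)
                    $rules->allowed("http://www.example.com/cgi-bin/search");       # FALSE (Disallow: /)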
369              
370              
371             =head1 SEE ALSO
372              
373             L<WWW::RobotRules>, L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>
374              
375              
376             =head1 ACKNOWLEDGEMENTS
377              
378              
379             =head1 LICENSE AND COPYRIGHT
380              
381             Copyright 2011, Yannick Simon
382             Copyright 1995-2009, Gisle Aas
383             Copyright 1995, Martijn Koster
384              
385             This program is free software; you can redistribute it and/or modify it
386             under the terms of either: the GNU General Public License as published
387             by the Free Software Foundation; or the Artistic License.
388              
389             See http://dev.perl.org/licenses/ for more information.
390              
391              
392             =cut
393              
394             1; # End of WWW::RobotRules::Extended
395              
396              
397              
398              
399             package WWW::RobotRules::Extended::InCore;
400              
401 1     1   8 use vars qw(@ISA);
  1         2  
  1         620  
402             @ISA = qw(WWW::RobotRules::Extended);
403              
404              
405             sub is_me {
406 0     0     my($self, $ua_line) = @_;
407 0           my $me = $self->agent;
408              
409             # See whether my short-name is a substring of the
410             # "User-Agent: ..." line that we were passed:
411              
412 0 0         if(index(lc($ua_line), lc($me)) >= 0) {
413 0           return 1;
414             }
415             else {
416 0           return '';
417             }
418             }
419              
420              
421             sub agent {
422 0     0     my ($self, $name) = @_;
423 0           my $old = $self->{'ua'};
424 0 0         if ($name) {
425             # Strip it so that it's just the short name.
426             # I.e., "FooBot" => "FooBot"
427             # "FooBot/1.2" => "FooBot"
428             # "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"
429              
430 0 0         $name = $1 if $name =~ m/(\S+)/; # get first word
431 0           $name =~ s!/.*!!; # get rid of version
432 0 0 0       unless ($old && $old eq $name) {
433 0           delete $self->{'loc'}; # all old info is now stale
434 0           $self->{'ua'} = $name;
435             }
436             }
437 0           $old;
438             }
439              
440              
441             sub visit {
442 0     0     my($self, $netloc, $time) = @_;
443 0 0         return unless $netloc;
444 0   0       $time ||= time;
445 0           $self->{'loc'}{$netloc}{'last'} = $time;
446 0           my $count = \$self->{'loc'}{$netloc}{'count'};
447 0 0         if (!defined $$count) {
448 0           $$count = 1;
449             }
450             else {
451 0           $$count++;
452             }
453             }
454              
455              
456             sub no_visits {
457 0     0     my ($self, $netloc) = @_;
458 0           $self->{'loc'}{$netloc}{'count'};
459             }
460              
461              
462             sub last_visit {
463 0     0     my ($self, $netloc) = @_;
464 0           $self->{'loc'}{$netloc}{'last'};
465             }
466              
467              
468             sub fresh_until {
469 0     0     my ($self, $netloc, $fresh_until) = @_;
470 0           my $old = $self->{'loc'}{$netloc}{'fresh'};
471 0 0         if (defined $fresh_until) {
472 0           $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
473             }
474 0           $old;
475             }
476              
477              
478             sub push_rules {
479 0     0     my($self, $netloc, @rules) = @_;
480 0           push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
  0            
481             }
482              
483              
484             sub clear_rules {
485 0     0     my($self, $netloc) = @_;
486 0           delete $self->{'loc'}{$netloc}{'rules'};
487             }
488              
489              
490             sub rules {
491 0     0     my($self, $netloc) = @_;
492 0 0         if (defined $self->{'loc'}{$netloc}{'rules'}) {
493 0           return @{$self->{'loc'}{$netloc}{'rules'}};
  0            
494             }
495             else {
496 0           return ();
497             }
498             }
499              
500              
501             sub dump
502             {
503 0     0     my $self = shift;
504 0           for (keys %$self) {
505 0 0         next if $_ eq 'loc';
506 0           print "$_ = $self->{$_}\n";
507             }
508 0           for (keys %{$self->{'loc'}}) {
  0            
509 0           my @rules = $self->rules($_);
510 0           print "$_: ", join("; ", @rules), "\n";
511             }
512             }
513