|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Logfile::EPrints::Filter::RobotsTxt;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Logfile::EPrints::Filter::RobotsTxt - Filter Web log hits using a database of robot's IPs  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 OPTIONS  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item file  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Specify the robots DBM file to use.  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require bytes;  | 
| 
20
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
38
 | 
 use Fcntl;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2066
 | 
    | 
| 
21
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
39
 | 
 use SDBM_File;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
419
 | 
    | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
66
 | 
 use constant BOT_CACHE => '/usr/local/share/Logfile/botcache.db';  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
422
 | 
    | 
| 
24
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
34
 | 
 use constant CACHE_TIMEOUT => 60*60*24*30; # 30 days  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
284
 | 
    | 
| 
25
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
32
 | 
 use vars qw( $AUTOLOAD );  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3717
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 {  | 
| 
29
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
0
  
 | 
1059
 | 
 	my ($class,%args) = @_;  | 
| 
30
 | 
1
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
12
 | 
 	my $self = bless \%args, ref($class) || $class;  | 
| 
31
 | 
1
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
9
 | 
 	my $filename = $args{'file'} || BOT_CACHE;  | 
| 
32
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
 	tie %{$self->{cache}}, 'SDBM_File', $filename, O_CREAT|O_RDWR, 0644  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
140
 | 
    | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		or die "Unable to open robots cache database at $filename: $!";  | 
| 
34
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
 	my @KEYS;  | 
| 
35
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
 	while( my ($key, $value) = each %{$self->{cache}} )  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
    | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
37
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		my ($utime,$agent) = unpack("la*", $value);  | 
| 
38
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		push @KEYS, $key if( $utime < time - CACHE_TIMEOUT );  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
40
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
 	delete $self->{cache}->{$_} for @KEYS;  | 
| 
41
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
 	$self;  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub DESTROY  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 {  | 
| 
46
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
871
 | 
 	my $self = shift;  | 
| 
47
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
 	untie %{$self->{cache}};  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
120
 | 
    | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub AUTOLOAD  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 {  | 
| 
52
 | 
1122
 | 
 
 | 
 
 | 
  
1122
  
 | 
 
 | 
3871
 | 
 	$AUTOLOAD =~ s/^.*:://;  | 
| 
53
 | 
1122
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3204
 | 
 	return if $AUTOLOAD =~ /[A-Z]$/;  | 
| 
54
 | 
1122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1591
 | 
 	my ($self,$hit) = @_;  | 
| 
55
 | 
1122
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
5316
 | 
 	if( defined($hit->page) && $hit->page =~ /robots\.txt$/ )  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
57
 | 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
 		$self->robotstxt($hit);  | 
| 
58
 | 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
258
 | 
 		return undef;  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
60
 | 
1101
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3585
 | 
 	if( defined(my $value = $self->{cache}->{$hit->address}) )  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		#warn "Ignoring hit from " . $hit->address . " (" . $self->{cache}->{$hit->address} . ")";  | 
| 
63
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
35
 | 
 		my( $utime ) = unpack("l",$value);  | 
| 
64
 | 
7
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
23
 | 
 		if( $utime > CACHE_TIMEOUT )  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		{  | 
| 
66
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
 			delete $self->{cache}->{$hit->address};  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		else  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		{  | 
| 
70
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			return undef;  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
74
 | 
1101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6109
 | 
 	return $self->{handler}->$AUTOLOAD($hit);  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub robotstxt  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 {  | 
| 
79
 | 
21
 | 
 
 | 
 
 | 
  
21
  
 | 
  
0
  
 | 
26
 | 
 	my ($self,$hit) = @_;  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	#warn "Got new robot: " . join(',',$hit->address,$hit->utime,$hit->agent) . "\n";  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# SDBM_File format only supports upto 1008 bytes  | 
| 
82
 | 
21
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
111
 | 
 	$self->{cache}->{$hit->address} = bytes::substr(pack("la*",$hit->utime,$hit->agent||''),0,1008);  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |