line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Logfile::EPrints::Filter::RobotsTxt; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Logfile::EPrints::Filter::RobotsTxt - Filter Web log hits using a database of robot's IPs |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 OPTIONS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=over 4 |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=item file |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Specify the robots DBM file to use. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=back |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=cut |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
require bytes; |
20
|
6
|
|
|
6
|
|
38
|
use Fcntl; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
2066
|
|
21
|
6
|
|
|
6
|
|
39
|
use SDBM_File; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
419
|
|
22
|
|
|
|
|
|
|
|
23
|
6
|
|
|
6
|
|
66
|
use constant BOT_CACHE => '/usr/local/share/Logfile/botcache.db'; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
422
|
|
24
|
6
|
|
|
6
|
|
34
|
use constant CACHE_TIMEOUT => 60*60*24*30; # 30 days |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
284
|
|
25
|
6
|
|
|
6
|
|
32
|
use vars qw( $AUTOLOAD ); |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
3717
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub new |
28
|
|
|
|
|
|
|
{ |
29
|
1
|
|
|
1
|
0
|
1059
|
my ($class,%args) = @_; |
30
|
1
|
|
33
|
|
|
12
|
my $self = bless \%args, ref($class) || $class; |
31
|
1
|
|
50
|
|
|
9
|
my $filename = $args{'file'} || BOT_CACHE; |
32
|
1
|
50
|
|
|
|
2
|
tie %{$self->{cache}}, 'SDBM_File', $filename, O_CREAT|O_RDWR, 0644 |
|
1
|
|
|
|
|
140
|
|
33
|
|
|
|
|
|
|
or die "Unable to open robots cache database at $filename: $!"; |
34
|
1
|
|
|
|
|
3
|
my @KEYS; |
35
|
1
|
|
|
|
|
2
|
while( my ($key, $value) = each %{$self->{cache}} ) |
|
1
|
|
|
|
|
31
|
|
36
|
|
|
|
|
|
|
{ |
37
|
0
|
|
|
|
|
0
|
my ($utime,$agent) = unpack("la*", $value); |
38
|
0
|
0
|
|
|
|
0
|
push @KEYS, $key if( $utime < time - CACHE_TIMEOUT ); |
39
|
|
|
|
|
|
|
} |
40
|
1
|
|
|
|
|
5
|
delete $self->{cache}->{$_} for @KEYS; |
41
|
1
|
|
|
|
|
12
|
$self; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub DESTROY |
45
|
|
|
|
|
|
|
{ |
46
|
1
|
|
|
1
|
|
871
|
my $self = shift; |
47
|
1
|
|
|
|
|
3
|
untie %{$self->{cache}}; |
|
1
|
|
|
|
|
120
|
|
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub AUTOLOAD |
51
|
|
|
|
|
|
|
{ |
52
|
1122
|
|
|
1122
|
|
3871
|
$AUTOLOAD =~ s/^.*:://; |
53
|
1122
|
50
|
|
|
|
3204
|
return if $AUTOLOAD =~ /[A-Z]$/; |
54
|
1122
|
|
|
|
|
1591
|
my ($self,$hit) = @_; |
55
|
1122
|
100
|
66
|
|
|
5316
|
if( defined($hit->page) && $hit->page =~ /robots\.txt$/ ) |
56
|
|
|
|
|
|
|
{ |
57
|
21
|
|
|
|
|
59
|
$self->robotstxt($hit); |
58
|
21
|
|
|
|
|
258
|
return undef; |
59
|
|
|
|
|
|
|
} |
60
|
1101
|
100
|
|
|
|
3585
|
if( defined(my $value = $self->{cache}->{$hit->address}) ) |
61
|
|
|
|
|
|
|
{ |
62
|
|
|
|
|
|
|
#warn "Ignoring hit from " . $hit->address . " (" . $self->{cache}->{$hit->address} . ")"; |
63
|
7
|
|
|
|
|
35
|
my( $utime ) = unpack("l",$value); |
64
|
7
|
50
|
|
|
|
23
|
if( $utime > CACHE_TIMEOUT ) |
65
|
|
|
|
|
|
|
{ |
66
|
7
|
|
|
|
|
22
|
delete $self->{cache}->{$hit->address}; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
else |
69
|
|
|
|
|
|
|
{ |
70
|
0
|
|
|
|
|
0
|
return undef; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
|
74
|
1101
|
|
|
|
|
6109
|
return $self->{handler}->$AUTOLOAD($hit); |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub robotstxt |
78
|
|
|
|
|
|
|
{ |
79
|
21
|
|
|
21
|
0
|
26
|
my ($self,$hit) = @_; |
80
|
|
|
|
|
|
|
#warn "Got new robot: " . join(',',$hit->address,$hit->utime,$hit->agent) . "\n"; |
81
|
|
|
|
|
|
|
# SDBM_File format only supports upto 1008 bytes |
82
|
21
|
|
50
|
|
|
111
|
$self->{cache}->{$hit->address} = bytes::substr(pack("la*",$hit->utime,$hit->agent||''),0,1008); |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
1; |