line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package Plack::Middleware::DetectRobots; |
3
|
|
|
|
|
|
|
{ |
4
|
|
|
|
|
|
|
$Plack::Middleware::DetectRobots::VERSION = '0.02'; |
5
|
|
|
|
|
|
|
} |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# ABSTRACT: Automatically set a flag in the environment if a robot client is detected |
8
|
|
|
|
|
|
|
|
9
|
5
|
|
|
5
|
|
38403
|
use strict; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
1097
|
|
10
|
5
|
|
|
5
|
|
32
|
use warnings; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
423
|
|
11
|
|
|
|
|
|
|
|
12
|
5
|
|
|
5
|
|
1176
|
use parent qw(Plack::Middleware); |
|
5
|
|
|
|
|
295
|
|
|
5
|
|
|
|
|
51
|
|
13
|
5
|
|
|
5
|
|
21368
|
use Plack::Util::Accessor qw( env_key basic_check extended_check generic_check local_regexp ); |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
48
|
|
14
|
5
|
|
|
5
|
|
11778
|
use Regexp::Assemble qw(); |
|
5
|
|
|
|
|
155528
|
|
|
5
|
|
|
|
|
190
|
|
15
|
5
|
|
|
5
|
|
65
|
use feature 'state'; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
7522
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
sub prepare_app { |
18
|
4
|
|
|
4
|
1
|
898
|
my $self = shift; |
19
|
4
|
100
|
|
|
|
24
|
$self->basic_check(1) unless defined $self->basic_check; |
20
|
4
|
|
|
|
|
488
|
return; |
21
|
|
|
|
|
|
|
} |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub call { |
24
|
66
|
|
|
66
|
1
|
1020154
|
my ( $self, $env ) = @_; |
25
|
|
|
|
|
|
|
|
26
|
66
|
|
|
|
|
127
|
state $reList = _read_list(); |
27
|
66
|
|
|
|
|
110
|
state $basic = _assemble( $reList, 'basic' ); |
28
|
66
|
|
|
|
|
50704
|
state $extended = _assemble( $reList, 'extended' ); |
29
|
66
|
|
|
|
|
606220
|
state $generic = _assemble( $reList, 'generic' ); |
30
|
66
|
|
|
|
|
13340
|
$reList = undef; |
31
|
|
|
|
|
|
|
|
32
|
66
|
50
|
|
|
|
470
|
my $key = defined( $self->env_key ) ? $self->env_key : 'robot_client'; |
33
|
|
|
|
|
|
|
|
34
|
66
|
|
|
|
|
546
|
my $ua = $env->{'HTTP_USER_AGENT'}; |
35
|
|
|
|
|
|
|
|
36
|
66
|
|
|
|
|
163
|
$env->{$key} = 0; |
37
|
|
|
|
|
|
|
|
38
|
66
|
|
|
|
|
219
|
my $local = $self->local_regexp; |
39
|
66
|
100
|
66
|
|
|
498
|
if ( defined($local) and ( ref $local eq ref qr// ) and ( $ua =~ $local ) ) { |
|
|
|
100
|
|
|
|
|
40
|
1
|
|
|
|
|
3
|
$env->{$key} = 'LOCAL'; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
66
|
100
|
100
|
|
|
353
|
if ( !$env->{$key} and $self->basic_check ) { |
44
|
63
|
100
|
|
|
|
2289
|
if ( $ua =~ $basic ) { |
45
|
21
|
|
|
|
|
58
|
$env->{$key} = 'BASIC'; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
66
|
100
|
100
|
|
|
331
|
if ( !$env->{$key} and $self->extended_check ) { |
50
|
14
|
100
|
|
|
|
1095
|
if ( $ua =~ $extended ) { |
51
|
4
|
|
|
|
|
10
|
$env->{$key} = 'EXTENDED'; |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
66
|
100
|
100
|
|
|
521
|
if ( !$env->{$key} and $self->generic_check ) { |
56
|
14
|
100
|
|
|
|
528
|
if ( $ua =~ $generic ) { |
57
|
4
|
|
|
|
|
14
|
$env->{$key} = 'GENERIC'; |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
66
|
|
|
|
|
667
|
return $self->app->($env); |
62
|
|
|
|
|
|
|
} ## end sub call |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
sub _assemble { |
65
|
12
|
|
|
12
|
|
53
|
my ( $bots, $type ) = @_; |
66
|
|
|
|
|
|
|
|
67
|
12
|
|
|
|
|
114
|
my $ra = Regexp::Assemble->new( flags => 'i' ); |
68
|
12
|
|
|
|
|
1307
|
foreach my $r ( @{ $bots->{$type} } ) { |
|
12
|
|
|
|
|
55
|
|
69
|
3176
|
|
|
|
|
630603
|
$ra->add($r); |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
12
|
|
|
|
|
2047
|
return $ra->re; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub _read_list { |
76
|
4
|
|
|
4
|
|
28
|
my $bots = { basic => [], extended => [], generic => [], }; |
77
|
4
|
|
|
|
|
12
|
my $currentType = 'basic'; |
78
|
|
|
|
|
|
|
|
79
|
4
|
|
|
|
|
15
|
state $pos = tell(Plack::Middleware::DetectRobots::DATA); |
80
|
4
|
50
|
|
|
|
32
|
if ( $ENV{'HARNESS_ACTIVE'} ) { |
81
|
4
|
|
|
|
|
46
|
seek( Plack::Middleware::DetectRobots::DATA, $pos, 0 ); |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
4
|
|
|
|
|
145
|
while () { |
85
|
3180
|
|
|
|
|
4523
|
chomp; |
86
|
3180
|
100
|
|
|
|
6370
|
next unless $_; |
87
|
3176
|
100
|
|
|
|
6408
|
$currentType = 'extended' if /\A##\s+EXTENDED/; |
88
|
3176
|
100
|
|
|
|
7822
|
$currentType = 'generic' if /\A##\s+GENERIC/; |
89
|
|
|
|
|
|
|
|
90
|
3176
|
|
|
|
|
3141
|
push @{ $bots->{$currentType} }, $_; |
|
3176
|
|
|
|
|
13593
|
|
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
4
|
50
|
|
|
|
31
|
if ( !$ENV{'HARNESS_ACTIVE'} ) { |
94
|
0
|
|
|
|
|
0
|
close Plack::Middleware::DetectRobots::DATA; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
4
|
|
|
|
|
22
|
return $bots; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
1; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=pod |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=encoding utf-8 |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head1 NAME |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Plack::Middleware::DetectRobots - Automatically set a flag in the environment if a robot client is detected |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 VERSION |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
version 0.02 |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 SYNOPSIS |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
use Plack::Builder; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
my $app = sub { ... } # as usual |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
builder { |
121
|
|
|
|
|
|
|
enable 'DetectRobots'; |
122
|
|
|
|
|
|
|
# or: enable 'DetectRobots', env_key => 'psgix.robot_client'; |
123
|
|
|
|
|
|
|
# or: enable 'DetectRobots', extended_check => 1, generic_check => 1; |
124
|
|
|
|
|
|
|
$app; |
125
|
|
|
|
|
|
|
}; |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# ... and later ... |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
if ( $env->{'robot_client'} ) { |
130
|
|
|
|
|
|
|
# ... do something ... |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head1 DESCRIPTION |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
This Plack middleware uses the list of robots that is part of the |
136
|
|
|
|
|
|
|
L software package to |
137
|
|
|
|
|
|
|
analyse the C<User-Agent> HTTP header and to set an environment |
138
|
|
|
|
|
|
|
flag to either a true or false value depending on the detection |
139
|
|
|
|
|
|
|
of a robot client. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Once activated it checks the User-Agent HTTP header against a |
142
|
|
|
|
|
|
|
basic list of patterns for common bots. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
If you activate the appropriate options, it can also use an extended |
145
|
|
|
|
|
|
|
list for the detection of less common bots (cf. C<extended_check>) |
146
|
|
|
|
|
|
|
and / or a list of quite generic patterns to detect unknown bots |
147
|
|
|
|
|
|
|
(cf. C<generic_check>). |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
You may also pass in your own regular expression as a Regexp object for |
150
|
|
|
|
|
|
|
further checks (cf. C<local_regexp>). |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The checks are executed in this order: |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
B<1.> Local regular expression |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
B<2.> Basic check |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
B<3.> Extended check |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
B<4.> Generic check |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
If a check yields a positive result (i.e.: detects a bot) the |
163
|
|
|
|
|
|
|
remaining checks are skipped. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Depending on the check which detected a bot, the environment flag |
166
|
|
|
|
|
|
|
is set to one of these values: C<LOCAL>, C<BASIC>, C<EXTENDED>, or |
167
|
|
|
|
|
|
|
C<GENERIC>. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
If no bot is detected, the flag is set to C<0>. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
The default name of the flag in the environment is C<robot_client>, |
172
|
|
|
|
|
|
|
but this can be customized by setting the C<env_key> option when |
173
|
|
|
|
|
|
|
enabling this middleware. |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
It might make sense to use C<psgix.robot_client> by default instead, |
176
|
|
|
|
|
|
|
but the PSGI spec states that the "'psgix.' prefix is reserved for |
177
|
|
|
|
|
|
|
officially blessed extensions" - which does not apply to this module. |
178
|
|
|
|
|
|
|
You may, however, set the key to C<psgix.robot_client> yourself |
179
|
|
|
|
|
|
|
by using the C<env_key> option mentioned before. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 WARNING |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
This software is currently considered BETA and still needs to |
184
|
|
|
|
|
|
|
be seriously tested! |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 ROBOTS LIST |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Based on B of |
189
|
|
|
|
|
|
|
L. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
B<Note:> that list might be somewhat dated, as I did not find bingbot |
192
|
|
|
|
|
|
|
in the list of common bots (only in the extended list) while its |
193
|
|
|
|
|
|
|
predecessor msnbot was considered common. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head1 CONFIGURATION |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
You may specify the following options when enabling the middleware: |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=over 4 |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=item C<env_key> |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Set the name of the entry in the environment hash. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=item C<basic_check> |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
You may deactivate the standard checks by setting this option to |
208
|
|
|
|
|
|
|
a false value. E.g. if you are only interested in obscure bots |
209
|
|
|
|
|
|
|
or in your local pattern checks. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
By setting this option to a false value while simultaneously |
212
|
|
|
|
|
|
|
passing a regular expression to C<local_regexp> one can imitate |
213
|
|
|
|
|
|
|
the behaviour of L<Plack::Middleware::BotDetector>. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=item C<extended_check> |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Determines if an extended list of less often seen robots is also |
218
|
|
|
|
|
|
|
checked for. |
219
|
|
|
|
|
|
|
By default, only common robots are checked for, because the extended |
220
|
|
|
|
|
|
|
check requires a rather large and complex regular expression. |
221
|
|
|
|
|
|
|
Set this param to a true value to change the default behaviour. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item C<generic_check> |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Determines if the User-Agent string is also analysed to determine |
226
|
|
|
|
|
|
|
if it contains certain strings that generically identify the |
227
|
|
|
|
|
|
|
client as a bot, e.g. "spider" or "crawler". |
228
|
|
|
|
|
|
|
By default, this check is not performed, even though it uses only |
229
|
|
|
|
|
|
|
a relatively short and simple regex. |
230
|
|
|
|
|
|
|
Set this param to a true value to change the default behaviour. |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=item C<local_regexp> |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
You may optionally pass in your own regular expression (as a Regexp |
235
|
|
|
|
|
|
|
object using C<qr//>) to check for additional patterns in the |
236
|
|
|
|
|
|
|
User-Agent string. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=back |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=head1 SEE ALSO |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
L, L, L, |
243
|
|
|
|
|
|
|
L |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
The functionality provided by C<Plack::Middleware::BotDetector> is |
246
|
|
|
|
|
|
|
basically the same as that of this module, but it requires you to |
247
|
|
|
|
|
|
|
pass in your own regular expression and does not include a default |
248
|
|
|
|
|
|
|
list of known bots. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=head1 AUTHOR |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Heiko Jansen |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
This software is copyright (c) 2014 by Heiko Jansen. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
259
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
__DATA__ |