line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: /mirror/perl/HTML-RobotsMETA/trunk/lib/HTML/RobotsMETA.pm 4223 2007-10-29T06:42:26.630870Z daisuke $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Copyright (c) 2007 Daisuke Maki |
4
|
|
|
|
|
|
|
# All rights reserved. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package HTML::RobotsMETA; |
7
|
2
|
|
|
2
|
|
32932
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
87
|
|
8
|
2
|
|
|
2
|
|
13
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
64
|
|
9
|
2
|
|
|
2
|
|
2894
|
use HTML::Parser; |
|
2
|
|
|
|
|
15086
|
|
|
2
|
|
|
|
|
211
|
|
10
|
2
|
|
|
2
|
|
1692
|
use HTML::RobotsMETA::Rules; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
1185
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.00004'; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new |
14
|
|
|
|
|
|
|
{ |
15
|
1
|
|
|
1
|
1
|
13
|
my $class = shift; |
16
|
1
|
|
|
|
|
5
|
my $self = bless {}, $class; |
17
|
1
|
|
|
|
|
3
|
return $self; |
18
|
|
|
|
|
|
|
} |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub parser |
21
|
|
|
|
|
|
|
{ |
22
|
4
|
|
|
4
|
1
|
6
|
my $self = shift; |
23
|
4
|
|
66
|
|
|
22
|
return $self->{parser} ||= HTML::Parser->new( |
24
|
|
|
|
|
|
|
api_version => 3, |
25
|
|
|
|
|
|
|
$self->get_parser_callbacks |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub get_parser_callbacks |
30
|
|
|
|
|
|
|
{ |
31
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
32
|
|
|
|
|
|
|
return ( |
33
|
1
|
|
|
24
|
|
15
|
start_h => [ sub { $self->_parse_start_h(@_) }, "tagname, attr" ] |
|
24
|
|
|
|
|
65
|
|
34
|
|
|
|
|
|
|
); |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
sub parse_rules |
38
|
|
|
|
|
|
|
{ |
39
|
4
|
|
|
4
|
1
|
1400
|
my $self = shift; |
40
|
|
|
|
|
|
|
|
41
|
4
|
|
|
|
|
11
|
my @rules; |
42
|
4
|
|
|
|
|
16
|
local $self->{rules} = \@rules; |
43
|
|
|
|
|
|
|
|
44
|
4
|
|
|
|
|
13
|
my $parser = $self->parser(); |
45
|
|
|
|
|
|
|
|
46
|
4
|
|
|
|
|
120
|
$parser->parse(@_); |
47
|
4
|
|
|
|
|
13
|
$parser->eof; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# merge rules that were found in this document |
50
|
4
|
|
|
|
|
8
|
my %directives = (map { %$_ } @rules); |
|
4
|
|
|
|
|
21
|
|
51
|
4
|
|
|
|
|
24
|
return HTML::RobotsMETA::Rules->new(%directives); |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
sub _parse_start_h |
55
|
|
|
|
|
|
|
{ |
56
|
24
|
|
|
24
|
|
38
|
my ($self, $tag, $attr) = @_; |
57
|
|
|
|
|
|
|
|
58
|
24
|
100
|
|
|
|
118
|
return unless $tag eq 'meta'; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# the "name" attribute may contain either "robots", or user-specified |
61
|
|
|
|
|
|
|
# robot name, which is specific to a particular crawler |
62
|
|
|
|
|
|
|
# XXX - Handle the specific agent part later |
63
|
8
|
100
|
66
|
|
|
84
|
return unless defined $attr->{name} && $attr->{name} =~ /^robots$/; |
64
|
|
|
|
|
|
|
|
65
|
4
|
|
|
|
|
8
|
my %directives; |
66
|
|
|
|
|
|
|
# Allowed values |
67
|
|
|
|
|
|
|
# FOLLOW |
68
|
|
|
|
|
|
|
# NOFOLLOW |
69
|
|
|
|
|
|
|
# INDEX |
70
|
|
|
|
|
|
|
# NOINDEX |
71
|
|
|
|
|
|
|
# ARCHIVE |
72
|
|
|
|
|
|
|
# NOARCHIVE |
73
|
|
|
|
|
|
|
# SERVE |
74
|
|
|
|
|
|
|
# NOSERVER |
75
|
|
|
|
|
|
|
# NOIMAGEINDEX |
76
|
|
|
|
|
|
|
# NOIMAGECLICK |
77
|
|
|
|
|
|
|
# ALL |
78
|
|
|
|
|
|
|
# NONE |
79
|
4
|
|
|
|
|
15
|
my $content = lc $attr->{content}; |
80
|
4
|
|
|
|
|
31
|
while ($content =~ /((?:no)?(follow|index|archive|serve)|(?:noimage(?:index|click))|all|none)/g) { |
81
|
6
|
|
|
|
|
38
|
$directives{$1}++; |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
4
|
|
|
|
|
5
|
push @{$self->{rules}}, \%directives; |
|
4
|
|
|
|
|
33
|
|
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
1; |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
__END__ |