line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: /mirror/perl/GunghoX-FollowLinks/trunk/lib/GunghoX/FollowLinks/Parser/HTML.pm 9010 2007-11-13T02:08:07.210715Z daisuke $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp> |
4
|
|
|
|
|
|
|
# All rights reserved. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package GunghoX::FollowLinks::Parser::HTML; |
7
|
3
|
|
|
3
|
|
109451
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
69
|
|
8
|
3
|
|
|
3
|
|
11
|
use warnings; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
66
|
|
9
|
3
|
|
|
3
|
|
9
|
use base qw(GunghoX::FollowLinks::Parser); |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
712
|
|
10
|
|
|
|
|
|
|
use HTML::Parser; |
11
|
|
|
|
|
|
|
use HTML::Tagset; |
12
|
|
|
|
|
|
|
use URI; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
__PACKAGE__->mk_accessors($_) for qw(parser); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub new |
17
|
|
|
|
|
|
|
{ |
18
|
|
|
|
|
|
|
my $class = shift; |
19
|
|
|
|
|
|
|
my $parser = HTML::Parser->new( |
20
|
|
|
|
|
|
|
start_h => [ \&_start, "self,tagname,attr" ], |
21
|
|
|
|
|
|
|
report_tags => [ keys %HTML::Tagset::linkElements ], |
22
|
|
|
|
|
|
|
); |
23
|
|
|
|
|
|
|
return $class->next::method( |
24
|
|
|
|
|
|
|
content_type => 'text/html', |
25
|
|
|
|
|
|
|
@_, |
26
|
|
|
|
|
|
|
parser => $parser |
27
|
|
|
|
|
|
|
); |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub _start |
31
|
|
|
|
|
|
|
{ |
32
|
|
|
|
|
|
|
my ($self, $tag, $attr) = @_; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my $links = $HTML::Tagset::linkElements{ $tag }; |
35
|
|
|
|
|
|
|
$links = [ $links ] unless ref $links; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $container = $self->{ 'container' }; |
38
|
|
|
|
|
|
|
my $c = $self->{ 'context' }; |
39
|
|
|
|
|
|
|
my $response = $self->{ 'response' }; |
40
|
|
|
|
|
|
|
my $base = $response->request->uri; |
41
|
|
|
|
|
|
|
foreach my $link_attr (@$links) { |
42
|
|
|
|
|
|
|
next unless exists $attr->{ $link_attr }; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $url = URI->new_abs( $attr->{ $link_attr }, $base ); |
45
|
|
|
|
|
|
|
if ($container->follow_if_allowed( $c, $response, $url, { tag => $tag, attr => $attr } )) { |
46
|
|
|
|
|
|
|
$self->{ 'count' }++; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub parse |
52
|
|
|
|
|
|
|
{ |
53
|
|
|
|
|
|
|
my ($self, $c, $response) = @_; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
my $parser = $self->parser; |
56
|
|
|
|
|
|
|
local $parser->{ 'response' } = $response; |
57
|
|
|
|
|
|
|
local $parser->{ 'container' } = $self; |
58
|
|
|
|
|
|
|
local $parser->{ 'context' } = $c; |
59
|
|
|
|
|
|
|
local $parser->{ 'count' } = 0; |
60
|
|
|
|
|
|
|
$parser->parse( $response->content ); |
61
|
|
|
|
|
|
|
$parser->eof; |
62
|
|
|
|
|
|
|
return $parser->{ 'count' }; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
1; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
__END__ |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 NAME |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
GunghoX::FollowLinks::Parser::HTML - FollowLinks Parser For HTML Documents |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=head1 METHODS |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=head2 new |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 parse |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=cut |