line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: /mirror/perl/GunghoX-FollowLinks/trunk/lib/GunghoX/FollowLinks/Parser/HTML.pm 40584 2008-01-29T14:54:08.742000Z daisuke $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp> |
4
|
|
|
|
|
|
|
# All rights reserved. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package GunghoX::FollowLinks::Parser::HTML; |
7
|
3
|
|
|
3
|
|
153736
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
91
|
|
8
|
3
|
|
|
3
|
|
12
|
use warnings; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
95
|
|
9
|
3
|
|
|
3
|
|
9
|
use base qw(GunghoX::FollowLinks::Parser); |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
845
|
|
10
|
|
|
|
|
|
|
use HTML::Parser; |
11
|
|
|
|
|
|
|
use HTML::Tagset; |
12
|
|
|
|
|
|
|
use URI; |
13
|
|
|
|
|
|
|
use List::Util qw(shuffle); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
__PACKAGE__->mk_accessors($_) for qw(parser); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
sub new |
18
|
|
|
|
|
|
|
{ |
19
|
|
|
|
|
|
|
my $class = shift; |
20
|
|
|
|
|
|
|
my $parser = HTML::Parser->new( |
21
|
|
|
|
|
|
|
start_h => [ \&_start, "self,tagname,attr" ], |
22
|
|
|
|
|
|
|
report_tags => [ keys %HTML::Tagset::linkElements ], |
23
|
|
|
|
|
|
|
); |
24
|
|
|
|
|
|
|
return $class->next::method( |
25
|
|
|
|
|
|
|
content_type => 'text/html', |
26
|
|
|
|
|
|
|
@_, |
27
|
|
|
|
|
|
|
parser => $parser |
28
|
|
|
|
|
|
|
); |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub _start |
32
|
|
|
|
|
|
|
{ |
33
|
|
|
|
|
|
|
my ($self, $tag, $attr) = @_; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $links = $HTML::Tagset::linkElements{ $tag }; |
36
|
|
|
|
|
|
|
$links = [ $links ] unless ref $links; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
my $container = $self->{ 'container' }; |
39
|
|
|
|
|
|
|
my $c = $self->{ 'context' }; |
40
|
|
|
|
|
|
|
my $response = $self->{ 'response' }; |
41
|
|
|
|
|
|
|
my $base = $response->request->uri; |
42
|
|
|
|
|
|
|
foreach my $link_attr (shuffle @$links) { |
43
|
|
|
|
|
|
|
next unless exists $attr->{ $link_attr }; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
my $url = URI->new_abs( $attr->{ $link_attr }, $base ); |
46
|
|
|
|
|
|
|
if ($container->follow_if_allowed( $c, $response, $url, { tag => $tag, attr => $attr } )) { |
47
|
|
|
|
|
|
|
$self->{ 'count' }++; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub parse |
53
|
|
|
|
|
|
|
{ |
54
|
|
|
|
|
|
|
my ($self, $c, $response) = @_; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
my $parser = $self->parser; |
57
|
|
|
|
|
|
|
local $parser->{ 'response' } = $response; |
58
|
|
|
|
|
|
|
local $parser->{ 'container' } = $self; |
59
|
|
|
|
|
|
|
local $parser->{ 'context' } = $c; |
60
|
|
|
|
|
|
|
local $parser->{ 'count' } = 0; |
61
|
|
|
|
|
|
|
$parser->parse( $response->content ); |
62
|
|
|
|
|
|
|
$parser->eof; |
63
|
|
|
|
|
|
|
return $parser->{ 'count' }; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
1; |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
__END__ |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 NAME |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
GunghoX::FollowLinks::Parser::HTML - FollowLinks Parser For HTML Documents |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head1 METHODS |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=head2 new |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head2 parse |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=cut |