line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::LinkExtor; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require HTML::Parser; |
4
|
|
|
|
|
|
|
our @ISA = qw(HTML::Parser); |
5
|
|
|
|
|
|
|
our $VERSION = '3.81'; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTML::LinkExtor - Extract links from an HTML document |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
require HTML::LinkExtor; |
14
|
|
|
|
|
|
|
$p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); |
15
|
|
|
|
|
|
|
sub cb { |
16
|
|
|
|
|
|
|
my($tag, %links) = @_; |
17
|
|
|
|
|
|
|
print "$tag @{[%links]}\n"; |
18
|
|
|
|
|
|
|
} |
19
|
|
|
|
|
|
|
$p->parse_file("index.html"); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 DESCRIPTION |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
I is an HTML parser that extracts links from an |
24
|
|
|
|
|
|
|
HTML document. The I is a subclass of |
25
|
|
|
|
|
|
|
I. This means that the document should be given to the |
26
|
|
|
|
|
|
|
parser by calling the $p->parse() or $p->parse_file() methods. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=cut |
29
|
|
|
|
|
|
|
|
30
|
2
|
|
|
2
|
|
1007
|
use strict; |
|
2
|
|
|
|
|
13
|
|
|
2
|
|
|
|
|
62
|
|
31
|
2
|
|
|
2
|
|
973
|
use HTML::Tagset (); |
|
2
|
|
|
|
|
2954
|
|
|
2
|
|
|
|
|
1007
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# legacy (some applications grabs this hash directly) |
34
|
|
|
|
|
|
|
our %LINK_ELEMENT; |
35
|
|
|
|
|
|
|
*LINK_ELEMENT = \%HTML::Tagset::linkElements; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=over 4 |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=item $p = HTML::LinkExtor->new |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=item $p = HTML::LinkExtor->new( $callback ) |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=item $p = HTML::LinkExtor->new( $callback, $base ) |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
The constructor takes two optional arguments. The first is a reference |
46
|
|
|
|
|
|
|
to a callback routine. It will be called as links are found. If a |
47
|
|
|
|
|
|
|
callback is not provided, then links are just accumulated internally |
48
|
|
|
|
|
|
|
and can be retrieved by calling the $p->links() method. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
The $base argument is an optional base URL used to absolutize all URLs found. |
51
|
|
|
|
|
|
|
You need to have the I module installed if you provide $base. |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
The callback is called with the lowercase tag name as first argument, |
54
|
|
|
|
|
|
|
and then all link attributes as separate key/value pairs. All |
55
|
|
|
|
|
|
|
non-link attributes are removed. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=cut |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub new |
60
|
|
|
|
|
|
|
{ |
61
|
3
|
|
|
3
|
1
|
1734
|
my($class, $cb, $base) = @_; |
62
|
3
|
|
|
|
|
48
|
my $self = $class->SUPER::new( |
63
|
|
|
|
|
|
|
start_h => ["_start_tag", "self,tagname,attr"], |
64
|
|
|
|
|
|
|
report_tags => [keys %HTML::Tagset::linkElements], |
65
|
|
|
|
|
|
|
); |
66
|
3
|
|
|
|
|
21
|
$self->{extractlink_cb} = $cb; |
67
|
3
|
100
|
|
|
|
10
|
if ($base) { |
68
|
1
|
|
|
|
|
8
|
require URI; |
69
|
1
|
|
|
|
|
7
|
$self->{extractlink_base} = URI->new($base); |
70
|
|
|
|
|
|
|
} |
71
|
3
|
|
|
|
|
8089
|
$self; |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
sub _start_tag |
75
|
|
|
|
|
|
|
{ |
76
|
11
|
|
|
11
|
|
156
|
my($self, $tag, $attr) = @_; |
77
|
|
|
|
|
|
|
|
78
|
11
|
|
|
|
|
23
|
my $base = $self->{extractlink_base}; |
79
|
11
|
|
|
|
|
18
|
my $links = $HTML::Tagset::linkElements{$tag}; |
80
|
11
|
50
|
|
|
|
32
|
$links = [$links] unless ref $links; |
81
|
|
|
|
|
|
|
|
82
|
11
|
|
|
|
|
16
|
my @links; |
83
|
|
|
|
|
|
|
my $a; |
84
|
11
|
|
|
|
|
27
|
for $a (@$links) { |
85
|
17
|
100
|
|
|
|
542
|
next unless exists $attr->{$a}; |
86
|
10
|
|
|
|
|
35
|
(my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5 |
|
10
|
|
|
|
|
20
|
|
87
|
10
|
100
|
|
|
|
90
|
push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link); |
88
|
|
|
|
|
|
|
} |
89
|
11
|
100
|
|
|
|
912
|
return unless @links; |
90
|
8
|
|
|
|
|
29
|
$self->_found_link($tag, @links); |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub _found_link |
94
|
|
|
|
|
|
|
{ |
95
|
8
|
|
|
8
|
|
12
|
my $self = shift; |
96
|
8
|
|
|
|
|
17
|
my $cb = $self->{extractlink_cb}; |
97
|
8
|
100
|
|
|
|
17
|
if ($cb) { |
98
|
4
|
|
|
|
|
10
|
&$cb(@_); |
99
|
|
|
|
|
|
|
} else { |
100
|
4
|
|
|
|
|
7
|
push(@{$self->{'links'}}, [@_]); |
|
4
|
|
|
|
|
71
|
|
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item $p->links |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Returns a list of all links found in the document. The returned |
107
|
|
|
|
|
|
|
values will be anonymous arrays with the following elements: |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
[$tag, $attr => $url1, $attr2 => $url2,...] |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
The $p->links method will also truncate the internal link list. This |
112
|
|
|
|
|
|
|
means that if the method is called twice without any parsing |
113
|
|
|
|
|
|
|
between them the second call will return an empty list. |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Also note that $p->links will always be empty if a callback routine |
116
|
|
|
|
|
|
|
was provided when the I was created. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=cut |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub links |
121
|
|
|
|
|
|
|
{ |
122
|
2
|
|
|
2
|
1
|
15
|
my $self = shift; |
123
|
2
|
100
|
|
|
|
12
|
exists($self->{'links'}) ? @{delete $self->{'links'}} : (); |
|
1
|
|
|
|
|
6
|
|
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# We override the parse_file() method so that we can clear the links |
127
|
|
|
|
|
|
|
# before we start a new file. |
128
|
|
|
|
|
|
|
sub parse_file |
129
|
|
|
|
|
|
|
{ |
130
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
131
|
0
|
|
|
|
|
|
delete $self->{'links'}; |
132
|
0
|
|
|
|
|
|
$self->SUPER::parse_file(@_); |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=back |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head1 EXAMPLE |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
This is an example showing how you can extract links from a document |
140
|
|
|
|
|
|
|
received using LWP: |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
use LWP::UserAgent; |
143
|
|
|
|
|
|
|
use HTML::LinkExtor; |
144
|
|
|
|
|
|
|
use URI::URL; |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
$url = "http://www.perl.org/"; # for instance |
147
|
|
|
|
|
|
|
$ua = LWP::UserAgent->new; |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# Set up a callback that collect image links |
150
|
|
|
|
|
|
|
my @imgs = (); |
151
|
|
|
|
|
|
|
sub callback { |
152
|
|
|
|
|
|
|
my($tag, %attr) = @_; |
153
|
|
|
|
|
|
|
return if $tag ne 'img'; # we only look closer at |
154
|
|
|
|
|
|
|
push(@imgs, values %attr); |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Make the parser. Unfortunately, we don't know the base yet |
158
|
|
|
|
|
|
|
# (it might be different from $url) |
159
|
|
|
|
|
|
|
$p = HTML::LinkExtor->new(\&callback); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# Request document and parse it as it arrives |
162
|
|
|
|
|
|
|
$res = $ua->request(HTTP::Request->new(GET => $url), |
163
|
|
|
|
|
|
|
sub {$p->parse($_[0])}); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Expand all image URLs to absolute ones |
166
|
|
|
|
|
|
|
my $base = $res->base; |
167
|
|
|
|
|
|
|
@imgs = map { $_ = url($_, $base)->abs; } @imgs; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Print them out |
170
|
|
|
|
|
|
|
print join("\n", @imgs), "\n"; |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 SEE ALSO |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
L, L, L, L |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 COPYRIGHT |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Copyright 1996-2001 Gisle Aas. |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or |
181
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=cut |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
1; |