File Coverage

blib/lib/HTML/LinkExtor.pm

Criterion	Covered	Total	%
statement	34	37	91.8
branch	13	14	92.8
condition			n/a
subroutine	6	7	85.7
pod	3	3	100.0
total	56	61	91.8

line	stmt	bran	sub	pod	time	code
1						package HTML::LinkExtor;
2
3						require HTML::Parser;
4						our @ISA = qw(HTML::Parser);
5						our $VERSION = '3.80';
6
7						=head1 NAME
8
9						HTML::LinkExtor - Extract links from an HTML document
10
11						=head1 SYNOPSIS
12
13						require HTML::LinkExtor;
14						$p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
15						sub cb {
16						my($tag, %links) = @_;
17						print "$tag @{[%links]}\n";
18						}
19						$p->parse_file("index.html");
20
21						=head1 DESCRIPTION
22
23						I is an HTML parser that extracts links from an
24						HTML document. The I is a subclass of
25						I. This means that the document should be given to the
26						parser by calling the $p->parse() or $p->parse_file() methods.
27
28						=cut
29
30	2		2		1005	use strict;
	2				15
	2				65
31	2		2		876	use HTML::Tagset ();
	2				2867
	2				1011
32
33						# legacy (some applications grabs this hash directly)
34						our %LINK_ELEMENT;
35						*LINK_ELEMENT = \%HTML::Tagset::linkElements;
36
37						=over 4
38
39						=item $p = HTML::LinkExtor->new
40
41						=item $p = HTML::LinkExtor->new( $callback )
42
43						=item $p = HTML::LinkExtor->new( $callback, $base )
44
45						The constructor takes two optional arguments. The first is a reference
46						to a callback routine. It will be called as links are found. If a
47						callback is not provided, then links are just accumulated internally
48						and can be retrieved by calling the $p->links() method.
49
50						The $base argument is an optional base URL used to absolutize all URLs found.
51						You need to have the I module installed if you provide $base.
52
53						The callback is called with the lowercase tag name as first argument,
54						and then all link attributes as separate key/value pairs. All
55						non-link attributes are removed.
56
57						=cut
58
59						sub new
60						{
61	3		3	1	1197	my($class, $cb, $base) = @_;
62	3				55	my $self = $class->SUPER::new(
63						start_h => ["_start_tag", "self,tagname,attr"],
64						report_tags => [keys %HTML::Tagset::linkElements],
65						);
66	3				24	$self->{extractlink_cb} = $cb;
67	3	100			10	if ($base) {
68	1				6	require URI;
69	1				6	$self->{extractlink_base} = URI->new($base);
70						}
71	3				8160	$self;
72						}
73
74						sub _start_tag
75						{
76	11		11		177	my($self, $tag, $attr) = @_;
77
78	11				23	my $base = $self->{extractlink_base};
79	11				21	my $links = $HTML::Tagset::linkElements{$tag};
80	11	50			28	$links = [$links] unless ref $links;
81
82	11				17	my @links;
83						my $a;
84	11				26	for $a (@$links) {
85	17	100			539	next unless exists $attr->{$a};
86	10				34	(my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
	10				21
87	10	100			84	push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
88						}
89	11	100			846	return unless @links;
90	8				29	$self->_found_link($tag, @links);
91						}
92
93						sub _found_link
94						{
95	8		8		14	my $self = shift;
96	8				13	my $cb = $self->{extractlink_cb};
97	8	100			18	if ($cb) {
98	4				9	&$cb(@_);
99						} else {
100	4				5	push(@{$self->{'links'}}, [@_]);
	4				58
101						}
102						}
103
104						=item $p->links
105
106						Returns a list of all links found in the document. The returned
107						values will be anonymous arrays with the following elements:
108
109						[$tag, $attr => $url1, $attr2 => $url2,...]
110
111						The $p->links method will also truncate the internal link list. This
112						means that if the method is called twice without any parsing
113						between them the second call will return an empty list.
114
115						Also note that $p->links will always be empty if a callback routine
116						was provided when the I was created.
117
118						=cut
119
120						sub links
121						{
122	2		2	1	14	my $self = shift;
123	2	100			10	exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
	1				5
124						}
125
126						# We override the parse_file() method so that we can clear the links
127						# before we start a new file.
128						sub parse_file
129						{
130	0		0	1		my $self = shift;
131	0					delete $self->{'links'};
132	0					$self->SUPER::parse_file(@_);
133						}
134
135						=back
136
137						=head1 EXAMPLE
138
139						This is an example showing how you can extract links from a document
140						received using LWP:
141
142						use LWP::UserAgent;
143						use HTML::LinkExtor;
144						use URI::URL;
145
146						$url = "http://www.perl.org/"; # for instance
147						$ua = LWP::UserAgent->new;
148
149						# Set up a callback that collect image links
150						my @imgs = ();
151						sub callback {
152						my($tag, %attr) = @_;
153						return if $tag ne 'img'; # we only look closer at
154						push(@imgs, values %attr);
155						}
156
157						# Make the parser. Unfortunately, we don't know the base yet
158						# (it might be different from $url)
159						$p = HTML::LinkExtor->new(\&callback);
160
161						# Request document and parse it as it arrives
162						$res = $ua->request(HTTP::Request->new(GET => $url),
163						sub {$p->parse($_[0])});
164
165						# Expand all image URLs to absolute ones
166						my $base = $res->base;
167						@imgs = map { $_ = url($_, $base)->abs; } @imgs;
168
169						# Print them out
170						print join("\n", @imgs), "\n";
171
172						=head1 SEE ALSO
173
174						L, L, L, L
175
176						=head1 COPYRIGHT
177
178						Copyright 1996-2001 Gisle Aas.
179
180						This library is free software; you can redistribute it and/or
181						modify it under the same terms as Perl itself.
182
183						=cut
184
185						1;