File Coverage

blib/lib/HTML/LinkExtor.pm
Criterion Covered Total %
statement 34 37 91.8
branch 13 14 92.8
condition n/a
subroutine 6 7 85.7
pod 3 3 100.0
total 56 61 91.8


line stmt bran cond sub pod time code
1             package HTML::LinkExtor;
2              
3             require HTML::Parser;
4             our @ISA = qw(HTML::Parser);
5             our $VERSION = '3.81';
6              
7             =head1 NAME
8              
9             HTML::LinkExtor - Extract links from an HTML document
10              
11             =head1 SYNOPSIS
12              
13             require HTML::LinkExtor;
14             $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
15             sub cb {
16             my($tag, %links) = @_;
17             print "$tag @{[%links]}\n";
18             }
19             $p->parse_file("index.html");
20              
21             =head1 DESCRIPTION
22              
23             I is an HTML parser that extracts links from an
24             HTML document. The I is a subclass of
25             I. This means that the document should be given to the
26             parser by calling the $p->parse() or $p->parse_file() methods.
27              
28             =cut
29              
30 2     2   1007 use strict;
  2         13  
  2         62  
31 2     2   973 use HTML::Tagset ();
  2         2954  
  2         1007  
32              
33             # legacy (some applications grabs this hash directly)
34             our %LINK_ELEMENT;
35             *LINK_ELEMENT = \%HTML::Tagset::linkElements;
36              
37             =over 4
38              
39             =item $p = HTML::LinkExtor->new
40              
41             =item $p = HTML::LinkExtor->new( $callback )
42              
43             =item $p = HTML::LinkExtor->new( $callback, $base )
44              
45             The constructor takes two optional arguments. The first is a reference
46             to a callback routine. It will be called as links are found. If a
47             callback is not provided, then links are just accumulated internally
48             and can be retrieved by calling the $p->links() method.
49              
50             The $base argument is an optional base URL used to absolutize all URLs found.
51             You need to have the I module installed if you provide $base.
52              
53             The callback is called with the lowercase tag name as first argument,
54             and then all link attributes as separate key/value pairs. All
55             non-link attributes are removed.
56              
57             =cut
58              
59             sub new
60             {
61 3     3 1 1734 my($class, $cb, $base) = @_;
62 3         48 my $self = $class->SUPER::new(
63             start_h => ["_start_tag", "self,tagname,attr"],
64             report_tags => [keys %HTML::Tagset::linkElements],
65             );
66 3         21 $self->{extractlink_cb} = $cb;
67 3 100       10 if ($base) {
68 1         8 require URI;
69 1         7 $self->{extractlink_base} = URI->new($base);
70             }
71 3         8089 $self;
72             }
73              
74             sub _start_tag
75             {
76 11     11   156 my($self, $tag, $attr) = @_;
77              
78 11         23 my $base = $self->{extractlink_base};
79 11         18 my $links = $HTML::Tagset::linkElements{$tag};
80 11 50       32 $links = [$links] unless ref $links;
81              
82 11         16 my @links;
83             my $a;
84 11         27 for $a (@$links) {
85 17 100       542 next unless exists $attr->{$a};
86 10         35 (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
  10         20  
87 10 100       90 push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
88             }
89 11 100       912 return unless @links;
90 8         29 $self->_found_link($tag, @links);
91             }
92              
93             sub _found_link
94             {
95 8     8   12 my $self = shift;
96 8         17 my $cb = $self->{extractlink_cb};
97 8 100       17 if ($cb) {
98 4         10 &$cb(@_);
99             } else {
100 4         7 push(@{$self->{'links'}}, [@_]);
  4         71  
101             }
102             }
103              
104             =item $p->links
105              
106             Returns a list of all links found in the document. The returned
107             values will be anonymous arrays with the following elements:
108              
109             [$tag, $attr => $url1, $attr2 => $url2,...]
110              
111             The $p->links method will also truncate the internal link list. This
112             means that if the method is called twice without any parsing
113             between them the second call will return an empty list.
114              
115             Also note that $p->links will always be empty if a callback routine
116             was provided when the I was created.
117              
118             =cut
119              
120             sub links
121             {
122 2     2 1 15 my $self = shift;
123 2 100       12 exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
  1         6  
124             }
125              
126             # We override the parse_file() method so that we can clear the links
127             # before we start a new file.
128             sub parse_file
129             {
130 0     0 1   my $self = shift;
131 0           delete $self->{'links'};
132 0           $self->SUPER::parse_file(@_);
133             }
134              
135             =back
136              
137             =head1 EXAMPLE
138              
139             This is an example showing how you can extract links from a document
140             received using LWP:
141              
142             use LWP::UserAgent;
143             use HTML::LinkExtor;
144             use URI::URL;
145              
146             $url = "http://www.perl.org/"; # for instance
147             $ua = LWP::UserAgent->new;
148              
149             # Set up a callback that collect image links
150             my @imgs = ();
151             sub callback {
152             my($tag, %attr) = @_;
153             return if $tag ne 'img'; # we only look closer at
154             push(@imgs, values %attr);
155             }
156              
157             # Make the parser. Unfortunately, we don't know the base yet
158             # (it might be different from $url)
159             $p = HTML::LinkExtor->new(\&callback);
160              
161             # Request document and parse it as it arrives
162             $res = $ua->request(HTTP::Request->new(GET => $url),
163             sub {$p->parse($_[0])});
164              
165             # Expand all image URLs to absolute ones
166             my $base = $res->base;
167             @imgs = map { $_ = url($_, $base)->abs; } @imgs;
168              
169             # Print them out
170             print join("\n", @imgs), "\n";
171              
172             =head1 SEE ALSO
173              
174             L, L, L, L
175              
176             =head1 COPYRIGHT
177              
178             Copyright 1996-2001 Gisle Aas.
179              
180             This library is free software; you can redistribute it and/or
181             modify it under the same terms as Perl itself.
182              
183             =cut
184              
185             1;