line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::SimpleLinkExtor; |
2
|
8
|
|
|
8
|
|
10515
|
use strict; |
|
8
|
|
|
|
|
19
|
|
|
8
|
|
|
|
|
241
|
|
3
|
|
|
|
|
|
|
|
4
|
8
|
|
|
8
|
|
39
|
use warnings; |
|
8
|
|
|
|
|
15
|
|
|
8
|
|
|
|
|
199
|
|
5
|
8
|
|
|
8
|
|
33
|
no warnings; |
|
8
|
|
|
|
|
16
|
|
|
8
|
|
|
|
|
317
|
|
6
|
|
|
|
|
|
|
|
7
|
8
|
|
|
8
|
|
4364
|
use subs qw(); |
|
8
|
|
|
|
|
199
|
|
|
8
|
|
|
|
|
230
|
|
8
|
8
|
|
|
8
|
|
44
|
use vars qw( $AUTOLOAD ); |
|
8
|
|
|
|
|
15
|
|
|
8
|
|
|
|
|
446
|
|
9
|
|
|
|
|
|
|
|
10
|
8
|
|
|
8
|
|
3872
|
use AutoLoader; |
|
8
|
|
|
|
|
12097
|
|
|
8
|
|
|
|
|
47
|
|
11
|
8
|
|
|
8
|
|
327
|
use Carp qw(carp); |
|
8
|
|
|
|
|
17
|
|
|
8
|
|
|
|
|
412
|
|
12
|
8
|
|
|
8
|
|
3766
|
use HTML::LinkExtor; |
|
8
|
|
|
|
|
69680
|
|
|
8
|
|
|
|
|
295
|
|
13
|
8
|
|
|
8
|
|
5665
|
use LWP::UserAgent; |
|
8
|
|
|
|
|
349710
|
|
|
8
|
|
|
|
|
319
|
|
14
|
8
|
|
|
8
|
|
69
|
use URI; |
|
8
|
|
|
|
|
19
|
|
|
8
|
|
|
|
|
309
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '1.273'; |
17
|
|
|
|
|
|
|
|
18
|
8
|
|
|
8
|
|
49
|
use parent qw(HTML::LinkExtor); |
|
8
|
|
|
|
|
20
|
|
|
8
|
|
|
|
|
49
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our %AUTO_METHODS = qw( |
21
|
|
|
|
|
|
|
background attribute |
22
|
|
|
|
|
|
|
href attribute |
23
|
|
|
|
|
|
|
src attribute |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
a tag |
26
|
|
|
|
|
|
|
area tag |
27
|
|
|
|
|
|
|
base tag |
28
|
|
|
|
|
|
|
body tag |
29
|
|
|
|
|
|
|
img tag |
30
|
|
|
|
|
|
|
frame tag |
31
|
|
|
|
|
|
|
iframe tag |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
script tag |
34
|
|
|
|
|
|
|
); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
37
|
10
|
|
|
10
|
|
19725
|
sub DESTROY { 1 }; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub AUTOLOAD { |
40
|
19
|
|
|
19
|
|
19363
|
my $self = shift; |
41
|
19
|
|
|
|
|
36
|
my $method = $AUTOLOAD; |
42
|
|
|
|
|
|
|
|
43
|
19
|
|
|
|
|
110
|
$method =~ s/.*:://; |
44
|
|
|
|
|
|
|
|
45
|
19
|
100
|
|
|
|
63
|
unless( exists $AUTO_METHODS{$method} ) { |
46
|
3
|
|
|
|
|
398
|
carp __PACKAGE__ . ": method $method unknown"; |
47
|
3
|
|
|
|
|
252
|
return; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
16
|
|
|
|
|
39
|
$self->_extract( $method ); |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub can { |
54
|
14
|
|
|
14
|
1
|
7351
|
my( $self, @methods ) = @_; |
55
|
|
|
|
|
|
|
|
56
|
14
|
|
|
|
|
33
|
foreach my $method ( @methods ) { |
57
|
14
|
100
|
|
|
|
40
|
return 0 unless $self->_can( $method ); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
8
|
|
|
|
|
30
|
return 1; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub _can { |
64
|
8
|
|
|
8
|
|
2072
|
no strict 'refs'; |
|
8
|
|
|
|
|
21
|
|
|
8
|
|
|
|
|
11783
|
|
65
|
|
|
|
|
|
|
|
66
|
14
|
100
|
|
14
|
|
77
|
return 1 if exists $AUTO_METHODS{ $_[1] }; |
67
|
11
|
100
|
|
|
|
21
|
return 1 if defined &{"$_[1]"}; |
|
11
|
|
|
|
|
94
|
|
68
|
|
|
|
|
|
|
|
69
|
6
|
|
|
|
|
73
|
return 0; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub _init_links { |
73
|
20
|
|
|
20
|
|
37
|
my $self = shift; |
74
|
20
|
|
|
|
|
36
|
my $links = shift; |
75
|
20
|
100
|
|
|
|
78
|
do { |
76
|
10
|
|
|
|
|
32
|
delete $self->{'_SimpleLinkExtor_links'}; |
77
|
|
|
|
|
|
|
return |
78
|
10
|
|
|
|
|
22
|
} unless ref $links eq ref []; |
79
|
|
|
|
|
|
|
|
80
|
10
|
|
|
|
|
38
|
$self->{'_SimpleLinkExtor_links'} = $links; |
81
|
|
|
|
|
|
|
|
82
|
10
|
|
|
|
|
23
|
$self; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub _link_refs { |
86
|
42
|
|
|
42
|
|
75
|
my $self = shift; |
87
|
|
|
|
|
|
|
|
88
|
42
|
|
|
|
|
67
|
my @link_refs; |
89
|
|
|
|
|
|
|
# XXX: this is a bad way to do this. I should check if the |
90
|
|
|
|
|
|
|
# value is a reference. If I want to reset the links, for |
91
|
|
|
|
|
|
|
# instance, I can't just set it to [] because it then goes |
92
|
|
|
|
|
|
|
# through this branch. In _init_links I have to use a delete |
93
|
|
|
|
|
|
|
# which I really don't like. I don't have time to rewrite this |
94
|
|
|
|
|
|
|
# right now though --brian, 20050816 |
95
|
42
|
100
|
|
|
|
118
|
if( ref $self->{'_SimpleLinkExtor_links'} ) { |
96
|
33
|
|
|
|
|
52
|
@link_refs = @{$self->{'_SimpleLinkExtor_links'}}; |
|
33
|
|
|
|
|
100
|
|
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
else { |
99
|
|
|
|
|
|
|
@link_refs = map { |
100
|
9
|
|
|
|
|
43
|
HTML::SimpleLinkExtor::LinkRef->new( $_ ) |
|
181
|
|
|
|
|
378
|
|
101
|
|
|
|
|
|
|
} $self->SUPER::links(); |
102
|
9
|
|
|
|
|
41
|
$self->_init_links( \@link_refs ); |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# defined() so that an empty string means "do not resolve" |
106
|
42
|
100
|
|
|
|
111
|
unless( defined $self->{'_SimpleLinkExtor_base'} ) { |
107
|
19
|
|
|
|
|
31
|
my $count = -1; |
108
|
19
|
|
|
|
|
32
|
my $found = 0; |
109
|
19
|
|
|
|
|
37
|
foreach my $link ( @link_refs ) { |
110
|
393
|
|
|
|
|
518
|
$count++; |
111
|
393
|
100
|
66
|
|
|
831
|
next unless $link->[0] eq 'base' and $link->[1] eq 'href'; |
112
|
2
|
|
|
|
|
3
|
$found = 1; |
113
|
2
|
|
|
|
|
5
|
$self->{'_SimpleLinkExtor_base'} = $link->[-1]; |
114
|
2
|
|
|
|
|
5
|
last; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#remove the BASE HREF link - Good idea, bad idea? |
118
|
|
|
|
|
|
|
#splice @link_refs, $count, 1, () if $found; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
42
|
|
|
|
|
122
|
$self->_add_base(\@link_refs); |
122
|
|
|
|
|
|
|
|
123
|
42
|
|
|
|
|
252
|
return @link_refs; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub _extract { |
127
|
16
|
|
|
16
|
|
26
|
my $self = shift; |
128
|
16
|
|
|
|
|
26
|
my $type = shift; |
129
|
|
|
|
|
|
|
|
130
|
16
|
100
|
|
|
|
43
|
my $method = $AUTO_METHODS{$type} eq 'tag' ? 'tag' : 'attribute'; |
131
|
|
|
|
|
|
|
|
132
|
63
|
|
|
|
|
104
|
my @links = map { $_->linkref } |
133
|
16
|
|
|
|
|
33
|
grep { $_->$method() eq $type } |
|
400
|
|
|
|
|
691
|
|
134
|
|
|
|
|
|
|
$self->_link_refs; |
135
|
|
|
|
|
|
|
|
136
|
16
|
|
|
|
|
72
|
return @links; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
sub _add_base { |
140
|
42
|
|
|
42
|
|
64
|
my $self = shift; |
141
|
42
|
|
|
|
|
78
|
my $array_ref = shift; |
142
|
|
|
|
|
|
|
|
143
|
42
|
|
|
|
|
75
|
my $base = $self->{'_SimpleLinkExtor_base'}; |
144
|
42
|
100
|
|
|
|
107
|
return unless $base; |
145
|
|
|
|
|
|
|
|
146
|
25
|
|
|
|
|
53
|
foreach my $tuple ( @$array_ref ) { |
147
|
606
|
|
|
|
|
3507
|
foreach my $index ( 1 .. $#$tuple ) { |
148
|
1308
|
100
|
|
|
|
27364
|
next unless exists $AUTO_METHODS{ $tuple->[$index] }; |
149
|
|
|
|
|
|
|
|
150
|
606
|
|
|
|
|
1352
|
my $url = URI->new( $tuple->[$index + 1] ); |
151
|
606
|
50
|
|
|
|
37691
|
next unless ref $url; |
152
|
606
|
|
|
|
|
1272
|
$tuple->[$index + 1] = $url->abs($base); |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=encoding utf8 |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 NAME |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
HTML::SimpleLinkExtor - Extract links from HTML |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 SYNOPSIS |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
use HTML::SimpleLinkExtor; |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
my $extor = HTML::SimpleLinkExtor->new(); |
168
|
|
|
|
|
|
|
$extor->parse_file($filename); |
169
|
|
|
|
|
|
|
#--or-- |
170
|
|
|
|
|
|
|
$extor->parse($html); |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
$extor->parse_file($other_file); # get more links |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
$extor->clear_links; # reset the link list |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
#extract all of the links |
177
|
|
|
|
|
|
|
@all_links = $extor->links; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
#extract the img links |
180
|
|
|
|
|
|
|
@img_srcs = $extor->img; |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
#extract the frame links |
183
|
|
|
|
|
|
|
@frame_srcs = $extor->frame; |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
#extract the hrefs |
186
|
|
|
|
|
|
|
@area_hrefs = $extor->area; |
187
|
|
|
|
|
|
|
@a_hrefs = $extor->a; |
188
|
|
|
|
|
|
|
@base_hrefs = $extor->base; |
189
|
|
|
|
|
|
|
@hrefs = $extor->href; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
#extract the body background link |
192
|
|
|
|
|
|
|
@body_bg = $extor->body; |
193
|
|
|
|
|
|
|
@background = $extor->background; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
@links = $extor->schemes( 'http' ); |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 DESCRIPTION |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This is a simple HTML link extractor designed for the person who does |
200
|
|
|
|
|
|
|
not want to deal with the intricacies of C or the |
201
|
|
|
|
|
|
|
de-referencing needed to get links out of C. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
You can extract all the links or some of the links (based on the HTML |
204
|
|
|
|
|
|
|
tag name or attribute name). If a C<< >> tag is found, |
205
|
|
|
|
|
|
|
all of the relative URLs will be resolved according to that reference. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
This module is simply a subclass around C, so it can |
208
|
|
|
|
|
|
|
only parse what that module can handle. Invalid HTML or XHTML may |
209
|
|
|
|
|
|
|
cause problems. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
If you parse multiple files, the link list grows and contains the |
212
|
|
|
|
|
|
|
aggregate list of links for all of the files parsed. If you want to |
213
|
|
|
|
|
|
|
reset the link list between files, use the clear_links method. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 Class Methods |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=over |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new() |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
Create the link extractor object. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new('') |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new($base) |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Create the link extractor object and resolve the relative URLs |
228
|
|
|
|
|
|
|
accoridng to the supplied base URL. The supplied base URL overrides |
229
|
|
|
|
|
|
|
any other base URL found in the HTML. |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Create the link extractor object and do not resolve relative |
232
|
|
|
|
|
|
|
links. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=cut |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub new { |
237
|
10
|
|
|
10
|
1
|
29201
|
my $class = shift; |
238
|
10
|
|
|
|
|
22
|
my $base = shift; |
239
|
|
|
|
|
|
|
|
240
|
10
|
|
|
|
|
69
|
my $self = new HTML::LinkExtor; |
241
|
10
|
|
|
|
|
1152
|
bless $self, $class; |
242
|
|
|
|
|
|
|
|
243
|
10
|
|
|
|
|
40
|
$self->{'_SimpleLinkExtor_base'} = $base; |
244
|
10
|
|
|
|
|
61
|
$self->{'_ua'} = LWP::UserAgent->new; |
245
|
10
|
|
|
|
|
19504
|
$self->_init_links; |
246
|
|
|
|
|
|
|
|
247
|
10
|
|
|
|
|
33
|
return $self; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->ua; |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Returns the internal user agent, an C object. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=cut |
255
|
|
|
|
|
|
|
|
256
|
2
|
|
|
2
|
1
|
11
|
sub ua { $_[0]->{_ua} } |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->add_tags( TAG [, TAG ] ) |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
C keeps an internal list of HTML tags (such as |
261
|
|
|
|
|
|
|
'a' and 'img') that have URLs as values. If you run into another tag |
262
|
|
|
|
|
|
|
that this module doesn't handle, please send it to me and I'll add it. |
263
|
|
|
|
|
|
|
Until then you can add that tag to the internal list. This affects |
264
|
|
|
|
|
|
|
the entire class, including previously created objects. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub add_tags { |
269
|
1
|
|
|
1
|
1
|
2852
|
my $self = shift; |
270
|
1
|
|
|
|
|
3
|
my $tag = lc shift; |
271
|
|
|
|
|
|
|
|
272
|
1
|
|
|
|
|
4
|
$AUTO_METHODS{ $tag } = 'tag'; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->add_attributes( ATTR [, ATTR] ) |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
C keeps an internal list of HTML tag attributes |
278
|
|
|
|
|
|
|
(such as 'href' and 'src') that have URLs as values. If you run into |
279
|
|
|
|
|
|
|
another attribute that this module doesn't handle, please send it to |
280
|
|
|
|
|
|
|
me and I'll add it. Until then you can add that attribute to the |
281
|
|
|
|
|
|
|
internal list. This affects the entire class, including previously |
282
|
|
|
|
|
|
|
created objects. |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=cut |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item can() |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
A smarter C that can tell which attributes are also methods. |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub add_attributes { |
293
|
1
|
|
|
1
|
1
|
2813
|
my $self = shift; |
294
|
1
|
|
|
|
|
3
|
my $attr = lc shift; |
295
|
|
|
|
|
|
|
|
296
|
1
|
|
|
|
|
3
|
$AUTO_METHODS{ $attr } = 'attribute'; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->remove_tags( TAG [, TAG ] ) |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
Take tags out of the internal list that C uses |
302
|
|
|
|
|
|
|
to extract URLs. This affects the entire class, including previously |
303
|
|
|
|
|
|
|
created objects. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub remove_tags { |
308
|
1
|
|
|
1
|
1
|
2760
|
my $self = shift; |
309
|
1
|
|
|
|
|
3
|
my $tag = lc shift; |
310
|
|
|
|
|
|
|
|
311
|
1
|
|
|
|
|
4
|
delete $AUTO_METHODS{ $tag }; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->remove_attributes( ATTR [, ATTR] ) |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Takes attributes out of the internal list that |
317
|
|
|
|
|
|
|
C uses to extract URLs. This affects the entire |
318
|
|
|
|
|
|
|
class, including previously created objects. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub remove_attributes { |
323
|
1
|
|
|
1
|
1
|
2847
|
my $self = shift; |
324
|
1
|
|
|
|
|
3
|
my $attr = lc shift; |
325
|
|
|
|
|
|
|
|
326
|
1
|
|
|
|
|
3
|
delete $AUTO_METHODS{ $attr }; |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->attribute_list |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
Returns a list of the attributes C pays |
332
|
|
|
|
|
|
|
attention to. |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub attribute_list { |
337
|
4
|
|
|
4
|
1
|
727
|
my $self = shift; |
338
|
|
|
|
|
|
|
|
339
|
4
|
|
|
|
|
16
|
grep { $AUTO_METHODS{ $_ } eq 'attribute' } keys %AUTO_METHODS; |
|
46
|
|
|
|
|
86
|
|
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->tag_list |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Returns a list of the tags C pays attention to. |
345
|
|
|
|
|
|
|
These tags have convenience methods. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=back |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
=cut |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
sub tag_list { |
352
|
4
|
|
|
4
|
1
|
6173
|
my $self = shift; |
353
|
|
|
|
|
|
|
|
354
|
4
|
|
|
|
|
18
|
grep { $AUTO_METHODS{ $_ } eq 'tag' } keys %AUTO_METHODS; |
|
46
|
|
|
|
|
92
|
|
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
=head2 Object methods |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=over 4 |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=item $extor->parse_file( $filename ) |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Parse the file for links. Inherited from C. |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=cut |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=item $extor->parse_url( $url [, $ua] ) |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
Fetch URL and parse its content for links. |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=cut |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
sub parse_url { |
375
|
2
|
|
|
2
|
1
|
1295
|
my $data = $_[0]->ua->get( $_[1] )->content; |
376
|
|
|
|
|
|
|
|
377
|
2
|
100
|
|
|
|
29081
|
return unless $data; |
378
|
|
|
|
|
|
|
|
379
|
1
|
|
|
|
|
16
|
$_[0]->parse( $data ); |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=item $extor->parse( $data ) |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
Parse the HTML in C<$data>. Inherited from C. |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=item $extor->clear_links |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Clear the link list. This way, you can use the same parser for |
389
|
|
|
|
|
|
|
another file. |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=cut |
392
|
|
|
|
|
|
|
|
393
|
1
|
|
|
1
|
1
|
2793
|
sub clear_links { $_[0]->_init_links( [] ) } |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=item $extor->links |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Return a list of the links. |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=cut |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
sub links { |
402
|
76
|
|
|
|
|
120
|
map { $_->linkref } |
403
|
6
|
|
|
6
|
1
|
7942
|
grep { $_[0]->_is_an_allowed_tag( $_->tag ) } |
|
77
|
|
|
|
|
147
|
|
404
|
|
|
|
|
|
|
$_[0]->_link_refs |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
sub _is_an_allowed_tag { |
408
|
|
|
|
|
|
|
exists $AUTO_METHODS{$_[1]} |
409
|
|
|
|
|
|
|
and |
410
|
77
|
100
|
|
77
|
|
299
|
$AUTO_METHODS{$_[1]} eq 'tag' |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=item $extor->img |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
Return a list of the links from all the SRC attributes of the |
416
|
|
|
|
|
|
|
IMG. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=cut |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=item $extor->frame |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Return a list of all the links from all the SRC attributes of |
423
|
|
|
|
|
|
|
the FRAME. |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=cut |
426
|
|
|
|
|
|
|
|
427
|
1
|
|
|
1
|
1
|
13
|
sub frames { ( $_[0]->frame, $_[0]->iframe ) } |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item $extor->iframe |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Return a list of all the links from all the SRC attributes of |
432
|
|
|
|
|
|
|
the IFRAME. |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item $extor->frames |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Returns the combined list from frame and iframe. |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=item $extor->src |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
Return a list of the links from all the SRC attributes of any |
441
|
|
|
|
|
|
|
tag. |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=item $extor->a |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
446
|
|
|
|
|
|
|
A tags. |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item $extor->area |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
451
|
|
|
|
|
|
|
AREA tags. |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=item $extor->base |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
456
|
|
|
|
|
|
|
BASE tags. There should only be one. |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=item $extor->href |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of any |
461
|
|
|
|
|
|
|
tag. |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
=item $extor->body, $extor->background |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
Return the link from the BODY tag's BACKGROUND attribute. |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=item $extor->script |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Return the link from the SCRIPT tag's SRC attribute |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=item $extor->schemes( SCHEME, [ SCHEME, ... ] ) |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
Return the links that use any of SCHEME. These must be absolute URLs (which |
474
|
|
|
|
|
|
|
might include those converted to absolute URLs by specifying a |
475
|
|
|
|
|
|
|
base). SCHEME is case-insensitive. You can specify more than one |
476
|
|
|
|
|
|
|
scheme. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
479
|
|
|
|
|
|
|
the count of the matching links. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=cut |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
sub schemes { |
484
|
16
|
|
|
16
|
1
|
28597
|
my( $self, @schemes ) = @_; |
485
|
|
|
|
|
|
|
|
486
|
16
|
|
|
|
|
29
|
my %schemes = map { lc, lc } @schemes; |
|
20
|
|
|
|
|
72
|
|
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
my @links = |
489
|
|
|
|
|
|
|
grep { |
490
|
416
|
|
|
|
|
590
|
my $scheme = eval { lc URI->new( $_ )->scheme }; |
|
416
|
|
|
|
|
907
|
|
491
|
416
|
|
|
|
|
36645
|
exists $schemes{ $scheme }; |
492
|
|
|
|
|
|
|
} |
493
|
16
|
|
|
|
|
44
|
map { $_->linkref } |
|
416
|
|
|
|
|
609
|
|
494
|
|
|
|
|
|
|
$self->_link_refs; |
495
|
|
|
|
|
|
|
|
496
|
16
|
100
|
|
|
|
118
|
wantarray ? @links : scalar @links; |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=item $extor->absolute_links |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
Returns the absolute URLs (which might include those converted to |
502
|
|
|
|
|
|
|
absolute URLs by specifying a base). |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
505
|
|
|
|
|
|
|
the count of the matching links. |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=cut |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
sub absolute_links { |
510
|
2
|
|
|
2
|
1
|
3360
|
my $self = shift; |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
my @links = |
513
|
|
|
|
|
|
|
grep { |
514
|
52
|
|
|
|
|
77
|
my $scheme = eval { lc URI->new( $_ )->scheme }; |
|
52
|
|
|
|
|
113
|
|
515
|
52
|
|
|
|
|
14811
|
length $scheme; |
516
|
|
|
|
|
|
|
} |
517
|
2
|
|
|
|
|
5
|
map { $_->linkref } |
|
52
|
|
|
|
|
80
|
|
518
|
|
|
|
|
|
|
$self->_link_refs; |
519
|
|
|
|
|
|
|
|
520
|
2
|
100
|
|
|
|
20
|
wantarray ? @links : scalar @links; |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item $extor->relative_links |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Returns the relatives URLs (which might exclude those converted to |
526
|
|
|
|
|
|
|
absolute URLs by specifying a base or having a base in the document). |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
529
|
|
|
|
|
|
|
the count of the matching links. |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=cut |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
sub relative_links { |
535
|
2
|
|
|
2
|
1
|
3416
|
my $self = shift; |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
my @links = |
538
|
|
|
|
|
|
|
grep { |
539
|
52
|
|
|
|
|
76
|
my $scheme = eval { URI->new( $_ )->scheme }; |
|
52
|
|
|
|
|
107
|
|
540
|
52
|
|
|
|
|
14445
|
! defined $scheme; |
541
|
|
|
|
|
|
|
} |
542
|
2
|
|
|
|
|
7
|
map { $_->linkref } |
|
52
|
|
|
|
|
76
|
|
543
|
|
|
|
|
|
|
$self->_link_refs; |
544
|
|
|
|
|
|
|
|
545
|
2
|
100
|
|
|
|
17
|
wantarray ? @links : scalar @links; |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=back |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
=head1 TO DO |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
This module doesn't handle all of the HTML tags that might |
553
|
|
|
|
|
|
|
have links. If someone wants those, I'll add them, or you |
554
|
|
|
|
|
|
|
can edit C<%AUTO_METHODS> in the source. |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=head1 CREDITS |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Will Crain who identified a problem with IMG links that had |
559
|
|
|
|
|
|
|
a USEMAP attribute. |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=head1 AUTHORS |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
brian d foy, C<< >> |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Maintained by Nigel Horne, C<< >> |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
Copyright © 2004-2019, brian d foy . All rights reserved. |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
572
|
|
|
|
|
|
|
it under the terms of the Artistic License 2.0. |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=cut |
575
|
|
|
|
|
|
|
|
576
|
0
|
|
|
|
|
0
|
BEGIN { |
577
|
|
|
|
|
|
|
package |
578
|
|
|
|
|
|
|
HTML::SimpleLinkExtor::LinkRef; |
579
|
8
|
|
|
8
|
|
215
|
use Carp qw(croak); |
|
8
|
|
|
0
|
|
29
|
|
|
8
|
|
|
|
|
1521
|
|
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sub new { |
582
|
181
|
|
|
181
|
|
285
|
my( $class, $arrayref ) = @_; |
583
|
181
|
50
|
|
|
|
357
|
croak "Not an array reference argument!" unless ref $arrayref eq ref []; |
584
|
181
|
|
|
|
|
349
|
bless $arrayref, $class; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
|
587
|
402
|
|
|
402
|
|
830
|
sub tag { $_[0]->[0] } |
588
|
75
|
|
|
75
|
|
137
|
sub attribute { $_[0]->[1] } |
589
|
659
|
|
|
659
|
|
1046
|
sub linkref { $_[0]->[2] } |
590
|
|
|
|
|
|
|
} |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
1; |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
__END__ |