line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::SimpleLinkExtor; |
2
|
8
|
|
|
8
|
|
8789
|
use strict; |
|
8
|
|
|
|
|
16
|
|
|
8
|
|
|
|
|
238
|
|
3
|
|
|
|
|
|
|
|
4
|
8
|
|
|
8
|
|
36
|
use warnings; |
|
8
|
|
|
|
|
14
|
|
|
8
|
|
|
|
|
178
|
|
5
|
8
|
|
|
8
|
|
30
|
no warnings; |
|
8
|
|
|
|
|
13
|
|
|
8
|
|
|
|
|
341
|
|
6
|
|
|
|
|
|
|
|
7
|
8
|
|
|
8
|
|
3596
|
use subs qw(); |
|
8
|
|
|
|
|
196
|
|
|
8
|
|
|
|
|
184
|
|
8
|
8
|
|
|
8
|
|
37
|
use vars qw( $AUTOLOAD ); |
|
8
|
|
|
|
|
16
|
|
|
8
|
|
|
|
|
373
|
|
9
|
|
|
|
|
|
|
|
10
|
8
|
|
|
8
|
|
3133
|
use AutoLoader; |
|
8
|
|
|
|
|
9758
|
|
|
8
|
|
|
|
|
38
|
|
11
|
8
|
|
|
8
|
|
268
|
use Carp qw(carp); |
|
8
|
|
|
|
|
29
|
|
|
8
|
|
|
|
|
386
|
|
12
|
8
|
|
|
8
|
|
3053
|
use HTML::LinkExtor; |
|
8
|
|
|
|
|
57452
|
|
|
8
|
|
|
|
|
302
|
|
13
|
8
|
|
|
8
|
|
4812
|
use LWP::UserAgent; |
|
8
|
|
|
|
|
307240
|
|
|
8
|
|
|
|
|
287
|
|
14
|
8
|
|
|
8
|
|
64
|
use URI; |
|
8
|
|
|
|
|
14
|
|
|
8
|
|
|
|
|
263
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '1.272'; |
17
|
|
|
|
|
|
|
|
18
|
8
|
|
|
8
|
|
42
|
use parent qw(HTML::LinkExtor); |
|
8
|
|
|
|
|
20
|
|
|
8
|
|
|
|
|
44
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our %AUTO_METHODS = qw( |
21
|
|
|
|
|
|
|
background attribute |
22
|
|
|
|
|
|
|
href attribute |
23
|
|
|
|
|
|
|
src attribute |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
a tag |
26
|
|
|
|
|
|
|
area tag |
27
|
|
|
|
|
|
|
base tag |
28
|
|
|
|
|
|
|
body tag |
29
|
|
|
|
|
|
|
img tag |
30
|
|
|
|
|
|
|
frame tag |
31
|
|
|
|
|
|
|
iframe tag |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
script tag |
34
|
|
|
|
|
|
|
); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
37
|
10
|
|
|
10
|
|
16615
|
sub DESTROY { 1 }; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub AUTOLOAD { |
40
|
19
|
|
|
19
|
|
16829
|
my $self = shift; |
41
|
19
|
|
|
|
|
35
|
my $method = $AUTOLOAD; |
42
|
|
|
|
|
|
|
|
43
|
19
|
|
|
|
|
103
|
$method =~ s/.*:://; |
44
|
|
|
|
|
|
|
|
45
|
19
|
100
|
|
|
|
57
|
unless( exists $AUTO_METHODS{$method} ) { |
46
|
3
|
|
|
|
|
349
|
carp __PACKAGE__ . ": method $method unknown"; |
47
|
3
|
|
|
|
|
187
|
return; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
16
|
|
|
|
|
39
|
$self->_extract( $method ); |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub can { |
54
|
14
|
|
|
14
|
1
|
6340
|
my( $self, @methods ) = @_; |
55
|
|
|
|
|
|
|
|
56
|
14
|
|
|
|
|
27
|
foreach my $method ( @methods ) { |
57
|
14
|
100
|
|
|
|
26
|
return 0 unless $self->_can( $method ); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
8
|
|
|
|
|
23
|
return 1; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub _can { |
64
|
8
|
|
|
8
|
|
1660
|
no strict 'refs'; |
|
8
|
|
|
|
|
18
|
|
|
8
|
|
|
|
|
9386
|
|
65
|
|
|
|
|
|
|
|
66
|
14
|
100
|
|
14
|
|
45
|
return 1 if exists $AUTO_METHODS{ $_[1] }; |
67
|
11
|
100
|
|
|
|
16
|
return 1 if defined &{"$_[1]"}; |
|
11
|
|
|
|
|
78
|
|
68
|
|
|
|
|
|
|
|
69
|
6
|
|
|
|
|
59
|
return 0; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub _init_links { |
73
|
20
|
|
|
20
|
|
35
|
my $self = shift; |
74
|
20
|
|
|
|
|
30
|
my $links = shift; |
75
|
20
|
100
|
|
|
|
65
|
do { |
76
|
10
|
|
|
|
|
22
|
delete $self->{'_SimpleLinkExtor_links'}; |
77
|
|
|
|
|
|
|
return |
78
|
10
|
|
|
|
|
17
|
} unless ref $links eq ref []; |
79
|
|
|
|
|
|
|
|
80
|
10
|
|
|
|
|
32
|
$self->{'_SimpleLinkExtor_links'} = $links; |
81
|
|
|
|
|
|
|
|
82
|
10
|
|
|
|
|
18
|
$self; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub _link_refs { |
86
|
42
|
|
|
42
|
|
60
|
my $self = shift; |
87
|
|
|
|
|
|
|
|
88
|
42
|
|
|
|
|
53
|
my @link_refs; |
89
|
|
|
|
|
|
|
# XXX: this is a bad way to do this. I should check if the |
90
|
|
|
|
|
|
|
# value is a reference. If I want to reset the links, for |
91
|
|
|
|
|
|
|
# instance, I can't just set it to [] because it then goes |
92
|
|
|
|
|
|
|
# through this branch. In _init_links I have to use a delete |
93
|
|
|
|
|
|
|
# which I really don't like. I don't have time to rewrite this |
94
|
|
|
|
|
|
|
# right now though --brian, 20050816 |
95
|
42
|
100
|
|
|
|
107
|
if( ref $self->{'_SimpleLinkExtor_links'} ) { |
96
|
33
|
|
|
|
|
56
|
@link_refs = @{$self->{'_SimpleLinkExtor_links'}}; |
|
33
|
|
|
|
|
81
|
|
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
else { |
99
|
|
|
|
|
|
|
@link_refs = map { |
100
|
9
|
|
|
|
|
46
|
HTML::SimpleLinkExtor::LinkRef->new( $_ ) |
|
181
|
|
|
|
|
360
|
|
101
|
|
|
|
|
|
|
} $self->SUPER::links(); |
102
|
9
|
|
|
|
|
34
|
$self->_init_links( \@link_refs ); |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# defined() so that an empty string means "do not resolve" |
106
|
42
|
100
|
|
|
|
106
|
unless( defined $self->{'_SimpleLinkExtor_base'} ) { |
107
|
19
|
|
|
|
|
28
|
my $count = -1; |
108
|
19
|
|
|
|
|
22
|
my $found = 0; |
109
|
19
|
|
|
|
|
39
|
foreach my $link ( @link_refs ) { |
110
|
393
|
|
|
|
|
404
|
$count++; |
111
|
393
|
100
|
66
|
|
|
693
|
next unless $link->[0] eq 'base' and $link->[1] eq 'href'; |
112
|
2
|
|
|
|
|
4
|
$found = 1; |
113
|
2
|
|
|
|
|
5
|
$self->{'_SimpleLinkExtor_base'} = $link->[-1]; |
114
|
2
|
|
|
|
|
4
|
last; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#remove the BASE HREF link - Good idea, bad idea? |
118
|
|
|
|
|
|
|
#splice @link_refs, $count, 1, () if $found; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
42
|
|
|
|
|
107
|
$self->_add_base(\@link_refs); |
122
|
|
|
|
|
|
|
|
123
|
42
|
|
|
|
|
223
|
return @link_refs; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub _extract { |
127
|
16
|
|
|
16
|
|
19
|
my $self = shift; |
128
|
16
|
|
|
|
|
23
|
my $type = shift; |
129
|
|
|
|
|
|
|
|
130
|
16
|
100
|
|
|
|
40
|
my $method = $AUTO_METHODS{$type} eq 'tag' ? 'tag' : 'attribute'; |
131
|
|
|
|
|
|
|
|
132
|
63
|
|
|
|
|
88
|
my @links = map { $_->linkref } |
133
|
16
|
|
|
|
|
30
|
grep { $_->$method() eq $type } |
|
400
|
|
|
|
|
552
|
|
134
|
|
|
|
|
|
|
$self->_link_refs; |
135
|
|
|
|
|
|
|
|
136
|
16
|
|
|
|
|
71
|
return @links; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
sub _add_base { |
140
|
42
|
|
|
42
|
|
55
|
my $self = shift; |
141
|
42
|
|
|
|
|
52
|
my $array_ref = shift; |
142
|
|
|
|
|
|
|
|
143
|
42
|
|
|
|
|
72
|
my $base = $self->{'_SimpleLinkExtor_base'}; |
144
|
42
|
100
|
|
|
|
85
|
return unless $base; |
145
|
|
|
|
|
|
|
|
146
|
25
|
|
|
|
|
40
|
foreach my $tuple ( @$array_ref ) { |
147
|
606
|
|
|
|
|
2874
|
foreach my $index ( 1 .. $#$tuple ) { |
148
|
1308
|
100
|
|
|
|
22306
|
next unless exists $AUTO_METHODS{ $tuple->[$index] }; |
149
|
|
|
|
|
|
|
|
150
|
606
|
|
|
|
|
1180
|
my $url = URI->new( $tuple->[$index + 1] ); |
151
|
606
|
50
|
|
|
|
32351
|
next unless ref $url; |
152
|
606
|
|
|
|
|
1050
|
$tuple->[$index + 1] = $url->abs($base); |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=encoding utf8 |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 NAME |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
HTML::SimpleLinkExtor - Extract links from HTML |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 SYNOPSIS |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
use HTML::SimpleLinkExtor; |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
my $extor = HTML::SimpleLinkExtor->new(); |
168
|
|
|
|
|
|
|
$extor->parse_file($filename); |
169
|
|
|
|
|
|
|
#--or-- |
170
|
|
|
|
|
|
|
$extor->parse($html); |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
$extor->parse_file($other_file); # get more links |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
$extor->clear_links; # reset the link list |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
#extract all of the links |
177
|
|
|
|
|
|
|
@all_links = $extor->links; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
#extract the img links |
180
|
|
|
|
|
|
|
@img_srcs = $extor->img; |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
#extract the frame links |
183
|
|
|
|
|
|
|
@frame_srcs = $extor->frame; |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
#extract the hrefs |
186
|
|
|
|
|
|
|
@area_hrefs = $extor->area; |
187
|
|
|
|
|
|
|
@a_hrefs = $extor->a; |
188
|
|
|
|
|
|
|
@base_hrefs = $extor->base; |
189
|
|
|
|
|
|
|
@hrefs = $extor->href; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
#extract the body background link |
192
|
|
|
|
|
|
|
@body_bg = $extor->body; |
193
|
|
|
|
|
|
|
@background = $extor->background; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
@links = $extor->schemes( 'http' ); |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 DESCRIPTION |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
THIS IS AN ABANDONED MODULE. THERE IS NO SUPPORT. YOU CAN ADOPT IT |
200
|
|
|
|
|
|
|
IF YOU LIKE: https://pause.perl.org/pause/query?ACTION=pause_04about#takeover |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This is a simple HTML link extractor designed for the person who does |
203
|
|
|
|
|
|
|
not want to deal with the intricacies of C or the |
204
|
|
|
|
|
|
|
de-referencing needed to get links out of C. |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
You can extract all the links or some of the links (based on the HTML |
207
|
|
|
|
|
|
|
tag name or attribute name). If a C<< >> tag is found, |
208
|
|
|
|
|
|
|
all of the relative URLs will be resolved according to that reference. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
This module is simply a subclass around C, so it can |
211
|
|
|
|
|
|
|
only parse what that module can handle. Invalid HTML or XHTML may |
212
|
|
|
|
|
|
|
cause problems. |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
If you parse multiple files, the link list grows and contains the |
215
|
|
|
|
|
|
|
aggregate list of links for all of the files parsed. If you want to |
216
|
|
|
|
|
|
|
reset the link list between files, use the clear_links method. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=head2 Class Methods |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=over |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new() |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Create the link extractor object. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new('') |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new($base) |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Create the link extractor object and resolve the relative URLs |
231
|
|
|
|
|
|
|
accoridng to the supplied base URL. The supplied base URL overrides |
232
|
|
|
|
|
|
|
any other base URL found in the HTML. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
Create the link extractor object and do not resolve relative |
235
|
|
|
|
|
|
|
links. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=cut |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
sub new { |
240
|
10
|
|
|
10
|
1
|
23782
|
my $class = shift; |
241
|
10
|
|
|
|
|
20
|
my $base = shift; |
242
|
|
|
|
|
|
|
|
243
|
10
|
|
|
|
|
48
|
my $self = new HTML::LinkExtor; |
244
|
10
|
|
|
|
|
917
|
bless $self, $class; |
245
|
|
|
|
|
|
|
|
246
|
10
|
|
|
|
|
32
|
$self->{'_SimpleLinkExtor_base'} = $base; |
247
|
10
|
|
|
|
|
50
|
$self->{'_ua'} = LWP::UserAgent->new; |
248
|
10
|
|
|
|
|
15866
|
$self->_init_links; |
249
|
|
|
|
|
|
|
|
250
|
10
|
|
|
|
|
27
|
return $self; |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->ua; |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Returns the internal user agent, an C object. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=cut |
258
|
|
|
|
|
|
|
|
259
|
2
|
|
|
2
|
1
|
8
|
sub ua { $_[0]->{_ua} } |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->add_tags( TAG [, TAG ] ) |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
C keeps an internal list of HTML tags (such as |
264
|
|
|
|
|
|
|
'a' and 'img') that have URLs as values. If you run into another tag |
265
|
|
|
|
|
|
|
that this module doesn't handle, please send it to me and I'll add it. |
266
|
|
|
|
|
|
|
Until then you can add that tag to the internal list. This affects |
267
|
|
|
|
|
|
|
the entire class, including previously created objects. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=cut |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub add_tags { |
272
|
1
|
|
|
1
|
1
|
1989
|
my $self = shift; |
273
|
1
|
|
|
|
|
2
|
my $tag = lc shift; |
274
|
|
|
|
|
|
|
|
275
|
1
|
|
|
|
|
3
|
$AUTO_METHODS{ $tag } = 'tag'; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->add_attributes( ATTR [, ATTR] ) |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
C keeps an internal list of HTML tag attributes |
281
|
|
|
|
|
|
|
(such as 'href' and 'src') that have URLs as values. If you run into |
282
|
|
|
|
|
|
|
another attribute that this module doesn't handle, please send it to |
283
|
|
|
|
|
|
|
me and I'll add it. Until then you can add that attribute to the |
284
|
|
|
|
|
|
|
internal list. This affects the entire class, including previously |
285
|
|
|
|
|
|
|
created objects. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=cut |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=item can() |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
A smarter C that can tell which attributes are also methods. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=cut |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
sub add_attributes { |
296
|
1
|
|
|
1
|
1
|
1949
|
my $self = shift; |
297
|
1
|
|
|
|
|
2
|
my $attr = lc shift; |
298
|
|
|
|
|
|
|
|
299
|
1
|
|
|
|
|
3
|
$AUTO_METHODS{ $attr } = 'attribute'; |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->remove_tags( TAG [, TAG ] ) |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Take tags out of the internal list that C uses |
305
|
|
|
|
|
|
|
to extract URLs. This affects the entire class, including previously |
306
|
|
|
|
|
|
|
created objects. |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=cut |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub remove_tags { |
311
|
1
|
|
|
1
|
1
|
1965
|
my $self = shift; |
312
|
1
|
|
|
|
|
3
|
my $tag = lc shift; |
313
|
|
|
|
|
|
|
|
314
|
1
|
|
|
|
|
2
|
delete $AUTO_METHODS{ $tag }; |
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->remove_attributes( ATTR [, ATTR] ) |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
Takes attributes out of the internal list that |
320
|
|
|
|
|
|
|
C uses to extract URLs. This affects the entire |
321
|
|
|
|
|
|
|
class, including previously created objects. |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=cut |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
sub remove_attributes { |
326
|
1
|
|
|
1
|
1
|
1943
|
my $self = shift; |
327
|
1
|
|
|
|
|
2
|
my $attr = lc shift; |
328
|
|
|
|
|
|
|
|
329
|
1
|
|
|
|
|
3
|
delete $AUTO_METHODS{ $attr }; |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->attribute_list |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Returns a list of the attributes C pays |
335
|
|
|
|
|
|
|
attention to. |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=cut |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub attribute_list { |
340
|
4
|
|
|
4
|
1
|
520
|
my $self = shift; |
341
|
|
|
|
|
|
|
|
342
|
4
|
|
|
|
|
11
|
grep { $AUTO_METHODS{ $_ } eq 'attribute' } keys %AUTO_METHODS; |
|
46
|
|
|
|
|
72
|
|
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->tag_list |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Returns a list of the tags C pays attention to. |
348
|
|
|
|
|
|
|
These tags have convenience methods. |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=back |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=cut |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
sub tag_list { |
355
|
4
|
|
|
4
|
1
|
4119
|
my $self = shift; |
356
|
|
|
|
|
|
|
|
357
|
4
|
|
|
|
|
13
|
grep { $AUTO_METHODS{ $_ } eq 'tag' } keys %AUTO_METHODS; |
|
46
|
|
|
|
|
75
|
|
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=head2 Object methods |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=over 4 |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=item $extor->parse_file( $filename ) |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Parse the file for links. Inherited from C. |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=cut |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=item $extor->parse_url( $url [, $ua] ) |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Fetch URL and parse its content for links. |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=cut |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
sub parse_url { |
378
|
2
|
|
|
2
|
1
|
798
|
my $data = $_[0]->ua->get( $_[1] )->content; |
379
|
|
|
|
|
|
|
|
380
|
2
|
100
|
|
|
|
23624
|
return unless $data; |
381
|
|
|
|
|
|
|
|
382
|
1
|
|
|
|
|
12
|
$_[0]->parse( $data ); |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=item $extor->parse( $data ) |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
Parse the HTML in C<$data>. Inherited from C. |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=item $extor->clear_links |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Clear the link list. This way, you can use the same parser for |
392
|
|
|
|
|
|
|
another file. |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=cut |
395
|
|
|
|
|
|
|
|
396
|
1
|
|
|
1
|
1
|
2340
|
sub clear_links { $_[0]->_init_links( [] ) } |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=item $extor->links |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
Return a list of the links. |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=cut |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
sub links { |
405
|
76
|
|
|
|
|
91
|
map { $_->linkref } |
406
|
6
|
|
|
6
|
1
|
6352
|
grep { $_[0]->_is_an_allowed_tag( $_->tag ) } |
|
77
|
|
|
|
|
135
|
|
407
|
|
|
|
|
|
|
$_[0]->_link_refs |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
sub _is_an_allowed_tag { |
411
|
|
|
|
|
|
|
exists $AUTO_METHODS{$_[1]} |
412
|
|
|
|
|
|
|
and |
413
|
77
|
100
|
|
77
|
|
237
|
$AUTO_METHODS{$_[1]} eq 'tag' |
414
|
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=item $extor->img |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
Return a list of the links from all the SRC attributes of the |
419
|
|
|
|
|
|
|
IMG. |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=cut |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=item $extor->frame |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Return a list of all the links from all the SRC attributes of |
426
|
|
|
|
|
|
|
the FRAME. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=cut |
429
|
|
|
|
|
|
|
|
430
|
1
|
|
|
1
|
1
|
11
|
sub frames { ( $_[0]->frame, $_[0]->iframe ) } |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=item $extor->iframe |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Return a list of all the links from all the SRC attributes of |
435
|
|
|
|
|
|
|
the IFRAME. |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
=item $extor->frames |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
Returns the combined list from frame and iframe. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=item $extor->src |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
Return a list of the links from all the SRC attributes of any |
444
|
|
|
|
|
|
|
tag. |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=item $extor->a |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
449
|
|
|
|
|
|
|
A tags. |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=item $extor->area |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
454
|
|
|
|
|
|
|
AREA tags. |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=item $extor->base |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
459
|
|
|
|
|
|
|
BASE tags. There should only be one. |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=item $extor->href |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of any |
464
|
|
|
|
|
|
|
tag. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=item $extor->body, $extor->background |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
Return the link from the BODY tag's BACKGROUND attribute. |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=item $extor->script |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
Return the link from the SCRIPT tag's SRC attribute |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=item $extor->schemes( SCHEME, [ SCHEME, ... ] ) |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
Return the links that use any of SCHEME. These must be absolute URLs (which |
477
|
|
|
|
|
|
|
might include those converted to absolute URLs by specifying a |
478
|
|
|
|
|
|
|
base). SCHEME is case-insensitive. You can specify more than one |
479
|
|
|
|
|
|
|
scheme. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
482
|
|
|
|
|
|
|
the count of the matching links. |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=cut |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
sub schemes { |
487
|
16
|
|
|
16
|
1
|
21052
|
my( $self, @schemes ) = @_; |
488
|
|
|
|
|
|
|
|
489
|
16
|
|
|
|
|
27
|
my %schemes = map { lc, lc } @schemes; |
|
20
|
|
|
|
|
60
|
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
my @links = |
492
|
|
|
|
|
|
|
grep { |
493
|
416
|
|
|
|
|
514
|
my $scheme = eval { lc URI->new( $_ )->scheme }; |
|
416
|
|
|
|
|
665
|
|
494
|
416
|
|
|
|
|
28905
|
exists $schemes{ $scheme }; |
495
|
|
|
|
|
|
|
} |
496
|
16
|
|
|
|
|
35
|
map { $_->linkref } |
|
416
|
|
|
|
|
483
|
|
497
|
|
|
|
|
|
|
$self->_link_refs; |
498
|
|
|
|
|
|
|
|
499
|
16
|
100
|
|
|
|
90
|
wantarray ? @links : scalar @links; |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=item $extor->absolute_links |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
Returns the absolute URLs (which might include those converted to |
505
|
|
|
|
|
|
|
absolute URLs by specifying a base). |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
508
|
|
|
|
|
|
|
the count of the matching links. |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=cut |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
sub absolute_links { |
513
|
2
|
|
|
2
|
1
|
3420
|
my $self = shift; |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
my @links = |
516
|
|
|
|
|
|
|
grep { |
517
|
52
|
|
|
|
|
70
|
my $scheme = eval { lc URI->new( $_ )->scheme }; |
|
52
|
|
|
|
|
95
|
|
518
|
52
|
|
|
|
|
12865
|
length $scheme; |
519
|
|
|
|
|
|
|
} |
520
|
2
|
|
|
|
|
12
|
map { $_->linkref } |
|
52
|
|
|
|
|
67
|
|
521
|
|
|
|
|
|
|
$self->_link_refs; |
522
|
|
|
|
|
|
|
|
523
|
2
|
100
|
|
|
|
21
|
wantarray ? @links : scalar @links; |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=item $extor->relative_links |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
Returns the relatives URLs (which might exclude those converted to |
529
|
|
|
|
|
|
|
absolute URLs by specifying a base or having a base in the document). |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
532
|
|
|
|
|
|
|
the count of the matching links. |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=cut |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
sub relative_links { |
538
|
2
|
|
|
2
|
1
|
2731
|
my $self = shift; |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
my @links = |
541
|
|
|
|
|
|
|
grep { |
542
|
52
|
|
|
|
|
59
|
my $scheme = eval { URI->new( $_ )->scheme }; |
|
52
|
|
|
|
|
96
|
|
543
|
52
|
|
|
|
|
11129
|
! defined $scheme; |
544
|
|
|
|
|
|
|
} |
545
|
2
|
|
|
|
|
4
|
map { $_->linkref } |
|
52
|
|
|
|
|
61
|
|
546
|
|
|
|
|
|
|
$self->_link_refs; |
547
|
|
|
|
|
|
|
|
548
|
2
|
100
|
|
|
|
11
|
wantarray ? @links : scalar @links; |
549
|
|
|
|
|
|
|
} |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=back |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=head1 TO DO |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
This module doesn't handle all of the HTML tags that might |
556
|
|
|
|
|
|
|
have links. If someone wants those, I'll add them, or you |
557
|
|
|
|
|
|
|
can edit C<%AUTO_METHODS> in the source. |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=head1 CREDITS |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
Will Crain who identified a problem with IMG links that had |
562
|
|
|
|
|
|
|
a USEMAP attribute. |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 SOURCE AVAILABILITY |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
This module is in Github |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
https://github.com:CPAN-Adoptable-Modules/html-simplelinkextor.git |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=head1 AUTHORS |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
brian d foy, C<< >> |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
Copyright © 2004-2019, brian d foy . All rights reserved. |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
579
|
|
|
|
|
|
|
it under the terms of the Artistic License 2.0. |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=cut |
582
|
|
|
|
|
|
|
|
583
|
0
|
|
|
|
|
0
|
BEGIN { |
584
|
|
|
|
|
|
|
package |
585
|
|
|
|
|
|
|
HTML::SimpleLinkExtor::LinkRef; |
586
|
8
|
|
|
8
|
|
122
|
use Carp qw(croak); |
|
8
|
|
|
0
|
|
16
|
|
|
8
|
|
|
|
|
1341
|
|
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
sub new { |
589
|
181
|
|
|
181
|
|
238
|
my( $class, $arrayref ) = @_; |
590
|
181
|
50
|
|
|
|
301
|
croak "Not an array reference argument!" unless ref $arrayref eq ref []; |
591
|
181
|
|
|
|
|
285
|
bless $arrayref, $class; |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
|
594
|
402
|
|
|
402
|
|
665
|
sub tag { $_[0]->[0] } |
595
|
75
|
|
|
75
|
|
131
|
sub attribute { $_[0]->[1] } |
596
|
659
|
|
|
659
|
|
863
|
sub linkref { $_[0]->[2] } |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
1; |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
__END__ |