| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HTML::SimpleLinkExtor; |
|
2
|
8
|
|
|
8
|
|
10515
|
use strict; |
|
|
8
|
|
|
|
|
19
|
|
|
|
8
|
|
|
|
|
241
|
|
|
3
|
|
|
|
|
|
|
|
|
4
|
8
|
|
|
8
|
|
39
|
use warnings; |
|
|
8
|
|
|
|
|
15
|
|
|
|
8
|
|
|
|
|
199
|
|
|
5
|
8
|
|
|
8
|
|
33
|
no warnings; |
|
|
8
|
|
|
|
|
16
|
|
|
|
8
|
|
|
|
|
317
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
8
|
|
|
8
|
|
4364
|
use subs qw(); |
|
|
8
|
|
|
|
|
199
|
|
|
|
8
|
|
|
|
|
230
|
|
|
8
|
8
|
|
|
8
|
|
44
|
use vars qw( $AUTOLOAD ); |
|
|
8
|
|
|
|
|
15
|
|
|
|
8
|
|
|
|
|
446
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
8
|
|
|
8
|
|
3872
|
use AutoLoader; |
|
|
8
|
|
|
|
|
12097
|
|
|
|
8
|
|
|
|
|
47
|
|
|
11
|
8
|
|
|
8
|
|
327
|
use Carp qw(carp); |
|
|
8
|
|
|
|
|
17
|
|
|
|
8
|
|
|
|
|
412
|
|
|
12
|
8
|
|
|
8
|
|
3766
|
use HTML::LinkExtor; |
|
|
8
|
|
|
|
|
69680
|
|
|
|
8
|
|
|
|
|
295
|
|
|
13
|
8
|
|
|
8
|
|
5665
|
use LWP::UserAgent; |
|
|
8
|
|
|
|
|
349710
|
|
|
|
8
|
|
|
|
|
319
|
|
|
14
|
8
|
|
|
8
|
|
69
|
use URI; |
|
|
8
|
|
|
|
|
19
|
|
|
|
8
|
|
|
|
|
309
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '1.273'; |
|
17
|
|
|
|
|
|
|
|
|
18
|
8
|
|
|
8
|
|
49
|
use parent qw(HTML::LinkExtor); |
|
|
8
|
|
|
|
|
20
|
|
|
|
8
|
|
|
|
|
49
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Registry of the convenience methods served by AUTOLOAD. Each key is a
# method name; the value says whether the name is matched against a link's
# attribute ('attribute') or against its tag ('tag').
our %AUTO_METHODS = (
    # attribute names
    background => 'attribute',
    href       => 'attribute',
    src        => 'attribute',

    # tag names
    a          => 'tag',
    area       => 'tag',
    base       => 'tag',
    body       => 'tag',
    img        => 'tag',
    frame      => 'tag',
    iframe     => 'tag',

    script     => 'tag',
);
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
|
37
|
10
|
|
|
10
|
|
19725
|
# Explicit no-op destructor so that object destruction is not routed
# through the AUTOLOAD defined in this package.
sub DESTROY {
    return 1;
}
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Serve the convenience methods registered in %AUTO_METHODS (img, href, ...).
#
# A registered name is forwarded to _extract(), whose result is returned.
# Any other name produces a carp() warning and an empty return.
sub AUTOLOAD {
    my( $self ) = @_;

    # $AUTOLOAD holds the fully qualified name; keep only the last part
    ( my $name = $AUTOLOAD ) =~ s/.*:://;

    if( exists $AUTO_METHODS{$name} ) {
        return $self->_extract( $name );
    }

    carp __PACKAGE__ . ": method $name unknown";
    return;
}
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Report whether every one of the named methods is available.
#
# Returns 1 when _can() accepts all of the names (also when no names are
# given) and 0 as soon as one name is rejected.
sub can {
    my $self = shift;

    for my $name ( @_ ) {
        return 0 if ! $self->_can( $name );
    }

    return 1;
}
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Decide whether a single method name is callable on this object.
#
#   $self   - the object (or class name) being asked
#   $method - the method name to look for
#
# Returns 1 when the name is registered in %AUTO_METHODS (and therefore
# served by AUTOLOAD) or when normal method resolution finds it, 0 otherwise.
#
# The lookup goes through UNIVERSAL::can so that inherited methods are found
# too; a symbolic "defined &name" test only sees subs compiled in the
# current package. It is called as a function because this class overrides
# the can() method itself.
sub _can {
    my( $self, $method ) = @_;

    return 1 if exists $AUTO_METHODS{ $method };
    return 1 if UNIVERSAL::can( $self, $method );

    return 0;
}
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Set or drop the cached list of link references.
#
# With an array reference argument the reference is stored under
# '_SimpleLinkExtor_links' and the object is returned. With anything else
# the cache entry is deleted and nothing is returned.
sub _init_links {
    my( $self, $links ) = @_;

    if( ref $links eq ref [] ) {
        $self->{'_SimpleLinkExtor_links'} = $links;
        return $self;
    }

    delete $self->{'_SimpleLinkExtor_links'};
    return;
}
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# Return the list of LinkRef objects for everything parsed so far, with
# relative URLs resolved when a base URL is known.
#
# The first call wraps the parent class's links in
# HTML::SimpleLinkExtor::LinkRef objects and caches them through
# _init_links(); later calls reuse the cache as long as it holds a
# reference.
#
# XXX (from the original author): testing the cache with ref() is fragile;
# it is the reason _init_links() has to delete the slot to force a re-read.
sub _link_refs {
    my( $self ) = @_;

    my @refs;
    my $cached = $self->{'_SimpleLinkExtor_links'};

    if( ref $cached ) {
        @refs = @$cached;
    }
    else {
        @refs = map { HTML::SimpleLinkExtor::LinkRef->new( $_ ) }
            $self->SUPER::links();
        $self->_init_links( \@refs );
    }

    # defined() so that an empty string means "do not resolve"
    if( ! defined $self->{'_SimpleLinkExtor_base'} ) {
        # take the base from the first <base href=...> link, if there is one
        for my $ref ( @refs ) {
            next unless $ref->[0] eq 'base' and $ref->[1] eq 'href';
            $self->{'_SimpleLinkExtor_base'} = $ref->[-1];
            last;
        }

        # The BASE HREF entry stays in the list; whether to remove it is
        # an open question left by the original author.
    }

    $self->_add_base( \@refs );

    return @refs;
}
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Return the URLs of the links whose tag or attribute name equals $type.
#
# %AUTO_METHODS decides which of the two is compared: names registered as
# 'tag' are matched with the LinkRef tag() accessor, everything else with
# attribute().
sub _extract {
    my( $self, $type ) = @_;

    my $accessor = 'attribute';
    $accessor = 'tag' if $AUTO_METHODS{$type} eq 'tag';

    my @matching = grep { $_->$accessor() eq $type } $self->_link_refs;
    my @urls     = map  { $_->linkref } @matching;

    return @urls;
}
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Resolve the URLs in the given link tuples against the stored base URL.
#
#   $array_ref - array reference of link tuples; they are modified in place
#
# Does nothing when no (true) base URL is stored in '_SimpleLinkExtor_base'.
#
# Each tuple is treated as ( tag, attr => url, attr => url, ... ), the
# layout HTML::LinkExtor documents for its links. Only the attribute
# positions (odd indices) are compared with %AUTO_METHODS. Walking every
# index would let a URL whose text happens to equal a registered name
# (for example action="a") be taken for an attribute name and would write
# a bogus element past the end of the tuple.
sub _add_base {
    my( $self, $array_ref ) = @_;

    my $base = $self->{'_SimpleLinkExtor_base'};
    return unless $base;

    foreach my $tuple ( @$array_ref ) {
        # stop before the last index so that $index + 1 is always in range
        for( my $index = 1; $index < $#$tuple; $index += 2 ) {
            next unless exists $AUTO_METHODS{ $tuple->[$index] };

            my $url = URI->new( $tuple->[$index + 1] );
            next unless ref $url;
            $tuple->[$index + 1] = $url->abs( $base );
        }
    }

    return;
}
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=encoding utf8 |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head1 NAME |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
HTML::SimpleLinkExtor - Extract links from HTML |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
use HTML::SimpleLinkExtor; |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
my $extor = HTML::SimpleLinkExtor->new(); |
|
168
|
|
|
|
|
|
|
$extor->parse_file($filename); |
|
169
|
|
|
|
|
|
|
#--or-- |
|
170
|
|
|
|
|
|
|
$extor->parse($html); |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
$extor->parse_file($other_file); # get more links |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
$extor->clear_links; # reset the link list |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
#extract all of the links |
|
177
|
|
|
|
|
|
|
@all_links = $extor->links; |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
#extract the img links |
|
180
|
|
|
|
|
|
|
@img_srcs = $extor->img; |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
#extract the frame links |
|
183
|
|
|
|
|
|
|
@frame_srcs = $extor->frame; |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
#extract the hrefs |
|
186
|
|
|
|
|
|
|
@area_hrefs = $extor->area; |
|
187
|
|
|
|
|
|
|
@a_hrefs = $extor->a; |
|
188
|
|
|
|
|
|
|
@base_hrefs = $extor->base; |
|
189
|
|
|
|
|
|
|
@hrefs = $extor->href; |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
#extract the body background link |
|
192
|
|
|
|
|
|
|
@body_bg = $extor->body; |
|
193
|
|
|
|
|
|
|
@background = $extor->background; |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
@links = $extor->schemes( 'http' ); |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This is a simple HTML link extractor designed for the person who does |
|
200
|
|
|
|
|
|
|
not want to deal with the intricacies of C<HTML::Parser> or the |
|
201
|
|
|
|
|
|
|
de-referencing needed to get links out of C<HTML::LinkExtor>. |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
You can extract all the links or some of the links (based on the HTML |
|
204
|
|
|
|
|
|
|
tag name or attribute name). If a C<< <BASE HREF> >> tag is found, |
|
205
|
|
|
|
|
|
|
all of the relative URLs will be resolved according to that reference. |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
This module is simply a subclass around C<HTML::LinkExtor>, so it can |
|
208
|
|
|
|
|
|
|
only parse what that module can handle. Invalid HTML or XHTML may |
|
209
|
|
|
|
|
|
|
cause problems. |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
If you parse multiple files, the link list grows and contains the |
|
212
|
|
|
|
|
|
|
aggregate list of links for all of the files parsed. If you want to |
|
213
|
|
|
|
|
|
|
reset the link list between files, use the clear_links method. |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 Class Methods |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=over |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new() |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
Create the link extractor object. |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new('') |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item $extor = HTML::SimpleLinkExtor->new($base) |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Create the link extractor object and resolve the relative URLs |
|
228
|
|
|
|
|
|
|
according to the supplied base URL. The supplied base URL overrides |
|
229
|
|
|
|
|
|
|
any other base URL found in the HTML. |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Create the link extractor object and do not resolve relative |
|
232
|
|
|
|
|
|
|
links. |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=cut |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# Construct a link extractor.
#
#   $class - class to bless the new object into
#   $base  - optional base URL, stored as '_SimpleLinkExtor_base'
#
# The object is built by HTML::LinkExtor and then reblessed into $class.
# It also gets its own LWP::UserAgent (available through ua()) and an
# empty link cache.
sub new {
    my( $class, $base ) = @_;

    # direct method call instead of the indirect "new HTML::LinkExtor"
    # form, which is ambiguous to parse
    my $self = HTML::LinkExtor->new;
    bless $self, $class;

    $self->{'_SimpleLinkExtor_base'} = $base;
    $self->{'_ua'} = LWP::UserAgent->new;
    $self->_init_links;

    return $self;
}
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->ua; |
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Returns the internal user agent, an C<LWP::UserAgent> object. |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=cut |
|
255
|
|
|
|
|
|
|
|
|
256
|
2
|
|
|
2
|
1
|
11
|
# Return whatever is stored in the object's '_ua' slot.
sub ua {
    my( $self ) = @_;
    return $self->{_ua};
}
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->add_tags( TAG [, TAG ] ) |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
C<HTML::SimpleLinkExtor> keeps an internal list of HTML tags (such as |
|
261
|
|
|
|
|
|
|
'a' and 'img') that have URLs as values. If you run into another tag |
|
262
|
|
|
|
|
|
|
that this module doesn't handle, please send it to me and I'll add it. |
|
263
|
|
|
|
|
|
|
Until then you can add that tag to the internal list. This affects |
|
264
|
|
|
|
|
|
|
the entire class, including previously created objects. |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# Register one or more tag names in %AUTO_METHODS.
#
#   $self - class or object (unused)
#   @tags - tag names; each is lowercased before it is stored
#
# Every name given is registered, as the documented "TAG [, TAG ]"
# signature promises; undefined entries are skipped. Returns the string
# 'tag', which is what the single-tag implementation returned.
sub add_tags {
    my( $self, @tags ) = @_;

    foreach my $tag ( grep { defined } @tags ) {
        $AUTO_METHODS{ lc $tag } = 'tag';
    }

    return 'tag';
}
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->add_attributes( ATTR [, ATTR] ) |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
C<HTML::SimpleLinkExtor> keeps an internal list of HTML tag attributes |
|
278
|
|
|
|
|
|
|
(such as 'href' and 'src') that have URLs as values. If you run into |
|
279
|
|
|
|
|
|
|
another attribute that this module doesn't handle, please send it to |
|
280
|
|
|
|
|
|
|
me and I'll add it. Until then you can add that attribute to the |
|
281
|
|
|
|
|
|
|
internal list. This affects the entire class, including previously |
|
282
|
|
|
|
|
|
|
created objects. |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=cut |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item can() |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
A smarter C<can> that can tell which attributes are also methods. |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
# Register one or more attribute names in %AUTO_METHODS.
#
#   $self  - class or object (unused)
#   @attrs - attribute names; each is lowercased before it is stored
#
# Every name given is registered, as the documented "ATTR [, ATTR]"
# signature promises; undefined entries are skipped. Returns the string
# 'attribute', which is what the single-attribute implementation returned.
sub add_attributes {
    my( $self, @attrs ) = @_;

    foreach my $attr ( grep { defined } @attrs ) {
        $AUTO_METHODS{ lc $attr } = 'attribute';
    }

    return 'attribute';
}
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->remove_tags( TAG [, TAG ] ) |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
Take tags out of the internal list that C<HTML::SimpleLinkExtor> uses |
|
302
|
|
|
|
|
|
|
to extract URLs. This affects the entire class, including previously |
|
303
|
|
|
|
|
|
|
created objects. |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# Remove one or more names from %AUTO_METHODS.
#
#   $self - class or object (unused)
#   @tags - names to remove; each is lowercased before the lookup
#
# Every name given is removed, as the documented "TAG [, TAG ]" signature
# promises; undefined entries are skipped. Returns the deleted value(s),
# which for a single name is the same value the one-name version returned.
sub remove_tags {
    my( $self, @tags ) = @_;

    my @keys = map { lc } grep { defined } @tags;

    return delete @AUTO_METHODS{ @keys };
}
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->remove_attributes( ATTR [, ATTR] ) |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Takes attributes out of the internal list that |
|
317
|
|
|
|
|
|
|
C<HTML::SimpleLinkExtor> uses to extract URLs. This affects the entire |
|
318
|
|
|
|
|
|
|
class, including previously created objects. |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# Remove one or more names from %AUTO_METHODS.
#
#   $self  - class or object (unused)
#   @attrs - names to remove; each is lowercased before the lookup
#
# Every name given is removed, as the documented "ATTR [, ATTR]" signature
# promises; undefined entries are skipped. Returns the deleted value(s),
# which for a single name is the same value the one-name version returned.
sub remove_attributes {
    my( $self, @attrs ) = @_;

    my @keys = map { lc } grep { defined } @attrs;

    return delete @AUTO_METHODS{ @keys };
}
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->attribute_list |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
Returns a list of the attributes C<HTML::SimpleLinkExtor> pays |
|
332
|
|
|
|
|
|
|
attention to. |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# List the names registered in %AUTO_METHODS as attributes.
# The order is whatever keys() yields. In scalar context the count is
# returned.
sub attribute_list {
    my @names;

    foreach my $name ( keys %AUTO_METHODS ) {
        push @names, $name if $AUTO_METHODS{$name} eq 'attribute';
    }

    return @names;
}
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item HTML::SimpleLinkExtor->tag_list |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
Returns a list of the tags C<HTML::SimpleLinkExtor> pays attention to. |
|
345
|
|
|
|
|
|
|
These tags have convenience methods. |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=back |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
=cut |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# List the names registered in %AUTO_METHODS as tags.
# The order is whatever keys() yields. In scalar context the count is
# returned.
sub tag_list {
    my @names;

    foreach my $name ( keys %AUTO_METHODS ) {
        push @names, $name if $AUTO_METHODS{$name} eq 'tag';
    }

    return @names;
}
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
=head2 Object methods |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=over 4 |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=item $extor->parse_file( $filename ) |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Parse the file for links. Inherited from C<HTML::Parser>. |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=cut |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=item $extor->parse_url( $url [, $ua] ) |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
Fetch URL and parse its content for links. |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=cut |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# Fetch a URL and parse the response body for links.
#
#   $self - the extractor
#   $url  - the URL to fetch
#   $ua   - optional user agent used for this request; defaults to the
#           object's own agent from ua()
#
# Returns the result of parse() on success. Returns nothing when the
# request fails or the body is empty, so an error body is never parsed
# as if it were the requested page.
sub parse_url {
    my( $self, $url, $ua ) = @_;

    # honor the documented optional user agent argument
    $ua = $self->ua unless defined $ua;

    my $response = $ua->get( $url );
    return unless $response->is_success;

    my $data = $response->content;
    return unless $data;

    return $self->parse( $data );
}
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=item $extor->parse( $data ) |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
Parse the HTML in C<$data>. Inherited from C<HTML::Parser>. |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=item $extor->clear_links |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Clear the link list. This way, you can use the same parser for |
|
389
|
|
|
|
|
|
|
another file. |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=cut |
|
392
|
|
|
|
|
|
|
|
|
393
|
1
|
|
|
1
|
1
|
2793
|
# Forget every link collected so far, so the parser can be reused.
#
# Two things are reset:
#   * the parent class's pending links: HTML::LinkExtor documents that
#     its links() method truncates its internal list, so calling it and
#     discarding the result empties that list;
#   * this class's cache, by calling _init_links() without an argument,
#     which deletes the cache slot.
#
# Caching an empty array reference instead would count as a valid cache in
# _link_refs() from then on, so links from any later parse would never be
# returned.
#
# Returns the object.
sub clear_links {
    my( $self ) = @_;

    $self->SUPER::links();
    $self->_init_links;

    return $self;
}
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=item $extor->links |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Return a list of the links. |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=cut |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# Return the URLs of all links whose tag passes _is_an_allowed_tag().
sub links {
    my( $self ) = @_;

    my @allowed = grep { $self->_is_an_allowed_tag( $_->tag ) }
        $self->_link_refs;

    return map { $_->linkref } @allowed;
}
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# True when the given name is registered in %AUTO_METHODS as a tag.
sub _is_an_allowed_tag {
    my( $self, $tag ) = @_;

    return( exists( $AUTO_METHODS{$tag} ) && $AUTO_METHODS{$tag} eq 'tag' );
}
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=item $extor->img |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
Return a list of the links from all the SRC attributes of the |
|
416
|
|
|
|
|
|
|
IMG. |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=cut |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=item $extor->frame |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Return a list of all the links from all the SRC attributes of |
|
423
|
|
|
|
|
|
|
the FRAME. |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=cut |
|
426
|
|
|
|
|
|
|
|
|
427
|
1
|
|
|
1
|
1
|
13
|
# Return the result of frame() followed by the result of iframe().
sub frames {
    my( $self ) = @_;

    return ( $self->frame, $self->iframe );
}
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item $extor->iframe |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Return a list of all the links from all the SRC attributes of |
|
432
|
|
|
|
|
|
|
the IFRAME. |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item $extor->frames |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Returns the combined list from frame and iframe. |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=item $extor->src |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
Return a list of the links from all the SRC attributes of any |
|
441
|
|
|
|
|
|
|
tag. |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=item $extor->a |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
|
446
|
|
|
|
|
|
|
A tags. |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item $extor->area |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
|
451
|
|
|
|
|
|
|
AREA tags. |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=item $extor->base |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of the |
|
456
|
|
|
|
|
|
|
BASE tags. There should only be one. |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=item $extor->href |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
Return a list of the links from all the HREF attributes of any |
|
461
|
|
|
|
|
|
|
tag. |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
=item $extor->body, $extor->background |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
Return the link from the BODY tag's BACKGROUND attribute. |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=item $extor->script |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Return the link from the SCRIPT tag's SRC attribute |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=item $extor->schemes( SCHEME, [ SCHEME, ... ] ) |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
Return the links that use any of SCHEME. These must be absolute URLs (which |
|
474
|
|
|
|
|
|
|
might include those converted to absolute URLs by specifying a |
|
475
|
|
|
|
|
|
|
base). SCHEME is case-insensitive. You can specify more than one |
|
476
|
|
|
|
|
|
|
scheme. |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
|
479
|
|
|
|
|
|
|
the count of the matching links. |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=cut |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# Return the links whose URL scheme is one of the given schemes.
#
#   @schemes - scheme names, compared case-insensitively
#
# In list context the matching links are returned, in scalar context
# their count.
sub schemes {
    my( $self, @schemes ) = @_;

    # lookup table keyed by the lowercased scheme names
    my %wanted;
    $wanted{ lc $_ } = lc $_ for @schemes;

    my @links;
    foreach my $link ( map { $_->linkref } $self->_link_refs ) {
        # eval guards against URI failing on a malformed link
        my $scheme = eval { lc URI->new( $link )->scheme };
        push @links, $link if exists $wanted{ $scheme };
    }

    return wantarray ? @links : scalar @links;
}
|
498
|
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=item $extor->absolute_links |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
Returns the absolute URLs (which might include those converted to |
|
502
|
|
|
|
|
|
|
absolute URLs by specifying a base). |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
|
505
|
|
|
|
|
|
|
the count of the matching links. |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=cut |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# Return the links that have a URL scheme, i.e. the absolute ones.
#
# In list context the matching links are returned, in scalar context
# their count.
sub absolute_links {
    my( $self ) = @_;

    my @links;
    foreach my $link ( map { $_->linkref } $self->_link_refs ) {
        # eval guards against URI failing on a malformed link
        my $scheme = eval { lc URI->new( $link )->scheme };
        push @links, $link if length $scheme;
    }

    return wantarray ? @links : scalar @links;
}
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item $extor->relative_links |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Returns the relative URLs (which might exclude those converted to |
|
526
|
|
|
|
|
|
|
absolute URLs by specifying a base or having a base in the document). |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
In list context it returns the links. In scalar context it returns |
|
529
|
|
|
|
|
|
|
the count of the matching links. |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=cut |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
# Return the links for which no URL scheme can be determined, i.e. the
# relative ones.
#
# In list context the matching links are returned, in scalar context
# their count.
sub relative_links {
    my( $self ) = @_;

    my @links;
    foreach my $link ( map { $_->linkref } $self->_link_refs ) {
        # eval guards against URI failing on a malformed link
        my $scheme = eval { URI->new( $link )->scheme };
        push @links, $link unless defined $scheme;
    }

    return wantarray ? @links : scalar @links;
}
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=back |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
=head1 TO DO |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
This module doesn't handle all of the HTML tags that might |
|
553
|
|
|
|
|
|
|
have links. If someone wants those, I'll add them, or you |
|
554
|
|
|
|
|
|
|
can edit C<%AUTO_METHODS> in the source. |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=head1 CREDITS |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Will Crain who identified a problem with IMG links that had |
|
559
|
|
|
|
|
|
|
a USEMAP attribute. |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=head1 AUTHORS |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
brian d foy, C<< >> |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Maintained by Nigel Horne, C<< >> |
|
566
|
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
568
|
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
Copyright © 2004-2019, brian d foy . All rights reserved. |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
|
572
|
|
|
|
|
|
|
it under the terms of the Artistic License 2.0. |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=cut |
|
575
|
|
|
|
|
|
|
|
|
576
|
0
|
|
|
|
|
0
|
# Helper class that wraps one link tuple (an array reference) and gives
# its first three elements names.
BEGIN {
    # NOTE(review): the package statement is split over two lines,
    # presumably to keep indexers from listing this helper - confirm.
    package
        HTML::SimpleLinkExtor::LinkRef;
    use Carp qw(croak);

    # Bless the given array reference into $class.
    # Croaks when the argument is not a plain array reference.
    sub new {
        my $class = shift;
        my $tuple = shift;

        unless( ref $tuple eq ref [] ) {
            croak "Not an array reference argument!";
        }

        return bless $tuple, $class;
    }

    # element 0: the tag name
    sub tag {
        my( $self ) = @_;
        return $self->[0];
    }

    # element 1: the attribute name
    sub attribute {
        my( $self ) = @_;
        return $self->[1];
    }

    # element 2: the link itself
    sub linkref {
        my( $self ) = @_;
        return $self->[2];
    }
}
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
1; |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
__END__ |