line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::LinkExtractor;
|
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
16030
|
use strict;
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
102
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
1883
|
use HTML::TokeParser 2; # use HTML::TokeParser::Simple 2;
|
|
2
|
|
|
|
|
27086
|
|
|
2
|
|
|
|
|
65
|
|
6
|
2
|
|
|
2
|
|
1713
|
use URI 1;
|
|
2
|
|
|
|
|
17247
|
|
|
2
|
|
|
|
|
107
|
|
7
|
2
|
|
|
2
|
|
22
|
use Carp qw( croak );
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
173
|
|
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
11
|
use vars qw( $VERSION );
|
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
118
|
|
10
|
1
|
|
|
|
|
4
|
$VERSION = '0.13';
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
## The html tags which might have URLs
|
13
|
|
|
|
|
|
|
# the master list of tagolas and required attributes (to constitute a link)
|
14
|
2
|
|
|
2
|
|
11
|
use vars qw( %TAGS );
|
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
318
|
|
15
|
1
|
|
|
|
|
76
|
%TAGS = (
|
16
|
|
|
|
|
|
|
a => [qw( href )],
|
17
|
|
|
|
|
|
|
applet => [qw( archive code codebase src )],
|
18
|
|
|
|
|
|
|
area => [qw( href )],
|
19
|
|
|
|
|
|
|
base => [qw( href )],
|
20
|
|
|
|
|
|
|
bgsound => [qw( src )],
|
21
|
|
|
|
|
|
|
blockquote => [qw( cite )],
|
22
|
|
|
|
|
|
|
body => [qw( background )],
|
23
|
|
|
|
|
|
|
del => [qw( cite )],
|
24
|
|
|
|
|
|
|
div => [qw( src )], # IE likes it, but don't know where it's documented
|
25
|
|
|
|
|
|
|
embed => [qw( pluginspage pluginurl src )],
|
26
|
|
|
|
|
|
|
form => [qw( action )],
|
27
|
|
|
|
|
|
|
frame => [qw( src longdesc )],
|
28
|
|
|
|
|
|
|
iframe => [qw( src )],
|
29
|
|
|
|
|
|
|
ilayer => [qw( background src )],
|
30
|
|
|
|
|
|
|
img => [qw( dynsrc longdesc lowsrc src usemap )],
|
31
|
|
|
|
|
|
|
input => [qw( dynsrc lowsrc src )],
|
32
|
|
|
|
|
|
|
ins => [qw( cite )],
|
33
|
|
|
|
|
|
|
isindex => [qw( action )], # real oddball
|
34
|
|
|
|
|
|
|
layer => [qw( src )],
|
35
|
|
|
|
|
|
|
link => [qw( src href )],
|
36
|
|
|
|
|
|
|
object => [qw( archive classid code codebase data usemap )],
|
37
|
|
|
|
|
|
|
q => [qw( cite )],
|
38
|
|
|
|
|
|
|
script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG!
|
39
|
|
|
|
|
|
|
sound => [qw( src )],
|
40
|
|
|
|
|
|
|
table => [qw( background )],
|
41
|
|
|
|
|
|
|
td => [qw( background )],
|
42
|
|
|
|
|
|
|
th => [qw( background )],
|
43
|
|
|
|
|
|
|
tr => [qw( background )],
|
44
|
|
|
|
|
|
|
## the exotic cases
|
45
|
|
|
|
|
|
|
meta => undef,
|
46
|
|
|
|
|
|
|
'!doctype' => [qw( url )], # is really a process instruction
|
47
|
|
|
|
|
|
|
);
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
## tags which contain <.*?> STUFF TO GET \w+>
|
50
|
2
|
|
|
2
|
|
10
|
use vars qw( @TAGS_IN_NEED );
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
425
|
|
51
|
1
|
|
|
|
|
6
|
@TAGS_IN_NEED = qw(
|
52
|
|
|
|
|
|
|
a
|
53
|
|
|
|
|
|
|
blockquote
|
54
|
|
|
|
|
|
|
del
|
55
|
|
|
|
|
|
|
ins
|
56
|
|
|
|
|
|
|
q
|
57
|
|
|
|
|
|
|
);
|
58
|
|
|
|
|
|
|
|
59
|
2
|
|
|
2
|
|
21
|
use vars qw( @VALID_URL_ATTRIBUTES );
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
5269
|
|
60
|
1
|
|
|
|
|
8
|
@VALID_URL_ATTRIBUTES = qw(
|
61
|
|
|
|
|
|
|
action
|
62
|
|
|
|
|
|
|
archive
|
63
|
|
|
|
|
|
|
background
|
64
|
|
|
|
|
|
|
cite
|
65
|
|
|
|
|
|
|
classid
|
66
|
|
|
|
|
|
|
code
|
67
|
|
|
|
|
|
|
codebase
|
68
|
|
|
|
|
|
|
data
|
69
|
|
|
|
|
|
|
dynsrc
|
70
|
|
|
|
|
|
|
href
|
71
|
|
|
|
|
|
|
longdesc
|
72
|
|
|
|
|
|
|
lowsrc
|
73
|
|
|
|
|
|
|
pluginspage
|
74
|
|
|
|
|
|
|
pluginurl
|
75
|
|
|
|
|
|
|
src
|
76
|
|
|
|
|
|
|
usemap
|
77
|
|
|
|
|
|
|
);
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub new {
|
81
|
5
|
|
|
5
|
1
|
813650
|
my($class, $cb, $base, $strip) = @_;
|
82
|
5
|
|
|
|
|
26
|
my $self = bless {}, $class;
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
85
|
5
|
100
|
|
|
|
44
|
$self->{_cb} = $cb if defined $cb;
|
86
|
5
|
100
|
|
|
|
49
|
$self->{_base} = URI->new($base) if defined $base;
|
87
|
5
|
|
100
|
|
|
10300
|
$self->strip( $strip || 0 );
|
88
|
|
|
|
|
|
|
|
89
|
5
|
|
|
|
|
17
|
return $self;
|
90
|
|
|
|
|
|
|
}
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub strip {
|
93
|
17
|
|
|
17
|
1
|
762
|
my( $self, $on ) = @_;
|
94
|
17
|
100
|
|
|
|
75
|
return $self->{_strip} unless defined $on;
|
95
|
6
|
100
|
|
|
|
54
|
return $self->{_strip} = $on ? 1 : 0;
|
96
|
|
|
|
|
|
|
}
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
## $p= HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents); # ## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filecontents);
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
sub parse {
|
101
|
5
|
|
|
5
|
1
|
728
|
my( $this, $hmmm ) = @_;
|
102
|
5
|
|
|
|
|
51
|
my $tp = new HTML::TokeParser( $hmmm ); # my $tp = new HTML::TokeParser::Simple( $hmmm );
|
103
|
|
|
|
|
|
|
|
104
|
5
|
50
|
|
|
|
914
|
unless($tp) {
|
105
|
0
|
|
|
|
|
0
|
croak qq[ Couldn't create a HTML::TokeParser object: $!]; # croak qq[ Couldn't create a HTML::TokeParser::Simple object: $!];
|
106
|
|
|
|
|
|
|
}
|
107
|
|
|
|
|
|
|
|
108
|
5
|
|
|
|
|
12
|
$this->{_tp} = $tp;
|
109
|
|
|
|
|
|
|
|
110
|
5
|
|
|
|
|
22
|
$this->_parsola();
|
111
|
5
|
|
|
|
|
36
|
return();
|
112
|
|
|
|
|
|
|
}
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub _parsola {
|
115
|
5
|
|
|
5
|
|
11
|
my $self = shift;
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
## a stack of links for keeping track of TEXT
|
118
|
|
|
|
|
|
|
## which is all of "text"
|
119
|
5
|
|
|
|
|
10
|
my @TEXT = ();
|
120
|
5
|
|
|
|
|
14
|
$self->{_LINKS} = [];
|
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# ["S", $tag, $attr, $attrseq, $text]
|
124
|
|
|
|
|
|
|
# ["E", $tag, $text]
|
125
|
|
|
|
|
|
|
# ["T", $text, $is_data]
|
126
|
|
|
|
|
|
|
# ["C", $text]
|
127
|
|
|
|
|
|
|
# ["D", $text]
|
128
|
|
|
|
|
|
|
# ["PI", $token0, $text]
|
129
|
|
|
|
|
|
|
|
130
|
5
|
|
|
|
|
28
|
while (my $T = $self->{_tp}->get_token() ) {
|
131
|
51
|
|
|
|
|
683
|
my $NL; #NewLink
|
132
|
51
|
|
|
|
|
74
|
my $Tag = $T->[1]; # my $Tag = $T->return_tag;
|
133
|
51
|
|
|
|
|
62
|
my $got_TAGS_IN_NEED=0;
|
134
|
|
|
|
|
|
|
## Start tag?
|
135
|
51
|
100
|
|
|
|
137
|
if($T->[0] eq 'S' ) { # if($T->is_start_tag) {
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
136
|
15
|
100
|
|
|
|
44
|
next unless exists $TAGS{$Tag};
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
## Do we have a tag for which we want to capture text?
|
139
|
14
|
|
|
|
|
42
|
$got_TAGS_IN_NEED = 0;
|
140
|
14
|
|
|
|
|
23
|
$got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;
|
|
70
|
|
|
|
|
376
|
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
## then check to see if we got things besides META :)
|
143
|
14
|
100
|
|
|
|
44
|
if(defined $TAGS{ $Tag }) {
|
|
|
50
|
|
|
|
|
|
144
|
|
|
|
|
|
|
|
145
|
13
|
|
|
|
|
16
|
for my $Btag(@{$TAGS{$Tag}}) {
|
|
13
|
|
|
|
|
32
|
|
146
|
|
|
|
|
|
|
## and we check if they do have one with a value
|
147
|
25
|
100
|
|
|
|
72
|
if(exists $T->[2]->{ $Btag }) { # if(exists $T->return_attr()->{ $Btag }) {
|
148
|
|
|
|
|
|
|
|
149
|
13
|
|
|
|
|
16
|
$NL = $T->[2]; # $NL = $T->return_attr();
|
150
|
|
|
|
|
|
|
## TAGS_IN_NEED are tags in deed (start capturing the STUFF)
|
151
|
13
|
100
|
|
|
|
29
|
if($got_TAGS_IN_NEED) {
|
152
|
9
|
|
|
|
|
13
|
push @TEXT, $NL;
|
153
|
9
|
|
|
|
|
39
|
$NL->{_TEXT} = "";
|
154
|
|
|
|
|
|
|
}
|
155
|
|
|
|
|
|
|
}
|
156
|
|
|
|
|
|
|
}
|
157
|
|
|
|
|
|
|
}elsif($Tag eq 'meta') {
|
158
|
1
|
|
|
|
|
3
|
$NL = $T->[2]; # $NL = $T->return_attr();
|
159
|
|
|
|
|
|
|
|
160
|
1
|
50
|
33
|
|
|
21
|
if(defined $$NL{content} and length $$NL{content} and (
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
161
|
|
|
|
|
|
|
defined $$NL{'http-equiv'} && $$NL{'http-equiv'} =~ /refresh/i
|
162
|
|
|
|
|
|
|
or
|
163
|
|
|
|
|
|
|
defined $$NL{'name'} && $$NL{'name'} =~ /refresh/i
|
164
|
|
|
|
|
|
|
) ) {
|
165
|
|
|
|
|
|
|
|
166
|
1
|
|
|
|
|
11
|
my( $timeout, $url ) = split m{;\s*?URL=}i, $$NL{content},2;
|
167
|
1
|
|
|
|
|
3
|
my $base = $self->{_base};
|
168
|
1
|
50
|
|
|
|
4
|
$$NL{url} = URI->new_abs( $url, $base ) if $base;
|
169
|
1
|
50
|
|
|
|
5
|
$$NL{url} = $url unless exists $$NL{url};
|
170
|
1
|
50
|
|
|
|
6
|
$$NL{timeout} = $timeout if $timeout;
|
171
|
|
|
|
|
|
|
}
|
172
|
|
|
|
|
|
|
}
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
## In case we got nested tags
|
175
|
14
|
100
|
|
|
|
36
|
if(@TEXT) {
|
176
|
12
|
|
|
|
|
32
|
$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
|
177
|
|
|
|
|
|
|
}
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
## Text?
|
180
|
|
|
|
|
|
|
}elsif($T->[0] eq 'T' ) { # }elsif($T->is_text) {
|
181
|
26
|
100
|
|
|
|
75
|
$TEXT[-1]->{_TEXT} .= $T->[-2] if @TEXT; # $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT;
|
182
|
|
|
|
|
|
|
## Declaration?
|
183
|
|
|
|
|
|
|
}elsif($T->[0] eq 'D' ) { # }elsif($T->is_declaration) {
|
184
|
|
|
|
|
|
|
## We look at declarations, to get anly custom .dtd's (tis linky)
|
185
|
1
|
|
|
|
|
3
|
my $text = $T->[-1] ; # my $text = $T->as_is;
|
186
|
1
|
50
|
|
|
|
12
|
if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) {
|
187
|
1
|
|
|
|
|
9
|
$NL = { raw => $text, url => $1, tag => '!doctype' };
|
188
|
|
|
|
|
|
|
}
|
189
|
|
|
|
|
|
|
## End tag?
|
190
|
|
|
|
|
|
|
}elsif($T->[0] eq 'E' ){ # }elsif($T->is_end_tag){
|
191
|
|
|
|
|
|
|
## these be ignored (maybe not in between tags
|
192
|
|
|
|
|
|
|
## unless we're stacking (bug #5723)
|
193
|
9
|
50
|
33
|
|
|
49
|
if(@TEXT and exists $TAGS{$Tag}) {
|
194
|
9
|
|
|
|
|
21
|
$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
|
195
|
9
|
|
|
|
|
11
|
my $pop = pop @TEXT;
|
196
|
9
|
100
|
|
|
|
23
|
$TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
|
197
|
9
|
100
|
|
|
|
18
|
$pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
|
198
|
9
|
100
|
|
|
|
329
|
$self->{_cb}->($self, $pop) if exists $self->{_cb};
|
199
|
|
|
|
|
|
|
}
|
200
|
|
|
|
|
|
|
}
|
201
|
|
|
|
|
|
|
|
202
|
50
|
100
|
|
|
|
249
|
if(defined $NL) {
|
203
|
15
|
|
|
|
|
29
|
$$NL{tag} = $Tag;
|
204
|
|
|
|
|
|
|
|
205
|
15
|
|
|
|
|
23
|
my $base = $self->{_base};
|
206
|
|
|
|
|
|
|
|
207
|
15
|
|
|
|
|
53
|
for my $at( @VALID_URL_ATTRIBUTES ) {
|
208
|
240
|
100
|
|
|
|
891
|
if( exists $$NL{$at} ) {
|
209
|
13
|
100
|
|
|
|
181
|
$$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
|
210
|
|
|
|
|
|
|
}
|
211
|
|
|
|
|
|
|
}
|
212
|
|
|
|
|
|
|
|
213
|
15
|
100
|
|
|
|
49
|
if(exists $self->{_cb}) {
|
214
|
3
|
100
|
66
|
|
|
25
|
$self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
|
215
|
|
|
|
|
|
|
} else {
|
216
|
12
|
|
|
|
|
13
|
push @{$self->{_LINKS}}, $NL;
|
|
12
|
|
|
|
|
77
|
|
217
|
|
|
|
|
|
|
}
|
218
|
|
|
|
|
|
|
}
|
219
|
|
|
|
|
|
|
}## endof while (my $token = $p->get_token)
|
220
|
|
|
|
|
|
|
|
221
|
5
|
|
|
|
|
46
|
undef $self->{_tp};
|
222
|
5
|
|
|
|
|
10
|
return();
|
223
|
|
|
|
|
|
|
}
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub links {
|
226
|
5
|
|
|
5
|
1
|
52
|
my $self = shift;
|
227
|
|
|
|
|
|
|
## just like HTML::LinkExtor's
|
228
|
5
|
|
|
|
|
48
|
return $self->{_LINKS};
|
229
|
|
|
|
|
|
|
}
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub _stripHTML {
|
233
|
4
|
|
|
4
|
|
6
|
my $HtmlRef = shift;
|
234
|
4
|
|
|
|
|
11
|
my $tp = new HTML::TokeParser( $HtmlRef ); # my $tp = new HTML::TokeParser::Simple( $HtmlRef );
|
235
|
4
|
|
|
|
|
442
|
my $t = $tp->get_token(); # MUST BE A START TAG (@TAGS_IN_NEED)
|
236
|
|
|
|
|
|
|
# otherwise it ain't come from LinkExtractor
|
237
|
4
|
50
|
|
|
|
133
|
if($t->[0] eq 'S' ) { # if($t->is_start_tag) {
|
238
|
4
|
|
|
|
|
20
|
return $tp->get_trimmed_text( '/'.$t->[1] ); # return $tp->get_trimmed_text( '/'.$t->return_tag );
|
239
|
|
|
|
|
|
|
} else {
|
240
|
0
|
|
|
|
|
|
require Data::Dumper;
|
241
|
0
|
|
|
|
|
|
local $Data::Dumper::Indent=1;
|
242
|
0
|
|
|
|
|
|
die " IMPOSSIBLE!!!! ",
|
243
|
|
|
|
|
|
|
Data::Dumper::Dumper(
|
244
|
|
|
|
|
|
|
'$HtmlRef',$HtmlRef,
|
245
|
|
|
|
|
|
|
'$t', $t,
|
246
|
|
|
|
|
|
|
);
|
247
|
|
|
|
|
|
|
}
|
248
|
|
|
|
|
|
|
}
|
249
|
|
|
|
|
|
|
|
250
|
1
|
|
|
|
|
2
|
1;
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
package main;
|
253
|
|
|
|
|
|
|
|
254
|
1
|
50
|
|
|
|
4
|
unless(caller()) {
|
255
|
1
|
|
|
|
|
1496
|
require Data::Dumper;
|
256
|
1
|
50
|
|
|
|
8647
|
if(@ARGV) {
|
257
|
0
|
|
|
|
|
0
|
for my $file( @ARGV ) {
|
258
|
0
|
0
|
|
|
|
0
|
if( -e $file ) {
|
259
|
0
|
|
|
|
|
0
|
my $LX = new HTML::LinkExtractor( );
|
260
|
0
|
|
|
|
|
0
|
$LX->parse( $file );
|
261
|
0
|
|
|
|
|
0
|
print Data::Dumper::Dumper($LX->links);
|
262
|
0
|
|
|
|
|
0
|
undef $LX;
|
263
|
|
|
|
|
|
|
} else {
|
264
|
0
|
|
|
|
|
0
|
warn "The file `$file' doesn't exist\n";
|
265
|
|
|
|
|
|
|
}
|
266
|
|
|
|
|
|
|
}
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
} else {
|
269
|
|
|
|
|
|
|
|
270
|
1
|
|
|
|
|
4
|
my $INPUT = q{
|
271
|
|
|
|
|
|
|
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
1
|
274
|
|
|
|
|
|
|
2
|
275
|
|
|
|
|
|
|
3
|
276
|
|
|
|
|
|
|
4 Perlmonks.org
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
5
|
280
|
|
|
|
|
|
|
hello there
|
281
|
|
|
|
|
|
|
6
|
282
|
|
|
|
|
|
|
7 now
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
8 To be or not to be.
|
286
|
|
|
|
|
|
|
9
|
287
|
|
|
|
|
|
|
Just Another Perl Hacker,
|
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
};
|
290
|
|
|
|
|
|
|
|
291
|
1
|
|
|
|
|
13
|
my $LX = new HTML::LinkExtractor();
|
292
|
1
|
|
|
|
|
8
|
$LX->parse(\$INPUT);
|
293
|
|
|
|
|
|
|
|
294
|
1
|
|
|
|
|
2
|
print scalar(@{$LX->links()})." we GOT\n";
|
|
1
|
|
|
|
|
4
|
|
295
|
1
|
|
|
|
|
4
|
print Data::Dumper::Dumper( $LX->links() );
|
296
|
|
|
|
|
|
|
}
|
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
}
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
__END__
|