File Coverage

blib/lib/HTML/LinkExtractor.pm

Criterion	Covered	Total	%
statement	111	122	90.9
branch	54	68	79.4
condition	8	17	47.0
subroutine	14	14	100.0
pod	4	4	100.0
total	191	225	84.8

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::LinkExtractor;
2
3	2			2		16030	use strict;
	2					4
	2					102
4
5	2			2		1883	use HTML::TokeParser 2; # use HTML::TokeParser::Simple 2;
	2					27086
	2					65
6	2			2		1713	use URI 1;
	2					17247
	2					107
7	2			2		22	use Carp qw( croak );
	2					4
	2					173
8
9	2			2		11	use vars qw( $VERSION );
	2					2
	2					118
10	1					4	$VERSION = '0.13';
11
12							## The html tags which might have URLs
13							# the master list of tagolas and required attributes (to constitute a link)
14	2			2		11	use vars qw( %TAGS );
	2					2
	2					318
15	1					76	%TAGS = (
16							a => [qw( href )],
17							applet => [qw( archive code codebase src )],
18							area => [qw( href )],
19							base => [qw( href )],
20							bgsound => [qw( src )],
21							blockquote => [qw( cite )],
22							body => [qw( background )],
23							del => [qw( cite )],
24							div => [qw( src )], # IE likes it, but don't know where it's documented
25							embed => [qw( pluginspage pluginurl src )],
26							form => [qw( action )],
27							frame => [qw( src longdesc )],
28							iframe => [qw( src )],
29							ilayer => [qw( background src )],
30							img => [qw( dynsrc longdesc lowsrc src usemap )],
31							input => [qw( dynsrc lowsrc src )],
32							ins => [qw( cite )],
33							isindex => [qw( action )], # real oddball
34							layer => [qw( src )],
35							link => [qw( src href )],
36							object => [qw( archive classid code codebase data usemap )],
37							q => [qw( cite )],
38							script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG!
39							sound => [qw( src )],
40							table => [qw( background )],
41							td => [qw( background )],
42							th => [qw( background )],
43							tr => [qw( background )],
44							## the exotic cases
45							meta => undef,
46							'!doctype' => [qw( url )], # is really a process instruction
47							);
48
49							## tags which contain <.*?> STUFF TO GET
50	2			2		10	use vars qw( @TAGS_IN_NEED );
	2					4
	2					425
51	1					6	@TAGS_IN_NEED = qw(
52							a
53							blockquote
54							del
55							ins
56							q
57							);
58
59	2			2		21	use vars qw( @VALID_URL_ATTRIBUTES );
	2					4
	2					5269
60	1					8	@VALID_URL_ATTRIBUTES = qw(
61							action
62							archive
63							background
64							cite
65							classid
66							code
67							codebase
68							data
69							dynsrc
70							href
71							longdesc
72							lowsrc
73							pluginspage
74							pluginurl
75							src
76							usemap
77							);
78
79
80							sub new {
81	5			5	1	813650	my($class, $cb, $base, $strip) = @_;
82	5					26	my $self = bless {}, $class;
83
84
85	5	100				44	$self->{_cb} = $cb if defined $cb;
86	5	100				49	$self->{_base} = URI->new($base) if defined $base;
87	5		100			10300	$self->strip( $strip \|\| 0 );
88
89	5					17	return $self;
90							}
91
92							sub strip {
93	17			17	1	762	my( $self, $on ) = @_;
94	17	100				75	return $self->{_strip} unless defined $on;
95	6	100				54	return $self->{_strip} = $on ? 1 : 0;
96							}
97
98							## $p= HTML::TokeParser->new($filename \|\| FILEHANDLE \|\|\$filecontents); # ## $p= HTML::TokeParser::Simple->new($filename \|\| FILEHANDLE \|\|\$filecontents);
99
100							sub parse {
101	5			5	1	728	my( $this, $hmmm ) = @_;
102	5					51	my $tp = new HTML::TokeParser( $hmmm ); # my $tp = new HTML::TokeParser::Simple( $hmmm );
103
104	5	50				914	unless($tp) {
105	0					0	croak qq[ Couldn't create a HTML::TokeParser object: $!]; # croak qq[ Couldn't create a HTML::TokeParser::Simple object: $!];
106							}
107
108	5					12	$this->{_tp} = $tp;
109
110	5					22	$this->_parsola();
111	5					36	return();
112							}
113
114							sub _parsola {
115	5			5		11	my $self = shift;
116
117							## a stack of links for keeping track of TEXT
118							## which is all of "text"
119	5					10	my @TEXT = ();
120	5					14	$self->{_LINKS} = [];
121
122
123							# ["S", $tag, $attr, $attrseq, $text]
124							# ["E", $tag, $text]
125							# ["T", $text, $is_data]
126							# ["C", $text]
127							# ["D", $text]
128							# ["PI", $token0, $text]
129
130	5					28	while (my $T = $self->{_tp}->get_token() ) {
131	51					683	my $NL; #NewLink
132	51					74	my $Tag = $T->[1]; # my $Tag = $T->return_tag;
133	51					62	my $got_TAGS_IN_NEED=0;
134							## Start tag?
135	51	100				137	if($T->[0] eq 'S' ) { # if($T->is_start_tag) {
		100
		100
		50
136	15	100				44	next unless exists $TAGS{$Tag};
137
138							## Do we have a tag for which we want to capture text?
139	14					42	$got_TAGS_IN_NEED = 0;
140	14					23	$got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;
	70					376
141
142							## then check to see if we got things besides META :)
143	14	100				44	if(defined $TAGS{ $Tag }) {
		50
144
145	13					16	for my $Btag(@{$TAGS{$Tag}}) {
	13					32
146							## and we check if they do have one with a value
147	25	100				72	if(exists $T->[2]->{ $Btag }) { # if(exists $T->return_attr()->{ $Btag }) {
148
149	13					16	$NL = $T->[2]; # $NL = $T->return_attr();
150							## TAGS_IN_NEED are tags in deed (start capturing the STUFF)
151	13	100				29	if($got_TAGS_IN_NEED) {
152	9					13	push @TEXT, $NL;
153	9					39	$NL->{_TEXT} = "";
154							}
155							}
156							}
157							}elsif($Tag eq 'meta') {
158	1					3	$NL = $T->[2]; # $NL = $T->return_attr();
159
160	1	50	33			21	if(defined $$NL{content} and length $$NL{content} and (
			33
			33
161							defined $$NL{'http-equiv'} && $$NL{'http-equiv'} =~ /refresh/i
162							or
163							defined $$NL{'name'} && $$NL{'name'} =~ /refresh/i
164							) ) {
165
166	1					11	my( $timeout, $url ) = split m{;\s*?URL=}i, $$NL{content},2;
167	1					3	my $base = $self->{_base};
168	1	50				4	$$NL{url} = URI->new_abs( $url, $base ) if $base;
169	1	50				5	$$NL{url} = $url unless exists $$NL{url};
170	1	50				6	$$NL{timeout} = $timeout if $timeout;
171							}
172							}
173
174							## In case we got nested tags
175	14	100				36	if(@TEXT) {
176	12					32	$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
177							}
178
179							## Text?
180							}elsif($T->[0] eq 'T' ) { # }elsif($T->is_text) {
181	26	100				75	$TEXT[-1]->{_TEXT} .= $T->[-2] if @TEXT; # $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT;
182							## Declaration?
183							}elsif($T->[0] eq 'D' ) { # }elsif($T->is_declaration) {
184							## We look at declarations, to get anly custom .dtd's (tis linky)
185	1					3	my $text = $T->[-1] ; # my $text = $T->as_is;
186	1	50				12	if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) {
187	1					9	$NL = { raw => $text, url => $1, tag => '!doctype' };
188							}
189							## End tag?
190							}elsif($T->[0] eq 'E' ){ # }elsif($T->is_end_tag){
191							## these be ignored (maybe not in between tags
192							## unless we're stacking (bug #5723)
193	9	50	33			49	if(@TEXT and exists $TAGS{$Tag}) {
194	9					21	$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
195	9					11	my $pop = pop @TEXT;
196	9	100				23	$TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
197	9	100				18	$pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
198	9	100				329	$self->{_cb}->($self, $pop) if exists $self->{_cb};
199							}
200							}
201
202	50	100				249	if(defined $NL) {
203	15					29	$$NL{tag} = $Tag;
204
205	15					23	my $base = $self->{_base};
206
207	15					53	for my $at( @VALID_URL_ATTRIBUTES ) {
208	240	100				891	if( exists $$NL{$at} ) {
209	13	100				181	$$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
210							}
211							}
212
213	15	100				49	if(exists $self->{_cb}) {
214	3	100	66			25	$self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
215							} else {
216	12					13	push @{$self->{_LINKS}}, $NL;
	12					77
217							}
218							}
219							}## endof while (my $token = $p->get_token)
220
221	5					46	undef $self->{_tp};
222	5					10	return();
223							}
224
225							sub links {
226	5			5	1	52	my $self = shift;
227							## just like HTML::LinkExtor's
228	5					48	return $self->{_LINKS};
229							}
230
231
232							sub _stripHTML {
233	4			4		6	my $HtmlRef = shift;
234	4					11	my $tp = new HTML::TokeParser( $HtmlRef ); # my $tp = new HTML::TokeParser::Simple( $HtmlRef );
235	4					442	my $t = $tp->get_token(); # MUST BE A START TAG (@TAGS_IN_NEED)
236							# otherwise it ain't come from LinkExtractor
237	4	50				133	if($t->[0] eq 'S' ) { # if($t->is_start_tag) {
238	4					20	return $tp->get_trimmed_text( '/'.$t->[1] ); # return $tp->get_trimmed_text( '/'.$t->return_tag );
239							} else {
240	0						require Data::Dumper;
241	0						local $Data::Dumper::Indent=1;
242	0						die " IMPOSSIBLE!!!! ",
243							Data::Dumper::Dumper(
244							'$HtmlRef',$HtmlRef,
245							'$t', $t,
246							);
247							}
248							}
249
250	1					2	1;
251
252							package main;
253
254	1	50				4	unless(caller()) {
255	1					1496	require Data::Dumper;
256	1	50				8647	if(@ARGV) {
257	0					0	for my $file( @ARGV ) {
258	0	0				0	if( -e $file ) {
259	0					0	my $LX = new HTML::LinkExtractor( );
260	0					0	$LX->parse( $file );
261	0					0	print Data::Dumper::Dumper($LX->links);
262	0					0	undef $LX;
263							} else {
264	0					0	warn "The file `$file' doesn't exist\n";
265							}
266							}
267
268							} else {
269
270	1					4	my $INPUT = q{
271							COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.
272
273							1
274							2
275							3
276							4 Perlmonks.org
277
278
279							5
280							hello there
281							6
282							7 now
283
284
285							8 To be or not to be.
286							9
287							Just Another Perl Hacker,
288
289							};
290
291	1					13	my $LX = new HTML::LinkExtractor();
292	1					8	$LX->parse(\$INPUT);
293
294	1					2	print scalar(@{$LX->links()})." we GOT\n";
	1					4
295	1					4	print Data::Dumper::Dumper( $LX->links() );
296							}
297
298							}
299
300							__END__