File Coverage

blib/lib/HTML/Scrape.pm

Criterion	Covered	Total	%
statement	117	121	96.6
branch	43	48	89.5
condition	17	21	80.9
subroutine	13	13	100.0
pod	2	2	100.0
total	192	205	93.6

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Scrape;
2
3	3			3		210604	use 5.10.1;
	3					20
4	3			3		16	use strict;
	3					5
	3					55
5	3			3		12	use warnings;
	3					5
	3					174
6
7							=head1 NAME
8
9							HTML::Scrape - Helper functions for scraping text from HTML tags
10
11							=head1 VERSION
12
13							Version 0.3.0
14
15							=cut
16
17							our $VERSION = '0.3.0';
18
19							our $WARNINGS = 1;
20
21	3			3		1729	use HTML::Parser;
	3					17455
	3					102
22	3			3		1504	use HTML::TokeParser;
	3					12010
	3					91
23	3			3		21	use HTML::Tagset;
	3					6
	3					3186
24
25
26							=head1 SYNOPSIS
27
28							Handy helpers for common HTML scraping tasks.
29
30							use HTML::Scrape;
31
32							my $ids = HTML::Scrape::scrape_all_ids( $html );
33
34							=head1 WARNINGS
35
36							You can enable parsing warnings by setting C<$HTML::Scrape::WARNINGS>
37							to a true value. By default, no warnings are emitted.
38
39							=head1 NOTES FOR FUTURE DOCS
40
41							If a tag exists but has no content, including empty tags like C<< >>,
42							then it will have an empty string for content. This way you can test
43							for existence of these tags.
44
45							=head1 FUNCTIONS
46
47							=head2 scrape_id( $id, $html )
48
49							Scrapes the text of the single ID C<$id> from C<$html>.
50
51							=cut
52
53							sub scrape_id {
54	23			23	1	22772	my $id = shift;
55	23					41	my $html = shift;
56
57	23					45	my $all_ids = scrape_all_ids( $html, $id );
58
59	23					101	return $all_ids->{$id};
60							}
61
62
63							=head2 scrape_all_ids( $html [, $specific_id ] )
64
65							Parses the entire web page and returns all the text in a hashref keyed on ID.
66
67							If you pass in C<$specific_id>, then only that ID will be scraped,
68							and parsing will stop once it is found. The better way to do this is by
69							calling C.
70
71							=cut
72
73							sub scrape_all_ids {
74	47			47	1	12186	my $html = shift;
75	47					64	my $wanted_id = shift;
76
77	47					233	my $p = HTML::Parser->new(
78							start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ],
79							end_h => [ \&_parser_handle_end, 'self, tagname, line, column' ],
80							text_h => [ \&_parser_handle_text, 'self, dtext' ],
81							);
82	47					2771	$p->{stack} = [];
83	47					100	$p->{ids} = {};
84	47	100				114	if ( defined $wanted_id ) {
85	42					76	$p->{wanted_id} = $wanted_id;
86							}
87
88	47					161	$p->empty_element_tags(1);
89	47	50				325	$p->parse($html) if defined($html);
90	47					140	$p->eof;
91
92	47	100				92	if ( !defined $wanted_id ) {
93							# With a wanted_id, we would have stopped parsing early and left tags on the stack, so don't check.
94	5	100				7	if ( my $n = scalar @{$p->{stack}} ) {
	5					29
95	1					4	_warn( "$n tag(s) unclosed at end of document: " . join( ', ', map { $_->[0] } @{$p->{stack}} ) );
	6					26
	1					3
96							}
97							}
98
99	47					250	return $p->{ids};
100							}
101
102
103							sub _parser_handle_start {
104	624			624		1006	my $parser = shift;
105	624					768	my $tagname = shift;
106	624					720	my $attr = shift;
107	624					721	my $line = shift;
108	624					726	my $column = shift;
109
110	624					873	my $id = $attr->{id};
111
112	624	100				1491	if ( $HTML::Tagset::emptyElement{$tagname} ) {
113	68	100	100			206	if ( $tagname eq 'br' \|\| $tagname eq 'hr' ) {
114	48					71	_parser_handle_text( $parser, ' ' );
115							}
116
117	68	100				120	if ( $id ) {
118	14	100				26	if ( defined($parser->{wanted_id}) ) {
119	12	100				26	if ( $id eq $parser->{wanted_id} ) {
120	4					8	$parser->{ids}{$id} = '';
121	4					13	$parser->eof;
122	4					12	return;
123							}
124							}
125							else {
126	2					4	$parser->{ids}{$id} = '';
127							}
128							}
129
130	64					224	return;
131							}
132
133							# Add space if this tag is one that causes whitespace when rendered.
134	556	100	66			1708	if ( $tagname eq 'br' \|\| !$HTML::Tagset::isPhraseMarkup{$tagname} ) {
135	365					591	_parser_handle_text( $parser, ' ' );
136							}
137
138							# If it's a dupe ID, warn and ignore the ID.
139	556	50	66			1124	if ( defined($id) && exists $parser->{ids}{$id} ) {
140	0					0	_warn( "Duplicate ID $id found in <$tagname> at $line:$column" );
141	0					0	$id = undef;
142							}
143
144	556					741	my $stack = $parser->{stack};
145
146							# Tags like and that don't have to close themselves get closed another of them comes along.
147							# For example:
148							#
149							# whatever
150							# thingy
151							#
152	556	100	66			1085	if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) {
	134		100			469
153	36					50	my $item = pop @{$stack};
	36					48
154	36					65	_close_tag( $parser, $item );
155							}
156
157	556					865	push @{$stack}, [ $tagname, $id, '' ];
	556					1268
158
159	556					2108	return;
160							}
161
162
163							sub _parser_handle_end {
164	458			458		697	my $parser = shift;
165	458					571	my $tagname = shift;
166	458					555	my $line = shift;
167	458					523	my $column = shift;
168
169	458	100				898	return if $HTML::Tagset::emptyElement{$tagname};
170
171	436					540	my $stack = $parser->{stack};
172
173							# Deal with tags that close others. Implicitly close the previous tag if it's li, dt, dd or p.
174	436	50				480	if ( @{$stack} ) {
	436					802
175	436					549	my $previous_item = $stack->[-1];
176	436					527	my $previous_tagname = $previous_item->[0];
177
178							my $this_tag_closes_previous_one =
179							( $tagname ne $previous_tagname )
180							&&
181							(
182							( ($tagname eq 'ul' \|\| $tagname eq 'ol') && $previous_tagname eq 'li' )
183							\|\|
184							( ($tagname eq 'dl') && ($previous_tagname eq 'dt' \|\| $previous_tagname eq 'dd') )
185							\|\|
186	436		100			905	( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' )
187							)
188							;
189	436	100				693	if ( $this_tag_closes_previous_one ) {
190	14					21	_close_tag( $parser, pop @{$stack} );
	14					24
191							}
192							}
193
194	436	50				535	if ( !@{$stack} ) {
	436					773
195	0					0	_warn( "Unexpected closing at $line:$column" );
196	0					0	return;
197							}
198	436	100				762	if ( $tagname ne $stack->[-1][0] ) {
199	4					20	_warn( "Unexpected closing at $line:$column: Expecting [-1][0]>" );
200	4					13	return;
201							}
202
203	432					546	_close_tag( $parser, pop @{$stack} );
	432					869
204
205							# Add space if this tag is one that causes whitespace when rendered.
206	432	100	66			1444	if ( $tagname eq 'br' \|\| !$HTML::Tagset::isPhraseMarkup{$tagname} ) {
207	241					388	_parser_handle_text( $parser, ' ' );
208							}
209
210	432					1315	return;
211							}
212
213
214							sub _parser_handle_text {
215	1615			1615		2226	my $parser = shift;
216	1615					2041	my $text = shift;
217
218	1615					1867	for my $item ( @{$parser->{stack}} ) {
	1615					2694
219	6481	100				10570	if ( $item->[1] ) { # Only accumulate text for tags with IDs.
220	1356					2063	$item->[2] .= $text;
221							}
222							}
223
224	1615					4425	return;
225							}
226
227
228							sub _close_tag {
229	482			482		561	my $parser = shift;
230	482					547	my $item = shift;
231
232	482					538	my (undef, $id, $text) = @{$item};
	482					880
233	482	100				827	if ( defined $id ) {
234	137					185	my $keepit;
235
236	137	100				249	if ( defined $parser->{wanted_id} ) {
237							# We're searching for a specific ID.
238	109	100				181	if ( $id eq $parser->{wanted_id} ) {
239	34					51	$keepit = 1;
240	34					76	$parser->eof;
241							}
242							else {
243							# No need to keep the text of an ID we don't care about.
244							}
245							}
246							else {
247	28					38	$keepit = 1;
248							}
249
250	137	100				232	if ( $keepit ) {
251	62					225	$text =~ s/^\s+//;
252	62					327	$text =~ s/\s+$//;
253	62					481	$text =~ s/\s+/ /g;
254	62					159	$parser->{ids}{$id} = $text;
255							}
256							}
257
258	482					681	return;
259							}
260
261
262							sub _warn {
263	5	50		5		48	warn @_, "\n" if $WARNINGS;
264
265	5					28	return;
266							}
267
268
269							=head1 AUTHOR
270
271							Andy Lester, C<< >>
272
273							=head1 BUGS
274
275							Please report any bugs or feature requests at L..
276
277							=head1 SUPPORT
278
279							You can find documentation for this module with the perldoc command.
280
281							perldoc HTML::Scrape
282
283							You can also look for information at:
284
285							=over 4
286
287							=item * Search CPAN
288
289							L
290
291							=back
292
293							=head1 LICENSE AND COPYRIGHT
294
295							This software is Copyright (c) 2023 by Andy Lester.
296
297							This is free software, licensed under: The Artistic License 2.0 (GPL Compatible)
298
299							=cut
300
301							1; # End of HTML::Scrape