line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Scrape; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
214236
|
use 5.10.1; |
|
3
|
|
|
|
|
23
|
|
4
|
3
|
|
|
3
|
|
16
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
58
|
|
5
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
153
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTML::Scrape - Handy helpers for common HTML scraping tasks |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.2.0 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.2.0'; |
18
|
|
|
|
|
|
|
|
19
|
3
|
|
|
3
|
|
1830
|
use HTML::Parser; |
|
3
|
|
|
|
|
17500
|
|
|
3
|
|
|
|
|
112
|
|
20
|
3
|
|
|
3
|
|
3070
|
use HTML::TokeParser; |
|
3
|
|
|
|
|
13515
|
|
|
3
|
|
|
|
|
91
|
|
21
|
3
|
|
|
3
|
|
19
|
use HTML::Tagset; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
2641
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Handy helpers for common HTML scraping tasks. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use HTML::Scrape; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my $ids = HTML::Scrape::scrape_all_ids( $html ); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 FUNCTIONS |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head2 scrape_id( $id, $html ) |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
Scrapes the text of the single ID C<$id> from C<$html>. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=cut |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub scrape_id {
    # Arguments: the ID to look for, then the HTML text to search.
    my ($id, $html) = @_;

    # Handing the ID to scrape_all_ids() as its second argument asks it to
    # look for that one ID only, rather than collecting every ID.
    my $found = scrape_all_ids( $html, $id );

    # Yields the text stored under $id in the returned hashref.
    return $found->{$id};
}
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 scrape_all_ids( $html [, $specific_id ] ) |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Parses the entire web page and returns all the text in a hashref keyed on ID. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
If you pass in C<$specific_id>, then only that ID will be scraped, |
55
|
|
|
|
|
|
|
and parsing will stop once it is found. The better way to do this is by |
56
|
|
|
|
|
|
|
calling C<scrape_id()>. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=cut |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub scrape_all_ids {
    # Arguments: the HTML text, and optionally the single ID to look for.
    my ($html, $wanted_id) = @_;

    # Every handler receives the parser object first ("self"), followed by
    # the event fields named in its argspec string.
    my @handlers = (
        start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ],
        end_h   => [ \&_parser_handle_end,   'self, tagname, line, column' ],
        text_h  => [ \&_parser_handle_text,  'self, dtext' ],
    );
    my $p = HTML::Parser->new( @handlers );

    # Working state for the handlers is kept on the parser object itself:
    #   stack     - arrayref of [ tagname, id, text ] entries for open elements
    #   ids       - hashref of the results, keyed on ID
    #   wanted_id - only set when the caller asked for one specific ID
    $p->{stack}     = [];
    $p->{ids}       = {};
    $p->{wanted_id} = $wanted_id if defined $wanted_id;

    $p->empty_element_tags(1);
    $p->parse($html);
    $p->eof;

    my $ids = $p->{ids};

    # With a wanted_id, we would have stopped parsing early and left tags on
    # the stack, so the unclosed-tag check below would be meaningless.
    return $ids if defined $wanted_id;

    my $n = scalar @{$p->{stack}};
    warn "$n tag(s) unclosed at end of document.\n" if $n;

    return $ids;
}
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# Start-tag handler for the parser built in scrape_all_ids().
#
# Arguments: the parser object, the tag name, a hashref of the tag's
# attributes, and the line and column of the tag (used in warnings only).
#
# Pushes a [ tagname, id, text ] entry for the element onto $parser->{stack}.
# The id slot is undef when the element has no ID or when its ID is a
# duplicate.  Returns nothing.
sub _parser_handle_start {
    my ($parser, $tagname, $attr, $line, $column) = @_;

    # Elements that HTML::Tagset lists as empty are not tracked on the stack.
    return if $HTML::Tagset::emptyElement{$tagname};

    my $stack = $parser->{stack};
    my $id    = $attr->{id};

    # If it's a dupe ID, warn and ignore the ID.  An ID is a dupe when it has
    # already been recorded in {ids}, or when it belongs to an element that is
    # still open on the stack: IDs are only recorded once their tag closes, so
    # checking {ids} alone would miss an ID repeated inside its own element.
    if ( defined $id ) {
        my $already_recorded = exists $parser->{ids}{$id};
        my $still_open       = grep { defined $_->[1] && $_->[1] eq $id } @{$stack};

        if ( $already_recorded || $still_open ) {
            warn "Duplicate ID $id found in <$tagname> at $line:$column\n";
            $id = undef;
        }
    }

    # Tags whose end tag is optional (per HTML::Tagset) get closed when
    # another of the same tag comes along.  For example:
    #
    #     <ul>
    #         <li>whatever
    #         <li>thingy
    #     </ul>
    #
    # The second <li> closes the first one.
    if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) {
        _close_tag( $parser, pop @{$stack} );
    }

    # The third slot accumulates the element's text as it is parsed.
    push @{$stack}, [ $tagname, $id, '' ];

    return;
}
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# End-tag handler for the parser built in scrape_all_ids().
#
# Arguments: the parser object, the tag name being closed, and the line and
# column of the end tag (used in warnings only).
#
# Pops the matching entry off $parser->{stack} and hands it to _close_tag().
# Warns and leaves the stack alone if the end tag does not match the
# innermost open element.  Returns nothing.
sub _parser_handle_end {
    my ($parser, $tagname, $line, $column) = @_;

    # Elements that HTML::Tagset lists as empty are never put on the stack.
    return if $HTML::Tagset::emptyElement{$tagname};

    my $stack = $parser->{stack};

    # Deal with tags that close others: an end tag may implicitly close an
    # innermost open element whose own end tag was left out.
    if ( @{$stack} ) {
        my $previous_tagname = $stack->[-1][0];

        my $this_tag_closes_previous_one =
            ( $tagname ne $previous_tagname )
            &&
            (
                # </ul> or </ol> closes an open <li>.
                ( ($tagname eq 'ul' || $tagname eq 'ol') && $previous_tagname eq 'li' )
                ||
                # </dl> closes an open <dt> or <dd>.
                ( ($tagname eq 'dl') && ($previous_tagname eq 'dt' || $previous_tagname eq 'dd') )
                ||
                # Any end tag that is not phrase markup closes an open <p>.
                ( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' )
            )
        ;

        if ( $this_tag_closes_previous_one ) {
            _close_tag( $parser, pop @{$stack} );
        }
    }

    if ( !@{$stack} ) {
        warn "Unexpected closing </$tagname> at $line:$column\n";
        return;
    }

    my $expected_tagname = $stack->[-1][0];
    if ( $tagname ne $expected_tagname ) {
        warn "Unexpected closing </$tagname> at $line:$column: Expecting </$expected_tagname>\n";
        return;
    }

    _close_tag( $parser, pop @{$stack} );

    return;
}
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# Text handler for the parser built in scrape_all_ids().
#
# Arguments: the parser object and a chunk of text.
#
# Appends the text to every open element on $parser->{stack} that carries
# an ID, so each enclosing element accumulates the text of everything
# inside it.  Returns nothing.
sub _parser_handle_text {
    my ($parser, $text) = @_;

    for my $item ( @{$parser->{stack}} ) {
        # Only accumulate text for tags with IDs.  The test is for
        # definedness, matching _close_tag(), so that an ID that is false
        # as a string (such as "0") still collects its text.
        next unless defined $item->[1];

        $item->[2] .= $text;
    }

    return;
}
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Finishes off an element that has just been popped from the stack.
#
# Arguments: the parser object and the popped [ tagname, id, text ] entry.
#
# If the element has an ID worth keeping, its text is stored in
# $parser->{ids} under that ID, with leading and trailing whitespace removed
# and internal runs of whitespace collapsed to a single space.
# Returns nothing.
sub _close_tag {
    my ($parser, $item) = @_;

    my (undef, $id, $text) = @{$item};

    # Elements without an ID are of no interest.
    return unless defined $id;

    my $wanted_id = $parser->{wanted_id};
    if ( defined $wanted_id ) {
        # We're searching for a specific ID, so there is no need to keep
        # the text of any other one.
        return if $id ne $wanted_id;

        # This is the ID we were after: tell the parser to stop.
        $parser->eof;
    }

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    $parser->{ids}{$id} = $text;

    return;
}
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 AUTHOR |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Andy Lester, C<< >> |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head1 BUGS |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Please report any bugs or feature requests at L.. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head1 SUPPORT |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
perldoc HTML::Scrape |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
You can also look for information at: |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=over 4 |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=item * Search CPAN |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
L |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=back |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Andy Lester. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
This is free software, licensed under: The Artistic License 2.0 (GPL Compatible) |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
1; # End of HTML::Scrape |