line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Scrape; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
140399
|
use 5.10.1; |
|
2
|
|
|
|
|
12
|
|
4
|
2
|
|
|
2
|
|
17
|
use strict; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
38
|
|
5
|
2
|
|
|
2
|
|
9
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
134
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTML::Scrape - Helper functions for common HTML scraping tasks
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.1.0 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.1.0'; |
18
|
|
|
|
|
|
|
|
19
|
2
|
|
|
2
|
|
1214
|
use HTML::Parser; |
|
2
|
|
|
|
|
11663
|
|
|
2
|
|
|
|
|
66
|
|
20
|
2
|
|
|
2
|
|
994
|
use HTML::Tagset; |
|
2
|
|
|
|
|
2670
|
|
|
2
|
|
|
|
|
1636
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
Handy helpers for common HTML scraping tasks. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
use HTML::Scrape; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
my $ids = HTML::Scrape::scrape_all_ids( $html ); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 FUNCTIONS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 scrape_all_ids( $html ) |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
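Parses the HTML document in C<$html> and returns a hashref keyed on element ID. Each value is the text contained in that element, with leading and trailing whitespace removed and internal runs of whitespace collapsed to a single space. Elements without an ID are not included, and the contents of C<script> and C<style> elements are ignored.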
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=cut |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub scrape_all_ids {
    my ($html) = @_;

    # Each handler receives the parser object itself ("self") so that it can
    # reach the per-parse state stored on the parser below.
    my %handlers = (
        start_h => [ \&_parser_handle_start, 'self, tagname, attr, line, column' ],
        end_h   => [ \&_parser_handle_end, 'self, tagname, line, column' ],
        text_h  => [ \&_parser_handle_text, 'self, dtext' ],
    );

    my $parser = HTML::Parser->new(
        %handlers,
        ignore_elements => [qw(script style)],
    );

    # Per-parse state:
    #   stack - currently open elements, each a [ tagname, id, text ] triple
    #   ids   - finished results, element ID => collected text
    $parser->{ids}   = {};
    $parser->{stack} = [];

    $parser->empty_element_tags(1);
    $parser->parse($html);
    $parser->eof;

    # Anything still on the stack was opened but never closed.
    my $unclosed = scalar @{ $parser->{stack} };
    warn "$unclosed tag(s) unclosed at end of document.\n" if $unclosed;

    return $parser->{ids};
}
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Start-tag handler for HTML::Parser.
#
# Arguments (per the start_h argspec): the parser object, the tag name, a
# hashref of the tag's attributes, and the line and column of the tag.
#
# Pushes a [ tagname, id, text ] triple onto $parser->{stack} for every
# element that can have content.  The id slot is undef when the tag has no
# id attribute or when its ID duplicates one already seen.
sub _parser_handle_start {
    my ( $parser, $tagname, $attr, $line, $column ) = @_;

    # Elements that HTML::Tagset lists as empty have no content to collect.
    return if $HTML::Tagset::emptyElement{$tagname};

    my $stack = $parser->{stack};

    # Tags like <p> and <li> that don't have to close themselves get closed
    # when another of them comes along.  For example:
    #
    #     <ul>
    #         <li>whatever
    #         <li>thingy
    #     </ul>
    #
    # This is done before the duplicate check below so that the ID of the
    # implicitly closed sibling is already recorded when we look for dupes.
    if ( $HTML::Tagset::optionalEndTag{$tagname} && @{$stack} && $stack->[-1][0] eq $tagname ) {
        _close_tag( $parser, pop @{$stack} );
    }

    my $id = $attr->{id};

    # If it's a dupe ID, warn and ignore the ID.  An ID is a dupe if it
    # belongs to an element that has already been closed (recorded in
    # $parser->{ids}) or to an element that is still open (on the stack).
    if ( defined $id ) {
        my $is_dupe = exists $parser->{ids}{$id}
            || grep { defined $_->[1] && $_->[1] eq $id } @{$stack};
        if ( $is_dupe ) {
            warn "Duplicate ID $id found in <$tagname> at $line:$column\n";
            $id = undef;
        }
    }

    push @{$stack}, [ $tagname, $id, '' ];

    return;
}
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# End-tag handler for HTML::Parser.
#
# Arguments (per the end_h argspec): the parser object, the tag name, and the
# line and column of the closing tag.
#
# Pops the matching element off $parser->{stack} and hands it to _close_tag().
# If the closing tag does not match the innermost open element, a warning is
# issued and the stack is left as it is.
sub _parser_handle_end {
    my ( $parser, $tagname, $line, $column ) = @_;

    # Empty elements are never pushed by the start handler, so there is
    # nothing to pop for them.
    return if $HTML::Tagset::emptyElement{$tagname};

    my $stack = $parser->{stack};

    # Deal with tags that close others: some closing tags implicitly end an
    # innermost element whose own end tag was omitted.
    if ( @{$stack} ) {
        my $previous_tagname = $stack->[-1][0];

        my $this_tag_closes_previous_one =
            ( $tagname ne $previous_tagname )
            &&
            (
                # </ul> or </ol> ends a dangling <li>.
                ( ($tagname eq 'ul' || $tagname eq 'ol') && $previous_tagname eq 'li' )
                ||
                # </dl> ends a dangling <dt> or <dd>.
                ( ($tagname eq 'dl') && ($previous_tagname eq 'dt' || $previous_tagname eq 'dd') )
                ||
                # Closing anything that is not phrase markup ends a dangling <p>.
                ( !$HTML::Tagset::isPhraseMarkup{$tagname} && $previous_tagname eq 'p' )
            )
        ;
        if ( $this_tag_closes_previous_one ) {
            _close_tag( $parser, pop @{$stack} );
        }
    }

    if ( !@{$stack} ) {
        warn "Unexpected closing </$tagname> at $line:$column\n";
        return;
    }

    my $expected_tagname = $stack->[-1][0];
    if ( $tagname ne $expected_tagname ) {
        warn "Unexpected closing </$tagname> at $line:$column: Expecting </$expected_tagname>\n";
        return;
    }

    _close_tag( $parser, pop @{$stack} );

    return;
}
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# Text handler for HTML::Parser.
#
# Arguments (per the text_h argspec): the parser object and the decoded text.
#
# Appends the text to every open element on $parser->{stack} that has an ID,
# so that an element's collected text includes the text of its descendants.
sub _parser_handle_text {
    my ( $parser, $text ) = @_;

    for my $item ( @{ $parser->{stack} } ) {
        # Only accumulate text for tags with IDs.  Test for definedness, not
        # truth, so that an ID such as "0" is treated the same way here as it
        # is in _close_tag(), which records every defined ID.
        next if !defined $item->[1];
        $item->[2] .= $text;
    }

    return;
}
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# Finish an element that has just been popped off the stack.
#
# $item is the element's [ tagname, id, text ] triple.  When the element has
# an ID, its text is whitespace-normalized and stored in $parser->{ids} under
# that ID; elements without an ID are discarded.
sub _close_tag {
    my ( $parser, $item ) = @_;

    my $id = $item->[1];
    return unless defined $id;

    # Work on a copy of the text: trim both ends, then squeeze inner runs of
    # whitespace down to a single space.
    my $text = $item->[2];
    for ($text) {
        s/^\s+//;
        s/\s+$//;
        s/\s+/ /g;
    }
    $parser->{ids}{$id} = $text;

    return;
}
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 AUTHOR |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Andy Lester, C<< >> |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head1 BUGS |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
Please report any bugs or feature requests at L.. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head1 SUPPORT |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
perldoc HTML::Scrape |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
You can also look for information at: |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=over 4 |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=item * Search CPAN |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
L |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=back |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This software is Copyright (c) 2023 by Andy Lester. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
This is free software, licensed under: The Artistic License 2.0 (GPL Compatible) |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=cut |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
1; # End of HTML::Scrape |