File Coverage

blib/lib/App/scrape.pm

Criterion	Covered	Total	%
statement	74	76	97.3
branch	22	24	91.6
condition	4	10	40.0
subroutine	6	6	100.0
pod	0	1	0.0
total	106	117	90.6

line	stmt	bran	cond	sub	pod	time	code
1							package App::scrape;
2	4			4		267420	use strict;
	4					35
	4					116
3	4			4		2250	use URI;
	4					27351
	4					123
4	4			4		2129	use HTML::TreeBuilder::XPath;
	4					258980
	4					35
5	4			4		2129	use HTML::Selector::XPath 'selector_to_xpath';
	4					10583
	4					263
6	4			4		27	use Exporter 'import';
	4					10
	4					2645
7
8							our $VERSION = '0.08';
9
10							our @EXPORT_OK = qw;
11
12							=head1 NAME
13
14							App::scrape - simple HTML scraping
15
16							=head1 ABSTRACT
17
18							This is a simple module to extract data from HTML by
19							specifying CSS3 or XPath selectors.
20
21							=head1 SYNOPSIS
22
23							use App::scrape 'scrape';
24							use LWP::Simple 'get';
25							use Data::Dumper;
26
27							my $html = get('http://perlmonks.org');
28							my @posts = scrape(
29							$html,
30							['a','a@href'],
31							{
32							absolute => [qw[href src rel]],
33							base => 'http://perlmonks.org',
34							},
35							);
36							print Dumper \@posts;
37
38							my @posts = scrape(
39							$html,
40							{
41							title => 'a',
42							url => 'a@href',
43							},
44							{
45							absolute => [qw[href src rel]],
46							base => 'http://perlmonks.org',
47							},
48							);
49							print Dumper \@posts;
50
51							=head1 DESCRIPTION
52
53							This module implements yet another scraping engine
54							to extract data from HTML.
55
56							This engine does not (yet) support nested data
57							structures. For an engine that supports nesting, see
58							L.
59
60							=cut
61
62							sub scrape {
63	6			6	0	8778	my ($html, $selectors, $options) = @_;
64
65	6		50			20	$options \|\|= {};
66	6					11	my $delete_tree;
67	6	100				18	if (! ref $options->{tree}) {
68	4					36	$options->{tree} = HTML::TreeBuilder::XPath->new;
69	4					1204	$options->{tree}->parse($html);
70	4					6961	$options->{tree}->eof;
71	4					678	$delete_tree = 1;
72							};
73	6					14	my $tree = $options->{tree};
74
75	6		50			37	$options->{make_uri} \|\|= {};
76	6					11	my %make_uri = %{$options->{make_uri}};
	6					18
77
78							# now fetch all "rows" from the page. We do this once to avoid
79							# fetching a page multiple times
80	6					12	my @rows;
81
82	6					21	my %known_uri = (
83							'href' => 1, # a@href
84							'src' => 1, # img@src , script@src
85							);
86
87	6					10	my @selectors;
88	6	100				20	if (ref $selectors eq 'ARRAY') {
89	4					10	@selectors = @$selectors
90							} else {
91	2					10	@selectors = map { $selectors->{ $_ } } sort keys %$selectors;
	2					6
92							};
93
94	6					10	my $rowidx=0;
95	6					9	my $found_max = 0;
96	6					13	for my $selector (@selectors) {
97	6					33	my ($attr);
98	6					12	my $s = $selector;
99	6	100				34	if ($selector =~ s!/?\@(\w+)$!!) {
100	3					10	$attr = $1;
101							};
102	6	100				26	if ($selector !~ m!^\.?/!) {
103	5					23	$selector = selector_to_xpath( $selector );
104							};
105							# We always make the selector relative to the current node:
106	6	100				578	$selector = ".$selector" unless $selector =~ /^\./;
107	6					13	my @nodes;
108	6	100				19	if (! defined $attr) {
109	3					20	@nodes = map { $_->as_trimmed_text } $tree->findnodes($selector);
	10					7090
110							} else {
111	3		33			43	$make_uri{ $rowidx } \|\|= (($known_uri{ lc $attr }) and ! $options->{no_known_uri});
			33
112	3					28	@nodes = $tree->findvalues("$selector/\@$attr");
113							};
114	6	100				6342	if ($make_uri{ $rowidx }) {
115	3					9	@nodes = map { URI->new_abs( $_, $options->{base} )->as_string } @nodes;
	7					18327
116							};
117	6	100				8527	if( $found_max < @nodes) {
118	5					9	$found_max = @nodes
119							};
120	6					14	push @rows, \@nodes;
121	6					17	$rowidx++;
122							};
123
124							# Now convert the result from rows to columns
125	6					31	my @result;
126	6					27	for my $idx (0.. $found_max-1) {
127							push @result, [ map {
128	14					28	$rows[$_]->[$idx]
	18					53
129							} 0..$#rows ];
130							};
131
132							# Now check what the user wants, array or hash:
133	6	100				35	if( ref $selectors eq 'HASH') {
134							@result = map {
135	2					5	my $arr = $_;
	4					6
136	4					7	my $i = 0;
137	4					11	my @keys = sort { $a cmp $b } keys( %$selectors );
	4					11
138							$_ = +{
139	4					7	map { $_ => $arr->[$i++] } @keys
	8					18
140							};
141							$_->{ $options->{url_field}} = $options->{base}
142	4	50				10	if( $options->{url_field} );
143	4					9	$_
144							} @result
145							} else {
146	4	50				14	if( $options->{url_field} ) {
147	0					0	for my $row (@result) {
148	0					0	unshift @$row, $options->{base};
149							};
150							};
151							};
152
153	6	100				35	$tree->delete
154							if $delete_tree;
155							@result
156	6					489	};
157
158							=head1 SEE ALSO
159
160							L - the scraper inspiring this module
161
162							=head1 REPOSITORY
163
164							The public repository of this module is
165							L.
166
167							=head1 SUPPORT
168
169							The public support forum of this program is
170							L.
171
172							=head1 AUTHOR
173
174							Max Maischein C
175
176							=head1 COPYRIGHT (c)
177
178							Copyright 2011-2011 by Max Maischein C.
179
180							=head1 LICENSE
181
182							This module is released under the same terms as Perl itself.
183
184							=cut