File Coverage

blib/lib/HTML/Grabber.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				package HTML::Grabber;
2				{
3				$HTML::Grabber::VERSION = '0.7';
4				}
5
6				# ABSTRACT: jQuery style DOM traversal/manipulation
7
8	1	1	24293	use strict;
	1		3
	1		34
9	1	1	5	use warnings;
	1		2
	1		26
10
11	1	1	1424	use Moose;
	0
	0
12				use HTML::Selector::XPath qw(selector_to_xpath);
13				use XML::LibXML qw(:libxml);
14
15				my $parser = XML::LibXML->new;
16				$parser->recover(1);
17
18				has nodes => (
19				traits => ['Array'],
20				isa => 'ArrayRef[XML::LibXML::Node]',
21				writer => '_nodes',
22				required => 1,
23				default => sub { [] },
24				handles => {
25				nodes => 'elements',
26				length => 'count',
27				},
28				);
29
30				=head1 NAME
31
32				HTML::Grabber
33
34				=head1 SYNOPSIS
35
36				use HTML::Grabber;
37				use LWP::Simple;
38
39				my $dom = HTML::Grabber->new( html => get('http://twitter.com/ned0r') );
40
41				$dom->find('.tweet-content')->each(sub {
42				my $body = $_->find('.tweet-text')->text;
43				my $when = $_->find('.js-tweet-timestamp')->attr('data-time');
44				my $link = $_->find('.js-permalink')->attr('href');
45				print "$body $when (link: $link)\n";
46				});
47
48				=head1 DESCRIPTION
49
50				HTML::Grabber provides a jQuery style interface to HTML documents. This makes
51				parsing and manipulating HTML documents trivially simple for those people
52				familiar with L<http://jquery.com>.
53
54				It uses L<XML::LibXML> for DOM parsing/manipulation and
55				L<HTML::Selector::XPath> for converting CSS expressions into XPath.
56
57				=head1 AUTHOR
58
59				Martyn Smith <martyn@dollyfish.net.nz>
60
61				=head1 SELECTORS
62
63				All selectors are CSS. They are internally converted to XPath using
64				L<HTML::Selector::XPath>. If some creative selector you're trying isn't working
65				as expected, it may well be worth checking out the documentation for that
66				module to see if it's supported.
67
68				=head1 METHODS
69
70				=head2 BUILD
71
72				=cut
73
74				sub BUILD {
75				my ($self, $args) = @_;
76
77				if ( exists $args->{html} ) {
78				my $dom = $parser->parse_html_string($args->{html}, { suppress_warnings => 1, suppress_errors => 1 });
79				$self->_nodes([$dom]);
80				}
81				}
82
83				=head2 find( $selector )
84
85				Get descendants of each element in the current set of matched elements,
86				filtered by a selector.
87
88				=cut
89				sub find {
90				my ($self, $selector) = @_;
91
92				my $xpath = selector_to_xpath($selector, root => './');
93
94				my @nodes;
95				foreach my $node ( $self->nodes ) {
96				push @nodes, $node->findnodes($xpath);
97				}
98				return $self->new(
99				nodes => [uniq(@nodes)],
100				);
101				}
102
103				=head2 prev( [ $selector ] )
104
105				Get the immediately preceding sibling of each element in the set of matched
106				elements, optionally filtered by a selector.
107
108				=cut
109				sub prev {
110				my ($self, $selector) = @_;
111
112				my @nodes;
113				foreach my $node ( $self->nodes ) {
114				my $prev = $node;
115				do {
116				$prev = $prev->previousSibling;
117				} while $prev and $prev->nodeType != XML_ELEMENT_NODE;
118				push @nodes, $prev if $prev;
119				}
120				my $return = $self->new(
121				nodes => [uniq(@nodes)],
122				);
123				return $return->filter($selector) if $selector;
124				return $return;
125				}
126
127				=head2 next( [ $selector ] )
128
129				Get the immediately preceding sibling of each element in the set of matched
130				elements, optionally filtered by a selector.
131
132				=cut
133				sub next {
134				my ($self, $selector) = @_;
135
136				my @nodes;
137				foreach my $node ( $self->nodes ) {
138				my $next = $node;
139				do {
140				$next = $next->nextSibling;
141				} while $next and $next->nodeType != XML_ELEMENT_NODE;
142				push @nodes, $next if $next;
143				}
144				my $return = $self->new(
145				nodes => [uniq(@nodes)],
146				);
147				return $return->filter($selector) if $selector;
148				return $return;
149				}
150
151				=head2 filter( $selector )
152
153				Reduce the set of matched elements to those that match the selector
154
155				=cut
156
157				sub filter {
158				my ($self, $selector) = @_;
159
160				my $xpath = selector_to_xpath($selector, root => '..');
161
162				my @nodes;
163
164				foreach my $node ( $self->nodes ) {
165				push @nodes, $node if grep { $node->isSameNode($_) } $node->findnodes($xpath);
166				}
167
168				return $self->new(
169				nodes => [uniq(@nodes)],
170				);
171				}
172
173				=head2 text_filter( $match )
174
175				Filter the current set of matched elements to those that contain the text
176				specified by $match. If you prefer, $match can also be a Regexp
177
178				=cut
179				sub text_filter {
180				my ($self, $match) = @_;
181
182				my $regexp = $match;
183				$regexp = qr/\Q$regexp\E/ unless UNIVERSAL::isa($regexp, 'Regexp');
184
185				my @nodes;
186				foreach my $node ( $self->nodes ) {
187				push @nodes, $node if $node->findvalue('.') =~ $match;
188				}
189				return $self->new(
190				nodes => [uniq(@nodes)],
191				);
192				}
193
194				=head2 parent()
195
196				Get the parent of each element in the current set of matched elements
197
198				=cut
199				sub parent {
200				my ($self) = @_;
201
202				my @nodes;
203				foreach my $node ( $self->nodes ) {
204				push @nodes, $node->parentNode if $node->parentNode;
205				}
206				return $self->new(
207				nodes => [uniq(@nodes)],
208				);
209				}
210
211				=head2 text()
212
213				Get the combined text contents of each element in the set of matched elements,
214				including their descendants.
215
216				=cut
217				sub text {
218				my ($self) = @_;
219
220				return join('', map { $_->findvalue('.') } shift->nodes);
221				}
222
223				=head2 text_array()
224
225				Return text for each element as a list
226
227				=cut
228				sub text_array {
229				my ($self) = @_;
230
231				return map { $_->findvalue('.') } shift->nodes;
232				}
233
234				=head2 html()
235
236				Return the HTML of the currently matched elements
237
238				=cut
239				sub html {
240				my ($self) = @_;
241
242				return join('', map { $_->toString } shift->nodes);
243				}
244
245				=head2 html_array()
246
247				Return the HTML each element as a list
248
249				=cut
250				sub html_array {
251				my ($self) = @_;
252
253				return map { $_->toString } shift->nodes;
254				}
255
256				=head2 remove()
257
258				Removes the matched nodes from the DOM tree returning them
259
260				=cut
261				sub remove {
262				my ($self) = @_;
263
264				foreach my $node ( $self->nodes ) {
265				next unless $node->parentNode;
266				$node->parentNode->removeChild($node);
267				}
268
269				return $self;
270				}
271
272				=head2 attr( $attribute )
273
274				Get the value of an attribute for the first element in the set of matched
275				elements.
276
277				=cut
278				sub attr {
279				my ($self, $attr) = @_;
280
281				my ($node) = $self->nodes;
282
283				return unless $node;
284
285				return $node->findvalue("./\@$attr");
286				}
287
288				=head2 each
289
290				Execute a sub for each matched node
291
292				=cut
293				sub each {
294				my ($self, $sub) = @_;
295
296				foreach my $node ( $self->nodes ) {
297				local $_ = $self->new(nodes => [$node]);
298				$sub->($_);
299				}
300				}
301
302				=head2 map
303
304				Execute a sub for each matched node returning a list containing the result of
305				each sub
306
307				=cut
308				sub map {
309				my ($self, $sub) = @_;
310
311				my @results;
312
313				foreach my $node ( $self->nodes ) {
314				local $_ = $self->new(nodes => [$node]);
315				push @results, $sub->($_);
316				}
317
318				return @results;
319				}
320
321				=head1 CLASS METHODS
322
323				=head2 uniq( @nodes )
324
325				Internal method for taking a list of L<XML::LibXML::Element>s and returning a
326				unique list
327
328				=cut
329
330				sub uniq {
331				my (@nodes) = @_;
332				my %seen;
333
334				my @unique;
335
336				foreach my $node ( @nodes ) {
337				push @unique, $node unless $seen{$node->nodePath}++;
338				}
339
340				return @unique;
341				}
342
343				__PACKAGE__->meta->make_immutable;
344
345				1;