File Coverage

blib/lib/HTML/Selector/XPath.pm

Criterion	Covered	Total	%
statement	131	143	91.6
branch	112	128	87.5
condition	13	14	92.8
subroutine	13	14	92.8
pod	4	10	40.0
total	273	309	88.3

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Selector::XPath;
2
3	8			8		765475	use strict;
	8					86
	8					227
4	8			8		171	use 5.008_001;
	8					28
5							our $VERSION = '0.27';
6
7							require Exporter;
8							our @EXPORT_OK = qw(selector_to_xpath);
9							*import = \&Exporter::import;
10
11	8			8		70	use Carp;
	8					22
	8					23267
12
13							sub selector_to_xpath {
14	23			23	1	71376	__PACKAGE__->new(shift)->to_xpath(@_);
15							}
16
17							# XXX: Identifiers should also allow any characters U+00A0 and higher, and any
18							# escaped characters.
19							my $ident = qr/(?![0-9]\|-[-0-9])[-_a-zA-Z0-9]+/;
20
21							my $reg = {
22							# tag name/id/class
23							element => qr/^([#.]?)([^\s'"#.\/:@,=~>()\[\]\|])((\\|)([a-z0-9\\_-]*))?/i,
24							# attribute presence
25							attr1 => qr/^\[ \s* ($ident) \s* \]/x,
26							# attribute value match
27							attr2 => qr/^\[ \s* ($ident) \s*
28							( [~\|^\$!]? = ) \s
29							(?: ($ident) \| "([^"])" \| '([^'])') \s* \] /x,
30							badattr => qr/^\[/,
31							attrN => qr/^:not\(/i, # we chop off the closing parenthesis below in the code
32							pseudo => qr/^:([-\w]+\(?)/i,
33							# adjacency/direct descendance
34							combinator => qr/^(\s*[>+~\s](?!,))/i,
35							# rule separator
36							comma => qr/^\s,\s/i,
37							};
38
39							sub new {
40	184			184	1	346189	my($class, $exp) = @_;
41	184					720	bless { expression => $exp }, $class;
42							}
43
44							sub selector {
45	0			0	0	0	my $self = shift;
46	0	0				0	$self->{expression} = shift if @_;
47	0					0	$self->{expression};
48							}
49
50							sub convert_attribute_match {
51	30			30	0	131	my ($left,$op,$right) = @_;
52							# negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
53	30	100				167	if ($op eq '!=') {
		100
		100
		100
		100
		100
54	2					9	"\@$left!='$right'";
55							} elsif ($op eq '~=') { # substring attribute match
56	3					14	"contains(concat(' ', \@$left, ' '), ' $right ')";
57							} elsif ($op eq '*=') { # real substring attribute match
58	6					28	"contains(\@$left, '$right')";
59							} elsif ($op eq '\|=') {
60	3					20	"\@$left='$right' or starts-with(\@$left, '$right-')";
61							} elsif ($op eq '^=') {
62	4					18	"starts-with(\@$left,'$^N')";
63							} elsif ($op eq '$=') {
64	2					6	my $n = length($^N) - 1;
65	2					11	"substring(\@$left,string-length(\@$left)-$n)='$^N'";
66							} else { # exact match
67	10					44	"\@$left='$^N'";
68							}
69							}
70
71							sub _generate_child {
72	48			48		134	my ($direction,$name,$a,$b) = @_;
73	48	100				126	if ($a == 0) { # 0n+b
		100
74	32					66	$b--;
75	32					247	"[count($direction-sibling::$name) = $b and parent::*]"
76							} elsif ($a > 0) { # an + b
77	11					121	return "[not((count($direction-sibling::$name)+1)<$b) and ((count($direction-sibling::$name) + 1) - $b) mod $a = 0 and parent::*]"
78							} else { # -an + $b
79	5					17	$a = -$a;
80	5					55	return "[not((count($direction-sibling::$name)+1)>$b) and (($b - (count($direction-sibling::$name) + 1)) mod $a) = 0 and parent::*]"
81							}
82							}
83
84							sub nth_child {
85	32			32	0	73	my ($a,$b) = @_;
86	32	100				80	if (@_ == 1) {
87	15					36	($a,$b) = (0,$a);
88							}
89	32					83	_generate_child('preceding', '*', $a, $b);
90							}
91
92							sub nth_last_child {
93	11			11	0	32	my ($a,$b) = @_;
94	11	100				29	if (@_ == 1) {
95	7					31	($a,$b) = (0,$a);
96							}
97	11					68	_generate_child('following', '*', $a, $b);
98							}
99
100							# A hacky recursive descent
101							# Only descends for :not(...)
102							sub consume_An_plus_B {
103	25			25	0	47	my( $rrule ) = @_;
104
105	25					50	my( $A, $B );
106
107	25	100				235	if( $$rrule =~ s/^odd\s*\)// ) {
		100
		100
		50
108	1					3	($A,$B) = (2, 1)
109							} elsif( $$rrule =~ s/^even\s*\)// ) {
110	1					3	($A,$B) = (2, 0)
111							} elsif( $$rrule =~ s/^\s(-?\d+)\s\)// ) {
112	9					31	($A,$B) = (0, $1)
113							} elsif( $$rrule =~ s/^\s(-?\d)\sn\s(?:\+\s(\d+))?\s\)// ) {
114	14		50			85	($A,$B) = (($1 // '1'), $2 // 0);
			100
115	14	100				42	if( $A eq '-') {
		100
116	3					11	$A = '-1';
117							} elsif( $A eq '' ) {
118	1					3	$A = '1';
119							}
120							} else {
121	0					0	croak "Can't parse formula from '$$rrule'";
122							}
123
124	25					80	return ($A, $B);
125							}
126
127							sub consume {
128	190			190	0	419	my ($self, $rule, %parms) = @_;
129	190		100			699	my $root = $parms{root} \|\| '/';
130
131	190	100				466	return [$rule,''] if $rule =~ m!^/!; # If we start with a slash, we're already an XPath?!
132
133	189					503	my @parts = ("$root/");
134	189					287	my $last_rule = '';
135	189					277	my @next_parts;
136
137							my $wrote_tag;
138	189					272	my $root_index = 0; # points to the current root
139							# Loop through each "unit" of the rule
140	189		100			729	while (length $rule && $rule ne $last_rule) {
141	291					451	$last_rule = $rule;
142
143	291					2120	$rule =~ s/^\s\|\s$//g;
144	291	50				695	last unless length $rule;
145
146							# Prepend explicit first selector if we have an implicit selector
147							# (that is, if we start with a combinator)
148	291	100				1241	if ($rule =~ /$reg->{combinator}/) {
149	1					3	$rule = "* $rule";
150							}
151
152							# Match elements
153	291	50				1855	if ($rule =~ s/$reg->{element}//) {
154	291					987	my ($id_class,$name,$lang) = ($1,$2,$3);
155
156							# to add *[1]/self:: for follow-sibling
157	291	50				662	if (@next_parts) {
158	0					0	push @parts, @next_parts; #, (pop @parts);
159	0					0	@next_parts = ();
160							}
161
162	291	100	100			911	my $tag = $id_class eq '' ? $name \|\| '' : '';
163
164	291	100	100			793	if (defined $parms{prefix} and not $tag =~ /[*:\|]/) {
165	12					28	$tag = join ':', $parms{prefix}, $tag;
166							}
167
168	291	100				652	if (! $wrote_tag++) {
169	250					480	push @parts, $tag;
170							}
171
172							# XXX Shouldn't the RE allow both, ID and class?
173	291	100				850	if ($id_class eq '#') { # ID
		100
174	21					57	push @parts, "[\@id='$name']";
175							} elsif ($id_class eq '.') { # class
176	27					87	push @parts, "[contains(concat(' ', normalize-space(\@class), ' '), ' $name ')]";
177							};
178							};
179
180							# Match attribute selectors
181	291	100				2175	if ($rule =~ s/$reg->{attr2}//) {
		100
		100
182	23					73	push @parts, "[", convert_attribute_match( $1, $2, $^N ), "]";
183							} elsif ($rule =~ s/$reg->{attr1}//) {
184							# If we have no tag output yet, write the tag:
185	6	50				29	if (! $wrote_tag++) {
186	0					0	push @parts, '*';
187							}
188	6					23	push @parts, "[\@$1]";
189							} elsif ($rule =~ $reg->{badattr}) {
190	14					1161	Carp::croak "Invalid attribute-value selector '$rule'";
191							}
192
193							# Match negation
194	277	100				1026	if ($rule =~ s/$reg->{attrN}//) {
195							# Now we parse the rest, and after parsing the subexpression
196							# has stopped, we must find a matching closing parenthesis:
197	14	100				168	if ($rule =~ s/$reg->{attr2}//) {
		100
		50
198	7					26	push @parts, "[not(", convert_attribute_match( $1, $2, $^N ), ")]";
199							} elsif ($rule =~ s/$reg->{attr1}//) {
200	1					6	push @parts, "[not(\@$1)]";
201							} elsif ($rule =~ /$reg->{badattr}/) {
202	0					0	Carp::croak "Invalid negated attribute-value selector ':not($rule)'";
203							} else {
204	6					75	my( $new_parts, $leftover ) = $self->consume( $rule, %parms );
205	6					19	shift @$new_parts; # remove '//'
206	6					20	my $xpath = join '', @$new_parts;
207
208	6					24	push @parts, "[not(self::$xpath)]";
209	6					21	$rule = $leftover;
210							}
211	14	50				91	$rule =~ s!^\s*\)!!
212							or die "Unbalanced parentheses at '$rule'";
213							}
214
215							# Ignore pseudoclasses/pseudoelements
216	277					1095	while ($rule =~ s/$reg->{pseudo}//) {
217	84	100				264	if ( my @expr = $self->parse_pseudo($1, \$rule) ) {
		100
		100
		100
		50
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
		50
218	3					53	push @parts, @expr;
219							} elsif ( $1 eq 'disabled') {
220	1					5	push @parts, '[@disabled]';
221							} elsif ( $1 eq 'checked') {
222	1					6	push @parts, '[@checked]';
223							} elsif ( $1 eq 'selected') {
224	1					4	push @parts, '[@selected]';
225							} elsif ( $1 eq 'text') {
226	0					0	push @parts, '*[@type="text"]';
227							} elsif ( $1 eq 'first-child') {
228							# Translates to :nth-child(1)
229	13					40	push @parts, nth_child(1);
230							} elsif ( $1 eq 'last-child') {
231	5					25	push @parts, nth_last_child(1);
232							} elsif ( $1 eq 'only-child') {
233	2					10	push @parts, nth_child(1), nth_last_child(1);
234							} elsif ($1 =~ /^lang\($/) {
235	11	50				59	$rule =~ s/\s([\w\-]+)\s\)//
236							or Carp::croak "Can't parse language part from $rule";
237	11					69	push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
238							} elsif ($1 =~ /^nth-child\(\s*$/) {
239	17					46	my( $A, $B ) = consume_An_plus_B(\$rule);
240	17					46	push @parts, nth_child($A, $B);
241							} elsif ($1 =~ /^nth-last-child\(\s*$/) {
242	4					12	my( $A, $B ) = consume_An_plus_B(\$rule);
243	4					13	push @parts, nth_last_child($A, $B);
244							} elsif ($1 =~ /^first-of-type\s*$/) {
245	1					3	my $type = $parts[-1];
246	1					4	push @parts, _generate_child('preceding', $type, 0, 1);
247
248							} elsif ($1 =~ /^nth-of-type\(\s*$/) {
249	4					14	my( $A, $B ) = consume_An_plus_B(\$rule);
250	4					11	my $type = $parts[-1];
251	4					14	push @parts, _generate_child('preceding', $type, $A, $B);
252
253							} elsif ($1 =~ /^last-of-type$/) {
254	2					12	push @parts, "[last()]";
255
256							# Err?! This one does not really exist in the CSS spec...
257							# Why did I add this?
258							} elsif ($1 =~ /^contains\($/) {
259	13	100				86	if( $rule =~ s/^\s"([^"])"\s*\)// ) {
		50
260	11					80	push @parts, qq{[text()[contains(string(.),"$1")]]};
261							} elsif( $rule =~ s/^\s'([^'])'\s*\)// ) {
262	2					13	push @parts, qq{[text()[contains(string(.),"$1")]]};
263							} else {
264	0					0	return( \@parts, $rule );
265							#die "Malformed string in :contains(): '$rule'";
266							};
267							} elsif ( $1 eq 'root') {
268							# This will give surprising results if you do E > F:root
269	3					14	$parts[$root_index] = $root;
270							} elsif ( $1 eq 'empty') {
271	3					15	push @parts, "[not(* or text())]";
272							} else {
273	0					0	Carp::croak "Can't translate '$1' pseudo-class";
274							}
275							}
276
277							# Match combinators (whitespace, >, + and ~)
278	277	100				967	if ($rule =~ s/$reg->{combinator}//) {
279	53					134	my $match = $1;
280	53	100				266	if ($match =~ />/) {
		100
		100
		50
281	13					29	push @parts, "/";
282							} elsif ($match =~ /\+/) {
283	5					12	push @parts, "/following-sibling::*[1]/self::";
284							} elsif ($match =~ /\~/) {
285	11					26	push @parts, "/following-sibling::";
286							} elsif ($match =~ /^\s*$/) {
287	24					56	push @parts, "//"
288							} else {
289	0					0	die "Weird combinator '$match'"
290							}
291
292							# new context
293	53					97	undef $wrote_tag;
294							}
295
296							# Match commas
297	277	100				1280	if ($rule =~ s/$reg->{comma}//) {
298	8					32	push @parts, " \| ", "$root/"; # ending one rule and beginning another
299	8					20	$root_index = $#parts;
300	8					34	undef $wrote_tag;
301							}
302							}
303	175					682	return \@parts, $rule
304							}
305
306							sub to_xpath {
307	184			184	1	535	my $self = shift;
308	184	50				522	my $rule = $self->{expression} or return;
309	184					405	my %parms = @_;
310
311	184					526	my($result,$leftover) = $self->consume( $rule, %parms );
312	170	50				392	$leftover
313							and die "Invalid rule, couldn't parse '$leftover'";
314	170					916	return join '', @$result;
315
316							}
317
318				81	1		sub parse_pseudo {
319							# nop
320							}
321
322							1;
323							__END__