File Coverage

blib/lib/HTML/Selector/XPath.pm

Criterion	Covered	Total	%
statement	131	144	90.9
branch	113	130	86.9
condition	12	12	100.0
subroutine	13	14	92.8
pod	4	10	40.0
total	273	310	88.0

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Selector::XPath;
2
3	8			8		779057	use strict;
	8					85
	8					220
4	8			8		160	use 5.008_001;
	8					64
5							our $VERSION = '0.28';
6
7							require Exporter;
8							our @EXPORT_OK = qw(selector_to_xpath);
9							*import = \&Exporter::import;
10
11	8			8		66	use Carp;
	8					15
	8					23152
12
13							sub selector_to_xpath {
14	23			23	1	69185	__PACKAGE__->new(shift)->to_xpath(@_);
15							}
16
17							# XXX: Identifiers should also allow any characters U+00A0 and higher, and any
18							# escaped characters.
19							my $ident = qr/(?![0-9]\|-[-0-9])[-_a-zA-Z0-9]+/;
20
21							my $reg = {
22							# tag name/id/class
23							element => qr/^([#.]?)([^\s'"#.\/:@,=~>()\[\]\|])((\\|)([a-z0-9\\_-]*))?/i,
24							# attribute presence
25							attr1 => qr/^\[ \s* ($ident) \s* \]/x,
26							# attribute value match
27							attr2 => qr/^\[ \s* ($ident) \s*
28							( [~\|^\$!]? = ) \s
29							(?: ($ident) \| "([^"])" \| '([^'])') \s* \] /x,
30							badattr => qr/^\[/,
31							attrN => qr/^:not\(/i, # we chop off the closing parenthesis below in the code
32							pseudo => qr/^:([-\w]+\(?)/i,
33							# adjacency/direct descendance
34							combinator => qr/^(\s*[>+~\s](?!,))/i,
35							# rule separator
36							comma => qr/^\s,\s/i,
37							};
38
39							sub new {
40	184			184	1	344592	my($class, $exp) = @_;
41	184					672	bless { expression => $exp }, $class;
42							}
43
44							sub selector {
45	0			0	0	0	my $self = shift;
46	0	0				0	$self->{expression} = shift if @_;
47	0					0	$self->{expression};
48							}
49
50							sub convert_attribute_match {
51	30			30	0	131	my ($left,$op,$right) = @_;
52							# negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
53	30	100				153	if ($op eq '!=') {
		100
		100
		100
		100
		100
54	2					11	"\@$left!='$right'";
55							} elsif ($op eq '~=') { # substring attribute match
56	3					14	"contains(concat(' ', \@$left, ' '), ' $right ')";
57							} elsif ($op eq '*=') { # real substring attribute match
58	6					28	"contains(\@$left, '$right')";
59							} elsif ($op eq '\|=') {
60	3					20	"\@$left='$right' or starts-with(\@$left, '$right-')";
61							} elsif ($op eq '^=') {
62	4					23	"starts-with(\@$left,'$^N')";
63							} elsif ($op eq '$=') {
64	2					5	my $n = length($^N) - 1;
65	2					12	"substring(\@$left,string-length(\@$left)-$n)='$^N'";
66							} else { # exact match
67	10					45	"\@$left='$^N'";
68							}
69							}
70
71							sub _generate_child {
72	48			48		128	my ($direction,$name,$a,$b) = @_;
73	48	100				116	if ($a == 0) { # 0n+b
		100
74	32					53	$b--;
75	32					213	"[count($direction-sibling::$name) = $b and parent::*]"
76							} elsif ($a > 0) { # an + b
77	11					115	return "[not((count($direction-sibling::$name)+1)<$b) and ((count($direction-sibling::$name) + 1) - $b) mod $a = 0 and parent::*]"
78							} else { # -an + $b
79	5					20	$a = -$a;
80	5					57	return "[not((count($direction-sibling::$name)+1)>$b) and (($b - (count($direction-sibling::$name) + 1)) mod $a) = 0 and parent::*]"
81							}
82							}
83
84							sub nth_child {
85	32			32	0	66	my ($a,$b) = @_;
86	32	100				81	if (@_ == 1) {
87	15					35	($a,$b) = (0,$a);
88							}
89	32					71	_generate_child('preceding', '*', $a, $b);
90							}
91
92							sub nth_last_child {
93	11			11	0	27	my ($a,$b) = @_;
94	11	100				29	if (@_ == 1) {
95	7					17	($a,$b) = (0,$a);
96							}
97	11					27	_generate_child('following', '*', $a, $b);
98							}
99
100							# A hacky recursive descent
101							# Only descends for :not(...)
102							sub consume_An_plus_B {
103	25			25	0	50	my( $rrule ) = @_;
104
105	25					51	my( $A, $B );
106
107	25	100				229	if( $$rrule =~ s/^odd\s*\)// ) {
		100
		100
		50
108	1					4	($A,$B) = (2, 1)
109							} elsif( $$rrule =~ s/^even\s*\)// ) {
110	1					3	($A,$B) = (2, 0)
111							} elsif( $$rrule =~ s/^\s(-?\d+)\s\)// ) {
112	9					31	($A,$B) = (0, $1)
113							} elsif( $$rrule =~ s/^\s(-?\d)\sn\s(?:\+\s(\d+))?\s\)// ) {
114	14		100			82	($A,$B) = ($1, $2 \|\| 0);
115	14	50				84	if( ! defined $A ) {
		100
		100
116	0					0	$A = '0';
117							} elsif( $A eq '-') {
118	3					8	$A = '-1';
119							} elsif( $A eq '' ) {
120	1					2	$A = '1';
121							}
122							} else {
123	0					0	croak "Can't parse formula from '$$rrule'";
124							}
125
126	25					87	return ($A, $B);
127							}
128
129							sub consume {
130	190			190	0	427	my ($self, $rule, %parms) = @_;
131	190		100			668	my $root = $parms{root} \|\| '/';
132
133	190	100				474	return [$rule,''] if $rule =~ m!^/!; # If we start with a slash, we're already an XPath?!
134
135	189					505	my @parts = ("$root/");
136	189					284	my $last_rule = '';
137	189					304	my @next_parts;
138
139							my $wrote_tag;
140	189					290	my $root_index = 0; # points to the current root
141							# Loop through each "unit" of the rule
142	189		100			733	while (length $rule && $rule ne $last_rule) {
143	291					503	$last_rule = $rule;
144
145	291					1966	$rule =~ s/^\s\|\s$//g;
146	291	50				819	last unless length $rule;
147
148							# Prepend explicit first selector if we have an implicit selector
149							# (that is, if we start with a combinator)
150	291	100				1290	if ($rule =~ /$reg->{combinator}/) {
151	1					3	$rule = "* $rule";
152							}
153
154							# Match elements
155	291	50				1804	if ($rule =~ s/$reg->{element}//) {
156	291					980	my ($id_class,$name,$lang) = ($1,$2,$3);
157
158							# to add *[1]/self:: for follow-sibling
159	291	50				651	if (@next_parts) {
160	0					0	push @parts, @next_parts; #, (pop @parts);
161	0					0	@next_parts = ();
162							}
163
164	291	100	100			899	my $tag = $id_class eq '' ? $name \|\| '' : '';
165
166	291	100	100			808	if (defined $parms{prefix} and not $tag =~ /[*:\|]/) {
167	12					76	$tag = join ':', $parms{prefix}, $tag;
168							}
169
170	291	100				653	if (! $wrote_tag++) {
171	250					477	push @parts, $tag;
172							}
173
174							# XXX Shouldn't the RE allow both, ID and class?
175	291	100				768	if ($id_class eq '#') { # ID
		100
176	21					61	push @parts, "[\@id='$name']";
177							} elsif ($id_class eq '.') { # class
178	27					78	push @parts, "[contains(concat(' ', normalize-space(\@class), ' '), ' $name ')]";
179							};
180							};
181
182							# Match attribute selectors
183	291	100				2198	if ($rule =~ s/$reg->{attr2}//) {
		100
		100
184	23					74	push @parts, "[", convert_attribute_match( $1, $2, $^N ), "]";
185							} elsif ($rule =~ s/$reg->{attr1}//) {
186							# If we have no tag output yet, write the tag:
187	6	50				28	if (! $wrote_tag++) {
188	0					0	push @parts, '*';
189							}
190	6					20	push @parts, "[\@$1]";
191							} elsif ($rule =~ $reg->{badattr}) {
192	14					1247	Carp::croak "Invalid attribute-value selector '$rule'";
193							}
194
195							# Match negation
196	277	100				982	if ($rule =~ s/$reg->{attrN}//) {
197							# Now we parse the rest, and after parsing the subexpression
198							# has stopped, we must find a matching closing parenthesis:
199	14	100				138	if ($rule =~ s/$reg->{attr2}//) {
		100
		50
200	7					24	push @parts, "[not(", convert_attribute_match( $1, $2, $^N ), ")]";
201							} elsif ($rule =~ s/$reg->{attr1}//) {
202	1					6	push @parts, "[not(\@$1)]";
203							} elsif ($rule =~ /$reg->{badattr}/) {
204	0					0	Carp::croak "Invalid negated attribute-value selector ':not($rule)'";
205							} else {
206	6					75	my( $new_parts, $leftover ) = $self->consume( $rule, %parms );
207	6					17	shift @$new_parts; # remove '//'
208	6					22	my $xpath = join '', @$new_parts;
209
210	6					18	push @parts, "[not(self::$xpath)]";
211	6					17	$rule = $leftover;
212							}
213	14	50				87	$rule =~ s!^\s*\)!!
214							or die "Unbalanced parentheses at '$rule'";
215							}
216
217							# Ignore pseudoclasses/pseudoelements
218	277					1092	while ($rule =~ s/$reg->{pseudo}//) {
219	84	100				282	if ( my @expr = $self->parse_pseudo($1, \$rule) ) {
		100
		100
		100
		50
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
		50
220	3					48	push @parts, @expr;
221							} elsif ( $1 eq 'disabled') {
222	1					5	push @parts, '[@disabled]';
223							} elsif ( $1 eq 'checked') {
224	1					5	push @parts, '[@checked]';
225							} elsif ( $1 eq 'selected') {
226	1					5	push @parts, '[@selected]';
227							} elsif ( $1 eq 'text') {
228	0					0	push @parts, '*[@type="text"]';
229							} elsif ( $1 eq 'first-child') {
230							# Translates to :nth-child(1)
231	13					32	push @parts, nth_child(1);
232							} elsif ( $1 eq 'last-child') {
233	5					21	push @parts, nth_last_child(1);
234							} elsif ( $1 eq 'only-child') {
235	2					11	push @parts, nth_child(1), nth_last_child(1);
236							} elsif ($1 =~ /^lang\($/) {
237	11	50				59	$rule =~ s/\s([\w\-]+)\s\)//
238							or Carp::croak "Can't parse language part from $rule";
239	11					69	push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
240							} elsif ($1 =~ /^nth-child\(\s*$/) {
241	17					46	my( $A, $B ) = consume_An_plus_B(\$rule);
242	17					47	push @parts, nth_child($A, $B);
243							} elsif ($1 =~ /^nth-last-child\(\s*$/) {
244	4					13	my( $A, $B ) = consume_An_plus_B(\$rule);
245	4					17	push @parts, nth_last_child($A, $B);
246							} elsif ($1 =~ /^first-of-type\s*$/) {
247	1					3	my $type = $parts[-1];
248	1					3	push @parts, _generate_child('preceding', $type, 0, 1);
249
250							} elsif ($1 =~ /^nth-of-type\(\s*$/) {
251	4					14	my( $A, $B ) = consume_An_plus_B(\$rule);
252	4					11	my $type = $parts[-1];
253	4					15	push @parts, _generate_child('preceding', $type, $A, $B);
254
255							} elsif ($1 =~ /^last-of-type$/) {
256	2					16	push @parts, "[last()]";
257
258							# Err?! This one does not really exist in the CSS spec...
259							# Why did I add this?
260							} elsif ($1 =~ /^contains\($/) {
261	13	100				80	if( $rule =~ s/^\s"([^"])"\s*\)// ) {
		50
262	11					76	push @parts, qq{[text()[contains(string(.),"$1")]]};
263							} elsif( $rule =~ s/^\s'([^'])'\s*\)// ) {
264	2					13	push @parts, qq{[text()[contains(string(.),"$1")]]};
265							} else {
266	0					0	return( \@parts, $rule );
267							#die "Malformed string in :contains(): '$rule'";
268							};
269							} elsif ( $1 eq 'root') {
270							# This will give surprising results if you do E > F:root
271	3					14	$parts[$root_index] = $root;
272							} elsif ( $1 eq 'empty') {
273	3					14	push @parts, "[not(* or text())]";
274							} else {
275	0					0	Carp::croak "Can't translate '$1' pseudo-class";
276							}
277							}
278
279							# Match combinators (whitespace, >, + and ~)
280	277	100				971	if ($rule =~ s/$reg->{combinator}//) {
281	53					126	my $match = $1;
282	53	100				254	if ($match =~ />/) {
		100
		100
		50
283	13					54	push @parts, "/";
284							} elsif ($match =~ /\+/) {
285	5					12	push @parts, "/following-sibling::*[1]/self::";
286							} elsif ($match =~ /\~/) {
287	11					17	push @parts, "/following-sibling::";
288							} elsif ($match =~ /^\s*$/) {
289	24					50	push @parts, "//"
290							} else {
291	0					0	die "Weird combinator '$match'"
292							}
293
294							# new context
295	53					98	undef $wrote_tag;
296							}
297
298							# Match commas
299	277	100				1212	if ($rule =~ s/$reg->{comma}//) {
300	8					31	push @parts, " \| ", "$root/"; # ending one rule and beginning another
301	8					18	$root_index = $#parts;
302	8					33	undef $wrote_tag;
303							}
304							}
305	175					714	return \@parts, $rule
306							}
307
308							sub to_xpath {
309	184			184	1	573	my $self = shift;
310	184	50				562	my $rule = $self->{expression} or return;
311	184					341	my %parms = @_;
312
313	184					510	my($result,$leftover) = $self->consume( $rule, %parms );
314	170	50				385	$leftover
315							and die "Invalid rule, couldn't parse '$leftover'";
316	170					873	return join '', @$result;
317
318							}
319
320				81	1		sub parse_pseudo {
321							# nop
322							}
323
324							1;
325							__END__