line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Query; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.09'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
use Badger::Class |
6
|
11
|
|
|
|
|
274
|
version => $VERSION, |
7
|
|
|
|
|
|
|
debug => 0, |
8
|
|
|
|
|
|
|
base => 'Badger::Base', |
9
|
|
|
|
|
|
|
utils => 'blessed', |
10
|
|
|
|
|
|
|
import => 'class CLASS', |
11
|
|
|
|
|
|
|
vars => 'AUTOLOAD', |
12
|
|
|
|
|
|
|
constants => 'ARRAY', |
13
|
|
|
|
|
|
|
constant => { |
14
|
|
|
|
|
|
|
ELEMENT => 'HTML::Element', |
15
|
|
|
|
|
|
|
BUILDER => 'HTML::TreeBuilder', |
16
|
|
|
|
|
|
|
}, |
17
|
|
|
|
|
|
|
exports => { |
18
|
|
|
|
|
|
|
any => 'Query', |
19
|
|
|
|
|
|
|
hooks => { |
20
|
|
|
|
|
|
|
query => \&_export_query_to_element, |
21
|
|
|
|
|
|
|
}, |
22
|
|
|
|
|
|
|
}, |
23
|
|
|
|
|
|
|
messages => { |
24
|
|
|
|
|
|
|
no_elements => 'No elements specified to query', |
25
|
|
|
|
|
|
|
no_query => 'No query specified', |
26
|
|
|
|
|
|
|
no_source => 'No argument specified for source: %s', |
27
|
|
|
|
|
|
|
bad_element => 'Invalid element specified: %s', |
28
|
|
|
|
|
|
|
bad_source => 'Invalid source specified: %s', |
29
|
|
|
|
|
|
|
bad_query => 'Invalid query specified: %s', |
30
|
|
|
|
|
|
|
bad_spec => 'Invalid specification "%s" in query: %s', |
31
|
|
|
|
|
|
|
is_empty => 'The query does not contain any elements', |
32
|
11
|
|
|
11
|
|
1754546
|
}; |
|
11
|
|
|
|
|
22275
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Dispatch table used by new() for "type => source" argument pairs.  Each
# handler receives the source argument and returns the item(s) that new()
# should process in its place.
our $SOURCES = {
    # HTML markup held in a string: parse it with the BUILDER class
    text => sub {
        my ($markup) = @_;
        class(BUILDER)->load;
        return BUILDER->new_from_content($markup);
    },

    # name of a file containing HTML: parse it with the BUILDER class
    file => sub {
        my ($filename) = @_;
        class(BUILDER)->load;
        return BUILDER->new_from_file($filename);
    },

    # an already-built tree is passed through untouched
    tree => sub {
        return $_[0];
    },

    # a plain array reference is flattened into its members; anything
    # else is handed back as-is for new() to examine
    query => sub {
        my ($source) = @_;
        return @{$source} if ref($source) eq ARRAY;
        return $source;
    },
};
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Functional constructor: passes every argument straight to new() on the
# CLASS constant and returns whatever new() returns.
sub Query (@) {
    return CLASS->new(@_);
}
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# Constructor.  The argument list may contain, in any order:
#
#   * objects of the class named by the ELEMENT constant,
#   * objects of this class, whose elements are copied into the new query,
#   * "type => source" pairs, where type is a key of $SOURCES,
#
# or a single array reference holding such a list.  A trailing plain string
# that follows at least one collected element is treated as a selector, in
# which case the result of query($selector) is returned instead of the new
# object.  Unknown source types and unusable arguments are reported through
# error_msg().
sub new {
    my $invocant = shift;

    # a lone array reference stands in for the argument list it contains
    if (@_ == 1 && ref($_[0]) eq ARRAY) {
        my $list = shift @_;
        unshift @_, @{$list};
    }

    # usable as both a class method and an object method
    my $class = ref($invocant) || $invocant;

    my @elements;
    my $self = {
        error           => undef,
        suppress_errors => undef,
        match_self      => undef,
        elements        => \@elements,
        specificity     => {},
    };

    # each element should be an HTML::Element object, although we might
    # want to subclass this module to recognise a different kind of object,
    # so we get the element class from the ELEMENT constant method which a
    # subclass can re-define.
    my $element_class = $class->ELEMENT;

    my $select;

    ARGUMENT: while (@_) {
        my $element = shift;
        $class->debug("argument: ".$element) if DEBUG;

        if (! ref $element) {
            # a non-reference item is a source type (text, file, tree)
            # followed by the source, or if it's the last argument following
            # one or more element options or named argument pairs then it's
            # a selection query
            if (@_) {
                my $type    = $element;
                my $handler = $SOURCES->{ $type };
                return $class->error_msg( bad_source => $type )
                    unless $handler;

                my $source = shift;
                $class->debug("source $type: $source") if DEBUG;

                # whatever the handler produces is examined on the next pass
                unshift @_, $handler->($source);
                next ARGUMENT;
            }

            if (@elements) {
                $select = $element;
                last ARGUMENT;
            }
        }
        elsif (blessed $element) {
            # otherwise it should be an HTML::Element object or another
            # HTML::Query object
            if ($element->isa($element_class)) {
                push @elements, $element;
                next ARGUMENT;
            }

            if ($element->isa($class)) {
                push @elements, @{ $element->get_elements };
                next ARGUMENT;
            }
        }

        # nothing above claimed the argument
        return $class->error_msg( bad_element => $element );
    }

    bless $self, $class;

    return $self unless defined $select;
    return $self->query($select);
}
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Apply a CSS-style selector to the elements held by this query.
#
# $query is a selector string made of one or more comma separated paths.
# Each path is a sequence of steps; a step is an optional tag name (or '*')
# followed by any number of #id, .class, [attr], [attr=value],
# [attr~=value], [attr|=value], :first-child and :last-child parts.  Steps
# are joined by whitespace (descendant), '>' (child), '+' (adjacent sibling)
# or ' * ' (grandchild or deeper).
#
# In list context the matched elements are returned; otherwise the object
# built from them by _new_match_self() is returned.  A missing or empty
# query is reported through error_msg(); selector problems are reported
# through _report_error().  The specificity of every parsed path is recorded
# via _add_specificity().
sub query {
    my ($self, $query) = @_;
    my @result;
    my $ops = 0;    # steps applied over the whole query
    my $pos = 0;    # offset in $query where the current step started

    $self->{error} = undef;

    return $self->error_msg('no_query')
        unless defined $query && length $query;

    # multiple specs can be comma separated, e.g. "table tr td, li a, div.foo"
    COMMA: while (1) {
        # each comma-separated traversal spec is applied downward from
        # the source elements in the $self->{elements} query
        my @elements    = @{$self->get_elements};
        my $comops      = 0;    # steps applied within this comma section
        my $specificity = 0;
        my $startpos    = pos($query) || 0;

        my $hack_sequence = 0; # look for '* html'

        warn "Starting new COMMA" if DEBUG;

        # for each whitespace delimited descendant spec we grok the correct
        # parameters for look_down() and apply them to each source element
        # e.g. "table tr td"
        SEQUENCE: while (1) {
            my @args;
            $pos = pos($query) || 0;
            my $relationship = '';
            my $leading_whitespace;

            warn "Starting new SEQUENCE" if DEBUG;

            # ignore any leading whitespace, but remember that we saw some
            if ($query =~ / \G \s+ /cgsx) {
                $leading_whitespace = 1;
                warn "removing leading whitespace\n" if DEBUG;
            }

            # grandchild selector is whitespace sensitive, requires leading whitespace
            if ($leading_whitespace && $comops && ($query =~ / \G (\*) \s+ /cgx)) {
                $relationship = $1;
                warn "relationship = $relationship\n" if DEBUG;
            }

            # get other relationship modifiers
            if ($query =~ / \G (>|\+) \s* /cgx) {
                $relationship = $1;
                warn "relationship = $relationship\n" if DEBUG;

                # can't have a relationship modifier as the first part of the query
                if (!$comops) {
                    return $self->_report_error( $self->message( bad_spec => $relationship, $query ) );
                }
            }

            # optional leading word is a tag name
            if ($query =~ / \G ([\w\*]+) /cgx) {
                my $tag = $1;

                if ($tag =~ m/\*/) {
                    if (($leading_whitespace || $comops == 0) && ($tag eq '*')) {
                        warn "universal tag\n" if DEBUG;
                        push(@args, _tag => qr/\w+/);

                        if ($comops == 0) { #we need to catch the case where we see '* html'
                            $hack_sequence++;
                        }
                    }
                    else {
                        return $self->_report_error( $self->message( bad_spec => $tag, $query ) );
                    }
                }
                else {
                    warn "html tag\n" if DEBUG;
                    $specificity += 1; # standard tags are worth 1 point
                    push( @args, _tag => $tag );

                    if ($comops == 1 && $tag eq 'html') {
                        $hack_sequence++;
                    }
                }
            }

            # loop to collect a description about this specific part of the rule
            PART: while (1) {
                my $work = scalar @args;

                # that can be followed by (or the query can start with) a #id
                if ($query =~ / \G \# ([\w\-]+) /cgx) {
                    $specificity += 100;
                    push( @args, id => $1 );
                }

                # and/or a .class
                if ($query =~ / \G \. ([\w\-]+) /cgx) {
                    $specificity += 10;
                    push( @args, class => qr/ (^|\s+) $1 ($|\s+) /x );
                }

                # and/or none or more [ ] attribute specs
                if ($query =~ / \G \[ (.*?) \] /cgx) {
                    my $attribute = $1;
                    $specificity += 10;

                    #if we have an operator
                    if ($attribute =~ m/(.*?)\s*([\|\~]?=)\s*(.*)/) {
                        my ($name,$attribute_op,$value) = ($1,$2,$3);

                        unless (defined($name) && length($name)) {
                            return $self->_report_error( $self->message( bad_spec => $name, $query ) );
                        }

                        warn "operator $attribute_op" if DEBUG;

                        if (defined $value) {
                            # strip one optional quote character from each end
                            for ($value) {
                                s/^['"]//;
                                s/['"]$//;
                            }

                            if ($attribute_op eq '=') {
                                push( @args, $name => $value);
                            }
                            elsif ($attribute_op eq '|=') {
                                # the value is literal text, so quote it
                                # before it goes into a pattern
                                push(@args, $name => qr/\b\Q$value\E-?/)
                            }
                            elsif ($attribute_op eq '~=') {
                                # the value is literal text, so quote it
                                # before it goes into a pattern
                                push(@args, $name => qr/\b\Q$value\E\b/)
                            }
                            else {
                                return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
                            }
                        }
                        else {
                            return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
                        }
                    }
                    else {
                        unless (defined($attribute) && length($attribute)) {
                            return $self->_report_error( $self->message( bad_spec => $attribute, $query ) );
                        }

                        # add a regex to match anything (or nothing)
                        push( @args, $attribute => qr/.*/ );
                    }
                }

                # and/or one or more pseudo-classes
                if ($query =~ / \G : :? ([\w\-]+) /cgx) {
                    my $pseudoclass = $1;
                    $specificity += 10;

                    if ($pseudoclass eq 'first-child') {
                        # no element (non-text) siblings to the left
                        push( @args, sub { ! grep { ref $_ } $_[0]->left() } );
                    } elsif ($pseudoclass eq 'last-child') {
                        # no element (non-text) siblings to the right
                        push( @args, sub { ! grep { ref $_ } $_[0]->right() } );
                    } else {
                        warn "Pseudoclass :$pseudoclass not supported";
                        next PART;
                    }
                }

                # keep going until this particular expression is fully processed
                last PART unless scalar(@args) > $work;
            }

            # we must have something in @args by now or we didn't find any
            # valid query specification this time around
            last SEQUENCE unless @args;

            $self->debug(
                'Parsed ', substr($query, $pos, pos($query) - $pos),
                ' into args [', join(', ', @args), ']'
            ) if DEBUG;

            # we want to skip certain hack sequences like '* html'
            if ($hack_sequence == 2) {
                # clear out our stored elements to match behaviour of modern
                # browsers; this must be an empty list, not a list holding
                # an empty array reference
                @elements = ();
            }
            # we're just looking for any descendent
            elsif( !$relationship ) {
                if ($self->{match_self}) {
                    # if we are re-querying, be sure to match ourselves not just descendents
                    @elements = map { $_->look_down(@args) } @elements;
                } else {
                    # look_down() will match self in addition to descendents,
                    # so we explicitly disallow matches on self as we iterate
                    # thru the list.  The other cases below already exclude self.
                    # https://rt.cpan.org/Public/Bug/Display.html?id=58918
                    my @accumulator;
                    foreach my $e (@elements) {
                        if ($e->root() == $e) {
                            push(@accumulator, $e->look_down(@args));
                        }
                        else {
                            push(@accumulator, grep { $_ != $e } $e->look_down(@args));
                        }
                    }
                    @elements = @accumulator;
                }
            }
            # immediate child selector
            elsif( $relationship eq '>' ) {
                @elements = map {
                    # pin the context element in a lexical so the callback
                    # does not depend on the global $_
                    my $root = $_;
                    $root->look_down(
                        @args,
                        sub {
                            my $tag = shift;
                            return $tag->depth == $root->depth + 1;
                        }
                    )
                } @elements;
            }
            # immediate sibling selector
            elsif( $relationship eq '+' ) {
                @elements = map {
                    my $root   = $_;
                    my $parent = $root->parent;

                    # an element without a parent has no siblings at all
                    $parent
                        ? $parent->look_down(
                            @args,
                            sub {
                                my $tag = shift;

                                # the nearest preceding non-text sibling
                                # must be the context element
                                foreach my $sibling (reverse $tag->left) {
                                    next unless ref $sibling;
                                    return $sibling == $root;
                                }

                                # no element precedes this one
                                return 0;
                            }
                        )
                        : ();
                } @elements;
            }
            # grandchild selector
            elsif( $relationship eq '*' ) {
                @elements = map {
                    my $root = $_;
                    $root->look_down(
                        @args,
                        sub {
                            my $tag = shift;
                            return $tag->depth > $root->depth + 1;
                        }
                    )
                } @elements;
            }

            # so we can check we've done something
            $comops++;

            # dedup the results we've gotten
            @elements = $self->_dedup(\@elements);

            if (DEBUG) {
                warn $_->as_HTML for @elements;
            }
        }

        if ($comops) {
            $self->debug(
                'Added', scalar(@elements), ' elements to results'
            ) if DEBUG;

            # the text of this comma section is the key its specificity
            # is stored under
            my $selector = substr ($query,$startpos, $pos - $startpos);
            $self->_add_specificity($selector,$specificity);

            #add in the recent pass
            push(@result,@elements);

            # dedup the results across the result sets, necessary for comma based selectors
            @result = $self->_dedup(\@result);

            # sort the result set...
            @result = sort _by_address @result;

            # update op counter for complete query to include ops performed
            # in this fragment
            $ops += $comops;
        }
        else {
            # looks like we got an empty comma section, e.g. : ",x, ,y,"
            # so we'll ignore it
        }

        last COMMA unless $query =~ / \G \s*,\s* /cgsx;
    }

    # check for any trailing text in the query that we couldn't parse
    if ($query =~ / \G (.+?) \s* $ /cgsx) {
        return $self->_report_error( $self->message( bad_spec => $1, $query ) );
    }

    # check that we performed at least one query operation
    unless ($ops) {
        return $self->_report_error( $self->message( bad_query => $query ) );
    }

    return wantarray ? @result : $self->_new_match_self(@result);
}
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
# return elements stored from last query |
427
|
|
|
|
|
|
|
# Accessor for the stored elements: the elements themselves in list
# context, the stored array reference (not a copy) in any other context.
sub get_elements {
    my ($self) = @_;

    return @{ $self->{elements} } if wantarray;
    return $self->{elements};
}
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
########################################################################################################### |
434
|
|
|
|
|
|
|
# from CSS spec at http://www.w3.org/TR/CSS21/cascade.html#specificity |
435
|
|
|
|
|
|
|
########################################################################################################### |
436
|
|
|
|
|
|
|
# A selector's specificity is calculated as follows: |
437
|
|
|
|
|
|
|
# |
438
|
|
|
|
|
|
|
# * count the number of ID attributes in the selector (= a) |
439
|
|
|
|
|
|
|
# * count the number of other attributes and pseudo-classes in the selector (= b) |
440
|
|
|
|
|
|
|
# * count the number of element names in the selector (= c) |
441
|
|
|
|
|
|
|
# * ignore pseudo-elements. |
442
|
|
|
|
|
|
|
# |
443
|
|
|
|
|
|
|
# Concatenating the three numbers a-b-c (in a number system with a large base) gives the specificity. |
444
|
|
|
|
|
|
|
# |
445
|
|
|
|
|
|
|
# Example(s): |
446
|
|
|
|
|
|
|
# |
447
|
|
|
|
|
|
|
# Some examples: |
448
|
|
|
|
|
|
|
# |
449
|
|
|
|
|
|
|
# * {} /* a=0 b=0 c=0 -> specificity = 0 */ |
450
|
|
|
|
|
|
|
# LI {} /* a=0 b=0 c=1 -> specificity = 1 */ |
451
|
|
|
|
|
|
|
# UL LI {} /* a=0 b=0 c=2 -> specificity = 2 */ |
452
|
|
|
|
|
|
|
# UL OL+LI {} /* a=0 b=0 c=3 -> specificity = 3 */ |
453
|
|
|
|
|
|
|
# H1 + *[REL=up]{} /* a=0 b=1 c=1 -> specificity = 11 */ |
454
|
|
|
|
|
|
|
# UL OL LI.red {} /* a=0 b=1 c=3 -> specificity = 13 */ |
455
|
|
|
|
|
|
|
# LI.red.level {} /* a=0 b=2 c=1 -> specificity = 21 */ |
456
|
|
|
|
|
|
|
# #x34y {} /* a=1 b=0 c=0 -> specificity = 100 */ |
457
|
|
|
|
|
|
|
########################################################################################################### |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
# calculate and return the specificity for the provided selector |
460
|
|
|
|
|
|
|
# Look up the specificity recorded for $selector, computing it first when
# no entry is held yet.
sub get_specificity {
    my ($self, $selector) = @_;

    if (! exists $self->{specificity}{$selector}) {
        # Running the selector over a large tree just to score it would be
        # wasteful, so the element list is emptied for the duration of the
        # parse; query() records the score as it goes.
        local $self->{elements} = [];
        $self->query($selector);
    }

    return $self->{specificity}{$selector};
}
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
# Combined getter/setter for the suppress_errors flag.  A defined argument
# replaces the stored setting; the current setting is always returned.
sub suppress_errors {
    my ($self, $setting) = @_;

    $self->{suppress_errors} = $setting if defined $setting;

    return $self->{suppress_errors};
}
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
# Return the message held in the object's error slot (undef when empty).
sub get_error {
    my $self = shift;

    return $self->{error};
}
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# Return the query's elements: as a list in list context, otherwise as a
# fresh unblessed array reference holding a copy of them.
#
# The object is a blessed hash whose elements live under the 'elements'
# key, so that is what gets dereferenced here; treating the object itself
# as an array reference cannot work.
sub list {
    my ($self) = @_;

    my @items = @{ $self->{elements} || [] };

    return wantarray ? @items : \@items;
}
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
# Number of elements currently held by the query.
sub size {
    my ($self) = @_;

    my $elements = $self->get_elements;

    return scalar @{$elements};
}
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
# Return the first element of the query, or report 'is_empty' through
# error_msg() when the query holds no elements.
sub first {
    my ($self) = @_;

    return $self->error_msg('is_empty')
        unless @{ $self->get_elements };

    return $self->get_elements->[0];
}
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# Return the final element of the query, or report 'is_empty' through
# error_msg() when the query holds no elements.
sub last {
    my ($self) = @_;

    return $self->error_msg('is_empty')
        unless @{ $self->get_elements };

    return $self->get_elements->[-1];
}
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
#################################################################### |
513
|
|
|
|
|
|
|
# |
514
|
|
|
|
|
|
|
# Everything below here is a private method subject to change |
515
|
|
|
|
|
|
|
# |
516
|
|
|
|
|
|
|
#################################################################### |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# Record the specificity score computed for a selector.  Returns an empty
# list.
sub _add_specificity {
    my $self = shift;
    my ($selector, $specificity) = @_;

    $self->{specificity}{$selector} = $specificity;

    return ();
}
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
# Central error reporter for query problems.  With suppress_errors on, a
# defined message is kept in the error slot and undef is returned;
# otherwise the message is handed to error().
sub _report_error {
    my ($self, $message) = @_;

    # errors are live: pass the message on (this will DIE)
    return $self->error($message)
        unless $self->suppress_errors();

    $self->{error} = $message if defined $message;

    # an explicit undef, so list-context callers receive one undef element
    return undef;
}
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# this Just Works[tm] because first arg is HTML::Element object |
541
|
|
|
|
|
|
|
# Export hook for the 'query' import option: loads the ELEMENT class and
# installs Query() into it as a method named query().  This Just Works[tm]
# because the first argument of that method call is the element object.
sub _export_query_to_element {
    my $element_class = class(ELEMENT)->load;

    return $element_class->method( query => \&Query );
}
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
# remove duplicate elements in the case where elements are nested between multiple matching elements |
548
|
|
|
|
|
|
|
# Drop repeated items from the referenced list, keeping the first
# occurrence of each and the original order.  Needed because an element
# nested inside several matching ancestors is found more than once.
sub _dedup {
    my ($self, $elements) = @_;

    my %seen;

    return grep { ! $seen{$_}++ } @{$elements};
}
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# utility method to assist in sorting of query return sets |
566
|
|
|
|
|
|
|
# Sort comparator (for use as "sort _by_address LIST") ordering elements by
# their address(): a dot separated list of numbers compared component by
# component, with an address that is a prefix of another sorting first.
#
# The operands arrive in the package variables $a and $b.  A comparator
# invoked by sort shares the @_ of the code that called sort, so nothing
# here may shift or otherwise modify @_.
sub _by_address
{
    my @left  = split /\./, $a->address();
    my @right = split /\./, $b->address();

    my $shared = @left < @right ? scalar @left : scalar @right;

    for my $index (0 .. $shared - 1) {
        my $order = $left[$index] <=> $right[$index];
        return $order if $order;
    }

    # identical over the shared prefix: the shorter address sorts first,
    # and equal addresses compare equal
    return scalar(@left) <=> scalar(@right);
}
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
# instantiate an instance with match_self turned on, for use with |
597
|
|
|
|
|
|
|
# follow-up queries, so they match the top-most elements. |
598
|
|
|
|
|
|
|
# Build a new query from the given elements with match_self switched on,
# so that follow-up queries on it also match the top-most elements.
sub _new_match_self {
    my ($self, @elements) = @_;

    my $result = $self->new(@elements);
    $result->{match_self} = 1;

    return $result;
}
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
# Catch-all for methods this class does not define: the call is forwarded,
# with the same arguments, to every element in the query and the combined
# results are returned (as a list, or as an array reference outside list
# context).  Calls to DESTROY are ignored.
sub AUTOLOAD {
    my $self = shift;

    # keep only the final component of the fully qualified name
    my ($method) = ($AUTOLOAD =~ /([^:]+)$/ );
    return if $method eq 'DESTROY';

    # we allow Perl to catch any unknown methods that the user might
    # try to call against the HTML::Element objects in the query
    my @results;
    for my $element (@{ $self->get_elements }) {
        push @results, $element->$method(@_);
    }

    return wantarray ? @results : \@results;
}
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
1; |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=head1 NAME |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
HTML::Query - jQuery-like selection queries for HTML::Element |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=head1 SYNOPSIS |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
Creating an C<HTML::Query> object using the L<Query()> constructor |
630
|
|
|
|
|
|
|
subroutine: |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
use HTML::Query 'Query'; |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# using named parameters |
635
|
|
|
|
|
|
|
$q = Query( text => $text ); # HTML text |
636
|
|
|
|
|
|
|
$q = Query( file => $file ); # HTML file |
637
|
|
|
|
|
|
|
$q = Query( tree => $tree ); # HTML::Element object |
638
|
|
|
|
|
|
|
$q = Query( query => $query ); # HTML::Query object |
639
|
|
|
|
|
|
|
$q = Query( |
640
|
|
|
|
|
|
|
text => $text1, # or any combination |
641
|
|
|
|
|
|
|
text => $text2, # of the above |
642
|
|
|
|
|
|
|
file => $file1, |
643
|
|
|
|
|
|
|
file => $file2, |
644
|
|
|
|
|
|
|
tree => $tree, |
645
|
|
|
|
|
|
|
query => $query, |
646
|
|
|
|
|
|
|
); |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# passing elements as positional arguments |
649
|
|
|
|
|
|
|
$q = Query( $tree ); # HTML::Element object(s) |
650
|
|
|
|
|
|
|
$q = Query( $tree1, $tree2, $tree3, ... ); |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
# or from one or more existing queries |
653
|
|
|
|
|
|
|
$q = Query( $query1 ); # HTML::Query object(s) |
654
|
|
|
|
|
|
|
$q = Query( $query1, $query2, $query3, ... ); |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# or a mixture |
657
|
|
|
|
|
|
|
$q = Query( $tree1, $query1, $tree2, $query2 ); |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
# the final argument (in all cases) can be a selector |
660
|
|
|
|
|
|
|
my $spec = 'ul.menu li a'; # <ul class="menu">...<li>...<a> |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
$q = Query( $tree, $spec ); |
663
|
|
|
|
|
|
|
$q = Query( $query, $spec ); |
664
|
|
|
|
|
|
|
$q = Query( $tree1, $tree2, $query1, $query2, $spec ); |
665
|
|
|
|
|
|
|
$q = Query( text => $text, $spec ); |
666
|
|
|
|
|
|
|
$q = Query( file => $file, $spec ); |
667
|
|
|
|
|
|
|
$q = Query( tree => $tree, $spec ); |
668
|
|
|
|
|
|
|
$q = Query( query => $query, $spec ); |
669
|
|
|
|
|
|
|
$q = Query( |
670
|
|
|
|
|
|
|
text => $text, |
671
|
|
|
|
|
|
|
file => $file, |
672
|
|
|
|
|
|
|
# ...etc... |
673
|
|
|
|
|
|
|
$spec |
674
|
|
|
|
|
|
|
); |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
Or using the OO L<new()> constructor method (which the L<Query()> |
677
|
|
|
|
|
|
|
subroutine maps onto): |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
use HTML::Query; |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
$q = HTML::Query->new( |
682
|
|
|
|
|
|
|
# accepts the same arguments as Query() |
683
|
|
|
|
|
|
|
) |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
Or by monkey-patching a L<query()> method into L<HTML::Element>. |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
use HTML::Query 'query'; # note lower case 'q' |
688
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
# build a tree |
691
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
692
|
|
|
|
|
|
|
$tree->parse_file($filename); |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
# call the query() method on any element |
695
|
|
|
|
|
|
|
my $query = $tree->query($spec); |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
Once you have a query, you can start selecting elements: |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
@r = $q->query('a')->get_elements(); # all <a>...</a> elements |
700
|
|
|
|
|
|
|
@r = $q->query('a#menu')->get_elements(); # all <a> with "menu" id |
701
|
|
|
|
|
|
|
@r = $q->query('#menu')->get_elements(); # all elements with "menu" id |
702
|
|
|
|
|
|
|
@r = $q->query('a.menu')->get_elements(); # all <a> with "menu" class |
703
|
|
|
|
|
|
|
@r = $q->query('.menu')->get_elements(); # all elements with "menu" class |
704
|
|
|
|
|
|
|
@r = $q->query('a[href]')->get_elements(); # all <a> with 'href' attr |
705
|
|
|
|
|
|
|
@r = $q->query('a[href=foo]')->get_elements(); # all <a> with 'href="foo"' attr |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# you can specify elements within elements... |
708
|
|
|
|
|
|
|
@r = $q->query('ul.menu li a')->get_elements(); # <ul class="menu">...<li>...<a> |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
# and use commas to delimit multiple path specs for different elements |
711
|
|
|
|
|
|
|
@r = $q->query('table tr td a, form input[type=submit]')->get_elements(); |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
# query() in scalar context returns a new query |
714
|
|
|
|
|
|
|
$r = $q->query('table'); # find all tables |
715
|
|
|
|
|
|
|
$s = $r->query('tr'); # find all rows in all those tables |
716
|
|
|
|
|
|
|
$t = $s->query('td')->get_elements(); # and all cells in those rows... |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
Inspecting query elements: |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
# get number of elements in query |
721
|
|
|
|
|
|
|
my $size = $q->size; |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
# get first/last element in query |
724
|
|
|
|
|
|
|
my $first = $q->first; |
725
|
|
|
|
|
|
|
my $last = $q->last; |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# convert query to list or list ref of HTML::Element objects |
728
|
|
|
|
|
|
|
my $list = $q->list; # list ref in scalar context |
729
|
|
|
|
|
|
|
my @list = $q->list; # list in list context |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
All other methods are mapped onto the L<HTML::Element> objects |
732
|
|
|
|
|
|
|
in the query: |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
print $query->as_trimmed_text; # print trimmed text for each element |
735
|
|
|
|
|
|
|
print $query->as_HTML; # print each element as HTML |
736
|
|
|
|
|
|
|
$query->delete; # call delete() on each element |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
=head1 DESCRIPTION |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
The C<HTML::Query> module is an add-on for the L<HTML::Tree> module |
741
|
|
|
|
|
|
|
set. It provides a simple way to select one or more elements from a tree using |
742
|
|
|
|
|
|
|
a query syntax inspired by jQuery. This selector syntax will be reassuringly |
743
|
|
|
|
|
|
|
familiar to anyone who has ever written a CSS selector. |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
C<HTML::Query> is not an attempt to provide a complete (or even near-complete) |
746
|
|
|
|
|
|
|
implementation of jQuery in Perl (see Ingy's L<pQuery> module for a |
747
|
|
|
|
|
|
|
more ambitious attempt at that). Rather, it borrows some of the tried and |
748
|
|
|
|
|
|
|
tested selector syntax from jQuery (and CSS) that can easily be mapped onto |
749
|
|
|
|
|
|
|
the C<look_down()> method provided by the L<HTML::Element> |
750
|
|
|
|
|
|
|
module. |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
=head2 Creating a Query |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
The easiest way to create a query is using the exportable L<Query()> |
755
|
|
|
|
|
|
|
subroutine. |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
use HTML::Query 'Query'; # note capital 'Q' |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
It accepts a C<text> or C<file> named parameter and will create an |
760
|
|
|
|
|
|
|
C<HTML::Query> object from the HTML source text or file, respectively. |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
my $query = Query( text => $text ); |
763
|
|
|
|
|
|
|
my $query = Query( file => $file ); |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
This delegates to L<HTML::TreeBuilder> to parse the |
766
|
|
|
|
|
|
|
HTML into a tree of L<HTML::Element> objects. The root |
767
|
|
|
|
|
|
|
element returned is then wrapped in an C<HTML::Query> object. |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
If you already have one or more L<HTML::Element> objects that |
770
|
|
|
|
|
|
|
you want to query then you can pass them to the L<Query()> subroutine as |
771
|
|
|
|
|
|
|
arguments. For example, you can explicitly use |
772
|
|
|
|
|
|
|
L<HTML::TreeBuilder> to parse an HTML document into a tree: |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
775
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
776
|
|
|
|
|
|
|
$tree->parse_file($filename); |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
And then create an C<HTML::Query> object for the tree either using an |
779
|
|
|
|
|
|
|
explicit C<tree> named parameter: |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
my $query = Query( tree => $tree ); |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
Or implicitly using positional arguments. |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
my $query = Query( $tree ); |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
If you want to query across multiple elements, then pass each one as a |
788
|
|
|
|
|
|
|
positional argument. |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
my $query = Query( $tree1, $tree2, $tree3 ); |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
You can also create a new query from one or more existing queries, |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
my $query = Query( query => $query ); # named parameter |
795
|
|
|
|
|
|
|
my $query = Query( $query1, $query2 ); # positional arguments. |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
You can mix and match these different parameters and positional arguments |
798
|
|
|
|
|
|
|
to create a query across several different sources. |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
$q = Query( |
801
|
|
|
|
|
|
|
text => $text1, |
802
|
|
|
|
|
|
|
text => $text2, |
803
|
|
|
|
|
|
|
file => $file1, |
804
|
|
|
|
|
|
|
file => $file2, |
805
|
|
|
|
|
|
|
tree => $tree, |
806
|
|
|
|
|
|
|
query => $query, |
807
|
|
|
|
|
|
|
); |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
The L<Query()> subroutine is a simple wrapper around the L<new()> |
810
|
|
|
|
|
|
|
constructor method. You can instantiate your objects manually if you prefer. |
811
|
|
|
|
|
|
|
The L<new()> method accepts the same arguments as for the L<Query()> |
812
|
|
|
|
|
|
|
subroutine (in fact, the L<Query()> subroutine simply forwards all |
813
|
|
|
|
|
|
|
arguments to the L<new()> method). |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
use HTML::Query; |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
my $query = HTML::Query->new( |
818
|
|
|
|
|
|
|
# same argument format as for Query() |
819
|
|
|
|
|
|
|
); |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
A final way to use C<HTML::Query> is to have it add a L<query()> method |
822
|
|
|
|
|
|
|
to L<HTML::Element>. The C<query> import hook (all lower |
823
|
|
|
|
|
|
|
case) can be specified to make this so. |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
use HTML::Query 'query'; # note lower case 'q' |
826
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
829
|
|
|
|
|
|
|
$tree->parse_file($filename); |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
# now all HTML::Elements have a query() method |
832
|
|
|
|
|
|
|
my @items = $tree->query('ul li')->get_elements(); # find all list items |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
This approach, often referred to as I<monkey-patching>, should be used |
835
|
|
|
|
|
|
|
carefully and sparingly. It involves a violation of |
836
|
|
|
|
|
|
|
L<HTML::Element>'s namespace that could have unpredictable |
837
|
|
|
|
|
|
|
results with a future version of the module (e.g. one which defines its own |
838
|
|
|
|
|
|
|
C<query()> method that does something different). Treat it as something that |
839
|
|
|
|
|
|
|
is great to get a quick job done right now, but probably not something to be |
840
|
|
|
|
|
|
|
used in production code without careful consideration of the implications. |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
=head2 Selecting Elements |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
Having created an C<HTML::Query> object by one of the methods outlined above, |
845
|
|
|
|
|
|
|
you can now fetch descendant elements in the tree using a simple query syntax. |
846
|
|
|
|
|
|
|
For example, to fetch all the C<< <a> >> elements in the tree, you can |
847
|
|
|
|
|
|
|
write: |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
@links = $query->query('a')->get_elements(); |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
Or, if you want the elements that have a specific C<class> attribute defined |
852
|
|
|
|
|
|
|
with a value of, say C<menu>: |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
@links = $query->query('a.menu')->get_elements(); |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
More generally, you can look for the existence of any attribute and optionally |
857
|
|
|
|
|
|
|
provide a specific value for it. |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
@links = $query->query('a[href]')->get_elements(); # any href attribute |
860
|
|
|
|
|
|
|
@links = $query->query('a[href=index.html]')->get_elements(); # specific value |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
You can also find an element (or elements) by specifying an id. |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
@links = $query->query('#menu')->get_elements(); # any element with id="menu" |
865
|
|
|
|
|
|
|
@links = $query->query('ul#menu')->get_elements(); # ul element with id="menu" |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
You can provide multiple selection criteria to find elements within elements |
868
|
|
|
|
|
|
|
within elements, and so on. For example, to find all links in a menu, |
869
|
|
|
|
|
|
|
you can write: |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# matches: <ul class="menu"> <li> <a> |
872
|
|
|
|
|
|
|
@links = $query->query('ul.menu li a')->get_elements(); |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
You can separate different criteria using commas. For example, to fetch all |
875
|
|
|
|
|
|
|
table rows and C<span> elements with a C<foo> class: |
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
@elems = $query->query('table tr, span.foo')->get_elements(); |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
=head2 Query Results |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
When called in list context, as shown in the examples above, the L<query()> |
882
|
|
|
|
|
|
|
method returns a list of L<HTML::Element> objects matching the |
883
|
|
|
|
|
|
|
search criteria. In scalar context, the L<query()> method returns a new |
884
|
|
|
|
|
|
|
C<HTML::Query> object containing the L<HTML::Element> objects |
885
|
|
|
|
|
|
|
found. You can then call the L<query()> method against that object to further |
886
|
|
|
|
|
|
|
refine the query. The L<query()> method applies the selection to all elements |
887
|
|
|
|
|
|
|
stored in the query. |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
my $tables = $query->query('table'); # query for tables |
890
|
|
|
|
|
|
|
my $rows = $tables->query('tr'); # requery for all rows in those tables |
891
|
|
|
|
|
|
|
my $cells = $rows->query('td')->get_elements(); # return back all the cells in those rows |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
=head2 Inspection Methods |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
The L<size()> method returns the number of elements in the query. The |
896
|
|
|
|
|
|
|
L<first()> and L<last()> methods return the first and last items in the |
897
|
|
|
|
|
|
|
query, respectively. |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
if ($query->size) { |
900
|
|
|
|
|
|
|
print "from ", $query->first->as_trimmed_text, " to ", $query->last->as_trimmed_text; |
901
|
|
|
|
|
|
|
} |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
If you want to extract the L<HTML::Element> objects from the |
904
|
|
|
|
|
|
|
query you can call the L<list()> method. This returns a list of |
905
|
|
|
|
|
|
|
L<HTML::Element> objects in list context, or a reference to a |
906
|
|
|
|
|
|
|
list in scalar context. |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
@elems = $query->list; |
909
|
|
|
|
|
|
|
$elems = $query->list; |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
=head2 Element Methods |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
Any other methods are automatically applied to each element in the list. For |
914
|
|
|
|
|
|
|
example, to call the C<as_trimmed_text()> method on all the |
915
|
|
|
|
|
|
|
L<HTML::Element> objects in the query, you can write: |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
print $query->as_trimmed_text; |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
In list context, this method returns a list of the return values from |
920
|
|
|
|
|
|
|
calling the method on each element. In scalar context it returns a |
921
|
|
|
|
|
|
|
reference to a list of return values. |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
@text_blocks = $query->as_trimmed_text; |
924
|
|
|
|
|
|
|
$text_blocks = $query->as_trimmed_text; |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
See L<HTML::Element> for further information on the methods it |
927
|
|
|
|
|
|
|
provides. |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
=head1 QUERY SYNTAX |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
=head2 Basic Selectors |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
=head3 element |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
Matches all elements of a particular type. |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
@elems = $query->query('table')->get_elements(); # <table> |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
=head3 #id |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
Matches all elements with a specific id attribute. |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
@elems = $query->query('#menu')->get_elements(); # <ANY id="menu"> |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
This can be combined with an element type: |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
@elems = $query->query('ul#menu')->get_elements(); # <ul id="menu"> |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
=head3 .class |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
Matches all elements with a specific class attribute. |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
@elems = $query->query('.info')->get_elements(); # <ANY class="info"> |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
This can be combined with an element type and/or element id: |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
@elems = $query->query('p.info')->get_elements(); # <p class="info"> |
958
|
|
|
|
|
|
|
@elems = $query->query('p#foo.info')->get_elements(); # <p id="foo" class="info"> |
959
|
|
|
|
|
|
|
@elems = $query->query('#foo.info')->get_elements(); # <ANY id="foo" class="info"> |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
The selectors listed above can be combined in a whitespace delimited |
962
|
|
|
|
|
|
|
sequence to select down through a hierarchy of elements. Consider the |
963
|
|
|
|
|
|
|
following table: |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
To locate the cells that we're interested in, we can write: |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
@elems = $query->query('table.search tr.result td.value')->get_elements(); |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
=head2 Attribute Selectors |
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
W3C CSS 2 specification defines new constructs through which to select |
982
|
|
|
|
|
|
|
based on specific attributes within elements. See the following link for the spec: |
983
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#attribute-selectors> |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=head3 [attr] |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
Matches elements that have the specified attribute, including any where |
988
|
|
|
|
|
|
|
the attribute has no value. |
989
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
@elems = $query->query('[href]')->get_elements(); # <ANY href="..."> |
991
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
This can be combined with any of the above selectors. For example: |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
@elems = $query->query('a[href]')->get_elements(); # <a href="..."> |
995
|
|
|
|
|
|
|
@elems = $query->query('a.menu[href]')->get_elements(); # <a class="menu" href="..."> |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
998
|
|
|
|
|
|
|
match I<all> of them will be selected. |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
@elems = $query->query('a[href][rel]')->get_elements(); # <a href="..." rel="..."> |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
=head3 [attr=value] |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
Matches elements that have an attribute set to a specific value. The |
1005
|
|
|
|
|
|
|
value can be quoted in either single or double quotes, or left unquoted. |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
@elems = $query->query('[href=index.html]')->get_elements(); |
1008
|
|
|
|
|
|
|
@elems = $query->query('[href="index.html"]')->get_elements(); |
1009
|
|
|
|
|
|
|
@elems = $query->query("[href='index.html']")->get_elements(); |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1012
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
@elems = $query->query('a[href=index.html][rel=home]')->get_elements(); |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
=head3 [attr|=value] |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
Matches any element X whose foo attribute has a hyphen-separated list of |
1019
|
|
|
|
|
|
|
values beginning (from the left) with bar. The value can be quoted in either |
1020
|
|
|
|
|
|
|
single or double quotes, or left unquoted. |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
@elems = $query->query('[lang|=en]')->get_elements(); |
1023
|
|
|
|
|
|
|
@elems = $query->query('p[class|="example"]')->get_elements(); |
1024
|
|
|
|
|
|
|
@elems = $query->query("img[alt|='fig']")->get_elements(); |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1027
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
@elems = $query->query('p[class|="external"][lang|="en"]')->get_elements(); |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
=head3 [attr~=value] |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
Matches any element X whose foo attribute value is a list of space-separated |
1034
|
|
|
|
|
|
|
values, one of which is exactly equal to bar. The value can be quoted in either |
1035
|
|
|
|
|
|
|
single or double quotes, or left unquoted. |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
@elems = $query->query('[lang~=en]')->get_elements(); |
1038
|
|
|
|
|
|
|
@elems = $query->query('p[class~="example"]')->get_elements(); |
1039
|
|
|
|
|
|
|
@elems = $query->query("img[alt~='fig']")->get_elements(); |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1042
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
@elems = $query->query('p[class~="external"][lang~="en"]')->get_elements(); |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
KNOWN BUG: you can't have a C<]> character in the attribute value because |
1047
|
|
|
|
|
|
|
it confuses the query parser. Fixing this is TODO. |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=head2 Universal Selector |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
W3C CSS 2 specification defines a new construct through which to select |
1052
|
|
|
|
|
|
|
any element within the document below a given hierarchy. |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
http://www.w3.org/TR/css3-selectors/#universal-selector |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
@elems = $query->query('*')->get_elements(); |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
=head2 Combinator Selectors |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
W3C CSS 2 specification defines new constructs through which to select |
1061
|
|
|
|
|
|
|
based on hierarchy within the DOM. See the following link for the spec: |
1062
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#combinators> |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
=head3 Immediate Descendents (children) |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
When you combine selectors with whitespace elements are selected if |
1067
|
|
|
|
|
|
|
they are descended from the parent in some way. But if you just want |
1068
|
|
|
|
|
|
|
to select the children (and not the grandchildren, great-grandchildren, |
1069
|
|
|
|
|
|
|
etc) then you can combine the selectors with the C<< > >> character. |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
@elems = $query->query('a > img')->get_elements(); |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
=head3 Non-Immediate Descendents |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
If you just want any descendents that aren't children then you can combine |
1076
|
|
|
|
|
|
|
selectors with the C<*> character. |
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
@elems = $query->query('div * a')->get_elements(); |
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
=head3 Immediate Siblings |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
If you want to use a sibling relationship then you can join selectors |
1083
|
|
|
|
|
|
|
with the C<+> character. |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
@elems = $query->query('img + span')->get_elements(); |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
=head2 Pseudo-classes |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-classes to |
1090
|
|
|
|
|
|
|
permit formatting based on information that lies outside the document tree. |
1091
|
|
|
|
|
|
|
See the following link for the most recent spec: |
1092
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#pseudo-classes> |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
HTML::Query currently has limited support for CSS 2, and no support for CSS 3. |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
Patches are *highly* encouraged to help add support here. |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
=head3 -child pseudo-classes |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
If you want to return child elements within a certain position then -child |
1101
|
|
|
|
|
|
|
pseudo-classes (:first-child, :last-child) are what you're looking for. |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
@elems = $query->query('table td:first-child')->get_elements; |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
=head3 Link pseudo-classes: :link and :visited |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
Unsupported. |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
The :link pseudo-class is to be implemented, currently unsupported. |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
It is not possible to locate :visited outside of a browser context due to its |
1112
|
|
|
|
|
|
|
dynamic nature. |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
=head3 Dynamic pseudo-classes |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
Unsupported. |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
It is not possible to locate these classes(:hover, :active, :focus) outside |
1119
|
|
|
|
|
|
|
of a browser context due to their dynamic nature. |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
=head3 Language pseudo-class |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
Unsupported. |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
Functionality for the :lang pseudo-class is largely replicated by using an |
1126
|
|
|
|
|
|
|
attribute selector for lang combined with a universal selector query. |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
If this is insufficient I'd love to see a patch adding support for it. |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
=head3 Other pseudo-classes |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
W3C CSS 3 added a number of new behaviors that need support. At |
1133
|
|
|
|
|
|
|
this time there is no support for them, but we should work on adding support. |
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
Patches are very welcome. |
1136
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
=head2 Pseudo-elements |
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
W3C CSS 2 and CSS 3 specification defines new concepts of pseudo-elements to |
1140
|
|
|
|
|
|
|
permit formatting based on information that lies outside the document tree. |
1141
|
|
|
|
|
|
|
See the following link for the most recent spec: |
1142
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#pseudo-elements> |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
At this time there is no support for pseudo-elements, but we are working |
1145
|
|
|
|
|
|
|
on adding support. |
1146
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
Patches are very welcome. |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
=head2 Combining Selectors |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
You can combine basic and hierarchical selectors into a single query |
1152
|
|
|
|
|
|
|
by separating each part with a comma. The query will select all matching |
1153
|
|
|
|
|
|
|
elements for each of the comma-delimited selectors. For example, to |
1154
|
|
|
|
|
|
|
find all C<< <a> >>, C<< <b> >> and C<< <i> >> elements in a tree: |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
@elems = $query->query('a, b, i')->get_elements(); |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
Each of these selectors can be arbitrarily complex. |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
@elems = $query->query( |
1161
|
|
|
|
|
|
|
'table.search[width=100%] tr.result[valign=top] td.value, |
1162
|
|
|
|
|
|
|
form.search input[type=submit], |
1163
|
|
|
|
|
|
|
a[href=index.html]' |
1164
|
|
|
|
|
|
|
)->get_elements(); |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
=head1 EXPORT HOOKS |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=head2 Query |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
The C<Query()> constructor subroutine (note the capital letter) can be |
1171
|
|
|
|
|
|
|
exported as a convenient way to create C<HTML::Query> objects. It simply |
1172
|
|
|
|
|
|
|
forwards all arguments to the L<new()> constructor method. |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
use HTML::Query 'Query'; |
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
my $query = Query( file => $file, 'ul.menu li a' ); |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
=head2 query |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
The C<query> export hook can be called to monkey-patch a L<query()> method |
1181
|
|
|
|
|
|
|
into the L<HTML::Element> module. |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
This is considered questionable behaviour in polite society which regards it |
1184
|
|
|
|
|
|
|
as a violation of the inner sanctity of the L<HTML::Element>. |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
But if you're the kind of person that doesn't mind a bit of occasional |
1187
|
|
|
|
|
|
|
namespace abuse for the sake of getting the job done, then go right ahead. |
1188
|
|
|
|
|
|
|
Just don't blame me if it all blows up later. |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
use HTML::Query 'query'; # note lower case 'q' |
1191
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
1192
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
# build a tree |
1194
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
1195
|
|
|
|
|
|
|
$tree->parse_file($filename); |
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
# call the query() method on any element |
1198
|
|
|
|
|
|
|
my $query = $tree->query('ul li a'); |
1199
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
=head1 METHODS |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
The C<HTML::Query> object is a subclass of L<Badger::Base> and |
1203
|
|
|
|
|
|
|
inherits all of its methods. |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=head2 new(@elements,$selector) |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
This constructor method is used to create a new C<HTML::Query> object. It |
1208
|
|
|
|
|
|
|
expects a list of any number (including zero) of |
1209
|
|
|
|
|
|
|
L<HTML::Element> or C<HTML::Query> objects. |
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
# single HTML::Element object |
1212
|
|
|
|
|
|
|
my $query = HTML::Query->new($elem); |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
# multiple element object |
1215
|
|
|
|
|
|
|
my $query = HTML::Query->new($elem1, $elem2, $elem3, ...); |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
# copy elements from an existing query |
1218
|
|
|
|
|
|
|
my $query = HTML::Query->new($another_query); |
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
# copy elements from several queries |
1221
|
|
|
|
|
|
|
my $query = HTML::Query->new($query1, $query2, $query3); |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
# or a mixture |
1224
|
|
|
|
|
|
|
my $query = HTML::Query->new($elem1, $query1, $elem2, $query3); |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
You can also use named parameters to specify an alternate source for an |
1227
|
|
|
|
|
|
|
element. |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
$query = HTML::Query->new( file => $file ); |
1230
|
|
|
|
|
|
|
$query = HTML::Query->new( text => $text ); |
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
In this case, the L<HTML::TreeBuilder> module is used to |
1233
|
|
|
|
|
|
|
parse the source file or text into a tree of L<HTML::Element> |
1234
|
|
|
|
|
|
|
objects. |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
For the sake of completeness, you can also specify element trees and queries |
1237
|
|
|
|
|
|
|
using named parameters: |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
$query = HTML::Query->new( tree => $tree ); |
1240
|
|
|
|
|
|
|
$query = HTML::Query->new( query => $query ); |
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
You can freely mix and match elements, queries and named sources. The |
1243
|
|
|
|
|
|
|
query will be constructed as an aggregate across them all. |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
$q = HTML::Query->new( |
1246
|
|
|
|
|
|
|
text => $text1, |
1247
|
|
|
|
|
|
|
text => $text2, |
1248
|
|
|
|
|
|
|
file => $file1, |
1249
|
|
|
|
|
|
|
file => $file2, |
1250
|
|
|
|
|
|
|
tree => $tree, |
1251
|
|
|
|
|
|
|
query => $query1, |
1252
|
|
|
|
|
|
|
); |
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
The final, optional argument can be a selector specification. This is |
1255
|
|
|
|
|
|
|
immediately passed to the L<query()> method which will return a new query |
1256
|
|
|
|
|
|
|
with only those elements selected. |
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
my $spec = 'ul.menu li a'; # <ul class="menu">...<li>...<a> |
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
my $query = HTML::Query->new( $tree, $spec ); |
1261
|
|
|
|
|
|
|
my $query = HTML::Query->new( text => $text, $spec ); |
1262
|
|
|
|
|
|
|
my $query = HTML::Query->new( |
1263
|
|
|
|
|
|
|
text => $text, |
1264
|
|
|
|
|
|
|
file => $file, |
1265
|
|
|
|
|
|
|
$spec |
1266
|
|
|
|
|
|
|
); |
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
The list of arguments can also be passed by reference to a list. |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
my $query = HTML::Query->new(\@args); |
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
=head2 query($spec) |
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
This method locates the descendant elements identified by the C<$spec> |
1275
|
|
|
|
|
|
|
argument for each element in the query. It then internally stores the results |
1276
|
|
|
|
|
|
|
for requerying or return. See get_elements(). |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
my $query = HTML::Query->new(\@args); |
1279
|
|
|
|
|
|
|
my $results = $query->query($spec); |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
See L<"QUERY SYNTAX"> for the permitted syntax of the C<$spec> argument. |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
=head2 get_elements() |
1284
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
This method returns the stored results from a query. In list context it returns a list of |
1286
|
|
|
|
|
|
|
matching L<HTML::Element> objects. In scalar context it returns a reference to |
1287
|
|
|
|
|
|
|
the results array. |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
my $query = HTML::Query->new(\@args); |
1290
|
|
|
|
|
|
|
my $results = $query->query($spec); |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
my @elements = $results->query($spec)->get_elements(); |
1293
|
|
|
|
|
|
|
my $elements = $results->query($spec)->get_elements(); |
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
=head2 get_specificity() |
1296
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
Calculate the specificity for any given passed selector, a critical factor in determining how best to apply the cascade. |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
A selector's specificity is calculated as follows: |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
* count the number of ID attributes in the selector (= a) |
1302
|
|
|
|
|
|
|
* count the number of other attributes and pseudo-classes in the selector (= b) |
1303
|
|
|
|
|
|
|
* count the number of element names in the selector (= c) |
1304
|
|
|
|
|
|
|
* ignore pseudo-elements. |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
The specificity is based only on the form of the selector. In particular, a selector of the form "[id=p33]" is counted |
1307
|
|
|
|
|
|
|
as an attribute selector (a=0, b=1, c=0), even if the id attribute is defined as an "ID" in the source document's DTD. |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
See the following spec for additional details: |
1310
|
|
|
|
|
|
|
L<http://www.w3.org/TR/CSS21/cascade.html#specificity> |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
=head2 size() |
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
Returns the number of elements in the query. |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
=head2 first() |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
Returns the first element in the query. |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
my $elem = $query->first; |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
If the query is empty then an exception will be thrown. If you would rather |
1323
|
|
|
|
|
|
|
have an undefined value returned then you can use the C<try> method inherited |
1324
|
|
|
|
|
|
|
from L<Badger::Base>. This effectively wraps the call to |
1325
|
|
|
|
|
|
|
C<first()> in an C<eval> block to catch any exceptions thrown. |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
my $elem = $query->try('first') || warn "no first element\n"; |
1328
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
=head2 last() |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
Similar to L<first()>, but returning the last element in the query. |
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
my $elem = $query->last; |
1334
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
=head2 list() |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
Returns a list of the L<HTML::Element> objects in the query in |
1338
|
|
|
|
|
|
|
list context, or a reference to a list in scalar context. |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
my @elems = $query->list; |
1341
|
|
|
|
|
|
|
my $elems = $query->list; |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
=head2 AUTOLOAD |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
The C<AUTOLOAD> method maps any other method calls to the |
1346
|
|
|
|
|
|
|
L<HTML::Element> objects in the list. When called in list |
1347
|
|
|
|
|
|
|
context it returns a list of the values returned from calling the method on |
1348
|
|
|
|
|
|
|
each element. In scalar context it returns a reference to a list of return |
1349
|
|
|
|
|
|
|
values. |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
my @text_blocks = $query->as_trimmed_text; |
1352
|
|
|
|
|
|
|
my $text_blocks = $query->as_trimmed_text; |
1353
|
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
|
=head1 KNOWN BUGS |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
=head2 Attribute Values |
1357
|
|
|
|
|
|
|
|
1358
|
|
|
|
|
|
|
It is not possible to use C<]> in an attribute value. This is due to a |
1359
|
|
|
|
|
|
|
limitation in the parser which will be fixed RSN. |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
=head1 AUTHOR |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
Andy Wardley L<http://wardley.org> |
1364
|
|
|
|
|
|
|
|
1365
|
|
|
|
|
|
|
=head1 MAINTAINER |
1366
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
Kevin Kamel |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
Vivek Khera |
1372
|
|
|
|
|
|
|
Michael Peters |
1373
|
|
|
|
|
|
|
David Gray |
1374
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
=head1 COPYRIGHT |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
Copyright (C) 2010 Andy Wardley. All Rights Reserved. |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
1380
|
|
|
|
|
|
|
under the same terms as Perl itself. |
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
=head1 SEE ALSO |
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
L<HTML::Tree>, L<HTML::TreeBuilder>, |
1385
|
|
|
|
|
|
|
L<HTML::Element>, L<pQuery>, L<http://jQuery.com/> |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
=cut |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
# Local Variables: |
1390
|
|
|
|
|
|
|
# mode: Perl |
1391
|
|
|
|
|
|
|
# perl-indent-level: 4 |
1392
|
|
|
|
|
|
|
# indent-tabs-mode: nil |
1393
|
|
|
|
|
|
|
# End: |
1394
|
|
|
|
|
|
|
# |
1395
|
|
|
|
|
|
|
# vim: expandtab shiftwidth=4: |
|