line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Query; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.08'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
use Badger::Class |
6
|
11
|
|
|
|
|
266
|
version => $VERSION, |
7
|
|
|
|
|
|
|
debug => 0, |
8
|
|
|
|
|
|
|
base => 'Badger::Base', |
9
|
|
|
|
|
|
|
utils => 'blessed', |
10
|
|
|
|
|
|
|
import => 'class CLASS', |
11
|
|
|
|
|
|
|
vars => 'AUTOLOAD', |
12
|
|
|
|
|
|
|
constants => 'ARRAY', |
13
|
|
|
|
|
|
|
constant => { |
14
|
|
|
|
|
|
|
ELEMENT => 'HTML::Element', |
15
|
|
|
|
|
|
|
BUILDER => 'HTML::TreeBuilder', |
16
|
|
|
|
|
|
|
}, |
17
|
|
|
|
|
|
|
exports => { |
18
|
|
|
|
|
|
|
any => 'Query', |
19
|
|
|
|
|
|
|
hooks => { |
20
|
|
|
|
|
|
|
query => \&_export_query_to_element, |
21
|
|
|
|
|
|
|
}, |
22
|
|
|
|
|
|
|
}, |
23
|
|
|
|
|
|
|
messages => { |
24
|
|
|
|
|
|
|
no_elements => 'No elements specified to query', |
25
|
|
|
|
|
|
|
no_query => 'No query specified', |
26
|
|
|
|
|
|
|
no_source => 'No argument specified for source: %s', |
27
|
|
|
|
|
|
|
bad_element => 'Invalid element specified: %s', |
28
|
|
|
|
|
|
|
bad_source => 'Invalid source specified: %s', |
29
|
|
|
|
|
|
|
bad_query => 'Invalid query specified: %s', |
30
|
|
|
|
|
|
|
bad_spec => 'Invalid specification "%s" in query: %s', |
31
|
|
|
|
|
|
|
is_empty => 'The query does not contain any elements', |
32
|
11
|
|
|
11
|
|
1957925
|
}; |
|
11
|
|
|
|
|
20764
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Dispatch table used by new(): maps a source type name to the code that
# converts the argument following it into one or more items, which new()
# then pushes back onto its argument list.
our $SOURCES = {
    # HTML markup held in a string
    text => sub {
        my $markup = shift;
        class(BUILDER)->load;
        return BUILDER->new_from_content($markup);
    },
    # name of a file containing HTML
    file => sub {
        my $filename = shift;
        class(BUILDER)->load;
        return BUILDER->new_from_file($filename);
    },
    # an existing tree is passed through untouched
    tree => sub {
        return $_[0];
    },
    # an existing query, or a list reference which is expanded into its items
    query => sub {
        my $source = $_[0];
        return @{ $source } if ref $source eq ARRAY;
        return $source;
    },
};
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Functional constructor: hands every argument straight to CLASS->new()
# and returns whatever that returns.
sub Query (@) {
    return CLASS->new(@_);
}
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# Constructor.  Accepts, in any order:
#   * element objects (instances of the class named by the ELEMENT constant),
#   * other query objects, whose elements are copied in,
#   * "type => source" pairs, where type is a key of $SOURCES,
# optionally followed by a final non-reference argument which is treated as a
# selector and run through query().  A single array reference argument is
# expanded into its items first.
#
# Returns the new object, or the result of query() when a selector was given.
# Anything unrecognised is reported through error_msg().
sub new {
    my ($invocant, @args) = @_;

    # expand a single list ref into items
    @args = @{ $args[0] }
        if @args == 1 && ref $args[0] eq ARRAY;

    my $class = ref $invocant || $invocant;
    my @elements;
    my $selector;

    my $self = {
        error           => undef,
        suppress_errors => undef,
        match_self      => undef,
        elements        => \@elements,
        specificity     => {},
    };

    # each element should be an HTML::Element object, although we might
    # want to subclass this module to recognise a different kind of object,
    # so we get the element class from the ELEMENT constant method which a
    # subclass can re-define.
    my $element_class = $class->ELEMENT;

    ARGUMENT: while (@args) {
        my $item = shift @args;
        $class->debug("argument: ".$item) if DEBUG;

        if (blessed $item) {
            # an element object is stored as it is
            if ($item->isa($element_class)) {
                push @elements, $item;
                next ARGUMENT;
            }

            # another query object contributes all of its elements
            if ($item->isa($class)) {
                push @elements, @{ $item->get_elements };
                next ARGUMENT;
            }
        }
        elsif (! ref $item) {
            # a non-reference item is a source type (text, file, tree)
            # followed by the source, or if it's the last argument following
            # one or more element options or named argument pairs then it's
            # a selection query
            if (@args) {
                my $loader = $SOURCES->{ $item }
                    || return $class->error_msg( bad_source => $item );
                my $source = shift @args;
                $class->debug("source $item: $source") if DEBUG;

                # whatever the loader produces is processed as further arguments
                unshift @args, $loader->($source);
                next ARGUMENT;
            }

            if (@elements) {
                $selector = $item;
                last ARGUMENT;
            }
        }

        # unblessed references, unknown objects and a lone string all end up here
        return $class->error_msg( bad_element => $item );
    }

    bless $self, $class;

    return $self unless defined $selector;
    return $self->query($selector);
}
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Run a CSS-style selector against the elements held by this query.
#
#   $query - selector string; several selectors may be separated by commas.
#            Each selector is a whitespace-separated sequence of parts, each
#            part being an optional tag name (or '*') followed by any mix of
#            #id, .class, [attr], [attr=value], [attr|=value], [attr~=value],
#            :first-child and :last-child.  Parts may be joined by the
#            '>' (child), '+' (adjacent sibling) and ' * ' (grandchild or
#            deeper) relationship modifiers.
#
# Returns the matching elements in list context, or a new query object
# (created via _new_match_self) otherwise.  Parse problems are passed to
# _report_error(); a missing query is reported through error_msg().
# The specificity computed for each comma-separated selector is recorded
# via _add_specificity().
sub query {
    my ($self, $query) = @_;
    my @result;
    my $ops = 0;
    my $pos = 0;

    $self->{error} = undef;

    return $self->error_msg('no_query')
        unless defined $query && length $query;

    # multiple specs can be comma separated, e.g. "table tr td, li a, div.foo"
    COMMA: while (1) {
        # each comma-separated traversal spec is applied downward from
        # the source elements in the $self->{elements} query
        my @elements = @{$self->get_elements};
        my $comops = 0;

        my $specificity = 0;
        my $startpos = pos($query) || 0;

        my $hack_sequence = 0; # look for '* html'

        warn "Starting new COMMA" if DEBUG;

        # for each whitespace delimited descendant spec we grok the correct
        # parameters for look_down() and apply them to each source element
        # e.g. "table tr td"
        SEQUENCE: while (1) {
            my @args;
            $pos = pos($query) || 0;
            my $relationship = '';
            my $leading_whitespace;

            warn "Starting new SEQUENCE" if DEBUG;

            # ignore any leading whitespace
            if ($query =~ / \G (\s+) /cgsx) {
                $leading_whitespace = defined($1) ? 1 : 0;
                warn "removing leading whitespace\n" if DEBUG;
            }

            # grandchild selector is whitespace sensitive, requires leading
            # whitespace, and cannot be the first part of the selector
            if ($leading_whitespace && $comops && ($query =~ / \G (\*) \s+ /cgx)) {
                $relationship = $1;
                warn "relationship = $relationship\n" if DEBUG;
            }

            # get other relationship modifiers
            if ($query =~ / \G (>|\+) \s* /cgx) {
                $relationship = $1;
                warn "relationship = $relationship\n" if DEBUG;

                # can't have a relationship modifier as the first part of the query
                if (!$comops) {
                    return $self->_report_error( $self->message( bad_spec => $relationship, $query ) );
                }
            }

            # optional leading word is a tag name
            if ($query =~ / \G ([\w\*]+) /cgx) {
                my $tag = $1;

                if ($tag =~ m/\*/) {
                    if (($leading_whitespace || $comops == 0) && ($tag eq '*')) {
                        warn "universal tag\n" if DEBUG;
                        push(@args, _tag => qr/\w+/);

                        if ($comops == 0) { #we need to catch the case where we see '* html'
                            $hack_sequence++;
                        }
                    }
                    else {
                        return $self->_report_error( $self->message( bad_spec => $tag, $query ) );
                    }
                }
                else {
                    warn "html tag\n" if DEBUG;
                    $specificity += 1; # standard tags are worth 1 point
                    push( @args, _tag => $tag );

                    if ($comops == 1 && $tag eq 'html') {
                        $hack_sequence++;
                    }
                }
            }

            # loop to collect a description about this specific part of the rule
            PART: while (1) {
                my $work = scalar @args;

                # that can be followed by (or the query can start with) a #id
                if ($query =~ / \G \# ([\w\-]+) /cgx) {
                    $specificity += 100;
                    push( @args, id => $1 );
                }

                # and/or a .class
                if ($query =~ / \G \. ([\w\-]+) /cgx) {
                    my $class_name = $1;
                    $specificity += 10;
                    push( @args, class => qr/ (^|\s+) $class_name ($|\s+) /x );
                }

                # and/or none or more [ ] attribute specs
                if ($query =~ / \G \[ (.*?) \] /cgx) {
                    my $attribute = $1;
                    $specificity += 10;

                    #if we have an operator
                    if ($attribute =~ m/(.*?)\s*([\|\~]?=)\s*(.*)/) {
                        my ($name,$attribute_op,$value) = ($1,$2,$3);

                        unless (defined($name) && length($name)) {
                            return $self->_report_error( $self->message( bad_spec => $name, $query ) );
                        }

                        warn "operator $attribute_op" if DEBUG;

                        if (defined $value) {
                            # strip one optional quote from each end of the value
                            for ($value) {
                                s/^['"]//;
                                s/['"]$//;
                            }
                            if ($attribute_op eq '=') {
                                push( @args, $name => $value);
                            }
                            elsif ($attribute_op eq '|=') {
                                # the value is literal text, so quote it before
                                # embedding it in the pattern
                                push(@args, $name => qr/\b\Q$value\E-?/)
                            }
                            elsif ($attribute_op eq '~=') {
                                # as above: literal text, not a pattern
                                push(@args, $name => qr/\b\Q$value\E\b/)
                            }
                            else {
                                return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
                            }
                        }
                        else {
                            return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
                        }
                    }
                    else {
                        unless (defined($attribute) && length($attribute)) {
                            return $self->_report_error( $self->message( bad_spec => $attribute, $query ) );
                        }

                        # add a regex to match anything (or nothing)
                        push( @args, $attribute => qr/.*/ );
                    }
                }

                # and/or one or more pseudo-classes
                if ($query =~ / \G : ([\w\-]+) /cgx) {
                    my $pseudoclass = $1;
                    $specificity += 10;

                    if ($pseudoclass eq 'first-child') {
                        # no element (i.e. reference) among the left siblings
                        push( @args, sub { ! grep { ref $_ } $_[0]->left() } );
                    } elsif ($pseudoclass eq 'last-child') {
                        # no element (i.e. reference) among the right siblings
                        push( @args, sub { ! grep { ref $_ } $_[0]->right() } );
                    } else {
                        warn "Pseudoclass :$pseudoclass not supported";
                        next PART;
                    }
                }

                # keep going until this particular expression is fully processed
                last PART unless scalar(@args) > $work;
            }

            # we must have something in @args by now or we didn't find any
            # valid query specification this time around
            last SEQUENCE unless @args;

            $self->debug(
                'Parsed ', substr($query, $pos, pos($query) - $pos),
                ' into args [', join(', ', @args), ']'
            ) if DEBUG;

            # we want to skip certain hack sequences like '* html'
            if ($hack_sequence == 2) {
                # clear out our stored elements to match behaviour of modern
                # browsers; this must be an empty list, not a list holding an
                # empty array ref, or later steps call methods on that ref
                @elements = ();
            }
            # we're just looking for any descendent
            elsif( !$relationship ) {
                if ($self->{match_self}) {
                    # if we are re-querying, be sure to match ourselves not just descendents
                    @elements = map { $_->look_down(@args) } @elements;
                } else {
                    # look_down() will match self in addition to descendents,
                    # so we explicitly disallow matches on self as we iterate
                    # thru the list.  The other cases below already exclude self.
                    # https://rt.cpan.org/Public/Bug/Display.html?id=58918
                    my @accumulator;
                    foreach my $e (@elements) {
                        if ($e->root() == $e) {
                            push(@accumulator, $e->look_down(@args));
                        }
                        else {
                            push(@accumulator, grep { $_ != $e } $e->look_down(@args));
                        }
                    }
                    @elements = @accumulator;
                }
            }
            # immediate child selector
            elsif( $relationship eq '>' ) {
                @elements = map {
                    $_->look_down(
                        @args,
                        sub {
                            my $tag = shift;
                            my $root = $_;

                            return $tag->depth == $root->depth + 1;
                        }
                    )
                } @elements;
            }
            # immediate sibling selector
            elsif( $relationship eq '+' ) {
                @elements = map {
                    $_->parent->look_down(
                        @args,
                        sub {
                            my $tag = shift;
                            my $root = $_;
                            my @prev_sibling = $tag->left;
                            # get the nearest previous non-text sibling
                            foreach my $sibling (reverse @prev_sibling) {
                                next unless ref $sibling;
                                return $sibling == $root;
                            }
                        }
                    )
                } @elements;
            }
            # grandchild selector
            elsif( $relationship eq '*' ) {
                @elements = map {
                    $_->look_down(
                        @args,
                        sub {
                            my $tag = shift;
                            my $root = $_;

                            return $tag->depth > $root->depth + 1;
                        }
                    )
                } @elements;
            }

            # so we can check we've done something
            $comops++;

            # dedup the results we've gotten
            @elements = $self->_dedup(\@elements);

            if (DEBUG) {
                warn $_->as_HTML for @elements;
            }
        }

        if ($comops) {
            $self->debug(
                'Added', scalar(@elements), ' elements to results'
            ) if DEBUG;

            my $selector = substr ($query,$startpos, $pos - $startpos);
            $self->_add_specificity($selector,$specificity);

            #add in the recent pass
            push(@result,@elements);

            # dedup the results across the result sets, necessary for comma based selectors
            @result = $self->_dedup(\@result);

            # sort the result set...
            @result = sort _by_address @result;

            # update op counter for complete query to include ops performed
            # in this fragment
            $ops += $comops;
        }
        else {
            # looks like we got an empty comma section, e.g. : ",x, ,y,"
            # so we'll ignore it
        }

        last COMMA unless $query =~ / \G \s*,\s* /cgsx;
    }

    # check for any trailing text in the query that we couldn't parse
    if ($query =~ / \G (.+?) \s* $ /cgsx) {
        return $self->_report_error( $self->message( bad_spec => $1, $query ) );
    }

    # check that we performed at least one query operation
    unless ($ops) {
        return $self->_report_error( $self->message( bad_query => $query ) );
    }

    return wantarray ? @result : $self->_new_match_self(@result);
}
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
# return elements stored from last query |
427
|
|
|
|
|
|
|
# Accessor for the stored elements: the elements themselves in list
# context, otherwise the array reference held in $self->{elements}.
sub get_elements {
    my ($self) = @_;

    return @{ $self->{elements} } if wantarray;
    return $self->{elements};
}
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
########################################################################################################### |
434
|
|
|
|
|
|
|
# from CSS spec at http://www.w3.org/TR/CSS21/cascade.html#specificity |
435
|
|
|
|
|
|
|
########################################################################################################### |
436
|
|
|
|
|
|
|
# A selector's specificity is calculated as follows: |
437
|
|
|
|
|
|
|
# |
438
|
|
|
|
|
|
|
# * count the number of ID attributes in the selector (= a) |
439
|
|
|
|
|
|
|
# * count the number of other attributes and pseudo-classes in the selector (= b) |
440
|
|
|
|
|
|
|
# * count the number of element names in the selector (= c) |
441
|
|
|
|
|
|
|
# * ignore pseudo-elements. |
442
|
|
|
|
|
|
|
# |
443
|
|
|
|
|
|
|
# Concatenating the three numbers a-b-c (in a number system with a large base) gives the specificity. |
444
|
|
|
|
|
|
|
# |
445
|
|
|
|
|
|
|
# Example(s): |
446
|
|
|
|
|
|
|
# |
447
|
|
|
|
|
|
|
# Some examples: |
448
|
|
|
|
|
|
|
# |
449
|
|
|
|
|
|
|
# * {} /* a=0 b=0 c=0 -> specificity = 0 */ |
450
|
|
|
|
|
|
|
# LI {} /* a=0 b=0 c=1 -> specificity = 1 */ |
451
|
|
|
|
|
|
|
# UL LI {} /* a=0 b=0 c=2 -> specificity = 2 */ |
452
|
|
|
|
|
|
|
# UL OL+LI {} /* a=0 b=0 c=3 -> specificity = 3 */ |
453
|
|
|
|
|
|
|
# H1 + *[REL=up]{} /* a=0 b=1 c=1 -> specificity = 11 */ |
454
|
|
|
|
|
|
|
# UL OL LI.red {} /* a=0 b=1 c=3 -> specificity = 13 */ |
455
|
|
|
|
|
|
|
# LI.red.level {} /* a=0 b=2 c=1 -> specificity = 21 */ |
456
|
|
|
|
|
|
|
# #x34y {} /* a=1 b=0 c=0 -> specificity = 100 */ |
457
|
|
|
|
|
|
|
########################################################################################################### |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=pod |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=item get_specificity()
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Calculate the specificity for any given passed selector, a critical factor in determining how best to apply the cascade |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
A selector's specificity is calculated as follows: |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
* count the number of ID attributes in the selector (= a) |
468
|
|
|
|
|
|
|
* count the number of other attributes and pseudo-classes in the selector (= b) |
469
|
|
|
|
|
|
|
* count the number of element names in the selector (= c) |
470
|
|
|
|
|
|
|
* ignore pseudo-elements. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
The specificity is based only on the form of the selector. In particular, a selector of the form "[id=p33]" is counted |
473
|
|
|
|
|
|
|
as an attribute selector (a=0, b=0, c=1, d=0), even if the id attribute is defined as an "ID" in the source document's DTD. |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
See the following spec for additional details: |
476
|
|
|
|
|
|
|
L<http://www.w3.org/TR/CSS21/cascade.html#specificity>
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=back |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=cut |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# Return the specificity recorded for $selector.  When nothing has been
# recorded yet, the selector is run through query() once so that query()
# records it.
sub get_specificity {
    my $self     = shift;
    my $selector = shift;

    if (not exists $self->{specificity}->{$selector}) {
        # if the invoking tree happened to be large this could get expensive
        # real fast, so query against a temporarily empty element list instead
        local $self->{elements} = [];
        $self->query($selector);
    }

    return $self->{specificity}->{$selector};
}
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
# Combined getter/setter for the suppress_errors flag.  A defined argument
# is stored; the current value is returned either way.
sub suppress_errors {
    my $self    = shift;
    my $setting = shift;

    $self->{suppress_errors} = $setting if defined $setting;

    return $self->{suppress_errors};
}
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# Return the message stored in $self->{error} (undef when none is set).
sub get_error {
    my $self = shift;

    return $self->{error};
}
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# Return the stored elements: as a list in list context, otherwise as a
# new, unblessed array reference holding a copy of the items.
#
# The object is a blessed hash, so the elements are fetched through
# get_elements() rather than by dereferencing the object as an array
# (which dies with "Not an ARRAY reference").
sub list {
    my $self     = shift;
    my $elements = $self->get_elements;

    return wantarray ? @{ $elements } : [ @{ $elements } ];
}
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
# Number of elements currently held by the query.
sub size {
    my ($self) = @_;
    my $elements = $self->get_elements;

    return scalar @{ $elements };
}
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# Return the first stored element, or report 'is_empty' through
# error_msg() when the query holds no elements.
sub first {
    my ($self) = @_;
    my $elements = $self->get_elements;

    return $elements->[0] if @{ $elements };
    return $self->error_msg('is_empty');
}
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# Return the last stored element, or report 'is_empty' through
# error_msg() when the query holds no elements.
sub last {
    my ($self) = @_;
    my $elements = $self->get_elements;

    return $elements->[-1] if @{ $elements };
    return $self->error_msg('is_empty');
}
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
#################################################################### |
535
|
|
|
|
|
|
|
# |
536
|
|
|
|
|
|
|
# Everything below here is a private method subject to change |
537
|
|
|
|
|
|
|
# |
538
|
|
|
|
|
|
|
#################################################################### |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# Record the specificity computed for a selector.  Returns an empty list.
sub _add_specificity {
    my $self = shift;
    my ($selector, $specificity) = @_;

    $self->{specificity}->{$selector} = $specificity;

    return ();
}
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
# Report a query error.  With suppress_errors() switched on, a defined
# message is stored in $self->{error} and undef is returned; otherwise the
# message is passed to error().
sub _report_error {
    my $self    = shift;
    my $message = shift;

    return $self->error($message)    # this will DIE
        unless $self->suppress_errors();

    $self->{error} = $message if defined $message;

    # callers receive an explicit undef, even in list context
    return undef;
}
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# this Just Works[tm] because first arg is HTML::Element object |
563
|
|
|
|
|
|
|
# Export hook: loads the class named by ELEMENT and installs Query() into
# it as a method called query().
sub _export_query_to_element {
    my $element_class = class(ELEMENT)->load;

    return $element_class->method( query => \&Query );
}
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
# remove duplicate elements in the case where elements are nested between multiple matching elements |
570
|
|
|
|
|
|
|
# Return the items of the referenced array with repeats removed, keeping
# the first occurrence of each and the original order.  Items are compared
# by their string form.
sub _dedup {
    my ($self, $elements) = @_;

    my %seen;
    return grep { !$seen{$_}++ } @{ $elements };
}
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# utility method to assist in sorting of query return sets |
588
|
|
|
|
|
|
|
# Sort comparator (for use as "sort _by_address @list") ordering elements
# by the dotted string returned from their address() method.  Components
# are compared numerically from left to right; when one address is a
# prefix of the other, the shorter one sorts first.
#
# The elements to compare arrive in $a and $b.  A sort comparator is not
# given an argument list of its own, so nothing is read from @_ here.
sub _by_address
{
    my @left  = split /\./, $a->address();
    my @right = split /\./, $b->address();

    my $max = (scalar @left > scalar @right) ? scalar @left : scalar @right;

    for my $index (0 .. $max - 1) {
        my ($l, $r) = ($left[$index], $right[$index]);

        # at most one side can be exhausted while $index is below $max
        return -1 if !defined $l;
        return  1 if !defined $r;

        my $order = $l <=> $r;
        return $order if $order;
    }

    # identical addresses, e.g. elements taken from different trees
    return 0;
}
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
# instantiate an instance with match_self turned on, for use with |
619
|
|
|
|
|
|
|
# follow-up queries, so they match the top-most elements. |
620
|
|
|
|
|
|
|
# Build a new object from the given arguments via new() and switch its
# match_self flag on, for use with follow-up queries, so they match the
# top-most elements.
sub _new_match_self {
    my ($self, @elements) = @_;

    my $query = $self->new(@elements);
    $query->{match_self} = 1;

    return $query;
}
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
# Fallback for methods this class does not define: the call is forwarded,
# with the same arguments, to every stored element.  Returns the collected
# results as a list in list context, otherwise as an array reference.
sub AUTOLOAD {
    my $self = shift;

    # strip the package qualifier from the requested method name
    my ($method) = ($AUTOLOAD =~ /([^:]+)$/ );
    return if $method eq 'DESTROY';

    # we allow Perl to catch any unknown methods that the user might
    # try to call against the HTML::Element objects in the query
    my @results;
    for (@{ $self->get_elements }) {
        push @results, $_->$method(@_);
    }

    return \@results unless wantarray;
    return @results;
}
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
1; |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
=head1 NAME |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
HTML::Query - jQuery-like selection queries for HTML::Element |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
=head1 SYNOPSIS |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
Creating an C<HTML::Query> object using the L<Query()|/Query> constructor
652
|
|
|
|
|
|
|
subroutine: |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
use HTML::Query 'Query'; |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# using named parameters |
657
|
|
|
|
|
|
|
$q = Query( text => $text ); # HTML text |
658
|
|
|
|
|
|
|
$q = Query( file => $file ); # HTML file |
659
|
|
|
|
|
|
|
$q = Query( tree => $tree ); # HTML::Element object |
660
|
|
|
|
|
|
|
$q = Query( query => $query ); # HTML::Query object |
661
|
|
|
|
|
|
|
$q = Query( |
662
|
|
|
|
|
|
|
text => $text1, # or any combination |
663
|
|
|
|
|
|
|
text => $text2, # of the above |
664
|
|
|
|
|
|
|
file => $file1, |
665
|
|
|
|
|
|
|
file => $file2, |
666
|
|
|
|
|
|
|
tree => $tree, |
667
|
|
|
|
|
|
|
query => $query, |
668
|
|
|
|
|
|
|
); |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
# passing elements as positional arguments |
671
|
|
|
|
|
|
|
$q = Query( $tree ); # HTML::Element object(s) |
672
|
|
|
|
|
|
|
$q = Query( $tree1, $tree2, $tree3, ... ); |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# or from one or more existing queries |
675
|
|
|
|
|
|
|
$q = Query( $query1 ); # HTML::Query object(s) |
676
|
|
|
|
|
|
|
$q = Query( $query1, $query2, $query3, ... ); |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# or a mixture |
679
|
|
|
|
|
|
|
$q = Query( $tree1, $query1, $tree2, $query2 ); |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# the final argument (in all cases) can be a selector |
682
|
|
|
|
|
|
|
my $spec = 'ul.menu li a'; # |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
$q = Query( $tree, $spec ); |
685
|
|
|
|
|
|
|
$q = Query( $query, $spec ); |
686
|
|
|
|
|
|
|
$q = Query( $tree1, $tree2, $query1, $query2, $spec ); |
687
|
|
|
|
|
|
|
$q = Query( text => $text, $spec ); |
688
|
|
|
|
|
|
|
$q = Query( file => $file, $spec ); |
689
|
|
|
|
|
|
|
$q = Query( tree => $tree, $spec ); |
690
|
|
|
|
|
|
|
$q = Query( query => $query, $spec ); |
691
|
|
|
|
|
|
|
$q = Query( |
692
|
|
|
|
|
|
|
text => $text, |
693
|
|
|
|
|
|
|
file => $file, |
694
|
|
|
|
|
|
|
# ...etc... |
695
|
|
|
|
|
|
|
$spec |
696
|
|
|
|
|
|
|
); |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
Or using the OO L<new()|/new> constructor method (which the L<Query()|/Query>
699
|
|
|
|
|
|
|
subroutine maps onto): |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
use HTML::Query; |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
$q = HTML::Query->new( |
704
|
|
|
|
|
|
|
# accepts the same arguments as Query() |
705
|
|
|
|
|
|
|
); |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
Or by monkey-patching a L<query()> method into L<HTML::Element>. |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
use HTML::Query 'query'; # note lower case 'q' |
710
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
# build a tree |
713
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
714
|
|
|
|
|
|
|
$tree->parse_file($filename); |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
# call the query() method on any element |
717
|
|
|
|
|
|
|
my $query = $tree->query($spec); |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
Once you have a query, you can start selecting elements: |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
@r = $q->query('a')->get_elements(); # all <a>...</a> elements |
722
|
|
|
|
|
|
|
@r = $q->query('a#menu')->get_elements(); # all <a> with "menu" id |
723
|
|
|
|
|
|
|
@r = $q->query('#menu')->get_elements(); # all elements with "menu" id |
724
|
|
|
|
|
|
|
@r = $q->query('a.menu')->get_elements(); # all <a> with "menu" class |
725
|
|
|
|
|
|
|
@r = $q->query('.menu')->get_elements(); # all elements with "menu" class |
726
|
|
|
|
|
|
|
@r = $q->query('a[href]')->get_elements(); # all <a> with 'href' attr |
727
|
|
|
|
|
|
|
@r = $q->query('a[href=foo]')->get_elements(); # all <a> with 'href="foo"' attr |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# you can specify elements within elements... |
730
|
|
|
|
|
|
|
@r = $q->query('ul.menu li a')->get_elements(); # <ul class="menu">...<li>...<a> |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# and use commas to delimit multiple path specs for different elements |
733
|
|
|
|
|
|
|
@r = $q->query('table tr td a, form input[type=submit]')->get_elements(); |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
# query() in scalar context returns a new query |
736
|
|
|
|
|
|
|
$r = $q->query('table'); # find all tables |
737
|
|
|
|
|
|
|
$s = $r->query('tr'); # find all rows in all those tables |
738
|
|
|
|
|
|
|
$t = $s->query('td')->get_elements(); # and all cells in those rows... |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
Inspecting query elements: |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# get number of elements in query |
743
|
|
|
|
|
|
|
my $size = $q->size; |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
# get first/last element in query |
746
|
|
|
|
|
|
|
my $first = $q->first; |
747
|
|
|
|
|
|
|
my $last = $q->last; |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# convert query to list or list ref of HTML::Element objects |
750
|
|
|
|
|
|
|
my $list = $q->list; # list ref in scalar context |
751
|
|
|
|
|
|
|
my @list = $q->list; # list in list context |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
All other methods are mapped onto the L<HTML::Element> objects |
754
|
|
|
|
|
|
|
in the query: |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
print $query->as_trimmed_text; # print trimmed text for each element |
757
|
|
|
|
|
|
|
print $query->as_HTML; # print each element as HTML |
758
|
|
|
|
|
|
|
$query->delete; # call delete() on each element |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=head1 DESCRIPTION |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
The C<HTML::Query> module is an add-on for the L<HTML::Tree> module |
763
|
|
|
|
|
|
|
set. It provides a simple way to select one or more elements from a tree using |
764
|
|
|
|
|
|
|
a query syntax inspired by jQuery. This selector syntax will be reassuringly |
765
|
|
|
|
|
|
|
familiar to anyone who has ever written a CSS selector. |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
C<HTML::Query> is not an attempt to provide a complete (or even near-complete) |
768
|
|
|
|
|
|
|
implementation of jQuery in Perl (see Ingy's L<pQuery> module for a |
769
|
|
|
|
|
|
|
more ambitious attempt at that). Rather, it borrows some of the tried and |
770
|
|
|
|
|
|
|
tested selector syntax from jQuery (and CSS) that can easily be mapped onto |
771
|
|
|
|
|
|
|
the C<look_down()> method provided by the L<HTML::Element> |
772
|
|
|
|
|
|
|
module. |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=head2 Creating a Query |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
The easiest way to create a query is using the exportable L<Query()> |
777
|
|
|
|
|
|
|
subroutine. |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
use HTML::Query 'Query'; # note capital 'Q' |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
It accepts a C<text> or C<file> named parameter and will create an |
782
|
|
|
|
|
|
|
C<HTML::Query> object from the HTML source text or file, respectively. |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
my $query = Query( text => $text ); |
785
|
|
|
|
|
|
|
my $query = Query( file => $file ); |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
This delegates to L<HTML::TreeBuilder> to parse the |
788
|
|
|
|
|
|
|
HTML into a tree of L<HTML::Element> objects. The root |
789
|
|
|
|
|
|
|
element returned is then wrapped in an C<HTML::Query> object. |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
If you already have one or more L<HTML::Element> objects that |
792
|
|
|
|
|
|
|
you want to query then you can pass them to the L<Query()> subroutine as |
793
|
|
|
|
|
|
|
arguments. For example, you can explicitly use |
794
|
|
|
|
|
|
|
L<HTML::TreeBuilder> to parse an HTML document into a tree: |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
797
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
798
|
|
|
|
|
|
|
$tree->parse_file($filename); |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
And then create an C<HTML::Query> object for the tree either using an |
801
|
|
|
|
|
|
|
explicit C<tree> named parameter: |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
my $query = Query( tree => $tree ); |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
Or implicitly using positional arguments. |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
my $query = Query( $tree ); |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
If you want to query across multiple elements, then pass each one as a |
810
|
|
|
|
|
|
|
positional argument. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
my $query = Query( $tree1, $tree2, $tree3 ); |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
You can also create a new query from one or more existing queries, |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
my $query = Query( query => $query ); # named parameter |
817
|
|
|
|
|
|
|
my $query = Query( $query1, $query2 ); # positional arguments. |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
You can mix and match these different parameters and positional arguments |
820
|
|
|
|
|
|
|
to create a query across several different sources. |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
$q = Query( |
823
|
|
|
|
|
|
|
text => $text1, |
824
|
|
|
|
|
|
|
text => $text2, |
825
|
|
|
|
|
|
|
file => $file1, |
826
|
|
|
|
|
|
|
file => $file2, |
827
|
|
|
|
|
|
|
tree => $tree, |
828
|
|
|
|
|
|
|
query => $query, |
829
|
|
|
|
|
|
|
); |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
The L<Query()> subroutine is a simple wrapper around the L<new()> |
832
|
|
|
|
|
|
|
constructor method. You can instantiate your objects manually if you prefer. |
833
|
|
|
|
|
|
|
The L<new()> method accepts the same arguments as for the L<Query()> |
834
|
|
|
|
|
|
|
subroutine (in fact, the L<Query()> subroutine simply forwards all |
835
|
|
|
|
|
|
|
arguments to the L<new()> method). |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
use HTML::Query; |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
my $query = HTML::Query->new( |
840
|
|
|
|
|
|
|
# same argument format as for Query() |
841
|
|
|
|
|
|
|
); |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
A final way to use C<HTML::Query> is to have it add a L<query()> method |
844
|
|
|
|
|
|
|
to L<HTML::Element>. The C<query> import hook (all lower |
845
|
|
|
|
|
|
|
case) can be specified to make this so. |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
use HTML::Query 'query'; # note lower case 'q' |
848
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
851
|
|
|
|
|
|
|
$tree->parse_file($filename); |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
# now all HTML::Elements have a query() method |
854
|
|
|
|
|
|
|
my @items = $tree->query('ul li')->get_elements(); # find all list items |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
This approach, often referred to as I<monkey-patching>, should be used |
857
|
|
|
|
|
|
|
carefully and sparingly. It involves a violation of |
858
|
|
|
|
|
|
|
L<HTML::Element>'s namespace that could have unpredictable |
859
|
|
|
|
|
|
|
results with a future version of the module (e.g. one which defines its own |
860
|
|
|
|
|
|
|
C<query()> method that does something different). Treat it as something that |
861
|
|
|
|
|
|
|
is great to get a quick job done right now, but probably not something to be |
862
|
|
|
|
|
|
|
used in production code without careful consideration of the implications. |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
=head2 Selecting Elements |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
Having created an C<HTML::Query> object by one of the methods outlined above, |
867
|
|
|
|
|
|
|
you can now fetch descendant elements in the tree using a simple query syntax. |
868
|
|
|
|
|
|
|
For example, to fetch all the C<< E<lt>aE<gt> >> elements in the tree, you can |
869
|
|
|
|
|
|
|
write: |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
@links = $query->query('a')->get_elements(); |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
Or, if you want the elements that have a specific C<class> attribute defined |
874
|
|
|
|
|
|
|
with a value of, say C<menu>: |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
@links = $query->query('a.menu')->get_elements(); |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
More generally, you can look for the existence of any attribute and optionally |
879
|
|
|
|
|
|
|
provide a specific value for it. |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
@links = $query->query('a[href]')->get_elements(); # any href attribute |
882
|
|
|
|
|
|
|
@links = $query->query('a[href=index.html]')->get_elements(); # specific value |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
You can also find an element (or elements) by specifying an id. |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
@links = $query->query('#menu')->get_elements(); # any element with id="menu" |
887
|
|
|
|
|
|
|
@links = $query->query('ul#menu')->get_elements(); # ul element with id="menu" |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
You can provide multiple selection criteria to find elements within elements |
890
|
|
|
|
|
|
|
within elements, and so on. For example, to find all links in a menu, |
891
|
|
|
|
|
|
|
you can write: |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
# matches: |
894
|
|
|
|
|
|
|
@links = $query->query('ul.menu li a')->get_elements(); |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
You can separate different criteria using commas. For example, to fetch all |
897
|
|
|
|
|
|
|
table rows and C<span> elements with a C<foo> class: |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
@elems = $query->query('table tr, span.foo')->get_elements(); |
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
=head2 Query Results |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
When called in list context, as shown in the examples above, the L<query()> |
904
|
|
|
|
|
|
|
method returns a list of L<HTML::Element> objects matching the |
905
|
|
|
|
|
|
|
search criteria. In scalar context, the L<query()> method returns a new |
906
|
|
|
|
|
|
|
C<HTML::Query> object containing the L<HTML::Element> objects |
907
|
|
|
|
|
|
|
found. You can then call the L<query()> method against that object to further |
908
|
|
|
|
|
|
|
refine the query. The L<query()> method applies the selection to all elements |
909
|
|
|
|
|
|
|
stored in the query. |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
my $tables = $query->query('table'); # query for tables |
912
|
|
|
|
|
|
|
my $rows = $tables->query('tr'); # requery for all rows in those tables |
913
|
|
|
|
|
|
|
my $cells = $rows->query('td')->get_elements(); # return back all the cells in those rows |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
=head2 Inspection Methods |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
The L<size()> method returns the number of elements in the query. The |
918
|
|
|
|
|
|
|
L<first()> and L<last()> methods return the first and last items in the |
919
|
|
|
|
|
|
|
query, respectively. |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
if ($query->size) { |
922
|
|
|
|
|
|
|
print "from ", $query->first->as_trimmed_text, " to ", $query->last->as_trimmed_text; |
923
|
|
|
|
|
|
|
} |
924
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
If you want to extract the L<HTML::Element> objects from the |
926
|
|
|
|
|
|
|
query you can call the L<list()> method. This returns a list of |
927
|
|
|
|
|
|
|
L<HTML::Element> objects in list context, or a reference to a |
928
|
|
|
|
|
|
|
list in scalar context. |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
@elems = $query->list; |
931
|
|
|
|
|
|
|
$elems = $query->list; |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
=head2 Element Methods |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
Any other methods are automatically applied to each element in the list. For |
936
|
|
|
|
|
|
|
example, to call the C<as_trimmed_text()> method on all the |
937
|
|
|
|
|
|
|
L<HTML::Element> objects in the query, you can write: |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
print $query->as_trimmed_text; |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
In list context, this method returns a list of the return values from |
942
|
|
|
|
|
|
|
calling the method on each element. In scalar context it returns a |
943
|
|
|
|
|
|
|
reference to a list of return values. |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
@text_blocks = $query->as_trimmed_text; |
946
|
|
|
|
|
|
|
$text_blocks = $query->as_trimmed_text; |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
See L<HTML::Element> for further information on the methods it |
949
|
|
|
|
|
|
|
provides. |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
=head1 QUERY SYNTAX |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=head2 Basic Selectors |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=head3 element |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
Matches all elements of a particular type. |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
@elems = $query->query('table')->get_elements(); #
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
=head3 #id |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
Matches all elements with a specific id attribute. |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
@elems = $query->query('#menu')->get_elements(); # |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
This can be combined with an element type: |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
@elems = $query->query('ul#menu')->get_elements(); # |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
=head3 .class |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
Matches all elements with a specific class attribute. |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
@elems = $query->query('.info')->get_elements(); # |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
This can be combined with an element type and/or element id: |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
@elems = $query->query('p.info')->get_elements(); # |
980
|
|
|
|
|
|
|
@elems = $query->query('p#foo.info')->get_elements(); # |
981
|
|
|
|
|
|
|
@elems = $query->query('#foo.info')->get_elements(); # |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
The selectors listed above can be combined in a whitespace delimited |
984
|
|
|
|
|
|
|
sequence to select down through a hierarchy of elements. Consider the |
985
|
|
|
|
|
|
|
following table: |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
To locate the cells that we're interested in, we can write: |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
@elems = $query->query('table.search tr.result td.value')->get_elements(); |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
=head2 Attribute Selectors |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
W3C CSS 2 specification defines new constructs through which to select |
1004
|
|
|
|
|
|
|
based on specific attributes within elements. See the following link for the spec: |
1005
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#attribute-selectors> |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=head3 [attr] |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
Matches elements that have the specified attribute, including any where |
1010
|
|
|
|
|
|
|
the attribute has no value. |
1011
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
@elems = $query->query('[href]')->get_elements(); # |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
This can be combined with any of the above selectors. For example: |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
@elems = $query->query('a[href]')->get_elements(); # |
1017
|
|
|
|
|
|
|
@elems = $query->query('a.menu[href]')->get_elements(); # |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1020
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
@elems = $query->query('a[href][rel]')->get_elements(); # |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
=head3 [attr=value] |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
Matches elements that have an attribute set to a specific value. The |
1027
|
|
|
|
|
|
|
value can be quoted in either single or double quotes, or left unquoted. |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
@elems = $query->query('[href=index.html]')->get_elements(); |
1030
|
|
|
|
|
|
|
@elems = $query->query('[href="index.html"]')->get_elements(); |
1031
|
|
|
|
|
|
|
@elems = $query->query("[href='index.html']")->get_elements(); |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1034
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
@elems = $query->query('a[href=index.html][rel=home]')->get_elements(); |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
=head3 [attr|=value] |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
Matches any element X whose foo attribute has a hyphen-separated list of |
1041
|
|
|
|
|
|
|
values beginning (from the left) with bar. The value can be quoted in either |
1042
|
|
|
|
|
|
|
single or double quotes, or left unquoted. |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
@elems = $query->query('[lang|=en]')->get_elements(); |
1045
|
|
|
|
|
|
|
@elems = $query->query('p[class|="example"]')->get_elements(); |
1046
|
|
|
|
|
|
|
@elems = $query->query("img[alt|='fig']")->get_elements(); |
1047
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1049
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
@elems = $query->query('p[class|="external"][lang|="en"]')->get_elements(); |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
=head3 [attr~=value] |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
Matches any element X whose foo attribute value is a list of space-separated |
1056
|
|
|
|
|
|
|
values, one of which is exactly equal to bar. The value can be quoted in either |
1057
|
|
|
|
|
|
|
single or double quotes, or left unquoted. |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
@elems = $query->query('[lang~=en]')->get_elements(); |
1060
|
|
|
|
|
|
|
@elems = $query->query('p[class~="example"]')->get_elements(); |
1061
|
|
|
|
|
|
|
@elems = $query->query("img[alt~='fig']")->get_elements(); |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
You can specify multiple attribute selectors. Only those elements that |
1064
|
|
|
|
|
|
|
match I<all> of them will be selected. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
@elems = $query->query('p[class~="external"][lang~="en"]')->get_elements(); |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
KNOWN BUG: you can't have a C<]> character in the attribute value because |
1069
|
|
|
|
|
|
|
it confuses the query parser. Fixing this is TODO. |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
=head2 Universal Selector |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
W3C CSS 2 specification defines a new construct through which to select |
1074
|
|
|
|
|
|
|
any element within the document below a given hierarchy. |
1075
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
http://www.w3.org/TR/css3-selectors/#universal-selector |
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
@elems = $query->query('*')->get_elements(); |
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
=head2 Combinator Selectors |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
W3C CSS 2 specification defines new constructs through which to select |
1083
|
|
|
|
|
|
|
based on hierarchy within the DOM. See the following link for the spec: |
1084
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#combinators> |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
=head3 Immediate Descendents (children) |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
When you combine selectors with whitespace elements are selected if |
1089
|
|
|
|
|
|
|
they are descended from the parent in some way. But if you just want |
1090
|
|
|
|
|
|
|
to select the children (and not the grandchildren, great-grandchildren, |
1091
|
|
|
|
|
|
|
etc) then you can combine the selectors with the C<< > >> character. |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
@elems = $query->query('a > img')->get_elements(); |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
=head3 Non-Immediate Descendents |
1096
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
If you just want any descendents that aren't children then you can combine |
1098
|
|
|
|
|
|
|
selectors with the C<*> character. |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
@elems = $query->query('div * a')->get_elements(); |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=head3 Immediate Siblings |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
If you want to use a sibling relationship then you can join selectors |
1105
|
|
|
|
|
|
|
with the C<+> character. |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
@elems = $query->query('img + span')->get_elements(); |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
=head2 Pseudo-classes |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-classes to |
1112
|
|
|
|
|
|
|
permit formatting based on information that lies outside the document tree. |
1113
|
|
|
|
|
|
|
See the following link for the most recent spec: |
1114
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#pseudo-classes> |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
HTML::Query currently has limited support for CSS 2, and no support for CSS 3. |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
Patches are *highly* encouraged to help add support here. |
1119
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
=head3 -child pseudo-classes |
1121
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
If you want to return child elements within a certain position then -child |
1123
|
|
|
|
|
|
|
pseudo-classes (:first-child, :last-child) are what you're looking for. |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
@elems = $query->query('table td:first-child')->get_elements; |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
=head3 Link pseudo-classes: :link and :visited |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
Unsupported. |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
The :link pseudo-class is to be implemented, currently unsupported. |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
It is not possible to locate :visited outside of a browser context due to its |
1134
|
|
|
|
|
|
|
dynamic nature. |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=head3 Dynamic pseudo-classes |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
Unsupported. |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
It is not possible to locate these classes (:hover, :active, :focus) outside |
1141
|
|
|
|
|
|
|
of a browser context due to their dynamic nature. |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
=head3 Language pseudo-class |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
Unsupported. |
1146
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
Functionality for the :lang pseudo-class is largely replicated by using an |
1148
|
|
|
|
|
|
|
attribute selector for lang combined with a universal selector query. |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
If this is insufficient I'd love to see a patch adding support for it. |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=head3 Other pseudo-classes |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
W3C CSS 3 added a number of new behaviors that need support. At |
1155
|
|
|
|
|
|
|
this time there is no support for them, but we should work on adding support. |
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
Patches are very welcome. |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
=head2 Pseudo-elements |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-elements to |
1162
|
|
|
|
|
|
|
permit formatting based on information that lies outside the document tree. |
1163
|
|
|
|
|
|
|
See the following link for the most recent spec: |
1164
|
|
|
|
|
|
|
L<http://www.w3.org/TR/css3-selectors/#pseudo-elements> |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
At this time there is no support for pseudo-elements, but we are working |
1167
|
|
|
|
|
|
|
on adding support. |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
Patches are very welcome. |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
=head2 Combining Selectors |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
You can combine basic and hierarchical selectors into a single query |
1174
|
|
|
|
|
|
|
by separating each part with a comma. The query will select all matching |
1175
|
|
|
|
|
|
|
elements for each of the comma-delimited selectors. For example, to |
1176
|
|
|
|
|
|
|
find all C<a>, C<b> and C<i> elements in a tree: |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
@elems = $query->query('a, b, i')->get_elements(); |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
Each of these selectors can be arbitrarily complex. |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
@elems = $query->query( |
1183
|
|
|
|
|
|
|
'table.search[width=100%] tr.result[valign=top] td.value, |
1184
|
|
|
|
|
|
|
form.search input[type=submit], |
1185
|
|
|
|
|
|
|
a[href=index.html]' |
1186
|
|
|
|
|
|
|
)->get_elements(); |
1187
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
=head1 EXPORT HOOKS |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
=head2 Query |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
The C<Query()> constructor subroutine (note the capital letter) can be |
1193
|
|
|
|
|
|
|
exported as a convenient way to create C<HTML::Query> objects. It simply |
1194
|
|
|
|
|
|
|
forwards all arguments to the L<new()> constructor method. |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
use HTML::Query 'Query'; |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
my $query = Query( file => $file, 'ul.menu li a' ); |
1199
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
=head2 query |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
The C<query> export hook can be called to monkey-patch a L<query()> method |
1203
|
|
|
|
|
|
|
into the L<HTML::Element> module. |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
This is considered questionable behaviour in polite society which regards it |
1206
|
|
|
|
|
|
|
as a violation of the inner sanctity of the L<HTML::Element>. |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
But if you're the kind of person that doesn't mind a bit of occasional |
1209
|
|
|
|
|
|
|
namespace abuse for the sake of getting the job done, then go right ahead. |
1210
|
|
|
|
|
|
|
Just don't blame me if it all blows up later. |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
use HTML::Query 'query'; # note lower case 'q' |
1213
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
# build a tree |
1216
|
|
|
|
|
|
|
my $tree = HTML::TreeBuilder->new; |
1217
|
|
|
|
|
|
|
$tree->parse_file($filename); |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# call the query() method on any element |
1220
|
|
|
|
|
|
|
my $query = $tree->query('ul li a'); |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
=head1 METHODS |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
The C<HTML::Query> object is a subclass of L<Badger::Base> and |
1225
|
|
|
|
|
|
|
inherits all of its methods. |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
=head2 new(@elements,$selector) |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
This constructor method is used to create a new C<HTML::Query> object. It |
1230
|
|
|
|
|
|
|
expects a list of any number (including zero) of |
1231
|
|
|
|
|
|
|
L<HTML::Element> or C<HTML::Query> objects. |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
# single HTML::Element object |
1234
|
|
|
|
|
|
|
my $query = HTML::Query->new($elem); |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
# multiple element object |
1237
|
|
|
|
|
|
|
my $query = HTML::Query->new($elem1, $elem2, $elem3, ...); |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
# copy elements from an existing query |
1240
|
|
|
|
|
|
|
my $query = HTML::Query->new($another_query); |
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
# copy elements from several queries |
1243
|
|
|
|
|
|
|
my $query = HTML::Query->new($query1, $query2, $query3); |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
# or a mixture |
1246
|
|
|
|
|
|
|
my $query = HTML::Query->new($elem1, $query1, $elem2, $query3); |
1247
|
|
|
|
|
|
|
|
1248
|
|
|
|
|
|
|
You can also use named parameters to specify an alternate source for an |
1249
|
|
|
|
|
|
|
element. |
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
$query = HTML::Query->new( file => $file ); |
1252
|
|
|
|
|
|
|
$query = HTML::Query->new( text => $text ); |
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
In this case, the L<HTML::TreeBuilder> module is used to |
1255
|
|
|
|
|
|
|
parse the source file or text into a tree of L<HTML::Element> |
1256
|
|
|
|
|
|
|
objects. |
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
For the sake of completeness, you can also specify element trees and queries |
1259
|
|
|
|
|
|
|
using named parameters: |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
$query = HTML::Query->new( tree => $tree ); |
1262
|
|
|
|
|
|
|
$query = HTML::Query->new( query => $query ); |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
You can freely mix and match elements, queries and named sources. The |
1265
|
|
|
|
|
|
|
query will be constructed as an aggregate across them all. |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
$q = HTML::Query->new( |
1268
|
|
|
|
|
|
|
text => $text1, |
1269
|
|
|
|
|
|
|
text => $text2, |
1270
|
|
|
|
|
|
|
file => $file1, |
1271
|
|
|
|
|
|
|
file => $file2, |
1272
|
|
|
|
|
|
|
tree => $tree, |
1273
|
|
|
|
|
|
|
query => $query1, |
1274
|
|
|
|
|
|
|
); |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
The final, optional argument can be a selector specification. This is |
1277
|
|
|
|
|
|
|
immediately passed to the L<query()> method which will return a new query |
1278
|
|
|
|
|
|
|
with only those elements selected. |
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
my $spec = 'ul.menu li a'; # |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
my $query = HTML::Query->new( $tree, $spec ); |
1283
|
|
|
|
|
|
|
my $query = HTML::Query->new( text => $text, $spec ); |
1284
|
|
|
|
|
|
|
my $query = HTML::Query->new( |
1285
|
|
|
|
|
|
|
text => $text, |
1286
|
|
|
|
|
|
|
file => $file, |
1287
|
|
|
|
|
|
|
$spec |
1288
|
|
|
|
|
|
|
); |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
The list of arguments can also be passed by reference to a list. |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
my $query = HTML::Query->new(\@args); |
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
=head2 query($spec) |
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
This method locates the descendant elements identified by the C<$spec> |
1297
|
|
|
|
|
|
|
argument for each element in the query. It then internally stores the results |
1298
|
|
|
|
|
|
|
for requerying or return. See get_elements(). |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
my $query = HTML::Query->new(\@args); |
1301
|
|
|
|
|
|
|
my $results = $query->query($spec); |
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
See L<"QUERY SYNTAX"> for the permitted syntax of the C<$spec> argument. |
1304
|
|
|
|
|
|
|
|
1305
|
|
|
|
|
|
|
=head2 get_elements() |
1306
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
This method returns the stored results from a query. In list context it returns a list of |
1308
|
|
|
|
|
|
|
matching L<HTML::Element> objects. In scalar context it returns a reference to |
1309
|
|
|
|
|
|
|
the results array. |
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
my $query = HTML::Query->new(\@args); |
1312
|
|
|
|
|
|
|
my $results = $query->query($spec); |
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
my @elements = $results->query($spec)->get_elements(); |
1315
|
|
|
|
|
|
|
my $elements = $results->query($spec)->get_elements(); |
1316
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
=head2 size() |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
Returns the number of elements in the query. |
1320
|
|
|
|
|
|
|
|
1321
|
|
|
|
|
|
|
=head2 first() |
1322
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
Returns the first element in the query. |
1324
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
my $elem = $query->first; |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
If the query is empty then an exception will be thrown. If you would rather |
1328
|
|
|
|
|
|
|
have an undefined value returned then you can use the C<try> method inherited |
1329
|
|
|
|
|
|
|
from L<Badger::Base>. This effectively wraps the call to |
1330
|
|
|
|
|
|
|
C<first()> in an C<eval> block to catch any exceptions thrown. |
1331
|
|
|
|
|
|
|
|
1332
|
|
|
|
|
|
|
my $elem = $query->try('first') || warn "no first element\n"; |
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
=head2 last() |
1335
|
|
|
|
|
|
|
|
1336
|
|
|
|
|
|
|
Similar to L<first()>, but returning the last element in the query. |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
my $elem = $query->last; |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
=head2 list() |
1341
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
Returns a list of the L<HTML::Element> objects in the query in |
1343
|
|
|
|
|
|
|
list context, or a reference to a list in scalar context. |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
my @elems = $query->list; |
1346
|
|
|
|
|
|
|
my $elems = $query->list; |
1347
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
=head2 AUTOLOAD |
1349
|
|
|
|
|
|
|
|
1350
|
|
|
|
|
|
|
The C<AUTOLOAD> method maps any other method calls to the |
1351
|
|
|
|
|
|
|
L<HTML::Element> objects in the list. When called in list |
1352
|
|
|
|
|
|
|
context it returns a list of the values returned from calling the method on |
1353
|
|
|
|
|
|
|
each element. In scalar context it returns a reference to a list of return |
1354
|
|
|
|
|
|
|
values. |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
my @text_blocks = $query->as_trimmed_text; |
1357
|
|
|
|
|
|
|
my $text_blocks = $query->as_trimmed_text; |
1358
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
=head1 KNOWN BUGS |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
=head2 Attribute Values |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
It is not possible to use C<]> in an attribute value. This is due to a |
1364
|
|
|
|
|
|
|
limitation in the parser which will be fixed RSN. |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
=head1 AUTHOR |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
Andy Wardley L<http://wardley.org> |
1369
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
=head1 MAINTAINER |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
Kevin Kamel |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
1375
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
Vivek Khera |
1377
|
|
|
|
|
|
|
Michael Peters |
1378
|
|
|
|
|
|
|
David Gray |
1379
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
=head1 COPYRIGHT |
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
Copyright (C) 2010 Andy Wardley. All Rights Reserved. |
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
1385
|
|
|
|
|
|
|
under the same terms as Perl itself. |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
=head1 SEE ALSO |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
L<HTML::Tree>, L<HTML::TreeBuilder>, |
1390
|
|
|
|
|
|
|
L<HTML::Element>, L<pQuery>, L<http://jQuery.com/> |
1391
|
|
|
|
|
|
|
|
1392
|
|
|
|
|
|
|
=cut |
1393
|
|
|
|
|
|
|
|
1394
|
|
|
|
|
|
|
# Local Variables: |
1395
|
|
|
|
|
|
|
# mode: Perl |
1396
|
|
|
|
|
|
|
# perl-indent-level: 4 |
1397
|
|
|
|
|
|
|
# indent-tabs-mode: nil |
1398
|
|
|
|
|
|
|
# End: |
1399
|
|
|
|
|
|
|
# |
1400
|
|
|
|
|
|
|
# vim: expandtab shiftwidth=4: |
|