line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::Parser::Simple; |
2
|
|
|
|
|
|
|
|
3
|
10
|
|
|
10
|
|
136980
|
use strict; |
|
10
|
|
|
|
|
22
|
|
|
10
|
|
|
|
|
300
|
|
4
|
10
|
|
|
10
|
|
47
|
use warnings; |
|
10
|
|
|
|
|
22
|
|
|
10
|
|
|
|
|
211
|
|
5
|
|
|
|
|
|
|
|
6
|
10
|
|
|
10
|
|
3708
|
use Moo; |
|
10
|
|
|
|
|
66939
|
|
|
10
|
|
|
|
|
51
|
|
7
|
|
|
|
|
|
|
|
8
|
10
|
|
|
10
|
|
19871
|
use Tree::Simple; |
|
10
|
|
|
|
|
32902
|
|
|
10
|
|
|
|
|
81
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
has block => |
11
|
|
|
|
|
|
|
( |
12
|
|
|
|
|
|
|
default => sub {return {} }, |
13
|
|
|
|
|
|
|
is => 'rw', |
14
|
|
|
|
|
|
|
); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
has current_node => |
17
|
|
|
|
|
|
|
( |
18
|
|
|
|
|
|
|
default => sub {return ''}, |
19
|
|
|
|
|
|
|
is => 'rw', |
20
|
|
|
|
|
|
|
); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
has depth => |
23
|
|
|
|
|
|
|
( |
24
|
|
|
|
|
|
|
default => sub {return 0}, |
25
|
|
|
|
|
|
|
is => 'rw', |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
has empty => |
29
|
|
|
|
|
|
|
( |
30
|
|
|
|
|
|
|
default => sub {return {} }, |
31
|
|
|
|
|
|
|
is => 'rw', |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has inline => |
35
|
|
|
|
|
|
|
( |
36
|
|
|
|
|
|
|
default => sub {return {} }, |
37
|
|
|
|
|
|
|
is => 'rw', |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
has input_file => |
41
|
|
|
|
|
|
|
( |
42
|
|
|
|
|
|
|
default => sub {return ''}, |
43
|
|
|
|
|
|
|
is => 'rw', |
44
|
|
|
|
|
|
|
); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
has node_type => |
47
|
|
|
|
|
|
|
( |
48
|
|
|
|
|
|
|
default => sub {return 'global'}, |
49
|
|
|
|
|
|
|
is => 'rw', |
50
|
|
|
|
|
|
|
); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
has output_file => |
53
|
|
|
|
|
|
|
( |
54
|
|
|
|
|
|
|
default => sub {return ''}, |
55
|
|
|
|
|
|
|
is => 'rw', |
56
|
|
|
|
|
|
|
); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
has result => |
59
|
|
|
|
|
|
|
( |
60
|
|
|
|
|
|
|
default => sub {return ''}, |
61
|
|
|
|
|
|
|
is => 'rw', |
62
|
|
|
|
|
|
|
); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
has root => |
65
|
|
|
|
|
|
|
( |
66
|
|
|
|
|
|
|
default => sub {return ''}, |
67
|
|
|
|
|
|
|
is => 'rw', |
68
|
|
|
|
|
|
|
); |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
has self_close => |
71
|
|
|
|
|
|
|
( |
72
|
|
|
|
|
|
|
default => sub {return {} }, |
73
|
|
|
|
|
|
|
is => 'rw', |
74
|
|
|
|
|
|
|
); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
has tagged_attribute => |
77
|
|
|
|
|
|
|
( |
78
|
|
|
|
|
|
|
default => sub {return {} }, |
79
|
|
|
|
|
|
|
is => 'rw', |
80
|
|
|
|
|
|
|
); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
has verbose => |
83
|
|
|
|
|
|
|
( |
84
|
|
|
|
|
|
|
default => sub {return 0}, |
85
|
|
|
|
|
|
|
is => 'rw', |
86
|
|
|
|
|
|
|
); |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
has xhtml => |
89
|
|
|
|
|
|
|
( |
90
|
|
|
|
|
|
|
default => sub {return 0}, |
91
|
|
|
|
|
|
|
is => 'rw', |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
trigger => |
94
|
|
|
|
|
|
|
sub |
95
|
|
|
|
|
|
|
{ |
96
|
|
|
|
|
|
|
my($self, $new) = @_; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
$self -> _set_tagged_attribute($new); |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
); |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
our $VERSION = '2.01'; |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# ----------------------------------------------- |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
sub BUILD |
107
|
|
|
|
|
|
|
{ |
108
|
9
|
|
|
9
|
0
|
76
|
my($self) = @_; |
109
|
|
|
|
|
|
|
|
110
|
9
|
|
|
|
|
382
|
$self -> block |
111
|
|
|
|
|
|
|
({ |
112
|
|
|
|
|
|
|
address => 1, |
113
|
|
|
|
|
|
|
applet => 1, |
114
|
|
|
|
|
|
|
blockquote => 1, |
115
|
|
|
|
|
|
|
button => 1, |
116
|
|
|
|
|
|
|
center => 1, |
117
|
|
|
|
|
|
|
dd => 1, |
118
|
|
|
|
|
|
|
del => 1, |
119
|
|
|
|
|
|
|
dir => 1, |
120
|
|
|
|
|
|
|
div => 1, |
121
|
|
|
|
|
|
|
dl => 1, |
122
|
|
|
|
|
|
|
dt => 1, |
123
|
|
|
|
|
|
|
fieldset => 1, |
124
|
|
|
|
|
|
|
form => 1, |
125
|
|
|
|
|
|
|
frameset => 1, |
126
|
|
|
|
|
|
|
hr => 1, |
127
|
|
|
|
|
|
|
iframe => 1, |
128
|
|
|
|
|
|
|
ins => 1, |
129
|
|
|
|
|
|
|
isindex => 1, |
130
|
|
|
|
|
|
|
li => 1, |
131
|
|
|
|
|
|
|
map => 1, |
132
|
|
|
|
|
|
|
menu => 1, |
133
|
|
|
|
|
|
|
noframes => 1, |
134
|
|
|
|
|
|
|
noscript => 1, |
135
|
|
|
|
|
|
|
object => 1, |
136
|
|
|
|
|
|
|
ol => 1, |
137
|
|
|
|
|
|
|
p => 1, |
138
|
|
|
|
|
|
|
pre => 1, |
139
|
|
|
|
|
|
|
script => 1, |
140
|
|
|
|
|
|
|
table => 1, |
141
|
|
|
|
|
|
|
tbody => 1, |
142
|
|
|
|
|
|
|
td => 1, |
143
|
|
|
|
|
|
|
tfoot => 1, |
144
|
|
|
|
|
|
|
th => 1, |
145
|
|
|
|
|
|
|
thead => 1, |
146
|
|
|
|
|
|
|
'tr' => 1, |
147
|
|
|
|
|
|
|
ul => 1, |
148
|
|
|
|
|
|
|
}); |
149
|
|
|
|
|
|
|
|
150
|
9
|
|
|
|
|
153
|
$self -> empty |
151
|
|
|
|
|
|
|
({ |
152
|
|
|
|
|
|
|
area => 1, |
153
|
|
|
|
|
|
|
base => 1, |
154
|
|
|
|
|
|
|
basefont => 1, |
155
|
|
|
|
|
|
|
br => 1, |
156
|
|
|
|
|
|
|
col => 1, |
157
|
|
|
|
|
|
|
embed => 1, |
158
|
|
|
|
|
|
|
frame => 1, |
159
|
|
|
|
|
|
|
hr => 1, |
160
|
|
|
|
|
|
|
img => 1, |
161
|
|
|
|
|
|
|
input => 1, |
162
|
|
|
|
|
|
|
isindex => 1, |
163
|
|
|
|
|
|
|
link => 1, |
164
|
|
|
|
|
|
|
meta => 1, |
165
|
|
|
|
|
|
|
param => 1, |
166
|
|
|
|
|
|
|
wbr => 1, |
167
|
|
|
|
|
|
|
}); |
168
|
|
|
|
|
|
|
|
169
|
9
|
|
|
|
|
263
|
$self -> inline |
170
|
|
|
|
|
|
|
({ |
171
|
|
|
|
|
|
|
a => 1, |
172
|
|
|
|
|
|
|
abbr => 1, |
173
|
|
|
|
|
|
|
acronym => 1, |
174
|
|
|
|
|
|
|
applet => 1, |
175
|
|
|
|
|
|
|
b => 1, |
176
|
|
|
|
|
|
|
basefont => 1, |
177
|
|
|
|
|
|
|
bdo => 1, |
178
|
|
|
|
|
|
|
big => 1, |
179
|
|
|
|
|
|
|
br => 1, |
180
|
|
|
|
|
|
|
button => 1, |
181
|
|
|
|
|
|
|
cite => 1, |
182
|
|
|
|
|
|
|
code => 1, |
183
|
|
|
|
|
|
|
del => 1, |
184
|
|
|
|
|
|
|
dfn => 1, |
185
|
|
|
|
|
|
|
em => 1, |
186
|
|
|
|
|
|
|
font => 1, |
187
|
|
|
|
|
|
|
i => 1, |
188
|
|
|
|
|
|
|
iframe => 1, |
189
|
|
|
|
|
|
|
img => 1, |
190
|
|
|
|
|
|
|
input => 1, |
191
|
|
|
|
|
|
|
ins => 1, |
192
|
|
|
|
|
|
|
kbd => 1, |
193
|
|
|
|
|
|
|
label => 1, |
194
|
|
|
|
|
|
|
map => 1, |
195
|
|
|
|
|
|
|
object => 1, |
196
|
|
|
|
|
|
|
'q' => 1, |
197
|
|
|
|
|
|
|
's' => 1, |
198
|
|
|
|
|
|
|
samp => 1, |
199
|
|
|
|
|
|
|
script => 1, |
200
|
|
|
|
|
|
|
select => 1, |
201
|
|
|
|
|
|
|
small => 1, |
202
|
|
|
|
|
|
|
span => 1, |
203
|
|
|
|
|
|
|
strike => 1, |
204
|
|
|
|
|
|
|
strong => 1, |
205
|
|
|
|
|
|
|
sub => 1, |
206
|
|
|
|
|
|
|
sup => 1, |
207
|
|
|
|
|
|
|
textarea => 1, |
208
|
|
|
|
|
|
|
tt => 1, |
209
|
|
|
|
|
|
|
u => 1, |
210
|
|
|
|
|
|
|
var => 1, |
211
|
|
|
|
|
|
|
}); |
212
|
|
|
|
|
|
|
|
213
|
9
|
|
|
|
|
93
|
$self -> self_close |
214
|
|
|
|
|
|
|
({ |
215
|
|
|
|
|
|
|
colgroup => 1, |
216
|
|
|
|
|
|
|
dd => 1, |
217
|
|
|
|
|
|
|
dt => 1, |
218
|
|
|
|
|
|
|
li => 1, |
219
|
|
|
|
|
|
|
options => 1, |
220
|
|
|
|
|
|
|
p => 1, |
221
|
|
|
|
|
|
|
td => 1, |
222
|
|
|
|
|
|
|
tfoot => 1, |
223
|
|
|
|
|
|
|
th => 1, |
224
|
|
|
|
|
|
|
thead => 1, |
225
|
|
|
|
|
|
|
'tr' => 1, |
226
|
|
|
|
|
|
|
}); |
227
|
|
|
|
|
|
|
|
228
|
9
|
|
|
|
|
104
|
$self -> current_node($self -> create_new_node('root', '', Tree::Simple -> ROOT) ); |
229
|
9
|
|
|
|
|
477
|
$self -> root($self -> current_node); |
230
|
|
|
|
|
|
|
|
231
|
9
|
100
|
|
|
|
61
|
if ($self -> xhtml) |
232
|
|
|
|
|
|
|
{ |
233
|
|
|
|
|
|
|
# Compared to the non-XHTML re, this has an extra ':' in the first []. |
234
|
|
|
|
|
|
|
|
235
|
2
|
|
|
|
|
9262
|
$self -> tagged_attribute |
236
|
|
|
|
|
|
|
( |
237
|
|
|
|
|
|
|
q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)# |
238
|
|
|
|
|
|
|
); |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
else |
241
|
|
|
|
|
|
|
{ |
242
|
7
|
|
|
|
|
3548
|
$self -> tagged_attribute |
243
|
|
|
|
|
|
|
( |
244
|
|
|
|
|
|
|
q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)# |
245
|
|
|
|
|
|
|
); |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
} # End of BUILD. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# ----------------------------------------------- |
251
|
|
|
|
|
|
|
# Create a new node to store the new tag. |
252
|
|
|
|
|
|
|
# Each node has metadata: |
253
|
|
|
|
|
|
|
# o attributes: The tag's attributes, as a string with N spaces as a prefix. |
254
|
|
|
|
|
|
|
# o content: The content before the tag was parsed. |
255
|
|
|
|
|
|
|
# o name: The HTML tag. |
256
|
|
|
|
|
|
|
# o node_type: This holds 'global' before '<head>' and between '</head>'
257
|
|
|
|
|
|
|
# and '<body>', and after '</body>'. It holds 'head' from
258
|
|
|
|
|
|
|
# '<head>' to '</head>', and holds 'body' from '<body>' to
259
|
|
|
|
|
|
|
# '</body>'. It's just there in case you need it. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub create_new_node |
262
|
|
|
|
|
|
|
{ |
263
|
82
|
|
|
82
|
0
|
146
|
my($self, $name, $attributes, $parent) = @_; |
264
|
82
|
|
|
|
|
542
|
my($metadata) = |
265
|
|
|
|
|
|
|
{ |
266
|
|
|
|
|
|
|
attributes => $attributes, |
267
|
|
|
|
|
|
|
content => [], |
268
|
|
|
|
|
|
|
depth => $self -> depth, |
269
|
|
|
|
|
|
|
name => $name, |
270
|
|
|
|
|
|
|
node_type => $self -> node_type, |
271
|
|
|
|
|
|
|
}; |
272
|
|
|
|
|
|
|
|
273
|
82
|
|
|
|
|
376
|
return Tree::Simple -> new($metadata, $parent); |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
} # End of create_new_node. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# ----------------------------------------------- |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
sub handle_comment |
280
|
|
|
|
|
|
|
{ |
281
|
2
|
|
|
2
|
0
|
5
|
my($self, $s) = @_; |
282
|
|
|
|
|
|
|
|
283
|
2
|
|
|
|
|
5
|
$self -> handle_content($s); |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
} # End of handle_comment. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# ----------------------------------------------- |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
sub handle_content |
290
|
|
|
|
|
|
|
{ |
291
|
111
|
|
|
111
|
0
|
230
|
my($self, $s) = @_; |
292
|
111
|
|
|
|
|
380
|
my($count) = $self -> current_node -> getChildCount; |
293
|
111
|
|
|
|
|
745
|
my($metadata) = $self -> current_node -> getNodeValue; |
294
|
111
|
|
|
|
|
862
|
$$metadata{'content'}[$count] .= $s; |
295
|
|
|
|
|
|
|
|
296
|
111
|
|
|
|
|
353
|
$self -> current_node -> setNodeValue($metadata); |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
} # End of handle_content. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# ----------------------------------------------- |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
sub handle_doctype |
303
|
|
|
|
|
|
|
{ |
304
|
3
|
|
|
3
|
0
|
9
|
my($self, $s) = @_; |
305
|
|
|
|
|
|
|
|
306
|
3
|
|
|
|
|
9
|
$self -> handle_content($s); |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
} # End of handle_doctype. |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# ----------------------------------------------- |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
sub handle_end_tag |
313
|
|
|
|
|
|
|
{ |
314
|
51
|
|
|
51
|
0
|
80
|
my($self, $tag_name) = @_; |
315
|
|
|
|
|
|
|
|
316
|
51
|
100
|
100
|
|
|
261
|
$self -> node_type('global') if ( ($tag_name eq 'head') || ($tag_name eq 'body') ); |
317
|
|
|
|
|
|
|
|
318
|
51
|
50
|
|
|
|
63
|
if (! ${$self -> empty}{$tag_name}) |
|
51
|
|
|
|
|
193
|
|
319
|
|
|
|
|
|
|
{ |
320
|
51
|
|
|
|
|
178
|
$self -> current_node($self -> current_node -> getParent); |
321
|
51
|
|
|
|
|
388
|
$self -> depth($self -> depth - 1); |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
} # End of handle_end_tag. |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# ----------------------------------------------- |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
sub handle_start_tag |
329
|
|
|
|
|
|
|
{ |
330
|
73
|
|
|
73
|
0
|
153
|
my($self, $tag_name, $attributes, $unary) = @_; |
331
|
|
|
|
|
|
|
|
332
|
73
|
|
|
|
|
186
|
$self -> depth($self -> depth + 1); |
333
|
|
|
|
|
|
|
|
334
|
73
|
100
|
|
|
|
221
|
if ($tag_name eq 'head') |
|
|
100
|
|
|
|
|
|
335
|
|
|
|
|
|
|
{ |
336
|
6
|
|
|
|
|
22
|
$self -> node_type('head'); |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
elsif ($tag_name eq 'body') |
339
|
|
|
|
|
|
|
{ |
340
|
8
|
|
|
|
|
27
|
$self -> node_type('body'); |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
73
|
|
|
|
|
203
|
my($node) = $self -> create_new_node($tag_name, $attributes, $self -> current_node); |
344
|
|
|
|
|
|
|
|
345
|
73
|
100
|
|
|
|
9488
|
$self -> current_node($node) if (! ${$self -> empty}{$tag_name}); |
|
73
|
|
|
|
|
456
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
} # End of handle_start_tag. |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# ----------------------------------------------- |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
sub handle_xml_declaration |
352
|
|
|
|
|
|
|
{ |
353
|
2
|
|
|
2
|
0
|
7
|
my($self, $s) = @_; |
354
|
|
|
|
|
|
|
|
355
|
2
|
|
|
|
|
9
|
$self -> handle_content($s); |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
} # End of handle_xml_declaration. |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# ----------------------------------------------- |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
sub log |
362
|
|
|
|
|
|
|
{ |
363
|
3
|
|
|
3
|
1
|
6
|
my($self, $msg) = @_; |
364
|
|
|
|
|
|
|
|
365
|
3
|
50
|
|
|
|
19
|
print STDERR "$msg\n" if ($self -> verbose); |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
} # End of log. |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# ----------------------------------------------- |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub parse |
372
|
|
|
|
|
|
|
{ |
373
|
9
|
|
|
9
|
1
|
709
|
my($self, $html) = @_; |
374
|
9
|
|
|
|
|
22
|
my($original) = $html; |
375
|
9
|
|
|
|
|
124
|
my(%special) = |
376
|
|
|
|
|
|
|
( |
377
|
|
|
|
|
|
|
script => 1, |
378
|
|
|
|
|
|
|
style => 1, |
379
|
|
|
|
|
|
|
); |
380
|
9
|
|
|
|
|
52
|
my($tagged_attribute) = $self -> tagged_attribute; |
381
|
|
|
|
|
|
|
|
382
|
9
|
|
|
|
|
15
|
my($in_content); |
383
|
|
|
|
|
|
|
my($offset); |
384
|
0
|
|
|
|
|
0
|
my(@stack, $s); |
385
|
|
|
|
|
|
|
|
386
|
9
|
|
|
|
|
42
|
for (; $html;) |
387
|
|
|
|
|
|
|
{ |
388
|
235
|
|
|
|
|
284
|
$in_content = 1; |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
# Make sure we're not in a script or style element. |
391
|
|
|
|
|
|
|
|
392
|
235
|
50
|
66
|
|
|
1172
|
if (! $stack[$#stack] || ! $special{$stack[$#stack]}) |
393
|
|
|
|
|
|
|
{ |
394
|
|
|
|
|
|
|
# Rearrange order of testing so rarer possibilities are further down. |
395
|
|
|
|
|
|
|
# Is it an end tag? |
396
|
|
|
|
|
|
|
|
397
|
235
|
|
|
|
|
425
|
$s = substr($html, 0, 2); |
398
|
|
|
|
|
|
|
|
399
|
235
|
100
|
|
|
|
534
|
if ($s eq '') |
400
|
|
|
|
|
|
|
{ |
401
|
51
|
50
|
|
|
|
223
|
if ($html =~ /^(<\/(\w+)[^>]*>)/) |
402
|
|
|
|
|
|
|
{ |
403
|
51
|
|
|
|
|
104
|
substr($html, 0, length $1) = ''; |
404
|
51
|
|
|
|
|
59
|
$in_content = 0; |
405
|
|
|
|
|
|
|
|
406
|
51
|
|
|
|
|
162
|
$self -> parse_end_tag($2, \@stack); |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# Is it a start tag? |
411
|
|
|
|
|
|
|
|
412
|
235
|
100
|
|
|
|
474
|
if ($in_content) |
413
|
|
|
|
|
|
|
{ |
414
|
184
|
100
|
|
|
|
408
|
if (substr($html, 0, 1) eq '<') |
415
|
|
|
|
|
|
|
{ |
416
|
|
|
|
|
|
|
# Use lc() since tags are stored in this module in lower-case. |
417
|
|
|
|
|
|
|
|
418
|
81
|
100
|
|
|
|
7810
|
if (lc($html) =~ /$tagged_attribute/) |
419
|
|
|
|
|
|
|
{ |
420
|
|
|
|
|
|
|
# Since the regexp matched, save matches in lower-case. |
421
|
|
|
|
|
|
|
# Then, re-match to get attributes in original case. |
422
|
|
|
|
|
|
|
# In each case: |
423
|
|
|
|
|
|
|
# o $1 => The whole string which matched. |
424
|
|
|
|
|
|
|
# o $2 => The tag name. |
425
|
|
|
|
|
|
|
# o $3 => The attributes. |
426
|
|
|
|
|
|
|
# o $4 => The trailing / if any (aka $unary). |
427
|
|
|
|
|
|
|
# But we have to lower-case the prefix '<$tag' of the string |
428
|
|
|
|
|
|
|
# to ensure the 2nd regexp actually matches. |
429
|
|
|
|
|
|
|
|
430
|
73
|
|
|
|
|
293
|
my(@match) = ($2, $3, $4); |
431
|
73
|
|
|
|
|
236
|
substr($html, 0, length($2) + 1) = lc substr($html, 0, length($2) + 1); |
432
|
|
|
|
|
|
|
|
433
|
73
|
50
|
|
|
|
1028
|
if ($html =~ /$tagged_attribute/) |
434
|
|
|
|
|
|
|
{ |
435
|
73
|
|
|
|
|
183
|
substr($html, 0, length $1) = ''; |
436
|
73
|
|
|
|
|
79
|
$in_content = 0; |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Here we use $3 from the 2nd match to get the attributes in the original case. |
439
|
73
|
|
|
|
|
247
|
$self -> parse_start_tag($match[0], $3, $match[2], \@stack); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# Is it a comment? |
446
|
|
|
|
|
|
|
|
447
|
235
|
100
|
|
|
|
481
|
if ($in_content) |
448
|
|
|
|
|
|
|
{ |
449
|
111
|
|
|
|
|
159
|
$s = substr($html, 0, 4); |
450
|
|
|
|
|
|
|
|
451
|
111
|
100
|
|
|
|
230
|
if ($s eq ''); |
454
|
|
|
|
|
|
|
|
455
|
2
|
50
|
|
|
|
6
|
if ($offset >= 0) |
456
|
|
|
|
|
|
|
{ |
457
|
2
|
|
|
|
|
9
|
$self -> handle_comment(substr($html, 0, ($offset + 3) ) ); |
458
|
|
|
|
|
|
|
|
459
|
2
|
|
|
|
|
13
|
substr($html, 0, $offset + 3) = ''; |
460
|
2
|
|
|
|
|
3
|
$in_content = 0; |
461
|
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# Is it a doctype? |
466
|
|
|
|
|
|
|
|
467
|
235
|
100
|
|
|
|
444
|
if ($in_content) |
468
|
|
|
|
|
|
|
{ |
469
|
109
|
|
|
|
|
150
|
$s = substr($html, 0, 9); |
470
|
|
|
|
|
|
|
|
471
|
109
|
100
|
|
|
|
228
|
if ($s eq '
|
472
|
|
|
|
|
|
|
{ |
473
|
3
|
|
|
|
|
7
|
$offset = index($html, '>'); |
474
|
|
|
|
|
|
|
|
475
|
3
|
50
|
|
|
|
11
|
if ($offset >= 0) |
476
|
|
|
|
|
|
|
{ |
477
|
3
|
|
|
|
|
17
|
$self -> handle_doctype(substr($html, 0, ($offset + 1) ) ); |
478
|
|
|
|
|
|
|
|
479
|
3
|
|
|
|
|
24
|
substr($html, 0, $offset + 1) = ''; |
480
|
3
|
|
|
|
|
14
|
$in_content = 0; |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
} |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# Is it an XML declaration? |
486
|
|
|
|
|
|
|
|
487
|
235
|
100
|
100
|
|
|
5767
|
if ($self -> xhtml && $in_content) |
488
|
|
|
|
|
|
|
{ |
489
|
30
|
|
|
|
|
271
|
$s = substr($html, 0, 5); |
490
|
|
|
|
|
|
|
|
491
|
30
|
100
|
|
|
|
67
|
if ($s eq '
|
492
|
|
|
|
|
|
|
{ |
493
|
2
|
|
|
|
|
5
|
$offset = index($html, '?>'); |
494
|
|
|
|
|
|
|
|
495
|
2
|
50
|
|
|
|
7
|
if ($offset >= 0) |
496
|
|
|
|
|
|
|
{ |
497
|
2
|
|
|
|
|
14
|
$self -> handle_xml_declaration(substr($html, 0, ($offset + 2) ) ); |
498
|
|
|
|
|
|
|
|
499
|
2
|
|
|
|
|
19
|
substr($html, 0, $offset + 2) = ''; |
500
|
2
|
|
|
|
|
4
|
$in_content = 0; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
235
|
100
|
|
|
|
1661
|
if ($in_content) |
506
|
|
|
|
|
|
|
{ |
507
|
104
|
|
|
|
|
169
|
$offset = index($html, '<'); |
508
|
|
|
|
|
|
|
|
509
|
104
|
100
|
|
|
|
212
|
if ($offset < 0) |
510
|
|
|
|
|
|
|
{ |
511
|
7
|
|
|
|
|
22
|
$self -> handle_content($html); |
512
|
|
|
|
|
|
|
|
513
|
7
|
|
|
|
|
40
|
$html = ''; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
else |
516
|
|
|
|
|
|
|
{ |
517
|
97
|
|
|
|
|
303
|
$self -> handle_content(substr($html, 0, $offset) ); |
518
|
|
|
|
|
|
|
|
519
|
97
|
|
|
|
|
634
|
substr($html, 0, $offset) = ''; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
else |
524
|
|
|
|
|
|
|
{ |
525
|
0
|
|
|
|
|
0
|
my($re) = "(.*)<\/$stack[$#stack]\[^>]*>"; |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# lc() is needed because only lc tag names are pushed onto the stack. |
528
|
|
|
|
|
|
|
|
529
|
0
|
0
|
|
|
|
0
|
if (lc($html) =~ /$re/s) |
530
|
|
|
|
|
|
|
{ |
531
|
0
|
|
|
|
|
0
|
my($text) = $1; |
532
|
0
|
|
|
|
|
0
|
$text =~ s//$1/g; |
533
|
0
|
|
|
|
|
0
|
$text =~ s//$1/g; |
534
|
|
|
|
|
|
|
|
535
|
0
|
|
|
|
|
0
|
$self -> handle_content($text); |
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
|
538
|
0
|
|
|
|
|
0
|
$self -> parse_end_tag($stack[$#stack], \@stack); |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
|
541
|
235
|
100
|
|
|
|
527
|
if ($html eq $original) |
542
|
|
|
|
|
|
|
{ |
543
|
1
|
|
|
|
|
2
|
my($msg) = 'Parse error. '; |
544
|
1
|
|
|
|
|
5
|
my($parent) = $self -> current_node -> getParent; |
545
|
|
|
|
|
|
|
|
546
|
1
|
|
|
|
|
5
|
my($metadata); |
547
|
|
|
|
|
|
|
|
548
|
1
|
50
|
33
|
|
|
20
|
if ($parent && $parent -> can('getNodeValue') ) |
549
|
|
|
|
|
|
|
{ |
550
|
1
|
|
|
|
|
4
|
$metadata = $parent -> getNodeValue; |
551
|
1
|
|
|
|
|
7
|
$msg .= "Parent tag: <$$metadata{'name'}>. "; |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
|
554
|
1
|
|
|
|
|
5
|
$metadata = $self -> current_node -> getNodeValue; |
555
|
1
|
|
|
|
|
12
|
$msg .= "Current tag: <$$metadata{'name'}>. Next 100 chars: " . substr($html, 0, 100); |
556
|
|
|
|
|
|
|
|
557
|
1
|
|
|
|
|
11
|
die "$msg\n"; |
558
|
|
|
|
|
|
|
} |
559
|
|
|
|
|
|
|
|
560
|
234
|
|
|
|
|
631
|
$original = $html; |
561
|
|
|
|
|
|
|
} |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
# Clean up any remaining tags. |
564
|
|
|
|
|
|
|
|
565
|
8
|
|
|
|
|
32
|
$self -> parse_end_tag('', \@stack); |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
# Return the invocant to allow method chaining. |
568
|
|
|
|
|
|
|
|
569
|
8
|
|
|
|
|
47
|
return $self; |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
} # End of parse. |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# ----------------------------------------------- |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
sub parse_end_tag |
576
|
|
|
|
|
|
|
{ |
577
|
59
|
|
|
59
|
0
|
110
|
my($self, $tag_name, $stack) = @_; |
578
|
59
|
|
|
|
|
82
|
$tag_name = lc $tag_name; |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# Find the closest opened tag of the same name. |
581
|
|
|
|
|
|
|
|
582
|
59
|
|
|
|
|
69
|
my($pos); |
583
|
|
|
|
|
|
|
|
584
|
59
|
100
|
|
|
|
165
|
if ($tag_name) |
585
|
|
|
|
|
|
|
{ |
586
|
51
|
|
|
|
|
154
|
for ($pos = $#$stack; $pos >= 0; $pos--) |
587
|
|
|
|
|
|
|
{ |
588
|
51
|
50
|
|
|
|
136
|
last if ($$stack[$pos] eq $tag_name); |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
} |
591
|
|
|
|
|
|
|
else |
592
|
|
|
|
|
|
|
{ |
593
|
8
|
|
|
|
|
14
|
$pos = 0; |
594
|
|
|
|
|
|
|
} |
595
|
|
|
|
|
|
|
|
596
|
59
|
50
|
|
|
|
136
|
if ($pos >= 0) |
597
|
|
|
|
|
|
|
{ |
598
|
|
|
|
|
|
|
# Close all the open tags, up the stack. |
599
|
|
|
|
|
|
|
|
600
|
59
|
|
|
|
|
141
|
my($count) = 0; |
601
|
|
|
|
|
|
|
|
602
|
59
|
|
|
|
|
157
|
for (my($i) = $#$stack; $i >= $pos; $i--) |
603
|
|
|
|
|
|
|
{ |
604
|
51
|
|
|
|
|
66
|
$count++; |
605
|
|
|
|
|
|
|
|
606
|
51
|
|
|
|
|
123
|
$self -> handle_end_tag($$stack[$i]); |
607
|
|
|
|
|
|
|
} |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# Remove the open elements from the stack. |
610
|
|
|
|
|
|
|
# Does not work: $#$stack = $pos. Could use splice(). |
611
|
|
|
|
|
|
|
|
612
|
59
|
|
|
|
|
230
|
pop @$stack for ($count); |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
} # End of parse_end_tag. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# ----------------------------------------------- |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
sub parse_file |
620
|
|
|
|
|
|
|
{ |
621
|
0
|
|
|
0
|
1
|
0
|
my($self, $input_file_name, $output_file_name) = @_; |
622
|
0
|
|
0
|
|
|
0
|
$input_file_name ||= $self -> input_file; |
623
|
0
|
|
0
|
|
|
0
|
$output_file_name ||= $self -> output_file; |
624
|
|
|
|
|
|
|
|
625
|
0
|
|
|
|
|
0
|
$self -> input_file($input_file_name); |
626
|
0
|
|
|
|
|
0
|
$self -> output_file($output_file_name); |
627
|
0
|
|
|
|
|
0
|
$self -> log("Reading $input_file_name"); |
628
|
|
|
|
|
|
|
|
629
|
0
|
0
|
|
|
|
0
|
open(INX, $input_file_name) || die "Can't open($input_file_name): $!\n"; |
630
|
0
|
|
|
|
|
0
|
my($html); |
631
|
0
|
|
|
|
|
0
|
read(INX, $html, -s INX); |
632
|
0
|
|
|
|
|
0
|
close INX; |
633
|
|
|
|
|
|
|
|
634
|
0
|
0
|
|
|
|
0
|
die "Can't read($input_file_name): $!\n" if (! defined $html); |
635
|
|
|
|
|
|
|
|
636
|
0
|
|
|
|
|
0
|
$self -> log('Parsing'); |
637
|
|
|
|
|
|
|
|
638
|
0
|
|
|
|
|
0
|
$self -> parse($html); |
639
|
|
|
|
|
|
|
|
640
|
0
|
|
|
|
|
0
|
$self -> log('Traversing'); |
641
|
|
|
|
|
|
|
|
642
|
0
|
|
|
|
|
0
|
$self -> traverse($self -> root); |
643
|
|
|
|
|
|
|
|
644
|
0
|
|
|
|
|
0
|
$self -> log("Writing $output_file_name"); |
645
|
|
|
|
|
|
|
|
646
|
0
|
0
|
|
|
|
0
|
open(OUT, "> $output_file_name") || die "Can't open(> $output_file_name): $!\n"; |
647
|
0
|
|
|
|
|
0
|
print OUT $self -> result; |
648
|
0
|
|
|
|
|
0
|
close OUT; |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# Return the invocant to allow method chaining. |
651
|
|
|
|
|
|
|
|
652
|
0
|
|
|
|
|
0
|
return $self; |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
} # End of parse_file. |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# ----------------------------------------------- |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
sub parse_start_tag |
659
|
|
|
|
|
|
|
{ |
660
|
73
|
|
|
73
|
0
|
165
|
my($self, $tag_name, $attributes, $unary, $stack) = @_; |
661
|
73
|
|
|
|
|
98
|
$tag_name = lc $tag_name; |
662
|
|
|
|
|
|
|
|
663
|
73
|
100
|
|
|
|
89
|
if (${$self -> block}{$tag_name}) |
|
73
|
|
|
|
|
286
|
|
664
|
|
|
|
|
|
|
{ |
665
|
23
|
|
33
|
|
|
74
|
for (; $#$stack >= 0 && ${$self -> inline}{$$stack[$#$stack]};) |
|
23
|
|
|
|
|
123
|
|
666
|
|
|
|
|
|
|
{ |
667
|
0
|
|
|
|
|
0
|
$self -> parse_end_tag($$stack[$#$stack], $stack); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
|
671
|
73
|
50
|
66
|
|
|
95
|
if (${$self -> self_close}{$tag_name} && ($$stack[$#$stack] eq $tag_name) ) |
|
73
|
|
|
|
|
302
|
|
672
|
|
|
|
|
|
|
{ |
673
|
0
|
|
|
|
|
0
|
$self -> parse_end_tag($tag_name, $stack); |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
|
676
|
73
|
|
66
|
|
|
89
|
$unary = ${$self -> empty}{$tag_name} || $unary; |
677
|
|
|
|
|
|
|
|
678
|
73
|
100
|
|
|
|
217
|
push @$stack, $tag_name if (! $unary); |
679
|
|
|
|
|
|
|
|
680
|
73
|
|
|
|
|
176
|
$self -> handle_start_tag($tag_name, $attributes, $unary); |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
} # End of parse_start_tag. |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
# ----------------------------------------------- |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
sub _set_tagged_attribute |
687
|
|
|
|
|
|
|
{ |
688
|
2
|
|
|
2
|
|
6
|
my($self, $new, $old) = @_; |
689
|
|
|
|
|
|
|
|
690
|
2
|
50
|
|
|
|
7
|
if ($new) |
691
|
|
|
|
|
|
|
{ |
692
|
2
|
|
|
|
|
71
|
$self -> tagged_attribute |
693
|
|
|
|
|
|
|
( |
694
|
|
|
|
|
|
|
# Compared to the non-XHTML re, this has an extra ':' in the first []. |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)# |
697
|
|
|
|
|
|
|
); |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
else |
700
|
|
|
|
|
|
|
{ |
701
|
0
|
|
|
|
|
0
|
$self -> tagged_attribute |
702
|
|
|
|
|
|
|
( |
703
|
|
|
|
|
|
|
q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)# |
704
|
|
|
|
|
|
|
); |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
} # End of _set_tagged_attribute. |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
# ----------------------------------------------- |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
sub traverse |
712
|
|
|
|
|
|
|
{ |
713
|
51
|
|
|
51
|
1
|
86
|
my($self, $node) = @_; |
714
|
51
|
|
|
|
|
131
|
my(@child) = $node -> getAllChildren; |
715
|
51
|
|
|
|
|
333
|
my($metadata) = $node -> getNodeValue; |
716
|
51
|
|
|
|
|
285
|
my($content) = $$metadata{'content'}; |
717
|
51
|
|
|
|
|
77
|
my($name) = $$metadata{'name'}; |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
# Special check to avoid printing '' when we still need to output |
720
|
|
|
|
|
|
|
# the content of the root, e.g. the DOCTYPE. |
721
|
|
|
|
|
|
|
|
722
|
51
|
100
|
|
|
|
255
|
$self -> result($self -> result . "<$name$$metadata{'attributes'}>") if ($name ne 'root'); |
723
|
|
|
|
|
|
|
|
724
|
51
|
|
|
|
|
62
|
my($index); |
725
|
|
|
|
|
|
|
my($s); |
726
|
|
|
|
|
|
|
|
727
|
51
|
|
|
|
|
93
|
for $index (0 .. $#child) |
728
|
|
|
|
|
|
|
{ |
729
|
45
|
100
|
100
|
|
|
303
|
$self -> result($self -> result . ($index <= $#$content && defined($$content[$index]) ? $$content[$index] : '') ); |
730
|
45
|
|
|
|
|
198
|
$self -> traverse($child[$index]); |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
# Output the content after the last child node has been closed, |
734
|
|
|
|
|
|
|
# but before the current node is closed. |
735
|
|
|
|
|
|
|
|
736
|
51
|
|
|
|
|
81
|
$index = $#child + 1; |
737
|
|
|
|
|
|
|
|
738
|
51
|
100
|
66
|
|
|
330
|
$self -> result($self -> result . ($index <= $#$content && defined($$content[$index]) ? $$content[$index] : '') ); |
739
|
51
|
100
|
100
|
|
|
60
|
$self -> result($self -> result . "$name>") if (! ${$self -> empty}{$name} && ($name ne 'root') ); |
|
51
|
|
|
|
|
370
|
|
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
# Return the invocant to allow method chaining. |
742
|
|
|
|
|
|
|
|
743
|
51
|
|
|
|
|
164
|
return $self; |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
} # End of traverse. |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
# ----------------------------------------------- |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
1; |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
=head1 NAME |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
HTML::Parser::Simple - Parse nice HTML files without needing a compiler |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=head1 Synopsis |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
#!/usr/bin/env perl |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
use strict; |
760
|
|
|
|
|
|
|
use warnings; |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
use HTML::Parser::Simple; |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
# ------------------------- |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
# Method 1: |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
my($p) = HTML::Parser::Simple -> new |
769
|
|
|
|
|
|
|
( |
770
|
|
|
|
|
|
|
input_file => 'data/s.1.html', |
771
|
|
|
|
|
|
|
output_file => 'data/s.2.html', |
772
|
|
|
|
|
|
|
); |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
$p -> parse_file; |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
# Method 2: |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
my($p) = HTML::Parser::Simple -> new; |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
$p -> parse_file('data/s.1.html', 'data/s.2.html'); |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
# Method 3: |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
my($p) = HTML::Parser::Simple -> new; |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
print $p -> parse('...') -> traverse($p -> root) -> result; |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
Of course, these can be abbreviated by using method chaining. E.g. Method 2 could be: |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
HTML::Parser::Simple -> new -> parse_file('data/s.1.html', 'data/s.2.html'); |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
See scripts/parse.html.pl and scripts/parse.xhtml.pl. |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=head1 Description |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
C<HTML::Parser::Simple> is a pure Perl module. |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
It parses HTML V 4 files, and generates a tree of nodes, with 1 node per HTML tag. |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
The data associated with each node is documented in the L</FAQ>. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
See also L and L. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=head1 Distributions |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
This module is available as a Unix-style distro (*.tgz). |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
See L for details. |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
See L for |
811
|
|
|
|
|
|
|
help on unpacking and installing. |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
=head1 Constructor and initialization |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
new(...) returns an object of type C<HTML::Parser::Simple>. |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
This is the class constructor. |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
Usage: C<< HTML::Parser::Simple -> new >>. |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
This method takes a hash of options. |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
Call C<< new() >> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>. |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
Available options (each one of which is also a method): |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
=over 4 |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
=item o input_file => $a_file_name |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
This takes the file name, including the path, of the input file. |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
Default: '' (the empty string). |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
=item o output_file => $a_file_name |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
This takes the file name, including the path, of the output file. |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
Default: '' (the empty string). |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=item o verbose => $Boolean |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
This takes either a 0 or a 1. |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
Write more or less progress messages. |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
Default: 0. |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=item o xhtml => $Boolean |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
This takes either a 0 or a 1. |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
0 means do not accept an XML declaration, such as |
854
|
|
|
|
|
|
|
at the start of the input file, and some other XHTML features, explained next. |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
1 means accept XHTML input. |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
Default: 0. |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
The only XHTML changes to this code, so far, are: |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
=over 4 |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
=item o Accept the XML declaration |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
E.g.: . |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
=item o Accept attribute names containing the ':' char |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
E.g.: . |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
=back |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=back |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
=head1 Methods |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=head2 block() |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
Returns a hashref where the keys are the names of block-level HTML tags. |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
The corresponding values in the hashref are just 1. |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
Typical keys: address, form, p, table, tr. |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
Note: Some keys, e.g. tr, are also returned by L</self_close()>. |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
=head2 current_node() |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
Returns the L<Tree::Simple> object which the parser calls the current node. |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
=head2 depth() |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
Returns the nesting depth of the current tag. |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
The method is just here in case you need it. |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
=head2 empty() |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
Returns a hashref where the keys are the names of HTML tags of type empty. |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
The corresponding values in the hashref are just 1. |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
Typical keys: area, base, input, wbr. |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
=head2 inline() |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
Returns a hashref where the keys are the names of HTML tags of type inline. |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
The corresponding values in the hashref are just 1. |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
Typical keys: a, em, img, textarea. |
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
=head2 input_file($in_file_name) |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
Gets or sets the input file name used by L. |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
Note: The parameters passed in to L, take precedence over the |
919
|
|
|
|
|
|
|
I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over the internal values set with |
920
|
|
|
|
|
|
|
C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>. |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
'input_file' is a parameter to L. See L for details. |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
=head2 log($msg) |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
Print $msg to STDERR if C<< new() >> was called as C<< new(verbose => 1) >>, or if C<< $p -> verbose(1) >> |
927
|
|
|
|
|
|
|
was called. |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
Otherwise, print nothing. |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
=head2 new() |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
This is the constructor. See L for details. |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
=head2 node_type() |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
Returns the type of the most recently created node, I<global>, I<head>, or I<body>. |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
See the first question in the L</FAQ> for details. |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=head2 output_file($out_file_name) |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
Gets or sets the output file name used by L. |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Note: The parameters passed in to L, take precedence over the |
946
|
|
|
|
|
|
|
I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over the internal values set with |
947
|
|
|
|
|
|
|
C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>. |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
'output_file' is a parameter to L. See L for details. |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
=head2 parse($html) |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
Returns the invocant. Thus C<< $p -> parse >> returns $p. This allows for method chaining. See the L. |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
Parses the string of HTML in $html, and builds a tree of nodes. |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
After calling C<< $p -> parse($html) >>, you must call C<< $p -> traverse($p -> root) >> before calling |
958
|
|
|
|
|
|
|
C<< $p -> result >>. |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
Alternately, use C<< $p -> parse_file >>, which calls all these methods for you. |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
Note: C<< parse() >> may be called directly or via C<< parse_file() >>. |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
=head2 parse_file($input_file_name, $output_file_name) |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
Returns the invocant. Thus C<< $p -> parse_file >> returns $p. This allows for method chaining. See the L. |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
Parses the HTML in the input file, and writes the result to the output file. |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
C<< parse_file() >> calls L</parse($html)> and L</traverse($node)>, using C<< $p -> root >> for $node. |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
Note: The parameters passed in to C<< parse_file($input_file_name, $output_file_name) >>, take precedence over the |
973
|
|
|
|
|
|
|
I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over the internal values set with |
974
|
|
|
|
|
|
|
C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>. |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
Lastly, the parameters passed in to C<< parse_file($input_file_name, $output_file_name) >> are used to update |
977
|
|
|
|
|
|
|
the internal values set with the I<input_file> and I<output_file> parameters passed in to C<< new() >>, |
978
|
|
|
|
|
|
|
or set with calls to C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>. |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
=head2 result() |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
Returns the string which is the result of the parse. |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
See scripts/parse.html.pl. |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
=head2 root() |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
Returns the L<Tree::Simple> object which the parser calls the root of the tree of nodes. |
989
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
=head2 self_close() |
991
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
Returns a hashref where the keys are the names of HTML tags of type self close. |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
The corresponding values in the hashref are just 1. |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
Typical keys: dd, dt, p, tr. |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
Note: Some keys, e.g. tr, are also returned by L</block()>. |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
=head2 tagged_attribute() |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
Returns a string to be used as a regexp, to capture tags and their optional attributes. |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
It does not return qr/$s/; it just returns $s. |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
This regexp takes one of two forms, depending on the state of the I<xhtml> option. See L</xhtml($Boolean)>. |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
The regexp has four (4) sets of capturing parentheses: |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
=over 4 |
1011
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
=item o 1 for the whole tag and attribute and trailing / combination |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
E.g.: <(....)> |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
=item o 1 for the tag itself |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
E.g.: <(img)...> |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
=item o 1 for the tag's optional attributes |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
E.g.: |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
=item o 1 for the tag's optional trailing / |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
E.g.: |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
=back |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
=head2 traverse($node) |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
Returns the invocant. Thus C<< $p -> traverse >> returns $p. This allows for method chaining. See the L. |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
Traverses the tree of nodes, starting at $node. |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
You normally call this as C<< $p -> traverse($p -> root) >>, to ensure all nodes are visited. |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
See the L for sample code. |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
Or, see scripts/traverse.file.pl, which uses L, and calls C<< traverse($node) >> |
1041
|
|
|
|
|
|
|
via L. |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
=head2 verbose($Boolean) |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Gets or sets the verbose parameter. |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
'verbose' is a parameter to L. See L for details. |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=head2 xhtml($Boolean) |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
Gets or sets the xhtml parameter. |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
If you call this after object creation, the I<trigger> feature of L<Moo> is used to call |
1054
|
|
|
|
|
|
|
L so as to correctly set the regexp which recognises xhtml. |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
'xhtml' is a parameter to L</new()>. See L</Constructor and initialization> for details. |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
=head1 FAQ |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
=head2 What is the format of the data stored in each node of the tree? |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
The data of each node is a hash ref. The keys/values of this hash ref are: |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
=over 4 |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=item o attributes |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
This is the string of HTML attributes associated with the HTML tag. |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
Attributes are stored in lower-case. |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
So, will have an attributes string of
1073
|
|
|
|
|
|
|
" align = 'center' summary = 'body'". |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
Note the leading space. |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
=item o content |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
This is an arrayref of bits and pieces of content. |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
Consider this fragment of HTML: |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
I did not say I liked debugging. |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
When parsing 'I did ', the number of child nodes (of ) is 0, since has not yet been detected. |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
So, 'I did ' is stored in the 0th element of the arrayref belonging to . |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
Likewise, 'not' is stored in the 0th element of the arrayref belonging to the node . |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
Next, ' say I ' is stored in the 1st element of the arrayref belonging to , |
1092
|
|
|
|
|
|
|
because it follows the 1st child node (). |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
Likewise, ' debugging' is stored in the 2nd element of the arrayref belonging to . |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
This way, the input string can be reproduced by successively outputting the elements of the arrayref of content |
1097
|
|
|
|
|
|
|
interspersed with the contents of the child nodes (processed recursively). |
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
Note: If you are processing this tree, never forget that there can be content after the last child node has been closed, |
1100
|
|
|
|
|
|
|
but before the current node is closed. |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
Note: The DOCTYPE declaration is stored as the 0th element of the content of the root node. |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
=item o depth |
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
The nesting depth of the tag within the document. |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
The root is at depth 0, '<html>' is at depth 1, '<head>' and '<body>' are at depth 2, and so on. |
1109
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
It's just there in case you need it. |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
=item o name |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
So, the tag '<html>' will mean the name is 'html'. |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
Tag names are stored in lower-case. |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
The root of the tree is called 'root', and holds the DOCTYPE, if any, as content. |
1119
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
The root has the node 'html' as the only child, of course. |
1121
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
=item o node_type |
1123
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
This holds 'global' before '<head>' and between '</head>' and '<body>', and after '</body>'. |
1125
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
It holds 'head' for all nodes from '<head>' to '</head>', and holds 'body' from '<body>' to '</body>'. |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
It's just there in case you need it. |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
=back |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
=head2 How are tags and attributes handled? |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
Tags are stored in lower-case, in a tree managed by L<Tree::Simple>. |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
Attributes are stored in the same case as in the original HTML. |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
The root of the tree is returned by L</root()>. |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
=head2 How are HTML comments handled? |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
They are treated as content. This includes the prefix '<!--' and the suffix '-->'. |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
=head2 How is DOCTYPE handled? |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
It is treated as content belonging to the root of the tree. |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
=head2 How is the XML declaration handled? |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
It is treated as content belonging to the root of the tree. |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=head2 Does this module handle all HTML pages? |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
No, never. |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=head2 Which versions of HTML does this module handle? |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
Up to V 4. |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
=head2 What do I do if this module does not handle my HTML page? |
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
Make yourself a nice cup of tea, and then fix your page. |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
=head2 Does this validate the HTML input? |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
No. |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
For example, if you feed in a HTML page without the title tag, this module does not care. |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
=head2 How do I view the output HTML? |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
There are various ways. |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
=over 4 |
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
=item o See scripts/parse.html.pl |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
=item o By installing HTML::Revelation, of course! |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
Sample output: |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
L. |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
=back |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
=head2 How do I test this module (or my file)? |
1187
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
Preferably, see the previous question, or... |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
Suggested steps: |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
Note: There are quite a few files involved. Proceed with caution. |
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
=over 4 |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
=item o Select a HTML file to test |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
Call this input.html. |
1199
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
=item o Run input.html thru reveal.pl |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
Reveal.pl ships with HTML::Revelation. |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
Call the output file output.1.html. |
1205
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
=item o Run input.html thru parse.html.pl |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
parse.html.pl ships with HTML::Parser::Simple. |
1209
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
Call the output file parsed.html. |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
=item o Run parsed.html thru reveal.pl |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
Call the output file output.2.html. |
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
=item o Compare output.1.html and output.2.html |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
If they match, or even if they don't match, you're finished. |
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
=back |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
=head2 Will you implement a 'quirks' mode to handle my special HTML file? |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
No, never. |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
Help with quirks: L. |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
=head2 Is there anything I should be aware of? |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
Yes. If your HTML file is not nice, the interpretation of tag nesting will not match |
1231
|
|
|
|
|
|
|
your preconceptions. |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
In such cases, do not seek to fix the code. Instead, fix your (faulty) preconceptions, and fix your HTML file. |
1234
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
The 'a' tag, for example, is defined to be an inline tag, but the 'div' tag is a block-level tag. |
1236
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
I don't define 'a' to be inline, others do, e.g. L and hence HTML::Tagset. |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
Inline means: |
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
NAME |
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
will I<not> be parsed as an 'a' containing a 'div'. |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
The 'a' tag will be closed before the 'div' is opened. So, the result will look like: |
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
NAME |
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
To achieve what was presumably intended, use 'span': |
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
NAME |
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
Some people (*cough* *cough*) have had to redo their entire websites due to this very problem. |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
Of course, this is just one of a vast set of possible problems. |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
You have been warned. |
1258
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
=head2 Why did you use Tree::Simple but not Tree or Tree::Fast or Tree::DAG_Node? |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
During testing, Tree::Fast crashed, so I replaced it with Tree and everything worked. Spooky. |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
Late news: Tree does not cope with an arrayref stored in the metadata, so I've switched to Tree::DAG_Node. |
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
Stop press: As an experiment I switched to Tree::Simple. Since it also works I'll just keep using it. |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
=head2 Why isn't this module called HTML::Parser::PurePerl? |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
=over 4 |
1270
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
=item o The API |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
That name sounds like a pure Perl version of the same API as used by HTML::Parser. |
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
But the API's are not, and are not meant to be, compatible. |
1276
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
=item o The tie-in |
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
Some people might falsely assume HTML::Parser can automatically fall back to HTML::Parser::PurePerl in the absence of a compiler. |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
=back |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
=head2 How do I output my own stuff while traversing the tree? |
1284
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
=over 4 |
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
=item o The sophisticated way |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
As always with OO code, sub-class! In this case, you write a new version of the traverse() method. |
1290
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
See L, for example. It overrides L. |
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
=item o The crude way |
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
Alternately, implement another method in your sub-class, e.g. process(), which recurses like traverse(). |
1296
|
|
|
|
|
|
|
Then call parse() and process(). |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
=back |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
=head2 Is the code on github? |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
Yes. See: git://github.com/ronsavage/html--parser--simple.git |
1303
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
=head2 How is the source formatted? |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
I edit with UltraEdit. That means, in general, leading 4-space tabs. |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
All vertical alignment within lines is done manually with spaces. |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
Perl::Critic is off the agenda. |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
=head2 Why did you choose Moo? |
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
For this year's (2012) Google Code-in, I had a quick look at 122 class-building classes, and decided |
1315
|
|
|
|
|
|
|
L<Moo> was suitable, given it is pure-Perl and has the trigger feature I needed. |
1316
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
See L. |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
=head1 Credits |
1320
|
|
|
|
|
|
|
|
1321
|
|
|
|
|
|
|
This Perl HTML parser has been converted from a JavaScript one written by John Resig. |
1322
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
L. |
1324
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
Well done John! |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
Note also the comments published here: |
1328
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
L. |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
=head1 Author |
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
C<HTML::Parser::Simple> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2009. |
1334
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
Home page: L. |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
=head1 Copyright |
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
Australian copyright (c) 2009 Ron Savage. |
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
All Programs of mine are 'OSI Certified Open Source Software'; |
1342
|
|
|
|
|
|
|
you can redistribute them and/or modify them under the terms of |
1343
|
|
|
|
|
|
|
The Artistic License, a copy of which is available at: |
1344
|
|
|
|
|
|
|
http://www.opensource.org/licenses/index.html |
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
=cut |
|