line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
##---------------------------------------------------------------------------- |
2
|
|
|
|
|
|
|
## HTML Object - ~/lib/HTML/Object.pm |
3
|
|
|
|
|
|
|
## Version v0.2.6 |
4
|
|
|
|
|
|
|
## Copyright(c) 2023 DEGUEST Pte. Ltd. |
5
|
|
|
|
|
|
|
## Author: Jacques Deguest <jack@deguest.jp> |
6
|
|
|
|
|
|
|
## Created 2021/04/20 |
7
|
|
|
|
|
|
|
## Modified 2023/05/18 |
8
|
|
|
|
|
|
|
## All rights reserved |
9
|
|
|
|
|
|
|
## |
10
|
|
|
|
|
|
|
## |
11
|
|
|
|
|
|
|
## This program is free software; you can redistribute it and/or modify it |
12
|
|
|
|
|
|
|
## under the same terms as Perl itself. |
13
|
|
|
|
|
|
|
##---------------------------------------------------------------------------- |
14
|
|
|
|
|
|
|
package HTML::Object; |
15
|
|
|
|
|
|
|
BEGIN |
16
|
|
|
|
|
|
|
{ |
17
|
29
|
|
|
29
|
|
330106
|
use strict; |
|
29
|
|
|
|
|
79
|
|
|
29
|
|
|
|
|
888
|
|
18
|
29
|
|
|
29
|
|
140
|
use warnings; |
|
29
|
|
|
|
|
86
|
|
|
29
|
|
|
|
|
794
|
|
19
|
29
|
|
|
29
|
|
154
|
use warnings::register; |
|
29
|
|
|
|
|
51
|
|
|
29
|
|
|
|
|
3512
|
|
20
|
29
|
|
|
29
|
|
1489
|
use parent qw( Module::Generic ); |
|
29
|
|
|
|
|
939
|
|
|
29
|
|
|
|
|
175
|
|
21
|
29
|
|
|
29
|
|
337478213
|
use vars qw( $DICT $LINK_ELEMENTS $FATAL_ERROR $GLOBAL_DOM $VERSION ); |
|
29
|
|
|
|
|
64
|
|
|
29
|
|
|
|
|
2027
|
|
22
|
29
|
|
|
29
|
|
17782
|
use curry; |
|
29
|
|
|
|
|
10443
|
|
|
29
|
|
|
|
|
1063
|
|
23
|
29
|
|
|
29
|
|
16935
|
use Devel::Confess; |
|
29
|
|
|
|
|
215590
|
|
|
29
|
|
|
|
|
155
|
|
24
|
29
|
|
|
29
|
|
2273
|
use Encode (); |
|
29
|
|
|
|
|
73
|
|
|
29
|
|
|
|
|
646
|
|
25
|
29
|
|
|
29
|
|
176
|
use Filter::Util::Call; |
|
29
|
|
|
|
|
61
|
|
|
29
|
|
|
|
|
2019
|
|
26
|
29
|
|
|
29
|
|
13823
|
use HTML::Object::Closing; |
|
29
|
|
|
|
|
138
|
|
|
29
|
|
|
|
|
539
|
|
27
|
29
|
|
|
29
|
|
28489
|
use HTML::Object::Comment; |
|
29
|
|
|
|
|
89
|
|
|
29
|
|
|
|
|
329
|
|
28
|
29
|
|
|
29
|
|
17890
|
use HTML::Object::Declaration; |
|
29
|
|
|
|
|
95
|
|
|
29
|
|
|
|
|
306
|
|
29
|
29
|
|
|
29
|
|
17751
|
use HTML::Object::Document; |
|
29
|
|
|
|
|
89
|
|
|
29
|
|
|
|
|
331
|
|
30
|
29
|
|
|
29
|
|
7339
|
use HTML::Object::Element; |
|
29
|
|
|
|
|
56
|
|
|
29
|
|
|
|
|
203
|
|
31
|
29
|
|
|
29
|
|
17117
|
use HTML::Object::Space; |
|
29
|
|
|
|
|
82
|
|
|
29
|
|
|
|
|
302
|
|
32
|
29
|
|
|
29
|
|
17492
|
use HTML::Object::Text; |
|
29
|
|
|
|
|
95
|
|
|
29
|
|
|
|
|
314
|
|
33
|
29
|
|
|
29
|
|
24719
|
use HTML::Parser; |
|
29
|
|
|
|
|
157235
|
|
|
29
|
|
|
|
|
1203
|
|
34
|
29
|
|
|
29
|
|
19454
|
use JSON; |
|
29
|
|
|
|
|
271567
|
|
|
29
|
|
|
|
|
182
|
|
35
|
29
|
|
|
29
|
|
39177
|
use Module::Generic::File qw( file ); |
|
29
|
|
|
|
|
312917390
|
|
|
29
|
|
|
|
|
539
|
|
36
|
29
|
|
|
29
|
|
12759
|
use Nice::Try; |
|
29
|
|
|
|
|
65
|
|
|
29
|
|
|
|
|
298
|
|
37
|
29
|
|
|
29
|
|
43269561
|
use Scalar::Util (); |
|
29
|
|
|
|
|
75
|
|
|
29
|
|
|
|
|
2354
|
|
38
|
29
|
|
|
29
|
|
114
|
our $VERSION = 'v0.2.6'; |
39
|
29
|
|
|
|
|
119
|
our $DICT = {}; |
40
|
29
|
|
|
|
|
64
|
our $LINK_ELEMENTS = {}; |
41
|
29
|
|
|
|
|
683
|
our $FATAL_ERROR = 0; |
42
|
|
|
|
|
|
|
}; |
43
|
|
|
|
|
|
|
|
44
|
29
|
|
|
29
|
|
180
|
use strict; |
|
29
|
|
|
|
|
55
|
|
|
29
|
|
|
|
|
892
|
|
45
|
29
|
|
|
29
|
|
154
|
use warnings; |
|
29
|
|
|
|
|
54
|
|
|
29
|
|
|
|
|
37745
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
{ |
48
|
|
|
|
|
|
|
my $me = file( __FILE__ ); |
49
|
|
|
|
|
|
|
my $path = $me->parent; |
50
|
|
|
|
|
|
|
my $dict_json = 'html_tags_dict.json'; |
51
|
|
|
|
|
|
|
my $tags_repo = $path->child( $dict_json ); |
52
|
|
|
|
|
|
|
if( $tags_repo->exists ) |
53
|
|
|
|
|
|
|
{ |
54
|
|
|
|
|
|
|
try |
55
|
|
|
|
|
|
|
{ |
56
|
|
|
|
|
|
|
my $json = $tags_repo->load_utf8 || |
57
|
|
|
|
|
|
|
die( "Unable to open html tags json dictionary \"$tags_repo\": ", $tags_repo->error, "\n" ); |
58
|
|
|
|
|
|
|
my $j = JSON->new->relaxed->utf8; |
59
|
|
|
|
|
|
|
my $hash = $j->decode( $json ); |
60
|
|
|
|
|
|
|
die( "No html tags found inside dictionary file \"$tags_repo\"\n" ) if( !scalar( keys( %{$hash->{dict}} ) ) ); |
61
|
|
|
|
|
|
|
$DICT = $hash->{dict}; |
62
|
|
|
|
|
|
|
for( keys( %$DICT ) ) |
63
|
|
|
|
|
|
|
{ |
64
|
|
|
|
|
|
|
if( exists( $DICT->{ $_ }->{link_in} ) ) |
65
|
|
|
|
|
|
|
{ |
66
|
|
|
|
|
|
|
$LINK_ELEMENTS->{ $_ } = $DICT->{ $_ }->{link_in}; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
catch( $e ) |
71
|
|
|
|
|
|
|
{ |
72
|
|
|
|
|
|
|
die( "Fatal error occurred while trying to load html tags json dictionary \"$tags_repo\": $e\n" ); |
73
|
29
|
|
|
29
|
|
247
|
} |
|
29
|
|
|
|
|
76
|
|
|
29
|
|
|
|
|
100135
|
|
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
else |
76
|
|
|
|
|
|
|
{ |
77
|
|
|
|
|
|
|
die( "Missing core file \"$dict_json\"\n" ); |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub import |
82
|
|
|
|
|
|
|
{ |
83
|
42
|
|
|
42
|
|
5112
|
my $class = shift( @_ ); |
84
|
42
|
|
|
|
|
146
|
my $hash = {}; |
85
|
42
|
|
|
|
|
297
|
for( my $i = 0; $i < scalar( @_ ); $i++ ) |
86
|
|
|
|
|
|
|
{ |
87
|
6
|
100
|
33
|
|
|
190
|
if( $_[$i] eq 'debug' || |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
88
|
|
|
|
|
|
|
$_[$i] eq 'debug_code' || |
89
|
|
|
|
|
|
|
$_[$i] eq 'debug_file' || |
90
|
|
|
|
|
|
|
$_[$i] eq 'fatal_error' || |
91
|
|
|
|
|
|
|
$_[$i] eq 'global_dom' || |
92
|
|
|
|
|
|
|
$_[$i] eq 'try_catch' ) |
93
|
|
|
|
|
|
|
{ |
94
|
2
|
|
|
|
|
20
|
$hash->{ $_[$i] } = $_[$i+1]; |
95
|
2
|
|
|
|
|
12
|
CORE::splice( @_, $i, 2 ); |
96
|
2
|
|
|
|
|
9
|
$i--; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
} |
99
|
42
|
|
|
|
|
141
|
local $Exporter::ExportLevel = 1; |
100
|
42
|
|
|
|
|
2924
|
Exporter::import( $class, @_ ); |
101
|
42
|
50
|
|
|
|
335
|
$hash->{debug} = 0 if( !CORE::exists( $hash->{debug} ) ); |
102
|
42
|
100
|
|
|
|
336
|
$hash->{global_dom} = 0 if( !CORE::exists( $hash->{global_dom} ) ); |
103
|
42
|
50
|
|
|
|
275
|
$hash->{debug_code} = 0 if( !CORE::exists( $hash->{debug_code} ) ); |
104
|
42
|
50
|
|
|
|
281
|
$hash->{fatal_error} = 0 if( !CORE::exists( $hash->{fatal_error} ) ); |
105
|
42
|
50
|
|
|
|
289
|
$hash->{try_catch} = 0 if( !CORE::exists( $hash->{try_catch} ) ); |
106
|
42
|
50
|
|
|
|
228
|
if( $hash->{fatal_error} ) |
107
|
|
|
|
|
|
|
{ |
108
|
0
|
|
|
|
|
0
|
$FATAL_ERROR = 1; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
42
|
50
|
|
|
|
196
|
if( $hash->{try_catch} ) |
112
|
|
|
|
|
|
|
{ |
113
|
|
|
|
|
|
|
# Nice::Try is among our dependency, so we can load it safely |
114
|
0
|
|
|
|
|
0
|
require Nice::Try; |
115
|
0
|
|
|
|
|
0
|
Nice::Try->export_to_level( 1, @_ ); |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
42
|
100
|
|
|
|
1268
|
if( $hash->{global_dom} ) |
119
|
|
|
|
|
|
|
{ |
120
|
2
|
|
33
|
|
|
51
|
Filter::Util::Call::filter_add( bless( $hash => ( ref( $class ) || $class ) ) ); |
121
|
2
|
|
|
|
|
2133
|
require HTML::Object::XQuery; |
122
|
2
|
|
|
|
|
504
|
HTML::Object::XQuery->export_to_level( 1, @_ ); |
123
|
|
|
|
|
|
|
# Same as Firefox, Chrome or Safari do: default dom for blank page |
124
|
2
|
|
|
|
|
198
|
our $GLOBAL_DOM = __PACKAGE__->new( debug => $hash->{debug} )->parse( <<EOT ); |
125
|
|
|
|
|
|
|
<html><head></head><body></body></html> |
126
|
|
|
|
|
|
|
EOT |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub filter |
131
|
|
|
|
|
|
|
{ |
132
|
2
|
|
|
2
|
1
|
875
|
my( $self ) = @_ ; |
133
|
2
|
|
|
|
|
6
|
my( $status, $last_line ); |
134
|
2
|
|
|
|
|
5
|
my $line = 0; |
135
|
2
|
|
|
|
|
12
|
my $code = ''; |
136
|
2
|
50
|
|
|
|
13
|
if( !$self->{global_dom} ) |
137
|
|
|
|
|
|
|
{ |
138
|
0
|
|
|
|
|
0
|
Filter::Util::Call::filter_del(); |
139
|
0
|
|
|
|
|
0
|
$status = 1; |
140
|
0
|
|
|
|
|
0
|
return( $status ); |
141
|
|
|
|
|
|
|
} |
142
|
2
|
|
|
|
|
60
|
while( $status = Filter::Util::Call::filter_read() ) |
143
|
|
|
|
|
|
|
{ |
144
|
281
|
50
|
|
|
|
398
|
return( $status ) if( $status < 0 ); |
145
|
281
|
|
|
|
|
250
|
$line++; |
146
|
281
|
50
|
|
|
|
430
|
if( /^__(?:DATA|END)__/ ) |
147
|
|
|
|
|
|
|
{ |
148
|
0
|
|
|
|
|
0
|
last; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
s{ |
152
|
|
|
|
|
|
|
(?<!\\)\$\( |
153
|
|
|
|
|
|
|
} |
154
|
15
|
|
|
|
|
76
|
{ |
155
|
281
|
|
|
|
|
892
|
"xq(" |
156
|
|
|
|
|
|
|
}gexs; |
157
|
2
|
50
|
|
|
|
17
|
} |
158
|
|
|
|
|
|
|
if( $self->{debug_file} ) |
159
|
0
|
0
|
|
|
|
0
|
{ |
160
|
|
|
|
|
|
|
if( open( my $fh, ">$self->{debug_file}" ) ) |
161
|
0
|
|
|
|
|
0
|
{ |
162
|
0
|
|
|
|
|
0
|
binmode( $fh, ':utf8' ); |
163
|
0
|
|
|
|
|
0
|
print( $fh $_ ); |
164
|
|
|
|
|
|
|
close( $fh ); |
165
|
|
|
|
|
|
|
} |
166
|
2
|
|
|
|
|
65
|
} |
167
|
|
|
|
|
|
|
return( $line ); |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub init |
171
|
72
|
|
|
72
|
1
|
28629
|
{ |
172
|
72
|
|
|
|
|
491
|
my $self = shift( @_ ); |
173
|
72
|
50
|
|
|
|
534
|
$self->{_init_strict_use_sub} = 1; |
174
|
72
|
50
|
|
|
|
548
|
$self->{_exception_class} = 'HTML::Object::Exception' unless( CORE::exists( $self->{_exception_class} ) ); |
175
|
72
|
|
|
|
|
9755
|
$self->SUPER::init( @_ ) || return( $self->pass_error ); |
176
|
|
|
|
|
|
|
my $p = HTML::Parser->new( |
177
|
|
|
|
|
|
|
api_version => 3, |
178
|
|
|
|
|
|
|
start_h => [ $self->curry::add_start, 'self, tagname, attr, attrseq, text, column, line, offset, offset_end'], |
179
|
|
|
|
|
|
|
end_h => [ $self->curry::add_end, 'self, tagname, attr, attrseq, text, column, line, offset, offset_end' ], |
180
|
|
|
|
|
|
|
marked_sections => 1, |
181
|
|
|
|
|
|
|
comment_h => [ $self->curry::add_comment, 'self, text, column, line, offset, offset_end'], |
182
|
|
|
|
|
|
|
declaration_h => [ $self->curry::add_declaration, 'self, text, column, line, offset, offset_end'], |
183
|
|
|
|
|
|
|
default_h => [ $self->curry::add_default, 'self, tagname, attr, attrseq, text, column, line, offset, offset_end'], |
184
|
|
|
|
|
|
|
text_h => [ $self->curry::add_text, 'self, text, column, line, offset, offset_end'], |
185
|
|
|
|
|
|
|
# This is not activated, because as per the documentation, this will call an 'end tag' caller, and this could imply <br></br> for other unknown tags, whereas with <br /> we know for sure this is an empty tag |
186
|
|
|
|
|
|
|
# empty_element_tags => 1, |
187
|
|
|
|
|
|
|
unbroken_text => 1, |
188
|
72
|
|
|
|
|
18390
|
); |
189
|
72
|
|
|
|
|
477
|
$self->{document} = ''; |
190
|
72
|
|
|
|
|
303
|
$self->{current_parent} = ''; |
191
|
72
|
|
|
|
|
300
|
$self->{_parser} = $p; |
192
|
72
|
|
|
|
|
581
|
$self->{_elems} = []; |
193
|
|
|
|
|
|
|
return( $self ); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub add_comment |
197
|
7
|
|
|
7
|
1
|
135
|
{ |
198
|
7
|
|
|
|
|
34
|
my $self = shift( @_ ); |
199
|
7
|
|
|
|
|
27
|
my @args = @_; |
200
|
7
|
|
|
|
|
121
|
my $opts = {}; |
201
|
7
|
|
|
|
|
60
|
my @p = qw( p raw col line offset offset_end ); |
202
|
7
|
|
|
|
|
50
|
@$opts{ @p } = @args; |
203
|
7
|
|
|
|
|
195
|
my $parent = $self->current_parent; |
204
|
7
|
|
|
|
|
103
|
my $val = $opts->{raw}; |
205
|
|
|
|
|
|
|
$val =~ s,^\<\!\-\-|\-\-\>$,,gs; |
206
|
|
|
|
|
|
|
my $e = $self->new_comment({ |
207
|
|
|
|
|
|
|
column => $opts->{col}, |
208
|
|
|
|
|
|
|
line => $opts->{line}, |
209
|
|
|
|
|
|
|
offset => $opts->{offset}, |
210
|
7
|
|
50
|
|
|
76
|
original => $opts->{raw}, |
211
|
|
|
|
|
|
|
parent => $parent, |
212
|
|
|
|
|
|
|
value => $val, |
213
|
|
|
|
|
|
|
debug => $self->debug, |
214
|
7
|
|
|
|
|
67
|
}) || return; |
215
|
7
|
|
|
|
|
928
|
$parent->children->push( $e ); |
216
|
|
|
|
|
|
|
return( $e ); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
sub add_declaration |
220
|
19
|
|
|
19
|
1
|
242
|
{ |
221
|
19
|
|
|
|
|
88
|
my $self = shift( @_ ); |
222
|
19
|
|
|
|
|
52
|
my @args = @_; |
223
|
19
|
|
|
|
|
195
|
my $opts = {}; |
224
|
19
|
|
|
|
|
111
|
my @p = qw( p raw col line offset offset_end ); |
225
|
19
|
|
|
|
|
145
|
@$opts{ @p } = @args; |
226
|
19
|
100
|
|
|
|
487
|
my $parent = $self->current_parent; |
227
|
|
|
|
|
|
|
return if( !$self->_is_a( $parent => 'HTML::Object::DOM::Document' ) ); |
228
|
|
|
|
|
|
|
my $e = $self->new_declaration({ |
229
|
|
|
|
|
|
|
column => $opts->{col}, |
230
|
|
|
|
|
|
|
line => $opts->{line}, |
231
|
|
|
|
|
|
|
offset => $opts->{offset}, |
232
|
18
|
|
|
|
|
1017
|
original => $opts->{raw}, |
233
|
|
|
|
|
|
|
parent => $parent, |
234
|
|
|
|
|
|
|
debug => $self->debug, |
235
|
|
|
|
|
|
|
}); |
236
|
18
|
|
|
|
|
189
|
# $parent->children->push( $e ); |
237
|
18
|
|
|
|
|
1115
|
$self->document->declaration( $e ); |
238
|
18
|
|
|
|
|
2191
|
$parent->children->push( $e ); |
239
|
|
|
|
|
|
|
return( $e ); |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
sub add_default |
243
|
98
|
|
|
98
|
1
|
1727
|
{ |
244
|
98
|
|
|
|
|
434
|
my $self = shift( @_ ); |
245
|
98
|
|
|
|
|
295
|
my @args = @_; |
246
|
98
|
|
|
|
|
967
|
my $opts = {}; |
247
|
98
|
|
|
|
|
1005
|
my @p = qw( p tag attr seq raw col line offset offset_end ); |
248
|
98
|
50
|
33
|
|
|
1936
|
@$opts{ @p } = @args; |
249
|
|
|
|
|
|
|
return if( !CORE::length( $opts->{raw} ) && !defined( $opts->{tag} ) ); |
250
|
0
|
|
|
|
|
0
|
# Unknown tag, so we check if there is a "/>" to determine if this is an empty (void) tag or not |
251
|
0
|
|
|
|
|
0
|
my $attr = $opts->{attr}; |
252
|
0
|
0
|
|
|
|
0
|
my $def = {}; |
253
|
0
|
|
|
|
|
0
|
$def->{is_empty} = exists( $attr->{'/'} ) ? 1 : 0; |
254
|
0
|
0
|
|
|
|
0
|
my $parent = $self->current_parent; |
255
|
|
|
|
|
|
|
if( !length( $opts->{tag} ) ) |
256
|
0
|
|
|
|
|
0
|
{ |
257
|
|
|
|
|
|
|
return( $self->add_text( @args ) ); |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
# Check the current parent and see if we need to close it. |
260
|
|
|
|
|
|
|
# If this new tag is a non-empty tag (i.e. non-void) and the current parent has not been closed, |
261
|
|
|
|
|
|
|
# implicitly close it now, by setting that tag's parent as the current parent |
262
|
|
|
|
|
|
|
# This is what Mozilla does: |
263
|
|
|
|
|
|
|
# Ref: <https://bugzilla.mozilla.org/show_bug.cgi?id=820926> |
264
|
|
|
|
|
|
|
# NOTE This needs to be done in post processing not during initial parsing, because at this point in the process we have not yet seen the closing tag, and we might see it later, so making guesses here is ill-advised. |
265
|
|
|
|
|
|
|
# if( !$parent->is_closed && |
266
|
|
|
|
|
|
|
# !$def->{is_empty} && |
267
|
|
|
|
|
|
|
# $parent && |
268
|
|
|
|
|
|
|
# !$parent->isa( 'HTML::Object::Document' ) && |
269
|
|
|
|
|
|
|
# $parent->tag ne 'html' ) |
270
|
|
|
|
|
|
|
# { |
271
|
|
|
|
|
|
|
# $parent = $parent->parent; |
272
|
|
|
|
|
|
|
# } |
273
|
|
|
|
|
|
|
my $e = $self->new_element({ |
274
|
|
|
|
|
|
|
attributes => $opts->{attr}, |
275
|
|
|
|
|
|
|
attributes_sequence => $opts->{seq}, |
276
|
|
|
|
|
|
|
column => $opts->{col}, |
277
|
|
|
|
|
|
|
is_empty => $def->{is_empty}, |
278
|
|
|
|
|
|
|
line => $opts->{line}, |
279
|
|
|
|
|
|
|
offset => $opts->{offset}, |
280
|
|
|
|
|
|
|
original => $opts->{raw}, |
281
|
|
|
|
|
|
|
parent => $parent, |
282
|
0
|
|
0
|
|
|
0
|
tag => $opts->{tag}, |
283
|
|
|
|
|
|
|
debug => $self->debug, |
284
|
0
|
|
|
|
|
0
|
}) || return; |
285
|
0
|
0
|
|
|
|
0
|
$parent->children->push( $e ); |
286
|
|
|
|
|
|
|
if( !$def->{is_empty} ) |
287
|
0
|
|
|
|
|
0
|
{ |
288
|
|
|
|
|
|
|
$self->current_parent( $e ); |
289
|
0
|
|
|
|
|
0
|
} |
290
|
|
|
|
|
|
|
return( $e ); |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
sub add_end |
294
|
217
|
|
|
217
|
1
|
3371
|
{ |
295
|
217
|
|
|
|
|
991
|
my $self = shift( @_ ); |
296
|
217
|
|
|
|
|
698
|
my @args = @_; |
297
|
217
|
|
|
|
|
2149
|
my $opts = {}; |
298
|
217
|
|
|
|
|
2220
|
my @p = qw( p tag attr seq raw col line offset offset_end ); |
299
|
217
|
|
|
|
|
1381
|
@$opts{ @p } = @args; |
300
|
217
|
|
|
|
|
6265
|
my $me = $self->current_parent; |
301
|
217
|
50
|
|
|
|
5225
|
my $parent = $me->parent; |
302
|
|
|
|
|
|
|
if( $opts->{tag} ne $me->tag ) |
303
|
0
|
0
|
|
|
|
0
|
{ |
304
|
|
|
|
|
|
|
warn( "Oops, something is wrong in the parsing. I was expecting a closing tag for \"", $me->tag, "\" that started at line \"", $me->line, "\" but instead found a closing tag for \"$opts->{tag}\" at line \"$opts->{line}\" and column \"$opts->{col}\": $opts->{raw}\n" ) if( $self->_warnings_is_enabled ); |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
else |
307
|
|
|
|
|
|
|
{ |
308
|
|
|
|
|
|
|
my $e = $self->new_closing({ |
309
|
|
|
|
|
|
|
attributes => $opts->{attr}, |
310
|
|
|
|
|
|
|
attributes_sequence => $opts->{seq}, |
311
|
|
|
|
|
|
|
column => $opts->{col}, |
312
|
|
|
|
|
|
|
line => $opts->{line}, |
313
|
|
|
|
|
|
|
offset => $opts->{offset}, |
314
|
|
|
|
|
|
|
original => $opts->{raw}, |
315
|
217
|
|
50
|
|
|
211794
|
tag => $opts->{tag}, |
316
|
|
|
|
|
|
|
debug => $self->debug, |
317
|
217
|
|
|
|
|
3892
|
}) || return; |
318
|
217
|
|
|
|
|
243499
|
$me->is_closed(1); |
319
|
|
|
|
|
|
|
$me->close_tag( $e ); |
320
|
217
|
|
|
|
|
13997
|
# $parent->children->push( $e ); |
321
|
|
|
|
|
|
|
$self->current_parent( $parent ); |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
} |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
sub add_space |
326
|
369
|
|
|
369
|
1
|
10460
|
{ |
327
|
369
|
|
|
|
|
2339
|
my $self = shift( @_ ); |
328
|
369
|
|
|
|
|
67084
|
my $opts = $self->_get_args_as_hash( @_ ); |
329
|
369
|
|
50
|
|
|
9960
|
my $parent = $self->current_parent; |
330
|
369
|
|
|
|
|
2792
|
my $e = $self->new_space( $opts ) || return; |
331
|
369
|
|
|
|
|
48564
|
$parent->children->push( $e ); |
332
|
|
|
|
|
|
|
return( $e ); |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
sub add_start |
336
|
321
|
|
|
321
|
1
|
4318
|
{ |
337
|
321
|
|
|
|
|
1377
|
my $self = shift( @_ ); |
338
|
321
|
|
|
|
|
1035
|
my @args = @_; |
339
|
321
|
|
|
|
|
2594
|
my $opts = {}; |
340
|
321
|
|
|
|
|
3143
|
my @p = qw( p tag attr seq raw col line offset offset_end ); |
341
|
321
|
|
|
|
|
1944
|
@$opts{ @p } = @args; |
342
|
321
|
100
|
|
|
|
10047
|
my $parent = $self->current_parent; |
343
|
|
|
|
|
|
|
if( $opts->{tag} =~ s,/,, ) |
344
|
1
|
|
|
|
|
5
|
{ |
345
|
|
|
|
|
|
|
$opts->{attr}->{'/'} = '/'; |
346
|
321
|
|
|
|
|
2007
|
} |
347
|
|
|
|
|
|
|
my $def = $self->get_definition( $opts->{tag} ); |
348
|
321
|
50
|
|
|
|
1831
|
# Make some easy guess |
349
|
|
|
|
|
|
|
if( !scalar( keys( %$def ) ) ) |
350
|
0
|
0
|
|
|
|
0
|
{ |
351
|
|
|
|
|
|
|
$def->{is_empty} = 1 if( CORE::exists( $opts->{attr}->{'/'} ) ); |
352
|
|
|
|
|
|
|
# "Return HTMLUnknownElement" |
353
|
0
|
|
|
|
|
0
|
# <https://html.spec.whatwg.org/multipage/dom.html#htmlunknownelement> |
354
|
|
|
|
|
|
|
$def->{class} = 'HTML::Object::DOM::Unknown'; |
355
|
321
|
50
|
|
|
|
1643
|
} |
356
|
|
|
|
|
|
|
$def->{is_empty} = 0 unless( CORE::exists( $def->{is_empty} ) ); |
357
|
|
|
|
|
|
|
# Check the current parent and see if we need to close it. |
358
|
|
|
|
|
|
|
# If this new tag is a non-empty tag (i.e. non-void) and the current parent has not been closed, |
359
|
|
|
|
|
|
|
# implicitly close it now, by setting that tag's parent as the current parent |
360
|
|
|
|
|
|
|
# This is what Mozilla does: |
361
|
|
|
|
|
|
|
# Ref: <https://bugzilla.mozilla.org/show_bug.cgi?id=820926> |
362
|
|
|
|
|
|
|
# NOTE This needs to be done in post processing not during initial parsing, because at this point in the process we have not yet seen the closing tag, and we might see it later, so making guesses here is ill-advised. |
363
|
|
|
|
|
|
|
# if( !$parent->is_closed && |
364
|
|
|
|
|
|
|
# !$def->{is_empty} && |
365
|
|
|
|
|
|
|
# $parent && |
366
|
|
|
|
|
|
|
# !$parent->isa( 'HTML::Object::Document' ) && |
367
|
|
|
|
|
|
|
# $parent->tag ne 'html' ) |
368
|
|
|
|
|
|
|
# { |
369
|
|
|
|
|
|
|
# $parent = $parent->parent; |
370
|
321
|
|
100
|
|
|
1616
|
# } |
371
|
321
|
|
|
|
|
727
|
$def->{class} //= ''; |
372
|
|
|
|
|
|
|
my $e; |
373
|
|
|
|
|
|
|
my $params = |
374
|
|
|
|
|
|
|
{ |
375
|
|
|
|
|
|
|
attributes => $opts->{attr}, |
376
|
|
|
|
|
|
|
attributes_sequence => $opts->{seq}, |
377
|
|
|
|
|
|
|
column => $opts->{col}, |
378
|
|
|
|
|
|
|
is_empty => $def->{is_empty}, |
379
|
|
|
|
|
|
|
line => $opts->{line}, |
380
|
|
|
|
|
|
|
offset => $opts->{offset}, |
381
|
|
|
|
|
|
|
original => $opts->{raw}, |
382
|
|
|
|
|
|
|
parent => $parent, |
383
|
|
|
|
|
|
|
tag => $opts->{tag}, |
384
|
321
|
|
|
|
|
3076
|
# and |
385
|
|
|
|
|
|
|
debug => $self->debug, |
386
|
|
|
|
|
|
|
}; |
387
|
|
|
|
|
|
|
|
388
|
321
|
100
|
|
|
|
11448
|
# If this tag is handled by a special class, instantiate the object by this class |
389
|
|
|
|
|
|
|
if( $def->{class} ) |
390
|
278
|
|
50
|
|
|
1913
|
{ |
391
|
|
|
|
|
|
|
$e = $self->new_special( $def->{class} => $params ) || return; |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
else |
394
|
43
|
|
50
|
|
|
207
|
{ |
395
|
|
|
|
|
|
|
$e = $self->new_element( $params ) || return; |
396
|
321
|
|
|
|
|
1972
|
} |
397
|
|
|
|
|
|
|
$parent->children->push( $e ); |
398
|
321
|
100
|
|
|
|
31930
|
# If this element is an element that, by nature, can contain other elements we mark it as the last element seen so it can be used as a parent. When we close it, we switch the parent to its parent . |
399
|
|
|
|
|
|
|
if( !$def->{is_empty} ) |
400
|
221
|
|
|
|
|
3553
|
{ |
401
|
|
|
|
|
|
|
$self->current_parent( $e ); |
402
|
321
|
|
|
|
|
21052
|
} |
403
|
|
|
|
|
|
|
return( $e ); |
404
|
|
|
|
|
|
|
} |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
sub add_text |
407
|
484
|
|
|
484
|
1
|
16973
|
{ |
408
|
484
|
|
|
|
|
2028
|
my $self = shift( @_ ); |
409
|
484
|
|
|
|
|
1558
|
my @args = @_; |
410
|
484
|
|
|
|
|
3083
|
my $opts = {}; |
411
|
484
|
|
|
|
|
3379
|
my @p = qw( p raw col line offset offset_end ); |
412
|
484
|
|
50
|
|
|
2377
|
@$opts{ @p } = @args; |
413
|
|
|
|
|
|
|
my $parent = $self->current_parent || |
414
|
484
|
|
|
|
|
13010
|
return( $self->error( "You must create a document first using the new_document() method first before adding text." ) ); |
415
|
|
|
|
|
|
|
my $e; |
416
|
|
|
|
|
|
|
# Text can be either some space or letters, digits (non-space characters) |
417
|
484
|
100
|
|
|
|
4910
|
# HTML::Parser does not make the difference, but we do |
418
|
|
|
|
|
|
|
if( $opts->{raw} =~ /^[[:blank:]\h\v]*$/ ) |
419
|
|
|
|
|
|
|
{ |
420
|
|
|
|
|
|
|
$e = $self->add_space( |
421
|
|
|
|
|
|
|
original => $opts->{raw}, |
422
|
|
|
|
|
|
|
column => $opts->{col}, |
423
|
|
|
|
|
|
|
line => $opts->{line}, |
424
|
|
|
|
|
|
|
offset => $opts->{offset}, |
425
|
|
|
|
|
|
|
parent => $parent, |
426
|
369
|
|
50
|
|
|
3588
|
value => $opts->{raw}, |
427
|
|
|
|
|
|
|
debug => $self->debug, |
428
|
|
|
|
|
|
|
# No 'value' set on purpose, because if none, then 'original' will be used by |
429
|
|
|
|
|
|
|
# as_string |
430
|
|
|
|
|
|
|
) || return; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
else |
433
|
|
|
|
|
|
|
{ |
434
|
|
|
|
|
|
|
$e = $self->new_text({ |
435
|
|
|
|
|
|
|
column => $opts->{col}, |
436
|
|
|
|
|
|
|
line => $opts->{line}, |
437
|
|
|
|
|
|
|
offset => $opts->{offset}, |
438
|
|
|
|
|
|
|
original => $opts->{raw}, |
439
|
|
|
|
|
|
|
parent => $parent, |
440
|
115
|
|
50
|
|
|
1081
|
value => $opts->{raw}, |
441
|
|
|
|
|
|
|
debug => $self->debug, |
442
|
115
|
|
|
|
|
1510
|
}) || return; |
443
|
|
|
|
|
|
|
$parent->children->push( $e ); |
444
|
484
|
|
|
|
|
33870
|
} |
445
|
|
|
|
|
|
|
return( $e ); |
446
|
|
|
|
|
|
|
} |
447
|
179
|
|
|
179
|
1
|
666
|
|
448
|
|
|
|
|
|
|
sub current_parent { return( shift->_set_get_object_without_init( 'current_parent', 'HTML::Object::Element', @_ ) ); } |
449
|
0
|
|
|
0
|
1
|
0
|
|
450
|
|
|
|
|
|
|
sub dictionary { return( $DICT ); } |
451
|
5
|
|
|
5
|
1
|
62
|
|
452
|
|
|
|
|
|
|
sub document { return( shift->_set_get_object( 'document', 'HTML::Object::Document', @_ ) ); } |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
sub get_definition |
455
|
340
|
|
|
340
|
1
|
1073
|
{ |
456
|
340
|
|
|
|
|
994
|
my $self = shift( @_ ); |
457
|
340
|
50
|
|
|
|
1597
|
my $tag = shift( @_ ); |
458
|
|
|
|
|
|
|
return( $self->error( "No tag was provided to get its definition." ) ) if( !length( $tag ) ); |
459
|
340
|
|
|
|
|
915
|
# Just to be sure |
460
|
340
|
50
|
|
|
|
2199
|
$tag = lc( $tag ); |
461
|
340
|
|
|
|
|
1806
|
return( {} ) if( !exists( $DICT->{ $tag } ) ); |
462
|
|
|
|
|
|
|
return( $DICT->{ $tag } ); |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
sub new_closing |
466
|
19
|
|
|
19
|
1
|
644
|
{ |
467
|
19
|
|
50
|
|
|
140
|
my $self = shift( @_ ); |
468
|
|
|
|
|
|
|
my $e = HTML::Object::Closing->new( @_ ) || |
469
|
19
|
|
|
|
|
293
|
return( $self->pass_error( HTML::Object::Closing->error ) ); |
470
|
|
|
|
|
|
|
return( $e ); |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
sub new_comment |
474
|
1
|
|
|
1
|
1
|
32
|
{ |
475
|
1
|
|
50
|
|
|
22
|
my $self = shift( @_ ); |
476
|
|
|
|
|
|
|
my $e = HTML::Object::Comment->new( @_ ) || |
477
|
1
|
|
|
|
|
14
|
return( $self->pass_error( HTML::Object::Comment->error ) ); |
478
|
|
|
|
|
|
|
return( $e ); |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
sub new_declaration |
482
|
0
|
|
|
0
|
1
|
0
|
{ |
483
|
0
|
|
0
|
|
|
0
|
my $self = shift( @_ ); |
484
|
|
|
|
|
|
|
my $e = HTML::Object::Declaration->new( @_ ) || |
485
|
0
|
|
|
|
|
0
|
return( $self->pass_error( HTML::Object::Declaration->error ) ); |
486
|
|
|
|
|
|
|
return( $e ); |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub new_document |
490
|
3
|
|
|
3
|
1
|
4096888
|
{ |
491
|
3
|
|
50
|
|
|
61
|
my $self = shift( @_ ); |
492
|
|
|
|
|
|
|
my $e = HTML::Object::Document->new( @_ ) || |
493
|
3
|
|
|
|
|
37
|
return( $self->pass_error( HTML::Object::Document->error ) ); |
494
|
|
|
|
|
|
|
return( $e ); |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
sub new_element |
498
|
35
|
|
|
35
|
1
|
82
|
{ |
499
|
35
|
|
50
|
|
|
198
|
my $self = shift( @_ ); |
500
|
|
|
|
|
|
|
my $e = HTML::Object::Element->new( @_ ) || |
501
|
35
|
|
|
|
|
593
|
return( $self->pass_error( HTML::Object::Element->error ) ); |
502
|
|
|
|
|
|
|
return( $e ); |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
sub new_space |
506
|
37
|
|
|
37
|
1
|
87
|
{ |
507
|
37
|
|
50
|
|
|
273
|
my $self = shift( @_ ); |
508
|
|
|
|
|
|
|
my $e = HTML::Object::Space->new( @_ ) || |
509
|
37
|
|
|
|
|
521
|
return( $self->pass_error( HTML::Object::Space->error ) ); |
510
|
|
|
|
|
|
|
return( $e ); |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
sub new_special |
514
|
292
|
|
|
292
|
1
|
829
|
{ |
515
|
292
|
|
|
|
|
891
|
my $self = shift( @_ ); |
516
|
292
|
50
|
|
|
|
1953
|
my $class = shift( @_ ); |
517
|
292
|
|
50
|
|
|
64475
|
$self->_load_class( $class ) || return( $self->pass_error ); |
518
|
292
|
|
|
|
|
4956
|
my $e = $class->new( @_ ) || return( $self->pass_error( $class->error ) ); |
519
|
|
|
|
|
|
|
return( $e ); |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
sub new_text |
523
|
8
|
|
|
8
|
1
|
243
|
{ |
524
|
8
|
|
50
|
|
|
68
|
my $self = shift( @_ ); |
525
|
|
|
|
|
|
|
my $e = HTML::Object::Text->new( @_ ) || |
526
|
8
|
|
|
|
|
109
|
return( $self->pass_error( HTML::Object::Text->error ) ); |
527
|
|
|
|
|
|
|
return( $e ); |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
sub parse |
531
|
12
|
|
|
12
|
1
|
3724
|
{ |
532
|
12
|
|
|
|
|
40
|
my $self = shift( @_ ); |
533
|
12
|
|
|
|
|
84
|
my $this = shift( @_ ); |
534
|
12
|
100
|
33
|
|
|
382
|
my $opts = $self->_get_args_as_hash( @_ ); |
|
|
50
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
535
|
|
|
|
|
|
|
if( ref( $this ) eq 'CODE' || ref( $this ) eq 'GLOB' || "$this" =~ /<\w+/ || CORE::length( "$this" ) > 1024 ) |
536
|
11
|
|
|
|
|
82
|
{ |
537
|
|
|
|
|
|
|
return( $self->parse_data( $this, $opts ) ); |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
elsif( ref( $this ) ) |
540
|
0
|
|
|
|
|
0
|
{ |
541
|
|
|
|
|
|
|
return( $self->error( "I was provided a reference (", overload::StrVal( $this ), ") to parse html data, but I do not know what to do with it." ) ); |
542
|
|
|
|
|
|
|
} |
543
|
|
|
|
|
|
|
else |
544
|
1
|
|
|
|
|
11
|
{ |
545
|
|
|
|
|
|
|
return( $self->parse_file( $this, $opts ) ); |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
sub parse_data |
550
|
44
|
|
|
44
|
1
|
472
|
{ |
551
|
44
|
|
|
|
|
152
|
my $self = shift( @_ ); |
552
|
44
|
|
|
|
|
230
|
my $html = shift( @_ ); |
553
|
44
|
50
|
33
|
|
|
1959
|
my $opts = $self->_get_args_as_hash( @_ ); |
|
44
|
|
|
|
|
89
|
|
|
44
|
|
|
|
|
104
|
|
|
44
|
|
|
|
|
290
|
|
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
109
|
|
|
44
|
|
|
|
|
226
|
|
|
44
|
|
|
|
|
125
|
|
554
|
44
|
|
|
44
|
|
600
|
try |
555
|
44
|
50
|
|
|
|
289
|
{ |
556
|
|
|
|
|
|
|
if( $opts->{utf8} ) |
557
|
0
|
|
|
|
|
0
|
{ |
558
|
|
|
|
|
|
|
$html = Encode::decode( 'utf8', $html, Encode::FB_CROAK ); |
559
|
|
|
|
|
|
|
} |
560
|
44
|
0
|
50
|
|
|
348
|
} |
|
44
|
0
|
33
|
|
|
205
|
|
|
44
|
0
|
|
|
|
165
|
|
|
44
|
0
|
|
|
|
252
|
|
|
44
|
0
|
|
|
|
244
|
|
|
44
|
0
|
|
|
|
493
|
|
|
44
|
0
|
|
|
|
114
|
|
|
44
|
0
|
|
|
|
266
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
278
|
|
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
224
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
207
|
|
|
44
|
|
|
|
|
476
|
|
|
44
|
|
|
|
|
136
|
|
|
44
|
|
|
|
|
170
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
561
|
0
|
|
|
0
|
|
0
|
catch( $e ) |
562
|
0
|
|
|
|
|
0
|
{ |
563
|
29
|
0
|
0
|
29
|
|
268
|
return( $self->error( "Error found while utf8 decoding ", length( $html ), " bytes of html data provided." ) ); |
|
29
|
0
|
0
|
|
|
67
|
|
|
29
|
0
|
33
|
|
|
47051
|
|
|
0
|
0
|
33
|
|
|
0
|
|
|
0
|
0
|
33
|
|
|
0
|
|
|
0
|
0
|
0
|
|
|
0
|
|
|
0
|
0
|
0
|
|
|
0
|
|
|
0
|
0
|
0
|
|
|
0
|
|
|
0
|
0
|
0
|
|
|
0
|
|
|
0
|
0
|
0
|
|
|
0
|
|
|
0
|
0
|
0
|
|
|
0
|
|
|
0
|
0
|
33
|
|
|
0
|
|
|
0
|
0
|
33
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
44
|
0
|
|
|
|
163
|
|
|
0
|
0
|
|
|
|
0
|
|
|
44
|
0
|
|
|
|
2225
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
0
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
50
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
261
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
44
|
|
|
|
|
183
|
|
564
|
|
|
|
|
|
|
} |
565
|
44
|
|
|
|
|
107
|
|
566
|
44
|
50
|
33
|
|
|
277
|
my $e; |
567
|
|
|
|
|
|
|
if( length( $self->{current_parent} ) && $self->_is_object( $self->{current_parent} ) ) |
568
|
0
|
|
|
|
|
0
|
{ |
569
|
|
|
|
|
|
|
$e = $self->current_parent; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
else |
572
|
44
|
|
|
|
|
219
|
{ |
573
|
44
|
|
|
|
|
345
|
$e = $self->new_document( debug => $self->debug ); |
574
|
44
|
|
|
|
|
2494
|
$self->document( $e ); |
575
|
44
|
100
|
|
|
|
1707
|
$self->current_parent( $e ); |
576
|
|
|
|
|
|
|
if( $self->isa( 'HTML::Object::DOM' ) ) |
577
|
42
|
100
|
|
|
|
229
|
{ |
578
|
|
|
|
|
|
|
if( my $code = $self->onload ) |
579
|
2
|
|
|
|
|
1706
|
{ |
580
|
|
|
|
|
|
|
$e->onload( $code ); |
581
|
42
|
100
|
|
|
|
37448
|
} |
582
|
|
|
|
|
|
|
if( my $code = $self->onreadystatechange ) |
583
|
1
|
|
|
|
|
829
|
{ |
584
|
|
|
|
|
|
|
$e->onreadystatechange( $code ); |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
} |
587
|
44
|
|
|
|
|
35403
|
} |
588
|
44
|
|
|
|
|
1407
|
my $doc = $self->document; |
589
|
44
|
|
|
|
|
1054
|
my $p = $self->parser; |
590
|
44
|
|
|
|
|
763
|
$self->_set_state( 'loading' => $doc ); |
591
|
44
|
|
|
|
|
2073
|
$p->parse( $html ); |
592
|
44
|
|
|
|
|
402
|
$self->_set_state( 'interactive' => $doc ); |
593
|
44
|
|
|
|
|
297
|
$self->post_process( $e ); |
594
|
44
|
|
|
|
|
895
|
$self->_set_state( 'complete' => $doc ); |
595
|
44
|
|
|
|
|
509
|
$p->eof; |
596
|
|
|
|
|
|
|
return( $e ); |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# parse_file( $file, %options | \%options )
#
# Opens the file at the given path, feeds it to the HTML parser and returns the
# document object created for it.
#
# Options read here:
#   utf8    When true, the file is opened with the binmode parameter set to 'utf8'.
#
# When no file is provided, or when the file does not exist, is empty or cannot
# be opened, an error is set with $self->error and its return value is returned.
sub parse_file
{
    my $self = shift( @_ );
    my $file = shift( @_ ) || return( $self->error( "No file to parse was provided." ) );
    my $opts = $self->_get_args_as_hash( @_ );
    my $f = $self->new_file( $file );
    # Refuse early anything that cannot yield data to parse
    return( $self->error( "File to parse \"$file\" does not exist." ) ) if( !$f->exists );
    return( $self->error( "File to parse \"$file\" is empty." ) ) if( $f->is_empty );
    my $params = $opts->{utf8} ? { binmode => 'utf8' } : {};
    my $io = $f->open( '<', $params ) ||
        return( $self->error( "Unable to open file to parse \"$file\": ", $f->error ) );
    my $doc = $self->new_document( _last_modified => $f->mtime );
    $self->document( $doc );
    if( $self->isa( 'HTML::Object::DOM' ) )
    {
        # Hand over to the new document the event handlers set on this object, if any
        foreach my $handler ( qw( onload onreadystatechange ) )
        {
            my $code = $self->$handler;
            $doc->$handler( $code ) if( $code );
        }
    }
    $self->current_parent( $doc );
    # The ready state goes through 'loading', 'interactive' and 'complete'
    $self->_set_state( 'loading' => $doc );
    my $p = $self->parser;
    $p->parse_file( $io );
    $io->close;
    $self->_set_state( 'interactive' => $doc );
    $self->post_process( $doc );
    $self->_set_state( 'complete' => $doc );
    return( $doc );
}
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# parse_url( $uri, %options | \%options )
# parse_url( uri => $uri, %options )
#
# Issues an HTTP GET request for the given URI using LWP::UserAgent, parses the
# decoded content with parse_data() and returns the resulting document, after
# having set its uri.
#
# Options read here:
#   uri       The URI to fetch; when set and non-empty, it takes precedence over
#             a URI provided as first argument.
#   timeout   Passed to LWP::UserAgent; defaults to 10.
#   headers   Hash reference of request headers passed to LWP::UserAgent->get.
#
# The HTTP::Response object is stored with response() as soon as it is received,
# so it remains available to the caller even when the request failed.
#
# Upon error (no URI, required modules unavailable, bad URI, failed request,
# failed parsing), an error is set and undef (or an empty list) is returned.
sub parse_url
{
    my $self = shift( @_ );
    my $uri;
    # The URI is the first argument when there is a single non-hash argument,
    # an odd number of arguments, or exactly a URI followed by a hash reference.
    if( ( scalar( @_ ) == 1 && ref( $_[0] ) ne 'HASH' ) ||
        ( scalar( @_ ) > 1 &&
          (
            ( @_ % 2 ) ||
            ( scalar( @_ ) == 2 && ref( $_[1] ) eq 'HASH' )
          )
        ) )
    {
        $uri = shift( @_ );
    }
    my $opts = $self->_get_args_as_hash( @_ );
    $uri = CORE::delete( $opts->{uri} ) if( defined( $opts->{uri} ) && CORE::length( $opts->{uri} ) );
    # Without this check, an undefined value would be stringified and handed to URI->new
    if( !defined( $uri ) || !CORE::length( "$uri" ) )
    {
        return( $self->error( "No URI was provided to parse." ) );
    }
    if( !$self->_load_class( 'LWP::UserAgent', { version => '6.49' } ) )
    {
        return( $self->error( "LWP::UserAgent version 6.49 or higher is required to use load()" ) );
    }
    if( !$self->_load_class( 'URI', { version => '1.74' } ) )
    {
        return( $self->error( "URI version 1.74 or higher is required to use load()" ) );
    }
    $opts->{timeout} //= 10;
    try
    {
        $uri = URI->new( "$uri" );
    }
    catch( $e )
    {
        return( $self->error( "Bad url provided \"$uri\": $e" ) );
    }

    my $content;
    try
    {
        my $ua = LWP::UserAgent->new(
            agent => "HTML::Object/$VERSION",
            timeout => $opts->{timeout},
        );
        # Only a hash reference of headers is used; anything else is ignored
        my @headers = ( ref( $opts->{headers} ) eq 'HASH' ) ? %{$opts->{headers}} : ();
        my $resp = $ua->get( $uri, @headers );
        # Store the response before checking it, so that it can be inspected upon failure too
        $self->response( $resp );
        if( $resp->header( 'Client-Warning' ) || !$resp->is_success )
        {
            return( $self->error({
                code => $resp->code,
                message => $resp->message,
            }) );
        }
        $content = $resp->decoded_content;
    }
    catch( $e )
    {
        return( $self->error( "Error making a GET request to $uri: $e" ) );
    }
    my $doc = $self->parse_data( $content );
    # parse_data() returns the result of $self->error upon failure, so the error is
    # already set; do not call a method on an undefined value.
    return if( !defined( $doc ) );
    $doc->uri( $uri );
    return( $doc );
}
701
|
49
|
|
|
49
|
1
|
251
|
|
702
|
|
|
|
|
|
|
# Sets or gets the HTML::Parser object.
# Delegates to _set_get_object_without_init with the field '_parser'.
sub parser
{
    my $self = shift( @_ );
    return( $self->_set_get_object_without_init( '_parser', 'HTML::Parser', @_ ) );
}
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
# post_process( $element )
#
# Recursively walks through the children of the given HTML::Object::Element.
# Closing elements and elements whose tag starts with an underscore are skipped,
# and the walk descends into every element that is not empty (void).
#
# No per-element repair is performed at this time; use sanity_check() to get a
# report of void or unclosed tag issues.
#
# Returns the current object, or nothing if the argument is not an object or is
# not an HTML::Object::Element.
sub post_process
{
    my $self = shift( @_ );
    my $elem = shift( @_ );
    return if( !$self->_is_object( $elem ) );
    return if( !$elem->isa( 'HTML::Object::Element' ) );
    # Crawl through the tree
    $elem->children->foreach(sub
    {
        my $e = shift( @_ );
        return(1) if( $e->isa( 'HTML::Object::Closing' ) || $e->tag->substr( 0, 1 ) eq '_' );
        # The former if/elsif chain only had empty bodies: its checks were computed
        # and discarded for each element, so it has been removed.
        $self->post_process( $e ) if( !$e->is_empty );
        # Always return a true value explicitly, like the skip case above, rather than
        # leaking the value of the last statement, which is undefined when the recursive
        # call bails out.
        # NOTE(review): foreach is understood to stop iterating when the callback
        # returns undef, which would skip the remaining siblings - confirm.
        return(1);
    });
    return( $self );
}
735
|
0
|
|
|
0
|
1
|
0
|
|
736
|
|
|
|
|
|
|
# Sets or gets the HTTP::Response object.
# Delegates to _set_get_object_without_init with the field 'response'.
sub response
{
    my $self = shift( @_ );
    return( $self->_set_get_object_without_init( 'response', 'HTTP::Response', @_ ) );
}
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# sanity_check( $element )
#
# Recursively walks through the children of the given HTML::Object::Element and
# prints on STDOUT a message for each of the following issues:
#   - an empty (void) tag that has children
#   - an empty (void) tag that does not have a '/' attribute
#   - a non-empty tag that has a '/' attribute
#   - a non-empty tag that has not been closed
# Closing elements and elements whose tag starts with an underscore are skipped.
#
# Returns the current object, or nothing if the argument is not an object or is
# not an HTML::Object::Element.
sub sanity_check
{
    my $self = shift( @_ );
    my $elem = shift( @_ );
    return if( !$self->_is_object( $elem ) );
    return if( !$elem->isa( 'HTML::Object::Element' ) );
    # Crawl through the tree and look for unclosed tags
    $elem->children->foreach(sub
    {
        my $e = shift( @_ );
        return(1) if( $e->isa( 'HTML::Object::Closing' ) || $e->tag->substr( 0, 1 ) eq '_' );
        if( $e->is_empty && $e->children->length )
        {
            printf( STDOUT "Tag \"%s\" should be empty (void), but it has %d children.\n", $e->tag, $e->children->length );
        }
        elsif( $e->is_empty && !$e->attributes->exists( '/' ) )
        {
            printf( STDOUT "Tag \"%s\" at line %d at row %d is an empty (void) tag, but it did not end with />\n", $e->tag, $e->line, $e->column );
        }
        elsif( !$e->is_empty && $e->attributes->exists( '/' ) )
        {
            printf( STDOUT "Tag \"%s\" at line %d at row %d is marked as non-empty (non-void), but it ends with />\n", $e->tag, $e->line, $e->column );
        }
        elsif( !$e->is_empty && !$e->is_closed )
        {
            # The element itself is not flagged as empty, so check what the tag
            # definition says to pick the right message.
            my $def = $self->get_definition( $e->tag );
            if( !$def->{is_empty} )
            {
                printf( STDOUT "Tag \"%s\" at line %d at row %d is an enclosing tag, but it has not been closed.\n", $e->tag, $e->line, $e->column );
            }
            else
            {
                printf( STDOUT "Tag \"%s\" at line %d at row %d is an empty (void) tag, but it did not end with />\n", $e->tag, $e->line, $e->column );
            }
        }
        $self->sanity_check( $e ) if( !$e->is_empty );
        # Always return a true value explicitly, like the skip case above, rather than
        # leaking the value of the last statement, which is undefined when the recursive
        # call bails out.
        # NOTE(review): foreach is understood to stop iterating when the callback
        # returns undef, which would skip the remaining siblings - confirm.
        return(1);
    });
    return( $self );
}
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# set_dom( $document | $data )
#
# Sets the package variable $GLOBAL_DOM.
# An HTML::Object::Document object is used as is. Any other defined and
# non-empty value is handed to parse() on a newly instantiated object, and the
# value returned by parse() is stored.
# When nothing, or an empty string, is provided, $GLOBAL_DOM is left untouched.
#
# Always returns the invocant.
sub set_dom
{
    my $this = shift( @_ );
    my $html = shift( @_ );
    return( $this ) if( !defined( $html ) );
    my $is_document = ( Scalar::Util::blessed( $html ) && $html->isa( 'HTML::Object::Document' ) );
    if( $is_document )
    {
        $GLOBAL_DOM = $html;
    }
    elsif( CORE::length( $html ) )
    {
        $GLOBAL_DOM = $this->new->parse( $html );
    }
    return( $this );
}
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
# _set_state( $state, $document )
#
# Sets the ready state of the given document and calls its event handlers:
#   - the onreadystatechange handler, if any, on every call;
#   - the onload handler, if any, when the state is 'complete'.
# Each handler is called with an HTML::Object::Event object of type 'readystate'
# and with $_ set to the document.
#
# Returns the current object, including when it is not an HTML::Object::DOM, in
# which case nothing is done. Returns nothing when the document is undefined or
# is not an HTML::Object::DOM::Document.
sub _set_state
{
    my $self = shift( @_ );
    my( $state, $elem ) = @_;
    # This feature is only applicable for HTML::Object::DOM
    if( !$self->isa( 'HTML::Object::DOM' ) )
    {
        return( $self );
    }
    # ... and only for documents
    unless( defined( $elem ) && $self->_is_a( $elem => 'HTML::Object::DOM::Document' ) )
    {
        return;
    }
    $elem->readyState( $state );
    require HTML::Object::Event;
    # Kept as a list, so the parameters are passed in this very order
    my @event_params = (
        bubbles => 0,
        cancelable => 0,
        detail => { 'state' => $state, document => $elem },
        target => $elem,
    );
    my $event = HTML::Object::Event->new( 'readystate', @event_params );
    # $elem->dispatchEvent( $event );
    if( my $handler = $elem->onreadystatechange )
    {
        local $_ = $elem;
        my $callback = $handler->code;
        if( ref( $callback ) eq 'CODE' )
        {
            $callback->( $event );
        }
        else
        {
            warn( "Value for event handler '$callback' is not a code reference.\n" );
        }
    }
    if( $state eq 'complete' )
    {
        # The onload value is called directly as a code reference
        if( my $onload = $elem->onload )
        {
            local $_ = $elem;
            $onload->( $event );
        }
    }
    return( $self );
}
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
1; |
828
|
|
|
|
|
|
|
# NOTE: POD |
829
|
|
|
|
|
|
|
__END__ |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=encoding utf-8 |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
=head1 NAME |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
HTML::Object - HTML Parser, Modifier and Query Interface |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=head1 SYNOPSIS |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
use HTML::Object; |
840
|
|
|
|
|
|
|
my $p = HTML::Object->new( debug => 5 ); |
841
|
|
|
|
|
|
|
my $doc = $p->parse( $file, { utf8 => 1 } ) || die( $p->error, "\n" ); |
842
|
|
|
|
|
|
|
print $doc->as_string; |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
or, using the HTML DOM implementation same as the Web API: |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
use HTML::Object::DOM global_dom => 1; |
847
|
|
|
|
|
|
|
# then you can also use HTML::Object::XQuery for jQuery like DOM manipulation |
848
|
|
|
|
|
|
|
my $p = HTML::Object::DOM->new; |
849
|
|
|
|
|
|
|
my $doc = $p->parse_data( $some_html ) || die( $p->error, "\n" ); |
850
|
|
|
|
|
|
|
$('div.inner')->after( "<p>Test</p>" ); |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
# returns an HTML::Object::DOM::Collection |
853
|
|
|
|
|
|
|
my $divs = $doc->getElementsByTagName( 'div' ); |
854
|
|
|
|
|
|
|
my $new = $doc->createElement( 'div' ); |
855
|
|
|
|
|
|
|
$new->setAttribute( id => 'newDiv' ); |
856
|
|
|
|
|
|
|
$divs->[0]->parent->replaceChild( $new, $divs->[0] ); |
857
|
|
|
|
|
|
|
# etc. |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
To enable fatal error and also implement try-catch (using L<Nice::Try>) : |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
use HTML::Object fatal_error => 1, try_catch => 1; |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
=head1 VERSION |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
v0.2.6 |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
=head1 DESCRIPTION |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
This module is yet another HTML parser, manipulation and query interface, but probably the most comprehensive one. It uses the C parser from L<HTML::Parser> and has the unique particularity that it does not try to decode the entire html document tree only to re-encode it when printing out its data as string like so many other html parsers out there do. Instead, it modifies only the parts required. The rest is returned exactly as it was found in the HTML. This is faster and safer. |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
This distribution contains 144 modules that closely implement the HTML standard as documented on L<Mozilla documentation|https://developer.mozilla.org/en-US/docs/Web/API/HTML_DOM_API>. |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
It uses an external json data dictionary file of html tags (C<html_tags_dict.json>). |
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
There are 3 ways to manipulate and query the html data: |
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=over 4 |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
=item 1. L<HTML::Object::Element> |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
This is lightweight and simple |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
=item 2. L<HTML::Object::DOM> |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
This is an alternative HTML parser also based on L<HTML::Parser>, and that fully implements the Web API with DOM (Document Object Model), so you can query the HTML with perl equivalents of the JavaScript methods of the Web API. It has been designed to be strictly identical to the Web API. |
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
=item 3. L<HTML::Object::XQuery> |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
This interface provides a jQuery like API and requires the use of L<HTML::Object::DOM>. However, this is not designed to be a perl implementation of JavaScript, but rather a perl implementation of DOM manipulation methods found in jQuery. |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
=back |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
Note that this interface does not enforce HTML standard. It is up to you the developer to decide what value to use and where the HTML elements should go in the HTML tree and what to do with it. |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
=head1 METHODS |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
=head2 new |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
Instantiate a new L<HTML::Object> object. |
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
=head2 add_comment |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
This is a parser method called that will add a comment to the stack of html elements. |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=head2 add_declaration |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
This is a parser method called that will add a declaration to the stack of html elements. |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
=head2 add_default |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
This is a parser method called that will add a default html tag to the stack of html elements. |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
=head2 add_end |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
This is a parser method called that will add a closing html tag to the stack of html elements. |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
=head2 add_space |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
This is a parser method called that will add a space to the stack of html elements. |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
=head2 add_start |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
This is a parser method called that will add a starting html tag to the stack of html elements. |
924
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
=head2 add_text |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
This is a parser method called that will add a text to the stack of html elements. |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
=head2 current_parent |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
Sets or gets the current parent, which must be an L<HTML::Object::Element> object or an inheriting class. |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
=head2 dictionary |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
Returns a hash reference containing the HTML tags dictionary. Its structure is: |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
=over 4 |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
=item * dict |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
This property reflects a hash containing all the known tags. Each tag has the following possible properties: |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=over 8 |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
=item * description |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
String |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
=item * is_deprecated |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
Boolean value |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=item * is_empty |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
Boolean value |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
=item * is_inline |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
Boolean value |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
=item * is_svg |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
Boolean value that describes whether this is a tag dedicated to svg. |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
=item * link_in |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
Array reference of HTML attributes containing links |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
=item * ref |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
The reference URL to the online web documentation for this tag. |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=back |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
=item * meta |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
This property holds a hash reference containing the following meta information: |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
=over 8 |
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
=item * author |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
String |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=item * updated |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
ISO 8601 datetime |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
=item * version |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
Version number |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
=back |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
=back |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
=head2 document |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
Sets or gets the document L<HTML::Object::Document> object. |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
=head2 get_definition |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
Get the hash definition for a given tag (case does not matter). |
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
The tags definition is taken from the external file C<html_tags_dict.json> that is provided with this package. |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=head2 new_closing |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
Creates and returns a new closing html element L<HTML::Object::Closing>, passing it any arguments provided. |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
=head2 new_comment |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
Creates and returns a new comment element L<HTML::Object::Comment>, passing it any arguments provided. |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
=head2 new_declaration |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
Creates and returns a new declaration element L<HTML::Object::Declaration>, passing it any arguments provided. |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
=head2 new_document |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
Creates and returns a new document object L<HTML::Object::Document>, passing it any arguments provided. |
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
=head2 new_element |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
Creates and returns a new html element L<HTML::Object::Element>, passing it any arguments provided. |
1026
|
|
|
|
|
|
|
|
1027
|
|
|
|
|
|
|
=head2 new_space |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
Creates and returns a new space element L<HTML::Object::Space>, passing it any arguments provided. |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
=head2 new_special |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
Provided with an HTML tag class name and a hash or hash reference of options and this will load that class and instantiate an object, passing it the options provided. It returns the object thus instantiated. |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
This is used to instantiate object for special class to handle certain HTML tag, such as C<a> |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
=head2 new_text |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
Creates and returns a new text element L<HTML::Object::Text>, passing it any arguments provided. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
=head2 parse |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
Provided with some C<data> (see below), and some options as hash or hash reference and this will parse it and return a new L<HTML::Object::Document> object. |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
Possible accepted data are: |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
=over 4 |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=item I<code> |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
L</parse_data> will be called with it. |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
=item I<glob> |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
L</parse_data> will be called with it. |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
=item I<string> |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
L</parse_file> will be called with it. |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
=back |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
Other reference will return an error. |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
=head2 parse_data |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
Provided with some C<data> and some options as hash or hash reference and this will parse the given data and return a L<HTML::Object::Document> object. |
1068
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
If the option I<utf8> is provided, the C<data> received will be converted to utf8 using L<Encode/decode>. If an error occurs decoding the data into utf8, the error will be set as a L<Module::Generic::Exception> object and undef will be returned. |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
=head2 parse_file |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
Provided with a file path and some options as hash or hash reference and this will parse the file. |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
If the option I<utf8> is provided, the file will be opened with L<perlfunc/binmode> set to C<utf8> |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
It returns a new L<HTML::Object::Document> |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
=head2 parse_url |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
Provided with a URI supported by L<LWP::UserAgent> and this will issue a GET query and parse the resulting HTML data, and return a new L<HTML::Object::Document> or L<HTML::Object::DOM::Document> depending on which interface you use (either L<HTML::Object> or L<HTML::Object::DOM>). |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
If an error occurred, this will set an L<error|Module::Generic/error> and return C<undef>. |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
You can get the L<response|HTTP::Response> object with L</response> |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
=head2 parser |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
Sets or gets a L<HTML::Parser> object. |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
=head2 post_process |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
Provided with an L<HTML::Object::Element> and this will post process its parsing. |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
=head2 response |
1096
|
|
|
|
|
|
|
|
1097
|
|
|
|
|
|
|
Get the latest L<HTTP::Response> object from the HTTP query made using L</parse_url> |
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
=head2 sanity_check |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
Provided with an L<HTML::Object::Element> and this will perform some sanity checks and report the result on C<STDOUT>. |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=head2 set_dom |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
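Provided with a L<HTML::Object::Document> object and this sets the global variable C<$GLOBAL_DOM>. If any other defined and non-empty value is provided instead, it is passed to L</parse> called on a new object, and the value returned is used to set C<$GLOBAL_DOM>. This is particularly useful when using L<HTML::Object::XQuery> to do things like: |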
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
my $collection = $('div'); |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
=head1 CREDITS |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
Throughout the documentation of this distribution, a lot of descriptions, references and examples have been borrowed from Mozilla. I have also contributed to improving their documentation by fixing bugs and typos on their site. |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=head1 AUTHOR |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
Jacques Deguest E<lt>F<jack@deguest.jp>E<gt> |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
=head1 SEE ALSO |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
L<HTML::Object::DOM>, L<HTML::Object::Element>, L<HTML::Object::XQuery> |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
Copyright (c) 2021 DEGUEST Pte. Ltd. |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
You can use, copy, modify and redistribute this package and associated files under the same terms as Perl itself. |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
=cut |