blib/lib/HTML/TagParser.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 259 | 276 | 93.8 |
branch | 104 | 134 | 77.6 |
condition | 33 | 41 | 80.4 |
subroutine | 33 | 36 | 91.6 |
pod | 9 | 10 | 90.0 |
total | 438 | 497 | 88.1 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | =head1 NAME | ||||||
2 | |||||||
3 | HTML::TagParser - Yet another HTML document parser with DOM-like methods | ||||||
4 | |||||||
5 | =head1 SYNOPSIS | ||||||
6 | |||||||
7 | Parse a HTML file and find its |
||||||
8 | |||||||
9 | my $html = HTML::TagParser->new( "index-j.html" ); | ||||||
10 | my $elem = $html->getElementsByTagName( "title" ); | ||||||
11 | print " |
||||||
12 | |||||||
13 | Parse a HTML source and find its first | ||||||
14 | and find all input elements belonging to this form. | ||||||
15 | |||||||
16 | my $src = ''; | ||||||
17 | my $html = HTML::TagParser->new( $src ); | ||||||
18 | my $elem = $html->getElementsByTagName( "form" ); | ||||||
19 | print " | ||||||
20 | my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); | ||||||
21 | my $form = $first_inputs[0]->getParent(); | ||||||
22 | |||||||
23 | Fetch a HTML file via HTTP, and display its all elements and attributes. | ||||||
24 | |||||||
25 | my $url = 'http://www.kawa.net/xp/index-e.html'; | ||||||
26 | my $html = HTML::TagParser->new( $url ); | ||||||
27 | my @list = $html->getElementsByTagName( "a" ); | ||||||
28 | foreach my $elem ( @list ) { | ||||||
29 | my $tagname = $elem->tagName; | ||||||
30 | my $attr = $elem->attributes; | ||||||
31 | my $text = $elem->innerText; | ||||||
32 | print "<$tagname"; | ||||||
33 | foreach my $key ( sort keys %$attr ) { | ||||||
34 | print " $key=\"$attr->{$key}\""; | ||||||
35 | } | ||||||
36 | if ( $text eq "" ) { | ||||||
37 | print " />\n"; | ||||||
38 | } else { | ||||||
39 | print ">$text$tagname>\n"; | ||||||
40 | } | ||||||
41 | } | ||||||
42 | |||||||
43 | =head1 DESCRIPTION | ||||||
44 | |||||||
45 | HTML::TagParser is a pure Perl module which parses HTML/XHTML files. | ||||||
46 | This module provides some methods like DOM interface. | ||||||
47 | This module is not strict about XHTML format | ||||||
48 | because many of HTML pages are not strict. | ||||||
49 | You know, many pages use elemtents instead of |
||||||
50 | and have elements which are not closed. |
||||||
51 | |||||||
52 | =head1 METHODS | ||||||
53 | |||||||
54 | =head2 $html = HTML::TagParser->new(); | ||||||
55 | |||||||
56 | This method constructs an empty instance of the C |
||||||
57 | |||||||
58 | =head2 $html = HTML::TagParser->new( $url ); | ||||||
59 | |||||||
60 | If new() is called with a URL, | ||||||
61 | this method fetches a HTML file from remote web server and parses it | ||||||
62 | and returns its instance. | ||||||
63 | L |
||||||
64 | |||||||
65 | =head2 $html = HTML::TagParser->new( $file ); | ||||||
66 | |||||||
67 | If new() is called with a filename, | ||||||
68 | this method parses a local HTML file and returns its instance | ||||||
69 | |||||||
70 | =head2 $html = HTML::TagParser->new( "...snip..." ); | ||||||
71 | |||||||
72 | If new() is called with a string of HTML source code, | ||||||
73 | this method parses it and returns its instance. | ||||||
74 | |||||||
75 | =head2 $html->fetch( $url, %param ); | ||||||
76 | |||||||
77 | This method fetches a HTML file from remote web server and parse it. | ||||||
78 | The second argument is optional parameters for L |
||||||
79 | |||||||
80 | =head2 $html->open( $file ); | ||||||
81 | |||||||
82 | This method parses a local HTML file. | ||||||
83 | |||||||
84 | =head2 $html->parse( $source ); | ||||||
85 | |||||||
86 | This method parses a string of HTML source code. | ||||||
87 | |||||||
88 | =head2 $elem = $html->getElementById( $id ); | ||||||
89 | |||||||
90 | This method returns the element which id attribute is $id. | ||||||
91 | |||||||
92 | =head2 @elem = $html->getElementsByName( $name ); | ||||||
93 | |||||||
94 | This method returns an array of elements which name attribute is $name. | ||||||
95 | On scalar context, the first element is only retruned. | ||||||
96 | |||||||
97 | =head2 @elem = $html->getElementsByTagName( $tagname ); | ||||||
98 | |||||||
99 | This method returns an array of elements which tagName is $tagName. | ||||||
100 | On scalar context, the first element is only retruned. | ||||||
101 | |||||||
102 | =head2 @elem = $html->getElementsByClassName( $class ); | ||||||
103 | |||||||
104 | This method returns an array of elements which className is $tagName. | ||||||
105 | On scalar context, the first element is only retruned. | ||||||
106 | |||||||
107 | =head2 @elem = $html->getElementsByAttribute( $attrname, $value ); | ||||||
108 | |||||||
109 | This method returns an array of elements which $attrname attribute's value is $value. | ||||||
110 | On scalar context, the first element is only retruned. | ||||||
111 | |||||||
112 | =head1 HTML::TagParser::Element SUBCLASS | ||||||
113 | |||||||
114 | =head2 $tagname = $elem->tagName(); | ||||||
115 | |||||||
116 | This method returns $elem's tagName. | ||||||
117 | |||||||
118 | =head2 $text = $elem->id(); | ||||||
119 | |||||||
120 | This method returns $elem's id attribute. | ||||||
121 | |||||||
122 | =head2 $text = $elem->innerText(); | ||||||
123 | |||||||
124 | This method returns $elem's innerText without tags. | ||||||
125 | |||||||
126 | =head2 $subhtml = $elem->subTree(); | ||||||
127 | |||||||
128 | This method returns a new object of class HTML::Parser, | ||||||
129 | with all the elements that are in the DOM hierarchy under $elem. | ||||||
130 | |||||||
131 | =head2 $elem = $elem->nextSibling(); | ||||||
132 | |||||||
133 | This method returns the next sibling within the same parent. | ||||||
134 | It returns undef when called on a closing tag or on the lastChild node | ||||||
135 | of a parentNode. | ||||||
136 | |||||||
137 | =head2 $elem = $elem->previousSibling(); | ||||||
138 | |||||||
139 | This method returns the previous sibling within the same parent. | ||||||
140 | It returns undef when called on the firstChild node of a parentNode. | ||||||
141 | |||||||
142 | =head2 $child_elem = $elem->firstChild(); | ||||||
143 | |||||||
144 | This method returns the first child node of $elem. | ||||||
145 | It returns undef when called on a closing tag element or on a | ||||||
146 | non-container or empty container element. | ||||||
147 | |||||||
148 | =head2 $child_elems = $elem->childNodes(); | ||||||
149 | |||||||
150 | This method creates an array of all child nodes of $elem and returns the array by reference. | ||||||
151 | It returns an empty array-ref [] whenever firstChild() would return undef. | ||||||
152 | |||||||
153 | =head2 $child_elem = $elem->lastChild(); | ||||||
154 | |||||||
155 | This method returns the last child node of $elem. | ||||||
156 | It returns undef whenever firstChild() would return undef. | ||||||
157 | |||||||
158 | =head2 $parent = $elem->parentNode(); | ||||||
159 | |||||||
160 | This method returns the parent node of $elem. | ||||||
161 | It returns undef when called on root nodes. | ||||||
162 | |||||||
163 | =head2 $attr = $elem->attributes(); | ||||||
164 | |||||||
165 | This method returns a hash of $elem's all attributes. | ||||||
166 | |||||||
167 | =head2 $value = $elem->getAttribute( $key ); | ||||||
168 | |||||||
169 | This method returns the value of $elem's attributes which name is $key. | ||||||
170 | |||||||
171 | =head1 BUGS | ||||||
172 | |||||||
173 | The HTML-Parser is simple. Methods innerText and subTree may be | ||||||
174 | fooled by nested tags or embedded javascript code. | ||||||
175 | |||||||
176 | The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. | ||||||
177 | The most expensive ones are lastChild() and previousSibling(). | ||||||
178 | parentNode() is also expensive, but only once. It does caching. | ||||||
179 | |||||||
180 | The DOM tree is read-only, as this is just a parser. | ||||||
181 | |||||||
182 | =head1 INTERNATIONALIZATION | ||||||
183 | |||||||
184 | This module natively understands the character encoding used in document | ||||||
185 | by parsing its meta element. | ||||||
186 | |||||||
187 | |||||||
188 | |||||||
189 | The parsed document's encoding is converted | ||||||
190 | as this class's fixed internal encoding "UTF-8". | ||||||
191 | |||||||
192 | =head1 AUTHORS AND CONTRIBUTORS | ||||||
193 | |||||||
194 | drry [drry] | ||||||
195 | Juergen Weigert [jnw] | ||||||
196 | Yusuke Kawasaki [kawasaki] [kawanet] | ||||||
197 | Tim Wilde [twilde] | ||||||
198 | |||||||
199 | =head1 COPYRIGHT AND LICENSE | ||||||
200 | |||||||
201 | The following copyright notice applies to all the files provided in this | ||||||
202 | distribution, including binary files, unless explicitly noted otherwise. | ||||||
203 | |||||||
204 | Copyright 2006-2012 Yusuke Kawasaki | ||||||
205 | |||||||
206 | This program is free software; you can redistribute it and/or | ||||||
207 | modify it under the same terms as Perl itself. | ||||||
208 | |||||||
209 | =cut | ||||||
210 | # ---------------------------------------------------------------- | ||||||
211 | |||||||
212 | package HTML::TagParser; | ||||||
213 | 16 | 16 | 682399 | use 5.008_001; | |||
16 | 66 | ||||||
16 | 838 | ||||||
214 | 16 | 16 | 96 | use strict; | |||
16 | 36 | ||||||
16 | 1082 | ||||||
215 | 16 | 16 | 22975 | use Symbol (); | |||
16 | 33051 | ||||||
16 | 436 | ||||||
216 | 16 | 16 | 126 | use Carp (); | |||
16 | 26 | ||||||
16 | 854 | ||||||
217 | 16 | 16 | 34656 | use Encode (); | |||
16 | 427777 | ||||||
16 | 22745 | ||||||
218 | |||||||
219 | our $VERSION = "0.20"; | ||||||
220 | |||||||
221 | my $SEC_OF_DAY = 60 * 60 * 24; | ||||||
222 | |||||||
223 | # [000] '/' if closing tag. | ||||||
224 | # [001] tagName | ||||||
225 | # [002] attributes string (with trailing /, if self-closing tag). | ||||||
226 | # [003] content until next (nested) tag. | ||||||
227 | # [004] attributes hash cache. | ||||||
228 | # [005] innerText combined strings cache. | ||||||
229 | # [006] index of matching closing tag (or opening tag, if [000]=='/') | ||||||
230 | # [007] index of parent (aka container) tag. | ||||||
231 | # | ||||||
232 | sub new { | ||||||
233 | 26 | 26 | 1 | 1403953 | my $package = shift; | ||
234 | 26 | 56 | my $src = shift; | ||||
235 | 26 | 63 | my $self = {}; | ||||
236 | 26 | 73 | bless $self, $package; | ||||
237 | 26 | 100 | 102 | return $self unless defined $src; | |||
238 | |||||||
239 | 19 | 100 | 66 | 569 | if ( $src =~ m#^https?://\w# ) { | ||
100 | |||||||
50 | |||||||
240 | 1 | 7 | $self->fetch( $src, @_ ); | ||||
241 | } | ||||||
242 | elsif ( $src !~ m#[<>|]# && -f $src ) { | ||||||
243 | 12 | 47 | $self->open($src); | ||||
244 | } | ||||||
245 | elsif ( $src =~ /<.*>/ ) { | ||||||
246 | 6 | 33 | $self->parse($src); | ||||
247 | } | ||||||
248 | |||||||
249 | 19 | 92 | $self; | ||||
250 | } | ||||||
251 | |||||||
252 | sub fetch { | ||||||
253 | 2 | 2 | 1 | 8 | my $self = shift; | ||
254 | 2 | 6 | my $url = shift; | ||||
255 | 2 | 50 | 8 | if ( !defined $URI::Fetch::VERSION ) { | |||
256 | 0 | 0 | local $@; | ||||
257 | 0 | 0 | eval { require URI::Fetch; }; | ||||
0 | 0 | ||||||
258 | 0 | 0 | 0 | Carp::croak "URI::Fetch is required: $url" if $@; | |||
259 | } | ||||||
260 | 2 | 16 | my $res = URI::Fetch->fetch( $url, @_ ); | ||||
261 | 2 | 50 | 788652 | Carp::croak "URI::Fetch failed: $url" unless ref $res; | |||
262 | 2 | 50 | 12 | return if $res->is_error(); | |||
263 | 2 | 70 | $self->{modified} = $res->last_modified(); | ||||
264 | 2 | 29 | my $text = $res->content(); | ||||
265 | 2 | 23 | $self->parse( \$text ); | ||||
266 | } | ||||||
267 | |||||||
268 | sub open { | ||||||
269 | 13 | 13 | 1 | 21 | my $self = shift; | ||
270 | 13 | 27 | my $file = shift; | ||||
271 | 13 | 45 | my $text = HTML::TagParser::Util::read_text_file($file); | ||||
272 | 13 | 50 | 47 | return unless defined $text; | |||
273 | 13 | 223 | my $epoch = ( time() - ( -M $file ) * $SEC_OF_DAY ); | ||||
274 | 13 | 28 | $epoch -= $epoch % 60; | ||||
275 | 13 | 63 | $self->{modified} = $epoch; | ||||
276 | 13 | 50 | $self->parse( \$text ); | ||||
277 | } | ||||||
278 | |||||||
279 | sub parse { | ||||||
280 | 25 | 25 | 1 | 66 | my $self = shift; | ||
281 | 25 | 49 | my $text = shift; | ||||
282 | 25 | 100 | 111 | my $txtref = ref $text ? $text : \$text; | |||
283 | |||||||
284 | 25 | 112 | my $charset = HTML::TagParser::Util::find_meta_charset($txtref); | ||||
285 | 25 | 66 | 218 | $self->{charset} ||= $charset; | |||
286 | 25 | 100 | 100 | 136 | if ($charset && Encode::find_encoding($charset)) { | ||
287 | 11 | 40530 | HTML::TagParser::Util::encode_from_to( $txtref, $charset, "utf-8" ); | ||||
288 | } | ||||||
289 | 25 | 665 | my $flat = HTML::TagParser::Util::html_to_flat($txtref); | ||||
290 | 25 | 50 | 105 | Carp::croak "Null HTML document." unless scalar @$flat; | |||
291 | 25 | 120 | $self->{flat} = $flat; | ||||
292 | 25 | 192 | scalar @$flat; | ||||
293 | } | ||||||
294 | |||||||
295 | sub getElementsByTagName { | ||||||
296 | 35 | 35 | 1 | 25397 | my $self = shift; | ||
297 | 35 | 96 | my $tagname = lc(shift); | ||||
298 | |||||||
299 | 35 | 104 | my $flat = $self->{flat}; | ||||
300 | 35 | 73 | my $out = []; | ||||
301 | 35 | 234 | for( my $i = 0 ; $i <= $#$flat ; $i++ ) { | ||||
302 | 4450 | 100 | 12869 | next if ( $flat->[$i]->[001] ne $tagname ); | |||
303 | 79 | 100 | 192 | next if $flat->[$i]->[000]; # close | |||
304 | 75 | 335 | my $elem = HTML::TagParser::Element->new( $flat, $i ); | ||||
305 | 75 | 100 | 251 | return $elem unless wantarray; | |||
306 | 49 | 136 | push( @$out, $elem ); | ||||
307 | } | ||||||
308 | 9 | 50 | 28 | return unless wantarray; | |||
309 | 9 | 53 | @$out; | ||||
310 | } | ||||||
311 | |||||||
312 | sub getElementsByAttribute { | ||||||
313 | 34 | 34 | 1 | 61 | my $self = shift; | ||
314 | 34 | 962 | my $key = lc(shift); | ||||
315 | 34 | 49 | my $val = shift; | ||||
316 | |||||||
317 | 34 | 71 | my $flat = $self->{flat}; | ||||
318 | 34 | 66 | my $out = []; | ||||
319 | 34 | 128 | for ( my $i = 0 ; $i <= $#$flat ; $i++ ) { | ||||
320 | 5306 | 100 | 16376 | next if $flat->[$i]->[000]; # close | |||
321 | 2965 | 6794 | my $elem = HTML::TagParser::Element->new( $flat, $i ); | ||||
322 | 2965 | 12229 | my $attr = $elem->attributes(); | ||||
323 | 2965 | 100 | 13454 | next unless exists $attr->{$key}; | |||
324 | 635 | 100 | 3021 | next if ( $attr->{$key} ne $val ); | |||
325 | 85 | 100 | 413 | return $elem unless wantarray; | |||
326 | 57 | 198 | push( @$out, $elem ); | ||||
327 | } | ||||||
328 | 6 | 50 | 61 | return unless wantarray; | |||
329 | 6 | 107 | @$out; | ||||
330 | } | ||||||
331 | |||||||
332 | sub getElementsByClassName { | ||||||
333 | 5 | 5 | 1 | 13 | my $self = shift; | ||
334 | 5 | 11 | my $class = shift; | ||||
335 | 5 | 18 | return $self->getElementsByAttribute( "class", $class ); | ||||
336 | } | ||||||
337 | |||||||
338 | sub getElementsByName { | ||||||
339 | 5 | 5 | 1 | 107 | my $self = shift; | ||
340 | 5 | 11 | my $name = shift; | ||||
341 | 5 | 18 | return $self->getElementsByAttribute( "name", $name ); | ||||
342 | } | ||||||
343 | |||||||
344 | sub getElementById { | ||||||
345 | 15 | 15 | 1 | 3249 | my $self = shift; | ||
346 | 15 | 1025 | my $id = shift; | ||||
347 | 15 | 66 | return scalar $self->getElementsByAttribute( "id", $id ); | ||||
348 | } | ||||||
349 | |||||||
350 | sub modified { | ||||||
351 | 0 | 0 | 0 | 0 | $_[0]->{modified}; | ||
352 | } | ||||||
353 | |||||||
354 | # ---------------------------------------------------------------- | ||||||
355 | |||||||
356 | package HTML::TagParser::Element; | ||||||
357 | 16 | 16 | 181 | use strict; | |||
16 | 44 | ||||||
16 | 42861 | ||||||
358 | |||||||
359 | sub new { | ||||||
360 | 3052 | 3052 | 4065 | my $package = shift; | |||
361 | 3052 | 6142 | my $self = [@_]; | ||||
362 | 3052 | 14165 | bless $self, $package; | ||||
363 | 3052 | 9912 | $self; | ||||
364 | } | ||||||
365 | |||||||
366 | sub tagName { | ||||||
367 | 6 | 6 | 33 | my $self = shift; | |||
368 | 6 | 15 | my ( $flat, $cur ) = @$self; | ||||
369 | 6 | 39 | return $flat->[$cur]->[001]; | ||||
370 | } | ||||||
371 | |||||||
372 | sub id { | ||||||
373 | 2 | 2 | 3 | my $self = shift; | |||
374 | 2 | 13 | $self->getAttribute("id"); | ||||
375 | } | ||||||
376 | |||||||
377 | sub getAttribute { | ||||||
378 | 106 | 106 | 68545 | my $self = shift; | |||
379 | 106 | 214 | my $name = lc(shift); | ||||
380 | 106 | 264 | my $attr = $self->attributes(); | ||||
381 | 106 | 100 | 288 | return unless exists $attr->{$name}; | |||
382 | 96 | 524 | $attr->{$name}; | ||||
383 | } | ||||||
384 | |||||||
385 | sub innerText { | ||||||
386 | 43 | 43 | 5938 | my $self = shift; | |||
387 | 43 | 140 | my ( $flat, $cur ) = @$self; | ||||
388 | 43 | 83 | my $elem = $flat->[$cur]; | ||||
389 | 43 | 100 | 180 | return $elem->[005] if defined $elem->[005]; # cache | |||
390 | 36 | 50 | 99 | return if $elem->[000]; # | |||
391 | 36 | 50 | 66 | 1396 | return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); # |
||
392 | |||||||
393 | 36 | 76 | my $tagname = $elem->[001]; | ||||
394 | 36 | 201 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
395 | 36 | 89 | my $list = []; | ||||
396 | 36 | 114 | for ( ; $cur < $closing ; $cur++ ) { | ||||
397 | 307 | 891 | push( @$list, $flat->[$cur]->[003] ); | ||||
398 | } | ||||||
399 | 36 | 86 | my $text = join( "", grep { $_ ne "" } @$list ); | ||||
307 | 578 | ||||||
400 | 36 | 765 | $text =~ s/^\s+|\s+$//sg; | ||||
401 | # $text = "" if ( $cur == $#$flat ); # end of source | ||||||
402 | 36 | 186 | $elem->[005] = HTML::TagParser::Util::xml_unescape( $text ); | ||||
403 | } | ||||||
404 | |||||||
405 | sub subTree | ||||||
406 | { | ||||||
407 | 0 | 0 | 0 | my $self = shift; | |||
408 | 0 | 0 | my ( $flat, $cur ) = @$self; | ||||
409 | 0 | 0 | my $elem = $flat->[$cur]; | ||||
410 | 0 | 0 | 0 | return if $elem->[000]; # | |||
411 | 0 | 0 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
412 | 0 | 0 | my $list = []; | ||||
413 | 0 | 0 | while (++$cur < $closing) | ||||
414 | { | ||||||
415 | 0 | 0 | push @$list, $flat->[$cur]; | ||||
416 | } | ||||||
417 | |||||||
418 | # allow the getElement...() methods on the returned object. | ||||||
419 | 0 | 0 | return bless { flat => $list }, 'HTML::TagParser'; | ||||
420 | } | ||||||
421 | |||||||
422 | |||||||
423 | sub nextSibling | ||||||
424 | { | ||||||
425 | 4 | 4 | 8 | my $self = shift; | |||
426 | 4 | 8 | my ( $flat, $cur ) = @$self; | ||||
427 | 4 | 6 | my $elem = $flat->[$cur]; | ||||
428 | |||||||
429 | 4 | 50 | 11 | return undef if $elem->[000]; # | |||
430 | 4 | 7 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
431 | 4 | 9 | my $next_s = $flat->[$closing+1]; | ||||
432 | 4 | 50 | 10 | return undef unless $next_s; | |||
433 | 4 | 100 | 15 | return undef if $next_s->[000]; # parent's | |||
434 | 2 | 7 | return HTML::TagParser::Element->new( $flat, $closing+1 ); | ||||
435 | } | ||||||
436 | |||||||
437 | sub firstChild | ||||||
438 | { | ||||||
439 | 1 | 1 | 2 | my $self = shift; | |||
440 | 1 | 2 | my ( $flat, $cur ) = @$self; | ||||
441 | 1 | 2 | my $elem = $flat->[$cur]; | ||||
442 | 1 | 50 | 19 | return undef if $elem->[000]; # | |||
443 | 1 | 4 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
444 | 1 | 50 | 6 | return undef if $closing <= $cur+1; # no children here. | |||
445 | 1 | 5 | return HTML::TagParser::Element->new( $flat, $cur+1 ); | ||||
446 | } | ||||||
447 | |||||||
448 | sub childNodes | ||||||
449 | { | ||||||
450 | 1 | 1 | 2 | my $self = shift; | |||
451 | 1 | 4 | my ( $flat, $cur ) = @$self; | ||||
452 | 1 | 6 | my $child = firstChild($self); | ||||
453 | 1 | 50 | 5 | return [] unless $child; # an empty array is easier for our callers than undef | |||
454 | 1 | 3 | my @c = ( $child ); | ||||
455 | 1 | 4 | while (defined ($child = nextSibling($child))) | ||||
456 | { | ||||||
457 | 1 | 3 | push @c, $child; | ||||
458 | } | ||||||
459 | 1 | 4 | return \@c; | ||||
460 | } | ||||||
461 | |||||||
462 | sub lastChild | ||||||
463 | { | ||||||
464 | 0 | 0 | 0 | my $c = childNodes(@_); | |||
465 | 0 | 0 | 0 | return undef unless $c->[0]; | |||
466 | 0 | 0 | return $c->[-1]; | ||||
467 | } | ||||||
468 | |||||||
469 | sub previousSibling | ||||||
470 | { | ||||||
471 | 7 | 7 | 7 | my $self = shift; | |||
472 | 7 | 10 | my ( $flat, $cur ) = @$self; | ||||
473 | |||||||
474 | ## This one is expensive. | ||||||
475 | ## We use find_closing() which walks forward. | ||||||
476 | ## We'd need a find_opening() which walks backwards. | ||||||
477 | ## So we walk backwards one by one and consult find_closing() | ||||||
478 | ## until we find $cur-1 or $cur. | ||||||
479 | |||||||
480 | 7 | 8 | my $idx = $cur-1; | ||||
481 | 7 | 15 | while ($idx >= 0) | ||||
482 | { | ||||||
483 | 16 | 100 | 100 | 49 | if ($flat->[$idx][000] && defined($flat->[$idx][006])) | ||
484 | { | ||||||
485 | 2 | 3 | $idx = $flat->[$idx][006]; # use cache for backwards skipping | ||||
486 | 2 | 39 | next; | ||||
487 | } | ||||||
488 | |||||||
489 | 14 | 26 | my $closing = HTML::TagParser::Util::find_closing($flat, $idx); | ||||
490 | 14 | 100 | 66 | 81 | return HTML::TagParser::Element->new( $flat, $idx ) | ||
33 | |||||||
491 | if defined $closing and ($closing == $cur || $closing == $cur-1); | ||||||
492 | 12 | 25 | $idx--; | ||||
493 | } | ||||||
494 | 5 | 20 | return undef; | ||||
495 | } | ||||||
496 | |||||||
497 | sub parentNode | ||||||
498 | { | ||||||
499 | 8 | 8 | 621 | my $self = shift; | |||
500 | 8 | 10 | my ( $flat, $cur ) = @$self; | ||||
501 | |||||||
502 | 8 | 100 | 26 | return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache | |||
503 | |||||||
504 | ## | ||||||
505 | ## This one is very expensive. | ||||||
506 | ## We use previousSibling() to walk backwards, and | ||||||
507 | ## previousSibling() is expensive. | ||||||
508 | ## | ||||||
509 | 5 | 5 | my $ps = $self; | ||||
510 | 5 | 6 | my $first = $self; | ||||
511 | |||||||
512 | 5 | 20 | while (defined($ps = previousSibling($ps))) { $first = $ps; } | ||||
2 | 4 | ||||||
513 | |||||||
514 | 5 | 7 | my $parent = $first->[1] - 1; | ||||
515 | 5 | 100 | 13 | return undef if $parent < 0; | |||
516 | 4 | 50 | 8 | die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; | |||
517 | |||||||
518 | 4 | 8 | $flat->[$cur][007] = $parent; # cache | ||||
519 | 4 | 10 | return HTML::TagParser::Element->new( $flat, $parent ) | ||||
520 | } | ||||||
521 | |||||||
522 | ## | ||||||
523 | ## feature: | ||||||
524 | ## self-closing tags have an additional attribute '/' => '/'. | ||||||
525 | ## | ||||||
526 | sub attributes { | ||||||
527 | 3071 | 3071 | 3190 | my $self = shift; | |||
528 | 3071 | 4877 | my ( $flat, $cur ) = @$self; | ||||
529 | 3071 | 4049 | my $elem = $flat->[$cur]; | ||||
530 | 3071 | 100 | 10154 | return $elem->[004] if ref $elem->[004]; # cache | |||
531 | 1666 | 100 | 5524 | return unless defined $elem->[002]; | |||
532 | 811 | 1129 | my $attr = {}; | ||||
533 | 811 | 19303 | while ( $elem->[002] =~ m{ | ||||
534 | ([^\s="']+)(\s*=\s*(?:["']((?(?<=")(?:\\"|[^"])*?|(?:\\'|[^'])*?))["']|([^'"\s=]+)['"]*))? | ||||||
535 | }sgx ) { | ||||||
536 | 1994 | 5730 | my $key = $1; | ||||
537 | 1994 | 3888 | my $test = $2; | ||||
538 | 1994 | 66 | 9916 | my $val = $3 || $4; | |||
539 | 1994 | 4059 | my $lckey = lc($key); | ||||
540 | 1994 | 100 | 22106 | if ($test) { | |||
541 | 1719 | 2433 | $key =~ tr/A-Z/a-z/; | ||||
542 | 1719 | 5784 | $val = HTML::TagParser::Util::xml_unescape( $val ); | ||||
543 | 1719 | 17542 | $attr->{$lckey} = $val; | ||||
544 | } | ||||||
545 | else { | ||||||
546 | 275 | 1670 | $attr->{$lckey} = $key; | ||||
547 | } | ||||||
548 | } | ||||||
549 | 811 | 1703 | $elem->[004] = $attr; # cache | ||||
550 | 811 | 2882 | $attr; | ||||
551 | } | ||||||
552 | |||||||
553 | # ---------------------------------------------------------------- | ||||||
554 | |||||||
555 | package HTML::TagParser::Util; | ||||||
556 | 16 | 16 | 147 | use strict; | |||
16 | 34 | ||||||
16 | 28142 | ||||||
557 | |||||||
558 | sub xml_unescape { | ||||||
559 | 1755 | 1755 | 3088 | my $str = shift; | |||
560 | 1755 | 100 | 4071 | return unless defined $str; | |||
561 | 1568 | 2674 | $str =~ s/"/"/g; | ||||
562 | 1568 | 1823 | $str =~ s/</ | ||||
563 | 1568 | 2305 | $str =~ s/>/>/g; | ||||
564 | 1568 | 10175 | $str =~ s/&/&/g; | ||||
565 | 1568 | 3970 | $str; | ||||
566 | } | ||||||
567 | |||||||
568 | sub read_text_file { | ||||||
569 | 13 | 13 | 22 | my $file = shift; | |||
570 | 13 | 56 | my $fh = Symbol::gensym(); | ||||
571 | 13 | 50 | 1320 | open( $fh, $file ) or Carp::croak "$! - $file\n"; | |||
572 | 13 | 125 | local $/ = undef; | ||||
573 | 13 | 757 | my $text = <$fh>; | ||||
574 | 13 | 148 | close($fh); | ||||
575 | 13 | 81 | $text; | ||||
576 | } | ||||||
577 | |||||||
578 | sub html_to_flat { | ||||||
579 | 25 | 25 | 45 | my $txtref = shift; # reference | |||
580 | 25 | 56 | my $flat = []; | ||||
581 | 25 | 86 | pos($$txtref) = undef; # reset matching position | ||||
582 | 25 | 294 | while ( $$txtref =~ m{ | ||||
583 | (?:[^<]*) < (?: | ||||||
584 | ( / )? ( [^/!<>\s"'=]+ ) | ||||||
585 | ( (?:"[^"]*"|'[^']*'|[^"'<>])+ )? | ||||||
586 | | | ||||||
587 | (!-- .*? -- | ![^\-] .*? ) | ||||||
588 | ) > ([^<]*) | ||||||
589 | }sxg ) { | ||||||
590 | # [000] $1 close | ||||||
591 | # [001] $2 tagName | ||||||
592 | # [002] $3 attributes | ||||||
593 | # $4 comment element | ||||||
594 | # [003] $5 content | ||||||
595 | 3914 | 100 | 10500 | next if defined $4; | |||
596 | 3868 | 19774 | my $array = [ $1, $2, $3, $5 ]; | ||||
597 | 3868 | 11232 | $array->[001] =~ tr/A-Z/a-z/; | ||||
598 | # $array->[003] =~ s/^\s+//s; | ||||||
599 | # $array->[003] =~ s/\s+$//s; | ||||||
600 | 3868 | 87087 | push( @$flat, $array ); | ||||
601 | } | ||||||
602 | 25 | 90 | $flat; | ||||
603 | } | ||||||
604 | |||||||
605 | ## returns 1 beyond the end, if not found. | ||||||
606 | ## returns undef if called on a closing tag | ||||||
607 | sub find_closing | ||||||
608 | { | ||||||
609 | 59 | 59 | 88 | my ($flat, $cur) = @_; | |||
610 | |||||||
611 | 59 | 100 | 194 | return $flat->[$cur][006] if $flat->[$cur][006]; # cache | |||
612 | 44 | 100 | 100 | 270 | return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing | ||
613 | |||||||
614 | 43 | 91 | my $name = $flat->[$cur][001]; | ||||
615 | 43 | 53 | my $pre_nest = 0; | ||||
616 | ## count how many levels deep this type of tag is nested. | ||||||
617 | 43 | 150 | my $idx; | ||||
618 | 43 | 138 | for ($idx = 0; $idx <= $cur; $idx++) | ||||
619 | { | ||||||
620 | 3318 | 6265 | my $e = $flat->[$idx]; | ||||
621 | 3318 | 100 | 8804 | next unless $e->[001] eq $name; | |||
622 | 278 | 50 | 100 | 1014 | next if (($e->[002]||'') =~ m{/$}); # self-closing | ||
623 | 278 | 100 | 4882 | $pre_nest += ($e->[000]) ? -1 : 1; | |||
624 | 278 | 100 | 625 | $pre_nest = 0 if $pre_nest < 0; | |||
625 | 278 | 100 | 100 | 1322 | $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. | ||
626 | } | ||||||
627 | 43 | 77 | my $last_idx = $#$flat; | ||||
628 | |||||||
629 | ## we move last_idx closer, in case this container | ||||||
630 | ## has not all its subcontainers closed properly. | ||||||
631 | 43 | 72 | my $post_nest = 0; | ||||
632 | 43 | 151 | for ($idx = $last_idx; $idx > $cur; $idx--) | ||||
633 | { | ||||||
634 | 2526 | 3198 | my $e = $flat->[$idx]; | ||||
635 | 2526 | 100 | 14167 | next unless $e->[001] eq $name; | |||
636 | 45 | 84 | $last_idx = $idx-1; # remember where a matching tag was | ||||
637 | 45 | 50 | 100 | 362 | next if (($e->[002]||'') =~ m{/$}); # self-closing | ||
638 | 45 | 100 | 136 | $post_nest -= ($e->[000]) ? -1 : 1; | |||
639 | 45 | 100 | 146 | $post_nest = 0 if $post_nest < 0; | |||
640 | 45 | 100 | 141 | last if $pre_nest <= $post_nest; | |||
641 | 7 | 50 | 66 | 37 | $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards. | ||
642 | } | ||||||
643 | |||||||
644 | 43 | 68 | my $nest = 1; # we know it is not self-closing. start behind. | ||||
645 | |||||||
646 | 43 | 140 | for ($idx = $cur+1; $idx <= $last_idx; $idx++) | ||||
647 | { | ||||||
648 | 316 | 310 | my $e = $flat->[$idx]; | ||||
649 | 316 | 100 | 938 | next unless $e->[001] eq $name; | |||
650 | 17 | 100 | 100 | 116 | next if (($e->[002]||'') =~ m{/$}); # self-closing | ||
651 | 14 | 100 | 42 | $nest += ($e->[000]) ? -1 : 1; | |||
652 | 14 | 100 | 37 | if ($nest <= 0) | |||
653 | { | ||||||
654 | 6 | 50 | 19 | die "assert " unless $e->[000]; | |||
655 | 6 | 16 | $e->[006] = $cur; # point back to opening tag | ||||
656 | 6 | 23 | return $flat->[$cur][006] = $idx; | ||||
657 | } | ||||||
658 | 8 | 100 | 100 | 106 | $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. | ||
659 | } | ||||||
660 | |||||||
661 | # not all closed, but cannot go further | ||||||
662 | 37 | 153 | return $flat->[$cur][006] = $last_idx+1; | ||||
663 | } | ||||||
664 | |||||||
665 | sub find_meta_charset { | ||||||
666 | 25 | 25 | 50 | my $txtref = shift; # reference | |||
667 | 25 | 368 | while ( $$txtref =~ m{ | ||||
668 | ]+\s )? http-equiv\s*=\s*['"]?Content-Type [^>]+ ) > | ||||||
669 | }sxgi ) { | ||||||
670 | 12 | 37 | my $args = $1; | ||||
671 | 12 | 50 | 99 | return $1 if ( $args =~ m# charset=['"]?([^'"\s/]+) #sxgi ); | |||
672 | } | ||||||
673 | 13 | 29 | undef; | ||||
674 | } | ||||||
675 | |||||||
676 | sub encode_from_to { | ||||||
677 | 11 | 11 | 37 | my ( $txtref, $from, $to ) = @_; | |||
678 | 11 | 50 | 164 | return if ( $from eq "" ); | |||
679 | 11 | 50 | 38 | return if ( $to eq "" ); | |||
680 | 11 | 100 | 46 | return $to if ( uc($from) eq uc($to) ); | |||
681 | 7 | 54 | Encode::from_to( $$txtref, $from, $to, Encode::XMLCREF() ); | ||||
682 | 7 | 5292 | return $to; | ||||
683 | } | ||||||
684 | |||||||
685 | # ---------------------------------------------------------------- | ||||||
686 | 1; | ||||||
687 | # ---------------------------------------------------------------- |