File Coverage

blib/lib/HTML/TagParser.pm

Criterion	Covered	Total	%
statement	259	276	93.8
branch	104	134	77.6
condition	33	41	80.4
subroutine	33	36	91.6
pod	9	10	90.0
total	438	497	88.1

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							HTML::TagParser - Yet another HTML document parser with DOM-like methods
4
5							=head1 SYNOPSIS
6
7							Parse a HTML file and find its element's value. </td> </tr> <tr> <td class="h" > <a name="8">8</a> </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td class="s"> </td> </tr> <tr> <td class="h" > <a name="9">9</a> </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td class="s"> my $html = HTML::TagParser->new( "index-j.html" ); </td> </tr> <tr> <td class="h" > <a name="10">10</a> </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td class="s"> my $elem = $html->getElementsByTagName( "title" ); </td> </tr> <tr> <td class="h" > <a name="11">11</a> </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td > </td> <td class="s"> print "<title>", $elem->innerText(), "\n" if ref $elem;
12
13							Parse a HTML source and find its first attribute's value
14							and find all input elements belonging to this form.
15
16							my $src = '...';
17							my $html = HTML::TagParser->new( $src );
18							my $elem = $html->getElementsByTagName( "form" );
19							print " getAttribute("action"), "\">\n" if ref $elem;
20							my @first_inputs = $elem->subTree()->getElementsByTagName( "input" );
21							my $form = $first_inputs[0]->getParent();
22
23							Fetch a HTML file via HTTP, and display its all elements and attributes.
24
25							my $url = 'http://www.kawa.net/xp/index-e.html';
26							my $html = HTML::TagParser->new( $url );
27							my @list = $html->getElementsByTagName( "a" );
28							foreach my $elem ( @list ) {
29							my $tagname = $elem->tagName;
30							my $attr = $elem->attributes;
31							my $text = $elem->innerText;
32							print "<$tagname";
33							foreach my $key ( sort keys %$attr ) {
34							print " $key=\"$attr->{$key}\"";
35							}
36							if ( $text eq "" ) {
37							print " />\n";
38							} else {
39							print ">$text\n";
40							}
41							}
42
43							=head1 DESCRIPTION
44
45							HTML::TagParser is a pure Perl module which parses HTML/XHTML files.
46							This module provides some methods like DOM interface.
47							This module is not strict about XHTML format
48							because many of HTML pages are not strict.
49							You know, many pages use elemtents instead of
50							and have elements which are not closed.
51
52							=head1 METHODS
53
54							=head2 $html = HTML::TagParser->new();
55
56							This method constructs an empty instance of the C class.
57
58							=head2 $html = HTML::TagParser->new( $url );
59
60							If new() is called with a URL,
61							this method fetches a HTML file from remote web server and parses it
62							and returns its instance.
63							L module is required to fetch a file.
64
65							=head2 $html = HTML::TagParser->new( $file );
66
67							If new() is called with a filename,
68							this method parses a local HTML file and returns its instance
69
70							=head2 $html = HTML::TagParser->new( "...snip..." );
71
72							If new() is called with a string of HTML source code,
73							this method parses it and returns its instance.
74
75							=head2 $html->fetch( $url, %param );
76
77							This method fetches a HTML file from remote web server and parse it.
78							The second argument is optional parameters for L module.
79
80							=head2 $html->open( $file );
81
82							This method parses a local HTML file.
83
84							=head2 $html->parse( $source );
85
86							This method parses a string of HTML source code.
87
88							=head2 $elem = $html->getElementById( $id );
89
90							This method returns the element which id attribute is $id.
91
92							=head2 @elem = $html->getElementsByName( $name );
93
94							This method returns an array of elements which name attribute is $name.
95							On scalar context, the first element is only retruned.
96
97							=head2 @elem = $html->getElementsByTagName( $tagname );
98
99							This method returns an array of elements which tagName is $tagName.
100							On scalar context, the first element is only retruned.
101
102							=head2 @elem = $html->getElementsByClassName( $class );
103
104							This method returns an array of elements which className is $tagName.
105							On scalar context, the first element is only retruned.
106
107							=head2 @elem = $html->getElementsByAttribute( $attrname, $value );
108
109							This method returns an array of elements which $attrname attribute's value is $value.
110							On scalar context, the first element is only retruned.
111
112							=head1 HTML::TagParser::Element SUBCLASS
113
114							=head2 $tagname = $elem->tagName();
115
116							This method returns $elem's tagName.
117
118							=head2 $text = $elem->id();
119
120							This method returns $elem's id attribute.
121
122							=head2 $text = $elem->innerText();
123
124							This method returns $elem's innerText without tags.
125
126							=head2 $subhtml = $elem->subTree();
127
128							This method returns a new object of class HTML::Parser,
129							with all the elements that are in the DOM hierarchy under $elem.
130
131							=head2 $elem = $elem->nextSibling();
132
133							This method returns the next sibling within the same parent.
134							It returns undef when called on a closing tag or on the lastChild node
135							of a parentNode.
136
137							=head2 $elem = $elem->previousSibling();
138
139							This method returns the previous sibling within the same parent.
140							It returns undef when called on the firstChild node of a parentNode.
141
142							=head2 $child_elem = $elem->firstChild();
143
144							This method returns the first child node of $elem.
145							It returns undef when called on a closing tag element or on a
146							non-container or empty container element.
147
148							=head2 $child_elems = $elem->childNodes();
149
150							This method creates an array of all child nodes of $elem and returns the array by reference.
151							It returns an empty array-ref [] whenever firstChild() would return undef.
152
153							=head2 $child_elem = $elem->lastChild();
154
155							This method returns the last child node of $elem.
156							It returns undef whenever firstChild() would return undef.
157
158							=head2 $parent = $elem->parentNode();
159
160							This method returns the parent node of $elem.
161							It returns undef when called on root nodes.
162
163							=head2 $attr = $elem->attributes();
164
165							This method returns a hash of $elem's all attributes.
166
167							=head2 $value = $elem->getAttribute( $key );
168
169							This method returns the value of $elem's attributes which name is $key.
170
171							=head1 BUGS
172
173							The HTML-Parser is simple. Methods innerText and subTree may be
174							fooled by nested tags or embedded javascript code.
175
176							The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results.
177							The most expensive ones are lastChild() and previousSibling().
178							parentNode() is also expensive, but only once. It does caching.
179
180							The DOM tree is read-only, as this is just a parser.
181
182							=head1 INTERNATIONALIZATION
183
184							This module natively understands the character encoding used in document
185							by parsing its meta element.
186
187
188
189							The parsed document's encoding is converted
190							as this class's fixed internal encoding "UTF-8".
191
192							=head1 AUTHORS AND CONTRIBUTORS
193
194							drry [drry]
195							Juergen Weigert [jnw]
196							Yusuke Kawasaki [kawasaki] [kawanet]
197							Tim Wilde [twilde]
198
199							=head1 COPYRIGHT AND LICENSE
200
201							The following copyright notice applies to all the files provided in this
202							distribution, including binary files, unless explicitly noted otherwise.
203
204							Copyright 2006-2012 Yusuke Kawasaki
205
206							This program is free software; you can redistribute it and/or
207							modify it under the same terms as Perl itself.
208
209							=cut
210							# ----------------------------------------------------------------
211
212							package HTML::TagParser;
213	16			16		682399	use 5.008_001;
	16					66
	16					838
214	16			16		96	use strict;
	16					36
	16					1082
215	16			16		22975	use Symbol ();
	16					33051
	16					436
216	16			16		126	use Carp ();
	16					26
	16					854
217	16			16		34656	use Encode ();
	16					427777
	16					22745
218
219							our $VERSION = "0.20";
220
221							my $SEC_OF_DAY = 60 * 60 * 24;
222
223							# [000] '/' if closing tag.
224							# [001] tagName
225							# [002] attributes string (with trailing /, if self-closing tag).
226							# [003] content until next (nested) tag.
227							# [004] attributes hash cache.
228							# [005] innerText combined strings cache.
229							# [006] index of matching closing tag (or opening tag, if [000]=='/')
230							# [007] index of parent (aka container) tag.
231							#
232							sub new {
233	26			26	1	1403953	my $package = shift;
234	26					56	my $src = shift;
235	26					63	my $self = {};
236	26					73	bless $self, $package;
237	26	100				102	return $self unless defined $src;
238
239	19	100	66			569	if ( $src =~ m#^https?://\w# ) {
		100
		50
240	1					7	$self->fetch( $src, @_ );
241							}
242							elsif ( $src !~ m#[<>\|]# && -f $src ) {
243	12					47	$self->open($src);
244							}
245							elsif ( $src =~ /<.*>/ ) {
246	6					33	$self->parse($src);
247							}
248
249	19					92	$self;
250							}
251
252							sub fetch {
253	2			2	1	8	my $self = shift;
254	2					6	my $url = shift;
255	2	50				8	if ( !defined $URI::Fetch::VERSION ) {
256	0					0	local $@;
257	0					0	eval { require URI::Fetch; };
	0					0
258	0	0				0	Carp::croak "URI::Fetch is required: $url" if $@;
259							}
260	2					16	my $res = URI::Fetch->fetch( $url, @_ );
261	2	50				788652	Carp::croak "URI::Fetch failed: $url" unless ref $res;
262	2	50				12	return if $res->is_error();
263	2					70	$self->{modified} = $res->last_modified();
264	2					29	my $text = $res->content();
265	2					23	$self->parse( \$text );
266							}
267
268							sub open {
269	13			13	1	21	my $self = shift;
270	13					27	my $file = shift;
271	13					45	my $text = HTML::TagParser::Util::read_text_file($file);
272	13	50				47	return unless defined $text;
273	13					223	my $epoch = ( time() - ( -M $file ) * $SEC_OF_DAY );
274	13					28	$epoch -= $epoch % 60;
275	13					63	$self->{modified} = $epoch;
276	13					50	$self->parse( \$text );
277							}
278
279							sub parse {
280	25			25	1	66	my $self = shift;
281	25					49	my $text = shift;
282	25	100				111	my $txtref = ref $text ? $text : \$text;
283
284	25					112	my $charset = HTML::TagParser::Util::find_meta_charset($txtref);
285	25		66			218	$self->{charset} \|\|= $charset;
286	25	100	100			136	if ($charset && Encode::find_encoding($charset)) {
287	11					40530	HTML::TagParser::Util::encode_from_to( $txtref, $charset, "utf-8" );
288							}
289	25					665	my $flat = HTML::TagParser::Util::html_to_flat($txtref);
290	25	50				105	Carp::croak "Null HTML document." unless scalar @$flat;
291	25					120	$self->{flat} = $flat;
292	25					192	scalar @$flat;
293							}
294
295							sub getElementsByTagName {
296	35			35	1	25397	my $self = shift;
297	35					96	my $tagname = lc(shift);
298
299	35					104	my $flat = $self->{flat};
300	35					73	my $out = [];
301	35					234	for( my $i = 0 ; $i <= $#$flat ; $i++ ) {
302	4450	100				12869	next if ( $flat->[$i]->[001] ne $tagname );
303	79	100				192	next if $flat->[$i]->[000]; # close
304	75					335	my $elem = HTML::TagParser::Element->new( $flat, $i );
305	75	100				251	return $elem unless wantarray;
306	49					136	push( @$out, $elem );
307							}
308	9	50				28	return unless wantarray;
309	9					53	@$out;
310							}
311
312							sub getElementsByAttribute {
313	34			34	1	61	my $self = shift;
314	34					962	my $key = lc(shift);
315	34					49	my $val = shift;
316
317	34					71	my $flat = $self->{flat};
318	34					66	my $out = [];
319	34					128	for ( my $i = 0 ; $i <= $#$flat ; $i++ ) {
320	5306	100				16376	next if $flat->[$i]->[000]; # close
321	2965					6794	my $elem = HTML::TagParser::Element->new( $flat, $i );
322	2965					12229	my $attr = $elem->attributes();
323	2965	100				13454	next unless exists $attr->{$key};
324	635	100				3021	next if ( $attr->{$key} ne $val );
325	85	100				413	return $elem unless wantarray;
326	57					198	push( @$out, $elem );
327							}
328	6	50				61	return unless wantarray;
329	6					107	@$out;
330							}
331
332							sub getElementsByClassName {
333	5			5	1	13	my $self = shift;
334	5					11	my $class = shift;
335	5					18	return $self->getElementsByAttribute( "class", $class );
336							}
337
338							sub getElementsByName {
339	5			5	1	107	my $self = shift;
340	5					11	my $name = shift;
341	5					18	return $self->getElementsByAttribute( "name", $name );
342							}
343
344							sub getElementById {
345	15			15	1	3249	my $self = shift;
346	15					1025	my $id = shift;
347	15					66	return scalar $self->getElementsByAttribute( "id", $id );
348							}
349
350							sub modified {
351	0			0	0	0	$_[0]->{modified};
352							}
353
354							# ----------------------------------------------------------------
355
356							package HTML::TagParser::Element;
357	16			16		181	use strict;
	16					44
	16					42861
358
359							sub new {
360	3052			3052		4065	my $package = shift;
361	3052					6142	my $self = [@_];
362	3052					14165	bless $self, $package;
363	3052					9912	$self;
364							}
365
366							sub tagName {
367	6			6		33	my $self = shift;
368	6					15	my ( $flat, $cur ) = @$self;
369	6					39	return $flat->[$cur]->[001];
370							}
371
372							sub id {
373	2			2		3	my $self = shift;
374	2					13	$self->getAttribute("id");
375							}
376
377							sub getAttribute {
378	106			106		68545	my $self = shift;
379	106					214	my $name = lc(shift);
380	106					264	my $attr = $self->attributes();
381	106	100				288	return unless exists $attr->{$name};
382	96					524	$attr->{$name};
383							}
384
385							sub innerText {
386	43			43		5938	my $self = shift;
387	43					140	my ( $flat, $cur ) = @$self;
388	43					83	my $elem = $flat->[$cur];
389	43	100				180	return $elem->[005] if defined $elem->[005]; # cache
390	36	50				99	return if $elem->[000]; #
391	36	50	66			1396	return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); #
392
393	36					76	my $tagname = $elem->[001];
394	36					201	my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
395	36					89	my $list = [];
396	36					114	for ( ; $cur < $closing ; $cur++ ) {
397	307					891	push( @$list, $flat->[$cur]->[003] );
398							}
399	36					86	my $text = join( "", grep { $_ ne "" } @$list );
	307					578
400	36					765	$text =~ s/^\s+\|\s+$//sg;
401							# $text = "" if ( $cur == $#$flat ); # end of source
402	36					186	$elem->[005] = HTML::TagParser::Util::xml_unescape( $text );
403							}
404
405							sub subTree
406							{
407	0			0		0	my $self = shift;
408	0					0	my ( $flat, $cur ) = @$self;
409	0					0	my $elem = $flat->[$cur];
410	0	0				0	return if $elem->[000]; #
411	0					0	my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
412	0					0	my $list = [];
413	0					0	while (++$cur < $closing)
414							{
415	0					0	push @$list, $flat->[$cur];
416							}
417
418							# allow the getElement...() methods on the returned object.
419	0					0	return bless { flat => $list }, 'HTML::TagParser';
420							}
421
422
423							sub nextSibling
424							{
425	4			4		8	my $self = shift;
426	4					8	my ( $flat, $cur ) = @$self;
427	4					6	my $elem = $flat->[$cur];
428
429	4	50				11	return undef if $elem->[000]; #
430	4					7	my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
431	4					9	my $next_s = $flat->[$closing+1];
432	4	50				10	return undef unless $next_s;
433	4	100				15	return undef if $next_s->[000]; # parent's
434	2					7	return HTML::TagParser::Element->new( $flat, $closing+1 );
435							}
436
437							sub firstChild
438							{
439	1			1		2	my $self = shift;
440	1					2	my ( $flat, $cur ) = @$self;
441	1					2	my $elem = $flat->[$cur];
442	1	50				19	return undef if $elem->[000]; #
443	1					4	my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
444	1	50				6	return undef if $closing <= $cur+1; # no children here.
445	1					5	return HTML::TagParser::Element->new( $flat, $cur+1 );
446							}
447
448							sub childNodes
449							{
450	1			1		2	my $self = shift;
451	1					4	my ( $flat, $cur ) = @$self;
452	1					6	my $child = firstChild($self);
453	1	50				5	return [] unless $child; # an empty array is easier for our callers than undef
454	1					3	my @c = ( $child );
455	1					4	while (defined ($child = nextSibling($child)))
456							{
457	1					3	push @c, $child;
458							}
459	1					4	return \@c;
460							}
461
462							sub lastChild
463							{
464	0			0		0	my $c = childNodes(@_);
465	0	0				0	return undef unless $c->[0];
466	0					0	return $c->[-1];
467							}
468
469							sub previousSibling
470							{
471	7			7		7	my $self = shift;
472	7					10	my ( $flat, $cur ) = @$self;
473
474							## This one is expensive.
475							## We use find_closing() which walks forward.
476							## We'd need a find_opening() which walks backwards.
477							## So we walk backwards one by one and consult find_closing()
478							## until we find $cur-1 or $cur.
479
480	7					8	my $idx = $cur-1;
481	7					15	while ($idx >= 0)
482							{
483	16	100	100			49	if ($flat->[$idx][000] && defined($flat->[$idx][006]))
484							{
485	2					3	$idx = $flat->[$idx][006]; # use cache for backwards skipping
486	2					39	next;
487							}
488
489	14					26	my $closing = HTML::TagParser::Util::find_closing($flat, $idx);
490	14	100	66			81	return HTML::TagParser::Element->new( $flat, $idx )
			33
491							if defined $closing and ($closing == $cur \|\| $closing == $cur-1);
492	12					25	$idx--;
493							}
494	5					20	return undef;
495							}
496
497							sub parentNode
498							{
499	8			8		621	my $self = shift;
500	8					10	my ( $flat, $cur ) = @$self;
501
502	8	100				26	return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache
503
504							##
505							## This one is very expensive.
506							## We use previousSibling() to walk backwards, and
507							## previousSibling() is expensive.
508							##
509	5					5	my $ps = $self;
510	5					6	my $first = $self;
511
512	5					20	while (defined($ps = previousSibling($ps))) { $first = $ps; }
	2					4
513
514	5					7	my $parent = $first->[1] - 1;
515	5	100				13	return undef if $parent < 0;
516	4	50				8	die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur;
517
518	4					8	$flat->[$cur][007] = $parent; # cache
519	4					10	return HTML::TagParser::Element->new( $flat, $parent )
520							}
521
522							##
523							## feature:
524							## self-closing tags have an additional attribute '/' => '/'.
525							##
526							sub attributes {
527	3071			3071		3190	my $self = shift;
528	3071					4877	my ( $flat, $cur ) = @$self;
529	3071					4049	my $elem = $flat->[$cur];
530	3071	100				10154	return $elem->[004] if ref $elem->[004]; # cache
531	1666	100				5524	return unless defined $elem->[002];
532	811					1129	my $attr = {};
533	811					19303	while ( $elem->[002] =~ m{
534							([^\s="']+)(\s=\s(?:["']((?(?<=")(?:\\"\|[^"])?\|(?:\\'\|[^'])?))["']\|([^'"\s=]+)['"]*))?
535							}sgx ) {
536	1994					5730	my $key = $1;
537	1994					3888	my $test = $2;
538	1994		66			9916	my $val = $3 \|\| $4;
539	1994					4059	my $lckey = lc($key);
540	1994	100				22106	if ($test) {
541	1719					2433	$key =~ tr/A-Z/a-z/;
542	1719					5784	$val = HTML::TagParser::Util::xml_unescape( $val );
543	1719					17542	$attr->{$lckey} = $val;
544							}
545							else {
546	275					1670	$attr->{$lckey} = $key;
547							}
548							}
549	811					1703	$elem->[004] = $attr; # cache
550	811					2882	$attr;
551							}
552
553							# ----------------------------------------------------------------
554
555							package HTML::TagParser::Util;
556	16			16		147	use strict;
	16					34
	16					28142
557
558							sub xml_unescape {
559	1755			1755		3088	my $str = shift;
560	1755	100				4071	return unless defined $str;
561	1568					2674	$str =~ s/"/"/g;
562	1568					1823	$str =~ s/</
563	1568					2305	$str =~ s/>/>/g;
564	1568					10175	$str =~ s/&/&/g;
565	1568					3970	$str;
566							}
567
568							sub read_text_file {
569	13			13		22	my $file = shift;
570	13					56	my $fh = Symbol::gensym();
571	13	50				1320	open( $fh, $file ) or Carp::croak "$! - $file\n";
572	13					125	local $/ = undef;
573	13					757	my $text = <$fh>;
574	13					148	close($fh);
575	13					81	$text;
576							}
577
578							sub html_to_flat {
579	25			25		45	my $txtref = shift; # reference
580	25					56	my $flat = [];
581	25					86	pos($$txtref) = undef; # reset matching position
582	25					294	while ( $$txtref =~ m{
583							(?:[^<]*) < (?:
584							( / )? ( [^/!<>\s"'=]+ )
585							( (?:"[^"]"\|'[^']'\|[^"'<>])+ )?
586							\|
587							(!-- .? -- \| ![^\-] .? )
588							) > ([^<]*)
589							}sxg ) {
590							# [000] $1 close
591							# [001] $2 tagName
592							# [002] $3 attributes
593							# $4 comment element
594							# [003] $5 content
595	3914	100				10500	next if defined $4;
596	3868					19774	my $array = [ $1, $2, $3, $5 ];
597	3868					11232	$array->[001] =~ tr/A-Z/a-z/;
598							# $array->[003] =~ s/^\s+//s;
599							# $array->[003] =~ s/\s+$//s;
600	3868					87087	push( @$flat, $array );
601							}
602	25					90	$flat;
603							}
604
605							## returns 1 beyond the end, if not found.
606							## returns undef if called on a closing tag
607							sub find_closing
608							{
609	59			59		88	my ($flat, $cur) = @_;
610
611	59	100				194	return $flat->[$cur][006] if $flat->[$cur][006]; # cache
612	44	100	100			270	return $flat->[$cur][006] = $cur if (($flat->[$cur][002]\|\|'') =~ m{/$}); # self-closing
613
614	43					91	my $name = $flat->[$cur][001];
615	43					53	my $pre_nest = 0;
616							## count how many levels deep this type of tag is nested.
617	43					150	my $idx;
618	43					138	for ($idx = 0; $idx <= $cur; $idx++)
619							{
620	3318					6265	my $e = $flat->[$idx];
621	3318	100				8804	next unless $e->[001] eq $name;
622	278	50	100			1014	next if (($e->[002]\|\|'') =~ m{/$}); # self-closing
623	278	100				4882	$pre_nest += ($e->[000]) ? -1 : 1;
624	278	100				625	$pre_nest = 0 if $pre_nest < 0;
625	278	100	100			1322	$idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.
626							}
627	43					77	my $last_idx = $#$flat;
628
629							## we move last_idx closer, in case this container
630							## has not all its subcontainers closed properly.
631	43					72	my $post_nest = 0;
632	43					151	for ($idx = $last_idx; $idx > $cur; $idx--)
633							{
634	2526					3198	my $e = $flat->[$idx];
635	2526	100				14167	next unless $e->[001] eq $name;
636	45					84	$last_idx = $idx-1; # remember where a matching tag was
637	45	50	100			362	next if (($e->[002]\|\|'') =~ m{/$}); # self-closing
638	45	100				136	$post_nest -= ($e->[000]) ? -1 : 1;
639	45	100				146	$post_nest = 0 if $post_nest < 0;
640	45	100				141	last if $pre_nest <= $post_nest;
641	7	50	66			37	$idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards.
642							}
643
644	43					68	my $nest = 1; # we know it is not self-closing. start behind.
645
646	43					140	for ($idx = $cur+1; $idx <= $last_idx; $idx++)
647							{
648	316					310	my $e = $flat->[$idx];
649	316	100				938	next unless $e->[001] eq $name;
650	17	100	100			116	next if (($e->[002]\|\|'') =~ m{/$}); # self-closing
651	14	100				42	$nest += ($e->[000]) ? -1 : 1;
652	14	100				37	if ($nest <= 0)
653							{
654	6	50				19	die "assert " unless $e->[000];
655	6					16	$e->[006] = $cur; # point back to opening tag
656	6					23	return $flat->[$cur][006] = $idx;
657							}
658	8	100	100			106	$idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.
659							}
660
661							# not all closed, but cannot go further
662	37					153	return $flat->[$cur][006] = $last_idx+1;
663							}
664
665							sub find_meta_charset {
666	25			25		50	my $txtref = shift; # reference
667	25					368	while ( $$txtref =~ m{
668							]+\s )? http-equiv\s=\s['"]?Content-Type [^>]+ ) >
669							}sxgi ) {
670	12					37	my $args = $1;
671	12	50				99	return $1 if ( $args =~ m# charset=['"]?([^'"\s/]+) #sxgi );
672							}
673	13					29	undef;
674							}
675
676							sub encode_from_to {
677	11			11		37	my ( $txtref, $from, $to ) = @_;
678	11	50				164	return if ( $from eq "" );
679	11	50				38	return if ( $to eq "" );
680	11	100				46	return $to if ( uc($from) eq uc($to) );
681	7					54	Encode::from_to( $$txtref, $from, $to, Encode::XMLCREF() );
682	7					5292	return $to;
683							}
684
685							# ----------------------------------------------------------------
686							1;
687							# ----------------------------------------------------------------