line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# $Id: libXML.pm,v 0.2 2009/02/21 11:47:58 dankogai Exp dankogai $ |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
package HTML::Tidy::libXML; |
5
|
1
|
|
|
1
|
|
39767
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
36
|
|
6
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
114
|
|
7
|
1
|
|
|
1
|
|
1255
|
use Encode; |
|
1
|
|
|
|
|
15155
|
|
|
1
|
|
|
|
|
104
|
|
8
|
1
|
|
|
1
|
|
445
|
use XML::LibXML; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub new { |
13
|
|
|
|
|
|
|
my $class = shift; |
14
|
|
|
|
|
|
|
my $lx = XML::LibXML->new; |
15
|
|
|
|
|
|
|
$lx->validation(0); |
16
|
|
|
|
|
|
|
$lx->recover_silently(1); |
17
|
|
|
|
|
|
|
bless { lx => $lx }, $class; |
18
|
|
|
|
|
|
|
} |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub html2dom { |
21
|
|
|
|
|
|
|
my ( $self, $html, $encoding ) = @_; |
22
|
|
|
|
|
|
|
$encoding ||= 'iso-8859-1'; |
23
|
|
|
|
|
|
|
$html =~ s/\r\n?/\n/msg; # normalize CRLF to LF |
24
|
|
|
|
|
|
|
$html = decode( $encoding, $html ); # leave the utf8 flag |
25
|
|
|
|
|
|
|
$self->{lx}->parse_html_string($html); |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub dom2xml { |
29
|
|
|
|
|
|
|
my ($self, $dom, $level) = @_; |
30
|
|
|
|
|
|
|
my $root = $dom->findnodes('/html')->shift; |
31
|
|
|
|
|
|
|
$root->setAttribute( xmlns => 'http://www.w3.org/1999/xhtml' ); |
32
|
|
|
|
|
|
|
for my $meta ( $dom->findnodes('//meta[@http-equiv!=""]') ) { |
33
|
|
|
|
|
|
|
$meta->setAttribute( content => 'text/html; charset=utf-8' ); |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
_tidy_dom($dom) if $level > 0; |
36
|
|
|
|
|
|
|
my $xhtml = $root->toString( 0, 'utf-8' ); # utf8 flag off |
37
|
|
|
|
|
|
|
return <
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
41
|
|
|
|
|
|
|
$xhtml |
42
|
|
|
|
|
|
|
EOT |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub html2xml { |
46
|
|
|
|
|
|
|
my ( $self, $html, $encoding, $level ) = @_; |
47
|
|
|
|
|
|
|
my $dom = $self->html2dom( $html, $encoding ); |
48
|
|
|
|
|
|
|
$self->dom2xml($dom, $level); |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub _tidy_dom { |
52
|
|
|
|
|
|
|
my $dom = shift; |
53
|
|
|
|
|
|
|
# remove empty attributes (like ) |
54
|
|
|
|
|
|
|
for my $node ( $dom->findnodes('//*[attribute::*=""]') ) { |
55
|
|
|
|
|
|
|
for my $attr ( $node->attributes ) { |
56
|
|
|
|
|
|
|
next if $attr->getValue; |
57
|
|
|
|
|
|
|
$node->removeAttribute( $attr->getName ); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
# handle |
70
|
|
|
|
|
|
|
$script->appendChild( $dom->createTextNode("") ); |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
# handle |