| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Webrobot::Html2XHtml; |
|
2
|
1
|
|
|
1
|
|
585
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
23
|
|
|
3
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
21
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
# Author: Stefan Trcek |
|
6
|
|
|
|
|
|
|
# Copyright(c) 2004 ABAS Software AG |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
WWW::Webrobot::Html2XHtml - convert HTML to XML |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use WWW::Webrobot::Html2XHtml; |
|
15
|
|
|
|
|
|
|
my $converter = WWW::Webrobot::Html2XHtml -> new(); |
|
16
|
|
|
|
|
|
|
$converter->to_xhtml($dirty_html, $encoding); |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 METHODS |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=over |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=cut |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
|
28
|
1
|
|
|
1
|
|
1001
|
use HTML::TreeBuilder; |
|
|
1
|
|
|
|
|
32369
|
|
|
|
1
|
|
|
|
|
12
|
|
|
29
|
1
|
|
|
1
|
|
41
|
use HTML::Entities; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
7586
|
|
|
30
|
1
|
|
|
1
|
|
680
|
use WWW::Webrobot::MyEncode qw/has_Encode octet_to_internal_utf8/; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
622
|
|
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $XML_HEADER = qq(\n); |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
my %e2c = |
|
37
|
|
|
|
|
|
|
map {$_ => pack("U", ord $HTML::Entities::entity2char{$_})} |
|
38
|
|
|
|
|
|
|
grep {my $value = ord($HTML::Entities::entity2char{$_}); 128 <= $value && $value < 256} |
|
39
|
|
|
|
|
|
|
keys %HTML::Entities::entity2char; |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item new |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Constructor |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=cut |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
sub new { |
|
49
|
4
|
|
|
4
|
1
|
5274
|
my $class = shift; |
|
50
|
4
|
|
33
|
|
|
34
|
my $self = bless({}, ref($class) || $class); |
|
51
|
4
|
|
|
|
|
12
|
return $self; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
sub html_decode_entities_utf8 { |
|
55
|
4
|
|
|
4
|
0
|
8
|
my ($value) = @_; |
|
56
|
4
|
|
|
|
|
26
|
foreach ($value) { |
|
57
|
4
|
50
|
33
|
|
|
22
|
s/(&\#(\d+);?)/ 128<=$2 && $2<256 ? pack("U", $2) : $1 /eg; |
|
|
3
|
|
|
|
|
26
|
|
|
58
|
4
|
0
|
0
|
|
|
15
|
s/(&\#[xX]([0-9a-fA-F]+);?)/ my $c = hex($2); 128<=$c && $c<256 ? pack("U", $c) : $1 /eg; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
59
|
4
|
100
|
|
|
|
25
|
s/(&(\w+);?)/ $e2c{$2} || $1 /eg; |
|
|
5
|
|
|
|
|
36
|
|
|
60
|
|
|
|
|
|
|
} |
|
61
|
4
|
|
|
|
|
9
|
return $value; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item to_xhtml($dirty_html, $encoding) |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Convert C<$dirty_html> to XML. |
|
67
|
|
|
|
|
|
|
C<$dirty_html> is a sequence of octets and is assumend to be |
|
68
|
|
|
|
|
|
|
coded in C<$encoding>. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=cut |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub to_xhtml { |
|
73
|
4
|
|
|
4
|
1
|
17
|
my ($self, $dirty_html, $encoding) = @_; |
|
74
|
|
|
|
|
|
|
#return "NO VALID ENCODING='$encoding'\n" if ! $encoding; |
|
75
|
|
|
|
|
|
|
|
|
76
|
4
|
|
|
|
|
26
|
my $parser = new HTML::TreeBuilder(); |
|
77
|
4
|
|
|
|
|
831
|
$parser->no_space_compacting(1); |
|
78
|
4
|
|
|
|
|
41
|
$parser->ignore_ignorable_whitespace(0); |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Encode $dirty_html to Perls internal encoding UTF-8. |
|
81
|
4
|
|
|
|
|
34
|
$dirty_html = octet_to_internal_utf8($encoding, $dirty_html); |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Decode HTML entities, because HTML::TreeBuilder doesn't handle it right. |
|
84
|
|
|
|
|
|
|
# Can't use HTML::Entities::decode_entities because it uses 'chr($x)' |
|
85
|
|
|
|
|
|
|
# instead of 'pack("U",$x)' |
|
86
|
4
|
|
|
|
|
10
|
$dirty_html = html_decode_entities_utf8($dirty_html); |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Parse $dirty_html and encode all remaining bytes as html entities. |
|
89
|
|
|
|
|
|
|
# That works because all non-ASCII UTF-8 character bytes are 1xxxxxxx |
|
90
|
4
|
|
|
|
|
57
|
my $tree = $parser->parse($dirty_html); |
|
91
|
4
|
|
|
|
|
3690
|
my $xml = $XML_HEADER . $tree->as_XML(); |
|
92
|
|
|
|
|
|
|
# $xml has all byte encoded as x; |
|
93
|
4
|
|
|
|
|
3684
|
$tree = $tree -> delete; |
|
94
|
|
|
|
|
|
|
|
|
95
|
4
|
50
|
|
|
|
321
|
if (! has_Encode()) { |
|
|
|
50
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# Decode UTF-8 characters and control characters, $xml is ASCII |
|
97
|
0
|
0
|
0
|
|
|
0
|
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg; |
|
|
0
|
|
|
|
|
0
|
|
|
98
|
|
|
|
|
|
|
} |
|
99
|
|
|
|
|
|
|
elsif (Encode::is_utf8($xml)) { # SunOS 5.7 / perl 5.8.5 |
|
100
|
|
|
|
|
|
|
# Decode UTF-8 characters and control characters, $xml is UTF-8 |
|
101
|
4
|
0
|
0
|
|
|
14
|
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("U", $2) /eg; |
|
|
0
|
|
|
|
|
0
|
|
|
102
|
|
|
|
|
|
|
} |
|
103
|
|
|
|
|
|
|
else { # Linux perl 5.8.0/5.8.5, Win32 perl 5.8.0 |
|
104
|
|
|
|
|
|
|
# Decode UTF-8 characters and control characters, $xml is ASCII |
|
105
|
0
|
0
|
0
|
|
|
0
|
$xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg; |
|
|
0
|
|
|
|
|
0
|
|
|
106
|
|
|
|
|
|
|
# Now we have an UTF-8 string and must Perl believe so too. |
|
107
|
0
|
|
|
|
|
0
|
Encode::_utf8_on($xml); |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
|
|
110
|
4
|
|
|
|
|
65
|
return $xml; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=back |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
1; |