line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::Webrobot::Html2XHtml; |
2
|
1
|
|
|
1
|
|
585
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
23
|
|
3
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
21
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
# Author: Stefan Trcek |
6
|
|
|
|
|
|
|
# Copyright(c) 2004 ABAS Software AG |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
WWW::Webrobot::Html2XHtml - convert HTML to XML |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use WWW::Webrobot::Html2XHtml; |
15
|
|
|
|
|
|
|
my $converter = WWW::Webrobot::Html2XHtml -> new(); |
16
|
|
|
|
|
|
|
$converter->to_xhtml($dirty_html, $encoding); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 DESCRIPTION |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 METHODS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=over |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=cut |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
28
|
1
|
|
|
1
|
|
1001
|
use HTML::TreeBuilder; |
|
1
|
|
|
|
|
32369
|
|
|
1
|
|
|
|
|
12
|
|
29
|
1
|
|
|
1
|
|
41
|
use HTML::Entities; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7586
|
|
30
|
1
|
|
|
1
|
|
680
|
use WWW::Webrobot::MyEncode qw/has_Encode octet_to_internal_utf8/; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
622
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Text that to_xhtml() puts in front of the serialized document.
# NOTE(review): as visible here the header is just a newline; if an XML
# declaration was intended it is not present -- confirm against upstream.
my $XML_HEADER = qq(\n);

# Lookup table: HTML entity name => one-character string.
# Only entities whose character has a code point in 128..255 are kept;
# every other entry of %HTML::Entities::entity2char is skipped.
my %e2c;
for my $entity (keys %HTML::Entities::entity2char) {
    my $code = ord $HTML::Entities::entity2char{$entity};
    next if $code < 128 || $code >= 256;
    # pack "U" builds the character from its code point.
    $e2c{$entity} = pack("U", $code);
}
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item new |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Constructor |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=cut |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Constructor: returns a new object backed by an empty hash.
# Works both as a class method and as an instance method; when called on
# an existing object, the new object is blessed into that object's class.
sub new {
    my ($proto) = @_;
    my $package = ref($proto) || $proto;
    return bless {}, $package;
}
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Helper for html_decode_entities_utf8: returns the character for $code when
# 128 <= $code < 256, otherwise returns $literal (the matched text) unchanged.
sub _upper_latin1_or_literal {
    my ($code, $literal) = @_;
    return $literal if $code < 128 || $code >= 256;
    return pack("U", $code);
}

# Replace character references for code points 128..255 by the character
# itself and return the resulting copy of the input.
#
# Handled forms (the trailing ';' is optional in all of them):
#   &#NNN;    decimal reference
#   &#xHH;    hexadecimal reference ('x' or 'X')
#   &name;    named entity, if it has a true entry in %e2c
# Everything else, including references outside 128..255, is left as is.
sub html_decode_entities_utf8 {
    my $text = shift;

    # Decimal numeric references.
    $text =~ s{(&\#(\d+);?)}{ _upper_latin1_or_literal($2, $1) }eg;

    # Hexadecimal numeric references.
    $text =~ s{(&\#[xX]([0-9a-fA-F]+);?)}{ _upper_latin1_or_literal(hex($2), $1) }eg;

    # Named entities; names without a true %e2c entry keep their original text.
    $text =~ s{(&(\w+);?)}{ $e2c{$2} || $1 }eg;

    return $text;
}
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item to_xhtml($dirty_html, $encoding) |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Convert C<$dirty_html> to XML. |
67
|
|
|
|
|
|
|
C<$dirty_html> is a sequence of octets and is assumed to be |
68
|
|
|
|
|
|
|
coded in C<$encoding>. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=cut |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Convert (possibly malformed) HTML into an XML string.
#
# Parameters:
#   $self        - the converter object (not otherwise used here)
#   $dirty_html  - the HTML input, a sequence of octets
#   $encoding    - the encoding $dirty_html is assumed to be in; passed on
#                  to octet_to_internal_utf8()
#
# Returns $XML_HEADER followed by the XML serialization of the parsed tree,
# with the decimal character references for values outside 32..127 turned
# back into raw characters.
sub to_xhtml {
    my ($self, $dirty_html, $encoding) = @_;
    #return "NO VALID ENCODING='$encoding'\n" if ! $encoding;

    # Keep all whitespace of the input exactly as it is.
    my $parser = HTML::TreeBuilder->new();
    $parser->no_space_compacting(1);
    $parser->ignore_ignorable_whitespace(0);

    # Convert the octets of $dirty_html to Perl's internal encoding.
    $dirty_html = octet_to_internal_utf8($encoding, $dirty_html);

    # Decode HTML entities ourselves, because HTML::TreeBuilder doesn't handle
    # them right and HTML::Entities::decode_entities uses 'chr($x)' instead of
    # 'pack("U",$x)'.
    $dirty_html = html_decode_entities_utf8($dirty_html);

    # Parse the document. eof() must follow parse(): it tells the parser that
    # the input is complete, so that text still buffered is flushed into the
    # tree and the tree is finalized before it is serialized.
    my $tree = $parser->parse($dirty_html);
    $tree->eof();

    # The substitutions below rely on as_XML() emitting the remaining
    # non-ASCII and control characters as decimal references (&#NNN;).
    my $xml = $XML_HEADER . $tree->as_XML();

    # Release the tree explicitly.
    $tree = $tree->delete;

    # NOTE(review): has_Encode() presumably reports whether the Encode module
    # is usable -- confirm in WWW::Webrobot::MyEncode.
    my $have_encode = has_Encode();

    if ($have_encode && Encode::is_utf8($xml)) { # SunOS 5.7 / perl 5.8.5
        # $xml is already flagged as UTF-8: turn each reference outside
        # 32..127 into the character with that code point.
        $xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("U", $2) /eg;
    }
    else { # no Encode, or Linux perl 5.8.0/5.8.5, Win32 perl 5.8.0
        # $xml is plain ASCII: turn each reference outside 32..127 into a
        # single raw byte.
        $xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;

        # The bytes now form a UTF-8 string; make Perl treat it as one.
        Encode::_utf8_on($xml) if $have_encode;
    }

    return $xml;
}
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=back |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
1; |