line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::WikiConverter::Normalizer; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
25806
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
106
|
|
4
|
3
|
|
|
3
|
|
18
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
98
|
|
5
|
|
|
|
|
|
|
|
6
|
3
|
|
|
3
|
|
18
|
use Carp; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
215
|
|
7
|
|
|
|
|
|
|
|
8
|
3
|
|
|
3
|
|
3475
|
use CSS; |
|
3
|
|
|
|
|
73478
|
|
|
3
|
|
|
|
|
108
|
|
9
|
3
|
|
|
3
|
|
21442
|
use HTML::Element; |
|
3
|
|
|
|
|
90053
|
|
|
3
|
|
|
|
|
25
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
HTML::WikiConverter::Normalizer - Convert CSS styles to (roughly) corresponding HTML |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 SYNOPSIS |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
use HTML::TreeBuilder; |
18
|
|
|
|
|
|
|
use HTML::WikiConverter::Normalizer; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
my $tree = new HTML::TreeBuilder(); |
21
|
|
|
|
|
|
|
$tree->parse( '<p><font style="font-style:italic; font-weight:bold">text</font></p>' ); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my $norm = new HTML::WikiConverter::Normalizer(); |
24
|
|
|
|
|
|
|
$norm->normalize($tree); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Roughly gives "<p><font><b><i>text</i></b></font></p>" |
27
|
|
|
|
|
|
|
print $tree->as_HTML(); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 DESCRIPTION |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
L<HTML::WikiConverter> dialects convert HTML into wiki markup. Most |
32
|
|
|
|
|
|
|
(if not all) know nothing about CSS, nor do they take it into |
33
|
|
|
|
|
|
|
consideration when performing html-to-wiki conversion. But there is no |
34
|
|
|
|
|
|
|
good reason for, say, C<E<lt>font |
35
|
|
|
|
|
|
|
style="font-weight:bold"E<gt>textE<lt>/fontE<gt>> not to be converted |
36
|
|
|
|
|
|
|
into C<'''text'''> in the MediaWiki dialect. The same is true of other |
37
|
|
|
|
|
|
|
dialects, all of which should be able to use CSS information to |
38
|
|
|
|
|
|
|
produce wiki markup. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
The issue becomes especially problematic when considering that several |
41
|
|
|
|
|
|
|
WYSIWYG HTML editors (e.g. Mozilla's) produce this sort of CSS-heavy |
42
|
|
|
|
|
|
|
HTML. Prior to C<HTML::WikiConverter::Normalizer>, this HTML would |
43
|
|
|
|
|
|
|
have been essentially converted to text, the CSS information having |
44
|
|
|
|
|
|
|
been ignored by C<HTML::WikiConverter>. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
C<HTML::WikiConverter::Normalizer> avoids this with a few simple |
47
|
|
|
|
|
|
|
transformations that convert CSS styles into HTML tags. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 METHODS |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head2 new |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $norm = new HTML::WikiConverter::Normalizer(); |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
Constructs a new normalizer |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=cut |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub new { |
60
|
30
|
|
|
30
|
1
|
13113
|
my( $pkg, %attrs ) = @_; |
61
|
30
|
|
|
|
|
167
|
my $self = bless \%attrs, $pkg; |
62
|
30
|
|
|
|
|
250
|
$self->{_css} = new CSS( { parser => 'CSS::Parse::Lite' } ); |
63
|
30
|
|
|
|
|
828
|
$self->{_handlers} = $self->handlers; |
64
|
30
|
|
|
|
|
134
|
return $self; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head2 normalize |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
$norm->normalize($elem); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Normalizes C<$elem> and all its descendents, where C<$elem> is an |
72
|
|
|
|
|
|
|
L<HTML::Element> object. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=cut |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub normalize { |
77
|
30
|
|
|
30
|
1
|
78
|
my( $self, $root ) = @_; |
78
|
30
|
|
|
|
|
84
|
$self->_normalize($root); |
79
|
29
|
|
|
|
|
697
|
$self->_postprocess($root); |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=head1 SUBCLASSING |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
The following methods may be useful to subclasses. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=cut |
87
|
|
|
|
|
|
|
|
88
|
31
|
|
|
31
|
|
168
|
sub _css { shift->{_css} } |
89
|
60
|
|
|
60
|
|
174
|
sub _handlers { shift->{_handlers} } |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 handlers |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
my $handlers = $self->handlers; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Class method returning reference to an array of handlers used to |
96
|
|
|
|
|
|
|
convert CSS to HTML. Each handler is a hashref that specifies the CSS |
97
|
|
|
|
|
|
|
properties and values to match, and the HTML tags and attributes the |
98
|
|
|
|
|
|
|
matched properties will be converted to. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
The C<type>, C<name>, C<value>, and C<tag> keys may be used to match |
101
|
|
|
|
|
|
|
an element's property or attribute. C<type> may be either C<"css"> if |
102
|
|
|
|
|
|
|
matching a CSS property (in which case C<name> must contain the name |
103
|
|
|
|
|
|
|
of the property, and C<value> must contain the property value to |
104
|
|
|
|
|
|
|
match) or C<"attr"> if matching an HTML tag attribute (in which case |
105
|
|
|
|
|
|
|
C<name> must contain the name of the attribute, and C<value> must |
106
|
|
|
|
|
|
|
contain the attribute value to match). |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
C<value> may be a string (for an exact match), regex (which will be |
109
|
|
|
|
|
|
|
used to match against the element's property or attribute value), |
110
|
|
|
|
|
|
|
coderef (which will be passed the property or attribute value and is |
111
|
|
|
|
|
|
|
expected to return true on match, false otherwise), or C<"*"> (which |
112
|
|
|
|
|
|
|
matches any property or attribute value). A tag or list of tags can |
113
|
|
|
|
|
|
|
also be matched with the C<tag> key, which takes either a string or an |
114
|
|
|
|
|
|
|
arrayref. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
To specify what actions the handler will take, the C<new_tag>, |
117
|
|
|
|
|
|
|
C<new_attr>, and C<normalizer> keys are used. C<new_tag> is required |
118
|
|
|
|
|
|
|
and indicates the name of the tag that will be created. C<attribute> |
119
|
|
|
|
|
|
|
is optional and indicates the name of the attribute in the new tag |
120
|
|
|
|
|
|
|
that will take the value of the original CSS property. If a coderef is |
121
|
|
|
|
|
|
|
given as the C<normalizer>, it will be passed the value of the |
122
|
|
|
|
|
|
|
property/attribute and should return one suitable to be assigned to |
123
|
|
|
|
|
|
|
the new tag attribute. |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=cut |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub handlers { [ |
128
|
30
|
|
|
30
|
1
|
825
|
{ type => 'css', name => 'font-family', value => '*', new_tag => 'font', new_attr => 'face' }, |
129
|
|
|
|
|
|
|
{ type => 'css', name => 'font-size', value => '*', new_tag => 'font', new_attr => 'size', normalizer => \&_normalize_fontsize }, |
130
|
|
|
|
|
|
|
{ type => 'css', name => 'color', value => '*', new_tag => 'font', new_attr => 'color', normalizer => \&_normalize_color }, |
131
|
|
|
|
|
|
|
{ type => 'css', name => 'font-weight', value => 'bold', new_tag => 'b' }, |
132
|
|
|
|
|
|
|
{ type => 'css', name => 'text-decoration', value => 'underline', new_tag => 'u' }, |
133
|
|
|
|
|
|
|
{ type => 'css', name => 'font-style', value => 'italic', new_tag => 'i' }, |
134
|
|
|
|
|
|
|
{ type => 'attr', name => 'align', value => 'center', tag => [ qw/ div p / ], new_tag => 'center' }, |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# { type => 'attr', tag => 'font', attr => 'size', value => '*', new_tag => 'span', style => 'font-size' }, |
137
|
|
|
|
|
|
|
] } |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
sub _new_handlers { |
140
|
0
|
|
|
0
|
|
0
|
span_to_font => { |
141
|
|
|
|
|
|
|
xpath => '//[@style[contains(., "font-size")]]', |
142
|
|
|
|
|
|
|
}, |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub _normalize_color { |
146
|
3
|
|
|
3
|
|
6
|
my( $self, $color ) = @_; |
147
|
3
|
|
|
|
|
9
|
return $color; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub _normalize_fontsize { |
151
|
0
|
|
|
0
|
|
0
|
my( $self, $size ) = @_; |
152
|
0
|
|
|
|
|
0
|
return $size; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub _normalize { |
156
|
30
|
|
|
30
|
|
44
|
my( $self, $node ) = @_; |
157
|
30
|
|
|
|
|
209
|
$self->_normalize_css( $node ); |
158
|
29
|
|
|
|
|
372
|
$self->_normalize_attrs( $node ); |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
sub _normalize_css { |
162
|
233
|
|
|
233
|
|
2136
|
my( $self, $node ) = @_; |
163
|
|
|
|
|
|
|
|
164
|
233
|
|
|
|
|
630
|
$node->objectify_text; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Recurse |
167
|
233
|
|
|
|
|
13207
|
$self->_normalize_css($_) for $node->content_list; |
168
|
|
|
|
|
|
|
|
169
|
224
|
100
|
|
|
|
2392
|
my $style_text = $node->attr('style') or return; |
170
|
11
|
|
|
|
|
296
|
my $full_css = "this { $style_text }"; |
171
|
|
|
|
|
|
|
|
172
|
11
|
|
|
|
|
35
|
$self->_css->read_string( $full_css ); |
173
|
10
|
|
|
|
|
25567
|
my $style = $self->_css->get_style_by_selector('this'); |
174
|
|
|
|
|
|
|
|
175
|
10
|
50
|
|
|
|
147
|
my @original_props = @{ $style->{properties} || [] }; |
|
10
|
|
|
|
|
63
|
|
176
|
10
|
|
|
|
|
27
|
my @new_props = ( ); |
177
|
|
|
|
|
|
|
|
178
|
10
|
|
|
|
|
26
|
foreach my $prop ( @original_props ) { |
179
|
21
|
|
|
|
|
144
|
my $handler = $self->_find_handler( type => 'css', name => $prop->{property}, value => $prop->{simple_value}, tag => $node->tag ); |
180
|
21
|
100
|
|
|
|
145
|
if( $handler ) { |
181
|
9
|
|
|
|
|
37
|
$self->_handle( $handler, $node, $prop->{simple_value} ); |
182
|
|
|
|
|
|
|
} else { |
183
|
12
|
|
|
|
|
31
|
push @new_props, $prop; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
10
|
|
|
|
|
118
|
$style->{properties} = \@new_props; |
188
|
10
|
|
|
|
|
47
|
chomp( my $style_string = $style->to_string ); |
189
|
10
|
|
|
|
|
17135
|
$style_string =~ s/^this \{\s*(.*?)\s*\}$/$1/; |
190
|
10
|
|
|
|
|
35
|
$style_string =~ s/\s+$//; |
191
|
10
|
|
100
|
|
|
55
|
$style_string ||= undef; |
192
|
|
|
|
|
|
|
|
193
|
10
|
|
|
|
|
46
|
$node->attr( style => $style_string ); |
194
|
10
|
|
|
|
|
163
|
$self->_css->purge(); |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub _normalize_attrs { |
198
|
174
|
|
|
174
|
|
1340
|
my( $self, $node ) = @_; |
199
|
|
|
|
|
|
|
|
200
|
174
|
|
|
|
|
442
|
$node->objectify_text(); |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# Recurse |
203
|
174
|
|
|
|
|
3373
|
$self->_normalize_attrs($_) for $node->content_list; |
204
|
|
|
|
|
|
|
|
205
|
174
|
|
|
|
|
1955
|
foreach my $attr ( $node->all_external_attr_names ) { |
206
|
39
|
|
|
|
|
432
|
my $attr_value = $node->attr($attr); |
207
|
39
|
|
|
|
|
529
|
my $handler = $self->_find_handler( type => 'attr', name => $attr, value => $attr_value, tag => $node->tag ); |
208
|
39
|
100
|
|
|
|
196
|
if( $handler ) { |
209
|
1
|
|
|
|
|
6
|
$self->_handle( $handler, $node, $attr_value ); |
210
|
1
|
|
|
|
|
18
|
$node->attr( $attr => undef ); |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
29
|
|
|
29
|
|
79
|
sub _postprocess { } |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub _handle { |
218
|
10
|
|
|
10
|
|
25
|
my( $self, $handler, $node, $value ) = @_; |
219
|
10
|
100
|
|
|
|
42
|
$value = $handler->{normalizer} ? $handler->{normalizer}->( $self, $value ) : $value; |
220
|
|
|
|
|
|
|
|
221
|
10
|
|
|
|
|
40
|
my %elem_attrs = ( ); |
222
|
10
|
100
|
|
|
|
126
|
$elem_attrs{$handler->{new_attr}} = $value if $handler->{new_attr}; |
223
|
|
|
|
|
|
|
|
224
|
10
|
|
|
|
|
61
|
my $new_elem = new HTML::Element( $handler->{new_tag}, %elem_attrs ); |
225
|
|
|
|
|
|
|
|
226
|
10
|
|
|
|
|
314
|
foreach my $c ( $node->content_list ) { |
227
|
10
|
|
|
|
|
95
|
$c->detach; |
228
|
10
|
|
|
|
|
173
|
$new_elem->push_content($c); |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
10
|
|
|
|
|
237
|
$node->push_content($new_elem); |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
sub _find_handler { |
235
|
60
|
|
|
60
|
|
611
|
my( $self, %args ) = @_; |
236
|
|
|
|
|
|
|
|
237
|
60
|
|
|
|
|
151
|
my @arg_keys = qw/ type name value /; |
238
|
60
|
|
|
|
|
102
|
for my $arg ( @arg_keys ) { |
239
|
180
|
50
|
|
|
|
539
|
if( not exists $args{$arg} ) { |
240
|
0
|
|
|
|
|
0
|
my( $t, $n, $v ) = @args{ @arg_keys }; |
241
|
0
|
|
0
|
|
|
0
|
$_ ||= '' for ( $t, $n, $v ); |
242
|
|
|
|
|
|
|
|
243
|
0
|
|
|
|
|
0
|
croak sprintf "missing required '$arg' key (type: %s, name: %s, value: %s)", $t, $n, $v; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
60
|
|
|
|
|
141
|
my $type = $args{type}; |
248
|
60
|
|
|
|
|
181
|
my $name = $args{name}; |
249
|
60
|
|
|
|
|
91
|
my $value = $args{value}; |
250
|
60
|
|
50
|
|
|
163
|
my $tag = $args{tag} || ''; |
251
|
|
|
|
|
|
|
|
252
|
60
|
|
|
|
|
88
|
foreach my $handler ( @{ $self->_handlers } ) { |
|
60
|
|
|
|
|
157
|
|
253
|
393
|
100
|
|
|
|
860
|
next unless $handler->{type} eq $type; |
254
|
147
|
100
|
|
|
|
717
|
next unless $handler->{name} eq $name; |
255
|
10
|
50
|
66
|
|
|
41
|
next if $handler->{tag} and ! $self->_match_handler_tag( $handler->{tag}, $tag ); |
256
|
|
|
|
|
|
|
|
257
|
10
|
50
|
|
|
|
68
|
if( ref $handler->{value} eq 'Regexp' ) { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
258
|
0
|
0
|
|
|
|
0
|
return $handler if $value =~ $handler->{value} |
259
|
|
|
|
|
|
|
} elsif( ref $handler->{value} eq 'CODE' ) { |
260
|
0
|
0
|
|
|
|
0
|
return $handler if $handler->{value}->( $value ); |
261
|
|
|
|
|
|
|
} elsif( $handler->{value} eq '*') { |
262
|
4
|
|
|
|
|
22
|
return $handler; |
263
|
|
|
|
|
|
|
} else { |
264
|
6
|
50
|
|
|
|
38
|
return $handler if $handler->{value} eq $value; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
50
|
|
|
|
|
193
|
return; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub _match_handler_tag { |
272
|
1
|
|
|
1
|
|
4
|
my( $self, $handler_tag, $tag ) = @_; |
273
|
1
|
50
|
|
|
|
7
|
my %handler_tags = map { $_ => 1 } ( ref $handler_tag eq 'ARRAY' ? @$handler_tag : $handler_tag ); |
|
2
|
|
|
|
|
9
|
|
274
|
1
|
50
|
|
|
|
278
|
return $handler_tags{$tag} ? 1 : 0; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head1 SEE ALSO |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
L<CSS> |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head1 AUTHOR |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
David J. Iberri, C<< <diberri@cpan.org> >> |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=head1 BUGS |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-html-wikiconverter |
288
|
|
|
|
|
|
|
at rt.cpan.org>, or through the web interface at |
289
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter>. |
290
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of |
291
|
|
|
|
|
|
|
progress on your bug as I make changes. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
Copyright 2006 David J. Iberri, all rights reserved. |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
298
|
|
|
|
|
|
|
under the same terms as Perl itself. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=cut |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
1; |