blib/lib/HTML/Normalize.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 122 | 253 | 48.2 |
branch | 39 | 126 | 30.9 |
condition | 9 | 45 | 20.0 |
subroutine | 20 | 27 | 74.0 |
pod | 3 | 3 | 100.0 |
total | 193 | 454 | 42.5 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::Normalize; | ||||||
2 | |||||||
3 | 1 | 1 | 102134 | use strict; | |||
1 | 2 | ||||||
1 | 29 | ||||||
4 | 1 | 1 | 5 | use warnings; | |||
1 | 2 | ||||||
1 | 27 | ||||||
5 | 1 | 1 | 7 | use HTML::Entities; | |||
1 | 2 | ||||||
1 | 75 | ||||||
6 | 1 | 1 | 8 | use HTML::TreeBuilder; | |||
1 | 1 | ||||||
1 | 6 | ||||||
7 | 1 | 1 | 23 | use HTML::Tagset; | |||
1 | 3 | ||||||
1 | 30 | ||||||
8 | 1 | 1 | 6 | use Carp; | |||
1 | 1 | ||||||
1 | 66 | ||||||
9 | |||||||
10 | BEGIN { | ||||||
11 | 1 | 1 | 7 | use Exporter (); | |||
1 | 2 | ||||||
1 | 21 | ||||||
12 | 1 | 1 | 4 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |||
1 | 2 | ||||||
1 | 121 | ||||||
13 | 1 | 1 | 7 | $VERSION = '1.0004'; | |||
14 | 1 | 17 | @ISA = qw(Exporter); | ||||
15 | 1 | 4 | @EXPORT = qw(); | ||||
16 | 1 | 15 | @EXPORT_OK = qw(); | ||||
17 | 1 | 3289 | %EXPORT_TAGS = (); | ||||
18 | } | ||||||
19 | |||||||
20 | =head1 NAME | ||||||
21 | |||||||
22 | HTML::Normalize - HTML light weight cleanup | ||||||
23 | |||||||
24 | =head1 VERSION | ||||||
25 | |||||||
26 | Version 1.0003 | ||||||
27 | |||||||
28 | =head1 SYNOPSIS | ||||||
29 | |||||||
30 | my $norm = HTML::Normalize->new (); | ||||||
31 | my $cleanHtml = $norm->cleanup (-html => $dirtyHtml); | ||||||
32 | |||||||
33 | =head1 DESCRIPTION | ||||||
34 | |||||||
35 | HTML::Normalize uses HTML::TreeBuilder to parse an HTML string then processes | ||||||
36 | the resultant tree to clean up various structural issues in the original HTML. | ||||||
37 | The result is then rendered using HTML::Element's as_HTML member. | ||||||
38 | |||||||
39 | Key structural clean ups fix tag soup (C<< foo >> becomes C<< | ||||||
40 | foo >>) and inline/block element nesting (C<< | ||||||
41 | foo >> becomes C<<foo >>). C<<>> |
||||||
42 | tags at the start or end of a link element are migrated out of the element. | ||||||
43 | |||||||
44 | Note that HTML::Normalize's approach to cleaning up tag soup is different than | ||||||
45 | that used by HTML::Tidy. HTML::Tidy tends to enforce nested and swaps end tags | ||||||
46 | to achieve that. HTML::Normalize inserts extra tags to allow correctly taged | ||||||
47 | overlapped markup. | ||||||
48 | |||||||
49 | HTML::Normalize can also remove attributes set to default values and empty | ||||||
50 | elements. For example a C<< >> | ||||||
51 | element would become and C<< >> and C<< | ||||||
52 | face="Verdana" size="1"> >> would be removed if Verdana size 1 is set as the | ||||||
53 | default font. | ||||||
54 | |||||||
55 | =head1 Methods | ||||||
56 | |||||||
57 | C |
||||||
58 | |||||||
59 | C |
||||||
60 | parses the HTML to generate the internal representation. It then edits the | ||||||
61 | internal representation and renders the result back into HTML. | ||||||
62 | |||||||
63 | Note that I |
||||||
64 | process. | ||||||
65 | |||||||
66 | Generally errors are handled by carping and may be detected in both I |
||||||
67 | I |
||||||
68 | |||||||
69 | =cut | ||||||
70 | |||||||
71 | =head2 new | ||||||
72 | |||||||
73 | Create a new C |
||||||
74 | |||||||
75 | my $norm = HTML::Normalize->new (); | ||||||
76 | |||||||
77 | =over 4 | ||||||
78 | |||||||
79 | =item I<-compact>: optional | ||||||
80 | |||||||
81 | Setting C<< -compact => 1 >> suppresses generation of 'optional' close tags. | ||||||
82 | This reduces the sizeof the output slightly at the expense of breaking any hope | ||||||
83 | of XHTML compliance. | ||||||
84 | |||||||
85 | =item I<-default>: optional - multiple | ||||||
86 | |||||||
87 | Define a default attribute for an element. Default attributes are removed if the | ||||||
88 | attribute value has not been overridden in a parent node. For element such as | ||||||
89 | 'font' this may result in the element being removed if no attributes remain. | ||||||
90 | |||||||
91 | C<-default> takes a string of the form 'tag attribute=value' as an argument. | ||||||
92 | For example: | ||||||
93 | |||||||
94 | -default => 'font face="Verdana"' | ||||||
95 | |||||||
96 | would specify that the face "Verdana" is the default face attribute for font | ||||||
97 | elements. | ||||||
98 | |||||||
99 | I |
||||||
100 | matches: | ||||||
101 | |||||||
102 | /(~|qr)\s*(.).*\1\s*$/ | ||||||
103 | |||||||
104 | except that the paired delimiters [], {}, () and <> are also accepted as pattern | ||||||
105 | delimiters. | ||||||
106 | |||||||
107 | Literal match values should not encode entities, but remember that quotes around | ||||||
108 | attribute values are optional for some values so the outer pair of quote | ||||||
109 | characters will be removed if present. The match value extends to the end of the | ||||||
110 | line and is not bounded by quote qharacters (except as noted earlier) so no | ||||||
111 | quoting of "special" characters is required - there are no special characters. | ||||||
112 | |||||||
113 | Multiple default attributes may be provided but only one default value is | ||||||
114 | allowed for any one tag/attribute pair. | ||||||
115 | |||||||
116 | Default values are case sensitive. However you can use the regular expression | ||||||
117 | form to overcome this limitation. | ||||||
118 | |||||||
119 | =item I<-distribute>: optional - default true | ||||||
120 | |||||||
121 | Distribute inline elements over children if the children are block level | ||||||
122 | elements. For example: | ||||||
123 | |||||||
124 | foo bar |
||||||
125 | |||||||
126 | becomes: | ||||||
127 | |||||||
128 | foo bar |
||||||
129 | |||||||
130 | This action is only taken if all the child elements are block level elements. | ||||||
131 | |||||||
132 | =item I<-expelbr>: optional - default true | ||||||
133 | |||||||
134 | If C<-expelbr> is true (the default) break elements at the edges of link | ||||||
135 | elements are expelled from the link element. Thus: | ||||||
136 | |||||||
137 | link text |
||||||
138 | |||||||
139 | becomes | ||||||
140 | |||||||
141 | link text |
||||||
142 | |||||||
143 | =item I<-html>: required | ||||||
144 | |||||||
145 | the HTML string to clean. | ||||||
146 | |||||||
147 | =item I<-indent>: optional - default ' ' | ||||||
148 | |||||||
149 | String used to indent formatted output. Ignored if I<-unformatted> is true. | ||||||
150 | |||||||
151 | =item I<-keepimplicit>: optional | ||||||
152 | |||||||
153 | as_HTML adds various HTML required sections such as head and body elements. By | ||||||
154 | default HTML::Normalize removes these elements so that it is suitable for | ||||||
155 | processing HTML fragments. Set C<-keepimplicit => 1> to render the implicit | ||||||
156 | elements. | ||||||
157 | |||||||
158 | Note that if this option is true, the extra nodes will be generated regardless | ||||||
159 | of their presence in the original HTML. | ||||||
160 | |||||||
161 | =item I<-maxlinelen>: optional - default 80 | ||||||
162 | |||||||
163 | Notional maximum line length if I<-selfrender> is true. The line length may be | ||||||
164 | exceeded if no suitable break position is found. Note that the current indent is | ||||||
165 | included in the line length. | ||||||
166 | |||||||
167 | =item I<-selfrender>: optional | ||||||
168 | |||||||
169 | Use the experimental HTML::Normalize code to render HTML rather than using | ||||||
170 | HTML::Element's renderer. This code has not been tested against a wide range of | ||||||
171 | HTML and may be unreliable. It's advantage is that it produces (in the author's | ||||||
172 | opinion) prettier output than HTML::Element's as_HTML member. | ||||||
173 | |||||||
174 | =item I<-unformatted>: optional | ||||||
175 | |||||||
176 | Suppress output formatting. By default as_HTML is called as | ||||||
177 | |||||||
178 | as_HTML (undef, ' ', {}) | ||||||
179 | |||||||
180 | which wraps and indents elements. Setting C<< -unformatted => 1 >> suppresses | ||||||
181 | generation of line breaks and indentation reducing the size of the output | ||||||
182 | slightly. | ||||||
183 | |||||||
184 | =back | ||||||
185 | |||||||
186 | =cut | ||||||
187 | |||||||
188 | my %paramTypes = ( | ||||||
189 | |||||||
190 | # 0: optional once | ||||||
191 | # 1: required once | ||||||
192 | # 2: optional, many allowed | ||||||
193 | -compact => [0, 0], | ||||||
194 | -default => [2, undef], | ||||||
195 | -distribute => [0, 1], | ||||||
196 | -expelbr => [0, 1], | ||||||
197 | -html => [1, undef], | ||||||
198 | -indent => [0, ' '], | ||||||
199 | -keepimplicit => [0, 0], | ||||||
200 | -maxlinelen => [0, 80], | ||||||
201 | -selfrender => [0, 0], | ||||||
202 | -unformatted => [0, 0], | ||||||
203 | ); | ||||||
204 | my $regex = ' | ||||||
205 | (?:~|qr)\s* | ||||||
206 | (?: | ||||||
207 | (.).*\4 # regex quote char delimited | ||||||
208 | |<.*> # regex <> delimited | ||||||
209 | |{.*} # regex {} delimited | ||||||
210 | |\[.*\] # regex [] delimited | ||||||
211 | |\(.*\) # regex () delimited | ||||||
212 | )i? # Regex match | ||||||
213 | '; | ||||||
214 | |||||||
215 | sub new { | ||||||
216 | 1 | 1 | 1 | 119 | my ($self, @params) = @_; | ||
217 | |||||||
218 | 1 | 50 | 6 | unless (ref $self) { | |||
219 | 1 | 3 | $self = bless {}, $self; | ||||
220 | 1 | 10 | $self->{both} = qr/^(del|ins)$/i; | ||||
221 | 1 | 6 | $self->{inline} = qr/^(b|i|s|font|span)$/i; | ||||
222 | 1 | 4 | $self->{block} = qr/^(p|table|div)$/i; | ||||
223 | 1 | 3 | $self->{needattr} = qr/^(font|span)$/i; | ||||
224 | 1 | 3 | $self->{selfclose} = qr/^(br)$/i; | ||||
225 | } | ||||||
226 | |||||||
227 | $self->_validateParams ( | ||||||
228 | 1 | 7 | \@params, | ||||
229 | [ | ||||||
230 | qw(-compact -default -distribute -expelbr -keepimplicit -unformatted ) | ||||||
231 | ], | ||||||
232 | [] | ||||||
233 | ); | ||||||
234 | |||||||
235 | # Add 'div' to the closure barriers list to avoid changing: | ||||||
236 | # foo |
||||||
237 | # into: | ||||||
238 | # foo |
||||||
239 | 1 | 3 | my $bar = \@HTML::Tagset::p_closure_barriers; | ||||
240 | 1 | 50 | 4 | push @$bar, 'div' unless grep { $_ eq 'div' } @$bar; | |||
15 | 30 | ||||||
241 | |||||||
242 | 1 | 4 | return $self; | ||||
243 | } | ||||||
244 | |||||||
245 | sub DESTROY { | ||||||
246 | 1 | 1 | 810 | my $self = shift; | |||
247 | 1 | 50 | 9 | $self->{root}->delete if $self->{root}; | |||
248 | } | ||||||
249 | |||||||
250 | sub _validateParams { | ||||||
251 | 2 | 2 | 6 | my ($self, $params, $okParams, $requiredParams) = @_; | |||
252 | |||||||
253 | 2 | 50 | 6 | $params ||= []; | |||
254 | 2 | 50 | 12 | $okParams ||= []; | |||
255 | 2 | 50 | 4 | $requiredParams ||= []; | |||
256 | |||||||
257 | # Validate parameters | ||||||
258 | 2 | 7 | while (@$params) { | ||||
259 | 1 | 8 | my ($key, $value) = splice @$params, 0, 2; | ||||
260 | |||||||
261 | 1 | 4 | $key = lc $key; | ||||
262 | 1 | 50 | 6 | croak "$key is not a valid parameter name" if !exists $paramTypes{$key}; | |||
263 | croak "$key parameter may only be used once" | ||||||
264 | 1 | 50 | 33 | 8 | if $paramTypes{$key}[0] < 2 && exists $self->{$key}; | ||
265 | |||||||
266 | 1 | 50 | 5 | if ($paramTypes{$key}[0] < 2) { | |||
267 | 1 | 2 | $self->{$key} = $value; | ||||
268 | 1 | 5 | next; | ||||
269 | } | ||||||
270 | |||||||
271 | 0 | 0 | push @{$self->{$key}}, $value; | ||||
0 | 0 | ||||||
272 | } | ||||||
273 | |||||||
274 | # Ensure we got required parameters | ||||||
275 | 2 | 6 | for my $key (@$requiredParams) { | ||||
276 | 1 | 50 | 4 | croak "Invalid parameter name: $key" unless exists $paramTypes{$key}; | |||
277 | 1 | 50 | 3 | $self->{$key} = $paramTypes{$key}[1] unless exists $self->{$key}; | |||
278 | 1 | 50 | 33 | 8 | next if $paramTypes{$key}[0] != 1 or exists $self->{$key}; | ||
279 | 0 | 0 | croak "The $key parameter is missing. It is required."; | ||||
280 | } | ||||||
281 | } | ||||||
282 | |||||||
283 | =head2 cleanup | ||||||
284 | |||||||
285 | C |
||||||
286 | |||||||
287 | my $cleanHtml = $norm->cleanup (); | ||||||
288 | |||||||
289 | =cut | ||||||
290 | |||||||
291 | sub cleanup { | ||||||
292 | 1 | 1 | 1 | 7 | my ($self, @params) = @_; | ||
293 | |||||||
294 | 1 | 27 | $self->_validateParams (\@params, [keys %paramTypes], ['-html']); | ||||
295 | |||||||
296 | # Check we got all required parameters and set any defaults | ||||||
297 | 1 | 6 | for my $param (keys %paramTypes) { | ||||
298 | 10 | 100 | 18 | next if exists $self->{$param}; | |||
299 | 9 | 100 | 21 | next if $paramTypes{$param}[0] > 1; | |||
300 | |||||||
301 | croak "A $param parameter must be provided. None was." | ||||||
302 | 8 | 50 | 15 | if $paramTypes{$param}[0] == 1; | |||
303 | |||||||
304 | # Set missing param to default | ||||||
305 | 8 | 16 | $self->{$param} = $paramTypes{$param}[1]; | ||||
306 | } | ||||||
307 | |||||||
308 | # Unpack any -default parameters | ||||||
309 | 1 | 2 | for my $default (@{$self->{-default}}) { | ||||
1 | 5 | ||||||
310 | 0 | 0 | my ($tag, $attrib, $value) = | ||||
311 | $default =~ / | ||||||
312 | (\w+)\s+ # Tag | ||||||
313 | (\w+)\s* # Attribute | ||||||
314 | (?:=\s*(?=[\w'"])|=(?=~)) | ||||||
315 | ( '[^']*' # Single quoted | ||||||
316 | |"[^"]*" # Double quoted | ||||||
317 | |\w+ # Unquoted | ||||||
318 | |$regex # regex match | ||||||
319 | )\s* # Value | ||||||
320 | $/x; | ||||||
321 | |||||||
322 | 0 | 0 | 0 | croak "Badly formed default attribute string: $default" | |||
323 | unless defined $value; | ||||||
324 | 0 | 0 | $_ = lc for $tag, $attrib; | ||||
325 | |||||||
326 | croak "Conflicting defaults given:\n" | ||||||
327 | . " $tag $attrib=$self->{defaults}{$tag}{$attrib}\n" | ||||||
328 | . "and\n $tag $attrib=$value\n" | ||||||
329 | if exists $self->{defaults}{$tag}{$attrib} | ||||||
330 | 0 | 0 | 0 | 0 | and $self->{defaults}{$tag}{$attrib} ne $value; | ||
331 | |||||||
332 | 0 | 0 | 0 | if ($value =~ /^()()()$regex$/x) { | |||
333 | # Compile regex | ||||||
334 | 0 | 0 | $value =~ s/^~\s*/qr/; | ||||
335 | 0 | 0 | $value = eval $value; | ||||
336 | } else { | ||||||
337 | # Strip quotes if present from match value | ||||||
338 | 0 | 0 | $value =~ s/^(['"])(.*)\1$/$2/; | ||||
339 | } | ||||||
340 | |||||||
341 | 0 | 0 | $self->{defaults}{$tag}{$attrib} = $value; | ||||
342 | } | ||||||
343 | |||||||
344 | 1 | 10 | $self->{root} = HTML::TreeBuilder->new; | ||||
345 | 1 | 378 | $self->{root}->parse_content ($self->{-html}); | ||||
346 | 1 | 1589 | $self->{root}->elementify (); | ||||
347 | |||||||
348 | 1 | 94 | 1 while $self->_cleanedupElt ($self->{root}); | ||||
349 | |||||||
350 | 1 | 3 | my $str = ''; | ||||
351 | |||||||
352 | 1 | 50 | 4 | if ($self->{-selfrender}) { | |||
353 | 0 | 0 | $self->{line} = ''; | ||||
354 | 0 | 0 | $str = $self->_render ($self->{root}, ''); | ||||
355 | 0 | 0 | 0 | $str .= "\n" if $str !~ /\n$/s; | |||
356 | } else { | ||||||
357 | 1 | 5 | my @renderOptions = (undef, ' ', {}); | ||||
358 | |||||||
359 | 1 | 50 | 3 | $renderOptions[1] = undef if $self->{-unformatted}; | |||
360 | 1 | 50 | 3 | $renderOptions[2] = undef if $self->{-compact}; | |||
361 | |||||||
362 | 1 | 2 | my $elt = $self->{root}; | ||||
363 | |||||||
364 | 1 | 50 | 4 | if (! $self->{-keepimplicit}) { | |||
365 | 1 | 5 | ($elt) = grep {$_->{_tag} eq 'body'} $self->{root}->descendents (); | ||||
7 | 298 | ||||||
366 | } | ||||||
367 | |||||||
368 | $str .= ref $_ ? $_->as_HTML (@renderOptions) : $_ | ||||||
369 | 1 | 50 | 2 | for @{$elt->{_content}}; | |||
1 | 11 | ||||||
370 | } | ||||||
371 | |||||||
372 | 1 | 826 | return $str; | ||||
373 | } | ||||||
374 | |||||||
375 | |||||||
376 | =head2 elements | ||||||
377 | |||||||
378 | C |
||||||
379 | generated by C |
||||||
380 | will return C |
||||||
381 | |||||||
382 | $norm->cleanup (); | ||||||
383 | my @elements = $norm->elements(); | ||||||
384 | |||||||
385 | =cut | ||||||
386 | |||||||
387 | sub elements { | ||||||
388 | 1 | 1 | 1 | 14 | my ($self) = @_; | ||
389 | 1 | 5 | my $root = $self->{root}; | ||||
390 | |||||||
391 | 1 | 6 | while ($root->implicit()) { | ||||
392 | 1 | 21 | ($root) = grep {$_->tag() ne 'head'} $root->content_list(); | ||||
2 | 15 | ||||||
393 | 1 | 50 | 9 | last if $root->tag() eq 'body'; | |||
394 | 0 | 0 | next; | ||||
395 | } | ||||||
396 | |||||||
397 | 1 | 9 | return $root->content_list(); | ||||
398 | } | ||||||
399 | |||||||
400 | |||||||
401 | sub _cleanedupElt { | ||||||
402 | 8 | 8 | 17 | my ($self, $parent) = @_; | |||
403 | |||||||
404 | 8 | 100 | 66 | 35 | return 0 unless ref $parent && ref $parent->{_content}; | ||
405 | |||||||
406 | 6 | 8 | my $rescan = 1; # Set true to rescan the child element list | ||||
407 | 6 | 9 | my $touched; | ||||
408 | |||||||
409 | 6 | 11 | while ($rescan) { | ||||
410 | 6 | 9 | $rescan = 0; # Assume another scan not required after current scan | ||||
411 | 6 | 9 | ++$touched; | ||||
412 | |||||||
413 | 6 | 14 | for my $elt ($parent->content_list ()) { | ||||
414 | 10 | 100 | 48 | next unless ref $elt; | |||
415 | |||||||
416 | 7 | 50 | 15 | ++$rescan, last if $self->_cleanedupElt ($elt); | |||
417 | 7 | 100 | 17 | next if exists $elt->{_implicit}; | |||
418 | |||||||
419 | 5 | 50 | 20 | ++$rescan, last if $self->_removedDefaults ($elt); | |||
420 | 5 | 50 | 11 | ++$rescan, last if $self->_distributedElements ($elt); | |||
421 | 5 | 50 | 9 | ++$rescan, last if $self->_normalizedElements ($elt); | |||
422 | 5 | 50 | 11 | ++$rescan, last if $self->_expeledBr ($elt); | |||
423 | 5 | 50 | 11 | ++$rescan, last if $self->_removedEmpty ($elt); | |||
424 | } | ||||||
425 | } | ||||||
426 | |||||||
427 | 6 | 14 | return $touched > 1; | ||||
428 | } | ||||||
429 | |||||||
430 | sub _distributedElements { | ||||||
431 | 5 | 5 | 9 | my ($self, $elt) = @_; | |||
432 | |||||||
433 | 5 | 50 | 11 | return 0 unless $self->{-distribute}; | |||
434 | return 0 | ||||||
435 | unless $elt->{_tag} =~ $self->{inline} | ||||||
436 | 5 | 50 | 33 | 33 | && $elt->{_tag} =~ $self->{needattr}; | ||
437 | |||||||
438 | 0 | 0 | my @elts = $elt->content_list (); | ||||
439 | 0 | 0 | 0 | my $blockElts = grep {ref $_ && $_->{_tag} =~ $self->{block}} @elts; | |||
0 | 0 | ||||||
440 | |||||||
441 | # Done unless all child elements are block level elements | ||||||
442 | 0 | 0 | 0 | 0 | return 0 unless @elts && @elts == $blockElts; | ||
443 | |||||||
444 | # Distribute inline element over and block elements | ||||||
445 | 0 | 0 | $elt->replace_with_content (); | ||||
446 | |||||||
447 | 0 | 0 | for my $block (@elts) { | ||||
448 | 0 | 0 | my @nested = $block->detach_content (); | ||||
449 | 0 | 0 | my $clone = $elt->clone (); | ||||
450 | |||||||
451 | 0 | 0 | $block->push_content ($clone); | ||||
452 | 0 | 0 | $clone->push_content (@nested); | ||||
453 | } | ||||||
454 | |||||||
455 | 0 | 0 | $elt->delete (); | ||||
456 | 0 | 0 | return 1; | ||||
457 | } | ||||||
458 | |||||||
459 | sub _normalizedElements { | ||||||
460 | 5 | 5 | 11 | my ($self, $elt) = @_; | |||
461 | |||||||
462 | 5 | 50 | 23 | return 0 unless $elt->{_tag} =~ $self->{inline}; | |||
463 | |||||||
464 | 0 | 0 | my @elts = $elt->content_list (); | ||||
465 | |||||||
466 | # Ok unless element contains single block level child | ||||||
467 | return 0 | ||||||
468 | unless @elts == 1 | ||||||
469 | && ref $elts[0] | ||||||
470 | 0 | 0 | 0 | 0 | && $elts[0]->{_tag} =~ $self->{block}; | ||
0 | |||||||
471 | |||||||
472 | # Invert order of inline and block elements | ||||||
473 | 0 | 0 | my @nested = $elts[0]->detach_content (); | ||||
474 | |||||||
475 | 0 | 0 | $elt->replace_with ($elts[0]); | ||||
476 | 0 | 0 | $elts[0]->push_content ($elt); | ||||
477 | 0 | 0 | $elt->push_content (@nested); | ||||
478 | 0 | 0 | $elt = $elts[0]; | ||||
479 | |||||||
480 | $_->replace_with_content ()->delete () | ||||||
481 | 0 | 0 | for grep {$self->_removedEmpty ($_)} @elts; | ||||
0 | 0 | ||||||
482 | |||||||
483 | 0 | 0 | return 1; | ||||
484 | } | ||||||
485 | |||||||
486 | sub _expeledBr { | ||||||
487 | 5 | 5 | 10 | my ($self, $elt) = @_; | |||
488 | |||||||
489 | 5 | 50 | 33 | 26 | return 0 unless $elt->{_tag} eq 'a' && $self->{-expelbr}; | ||
490 | 0 | 0 | 0 | return 0 unless exists $elt->{_content}; | |||
491 | |||||||
492 | 0 | 0 | my $adjusted; | ||||
493 | 0 | 0 | for my $index (0, -1) { | ||||
494 | 0 | 0 | my $br = $elt->{_content}[$index]; | ||||
495 | |||||||
496 | 0 | 0 | 0 | 0 | next unless ref $br && $br->{_tag} eq 'br'; | ||
497 | 0 | 0 | 0 | $index == 0 | |||
498 | ? $br->detach ()->preinsert ($br) | ||||||
499 | : $br->detach ()->postinsert ($br); | ||||||
500 | 0 | 0 | ++$adjusted; | ||||
501 | } | ||||||
502 | |||||||
503 | 0 | 0 | return $adjusted; | ||||
504 | } | ||||||
505 | |||||||
506 | sub _removedDefaults { | ||||||
507 | 5 | 5 | 12 | my ($self, $elt) = @_; | |||
508 | |||||||
509 | 5 | 50 | 19 | return 0 unless exists $self->{defaults}{$elt->{_tag}}; | |||
510 | |||||||
511 | 0 | 0 | my $delAttribs = $self->{defaults}{$elt->{_tag}}; | ||||
512 | |||||||
513 | 0 | 0 | for my $attrib (keys %$delAttribs) { | ||||
514 | 0 | 0 | 0 | next unless exists $elt->{$attrib}; | |||
515 | |||||||
516 | 0 | 0 | my $value = $delAttribs->{$attrib}; | ||||
517 | 0 | 0 | my @parentAttribs; | ||||
518 | 0 | 0 | my @criteria = (_tag => $elt->{_tag}); | ||||
519 | |||||||
520 | 0 | 0 | 0 | if ('Regexp' eq ref $value) { | |||
521 | 0 | 0 | 0 | next unless $elt->{$attrib} =~ $value; | |||
522 | push @criteria, sub { | ||||||
523 | 0 | 0 | 0 | my $attr = $_[0]->attr("$attrib"); | |||
524 | 0 | 0 | 0 | return 0 unless defined $attr; | |||
525 | 0 | 0 | return $attr !~ $value; | ||||
526 | 0 | 0 | }; | ||||
527 | } else { | ||||||
528 | 0 | 0 | my $value = $delAttribs->{$attrib}; | ||||
529 | |||||||
530 | 0 | 0 | 0 | next unless $elt->{$attrib} eq $value; | |||
531 | 0 | 0 | push @criteria, ($attrib => qr/^(?!\Q$value\E)/i); | ||||
532 | } | ||||||
533 | |||||||
534 | 0 | 0 | @parentAttribs = $elt->look_up (@criteria); | ||||
535 | |||||||
536 | # Don't delete attribute required to restore default | ||||||
537 | 0 | 0 | 0 | next if @parentAttribs; | |||
538 | 0 | 0 | delete $elt->{$attrib}; | ||||
539 | } | ||||||
540 | |||||||
541 | 0 | 0 | return $self->_removedEmpty ($elt); | ||||
542 | } | ||||||
543 | |||||||
544 | sub _removedEmpty { | ||||||
545 | 5 | 5 | 15 | my ($self, $elt) = @_; | |||
546 | |||||||
547 | 5 | 50 | 14 | return 0 if grep {!/^_/} $elt->all_attr_names (); | |||
14 | 67 | ||||||
548 | 5 | 50 | 26 | return 0 unless $elt->{_tag} =~ $self->{needattr}; | |||
549 | |||||||
550 | # Remove redundant element - no attributes left | ||||||
551 | 0 | $elt->replace_with ($elt->detach_content ()); | |||||
552 | 0 | $elt->delete (); | |||||
553 | 0 | return 1; | |||||
554 | } | ||||||
555 | |||||||
556 | sub _render { | ||||||
557 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
558 | |||||||
559 | return '' | ||||||
560 | 0 | 0 | 0 | unless $self->{-keepimplicit} || !$elt->{_implicit} || $elt->{_content}; | |||
0 | |||||||
561 | |||||||
562 | 0 | my $str = ''; | |||||
563 | |||||||
564 | 0 | 0 | 0 | if (! $self->{-keepimplicit} && $elt->{_implicit}) { | |||
0 | |||||||
0 | |||||||
565 | 0 | return $self->_renderContents ($elt, $indent); | |||||
566 | |||||||
567 | } elsif ($elt->{_tag} =~ $self->{selfclose}) { | ||||||
568 | 0 | $str .= $self->_append ("<$elt->{_tag} />", $indent); | |||||
569 | |||||||
570 | } elsif ($HTML::Tagset::isPhraseMarkup{$elt->{_tag}}) { | ||||||
571 | 0 | $str .= $self->_append ("<$elt->{_tag}", $indent); | |||||
572 | 0 | $str .= $self->_renderAttrs ($elt, $indent); | |||||
573 | 0 | $str .= $self->_renderContents ($elt, $indent); | |||||
574 | 0 | $str .= $self->_append ("$elt->{_tag}>",$indent); | |||||
575 | |||||||
576 | } else { | ||||||
577 | 0 | my $indented = "$indent$self->{-indent}"; | |||||
578 | |||||||
579 | 0 | $str = $self->_flushLine ($indent); | |||||
580 | 0 | $self->{line} .= "<$elt->{_tag}"; | |||||
581 | 0 | $self->{ishead} = 1; | |||||
582 | 0 | $str .= $self->_renderAttrs ($elt, $indented); | |||||
583 | 0 | $str .= $self->_renderContents ($elt, $indented); | |||||
584 | 0 | $str .= $self->_append ("$elt->{_tag}>", $indented); | |||||
585 | 0 | $str .= $self->_flushLine ($indented); | |||||
586 | } | ||||||
587 | |||||||
588 | 0 | return $str; | |||||
589 | } | ||||||
590 | |||||||
591 | sub _append { | ||||||
592 | 0 | 0 | my ($self, $tail, $indent) = @_; | ||||
593 | |||||||
594 | 0 | 0 | if ((length ($self->{line}) + length ($tail) + length ($indent)) > $self->{-maxlinelen}) { | ||||
595 | 0 | my $str = $self->_flushLine ($indent); | |||||
596 | |||||||
597 | 0 | $self->{line} = $tail; | |||||
598 | 0 | return $str; | |||||
599 | } else { | ||||||
600 | 0 | $self->{line} .= $tail; | |||||
601 | 0 | return ''; | |||||
602 | } | ||||||
603 | } | ||||||
604 | |||||||
605 | sub _flushLine { | ||||||
606 | 0 | 0 | my ($self, $indent) = @_; | ||||
607 | |||||||
608 | 0 | 0 | return '' unless length $self->{line}; | ||||
609 | |||||||
610 | 0 | my $str; | |||||
611 | |||||||
612 | 0 | 0 | if ($self->{-unformatted}) { | ||||
613 | 0 | $str = $self->{line}; | |||||
614 | |||||||
615 | } else { | ||||||
616 | 0 | 0 | if ($self->{ishead}) { | ||||
617 | 0 | substr ($indent, -length $self->{-indent}) = ''; | |||||
618 | 0 | $self->{isHead} = undef; | |||||
619 | } | ||||||
620 | |||||||
621 | 0 | $str = "$indent$self->{line}\n"; | |||||
622 | } | ||||||
623 | |||||||
624 | 0 | $self->{line} = ''; | |||||
625 | 0 | $str =~ s/\s+\n\z/\n/s; | |||||
626 | 0 | return $str; | |||||
627 | } | ||||||
628 | |||||||
629 | sub _renderAttrs { | ||||||
630 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
631 | 0 | my $str = ''; | |||||
632 | 0 | my @attrs = grep {! /^_/} keys %$elt; | |||||
0 | |||||||
633 | |||||||
634 | $str .= $self->_append ( | ||||||
635 | qq( $_=") . encode_entities ($elt->{$_}) . qq("), | ||||||
636 | $indent | ||||||
637 | ) | ||||||
638 | 0 | for sort @attrs; | |||||
639 | 0 | $self->{line} .= '>'; | |||||
640 | 0 | return $str; | |||||
641 | } | ||||||
642 | |||||||
643 | sub _renderContents { | ||||||
644 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
645 | 0 | my $str = ''; | |||||
646 | |||||||
647 | 0 | for my $subElt (@{$elt->{_content}}) { | |||||
0 | |||||||
648 | 0 | 0 | if (! ref $subElt) { | ||||
649 | 0 | $str .= $self->_renderText ($subElt, $indent); | |||||
650 | } else { | ||||||
651 | 0 | $str .= $self->_render ($subElt, $indent); | |||||
652 | } | ||||||
653 | } | ||||||
654 | |||||||
655 | 0 | return $str; | |||||
656 | } | ||||||
657 | |||||||
658 | |||||||
659 | sub _renderText { | ||||||
660 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
661 | 0 | my $str = $self->{line} . encode_entities ($elt); | |||||
662 | |||||||
663 | 0 | 0 | if ($self->{-unformatted}) { | ||||
664 | 0 | $self->{line} = ''; | |||||
665 | |||||||
666 | } else { | ||||||
667 | 0 | my $maxLen = $self->{-maxlinelen} - length $indent; | |||||
668 | |||||||
669 | 0 | 0 | $str =~ s/(.{1,$maxLen})\s+/$indent$1\n/g if length($str) > $maxLen; | ||||
670 | 0 | ($str, $self->{line}) = $str =~ /(.*\n)?(.*)/; | |||||
671 | 0 | 0 | $str = '' unless defined $str; | ||||
672 | 0 | 0 | $self->{line} = '' unless defined $self->{line}; | ||||
673 | } | ||||||
674 | |||||||
675 | 0 | return $str; | |||||
676 | } | ||||||
677 | |||||||
678 | |||||||
679 | 1; | ||||||
680 | |||||||
681 | =head1 BUGS | ||||||
682 | |||||||
683 | =head3 p/div/p parsing issue | ||||||
684 | |||||||
685 | HTML::TreeBuilder 3.23 and earlier misparses: | ||||||
686 | |||||||
687 | foo |
||||||
688 | |||||||
689 | as: | ||||||
690 | |||||||
691 | foo |
||||||
692 | |||||||
693 | A work around in HTML::Normalize turns that into | ||||||
694 | |||||||
695 | foo |
||||||
696 | |||||||
697 | which is probably still incorrect - div elements should not nest within p | ||||||
698 | elements. A better fix for the problem requires HTML::TreeBuilder to be fixed. | ||||||
699 | |||||||
700 | =head3 Bug reports and feature requests | ||||||
701 | |||||||
702 | Please report any other bugs or feature requests to | ||||||
703 | C |
||||||
704 | L |
||||||
705 | I will be notified, and then you'll automatically be notified of progress on | ||||||
706 | your bug as I make changes. | ||||||
707 | |||||||
708 | =head1 SUPPORT | ||||||
709 | |||||||
710 | This module is supported by the author through CPAN. The following links may be | ||||||
711 | of assistance: | ||||||
712 | |||||||
713 | =over 4 | ||||||
714 | |||||||
715 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
716 | |||||||
717 | L |
||||||
718 | |||||||
719 | =item * CPAN Ratings | ||||||
720 | |||||||
721 | L |
||||||
722 | |||||||
723 | =item * RT: CPAN's request tracker | ||||||
724 | |||||||
725 | L |
||||||
726 | |||||||
727 | =item * Search CPAN | ||||||
728 | |||||||
729 | L |
||||||
730 | |||||||
731 | =back | ||||||
732 | |||||||
733 | =head1 ACKNOWLEDGEMENTS | ||||||
734 | |||||||
735 | This module was inspired by Bart Lateur's PerlMonks node 'Cleaning up HTML' | ||||||
736 | (L |
||||||
737 | and the author. | ||||||
738 | |||||||
739 | =head1 AUTHOR | ||||||
740 | |||||||
741 | Peter Jaquiery | ||||||
742 | CPAN ID: GRANDPA | ||||||
743 | grandpa@cpan.org | ||||||
744 | |||||||
745 | =head1 COPYRIGHT & LICENSE | ||||||
746 | |||||||
747 | This program is free software; you can redistribute | ||||||
748 | it and/or modify it under the same terms as Perl itself. | ||||||
749 | |||||||
750 | The full text of the license can be found in the | ||||||
751 | LICENSE file included with this module. | ||||||
752 | |||||||
753 | =cut |