lib/HTML/Normalize.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 230 | 243 | 94.6 |
branch | 97 | 120 | 80.8 |
condition | 31 | 45 | 68.8 |
subroutine | 26 | 26 | 100.0 |
pod | 2 | 2 | 100.0 |
total | 386 | 436 | 88.5 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::Normalize; | ||||||
2 | |||||||
3 | 1 | 1 | 185153 | use strict; | |||
1 | 3 | ||||||
1 | 37 | ||||||
4 | 1 | 1 | 6 | use warnings; | |||
1 | 2 | ||||||
1 | 33 | ||||||
5 | 1 | 1 | 4 | use HTML::Entities; | |||
1 | 6 | ||||||
1 | 69 | ||||||
6 | 1 | 1 | 5 | use HTML::TreeBuilder; | |||
1 | 1 | ||||||
1 | 6 | ||||||
7 | 1 | 1 | 33 | use HTML::Tagset; | |||
1 | 1 | ||||||
1 | 25 | ||||||
8 | 1 | 1 | 4 | use Carp; | |||
1 | 8 | ||||||
1 | 78 | ||||||
9 | |||||||
10 | BEGIN { | ||||||
11 | 1 | 1 | 5 | use Exporter (); | |||
1 | 2 | ||||||
1 | 24 | ||||||
12 | 1 | 1 | 12 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |||
1 | 2 | ||||||
1 | 116 | ||||||
13 | 1 | 1 | 2 | $VERSION = '1.0003'; | |||
14 | 1 | 17 | @ISA = qw(Exporter); | ||||
15 | 1 | 2 | @EXPORT = qw(); | ||||
16 | 1 | 3 | @EXPORT_OK = qw(); | ||||
17 | 1 | 3898 | %EXPORT_TAGS = (); | ||||
18 | } | ||||||
19 | |||||||
20 | =head1 NAME | ||||||
21 | |||||||
22 | HTML::Normalize - HTML light weight cleanup | ||||||
23 | |||||||
24 | =head1 VERSION | ||||||
25 | |||||||
26 | Version 1.0003 | ||||||
27 | |||||||
28 | =head1 SYNOPSIS | ||||||
29 | |||||||
30 | my $norm = HTML::Normalize->new (); | ||||||
31 | my $cleanHtml = $norm->cleanup (-html => $dirtyHtml); | ||||||
32 | |||||||
33 | =head1 DESCRIPTION | ||||||
34 | |||||||
35 | HTML::Normalize uses HTML::TreeBuilder to parse an HTML string then processes | ||||||
36 | the resultant tree to clean up various structural issues in the original HTML. | ||||||
37 | The result is then rendered using HTML::Element's as_HTML member. | ||||||
38 | |||||||
39 | Key structural clean ups fix tag soup (C<< foo >> becomes C<< | ||||||
40 | foo >>) and inline/block element nesting (C<< | ||||||
41 | foo >> becomes C<<foo >>). C<<>> |
||||||
42 | tags at the start or end of a link element are migrated out of the element. | ||||||
43 | |||||||
44 | Note that HTML::Normalize's approach to cleaning up tag soup is different than | ||||||
45 | that used by HTML::Tidy. HTML::Tidy tends to enforce nested and swaps end tags | ||||||
46 | to achieve that. HTML::Normalize inserts extra tags to allow correctly taged | ||||||
47 | overlapped markup. | ||||||
48 | |||||||
49 | HTML::Normalize can also remove attributes set to default values and empty | ||||||
50 | elements. For example a C<< >> | ||||||
51 | element would become and C<< >> and C<< | ||||||
52 | face="Verdana" size="1"> >> would be removed if Verdana size 1 is set as the | ||||||
53 | default font. | ||||||
54 | |||||||
55 | =head1 Methods | ||||||
56 | |||||||
57 | C |
||||||
58 | |||||||
59 | C |
||||||
60 | parses the HTML to generate the internal representation. It then edits the | ||||||
61 | internal representation and renders the result back into HTML. | ||||||
62 | |||||||
63 | Note that I |
||||||
64 | process. | ||||||
65 | |||||||
66 | Generally errors are handled by carping and may be detected in both I |
||||||
67 | I |
||||||
68 | |||||||
69 | =cut | ||||||
70 | |||||||
71 | =head2 new | ||||||
72 | |||||||
73 | Create a new C |
||||||
74 | |||||||
75 | my $norm = HTML::Normalize->new (); | ||||||
76 | |||||||
77 | =over 4 | ||||||
78 | |||||||
79 | =item I<-compact>: optional | ||||||
80 | |||||||
81 | Setting C<< -compact => 1 >> suppresses generation of 'optional' close tags. | ||||||
82 | This reduces the sizeof the output slightly at the expense of breaking any hope | ||||||
83 | of XHTML compliance. | ||||||
84 | |||||||
85 | =item I<-default>: optional - multiple | ||||||
86 | |||||||
87 | Define a default attribute for an element. Default attributes are removed if the | ||||||
88 | attribute value has not been overridden in a parent node. For element such as | ||||||
89 | 'font' this may result in the element being removed if no attributes remain. | ||||||
90 | |||||||
91 | C<-default> takes a string of the form 'tag attribute=value' as an argument. | ||||||
92 | For example: | ||||||
93 | |||||||
94 | -default => 'font face="Verdana"' | ||||||
95 | |||||||
96 | would specify that the face "Verdana" is the default face attribute for font | ||||||
97 | elements. | ||||||
98 | |||||||
99 | I |
||||||
100 | matches: | ||||||
101 | |||||||
102 | /(~|qr)\s*(.).*\1\s*$/ | ||||||
103 | |||||||
104 | except that the paired delimiters [], {}, () and <> are also accepted as pattern | ||||||
105 | delimiters. | ||||||
106 | |||||||
107 | Literal match values should not encode entities, but remember that quotes around | ||||||
108 | attribute values are optional for some values so the outer pair of quote | ||||||
109 | characters will be removed if present. The match value extends to the end of the | ||||||
110 | line and is not bounded by quote qharacters (except as noted earlier) so no | ||||||
111 | quoting of "special" characters is required - there are no special characters. | ||||||
112 | |||||||
113 | Multiple default attributes may be provided but only one default value is | ||||||
114 | allowed for any one tag/attribute pair. | ||||||
115 | |||||||
116 | Default values are case sensitive. However you can use the regular expression | ||||||
117 | form to overcome this limitation. | ||||||
118 | |||||||
119 | =item I<-distribute>: optional - default true | ||||||
120 | |||||||
121 | Distribute inline elements over children if the children are block level | ||||||
122 | elements. For example: | ||||||
123 | |||||||
124 | foo bar |
||||||
125 | |||||||
126 | becomes: | ||||||
127 | |||||||
128 | foo bar |
||||||
129 | |||||||
130 | This action is only taken if all the child elements are block level elements. | ||||||
131 | |||||||
132 | =item I<-expelbr>: optional - default true | ||||||
133 | |||||||
134 | If C<-expelbr> is true (the default) break elements at the edges of link | ||||||
135 | elements are expelled from the link element. Thus: | ||||||
136 | |||||||
137 | link text |
||||||
138 | |||||||
139 | becomes | ||||||
140 | |||||||
141 | link text |
||||||
142 | |||||||
143 | =item I<-html>: required | ||||||
144 | |||||||
145 | the HTML string to clean. | ||||||
146 | |||||||
147 | =item I<-indent>: optional - default ' ' | ||||||
148 | |||||||
149 | String used to indent formatted output. Ignored if I<-unformatted> is true. | ||||||
150 | |||||||
151 | =item I<-keepimplicit>: optional | ||||||
152 | |||||||
153 | as_HTML adds various HTML required sections such as head and body elements. By | ||||||
154 | default HTML::Normalize removes these elements so that it is suitable for | ||||||
155 | processing HTML fragments. Set C<-keepimplicit => 1> to render the implicit | ||||||
156 | elements. | ||||||
157 | |||||||
158 | Note that if this option is true, the extra nodes will be generated regardless | ||||||
159 | of their presence in the original HTML. | ||||||
160 | |||||||
161 | =item I<-maxlinelen>: optional - default 80 | ||||||
162 | |||||||
163 | Notional maximum line length if I<-selfrender> is true. The line length may be | ||||||
164 | exceeded if no suitable break position is found. Note that the current indent is | ||||||
165 | included in the line length. | ||||||
166 | |||||||
167 | =item I<-selfrender>: optional | ||||||
168 | |||||||
169 | Use the experimental HTML::Normalize code to render HTML rather than using | ||||||
170 | HTML::Element's renderer. This code has not been tested against a wide range of | ||||||
171 | HTML and may be unreliable. It's advantage is that it produces (in the author's | ||||||
172 | opinion) prettier output than HTML::Element's as_HTML member. | ||||||
173 | |||||||
174 | =item I<-unformatted>: optional | ||||||
175 | |||||||
176 | Suppress output formatting. By default as_HTML is called as | ||||||
177 | |||||||
178 | as_HTML (undef, ' ', {}) | ||||||
179 | |||||||
180 | which wraps and indents elements. Setting C<< -unformatted => 1 >> suppresses | ||||||
181 | generation of line breaks and indentation reducing the size of the output | ||||||
182 | slightly. | ||||||
183 | |||||||
184 | =back | ||||||
185 | |||||||
186 | =cut | ||||||
187 | |||||||
188 | my %paramTypes = ( | ||||||
189 | |||||||
190 | # 0: optional once | ||||||
191 | # 1: required once | ||||||
192 | # 2: optional, many allowed | ||||||
193 | -compact => [0, 0], | ||||||
194 | -default => [2, undef], | ||||||
195 | -distribute => [0, 1], | ||||||
196 | -expelbr => [0, 1], | ||||||
197 | -html => [1, undef], | ||||||
198 | -indent => [0, ' '], | ||||||
199 | -keepimplicit => [0, 0], | ||||||
200 | -maxlinelen => [0, 80], | ||||||
201 | -selfrender => [0, 0], | ||||||
202 | -unformatted => [0, 0], | ||||||
203 | ); | ||||||
204 | my $regex = ' | ||||||
205 | (?:~|qr)\s* | ||||||
206 | (?: | ||||||
207 | (.).*\4 # regex quote char delimited | ||||||
208 | |<.*> # regex <> delimited | ||||||
209 | |{.*} # regex {} delimited | ||||||
210 | |\[.*\] # regex [] delimited | ||||||
211 | |\(.*\) # regex () delimited | ||||||
212 | )i? # Regex match | ||||||
213 | '; | ||||||
214 | |||||||
215 | sub new { | ||||||
216 | 19 | 19 | 1 | 256805 | my ($self, @params) = @_; | ||
217 | |||||||
218 | 19 | 50 | 110 | unless (ref $self) { | |||
219 | 19 | 95 | $self = bless {}, $self; | ||||
220 | 19 | 154 | $self->{both} = qr/^(del|ins)$/i; | ||||
221 | 19 | 112 | $self->{inline} = qr/^(b|i|s|font|span)$/i; | ||||
222 | 19 | 92 | $self->{block} = qr/^(p|table|div)$/i; | ||||
223 | 19 | 83 | $self->{needattr} = qr/^(font|span)$/i; | ||||
224 | 19 | 96 | $self->{selfclose} = qr/^(br)$/i; | ||||
225 | } | ||||||
226 | |||||||
227 | $self->_validateParams ( | ||||||
228 | 19 | 154 | \@params, | ||||
229 | [ | ||||||
230 | qw(-compact -default -distribute -expelbr -keepimplicit -unformatted ) | ||||||
231 | ], | ||||||
232 | [] | ||||||
233 | ); | ||||||
234 | |||||||
235 | # Add 'div' to the closure barriers list to avoid changing: | ||||||
236 | # foo |
||||||
237 | # into: | ||||||
238 | # foo |
||||||
239 | 19 | 9288 | my $bar = \@HTML::Tagset::p_closure_barriers; | ||||
240 | 19 | 50 | 81 | push @$bar, 'div' unless grep { $_ eq 'div' } @$bar; | |||
285 | 955 | ||||||
241 | |||||||
242 | 19 | 97 | return $self; | ||||
243 | } | ||||||
244 | |||||||
245 | sub DESTROY { | ||||||
246 | 19 | 19 | 209 | my $self = shift; | |||
247 | 19 | 50 | 179 | $self->{root}->delete if $self->{root}; | |||
248 | } | ||||||
249 | |||||||
250 | sub _validateParams { | ||||||
251 | 38 | 38 | 78 | my ($self, $params, $okParams, $requiredParams) = @_; | |||
252 | |||||||
253 | 38 | 50 | 125 | $params ||= []; | |||
254 | 38 | 50 | 81 | $okParams ||= []; | |||
255 | 38 | 50 | 86 | $requiredParams ||= []; | |||
256 | |||||||
257 | # Validate parameters | ||||||
258 | 38 | 96 | while (@$params) { | ||||
259 | 60 | 141 | my ($key, $value) = splice @$params, 0, 2; | ||||
260 | |||||||
261 | 60 | 130 | $key = lc $key; | ||||
262 | 60 | 50 | 154 | croak "$key is not a valid parameter name" if !exists $paramTypes{$key}; | |||
263 | 60 | 50 | 66 | 340 | croak "$key parameter may only be used once" | ||
264 | if $paramTypes{$key}[0] < 2 && exists $self->{$key}; | ||||||
265 | |||||||
266 | 60 | 100 | 153 | if ($paramTypes{$key}[0] < 2) { | |||
267 | 37 | 87 | $self->{$key} = $value; | ||||
268 | 37 | 103 | next; | ||||
269 | } | ||||||
270 | |||||||
271 | 23 | 31 | push @{$self->{$key}}, $value; | ||||
23 | 1261 | ||||||
272 | } | ||||||
273 | |||||||
274 | # Ensure we got required parameters | ||||||
275 | 38 | 182 | for my $key (@$requiredParams) { | ||||
276 | 19 | 50 | 56 | croak "Invalid parameter name: $key" unless exists $paramTypes{$key}; | |||
277 | 19 | 50 | 63 | $self->{$key} = $paramTypes{$key}[1] unless exists $self->{$key}; | |||
278 | 19 | 50 | 33 | 147 | next if $paramTypes{$key}[0] != 1 or exists $self->{$key}; | ||
279 | 0 | 0 | croak "The $key parameter is missing. It is required."; | ||||
280 | } | ||||||
281 | } | ||||||
282 | |||||||
283 | =head2 cleanup | ||||||
284 | |||||||
285 | C |
||||||
286 | |||||||
287 | my $cleanHtml = $norm->cleanup (); | ||||||
288 | |||||||
289 | =cut | ||||||
290 | |||||||
291 | sub cleanup { | ||||||
292 | 19 | 19 | 1 | 48 | my ($self, @params) = @_; | ||
293 | |||||||
294 | 19 | 402 | $self->_validateParams (\@params, [keys %paramTypes], ['-html']); | ||||
295 | |||||||
296 | # Check we got all required parameters and set any defaults | ||||||
297 | 19 | 109 | for my $param (keys %paramTypes) { | ||||
298 | 190 | 100 | 402 | next if exists $self->{$param}; | |||
299 | 147 | 100 | 8829 | next if $paramTypes{$param}[0] > 1; | |||
300 | |||||||
301 | 134 | 50 | 493 | croak "A $param parameter must be provided. None was." | |||
302 | if $paramTypes{$param}[0] == 1; | ||||||
303 | |||||||
304 | # Set missing param to default | ||||||
305 | 134 | 487 | $self->{$param} = $paramTypes{$param}[1]; | ||||
306 | } | ||||||
307 | |||||||
308 | # Unpack any -default parameters | ||||||
309 | 19 | 78 | for my $default (@{$self->{-default}}) { | ||||
19 | 80 | ||||||
310 | 23 | 431 | my ($tag, $attrib, $value) = | ||||
311 | $default =~ / | ||||||
312 | (\w+)\s+ # Tag | ||||||
313 | (\w+)\s* # Attribute | ||||||
314 | (?:=\s*(?=[\w'"])|=(?=~)) | ||||||
315 | ( '[^']*' # Single quoted | ||||||
316 | |"[^"]*" # Double quoted | ||||||
317 | |\w+ # Unquoted | ||||||
318 | |$regex # regex match | ||||||
319 | )\s* # Value | ||||||
320 | $/x; | ||||||
321 | |||||||
322 | 23 | 50 | 66 | croak "Badly formed default attribute string: $default" | |||
323 | unless defined $value; | ||||||
324 | 23 | 82 | $_ = lc for $tag, $attrib; | ||||
325 | |||||||
326 | 23 | 50 | 33 | 107 | croak "Conflicting defaults given:\n" | ||
327 | . " $tag $attrib=$self->{defaults}{$tag}{$attrib}\n" | ||||||
328 | . "and\n $tag $attrib=$value\n" | ||||||
329 | if exists $self->{defaults}{$tag}{$attrib} | ||||||
330 | and $self->{defaults}{$tag}{$attrib} ne $value; | ||||||
331 | |||||||
332 | 23 | 100 | 190 | if ($value =~ /^()()()$regex$/x) { | |||
333 | # Compile regex | ||||||
334 | 9 | 15 | $value =~ s/^~\s*/qr/; | ||||
335 | 9 | 1208 | $value = eval $value; | ||||
336 | } else { | ||||||
337 | # Strip quotes if present from match value | ||||||
338 | 14 | 74 | $value =~ s/^(['"])(.*)\1$/$2/; | ||||
339 | } | ||||||
340 | |||||||
341 | 23 | 128 | $self->{defaults}{$tag}{$attrib} = $value; | ||||
342 | } | ||||||
343 | |||||||
344 | 19 | 226 | $self->{root} = HTML::TreeBuilder->new; | ||||
345 | 19 | 8410 | $self->{root}->parse_content ($self->{-html}); | ||||
346 | 19 | 57288 | $self->{root}->elementify (); | ||||
347 | |||||||
348 | 19 | 4394 | 1 while $self->_cleanedupElt ($self->{root}); | ||||
349 | |||||||
350 | 19 | 40 | my $str = ''; | ||||
351 | |||||||
352 | 19 | 100 | 80 | if ($self->{-selfrender}) { | |||
353 | 11 | 28 | $self->{line} = ''; | ||||
354 | 11 | 44 | $str = $self->_render ($self->{root}, ''); | ||||
355 | } else { | ||||||
356 | 8 | 36 | my @renderOptions = (undef, ' ', {}); | ||||
357 | |||||||
358 | 8 | 100 | 49 | $renderOptions[1] = undef if $self->{-unformatted}; | |||
359 | 8 | 50 | 33 | $renderOptions[2] = undef if $self->{-compact}; | |||
360 | |||||||
361 | 8 | 18 | my $elt = $self->{root}; | ||||
362 | |||||||
363 | 8 | 100 | 28 | if (! $self->{-keepimplicit}) { | |||
364 | 7 | 42 | ($elt) = grep {$_->{_tag} eq 'body'} $self->{root}->descendents (); | ||||
31 | 5040 | ||||||
365 | } | ||||||
366 | |||||||
367 | 8 | 79 | $str .= ref $_ ? $_->as_HTML (@renderOptions) : $_ | ||||
368 | 8 | 50 | 14 | for @{$elt->{_content}}; | |||
369 | } | ||||||
370 | |||||||
371 | 19 | 6461 | return $str; | ||||
372 | } | ||||||
373 | |||||||
374 | sub _cleanedupElt { | ||||||
375 | 234 | 234 | 309 | my ($self, $parent) = @_; | |||
376 | |||||||
377 | 234 | 100 | 66 | 1870 | return 0 unless ref $parent && ref $parent->{_content}; | ||
378 | |||||||
379 | 185 | 228 | my $rescan = 1; # Set true to rescan the child element list | ||||
380 | 185 | 208 | my $touched; | ||||
381 | |||||||
382 | 185 | 393 | while ($rescan) { | ||||
383 | 219 | 243 | $rescan = 0; # Assume another scan not required after current scan | ||||
384 | 219 | 398 | ++$touched; | ||||
385 | |||||||
386 | 219 | 824 | for my $elt ($parent->content_list ()) { | ||||
387 | 410 | 100 | 3489 | next unless ref $elt; | |||
388 | |||||||
389 | 205 | 100 | 1100 | ++$rescan, last if $self->_cleanedupElt ($elt); | |||
390 | 188 | 100 | 580 | next if exists $elt->{_implicit}; | |||
391 | |||||||
392 | 120 | 100 | 911 | ++$rescan, last if $self->_removedDefaults ($elt); | |||
393 | 109 | 100 | 782 | ++$rescan, last if $self->_distributedElements ($elt); | |||
394 | 105 | 50 | 665 | ++$rescan, last if $self->_normalizedElements ($elt); | |||
395 | 105 | 100 | 284 | ++$rescan, last if $self->_expeledBr ($elt); | |||
396 | 104 | 100 | 344 | ++$rescan, last if $self->_removedEmpty ($elt); | |||
397 | } | ||||||
398 | } | ||||||
399 | |||||||
400 | 185 | 719 | return $touched > 1; | ||||
401 | } | ||||||
402 | |||||||
403 | sub _distributedElements { | ||||||
404 | 109 | 109 | 146 | my ($self, $elt) = @_; | |||
405 | |||||||
406 | 109 | 50 | 323 | return 0 unless $self->{-distribute}; | |||
407 | 109 | 100 | 100 | 1964 | return 0 | ||
408 | unless $elt->{_tag} =~ $self->{inline} | ||||||
409 | && $elt->{_tag} =~ $self->{needattr}; | ||||||
410 | |||||||
411 | 44 | 127 | my @elts = $elt->content_list (); | ||||
412 | 44 | 100 | 363 | my $blockElts = grep {ref $_ && $_->{_tag} =~ $self->{block}} @elts; | |||
65 | 303 | ||||||
413 | |||||||
414 | # Done unless all child elements are block level elements | ||||||
415 | 44 | 100 | 66 | 287 | return 0 unless @elts && @elts == $blockElts; | ||
416 | |||||||
417 | # Distribute inline element over and block elements | ||||||
418 | 4 | 20 | $elt->replace_with_content (); | ||||
419 | |||||||
420 | 4 | 105 | for my $block (@elts) { | ||||
421 | 5 | 33 | my @nested = $block->detach_content (); | ||||
422 | 5 | 62 | my $clone = $elt->clone (); | ||||
423 | |||||||
424 | 5 | 95 | $block->push_content ($clone); | ||||
425 | 5 | 77 | $clone->push_content (@nested); | ||||
426 | } | ||||||
427 | |||||||
428 | 4 | 71 | $elt->delete (); | ||||
429 | 4 | 91 | return 1; | ||||
430 | } | ||||||
431 | |||||||
432 | sub _normalizedElements { | ||||||
433 | 105 | 105 | 777 | my ($self, $elt) = @_; | |||
434 | |||||||
435 | 105 | 100 | 678 | return 0 unless $elt->{_tag} =~ $self->{inline}; | |||
436 | |||||||
437 | 42 | 163 | my @elts = $elt->content_list (); | ||||
438 | |||||||
439 | # Ok unless element contains single block level child | ||||||
440 | 42 | 50 | 100 | 529 | return 0 | ||
66 | |||||||
441 | unless @elts == 1 | ||||||
442 | && ref $elts[0] | ||||||
443 | && $elts[0]->{_tag} =~ $self->{block}; | ||||||
444 | |||||||
445 | # Invert order of inline and block elements | ||||||
446 | 0 | 0 | my @nested = $elts[0]->detach_content (); | ||||
447 | |||||||
448 | 0 | 0 | $elt->replace_with ($elts[0]); | ||||
449 | 0 | 0 | $elts[0]->push_content ($elt); | ||||
450 | 0 | 0 | $elt->push_content (@nested); | ||||
451 | 0 | 0 | $elt = $elts[0]; | ||||
452 | |||||||
453 | 0 | 0 | $_->replace_with_content ()->delete () | ||||
454 | 0 | 0 | for grep {$self->_removedEmpty ($_)} @elts; | ||||
455 | |||||||
456 | 0 | 0 | return 1; | ||||
457 | } | ||||||
458 | |||||||
459 | sub _expeledBr { | ||||||
460 | 105 | 105 | 141 | my ($self, $elt) = @_; | |||
461 | |||||||
462 | 105 | 100 | 100 | 1518 | return 0 unless $elt->{_tag} eq 'a' && $self->{-expelbr}; | ||
463 | 4 | 50 | 13 | return 0 unless exists $elt->{_content}; | |||
464 | |||||||
465 | 4 | 6 | my $adjusted; | ||||
466 | 4 | 7 | for my $index (0, -1) { | ||||
467 | 8 | 15 | my $br = $elt->{_content}[$index]; | ||||
468 | |||||||
469 | 8 | 100 | 66 | 947 | next unless ref $br && $br->{_tag} eq 'br'; | ||
470 | 2 | 100 | 18 | $index == 0 | |||
471 | ? $br->detach ()->preinsert ($br) | ||||||
472 | : $br->detach ()->postinsert ($br); | ||||||
473 | 2 | 130 | ++$adjusted; | ||||
474 | } | ||||||
475 | |||||||
476 | 4 | 16 | return $adjusted; | ||||
477 | } | ||||||
478 | |||||||
479 | sub _removedDefaults { | ||||||
480 | 120 | 120 | 161 | my ($self, $elt) = @_; | |||
481 | |||||||
482 | 120 | 100 | 2100 | return 0 unless exists $self->{defaults}{$elt->{_tag}}; | |||
483 | |||||||
484 | 32 | 204 | my $delAttribs = $self->{defaults}{$elt->{_tag}}; | ||||
485 | |||||||
486 | 32 | 87 | for my $attrib (keys %$delAttribs) { | ||||
487 | 141 | 100 | 324 | next unless exists $elt->{$attrib}; | |||
488 | |||||||
489 | 45 | 199 | my $value = $delAttribs->{$attrib}; | ||||
490 | 45 | 400 | my @parentAttribs; | ||||
491 | 45 | 113 | my @criteria = (_tag => $elt->{_tag}); | ||||
492 | |||||||
493 | 45 | 100 | 105 | if ('Regexp' eq ref $value) { | |||
494 | 19 | 100 | 127 | next unless $elt->{$attrib} =~ $value; | |||
495 | push @criteria, sub { | ||||||
496 | 20 | 20 | 4365 | my $attr = $_[0]->attr("$attrib"); | |||
497 | 20 | 50 | 227 | return 0 unless defined $attr; | |||
498 | 20 | 121 | return $attr !~ $value; | ||||
499 | 14 | 224 | }; | ||||
500 | } else { | ||||||
501 | 26 | 40 | my $value = $delAttribs->{$attrib}; | ||||
502 | |||||||
503 | 26 | 100 | 110 | next unless $elt->{$attrib} eq $value; | |||
504 | 20 | 575 | push @criteria, ($attrib => qr/^(?!\Q$value\E)/i); | ||||
505 | } | ||||||
506 | |||||||
507 | 34 | 286 | @parentAttribs = $elt->look_up (@criteria); | ||||
508 | |||||||
509 | # Don't delete attribute required to restore default | ||||||
510 | 34 | 100 | 3959 | next if @parentAttribs; | |||
511 | 24 | 129 | delete $elt->{$attrib}; | ||||
512 | } | ||||||
513 | |||||||
514 | 32 | 99 | return $self->_removedEmpty ($elt); | ||||
515 | } | ||||||
516 | |||||||
517 | sub _removedEmpty { | ||||||
518 | 136 | 136 | 199 | my ($self, $elt) = @_; | |||
519 | |||||||
520 | 136 | 100 | 1115 | return 0 if grep {!/^_/} $elt->all_attr_names (); | |||
466 | 2680 | ||||||
521 | 69 | 100 | 1985 | return 0 unless $elt->{_tag} =~ $self->{needattr}; | |||
522 | |||||||
523 | # Remove redundant element - no attributes left | ||||||
524 | 12 | 59 | $elt->replace_with ($elt->detach_content ()); | ||||
525 | 12 | 759 | $elt->delete (); | ||||
526 | 12 | 636 | return 1; | ||||
527 | } | ||||||
528 | |||||||
529 | sub _render { | ||||||
530 | 56 | 56 | 82 | my ($self, $elt, $indent) = @_; | |||
531 | |||||||
532 | 56 | 100 | 66 | 626 | return '' | ||
100 | |||||||
533 | unless $self->{-keepimplicit} || !$elt->{_implicit} || $elt->{_content}; | ||||||
534 | |||||||
535 | 45 | 58 | my $str = ''; | ||||
536 | |||||||
537 | 45 | 100 | 66 | 486 | if (! $self->{-keepimplicit} && $elt->{_implicit}) { | ||
50 | |||||||
100 | |||||||
538 | 22 | 200 | return $self->_renderContents ($elt, $indent); | ||||
539 | |||||||
540 | } elsif ($elt->{_tag} =~ $self->{selfclose}) { | ||||||
541 | 0 | 0 | $str .= $self->_append ("<$elt->{_tag} />", $indent); | ||||
542 | |||||||
543 | } elsif ($HTML::Tagset::isPhraseMarkup{$elt->{_tag}}) { | ||||||
544 | 7 | 34 | $str .= $self->_append ("<$elt->{_tag}", $indent); | ||||
545 | 7 | 22 | $str .= $self->_renderAttrs ($elt, $indent); | ||||
546 | 7 | 22 | $str .= $self->_renderContents ($elt, $indent); | ||||
547 | 7 | 30 | $str .= $self->_append ("$elt->{_tag}>",$indent); | ||||
548 | |||||||
549 | } else { | ||||||
550 | 16 | 53 | my $indented = "$indent$self->{-indent}"; | ||||
551 | |||||||
552 | 16 | 73 | $str = $self->_flushLine ($indent); | ||||
553 | 16 | 47 | $self->{line} .= "<$elt->{_tag}"; | ||||
554 | 16 | 31 | $self->{ishead} = 1; | ||||
555 | 16 | 51 | $str .= $self->_renderAttrs ($elt, $indented); | ||||
556 | 16 | 49 | $str .= $self->_renderContents ($elt, $indented); | ||||
557 | 16 | 62 | $str .= $self->_append ("$elt->{_tag}>", $indented); | ||||
558 | 16 | 161 | $str .= $self->_flushLine ($indented); | ||||
559 | } | ||||||
560 | |||||||
561 | 23 | 11747 | return $str; | ||||
562 | } | ||||||
563 | |||||||
564 | sub _append { | ||||||
565 | 39 | 39 | 178 | my ($self, $tail, $indent) = @_; | |||
566 | |||||||
567 | 39 | 50 | 238 | if ((length ($self->{line}) + length ($tail) + length ($indent)) > $self->{-maxlinelen}) { | |||
568 | 0 | 0 | my $str = $self->_flushLine ($indent); | ||||
569 | |||||||
570 | 0 | 0 | $self->{line} = $tail; | ||||
571 | 0 | 0 | return $str; | ||||
572 | } else { | ||||||
573 | 39 | 61 | $self->{line} .= $tail; | ||||
574 | 39 | 94 | return ''; | ||||
575 | } | ||||||
576 | } | ||||||
577 | |||||||
578 | sub _flushLine { | ||||||
579 | 32 | 32 | 46 | my ($self, $indent) = @_; | |||
580 | |||||||
581 | 32 | 100 | 231 | return '' unless length $self->{line}; | |||
582 | |||||||
583 | 18 | 18 | my $str; | ||||
584 | |||||||
585 | 18 | 100 | 44 | if ($self->{-unformatted}) { | |||
586 | 5 | 8 | $str = $self->{line}; | ||||
587 | |||||||
588 | } else { | ||||||
589 | 13 | 50 | 38 | if ($self->{ishead}) { | |||
590 | 13 | 52 | substr ($indent, -length $self->{-indent}) = ''; | ||||
591 | 13 | 26 | $self->{isHead} = undef; | ||||
592 | } | ||||||
593 | |||||||
594 | 13 | 35 | $str = "$indent$self->{line}\n"; | ||||
595 | } | ||||||
596 | |||||||
597 | 18 | 29 | $self->{line} = ''; | ||||
598 | 18 | 42 | return $str; | ||||
599 | } | ||||||
600 | |||||||
601 | sub _renderAttrs { | ||||||
602 | 23 | 23 | 42 | my ($self, $elt, $indent) = @_; | |||
603 | 23 | 28 | my $str = ''; | ||||
604 | 23 | 52 | my @attrs = grep {! /^_/} keys %$elt; | ||||
78 | 238 | ||||||
605 | |||||||
606 | $str .= $self->_append ( | ||||||
607 | qq( $_=") . encode_entities ($elt->{$_}) . qq("), | ||||||
608 | $indent | ||||||
609 | ) | ||||||
610 | 23 | 121 | for sort @attrs; | ||||
611 | 23 | 50 | $self->{line} .= '>'; | ||||
612 | 23 | 55 | return $str; | ||||
613 | } | ||||||
614 | |||||||
615 | sub _renderContents { | ||||||
616 | 45 | 45 | 75 | my ($self, $elt, $indent) = @_; | |||
617 | 45 | 135 | my $str = ''; | ||||
618 | |||||||
619 | 45 | 48 | for my $subElt (@{$elt->{_content}}) { | ||||
45 | 136 | ||||||
620 | 63 | 100 | 110 | if (! ref $subElt) { | |||
621 | 18 | 48 | $str .= $self->_renderText ($subElt, $indent); | ||||
622 | } else { | ||||||
623 | 45 | 203 | $str .= $self->_render ($subElt, $indent); | ||||
624 | } | ||||||
625 | } | ||||||
626 | |||||||
627 | 45 | 149 | return $str; | ||||
628 | } | ||||||
629 | |||||||
630 | |||||||
631 | sub _renderText { | ||||||
632 | 18 | 18 | 35 | my ($self, $elt, $indent) = @_; | |||
633 | 18 | 73 | my $str = $self->{line} . encode_entities ($elt); | ||||
634 | |||||||
635 | 18 | 100 | 344 | if ($self->{-unformatted}) { | |||
636 | 1 | 3 | $self->{line} = ''; | ||||
637 | |||||||
638 | } else { | ||||||
639 | 17 | 174 | my $maxLen = $self->{-maxlinelen} - length $indent; | ||||
640 | |||||||
641 | 17 | 83 | $str =~ s/(.{,$maxLen})\s+/$indent$1\n/g; | ||||
642 | 17 | 738 | ($str, $self->{line}) = $str =~ /(.*\n)?(.*)/; | ||||
643 | 17 | 50 | 52 | $str = '' unless defined $str; | |||
644 | 17 | 50 | 47 | $self->{line} = '' unless defined $self->{line}; | |||
645 | } | ||||||
646 | |||||||
647 | 18 | 67 | return $str; | ||||
648 | } | ||||||
649 | |||||||
650 | |||||||
651 | 1; | ||||||
652 | |||||||
653 | =head1 BUGS | ||||||
654 | |||||||
655 | =head3 p/div/p parsing issue | ||||||
656 | |||||||
657 | HTML::TreeBuilder 3.23 and earlier misparses: | ||||||
658 | |||||||
659 | foo |
||||||
660 | |||||||
661 | as: | ||||||
662 | |||||||
663 | foo |
||||||
664 | |||||||
665 | A work around in HTML::Normalize turns that into | ||||||
666 | |||||||
667 | foo |
||||||
668 | |||||||
669 | which is probably still incorrect - div elements should not nest within p | ||||||
670 | elements. A better fix for the problem requires HTML::TreeBuilder to be fixed. | ||||||
671 | |||||||
672 | =head3 Bug reports and feature requests | ||||||
673 | |||||||
674 | Please report any other bugs or feature requests to | ||||||
675 | C |
||||||
676 | L |
||||||
677 | I will be notified, and then you'll automatically be notified of progress on | ||||||
678 | your bug as I make changes. | ||||||
679 | |||||||
680 | =head1 SUPPORT | ||||||
681 | |||||||
682 | This module is supported by the author through CPAN. The following links may be | ||||||
683 | of assistance: | ||||||
684 | |||||||
685 | =over 4 | ||||||
686 | |||||||
687 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
688 | |||||||
689 | L |
||||||
690 | |||||||
691 | =item * CPAN Ratings | ||||||
692 | |||||||
693 | L |
||||||
694 | |||||||
695 | =item * RT: CPAN's request tracker | ||||||
696 | |||||||
697 | L |
||||||
698 | |||||||
699 | =item * Search CPAN | ||||||
700 | |||||||
701 | L |
||||||
702 | |||||||
703 | =back | ||||||
704 | |||||||
705 | =head1 ACKNOWLEDGEMENTS | ||||||
706 | |||||||
707 | This module was inspired by Bart Lateur's PerlMonks node 'Cleaning up HTML' | ||||||
708 | (L |
||||||
709 | and the author. | ||||||
710 | |||||||
711 | =head1 AUTHOR | ||||||
712 | |||||||
713 | Peter Jaquiery | ||||||
714 | CPAN ID: GRANDPA | ||||||
715 | grandpa@cpan.org | ||||||
716 | |||||||
717 | =head1 COPYRIGHT & LICENSE | ||||||
718 | |||||||
719 | This program is free software; you can redistribute | ||||||
720 | it and/or modify it under the same terms as Perl itself. | ||||||
721 | |||||||
722 | The full text of the license can be found in the | ||||||
723 | LICENSE file included with this module. | ||||||
724 | |||||||
725 | =cut | ||||||
726 |