| lib/HTML/Normalize.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 230 | 243 | 94.6 |
| branch | 97 | 120 | 80.8 |
| condition | 31 | 45 | 68.8 |
| subroutine | 26 | 26 | 100.0 |
| pod | 2 | 2 | 100.0 |
| total | 386 | 436 | 88.5 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package HTML::Normalize; | ||||||
| 2 | |||||||
| 3 | 1 | 1 | 185153 | use strict; | |||
| 1 | 3 | ||||||
| 1 | 37 | ||||||
| 4 | 1 | 1 | 6 | use warnings; | |||
| 1 | 2 | ||||||
| 1 | 33 | ||||||
| 5 | 1 | 1 | 4 | use HTML::Entities; | |||
| 1 | 6 | ||||||
| 1 | 69 | ||||||
| 6 | 1 | 1 | 5 | use HTML::TreeBuilder; | |||
| 1 | 1 | ||||||
| 1 | 6 | ||||||
| 7 | 1 | 1 | 33 | use HTML::Tagset; | |||
| 1 | 1 | ||||||
| 1 | 25 | ||||||
| 8 | 1 | 1 | 4 | use Carp; | |||
| 1 | 8 | ||||||
| 1 | 78 | ||||||
| 9 | |||||||
| 10 | BEGIN { | ||||||
| 11 | 1 | 1 | 5 | use Exporter (); | |||
| 1 | 2 | ||||||
| 1 | 24 | ||||||
| 12 | 1 | 1 | 12 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |||
| 1 | 2 | ||||||
| 1 | 116 | ||||||
| 13 | 1 | 1 | 2 | $VERSION = '1.0003'; | |||
| 14 | 1 | 17 | @ISA = qw(Exporter); | ||||
| 15 | 1 | 2 | @EXPORT = qw(); | ||||
| 16 | 1 | 3 | @EXPORT_OK = qw(); | ||||
| 17 | 1 | 3898 | %EXPORT_TAGS = (); | ||||
| 18 | } | ||||||
| 19 | |||||||
| 20 | =head1 NAME | ||||||
| 21 | |||||||
| 22 | HTML::Normalize - HTML light weight cleanup | ||||||
| 23 | |||||||
| 24 | =head1 VERSION | ||||||
| 25 | |||||||
| 26 | Version 1.0003 | ||||||
| 27 | |||||||
| 28 | =head1 SYNOPSIS | ||||||
| 29 | |||||||
| 30 | my $norm = HTML::Normalize->new (); | ||||||
| 31 | my $cleanHtml = $norm->cleanup (-html => $dirtyHtml); | ||||||
| 32 | |||||||
| 33 | =head1 DESCRIPTION | ||||||
| 34 | |||||||
| 35 | HTML::Normalize uses HTML::TreeBuilder to parse an HTML string then processes | ||||||
| 36 | the resultant tree to clean up various structural issues in the original HTML. | ||||||
| 37 | The result is then rendered using HTML::Element's as_HTML member. | ||||||
| 38 | |||||||
| 39 | Key structural clean ups fix tag soup (C<< foo >> becomes C<< | ||||||
| 40 | foo >>) and inline/block element nesting (C<< | ||||||
| 41 | foo >> becomes C<<foo >>). C<<>> |
||||||
| 42 | tags at the start or end of a link element are migrated out of the element. | ||||||
| 43 | |||||||
| 44 | Note that HTML::Normalize's approach to cleaning up tag soup is different than | ||||||
| 45 | that used by HTML::Tidy. HTML::Tidy tends to enforce nested and swaps end tags | ||||||
| 46 | to achieve that. HTML::Normalize inserts extra tags to allow correctly taged | ||||||
| 47 | overlapped markup. | ||||||
| 48 | |||||||
| 49 | HTML::Normalize can also remove attributes set to default values and empty | ||||||
| 50 | elements. For example a C<< >> | ||||||
| 51 | element would become and C<< >> and C<< | ||||||
| 52 | face="Verdana" size="1"> >> would be removed if Verdana size 1 is set as the | ||||||
| 53 | default font. | ||||||
| 54 | |||||||
| 55 | =head1 Methods | ||||||
| 56 | |||||||
| 57 | C |
||||||
| 58 | |||||||
| 59 | C |
||||||
| 60 | parses the HTML to generate the internal representation. It then edits the | ||||||
| 61 | internal representation and renders the result back into HTML. | ||||||
| 62 | |||||||
| 63 | Note that I |
||||||
| 64 | process. | ||||||
| 65 | |||||||
| 66 | Generally errors are handled by carping and may be detected in both I |
||||||
| 67 | I |
||||||
| 68 | |||||||
| 69 | =cut | ||||||
| 70 | |||||||
| 71 | =head2 new | ||||||
| 72 | |||||||
| 73 | Create a new C |
||||||
| 74 | |||||||
| 75 | my $norm = HTML::Normalize->new (); | ||||||
| 76 | |||||||
| 77 | =over 4 | ||||||
| 78 | |||||||
| 79 | =item I<-compact>: optional | ||||||
| 80 | |||||||
| 81 | Setting C<< -compact => 1 >> suppresses generation of 'optional' close tags. | ||||||
| 82 | This reduces the sizeof the output slightly at the expense of breaking any hope | ||||||
| 83 | of XHTML compliance. | ||||||
| 84 | |||||||
| 85 | =item I<-default>: optional - multiple | ||||||
| 86 | |||||||
| 87 | Define a default attribute for an element. Default attributes are removed if the | ||||||
| 88 | attribute value has not been overridden in a parent node. For element such as | ||||||
| 89 | 'font' this may result in the element being removed if no attributes remain. | ||||||
| 90 | |||||||
| 91 | C<-default> takes a string of the form 'tag attribute=value' as an argument. | ||||||
| 92 | For example: | ||||||
| 93 | |||||||
| 94 | -default => 'font face="Verdana"' | ||||||
| 95 | |||||||
| 96 | would specify that the face "Verdana" is the default face attribute for font | ||||||
| 97 | elements. | ||||||
| 98 | |||||||
| 99 | I |
||||||
| 100 | matches: | ||||||
| 101 | |||||||
| 102 | /(~|qr)\s*(.).*\1\s*$/ | ||||||
| 103 | |||||||
| 104 | except that the paired delimiters [], {}, () and <> are also accepted as pattern | ||||||
| 105 | delimiters. | ||||||
| 106 | |||||||
| 107 | Literal match values should not encode entities, but remember that quotes around | ||||||
| 108 | attribute values are optional for some values so the outer pair of quote | ||||||
| 109 | characters will be removed if present. The match value extends to the end of the | ||||||
| 110 | line and is not bounded by quote qharacters (except as noted earlier) so no | ||||||
| 111 | quoting of "special" characters is required - there are no special characters. | ||||||
| 112 | |||||||
| 113 | Multiple default attributes may be provided but only one default value is | ||||||
| 114 | allowed for any one tag/attribute pair. | ||||||
| 115 | |||||||
| 116 | Default values are case sensitive. However you can use the regular expression | ||||||
| 117 | form to overcome this limitation. | ||||||
| 118 | |||||||
| 119 | =item I<-distribute>: optional - default true | ||||||
| 120 | |||||||
| 121 | Distribute inline elements over children if the children are block level | ||||||
| 122 | elements. For example: | ||||||
| 123 | |||||||
| 124 | foo bar |
||||||
| 125 | |||||||
| 126 | becomes: | ||||||
| 127 | |||||||
| 128 | foo bar |
||||||
| 129 | |||||||
| 130 | This action is only taken if all the child elements are block level elements. | ||||||
| 131 | |||||||
| 132 | =item I<-expelbr>: optional - default true | ||||||
| 133 | |||||||
| 134 | If C<-expelbr> is true (the default) break elements at the edges of link | ||||||
| 135 | elements are expelled from the link element. Thus: | ||||||
| 136 | |||||||
| 137 | link text |
||||||
| 138 | |||||||
| 139 | becomes | ||||||
| 140 | |||||||
| 141 | link text |
||||||
| 142 | |||||||
| 143 | =item I<-html>: required | ||||||
| 144 | |||||||
| 145 | the HTML string to clean. | ||||||
| 146 | |||||||
| 147 | =item I<-indent>: optional - default ' ' | ||||||
| 148 | |||||||
| 149 | String used to indent formatted output. Ignored if I<-unformatted> is true. | ||||||
| 150 | |||||||
| 151 | =item I<-keepimplicit>: optional | ||||||
| 152 | |||||||
| 153 | as_HTML adds various HTML required sections such as head and body elements. By | ||||||
| 154 | default HTML::Normalize removes these elements so that it is suitable for | ||||||
| 155 | processing HTML fragments. Set C<-keepimplicit => 1> to render the implicit | ||||||
| 156 | elements. | ||||||
| 157 | |||||||
| 158 | Note that if this option is true, the extra nodes will be generated regardless | ||||||
| 159 | of their presence in the original HTML. | ||||||
| 160 | |||||||
| 161 | =item I<-maxlinelen>: optional - default 80 | ||||||
| 162 | |||||||
| 163 | Notional maximum line length if I<-selfrender> is true. The line length may be | ||||||
| 164 | exceeded if no suitable break position is found. Note that the current indent is | ||||||
| 165 | included in the line length. | ||||||
| 166 | |||||||
| 167 | =item I<-selfrender>: optional | ||||||
| 168 | |||||||
| 169 | Use the experimental HTML::Normalize code to render HTML rather than using | ||||||
| 170 | HTML::Element's renderer. This code has not been tested against a wide range of | ||||||
| 171 | HTML and may be unreliable. It's advantage is that it produces (in the author's | ||||||
| 172 | opinion) prettier output than HTML::Element's as_HTML member. | ||||||
| 173 | |||||||
| 174 | =item I<-unformatted>: optional | ||||||
| 175 | |||||||
| 176 | Suppress output formatting. By default as_HTML is called as | ||||||
| 177 | |||||||
| 178 | as_HTML (undef, ' ', {}) | ||||||
| 179 | |||||||
| 180 | which wraps and indents elements. Setting C<< -unformatted => 1 >> suppresses | ||||||
| 181 | generation of line breaks and indentation reducing the size of the output | ||||||
| 182 | slightly. | ||||||
| 183 | |||||||
| 184 | =back | ||||||
| 185 | |||||||
| 186 | =cut | ||||||
| 187 | |||||||
| 188 | my %paramTypes = ( | ||||||
| 189 | |||||||
| 190 | # 0: optional once | ||||||
| 191 | # 1: required once | ||||||
| 192 | # 2: optional, many allowed | ||||||
| 193 | -compact => [0, 0], | ||||||
| 194 | -default => [2, undef], | ||||||
| 195 | -distribute => [0, 1], | ||||||
| 196 | -expelbr => [0, 1], | ||||||
| 197 | -html => [1, undef], | ||||||
| 198 | -indent => [0, ' '], | ||||||
| 199 | -keepimplicit => [0, 0], | ||||||
| 200 | -maxlinelen => [0, 80], | ||||||
| 201 | -selfrender => [0, 0], | ||||||
| 202 | -unformatted => [0, 0], | ||||||
| 203 | ); | ||||||
| 204 | my $regex = ' | ||||||
| 205 | (?:~|qr)\s* | ||||||
| 206 | (?: | ||||||
| 207 | (.).*\4 # regex quote char delimited | ||||||
| 208 | |<.*> # regex <> delimited | ||||||
| 209 | |{.*} # regex {} delimited | ||||||
| 210 | |\[.*\] # regex [] delimited | ||||||
| 211 | |\(.*\) # regex () delimited | ||||||
| 212 | )i? # Regex match | ||||||
| 213 | '; | ||||||
| 214 | |||||||
| 215 | sub new { | ||||||
| 216 | 19 | 19 | 1 | 256805 | my ($self, @params) = @_; | ||
| 217 | |||||||
| 218 | 19 | 50 | 110 | unless (ref $self) { | |||
| 219 | 19 | 95 | $self = bless {}, $self; | ||||
| 220 | 19 | 154 | $self->{both} = qr/^(del|ins)$/i; | ||||
| 221 | 19 | 112 | $self->{inline} = qr/^(b|i|s|font|span)$/i; | ||||
| 222 | 19 | 92 | $self->{block} = qr/^(p|table|div)$/i; | ||||
| 223 | 19 | 83 | $self->{needattr} = qr/^(font|span)$/i; | ||||
| 224 | 19 | 96 | $self->{selfclose} = qr/^(br)$/i; | ||||
| 225 | } | ||||||
| 226 | |||||||
| 227 | $self->_validateParams ( | ||||||
| 228 | 19 | 154 | \@params, | ||||
| 229 | [ | ||||||
| 230 | qw(-compact -default -distribute -expelbr -keepimplicit -unformatted ) | ||||||
| 231 | ], | ||||||
| 232 | [] | ||||||
| 233 | ); | ||||||
| 234 | |||||||
| 235 | # Add 'div' to the closure barriers list to avoid changing: | ||||||
| 236 | # foo |
||||||
| 237 | # into: | ||||||
| 238 | # foo |
||||||
| 239 | 19 | 9288 | my $bar = \@HTML::Tagset::p_closure_barriers; | ||||
| 240 | 19 | 50 | 81 | push @$bar, 'div' unless grep { $_ eq 'div' } @$bar; | |||
| 285 | 955 | ||||||
| 241 | |||||||
| 242 | 19 | 97 | return $self; | ||||
| 243 | } | ||||||
| 244 | |||||||
| 245 | sub DESTROY { | ||||||
| 246 | 19 | 19 | 209 | my $self = shift; | |||
| 247 | 19 | 50 | 179 | $self->{root}->delete if $self->{root}; | |||
| 248 | } | ||||||
| 249 | |||||||
| 250 | sub _validateParams { | ||||||
| 251 | 38 | 38 | 78 | my ($self, $params, $okParams, $requiredParams) = @_; | |||
| 252 | |||||||
| 253 | 38 | 50 | 125 | $params ||= []; | |||
| 254 | 38 | 50 | 81 | $okParams ||= []; | |||
| 255 | 38 | 50 | 86 | $requiredParams ||= []; | |||
| 256 | |||||||
| 257 | # Validate parameters | ||||||
| 258 | 38 | 96 | while (@$params) { | ||||
| 259 | 60 | 141 | my ($key, $value) = splice @$params, 0, 2; | ||||
| 260 | |||||||
| 261 | 60 | 130 | $key = lc $key; | ||||
| 262 | 60 | 50 | 154 | croak "$key is not a valid parameter name" if !exists $paramTypes{$key}; | |||
| 263 | 60 | 50 | 66 | 340 | croak "$key parameter may only be used once" | ||
| 264 | if $paramTypes{$key}[0] < 2 && exists $self->{$key}; | ||||||
| 265 | |||||||
| 266 | 60 | 100 | 153 | if ($paramTypes{$key}[0] < 2) { | |||
| 267 | 37 | 87 | $self->{$key} = $value; | ||||
| 268 | 37 | 103 | next; | ||||
| 269 | } | ||||||
| 270 | |||||||
| 271 | 23 | 31 | push @{$self->{$key}}, $value; | ||||
| 23 | 1261 | ||||||
| 272 | } | ||||||
| 273 | |||||||
| 274 | # Ensure we got required parameters | ||||||
| 275 | 38 | 182 | for my $key (@$requiredParams) { | ||||
| 276 | 19 | 50 | 56 | croak "Invalid parameter name: $key" unless exists $paramTypes{$key}; | |||
| 277 | 19 | 50 | 63 | $self->{$key} = $paramTypes{$key}[1] unless exists $self->{$key}; | |||
| 278 | 19 | 50 | 33 | 147 | next if $paramTypes{$key}[0] != 1 or exists $self->{$key}; | ||
| 279 | 0 | 0 | croak "The $key parameter is missing. It is required."; | ||||
| 280 | } | ||||||
| 281 | } | ||||||
| 282 | |||||||
| 283 | =head2 cleanup | ||||||
| 284 | |||||||
| 285 | C |
||||||
| 286 | |||||||
| 287 | my $cleanHtml = $norm->cleanup (); | ||||||
| 288 | |||||||
| 289 | =cut | ||||||
| 290 | |||||||
| 291 | sub cleanup { | ||||||
| 292 | 19 | 19 | 1 | 48 | my ($self, @params) = @_; | ||
| 293 | |||||||
| 294 | 19 | 402 | $self->_validateParams (\@params, [keys %paramTypes], ['-html']); | ||||
| 295 | |||||||
| 296 | # Check we got all required parameters and set any defaults | ||||||
| 297 | 19 | 109 | for my $param (keys %paramTypes) { | ||||
| 298 | 190 | 100 | 402 | next if exists $self->{$param}; | |||
| 299 | 147 | 100 | 8829 | next if $paramTypes{$param}[0] > 1; | |||
| 300 | |||||||
| 301 | 134 | 50 | 493 | croak "A $param parameter must be provided. None was." | |||
| 302 | if $paramTypes{$param}[0] == 1; | ||||||
| 303 | |||||||
| 304 | # Set missing param to default | ||||||
| 305 | 134 | 487 | $self->{$param} = $paramTypes{$param}[1]; | ||||
| 306 | } | ||||||
| 307 | |||||||
| 308 | # Unpack any -default parameters | ||||||
| 309 | 19 | 78 | for my $default (@{$self->{-default}}) { | ||||
| 19 | 80 | ||||||
| 310 | 23 | 431 | my ($tag, $attrib, $value) = | ||||
| 311 | $default =~ / | ||||||
| 312 | (\w+)\s+ # Tag | ||||||
| 313 | (\w+)\s* # Attribute | ||||||
| 314 | (?:=\s*(?=[\w'"])|=(?=~)) | ||||||
| 315 | ( '[^']*' # Single quoted | ||||||
| 316 | |"[^"]*" # Double quoted | ||||||
| 317 | |\w+ # Unquoted | ||||||
| 318 | |$regex # regex match | ||||||
| 319 | )\s* # Value | ||||||
| 320 | $/x; | ||||||
| 321 | |||||||
| 322 | 23 | 50 | 66 | croak "Badly formed default attribute string: $default" | |||
| 323 | unless defined $value; | ||||||
| 324 | 23 | 82 | $_ = lc for $tag, $attrib; | ||||
| 325 | |||||||
| 326 | 23 | 50 | 33 | 107 | croak "Conflicting defaults given:\n" | ||
| 327 | . " $tag $attrib=$self->{defaults}{$tag}{$attrib}\n" | ||||||
| 328 | . "and\n $tag $attrib=$value\n" | ||||||
| 329 | if exists $self->{defaults}{$tag}{$attrib} | ||||||
| 330 | and $self->{defaults}{$tag}{$attrib} ne $value; | ||||||
| 331 | |||||||
| 332 | 23 | 100 | 190 | if ($value =~ /^()()()$regex$/x) { | |||
| 333 | # Compile regex | ||||||
| 334 | 9 | 15 | $value =~ s/^~\s*/qr/; | ||||
| 335 | 9 | 1208 | $value = eval $value; | ||||
| 336 | } else { | ||||||
| 337 | # Strip quotes if present from match value | ||||||
| 338 | 14 | 74 | $value =~ s/^(['"])(.*)\1$/$2/; | ||||
| 339 | } | ||||||
| 340 | |||||||
| 341 | 23 | 128 | $self->{defaults}{$tag}{$attrib} = $value; | ||||
| 342 | } | ||||||
| 343 | |||||||
| 344 | 19 | 226 | $self->{root} = HTML::TreeBuilder->new; | ||||
| 345 | 19 | 8410 | $self->{root}->parse_content ($self->{-html}); | ||||
| 346 | 19 | 57288 | $self->{root}->elementify (); | ||||
| 347 | |||||||
| 348 | 19 | 4394 | 1 while $self->_cleanedupElt ($self->{root}); | ||||
| 349 | |||||||
| 350 | 19 | 40 | my $str = ''; | ||||
| 351 | |||||||
| 352 | 19 | 100 | 80 | if ($self->{-selfrender}) { | |||
| 353 | 11 | 28 | $self->{line} = ''; | ||||
| 354 | 11 | 44 | $str = $self->_render ($self->{root}, ''); | ||||
| 355 | } else { | ||||||
| 356 | 8 | 36 | my @renderOptions = (undef, ' ', {}); | ||||
| 357 | |||||||
| 358 | 8 | 100 | 49 | $renderOptions[1] = undef if $self->{-unformatted}; | |||
| 359 | 8 | 50 | 33 | $renderOptions[2] = undef if $self->{-compact}; | |||
| 360 | |||||||
| 361 | 8 | 18 | my $elt = $self->{root}; | ||||
| 362 | |||||||
| 363 | 8 | 100 | 28 | if (! $self->{-keepimplicit}) { | |||
| 364 | 7 | 42 | ($elt) = grep {$_->{_tag} eq 'body'} $self->{root}->descendents (); | ||||
| 31 | 5040 | ||||||
| 365 | } | ||||||
| 366 | |||||||
| 367 | 8 | 79 | $str .= ref $_ ? $_->as_HTML (@renderOptions) : $_ | ||||
| 368 | 8 | 50 | 14 | for @{$elt->{_content}}; | |||
| 369 | } | ||||||
| 370 | |||||||
| 371 | 19 | 6461 | return $str; | ||||
| 372 | } | ||||||
| 373 | |||||||
| 374 | sub _cleanedupElt { | ||||||
| 375 | 234 | 234 | 309 | my ($self, $parent) = @_; | |||
| 376 | |||||||
| 377 | 234 | 100 | 66 | 1870 | return 0 unless ref $parent && ref $parent->{_content}; | ||
| 378 | |||||||
| 379 | 185 | 228 | my $rescan = 1; # Set true to rescan the child element list | ||||
| 380 | 185 | 208 | my $touched; | ||||
| 381 | |||||||
| 382 | 185 | 393 | while ($rescan) { | ||||
| 383 | 219 | 243 | $rescan = 0; # Assume another scan not required after current scan | ||||
| 384 | 219 | 398 | ++$touched; | ||||
| 385 | |||||||
| 386 | 219 | 824 | for my $elt ($parent->content_list ()) { | ||||
| 387 | 410 | 100 | 3489 | next unless ref $elt; | |||
| 388 | |||||||
| 389 | 205 | 100 | 1100 | ++$rescan, last if $self->_cleanedupElt ($elt); | |||
| 390 | 188 | 100 | 580 | next if exists $elt->{_implicit}; | |||
| 391 | |||||||
| 392 | 120 | 100 | 911 | ++$rescan, last if $self->_removedDefaults ($elt); | |||
| 393 | 109 | 100 | 782 | ++$rescan, last if $self->_distributedElements ($elt); | |||
| 394 | 105 | 50 | 665 | ++$rescan, last if $self->_normalizedElements ($elt); | |||
| 395 | 105 | 100 | 284 | ++$rescan, last if $self->_expeledBr ($elt); | |||
| 396 | 104 | 100 | 344 | ++$rescan, last if $self->_removedEmpty ($elt); | |||
| 397 | } | ||||||
| 398 | } | ||||||
| 399 | |||||||
| 400 | 185 | 719 | return $touched > 1; | ||||
| 401 | } | ||||||
| 402 | |||||||
| 403 | sub _distributedElements { | ||||||
| 404 | 109 | 109 | 146 | my ($self, $elt) = @_; | |||
| 405 | |||||||
| 406 | 109 | 50 | 323 | return 0 unless $self->{-distribute}; | |||
| 407 | 109 | 100 | 100 | 1964 | return 0 | ||
| 408 | unless $elt->{_tag} =~ $self->{inline} | ||||||
| 409 | && $elt->{_tag} =~ $self->{needattr}; | ||||||
| 410 | |||||||
| 411 | 44 | 127 | my @elts = $elt->content_list (); | ||||
| 412 | 44 | 100 | 363 | my $blockElts = grep {ref $_ && $_->{_tag} =~ $self->{block}} @elts; | |||
| 65 | 303 | ||||||
| 413 | |||||||
| 414 | # Done unless all child elements are block level elements | ||||||
| 415 | 44 | 100 | 66 | 287 | return 0 unless @elts && @elts == $blockElts; | ||
| 416 | |||||||
| 417 | # Distribute inline element over and block elements | ||||||
| 418 | 4 | 20 | $elt->replace_with_content (); | ||||
| 419 | |||||||
| 420 | 4 | 105 | for my $block (@elts) { | ||||
| 421 | 5 | 33 | my @nested = $block->detach_content (); | ||||
| 422 | 5 | 62 | my $clone = $elt->clone (); | ||||
| 423 | |||||||
| 424 | 5 | 95 | $block->push_content ($clone); | ||||
| 425 | 5 | 77 | $clone->push_content (@nested); | ||||
| 426 | } | ||||||
| 427 | |||||||
| 428 | 4 | 71 | $elt->delete (); | ||||
| 429 | 4 | 91 | return 1; | ||||
| 430 | } | ||||||
| 431 | |||||||
| 432 | sub _normalizedElements { | ||||||
| 433 | 105 | 105 | 777 | my ($self, $elt) = @_; | |||
| 434 | |||||||
| 435 | 105 | 100 | 678 | return 0 unless $elt->{_tag} =~ $self->{inline}; | |||
| 436 | |||||||
| 437 | 42 | 163 | my @elts = $elt->content_list (); | ||||
| 438 | |||||||
| 439 | # Ok unless element contains single block level child | ||||||
| 440 | 42 | 50 | 100 | 529 | return 0 | ||
| 66 | |||||||
| 441 | unless @elts == 1 | ||||||
| 442 | && ref $elts[0] | ||||||
| 443 | && $elts[0]->{_tag} =~ $self->{block}; | ||||||
| 444 | |||||||
| 445 | # Invert order of inline and block elements | ||||||
| 446 | 0 | 0 | my @nested = $elts[0]->detach_content (); | ||||
| 447 | |||||||
| 448 | 0 | 0 | $elt->replace_with ($elts[0]); | ||||
| 449 | 0 | 0 | $elts[0]->push_content ($elt); | ||||
| 450 | 0 | 0 | $elt->push_content (@nested); | ||||
| 451 | 0 | 0 | $elt = $elts[0]; | ||||
| 452 | |||||||
| 453 | 0 | 0 | $_->replace_with_content ()->delete () | ||||
| 454 | 0 | 0 | for grep {$self->_removedEmpty ($_)} @elts; | ||||
| 455 | |||||||
| 456 | 0 | 0 | return 1; | ||||
| 457 | } | ||||||
| 458 | |||||||
| 459 | sub _expeledBr { | ||||||
| 460 | 105 | 105 | 141 | my ($self, $elt) = @_; | |||
| 461 | |||||||
| 462 | 105 | 100 | 100 | 1518 | return 0 unless $elt->{_tag} eq 'a' && $self->{-expelbr}; | ||
| 463 | 4 | 50 | 13 | return 0 unless exists $elt->{_content}; | |||
| 464 | |||||||
| 465 | 4 | 6 | my $adjusted; | ||||
| 466 | 4 | 7 | for my $index (0, -1) { | ||||
| 467 | 8 | 15 | my $br = $elt->{_content}[$index]; | ||||
| 468 | |||||||
| 469 | 8 | 100 | 66 | 947 | next unless ref $br && $br->{_tag} eq 'br'; | ||
| 470 | 2 | 100 | 18 | $index == 0 | |||
| 471 | ? $br->detach ()->preinsert ($br) | ||||||
| 472 | : $br->detach ()->postinsert ($br); | ||||||
| 473 | 2 | 130 | ++$adjusted; | ||||
| 474 | } | ||||||
| 475 | |||||||
| 476 | 4 | 16 | return $adjusted; | ||||
| 477 | } | ||||||
| 478 | |||||||
| 479 | sub _removedDefaults { | ||||||
| 480 | 120 | 120 | 161 | my ($self, $elt) = @_; | |||
| 481 | |||||||
| 482 | 120 | 100 | 2100 | return 0 unless exists $self->{defaults}{$elt->{_tag}}; | |||
| 483 | |||||||
| 484 | 32 | 204 | my $delAttribs = $self->{defaults}{$elt->{_tag}}; | ||||
| 485 | |||||||
| 486 | 32 | 87 | for my $attrib (keys %$delAttribs) { | ||||
| 487 | 141 | 100 | 324 | next unless exists $elt->{$attrib}; | |||
| 488 | |||||||
| 489 | 45 | 199 | my $value = $delAttribs->{$attrib}; | ||||
| 490 | 45 | 400 | my @parentAttribs; | ||||
| 491 | 45 | 113 | my @criteria = (_tag => $elt->{_tag}); | ||||
| 492 | |||||||
| 493 | 45 | 100 | 105 | if ('Regexp' eq ref $value) { | |||
| 494 | 19 | 100 | 127 | next unless $elt->{$attrib} =~ $value; | |||
| 495 | push @criteria, sub { | ||||||
| 496 | 20 | 20 | 4365 | my $attr = $_[0]->attr("$attrib"); | |||
| 497 | 20 | 50 | 227 | return 0 unless defined $attr; | |||
| 498 | 20 | 121 | return $attr !~ $value; | ||||
| 499 | 14 | 224 | }; | ||||
| 500 | } else { | ||||||
| 501 | 26 | 40 | my $value = $delAttribs->{$attrib}; | ||||
| 502 | |||||||
| 503 | 26 | 100 | 110 | next unless $elt->{$attrib} eq $value; | |||
| 504 | 20 | 575 | push @criteria, ($attrib => qr/^(?!\Q$value\E)/i); | ||||
| 505 | } | ||||||
| 506 | |||||||
| 507 | 34 | 286 | @parentAttribs = $elt->look_up (@criteria); | ||||
| 508 | |||||||
| 509 | # Don't delete attribute required to restore default | ||||||
| 510 | 34 | 100 | 3959 | next if @parentAttribs; | |||
| 511 | 24 | 129 | delete $elt->{$attrib}; | ||||
| 512 | } | ||||||
| 513 | |||||||
| 514 | 32 | 99 | return $self->_removedEmpty ($elt); | ||||
| 515 | } | ||||||
| 516 | |||||||
| 517 | sub _removedEmpty { | ||||||
| 518 | 136 | 136 | 199 | my ($self, $elt) = @_; | |||
| 519 | |||||||
| 520 | 136 | 100 | 1115 | return 0 if grep {!/^_/} $elt->all_attr_names (); | |||
| 466 | 2680 | ||||||
| 521 | 69 | 100 | 1985 | return 0 unless $elt->{_tag} =~ $self->{needattr}; | |||
| 522 | |||||||
| 523 | # Remove redundant element - no attributes left | ||||||
| 524 | 12 | 59 | $elt->replace_with ($elt->detach_content ()); | ||||
| 525 | 12 | 759 | $elt->delete (); | ||||
| 526 | 12 | 636 | return 1; | ||||
| 527 | } | ||||||
| 528 | |||||||
| 529 | sub _render { | ||||||
| 530 | 56 | 56 | 82 | my ($self, $elt, $indent) = @_; | |||
| 531 | |||||||
| 532 | 56 | 100 | 66 | 626 | return '' | ||
| 100 | |||||||
| 533 | unless $self->{-keepimplicit} || !$elt->{_implicit} || $elt->{_content}; | ||||||
| 534 | |||||||
| 535 | 45 | 58 | my $str = ''; | ||||
| 536 | |||||||
| 537 | 45 | 100 | 66 | 486 | if (! $self->{-keepimplicit} && $elt->{_implicit}) { | ||
| 50 | |||||||
| 100 | |||||||
| 538 | 22 | 200 | return $self->_renderContents ($elt, $indent); | ||||
| 539 | |||||||
| 540 | } elsif ($elt->{_tag} =~ $self->{selfclose}) { | ||||||
| 541 | 0 | 0 | $str .= $self->_append ("<$elt->{_tag} />", $indent); | ||||
| 542 | |||||||
| 543 | } elsif ($HTML::Tagset::isPhraseMarkup{$elt->{_tag}}) { | ||||||
| 544 | 7 | 34 | $str .= $self->_append ("<$elt->{_tag}", $indent); | ||||
| 545 | 7 | 22 | $str .= $self->_renderAttrs ($elt, $indent); | ||||
| 546 | 7 | 22 | $str .= $self->_renderContents ($elt, $indent); | ||||
| 547 | 7 | 30 | $str .= $self->_append ("$elt->{_tag}>",$indent); | ||||
| 548 | |||||||
| 549 | } else { | ||||||
| 550 | 16 | 53 | my $indented = "$indent$self->{-indent}"; | ||||
| 551 | |||||||
| 552 | 16 | 73 | $str = $self->_flushLine ($indent); | ||||
| 553 | 16 | 47 | $self->{line} .= "<$elt->{_tag}"; | ||||
| 554 | 16 | 31 | $self->{ishead} = 1; | ||||
| 555 | 16 | 51 | $str .= $self->_renderAttrs ($elt, $indented); | ||||
| 556 | 16 | 49 | $str .= $self->_renderContents ($elt, $indented); | ||||
| 557 | 16 | 62 | $str .= $self->_append ("$elt->{_tag}>", $indented); | ||||
| 558 | 16 | 161 | $str .= $self->_flushLine ($indented); | ||||
| 559 | } | ||||||
| 560 | |||||||
| 561 | 23 | 11747 | return $str; | ||||
| 562 | } | ||||||
| 563 | |||||||
| 564 | sub _append { | ||||||
| 565 | 39 | 39 | 178 | my ($self, $tail, $indent) = @_; | |||
| 566 | |||||||
| 567 | 39 | 50 | 238 | if ((length ($self->{line}) + length ($tail) + length ($indent)) > $self->{-maxlinelen}) { | |||
| 568 | 0 | 0 | my $str = $self->_flushLine ($indent); | ||||
| 569 | |||||||
| 570 | 0 | 0 | $self->{line} = $tail; | ||||
| 571 | 0 | 0 | return $str; | ||||
| 572 | } else { | ||||||
| 573 | 39 | 61 | $self->{line} .= $tail; | ||||
| 574 | 39 | 94 | return ''; | ||||
| 575 | } | ||||||
| 576 | } | ||||||
| 577 | |||||||
| 578 | sub _flushLine { | ||||||
| 579 | 32 | 32 | 46 | my ($self, $indent) = @_; | |||
| 580 | |||||||
| 581 | 32 | 100 | 231 | return '' unless length $self->{line}; | |||
| 582 | |||||||
| 583 | 18 | 18 | my $str; | ||||
| 584 | |||||||
| 585 | 18 | 100 | 44 | if ($self->{-unformatted}) { | |||
| 586 | 5 | 8 | $str = $self->{line}; | ||||
| 587 | |||||||
| 588 | } else { | ||||||
| 589 | 13 | 50 | 38 | if ($self->{ishead}) { | |||
| 590 | 13 | 52 | substr ($indent, -length $self->{-indent}) = ''; | ||||
| 591 | 13 | 26 | $self->{isHead} = undef; | ||||
| 592 | } | ||||||
| 593 | |||||||
| 594 | 13 | 35 | $str = "$indent$self->{line}\n"; | ||||
| 595 | } | ||||||
| 596 | |||||||
| 597 | 18 | 29 | $self->{line} = ''; | ||||
| 598 | 18 | 42 | return $str; | ||||
| 599 | } | ||||||
| 600 | |||||||
| 601 | sub _renderAttrs { | ||||||
| 602 | 23 | 23 | 42 | my ($self, $elt, $indent) = @_; | |||
| 603 | 23 | 28 | my $str = ''; | ||||
| 604 | 23 | 52 | my @attrs = grep {! /^_/} keys %$elt; | ||||
| 78 | 238 | ||||||
| 605 | |||||||
| 606 | $str .= $self->_append ( | ||||||
| 607 | qq( $_=") . encode_entities ($elt->{$_}) . qq("), | ||||||
| 608 | $indent | ||||||
| 609 | ) | ||||||
| 610 | 23 | 121 | for sort @attrs; | ||||
| 611 | 23 | 50 | $self->{line} .= '>'; | ||||
| 612 | 23 | 55 | return $str; | ||||
| 613 | } | ||||||
| 614 | |||||||
| 615 | sub _renderContents { | ||||||
| 616 | 45 | 45 | 75 | my ($self, $elt, $indent) = @_; | |||
| 617 | 45 | 135 | my $str = ''; | ||||
| 618 | |||||||
| 619 | 45 | 48 | for my $subElt (@{$elt->{_content}}) { | ||||
| 45 | 136 | ||||||
| 620 | 63 | 100 | 110 | if (! ref $subElt) { | |||
| 621 | 18 | 48 | $str .= $self->_renderText ($subElt, $indent); | ||||
| 622 | } else { | ||||||
| 623 | 45 | 203 | $str .= $self->_render ($subElt, $indent); | ||||
| 624 | } | ||||||
| 625 | } | ||||||
| 626 | |||||||
| 627 | 45 | 149 | return $str; | ||||
| 628 | } | ||||||
| 629 | |||||||
| 630 | |||||||
| 631 | sub _renderText { | ||||||
| 632 | 18 | 18 | 35 | my ($self, $elt, $indent) = @_; | |||
| 633 | 18 | 73 | my $str = $self->{line} . encode_entities ($elt); | ||||
| 634 | |||||||
| 635 | 18 | 100 | 344 | if ($self->{-unformatted}) { | |||
| 636 | 1 | 3 | $self->{line} = ''; | ||||
| 637 | |||||||
| 638 | } else { | ||||||
| 639 | 17 | 174 | my $maxLen = $self->{-maxlinelen} - length $indent; | ||||
| 640 | |||||||
| 641 | 17 | 83 | $str =~ s/(.{,$maxLen})\s+/$indent$1\n/g; | ||||
| 642 | 17 | 738 | ($str, $self->{line}) = $str =~ /(.*\n)?(.*)/; | ||||
| 643 | 17 | 50 | 52 | $str = '' unless defined $str; | |||
| 644 | 17 | 50 | 47 | $self->{line} = '' unless defined $self->{line}; | |||
| 645 | } | ||||||
| 646 | |||||||
| 647 | 18 | 67 | return $str; | ||||
| 648 | } | ||||||
| 649 | |||||||
| 650 | |||||||
| 651 | 1; | ||||||
| 652 | |||||||
| 653 | =head1 BUGS | ||||||
| 654 | |||||||
| 655 | =head3 p/div/p parsing issue | ||||||
| 656 | |||||||
| 657 | HTML::TreeBuilder 3.23 and earlier misparses: | ||||||
| 658 | |||||||
| 659 | foo |
||||||
| 660 | |||||||
| 661 | as: | ||||||
| 662 | |||||||
| 663 | foo |
||||||
| 664 | |||||||
| 665 | A work around in HTML::Normalize turns that into | ||||||
| 666 | |||||||
| 667 | foo |
||||||
| 668 | |||||||
| 669 | which is probably still incorrect - div elements should not nest within p | ||||||
| 670 | elements. A better fix for the problem requires HTML::TreeBuilder to be fixed. | ||||||
| 671 | |||||||
| 672 | =head3 Bug reports and feature requests | ||||||
| 673 | |||||||
| 674 | Please report any other bugs or feature requests to | ||||||
| 675 | C |
||||||
| 676 | L |
||||||
| 677 | I will be notified, and then you'll automatically be notified of progress on | ||||||
| 678 | your bug as I make changes. | ||||||
| 679 | |||||||
| 680 | =head1 SUPPORT | ||||||
| 681 | |||||||
| 682 | This module is supported by the author through CPAN. The following links may be | ||||||
| 683 | of assistance: | ||||||
| 684 | |||||||
| 685 | =over 4 | ||||||
| 686 | |||||||
| 687 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
| 688 | |||||||
| 689 | L |
||||||
| 690 | |||||||
| 691 | =item * CPAN Ratings | ||||||
| 692 | |||||||
| 693 | L |
||||||
| 694 | |||||||
| 695 | =item * RT: CPAN's request tracker | ||||||
| 696 | |||||||
| 697 | L |
||||||
| 698 | |||||||
| 699 | =item * Search CPAN | ||||||
| 700 | |||||||
| 701 | L |
||||||
| 702 | |||||||
| 703 | =back | ||||||
| 704 | |||||||
| 705 | =head1 ACKNOWLEDGEMENTS | ||||||
| 706 | |||||||
| 707 | This module was inspired by Bart Lateur's PerlMonks node 'Cleaning up HTML' | ||||||
| 708 | (L |
||||||
| 709 | and the author. | ||||||
| 710 | |||||||
| 711 | =head1 AUTHOR | ||||||
| 712 | |||||||
| 713 | Peter Jaquiery | ||||||
| 714 | CPAN ID: GRANDPA | ||||||
| 715 | grandpa@cpan.org | ||||||
| 716 | |||||||
| 717 | =head1 COPYRIGHT & LICENSE | ||||||
| 718 | |||||||
| 719 | This program is free software; you can redistribute | ||||||
| 720 | it and/or modify it under the same terms as Perl itself. | ||||||
| 721 | |||||||
| 722 | The full text of the license can be found in the | ||||||
| 723 | LICENSE file included with this module. | ||||||
| 724 | |||||||
| 725 | =cut | ||||||
| 726 |