| blib/lib/HTML/Normalize.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 122 | 253 | 48.2 |
| branch | 39 | 126 | 30.9 |
| condition | 9 | 45 | 20.0 |
| subroutine | 20 | 27 | 74.0 |
| pod | 3 | 3 | 100.0 |
| total | 193 | 454 | 42.5 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package HTML::Normalize; | ||||||
| 2 | |||||||
| 3 | 1 | 1 | 102134 | use strict; | |||
| 1 | 2 | ||||||
| 1 | 29 | ||||||
| 4 | 1 | 1 | 5 | use warnings; | |||
| 1 | 2 | ||||||
| 1 | 27 | ||||||
| 5 | 1 | 1 | 7 | use HTML::Entities; | |||
| 1 | 2 | ||||||
| 1 | 75 | ||||||
| 6 | 1 | 1 | 8 | use HTML::TreeBuilder; | |||
| 1 | 1 | ||||||
| 1 | 6 | ||||||
| 7 | 1 | 1 | 23 | use HTML::Tagset; | |||
| 1 | 3 | ||||||
| 1 | 30 | ||||||
| 8 | 1 | 1 | 6 | use Carp; | |||
| 1 | 1 | ||||||
| 1 | 66 | ||||||
| 9 | |||||||
| 10 | BEGIN { | ||||||
| 11 | 1 | 1 | 7 | use Exporter (); | |||
| 1 | 2 | ||||||
| 1 | 21 | ||||||
| 12 | 1 | 1 | 4 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |||
| 1 | 2 | ||||||
| 1 | 121 | ||||||
| 13 | 1 | 1 | 7 | $VERSION = '1.0004'; | |||
| 14 | 1 | 17 | @ISA = qw(Exporter); | ||||
| 15 | 1 | 4 | @EXPORT = qw(); | ||||
| 16 | 1 | 15 | @EXPORT_OK = qw(); | ||||
| 17 | 1 | 3289 | %EXPORT_TAGS = (); | ||||
| 18 | } | ||||||
| 19 | |||||||
| 20 | =head1 NAME | ||||||
| 21 | |||||||
| 22 | HTML::Normalize - HTML light weight cleanup | ||||||
| 23 | |||||||
| 24 | =head1 VERSION | ||||||
| 25 | |||||||
| 26 | Version 1.0003 | ||||||
| 27 | |||||||
| 28 | =head1 SYNOPSIS | ||||||
| 29 | |||||||
| 30 | my $norm = HTML::Normalize->new (); | ||||||
| 31 | my $cleanHtml = $norm->cleanup (-html => $dirtyHtml); | ||||||
| 32 | |||||||
| 33 | =head1 DESCRIPTION | ||||||
| 34 | |||||||
| 35 | HTML::Normalize uses HTML::TreeBuilder to parse an HTML string then processes | ||||||
| 36 | the resultant tree to clean up various structural issues in the original HTML. | ||||||
| 37 | The result is then rendered using HTML::Element's as_HTML member. | ||||||
| 38 | |||||||
| 39 | Key structural clean ups fix tag soup (C<< foo >> becomes C<< | ||||||
| 40 | foo >>) and inline/block element nesting (C<< | ||||||
| 41 | foo >> becomes C<<foo >>). C<<>> |
||||||
| 42 | tags at the start or end of a link element are migrated out of the element. | ||||||
| 43 | |||||||
| 44 | Note that HTML::Normalize's approach to cleaning up tag soup is different than | ||||||
| 45 | that used by HTML::Tidy. HTML::Tidy tends to enforce nested and swaps end tags | ||||||
| 46 | to achieve that. HTML::Normalize inserts extra tags to allow correctly taged | ||||||
| 47 | overlapped markup. | ||||||
| 48 | |||||||
| 49 | HTML::Normalize can also remove attributes set to default values and empty | ||||||
| 50 | elements. For example a C<< >> | ||||||
| 51 | element would become and C<< >> and C<< | ||||||
| 52 | face="Verdana" size="1"> >> would be removed if Verdana size 1 is set as the | ||||||
| 53 | default font. | ||||||
| 54 | |||||||
| 55 | =head1 Methods | ||||||
| 56 | |||||||
| 57 | C |
||||||
| 58 | |||||||
| 59 | C |
||||||
| 60 | parses the HTML to generate the internal representation. It then edits the | ||||||
| 61 | internal representation and renders the result back into HTML. | ||||||
| 62 | |||||||
| 63 | Note that I |
||||||
| 64 | process. | ||||||
| 65 | |||||||
| 66 | Generally errors are handled by carping and may be detected in both I |
||||||
| 67 | I |
||||||
| 68 | |||||||
| 69 | =cut | ||||||
| 70 | |||||||
| 71 | =head2 new | ||||||
| 72 | |||||||
| 73 | Create a new C |
||||||
| 74 | |||||||
| 75 | my $norm = HTML::Normalize->new (); | ||||||
| 76 | |||||||
| 77 | =over 4 | ||||||
| 78 | |||||||
| 79 | =item I<-compact>: optional | ||||||
| 80 | |||||||
| 81 | Setting C<< -compact => 1 >> suppresses generation of 'optional' close tags. | ||||||
| 82 | This reduces the sizeof the output slightly at the expense of breaking any hope | ||||||
| 83 | of XHTML compliance. | ||||||
| 84 | |||||||
| 85 | =item I<-default>: optional - multiple | ||||||
| 86 | |||||||
| 87 | Define a default attribute for an element. Default attributes are removed if the | ||||||
| 88 | attribute value has not been overridden in a parent node. For element such as | ||||||
| 89 | 'font' this may result in the element being removed if no attributes remain. | ||||||
| 90 | |||||||
| 91 | C<-default> takes a string of the form 'tag attribute=value' as an argument. | ||||||
| 92 | For example: | ||||||
| 93 | |||||||
| 94 | -default => 'font face="Verdana"' | ||||||
| 95 | |||||||
| 96 | would specify that the face "Verdana" is the default face attribute for font | ||||||
| 97 | elements. | ||||||
| 98 | |||||||
| 99 | I |
||||||
| 100 | matches: | ||||||
| 101 | |||||||
| 102 | /(~|qr)\s*(.).*\1\s*$/ | ||||||
| 103 | |||||||
| 104 | except that the paired delimiters [], {}, () and <> are also accepted as pattern | ||||||
| 105 | delimiters. | ||||||
| 106 | |||||||
| 107 | Literal match values should not encode entities, but remember that quotes around | ||||||
| 108 | attribute values are optional for some values so the outer pair of quote | ||||||
| 109 | characters will be removed if present. The match value extends to the end of the | ||||||
| 110 | line and is not bounded by quote qharacters (except as noted earlier) so no | ||||||
| 111 | quoting of "special" characters is required - there are no special characters. | ||||||
| 112 | |||||||
| 113 | Multiple default attributes may be provided but only one default value is | ||||||
| 114 | allowed for any one tag/attribute pair. | ||||||
| 115 | |||||||
| 116 | Default values are case sensitive. However you can use the regular expression | ||||||
| 117 | form to overcome this limitation. | ||||||
| 118 | |||||||
| 119 | =item I<-distribute>: optional - default true | ||||||
| 120 | |||||||
| 121 | Distribute inline elements over children if the children are block level | ||||||
| 122 | elements. For example: | ||||||
| 123 | |||||||
| 124 | foo bar |
||||||
| 125 | |||||||
| 126 | becomes: | ||||||
| 127 | |||||||
| 128 | foo bar |
||||||
| 129 | |||||||
| 130 | This action is only taken if all the child elements are block level elements. | ||||||
| 131 | |||||||
| 132 | =item I<-expelbr>: optional - default true | ||||||
| 133 | |||||||
| 134 | If C<-expelbr> is true (the default) break elements at the edges of link | ||||||
| 135 | elements are expelled from the link element. Thus: | ||||||
| 136 | |||||||
| 137 | link text |
||||||
| 138 | |||||||
| 139 | becomes | ||||||
| 140 | |||||||
| 141 | link text |
||||||
| 142 | |||||||
| 143 | =item I<-html>: required | ||||||
| 144 | |||||||
| 145 | the HTML string to clean. | ||||||
| 146 | |||||||
| 147 | =item I<-indent>: optional - default ' ' | ||||||
| 148 | |||||||
| 149 | String used to indent formatted output. Ignored if I<-unformatted> is true. | ||||||
| 150 | |||||||
| 151 | =item I<-keepimplicit>: optional | ||||||
| 152 | |||||||
| 153 | as_HTML adds various HTML required sections such as head and body elements. By | ||||||
| 154 | default HTML::Normalize removes these elements so that it is suitable for | ||||||
| 155 | processing HTML fragments. Set C<-keepimplicit => 1> to render the implicit | ||||||
| 156 | elements. | ||||||
| 157 | |||||||
| 158 | Note that if this option is true, the extra nodes will be generated regardless | ||||||
| 159 | of their presence in the original HTML. | ||||||
| 160 | |||||||
| 161 | =item I<-maxlinelen>: optional - default 80 | ||||||
| 162 | |||||||
| 163 | Notional maximum line length if I<-selfrender> is true. The line length may be | ||||||
| 164 | exceeded if no suitable break position is found. Note that the current indent is | ||||||
| 165 | included in the line length. | ||||||
| 166 | |||||||
| 167 | =item I<-selfrender>: optional | ||||||
| 168 | |||||||
| 169 | Use the experimental HTML::Normalize code to render HTML rather than using | ||||||
| 170 | HTML::Element's renderer. This code has not been tested against a wide range of | ||||||
| 171 | HTML and may be unreliable. It's advantage is that it produces (in the author's | ||||||
| 172 | opinion) prettier output than HTML::Element's as_HTML member. | ||||||
| 173 | |||||||
| 174 | =item I<-unformatted>: optional | ||||||
| 175 | |||||||
| 176 | Suppress output formatting. By default as_HTML is called as | ||||||
| 177 | |||||||
| 178 | as_HTML (undef, ' ', {}) | ||||||
| 179 | |||||||
| 180 | which wraps and indents elements. Setting C<< -unformatted => 1 >> suppresses | ||||||
| 181 | generation of line breaks and indentation reducing the size of the output | ||||||
| 182 | slightly. | ||||||
| 183 | |||||||
| 184 | =back | ||||||
| 185 | |||||||
| 186 | =cut | ||||||
| 187 | |||||||
| 188 | my %paramTypes = ( | ||||||
| 189 | |||||||
| 190 | # 0: optional once | ||||||
| 191 | # 1: required once | ||||||
| 192 | # 2: optional, many allowed | ||||||
| 193 | -compact => [0, 0], | ||||||
| 194 | -default => [2, undef], | ||||||
| 195 | -distribute => [0, 1], | ||||||
| 196 | -expelbr => [0, 1], | ||||||
| 197 | -html => [1, undef], | ||||||
| 198 | -indent => [0, ' '], | ||||||
| 199 | -keepimplicit => [0, 0], | ||||||
| 200 | -maxlinelen => [0, 80], | ||||||
| 201 | -selfrender => [0, 0], | ||||||
| 202 | -unformatted => [0, 0], | ||||||
| 203 | ); | ||||||
| 204 | my $regex = ' | ||||||
| 205 | (?:~|qr)\s* | ||||||
| 206 | (?: | ||||||
| 207 | (.).*\4 # regex quote char delimited | ||||||
| 208 | |<.*> # regex <> delimited | ||||||
| 209 | |{.*} # regex {} delimited | ||||||
| 210 | |\[.*\] # regex [] delimited | ||||||
| 211 | |\(.*\) # regex () delimited | ||||||
| 212 | )i? # Regex match | ||||||
| 213 | '; | ||||||
| 214 | |||||||
| 215 | sub new { | ||||||
| 216 | 1 | 1 | 1 | 119 | my ($self, @params) = @_; | ||
| 217 | |||||||
| 218 | 1 | 50 | 6 | unless (ref $self) { | |||
| 219 | 1 | 3 | $self = bless {}, $self; | ||||
| 220 | 1 | 10 | $self->{both} = qr/^(del|ins)$/i; | ||||
| 221 | 1 | 6 | $self->{inline} = qr/^(b|i|s|font|span)$/i; | ||||
| 222 | 1 | 4 | $self->{block} = qr/^(p|table|div)$/i; | ||||
| 223 | 1 | 3 | $self->{needattr} = qr/^(font|span)$/i; | ||||
| 224 | 1 | 3 | $self->{selfclose} = qr/^(br)$/i; | ||||
| 225 | } | ||||||
| 226 | |||||||
| 227 | $self->_validateParams ( | ||||||
| 228 | 1 | 7 | \@params, | ||||
| 229 | [ | ||||||
| 230 | qw(-compact -default -distribute -expelbr -keepimplicit -unformatted ) | ||||||
| 231 | ], | ||||||
| 232 | [] | ||||||
| 233 | ); | ||||||
| 234 | |||||||
| 235 | # Add 'div' to the closure barriers list to avoid changing: | ||||||
| 236 | # foo |
||||||
| 237 | # into: | ||||||
| 238 | # foo |
||||||
| 239 | 1 | 3 | my $bar = \@HTML::Tagset::p_closure_barriers; | ||||
| 240 | 1 | 50 | 4 | push @$bar, 'div' unless grep { $_ eq 'div' } @$bar; | |||
| 15 | 30 | ||||||
| 241 | |||||||
| 242 | 1 | 4 | return $self; | ||||
| 243 | } | ||||||
| 244 | |||||||
| 245 | sub DESTROY { | ||||||
| 246 | 1 | 1 | 810 | my $self = shift; | |||
| 247 | 1 | 50 | 9 | $self->{root}->delete if $self->{root}; | |||
| 248 | } | ||||||
| 249 | |||||||
| 250 | sub _validateParams { | ||||||
| 251 | 2 | 2 | 6 | my ($self, $params, $okParams, $requiredParams) = @_; | |||
| 252 | |||||||
| 253 | 2 | 50 | 6 | $params ||= []; | |||
| 254 | 2 | 50 | 12 | $okParams ||= []; | |||
| 255 | 2 | 50 | 4 | $requiredParams ||= []; | |||
| 256 | |||||||
| 257 | # Validate parameters | ||||||
| 258 | 2 | 7 | while (@$params) { | ||||
| 259 | 1 | 8 | my ($key, $value) = splice @$params, 0, 2; | ||||
| 260 | |||||||
| 261 | 1 | 4 | $key = lc $key; | ||||
| 262 | 1 | 50 | 6 | croak "$key is not a valid parameter name" if !exists $paramTypes{$key}; | |||
| 263 | croak "$key parameter may only be used once" | ||||||
| 264 | 1 | 50 | 33 | 8 | if $paramTypes{$key}[0] < 2 && exists $self->{$key}; | ||
| 265 | |||||||
| 266 | 1 | 50 | 5 | if ($paramTypes{$key}[0] < 2) { | |||
| 267 | 1 | 2 | $self->{$key} = $value; | ||||
| 268 | 1 | 5 | next; | ||||
| 269 | } | ||||||
| 270 | |||||||
| 271 | 0 | 0 | push @{$self->{$key}}, $value; | ||||
| 0 | 0 | ||||||
| 272 | } | ||||||
| 273 | |||||||
| 274 | # Ensure we got required parameters | ||||||
| 275 | 2 | 6 | for my $key (@$requiredParams) { | ||||
| 276 | 1 | 50 | 4 | croak "Invalid parameter name: $key" unless exists $paramTypes{$key}; | |||
| 277 | 1 | 50 | 3 | $self->{$key} = $paramTypes{$key}[1] unless exists $self->{$key}; | |||
| 278 | 1 | 50 | 33 | 8 | next if $paramTypes{$key}[0] != 1 or exists $self->{$key}; | ||
| 279 | 0 | 0 | croak "The $key parameter is missing. It is required."; | ||||
| 280 | } | ||||||
| 281 | } | ||||||
| 282 | |||||||
| 283 | =head2 cleanup | ||||||
| 284 | |||||||
| 285 | C |
||||||
| 286 | |||||||
| 287 | my $cleanHtml = $norm->cleanup (); | ||||||
| 288 | |||||||
| 289 | =cut | ||||||
| 290 | |||||||
| 291 | sub cleanup { | ||||||
| 292 | 1 | 1 | 1 | 7 | my ($self, @params) = @_; | ||
| 293 | |||||||
| 294 | 1 | 27 | $self->_validateParams (\@params, [keys %paramTypes], ['-html']); | ||||
| 295 | |||||||
| 296 | # Check we got all required parameters and set any defaults | ||||||
| 297 | 1 | 6 | for my $param (keys %paramTypes) { | ||||
| 298 | 10 | 100 | 18 | next if exists $self->{$param}; | |||
| 299 | 9 | 100 | 21 | next if $paramTypes{$param}[0] > 1; | |||
| 300 | |||||||
| 301 | croak "A $param parameter must be provided. None was." | ||||||
| 302 | 8 | 50 | 15 | if $paramTypes{$param}[0] == 1; | |||
| 303 | |||||||
| 304 | # Set missing param to default | ||||||
| 305 | 8 | 16 | $self->{$param} = $paramTypes{$param}[1]; | ||||
| 306 | } | ||||||
| 307 | |||||||
| 308 | # Unpack any -default parameters | ||||||
| 309 | 1 | 2 | for my $default (@{$self->{-default}}) { | ||||
| 1 | 5 | ||||||
| 310 | 0 | 0 | my ($tag, $attrib, $value) = | ||||
| 311 | $default =~ / | ||||||
| 312 | (\w+)\s+ # Tag | ||||||
| 313 | (\w+)\s* # Attribute | ||||||
| 314 | (?:=\s*(?=[\w'"])|=(?=~)) | ||||||
| 315 | ( '[^']*' # Single quoted | ||||||
| 316 | |"[^"]*" # Double quoted | ||||||
| 317 | |\w+ # Unquoted | ||||||
| 318 | |$regex # regex match | ||||||
| 319 | )\s* # Value | ||||||
| 320 | $/x; | ||||||
| 321 | |||||||
| 322 | 0 | 0 | 0 | croak "Badly formed default attribute string: $default" | |||
| 323 | unless defined $value; | ||||||
| 324 | 0 | 0 | $_ = lc for $tag, $attrib; | ||||
| 325 | |||||||
| 326 | croak "Conflicting defaults given:\n" | ||||||
| 327 | . " $tag $attrib=$self->{defaults}{$tag}{$attrib}\n" | ||||||
| 328 | . "and\n $tag $attrib=$value\n" | ||||||
| 329 | if exists $self->{defaults}{$tag}{$attrib} | ||||||
| 330 | 0 | 0 | 0 | 0 | and $self->{defaults}{$tag}{$attrib} ne $value; | ||
| 331 | |||||||
| 332 | 0 | 0 | 0 | if ($value =~ /^()()()$regex$/x) { | |||
| 333 | # Compile regex | ||||||
| 334 | 0 | 0 | $value =~ s/^~\s*/qr/; | ||||
| 335 | 0 | 0 | $value = eval $value; | ||||
| 336 | } else { | ||||||
| 337 | # Strip quotes if present from match value | ||||||
| 338 | 0 | 0 | $value =~ s/^(['"])(.*)\1$/$2/; | ||||
| 339 | } | ||||||
| 340 | |||||||
| 341 | 0 | 0 | $self->{defaults}{$tag}{$attrib} = $value; | ||||
| 342 | } | ||||||
| 343 | |||||||
| 344 | 1 | 10 | $self->{root} = HTML::TreeBuilder->new; | ||||
| 345 | 1 | 378 | $self->{root}->parse_content ($self->{-html}); | ||||
| 346 | 1 | 1589 | $self->{root}->elementify (); | ||||
| 347 | |||||||
| 348 | 1 | 94 | 1 while $self->_cleanedupElt ($self->{root}); | ||||
| 349 | |||||||
| 350 | 1 | 3 | my $str = ''; | ||||
| 351 | |||||||
| 352 | 1 | 50 | 4 | if ($self->{-selfrender}) { | |||
| 353 | 0 | 0 | $self->{line} = ''; | ||||
| 354 | 0 | 0 | $str = $self->_render ($self->{root}, ''); | ||||
| 355 | 0 | 0 | 0 | $str .= "\n" if $str !~ /\n$/s; | |||
| 356 | } else { | ||||||
| 357 | 1 | 5 | my @renderOptions = (undef, ' ', {}); | ||||
| 358 | |||||||
| 359 | 1 | 50 | 3 | $renderOptions[1] = undef if $self->{-unformatted}; | |||
| 360 | 1 | 50 | 3 | $renderOptions[2] = undef if $self->{-compact}; | |||
| 361 | |||||||
| 362 | 1 | 2 | my $elt = $self->{root}; | ||||
| 363 | |||||||
| 364 | 1 | 50 | 4 | if (! $self->{-keepimplicit}) { | |||
| 365 | 1 | 5 | ($elt) = grep {$_->{_tag} eq 'body'} $self->{root}->descendents (); | ||||
| 7 | 298 | ||||||
| 366 | } | ||||||
| 367 | |||||||
| 368 | $str .= ref $_ ? $_->as_HTML (@renderOptions) : $_ | ||||||
| 369 | 1 | 50 | 2 | for @{$elt->{_content}}; | |||
| 1 | 11 | ||||||
| 370 | } | ||||||
| 371 | |||||||
| 372 | 1 | 826 | return $str; | ||||
| 373 | } | ||||||
| 374 | |||||||
| 375 | |||||||
| 376 | =head2 elements | ||||||
| 377 | |||||||
| 378 | C |
||||||
| 379 | generated by C |
||||||
| 380 | will return C |
||||||
| 381 | |||||||
| 382 | $norm->cleanup (); | ||||||
| 383 | my @elements = $norm->elements(); | ||||||
| 384 | |||||||
| 385 | =cut | ||||||
| 386 | |||||||
| 387 | sub elements { | ||||||
| 388 | 1 | 1 | 1 | 14 | my ($self) = @_; | ||
| 389 | 1 | 5 | my $root = $self->{root}; | ||||
| 390 | |||||||
| 391 | 1 | 6 | while ($root->implicit()) { | ||||
| 392 | 1 | 21 | ($root) = grep {$_->tag() ne 'head'} $root->content_list(); | ||||
| 2 | 15 | ||||||
| 393 | 1 | 50 | 9 | last if $root->tag() eq 'body'; | |||
| 394 | 0 | 0 | next; | ||||
| 395 | } | ||||||
| 396 | |||||||
| 397 | 1 | 9 | return $root->content_list(); | ||||
| 398 | } | ||||||
| 399 | |||||||
| 400 | |||||||
| 401 | sub _cleanedupElt { | ||||||
| 402 | 8 | 8 | 17 | my ($self, $parent) = @_; | |||
| 403 | |||||||
| 404 | 8 | 100 | 66 | 35 | return 0 unless ref $parent && ref $parent->{_content}; | ||
| 405 | |||||||
| 406 | 6 | 8 | my $rescan = 1; # Set true to rescan the child element list | ||||
| 407 | 6 | 9 | my $touched; | ||||
| 408 | |||||||
| 409 | 6 | 11 | while ($rescan) { | ||||
| 410 | 6 | 9 | $rescan = 0; # Assume another scan not required after current scan | ||||
| 411 | 6 | 9 | ++$touched; | ||||
| 412 | |||||||
| 413 | 6 | 14 | for my $elt ($parent->content_list ()) { | ||||
| 414 | 10 | 100 | 48 | next unless ref $elt; | |||
| 415 | |||||||
| 416 | 7 | 50 | 15 | ++$rescan, last if $self->_cleanedupElt ($elt); | |||
| 417 | 7 | 100 | 17 | next if exists $elt->{_implicit}; | |||
| 418 | |||||||
| 419 | 5 | 50 | 20 | ++$rescan, last if $self->_removedDefaults ($elt); | |||
| 420 | 5 | 50 | 11 | ++$rescan, last if $self->_distributedElements ($elt); | |||
| 421 | 5 | 50 | 9 | ++$rescan, last if $self->_normalizedElements ($elt); | |||
| 422 | 5 | 50 | 11 | ++$rescan, last if $self->_expeledBr ($elt); | |||
| 423 | 5 | 50 | 11 | ++$rescan, last if $self->_removedEmpty ($elt); | |||
| 424 | } | ||||||
| 425 | } | ||||||
| 426 | |||||||
| 427 | 6 | 14 | return $touched > 1; | ||||
| 428 | } | ||||||
| 429 | |||||||
| 430 | sub _distributedElements { | ||||||
| 431 | 5 | 5 | 9 | my ($self, $elt) = @_; | |||
| 432 | |||||||
| 433 | 5 | 50 | 11 | return 0 unless $self->{-distribute}; | |||
| 434 | return 0 | ||||||
| 435 | unless $elt->{_tag} =~ $self->{inline} | ||||||
| 436 | 5 | 50 | 33 | 33 | && $elt->{_tag} =~ $self->{needattr}; | ||
| 437 | |||||||
| 438 | 0 | 0 | my @elts = $elt->content_list (); | ||||
| 439 | 0 | 0 | 0 | my $blockElts = grep {ref $_ && $_->{_tag} =~ $self->{block}} @elts; | |||
| 0 | 0 | ||||||
| 440 | |||||||
| 441 | # Done unless all child elements are block level elements | ||||||
| 442 | 0 | 0 | 0 | 0 | return 0 unless @elts && @elts == $blockElts; | ||
| 443 | |||||||
| 444 | # Distribute inline element over and block elements | ||||||
| 445 | 0 | 0 | $elt->replace_with_content (); | ||||
| 446 | |||||||
| 447 | 0 | 0 | for my $block (@elts) { | ||||
| 448 | 0 | 0 | my @nested = $block->detach_content (); | ||||
| 449 | 0 | 0 | my $clone = $elt->clone (); | ||||
| 450 | |||||||
| 451 | 0 | 0 | $block->push_content ($clone); | ||||
| 452 | 0 | 0 | $clone->push_content (@nested); | ||||
| 453 | } | ||||||
| 454 | |||||||
| 455 | 0 | 0 | $elt->delete (); | ||||
| 456 | 0 | 0 | return 1; | ||||
| 457 | } | ||||||
| 458 | |||||||
| 459 | sub _normalizedElements { | ||||||
| 460 | 5 | 5 | 11 | my ($self, $elt) = @_; | |||
| 461 | |||||||
| 462 | 5 | 50 | 23 | return 0 unless $elt->{_tag} =~ $self->{inline}; | |||
| 463 | |||||||
| 464 | 0 | 0 | my @elts = $elt->content_list (); | ||||
| 465 | |||||||
| 466 | # Ok unless element contains single block level child | ||||||
| 467 | return 0 | ||||||
| 468 | unless @elts == 1 | ||||||
| 469 | && ref $elts[0] | ||||||
| 470 | 0 | 0 | 0 | 0 | && $elts[0]->{_tag} =~ $self->{block}; | ||
| 0 | |||||||
| 471 | |||||||
| 472 | # Invert order of inline and block elements | ||||||
| 473 | 0 | 0 | my @nested = $elts[0]->detach_content (); | ||||
| 474 | |||||||
| 475 | 0 | 0 | $elt->replace_with ($elts[0]); | ||||
| 476 | 0 | 0 | $elts[0]->push_content ($elt); | ||||
| 477 | 0 | 0 | $elt->push_content (@nested); | ||||
| 478 | 0 | 0 | $elt = $elts[0]; | ||||
| 479 | |||||||
| 480 | $_->replace_with_content ()->delete () | ||||||
| 481 | 0 | 0 | for grep {$self->_removedEmpty ($_)} @elts; | ||||
| 0 | 0 | ||||||
| 482 | |||||||
| 483 | 0 | 0 | return 1; | ||||
| 484 | } | ||||||
| 485 | |||||||
| 486 | sub _expeledBr { | ||||||
| 487 | 5 | 5 | 10 | my ($self, $elt) = @_; | |||
| 488 | |||||||
| 489 | 5 | 50 | 33 | 26 | return 0 unless $elt->{_tag} eq 'a' && $self->{-expelbr}; | ||
| 490 | 0 | 0 | 0 | return 0 unless exists $elt->{_content}; | |||
| 491 | |||||||
| 492 | 0 | 0 | my $adjusted; | ||||
| 493 | 0 | 0 | for my $index (0, -1) { | ||||
| 494 | 0 | 0 | my $br = $elt->{_content}[$index]; | ||||
| 495 | |||||||
| 496 | 0 | 0 | 0 | 0 | next unless ref $br && $br->{_tag} eq 'br'; | ||
| 497 | 0 | 0 | 0 | $index == 0 | |||
| 498 | ? $br->detach ()->preinsert ($br) | ||||||
| 499 | : $br->detach ()->postinsert ($br); | ||||||
| 500 | 0 | 0 | ++$adjusted; | ||||
| 501 | } | ||||||
| 502 | |||||||
| 503 | 0 | 0 | return $adjusted; | ||||
| 504 | } | ||||||
| 505 | |||||||
| 506 | sub _removedDefaults { | ||||||
| 507 | 5 | 5 | 12 | my ($self, $elt) = @_; | |||
| 508 | |||||||
| 509 | 5 | 50 | 19 | return 0 unless exists $self->{defaults}{$elt->{_tag}}; | |||
| 510 | |||||||
| 511 | 0 | 0 | my $delAttribs = $self->{defaults}{$elt->{_tag}}; | ||||
| 512 | |||||||
| 513 | 0 | 0 | for my $attrib (keys %$delAttribs) { | ||||
| 514 | 0 | 0 | 0 | next unless exists $elt->{$attrib}; | |||
| 515 | |||||||
| 516 | 0 | 0 | my $value = $delAttribs->{$attrib}; | ||||
| 517 | 0 | 0 | my @parentAttribs; | ||||
| 518 | 0 | 0 | my @criteria = (_tag => $elt->{_tag}); | ||||
| 519 | |||||||
| 520 | 0 | 0 | 0 | if ('Regexp' eq ref $value) { | |||
| 521 | 0 | 0 | 0 | next unless $elt->{$attrib} =~ $value; | |||
| 522 | push @criteria, sub { | ||||||
| 523 | 0 | 0 | 0 | my $attr = $_[0]->attr("$attrib"); | |||
| 524 | 0 | 0 | 0 | return 0 unless defined $attr; | |||
| 525 | 0 | 0 | return $attr !~ $value; | ||||
| 526 | 0 | 0 | }; | ||||
| 527 | } else { | ||||||
| 528 | 0 | 0 | my $value = $delAttribs->{$attrib}; | ||||
| 529 | |||||||
| 530 | 0 | 0 | 0 | next unless $elt->{$attrib} eq $value; | |||
| 531 | 0 | 0 | push @criteria, ($attrib => qr/^(?!\Q$value\E)/i); | ||||
| 532 | } | ||||||
| 533 | |||||||
| 534 | 0 | 0 | @parentAttribs = $elt->look_up (@criteria); | ||||
| 535 | |||||||
| 536 | # Don't delete attribute required to restore default | ||||||
| 537 | 0 | 0 | 0 | next if @parentAttribs; | |||
| 538 | 0 | 0 | delete $elt->{$attrib}; | ||||
| 539 | } | ||||||
| 540 | |||||||
| 541 | 0 | 0 | return $self->_removedEmpty ($elt); | ||||
| 542 | } | ||||||
| 543 | |||||||
| 544 | sub _removedEmpty { | ||||||
| 545 | 5 | 5 | 15 | my ($self, $elt) = @_; | |||
| 546 | |||||||
| 547 | 5 | 50 | 14 | return 0 if grep {!/^_/} $elt->all_attr_names (); | |||
| 14 | 67 | ||||||
| 548 | 5 | 50 | 26 | return 0 unless $elt->{_tag} =~ $self->{needattr}; | |||
| 549 | |||||||
| 550 | # Remove redundant element - no attributes left | ||||||
| 551 | 0 | $elt->replace_with ($elt->detach_content ()); | |||||
| 552 | 0 | $elt->delete (); | |||||
| 553 | 0 | return 1; | |||||
| 554 | } | ||||||
| 555 | |||||||
| 556 | sub _render { | ||||||
| 557 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
| 558 | |||||||
| 559 | return '' | ||||||
| 560 | 0 | 0 | 0 | unless $self->{-keepimplicit} || !$elt->{_implicit} || $elt->{_content}; | |||
| 0 | |||||||
| 561 | |||||||
| 562 | 0 | my $str = ''; | |||||
| 563 | |||||||
| 564 | 0 | 0 | 0 | if (! $self->{-keepimplicit} && $elt->{_implicit}) { | |||
| 0 | |||||||
| 0 | |||||||
| 565 | 0 | return $self->_renderContents ($elt, $indent); | |||||
| 566 | |||||||
| 567 | } elsif ($elt->{_tag} =~ $self->{selfclose}) { | ||||||
| 568 | 0 | $str .= $self->_append ("<$elt->{_tag} />", $indent); | |||||
| 569 | |||||||
| 570 | } elsif ($HTML::Tagset::isPhraseMarkup{$elt->{_tag}}) { | ||||||
| 571 | 0 | $str .= $self->_append ("<$elt->{_tag}", $indent); | |||||
| 572 | 0 | $str .= $self->_renderAttrs ($elt, $indent); | |||||
| 573 | 0 | $str .= $self->_renderContents ($elt, $indent); | |||||
| 574 | 0 | $str .= $self->_append ("$elt->{_tag}>",$indent); | |||||
| 575 | |||||||
| 576 | } else { | ||||||
| 577 | 0 | my $indented = "$indent$self->{-indent}"; | |||||
| 578 | |||||||
| 579 | 0 | $str = $self->_flushLine ($indent); | |||||
| 580 | 0 | $self->{line} .= "<$elt->{_tag}"; | |||||
| 581 | 0 | $self->{ishead} = 1; | |||||
| 582 | 0 | $str .= $self->_renderAttrs ($elt, $indented); | |||||
| 583 | 0 | $str .= $self->_renderContents ($elt, $indented); | |||||
| 584 | 0 | $str .= $self->_append ("$elt->{_tag}>", $indented); | |||||
| 585 | 0 | $str .= $self->_flushLine ($indented); | |||||
| 586 | } | ||||||
| 587 | |||||||
| 588 | 0 | return $str; | |||||
| 589 | } | ||||||
| 590 | |||||||
| 591 | sub _append { | ||||||
| 592 | 0 | 0 | my ($self, $tail, $indent) = @_; | ||||
| 593 | |||||||
| 594 | 0 | 0 | if ((length ($self->{line}) + length ($tail) + length ($indent)) > $self->{-maxlinelen}) { | ||||
| 595 | 0 | my $str = $self->_flushLine ($indent); | |||||
| 596 | |||||||
| 597 | 0 | $self->{line} = $tail; | |||||
| 598 | 0 | return $str; | |||||
| 599 | } else { | ||||||
| 600 | 0 | $self->{line} .= $tail; | |||||
| 601 | 0 | return ''; | |||||
| 602 | } | ||||||
| 603 | } | ||||||
| 604 | |||||||
| 605 | sub _flushLine { | ||||||
| 606 | 0 | 0 | my ($self, $indent) = @_; | ||||
| 607 | |||||||
| 608 | 0 | 0 | return '' unless length $self->{line}; | ||||
| 609 | |||||||
| 610 | 0 | my $str; | |||||
| 611 | |||||||
| 612 | 0 | 0 | if ($self->{-unformatted}) { | ||||
| 613 | 0 | $str = $self->{line}; | |||||
| 614 | |||||||
| 615 | } else { | ||||||
| 616 | 0 | 0 | if ($self->{ishead}) { | ||||
| 617 | 0 | substr ($indent, -length $self->{-indent}) = ''; | |||||
| 618 | 0 | $self->{isHead} = undef; | |||||
| 619 | } | ||||||
| 620 | |||||||
| 621 | 0 | $str = "$indent$self->{line}\n"; | |||||
| 622 | } | ||||||
| 623 | |||||||
| 624 | 0 | $self->{line} = ''; | |||||
| 625 | 0 | $str =~ s/\s+\n\z/\n/s; | |||||
| 626 | 0 | return $str; | |||||
| 627 | } | ||||||
| 628 | |||||||
| 629 | sub _renderAttrs { | ||||||
| 630 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
| 631 | 0 | my $str = ''; | |||||
| 632 | 0 | my @attrs = grep {! /^_/} keys %$elt; | |||||
| 0 | |||||||
| 633 | |||||||
| 634 | $str .= $self->_append ( | ||||||
| 635 | qq( $_=") . encode_entities ($elt->{$_}) . qq("), | ||||||
| 636 | $indent | ||||||
| 637 | ) | ||||||
| 638 | 0 | for sort @attrs; | |||||
| 639 | 0 | $self->{line} .= '>'; | |||||
| 640 | 0 | return $str; | |||||
| 641 | } | ||||||
| 642 | |||||||
| 643 | sub _renderContents { | ||||||
| 644 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
| 645 | 0 | my $str = ''; | |||||
| 646 | |||||||
| 647 | 0 | for my $subElt (@{$elt->{_content}}) { | |||||
| 0 | |||||||
| 648 | 0 | 0 | if (! ref $subElt) { | ||||
| 649 | 0 | $str .= $self->_renderText ($subElt, $indent); | |||||
| 650 | } else { | ||||||
| 651 | 0 | $str .= $self->_render ($subElt, $indent); | |||||
| 652 | } | ||||||
| 653 | } | ||||||
| 654 | |||||||
| 655 | 0 | return $str; | |||||
| 656 | } | ||||||
| 657 | |||||||
| 658 | |||||||
| 659 | sub _renderText { | ||||||
| 660 | 0 | 0 | my ($self, $elt, $indent) = @_; | ||||
| 661 | 0 | my $str = $self->{line} . encode_entities ($elt); | |||||
| 662 | |||||||
| 663 | 0 | 0 | if ($self->{-unformatted}) { | ||||
| 664 | 0 | $self->{line} = ''; | |||||
| 665 | |||||||
| 666 | } else { | ||||||
| 667 | 0 | my $maxLen = $self->{-maxlinelen} - length $indent; | |||||
| 668 | |||||||
| 669 | 0 | 0 | $str =~ s/(.{1,$maxLen})\s+/$indent$1\n/g if length($str) > $maxLen; | ||||
| 670 | 0 | ($str, $self->{line}) = $str =~ /(.*\n)?(.*)/; | |||||
| 671 | 0 | 0 | $str = '' unless defined $str; | ||||
| 672 | 0 | 0 | $self->{line} = '' unless defined $self->{line}; | ||||
| 673 | } | ||||||
| 674 | |||||||
| 675 | 0 | return $str; | |||||
| 676 | } | ||||||
| 677 | |||||||
| 678 | |||||||
| 679 | 1; | ||||||
| 680 | |||||||
| 681 | =head1 BUGS | ||||||
| 682 | |||||||
| 683 | =head3 p/div/p parsing issue | ||||||
| 684 | |||||||
| 685 | HTML::TreeBuilder 3.23 and earlier misparses: | ||||||
| 686 | |||||||
| 687 | foo |
||||||
| 688 | |||||||
| 689 | as: | ||||||
| 690 | |||||||
| 691 | foo |
||||||
| 692 | |||||||
| 693 | A work around in HTML::Normalize turns that into | ||||||
| 694 | |||||||
| 695 | foo |
||||||
| 696 | |||||||
| 697 | which is probably still incorrect - div elements should not nest within p | ||||||
| 698 | elements. A better fix for the problem requires HTML::TreeBuilder to be fixed. | ||||||
| 699 | |||||||
| 700 | =head3 Bug reports and feature requests | ||||||
| 701 | |||||||
| 702 | Please report any other bugs or feature requests to | ||||||
| 703 | C |
||||||
| 704 | L |
||||||
| 705 | I will be notified, and then you'll automatically be notified of progress on | ||||||
| 706 | your bug as I make changes. | ||||||
| 707 | |||||||
| 708 | =head1 SUPPORT | ||||||
| 709 | |||||||
| 710 | This module is supported by the author through CPAN. The following links may be | ||||||
| 711 | of assistance: | ||||||
| 712 | |||||||
| 713 | =over 4 | ||||||
| 714 | |||||||
| 715 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
| 716 | |||||||
| 717 | L |
||||||
| 718 | |||||||
| 719 | =item * CPAN Ratings | ||||||
| 720 | |||||||
| 721 | L |
||||||
| 722 | |||||||
| 723 | =item * RT: CPAN's request tracker | ||||||
| 724 | |||||||
| 725 | L |
||||||
| 726 | |||||||
| 727 | =item * Search CPAN | ||||||
| 728 | |||||||
| 729 | L |
||||||
| 730 | |||||||
| 731 | =back | ||||||
| 732 | |||||||
| 733 | =head1 ACKNOWLEDGEMENTS | ||||||
| 734 | |||||||
| 735 | This module was inspired by Bart Lateur's PerlMonks node 'Cleaning up HTML' | ||||||
| 736 | (L |
||||||
| 737 | and the author. | ||||||
| 738 | |||||||
| 739 | =head1 AUTHOR | ||||||
| 740 | |||||||
| 741 | Peter Jaquiery | ||||||
| 742 | CPAN ID: GRANDPA | ||||||
| 743 | grandpa@cpan.org | ||||||
| 744 | |||||||
| 745 | =head1 COPYRIGHT & LICENSE | ||||||
| 746 | |||||||
| 747 | This program is free software; you can redistribute | ||||||
| 748 | it and/or modify it under the same terms as Perl itself. | ||||||
| 749 | |||||||
| 750 | The full text of the license can be found in the | ||||||
| 751 | LICENSE file included with this module. | ||||||
| 752 | |||||||
| 753 | =cut |