| blib/lib/HTML/Laundry.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 396 | 487 | 81.3 |
| branch | 131 | 160 | 81.8 |
| condition | 11 | 21 | 52.3 |
| subroutine | 51 | 53 | 96.2 |
| pod | 18 | 18 | 100.0 |
| total | 607 | 739 | 82.1 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | ######################################################## | ||||||
| 2 | # Copyright © 2009 Six Apart, Ltd. | ||||||
| 3 | |||||||
| 4 | package HTML::Laundry; | ||||||
| 5 | |||||||
| 6 | 15 | 15 | 48448 | use strict; | |||
| 15 | 37 | ||||||
| 15 | 646 | ||||||
| 7 | 15 | 15 | 96 | use warnings; | |||
| 15 | 30 | ||||||
| 15 | 516 | ||||||
| 8 | |||||||
| 9 | 15 | 15 | 408 | use 5.008; | |||
| 15 | 66 | ||||||
| 15 | 612 | ||||||
| 10 | 15 | 15 | 13898 | use version; our $VERSION = 0.0103; | |||
| 15 | 38786 | ||||||
| 15 | 111 | ||||||
| 11 | |||||||
| 12 | =head1 NAME | ||||||
| 13 | |||||||
| 14 | HTML::Laundry - Perl module to clean HTML by the piece | ||||||
| 15 | |||||||
| 16 | =head1 VERSION | ||||||
| 17 | |||||||
| 18 | Version 0.0103 | ||||||
| 19 | |||||||
| 20 | =head1 SYNOPSIS | ||||||
| 21 | |||||||
| 22 | #!/usr/bin/perl -w | ||||||
| 23 | use strict; | ||||||
| 24 | use HTML::Laundry; | ||||||
| 25 | my $laundry = HTML::Laundry->new(); | ||||||
| 26 | my $snippet = q{ | ||||||
| 27 | |||||||
| 28 | If your gloves are sterilized |
||||||
| 29 | Rinse your mouth with Listerine | ||||||
| 30 | Blow disinfectant in her eyes" |
||||||
| 31 | -- X-Ray Spex, Germ-Free Adolescents | ||||||
| 32 | |||||||
| 33 | }; | ||||||
| 34 | my $germfree = $laundry->clean($snippet); | ||||||
| 35 | # $germfree is now: | ||||||
| 36 | # "You may get to touch her |
||||||
| 37 | # If your gloves are sterilized |
||||||
| 38 | # Rinse your mouth with Listerine |
||||||
| 39 | # Blow disinfectant in her eyes" |
||||||
| 40 | # -- X-Ray Spex, Germ-Free Adolescents | ||||||
| 41 | |||||||
| 42 | =head1 DESCRIPTION | ||||||
| 43 | |||||||
| 44 | HTML::Laundry is an L |
||||||
| 45 | meant for small pieces of HTML, such as user comments, Atom feed entries, | ||||||
| 46 | and the like, rather than full pages. Laundry takes these and returns clean, | ||||||
| 47 | sanitary, UTF-8-based XHTML. The parser's behavior may be changed with | ||||||
| 48 | callbacks, and the whitelist of acceptable elements and attributes may be | ||||||
| 49 | updated on the fly. | ||||||
| 50 | |||||||
| 51 | A snippet is cleaned several ways: | ||||||
| 52 | |||||||
| 53 | =over 4 | ||||||
| 54 | |||||||
| 55 | =item * Normalized, using HTML::Parser: attributes and elements will be | ||||||
| 56 | lowercased, empty elements such as will be forced into |
||||||
| 57 | the empty tag syntax if needed, and unknown attributes and elements will be | ||||||
| 58 | stripped. | ||||||
| 59 | |||||||
| 60 | =item * Sanitized, using an extensible whitelist of valid attributes and | ||||||
| 61 | elements based on Mark Pilgrim and Aaron Swartz's work on C |
||||||
| 62 | and attributes which are known to be possible attack vectors are removed. | ||||||
| 63 | |||||||
| 64 | =item * Tidied, using L |
||||||
| 65 | (as available): unclosed tags will be closed and the output generally | ||||||
| 66 | neatened; future version may also use tidying to deal with character encoding | ||||||
| 67 | issues. | ||||||
| 68 | |||||||
| 69 | =item * Optionally rebased, to turn relative URLs in attributes into | ||||||
| 70 | absolute ones. | ||||||
| 71 | |||||||
| 72 | =back | ||||||
| 73 | |||||||
| 74 | HTML::Laundry provides mechanisms to extend the list of known allowed | ||||||
| 75 | (and disallowed) tags, along with callback methods to allow scripts using | ||||||
| 76 | HTML::Laundry to extend the behavior in various ways. Future versions | ||||||
| 77 | may provide additional options for altering the rules used to clean | ||||||
| 78 | snippets. | ||||||
| 79 | |||||||
| 80 | Out of the box, HTML::Laundry does not currently know about the tag | ||||||
| 81 | and its children. For santizing full HTML pages, consider using L |
||||||
| 82 | or L |
||||||
| 83 | |||||||
| 84 | =cut | ||||||
| 85 | |||||||
| 86 | require HTML::Laundry::Rules; | ||||||
| 87 | require HTML::Laundry::Rules::Default; | ||||||
| 88 | |||||||
| 89 | require HTML::Parser; | ||||||
| 90 | 15 | 15 | 15896 | use HTML::Entities qw(encode_entities encode_entities_numeric); | |||
| 15 | 131611 | ||||||
| 15 | 2143 | ||||||
| 91 | 15 | 15 | 14694 | use URI; | |||
| 15 | 73022 | ||||||
| 15 | 591 | ||||||
| 92 | 15 | 15 | 134 | use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8); | |||
| 15 | 39 | ||||||
| 15 | 1244 | ||||||
| 93 | 15 | 15 | 13013 | use URI::Split qw(); | |||
| 15 | 9057 | ||||||
| 15 | 388 | ||||||
| 94 | 15 | 15 | 94 | use Scalar::Util 'blessed'; | |||
| 15 | 28 | ||||||
| 15 | 1251 | ||||||
| 95 | 15 | 15 | 13616 | use Switch; | |||
| 15 | 587376 | ||||||
| 15 | 98 | ||||||
| 96 | |||||||
| 97 | my @fragments; | ||||||
| 98 | my $unacceptable_count; | ||||||
| 99 | my $local_unacceptable_count; | ||||||
| 100 | my $cdata_dirty; | ||||||
| 101 | my $in_cdata; | ||||||
| 102 | my $tag_leading_whitespace = qr/ | ||||||
| 103 | (?<=<) # Left bracket followed by | ||||||
| 104 | \s* # any amount of whitespace | ||||||
| 105 | (\/?) # optionally with a forward slash | ||||||
| 106 | \s* # and then more whitespace | ||||||
| 107 | /x; | ||||||
| 108 | |||||||
| 109 | =head1 FUNCTIONS | ||||||
| 110 | |||||||
| 111 | =head2 new | ||||||
| 112 | |||||||
| 113 | Create an HTML::Laundry object. | ||||||
| 114 | |||||||
| 115 | my $l = HTML::Laundry->new(); | ||||||
| 116 | |||||||
| 117 | Takes an optional anonymous hash of arguments: | ||||||
| 118 | |||||||
| 119 | =over 4 | ||||||
| 120 | |||||||
| 121 | =item * base_url | ||||||
| 122 | |||||||
| 123 | This turns relative URIs, as in C< >, into |
||||||
| 124 | absolute URIs, as for use in feed parsing. | ||||||
| 125 | |||||||
| 126 | my $l = HTML::Laundry->new({ base_uri => 'http://example.com/foo/' }); | ||||||
| 127 | |||||||
| 128 | |||||||
| 129 | =item * notidy | ||||||
| 130 | |||||||
| 131 | Disable use of HTML::Tidy or HTML::Tidy::libXML, even if | ||||||
| 132 | they are available on your system. | ||||||
| 133 | |||||||
| 134 | my $l = HTML::Laundry->new({ notidy => 1 }); | ||||||
| 135 | |||||||
| 136 | =back | ||||||
| 137 | |||||||
| 138 | =cut | ||||||
| 139 | |||||||
| 140 | sub new { | ||||||
| 141 | 25 | 25 | 1 | 5967 | my $self = {}; | ||
| 142 | 25 | 56 | my $class = shift; | ||||
| 143 | 25 | 42 | my $args = shift; | ||||
| 144 | |||||||
| 145 | 25 | 100 | 204 | if ( blessed $args ) { | |||
| 100 | |||||||
| 146 | 1 | 50 | 6 | if ( $args->isa('HTML::Laundry::Rules') ) { | |||
| 147 | 1 | 4 | $args = { rules => $args }; | ||||
| 148 | } | ||||||
| 149 | else { | ||||||
| 150 | 0 | 0 | $args = {}; | ||||
| 151 | } | ||||||
| 152 | } | ||||||
| 153 | elsif ( ref $args ne 'HASH' ) { | ||||||
| 154 | 4 | 6 | my $rules; | ||||
| 155 | { | ||||||
| 156 | 4 | 4 | local $@; | ||||
| 4 | 4 | ||||||
| 157 | 4 | 8 | eval { | ||||
| 158 | 4 | 100 | 61 | $args->isa('HTML::Laundry::Rules') | |||
| 159 | and $rules = $args->new; | ||||||
| 160 | }; | ||||||
| 161 | } | ||||||
| 162 | 4 | 100 | 14 | if ($rules) { | |||
| 163 | 1 | 11 | $args = { rules => $args }; | ||||
| 164 | } | ||||||
| 165 | else { | ||||||
| 166 | 3 | 6 | $args = {}; | ||||
| 167 | } | ||||||
| 168 | } | ||||||
| 169 | |||||||
| 170 | 25 | 203 | $self->{tidy} = undef; | ||||
| 171 | 25 | 65 | $self->{tidy_added_inline} = {}; | ||||
| 172 | 25 | 53 | $self->{tidy_added_empty} = {}; | ||||
| 173 | 25 | 51 | $self->{base_uri} = q{}; | ||||
| 174 | 25 | 56 | bless $self, $class; | ||||
| 175 | 25 | 90 | $self->clear_callback('start_tag'); | ||||
| 176 | 25 | 78 | $self->clear_callback('end_tag'); | ||||
| 177 | 25 | 61 | $self->clear_callback('uri'); | ||||
| 178 | 25 | 60 | $self->clear_callback('text'); | ||||
| 179 | 25 | 62 | $self->clear_callback('output'); | ||||
| 180 | $self->{parser} = HTML::Parser->new( | ||||||
| 181 | api_version => 3, | ||||||
| 182 | utf8_mode => 1, | ||||||
| 183 | 481 | 481 | 1223 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
| 184 | 453 | 453 | 1026 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
| 185 | 25 | 139 | 396 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
| 139 | 375 | ||||||
| 186 | empty_element_tags => 1, | ||||||
| 187 | marked_sections => 1, | ||||||
| 188 | ); | ||||||
| 189 | $self->{cdata_parser} = HTML::Parser->new( | ||||||
| 190 | api_version => 3, | ||||||
| 191 | utf8_mode => 1, | ||||||
| 192 | 5 | 5 | 13 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
| 193 | 5 | 5 | 12 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
| 194 | 25 | 14 | 2708 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext' ], | |||
| 14 | 39 | ||||||
| 195 | empty_element_tags => 1, | ||||||
| 196 | unbroken_text => 1, | ||||||
| 197 | marked_sections => 0, | ||||||
| 198 | ); | ||||||
| 199 | 25 | 1652 | $self->initialize($args); | ||||
| 200 | |||||||
| 201 | 25 | 100 | 152 | if ( !$args->{notidy} ) { | |||
| 202 | 8 | 18 | $self->_generate_tidy; | ||||
| 203 | } | ||||||
| 204 | 25 | 87 | return $self; | ||||
| 205 | } | ||||||
| 206 | |||||||
| 207 | =head2 initialize | ||||||
| 208 | |||||||
| 209 | Instantiates the Laundry object properties based on an | ||||||
| 210 | HTML::Laundry::Rules module. | ||||||
| 211 | |||||||
| 212 | =cut | ||||||
| 213 | |||||||
| 214 | sub initialize { | ||||||
| 215 | 25 | 25 | 1 | 62 | my ( $self, $args ) = @_; | ||
| 216 | |||||||
| 217 | # Set defaults | ||||||
| 218 | 25 | 53 | $self->{tidy_added_tags} = undef; | ||||
| 219 | 25 | 50 | $self->{tidy_empty_tags} = undef; | ||||
| 220 | 25 | 46 | $self->{trim_trailing_whitespace} = 1; | ||||
| 221 | 25 | 46 | $self->{trim_tag_whitespace} = 0; | ||||
| 222 | 25 | 100 | 88 | $self->{base_uri} = URI->new( $args->{base_uri} ) | |||
| 223 | if $args->{base_uri}; | ||||||
| 224 | 25 | 3530 | my $rules = $args->{rules}; | ||||
| 225 | 25 | 66 | 215 | $rules ||= HTML::Laundry::Rules::Default->new(); | |||
| 226 | |||||||
| 227 | 25 | 85 | $self->{ruleset} = $rules; | ||||
| 228 | |||||||
| 229 | # Initialize based on ruleset | ||||||
| 230 | 25 | 140 | $self->{acceptable_a} = $rules->acceptable_a(); | ||||
| 231 | 25 | 135 | $self->{acceptable_e} = $rules->acceptable_e(); | ||||
| 232 | 25 | 153 | $self->{empty_e} = $rules->empty_e(); | ||||
| 233 | 25 | 148 | $self->{unacceptable_e} = $rules->unacceptable_e(); | ||||
| 234 | 25 | 144 | $self->{uri_list} = $rules->uri_list(); | ||||
| 235 | 25 | 143 | $self->{allowed_schemes} = $rules->allowed_schemes(); | ||||
| 236 | 25 | 139 | $rules->finalize_initialization($self); | ||||
| 237 | |||||||
| 238 | 25 | 38 | return; | ||||
| 239 | } | ||||||
| 240 | |||||||
| 241 | =head2 add_callback | ||||||
| 242 | |||||||
| 243 | Adds a callback of type "start_tag", "end_tag", "text", "uri", or "output" to | ||||||
| 244 | the appropriate internal array. | ||||||
| 245 | |||||||
| 246 | $l->add_callback('start_tag', sub { | ||||||
| 247 | my ($laundry, $tagref, $attrhashref) = @_; | ||||||
| 248 | # Now, perform actions and return | ||||||
| 249 | }); | ||||||
| 250 | |||||||
| 251 | start_tag, end_tag, text, and uri callbacks that return false values will | ||||||
| 252 | suppress the return value of the element they are processing; this allows | ||||||
| 253 | additional checks to be done (for instance, images can be allowed only from | ||||||
| 254 | whitelisted source domains). | ||||||
| 255 | |||||||
| 256 | =cut | ||||||
| 257 | |||||||
| 258 | sub add_callback { | ||||||
| 259 | 21 | 21 | 1 | 4696 | my ( $self, $action, $ref ) = @_; | ||
| 260 | 21 | 50 | 65 | return if ( ref($ref) ne 'CODE' ); | |||
| 261 | 21 | 29 | switch ($action) { | ||||
| 21 | 27 | ||||||
| 21 | 60 | ||||||
| 0 | 0 | ||||||
| 262 | 21 | 100 | 306 | case q{start_tag} { | |||
| 4 | 43 | ||||||
| 263 | 4 | 5 | push @{ $self->{start_tag_callback} }, $ref; | ||||
| 4 | 8 | ||||||
| 264 | 4 | 20 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 265 | 17 | 100 | 259 | case q{end_tag} { | |||
| 4 | 44 | ||||||
| 266 | 4 | 7 | push @{ $self->{end_tag_callback} }, $ref; | ||||
| 4 | 9 | ||||||
| 267 | 4 | 23 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 268 | 13 | 100 | 176 | case q{text} { | |||
| 6 | 70 | ||||||
| 269 | 6 | 10 | push @{ $self->{text_callback} }, $ref; | ||||
| 6 | 14 | ||||||
| 270 | 6 | 32 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 271 | 7 | 100 | 96 | case q{uri} { | |||
| 4 | 60 | ||||||
| 272 | 4 | 7 | push @{ $self->{uri_callback} }, $ref; | ||||
| 4 | 17 | ||||||
| 273 | 4 | 396 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 274 | 3 | 50 | 37 | case q{output} { | |||
| 3 | 36 | ||||||
| 275 | 3 | 4 | push @{ $self->{output_callback} }, $ref; | ||||
| 3 | 10 | ||||||
| 276 | 3 | 18 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 277 | } | ||||||
| 278 | 21 | 55 | return; | ||||
| 279 | } | ||||||
| 280 | |||||||
| 281 | =head2 clear_callback | ||||||
| 282 | |||||||
| 283 | Removes all callbacks of given type. | ||||||
| 284 | |||||||
| 285 | $l->clear_callback('start_tag'); | ||||||
| 286 | |||||||
| 287 | =cut | ||||||
| 288 | |||||||
| 289 | sub clear_callback { | ||||||
| 290 | 139 | 139 | 1 | 11105 | my ( $self, $action ) = @_; | ||
| 291 | 139 | 156 | switch ($action) { | ||||
| 139 | 163 | ||||||
| 139 | 366 | ||||||
| 0 | 0 | ||||||
| 292 | 139 | 100 | 1816 | case q{start_tag} { | |||
| 27 | 402 | ||||||
| 293 | 27 | 493 | 227 | $self->{start_tag_callback} = [ sub { 1; } ]; | |||
| 493 | 1072 | ||||||
| 294 | 27 | 220 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 295 | 112 | 100 | 1416 | case q{end_tag} { | |||
| 27 | 328 | ||||||
| 296 | 27 | 467 | 145 | $self->{end_tag_callback} = [ sub { 1; } ]; | |||
| 467 | 645 | ||||||
| 297 | 27 | 149 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 298 | 85 | 100 | 907 | case q{text} { | |||
| 29 | 290 | ||||||
| 299 | 29 | 143 | 178 | $self->{text_callback} = [ sub { 1; } ]; | |||
| 143 | 226 | ||||||
| 300 | 29 | 164 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 301 | 56 | 100 | 601 | case q{uri} { | |||
| 29 | 341 | ||||||
| 302 | 29 | 58 | 147 | $self->{uri_callback} = [ sub { 1; } ]; | |||
| 58 | 97 | ||||||
| 303 | 29 | 172 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 304 | 27 | 50 | 284 | case q{output} { | |||
| 27 | 266 | ||||||
| 305 | 27 | 462 | 160 | $self->{output_callback} = [ sub { 1; } ]; | |||
| 462 | 625 | ||||||
| 306 | 27 | 166 | } | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 307 | } | ||||||
| 308 | 139 | 407 | return; | ||||
| 309 | } | ||||||
| 310 | |||||||
| 311 | =head2 clean | ||||||
| 312 | |||||||
| 313 | Cleans a snippet of HTML, using the ruleset and object creation options given | ||||||
| 314 | to the Laundry object. The snippet should be passed as a scalar. | ||||||
| 315 | |||||||
| 316 | $output1 = $l->clean( ' The X-rays were penetrating' ); |
||||||
| 317 | $output2 = $l->clean( $snippet ); | ||||||
| 318 | |||||||
| 319 | =cut | ||||||
| 320 | |||||||
| 321 | sub clean { | ||||||
| 322 | 462 | 462 | 1 | 128097 | my ( $self, $chunk, $args ) = @_; | ||
| 323 | 462 | 996 | $self->_reset_state(); | ||||
| 324 | 462 | 50 | 1108 | if ( $self->{trim_tag_whitespace} ) { | |||
| 325 | 0 | 0 | $chunk =~ s/$tag_leading_whitespace/$1/gs; | ||||
| 326 | } | ||||||
| 327 | 462 | 661 | my $p = $self->{parser}; | ||||
| 328 | 462 | 561 | my $cp = $self->{cdata_parser}; | ||||
| 329 | 462 | 3875 | $p->parse($chunk); | ||||
| 330 | 462 | 100 | 33 | 1511 | if ( !$in_cdata && !$unacceptable_count ) { | ||
| 331 | 461 | 1171 | $p->eof(); | ||||
| 332 | } | ||||||
| 333 | 462 | 50 | 33 | 1097 | if ( $in_cdata && !$local_unacceptable_count ) { | ||
| 334 | 0 | 0 | $cp->eof(); | ||||
| 335 | } | ||||||
| 336 | 462 | 889 | my $output = $self->gen_output; | ||||
| 337 | 462 | 1159 | $cp->eof(); # Clear buffer if we haven't already | ||||
| 338 | 462 | 100 | 733 | if ($cdata_dirty) { # Overkill to get out of CDATA parser state | |||
| 339 | $self->{parser} = HTML::Parser->new( | ||||||
| 340 | api_version => 3, | ||||||
| 341 | start_h => | ||||||
| 342 | 7 | 7 | 18 | [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
| 343 | 9 | 9 | 21 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
| 344 | 4 | 19 | 40 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
| 19 | 41 | ||||||
| 345 | empty_element_tags => 1, | ||||||
| 346 | marked_sections => 1, | ||||||
| 347 | ); | ||||||
| 348 | } | ||||||
| 349 | else { | ||||||
| 350 | 458 | 1045 | $p->eof(); # Clear buffer if we haven't already | ||||
| 351 | } | ||||||
| 352 | 462 | 2298 | return $output; | ||||
| 353 | } | ||||||
| 354 | |||||||
| 355 | =head2 base_uri | ||||||
| 356 | |||||||
| 357 | Used to get or set the base_uri property, used in URI rebasing. | ||||||
| 358 | |||||||
| 359 | my $base_uri = $l->base_uri; # returns current base_uri | ||||||
| 360 | $l->base_uri(q{http://example.com}); # return 'http://example.com' | ||||||
| 361 | $l->base_uri(''); # unsets base_uri | ||||||
| 362 | |||||||
| 363 | =cut | ||||||
| 364 | |||||||
| 365 | sub base_uri { | ||||||
| 366 | 3 | 3 | 1 | 7 | my ( $self, $new_base ) = @_; | ||
| 367 | 3 | 100 | 66 | 15 | if ( defined $new_base and !ref $new_base ) { | ||
| 368 | 2 | 4 | $self->{base_uri} = $new_base; | ||||
| 369 | } | ||||||
| 370 | 3 | 15 | return $self->{base_uri}; | ||||
| 371 | } | ||||||
| 372 | |||||||
| 373 | sub _run_callbacks { | ||||||
| 374 | 1623 | 1623 | 1747 | my $self = shift; | |||
| 375 | 1623 | 1795 | my $action = shift; | ||||
| 376 | 1623 | 50 | 3036 | return unless $action; | |||
| 377 | 1623 | 2375 | my $type = $action . q{_callback}; | ||||
| 378 | 1623 | 1563 | for my $callback ( @{ $self->{$type} } ) { | ||||
| 1623 | 3557 | ||||||
| 379 | 1650 | 2944 | my $result = $callback->( $self, @_ ); | ||||
| 380 | 1650 | 100 | 18914 | return unless $result; | |||
| 381 | } | ||||||
| 382 | 1619 | 4238 | return 1; | ||||
| 383 | } | ||||||
| 384 | |||||||
| 385 | =head2 gen_output | ||||||
| 386 | |||||||
| 387 | Used to generate the final, XHTML output from the internal stack of text and | ||||||
| 388 | tag tokens. Generally meant to be used internally, but potentially useful for | ||||||
| 389 | callbacks that require a snapshot of what the output would look like | ||||||
| 390 | before the cleaning process is complete. | ||||||
| 391 | |||||||
| 392 | my $xhtml = $l->gen_output; | ||||||
| 393 | |||||||
| 394 | =cut | ||||||
| 395 | |||||||
| 396 | sub gen_output { | ||||||
| 397 | 462 | 462 | 1 | 525 | my $self = shift; | ||
| 398 | 462 | 50 | 907 | if ( !$self->_run_callbacks( q{output}, \@fragments ) ) { | |||
| 399 | 0 | 0 | return q{}; | ||||
| 400 | } | ||||||
| 401 | 462 | 946 | my $output = join '', @fragments; | ||||
| 402 | 462 | 50 | 1011 | if ( $self->{tidy} ) { | |||
| 403 | 0 | 0 | 0 | if ( $self->{tidy_engine} eq q{HTML::Tidy} ) { | |||
| 0 | |||||||
| 404 | 0 | 0 | $output = $self->{tidy}->clean($output); | ||||
| 405 | 0 | 0 | $self->{tidy}->clear_messages; | ||||
| 406 | } | ||||||
| 407 | elsif ( $self->{tidy_engine} eq q{HTML::Tidy::libXML} ) { | ||||||
| 408 | 0 | 0 | my $clean | ||||
| 409 | = $self->{tidy} | ||||||
| 410 | ->clean( $self->{tidy_head} . $output . $self->{tidy_foot}, | ||||||
| 411 | 'UTF-8', 1 ); | ||||||
| 412 | 0 | 0 | $output = substr( $clean, length $self->{tidy_head} ); | ||||
| 413 | 0 | 0 | $output = substr( $output, 0, -1 * length $self->{tidy_foot} ); | ||||
| 414 | } | ||||||
| 415 | } | ||||||
| 416 | 462 | 50 | 915 | if ( $self->{trim_trailing_whitespace} ) { | |||
| 417 | 462 | 1181 | $output =~ s/\s+$//; | ||||
| 418 | } | ||||||
| 419 | 462 | 837 | return $output; | ||||
| 420 | } | ||||||
| 421 | |||||||
| 422 | =head2 empty_elements | ||||||
| 423 | |||||||
| 424 | Returns a list of the Laundry object's known empty elements: elements such | ||||||
| 425 | as which must not contain any children. |
||||||
| 426 | |||||||
| 427 | =cut | ||||||
| 428 | |||||||
| 429 | sub empty_elements { | ||||||
| 430 | 0 | 0 | 1 | 0 | my ( $self, $listref ) = @_; | ||
| 431 | 0 | 0 | 0 | if ($listref) { | |||
| 432 | 0 | 0 | my @list = @{$listref}; | ||||
| 0 | 0 | ||||||
| 433 | 0 | 0 | my %empty = map { ( $_, 1 ) } @list; | ||||
| 0 | 0 | ||||||
| 434 | 0 | 0 | $self->{empty_e} = \%empty; | ||||
| 435 | } | ||||||
| 436 | 0 | 0 | return keys %{ $self->{empty_e} }; | ||||
| 0 | 0 | ||||||
| 437 | } | ||||||
| 438 | |||||||
| 439 | =head2 remove_empty_element | ||||||
| 440 | |||||||
| 441 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
| 442 | the "empty elements" list maintained by the Laundry object. | ||||||
| 443 | |||||||
| 444 | $l->remove_empty_element(['img', 'br']); # Let's break XHTML! | ||||||
| 445 | |||||||
| 446 | This will not affect the acceptable/unacceptable status of the elements. | ||||||
| 447 | |||||||
| 448 | =cut | ||||||
| 449 | |||||||
| 450 | sub remove_empty_element { | ||||||
| 451 | 4 | 4 | 1 | 442 | my ( $self, $new_e, $args ) = @_; | ||
| 452 | 4 | 8 | my $empty = $self->{empty_e}; | ||||
| 453 | 4 | 100 | 12 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 454 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
| 1 | 3 | ||||||
| 455 | 2 | 9 | $self->remove_empty_element( $e, $args ); | ||||
| 456 | } | ||||||
| 457 | } | ||||||
| 458 | else { | ||||||
| 459 | 3 | 6 | delete $empty->{$new_e}; | ||||
| 460 | } | ||||||
| 461 | 4 | 10 | return 1; | ||||
| 462 | } | ||||||
| 463 | |||||||
| 464 | =head2 acceptable_elements | ||||||
| 465 | |||||||
| 466 | Returns a list of the Laundry object's known acceptable elements, which will | ||||||
| 467 | not be stripped during the sanitizing process. | ||||||
| 468 | |||||||
| 469 | =cut | ||||||
| 470 | |||||||
| 471 | sub acceptable_elements { | ||||||
| 472 | 4 | 4 | 1 | 517 | my ( $self, $listref ) = @_; | ||
| 473 | 4 | 100 | 19 | if ( ref($listref) eq 'ARRAY' ) { | |||
| 474 | 1 | 2 | my @list = @{$listref}; | ||||
| 1 | 5 | ||||||
| 475 | 1 | 5 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
| 5 | 12 | ||||||
| 476 | 1 | 5 | $self->{acceptable_e} = \%acceptable; | ||||
| 477 | } | ||||||
| 478 | 4 | 22 | return keys %{ $self->{acceptable_e} }; | ||||
| 4 | 98 | ||||||
| 479 | } | ||||||
| 480 | |||||||
| 481 | =head2 add_acceptable_element | ||||||
| 482 | |||||||
| 483 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
| 484 | "acceptable elements" list maintained by the Laundry object. Items added in | ||||||
| 485 | this manner will automatically be removed from the "unacceptable elements" | ||||||
| 486 | list if they are present. | ||||||
| 487 | |||||||
| 488 | $l->add_acceptable_element('style'); | ||||||
| 489 | |||||||
| 490 | Elements which are empty may be flagged as such with an optional argument. | ||||||
| 491 | If this flag is set, all elements provided by the call will be added to | ||||||
| 492 | the "empty element" list. | ||||||
| 493 | |||||||
| 494 | $l->add_acceptable_element(['applet', 'script'], { empty => 1 }); | ||||||
| 495 | |||||||
| 496 | =cut | ||||||
| 497 | |||||||
| 498 | sub add_acceptable_element { | ||||||
| 499 | 10 | 10 | 1 | 1624 | my ( $self, $new_e, $args ) = @_; | ||
| 500 | 10 | 16 | my $acceptable = $self->{acceptable_e}; | ||||
| 501 | 10 | 16 | my $empty = $self->{empty_e}; | ||||
| 502 | 10 | 11 | my $unacceptable = $self->{unacceptable_e}; | ||||
| 503 | 10 | 100 | 24 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 504 | 2 | 3 | foreach my $e ( @{$new_e} ) { | ||||
| 2 | 6 | ||||||
| 505 | 4 | 12 | $self->add_acceptable_element( $e, $args ); | ||||
| 506 | } | ||||||
| 507 | } | ||||||
| 508 | else { | ||||||
| 509 | 8 | 21 | $acceptable->{$new_e} = 1; | ||||
| 510 | 8 | 100 | 25 | if ( $args->{empty} ) { | |||
| 50 | |||||||
| 511 | 4 | 11 | $empty->{$new_e} = 1; | ||||
| 512 | 4 | 50 | 14 | if ( $self->{tidy} ) { | |||
| 513 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
| 514 | 0 | 0 | $self->{tidy_added_empty}->{$new_e} = 1; | ||||
| 515 | 0 | 0 | $self->_generate_tidy; | ||||
| 516 | } | ||||||
| 517 | } | ||||||
| 518 | elsif ( $self->{tidy} ) { | ||||||
| 519 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
| 520 | 0 | 0 | $self->_generate_tidy; | ||||
| 521 | } | ||||||
| 522 | 8 | 14 | delete $unacceptable->{$new_e}; | ||||
| 523 | |||||||
| 524 | } | ||||||
| 525 | 10 | 24 | return 1; | ||||
| 526 | } | ||||||
| 527 | |||||||
| 528 | =head2 remove_acceptable_element | ||||||
| 529 | |||||||
| 530 | Removes an element (or, if given an array reference, multiple elements) to the | ||||||
| 531 | "acceptable elements" list maintained by the Laundry object. These items | ||||||
| 532 | (although not their child elements) will now be stripped during parsing. | ||||||
| 533 | |||||||
| 534 | $l->remove_acceptable_element(['img', 'h1', 'h2']); | ||||||
| 535 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
| 536 | # returns 'The Day the World Turned Day-Glo' | ||||||
| 537 | |||||||
| 538 | =cut | ||||||
| 539 | |||||||
| 540 | sub remove_acceptable_element { | ||||||
| 541 | 16 | 16 | 1 | 33 | my ( $self, $new_e, $args ) = @_; | ||
| 542 | 16 | 32 | my $acceptable = $self->{acceptable_e}; | ||||
| 543 | 16 | 100 | 34 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 544 | 2 | 5 | foreach my $e ( @{$new_e} ) { | ||||
| 2 | 5 | ||||||
| 545 | 4 | 13 | $self->remove_acceptable_element( $e, $args ); | ||||
| 546 | } | ||||||
| 547 | } | ||||||
| 548 | else { | ||||||
| 549 | 14 | 32 | delete $acceptable->{$new_e}; | ||||
| 550 | } | ||||||
| 551 | 16 | 32 | return 1; | ||||
| 552 | } | ||||||
| 553 | |||||||
| 554 | =head2 unacceptable_elements | ||||||
| 555 | |||||||
| 556 | Returns a list of the Laundry object's unacceptable elements, which will be | ||||||
| 557 | stripped -- B |
||||||
| 558 | |||||||
| 559 | =cut | ||||||
| 560 | |||||||
| 561 | sub unacceptable_elements { | ||||||
| 562 | 3 | 3 | 1 | 6 | my ( $self, $listref ) = @_; | ||
| 563 | 3 | 100 | 11 | if ( ref($listref) eq 'ARRAY' ) { | |||
| 564 | 1 | 3 | my @list = @{$listref}; | ||||
| 1 | 5 | ||||||
| 565 | 5 | 12 | my %unacceptable | ||||
| 566 | 1 | 3 | = map { $self->remove_acceptable_element($_); ( $_, 1 ); } @list; | ||||
| 5 | 16 | ||||||
| 567 | 1 | 4 | $self->{unacceptable_e} = \%unacceptable; | ||||
| 568 | } | ||||||
| 569 | 3 | 7 | return keys %{ $self->{unacceptable_e} }; | ||||
| 3 | 15 | ||||||
| 570 | } | ||||||
| 571 | |||||||
| 572 | =head2 add_unacceptable_element | ||||||
| 573 | |||||||
| 574 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
| 575 | "unacceptable elements" list maintained by the Laundry object. | ||||||
| 576 | |||||||
| 577 | $l->add_unacceptable_element(['h1', 'h2']); | ||||||
| 578 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
| 579 | # returns null string | ||||||
| 580 | |||||||
| 581 | =cut | ||||||
| 582 | |||||||
| 583 | sub add_unacceptable_element { | ||||||
| 584 | 4 | 4 | 1 | 1642 | my ( $self, $new_e, $args ) = @_; | ||
| 585 | 4 | 8 | my $unacceptable = $self->{unacceptable_e}; | ||||
| 586 | 4 | 100 | 12 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 587 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
| 1 | 3 | ||||||
| 588 | 2 | 12 | $self->add_unacceptable_element( $e, $args ); | ||||
| 589 | } | ||||||
| 590 | } | ||||||
| 591 | else { | ||||||
| 592 | 3 | 9 | $self->remove_acceptable_element($new_e); | ||||
| 593 | 3 | 6 | $unacceptable->{$new_e} = 1; | ||||
| 594 | } | ||||||
| 595 | 4 | 8 | return 1; | ||||
| 596 | } | ||||||
| 597 | |||||||
| 598 | =head2 remove_unacceptable_element | ||||||
| 599 | |||||||
| 600 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
| 601 | the "unacceptable elements" list maintained by the Laundry object. Note that | ||||||
| 602 | this does not automatically add the element to the acceptable_element list. | ||||||
| 603 | |||||||
| 604 | $l->clean(q{}); | ||||||
| 605 | # returns null string | ||||||
| 606 | $l->remove_unacceptable_element( q{script} ); | ||||||
| 607 | $l->clean(q{}); | ||||||
| 608 | # returns "alert('!')" | ||||||
| 609 | |||||||
| 610 | =cut | ||||||
| 611 | |||||||
| 612 | sub remove_unacceptable_element { | ||||||
| 613 | 4 | 4 | 1 | 7 | my ( $self, $new_e, $args ) = @_; | ||
| 614 | 4 | 7 | my $unacceptable = $self->{unacceptable_e}; | ||||
| 615 | 4 | 100 | 11 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 616 | 1 | 2 | foreach my $a ( @{$new_e} ) { | ||||
| 1 | 2 | ||||||
| 617 | 2 | 11 | $self->remove_unacceptable_element( $a, $args ); | ||||
| 618 | } | ||||||
| 619 | } | ||||||
| 620 | else { | ||||||
| 621 | 3 | 7 | delete $unacceptable->{$new_e}; | ||||
| 622 | } | ||||||
| 623 | 4 | 9 | return 1; | ||||
| 624 | } | ||||||
| 625 | |||||||
| 626 | =head2 acceptable_attributes | ||||||
| 627 | |||||||
| 628 | Returns a list of the Laundry object's known acceptable attributes, which will | ||||||
| 629 | not be stripped during the sanitizing process. | ||||||
| 630 | |||||||
| 631 | =cut | ||||||
| 632 | |||||||
| 633 | sub acceptable_attributes { | ||||||
| 634 | 3 | 3 | 1 | 6 | my ( $self, $listref ) = @_; | ||
| 635 | 3 | 100 | 11 | if ( ref($listref) eq 'ARRAY' ) { | |||
| 636 | 1 | 2 | my @list = @{$listref}; | ||||
| 1 | 5 | ||||||
| 637 | 1 | 2 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
| 3 | 9 | ||||||
| 638 | 1 | 4 | $self->{acceptable_a} = \%acceptable; | ||||
| 639 | } | ||||||
| 640 | 3 | 18 | return keys %{ $self->{acceptable_a} }; | ||||
| 3 | 41 | ||||||
| 641 | } | ||||||
| 642 | |||||||
| 643 | =head2 add_acceptable_attribute | ||||||
| 644 | |||||||
| 645 | Adds an attribute (or, if given an array reference, multiple attributes) to the | ||||||
| 646 | "acceptable attributes" list maintained by the Laundry object. | ||||||
| 647 | |||||||
| 648 | my $snippet = q{ "My dear Mr. Bennet," said his lady to |
||||||
| 649 | him one day, "have you heard that | ||||||
| 650 | Netherfield Park is let at last?" | ||||||
| 651 | }; | ||||||
| 652 | $l->clean( $snippet ); | ||||||
| 653 | # returns: | ||||||
| 654 | # "My dear Mr. Bennet," said his lady to him one day, |
||||||
| 655 | # "have you heard that Netherfield Park is let at | ||||||
| 656 | # last?" | ||||||
| 657 | $l->add_acceptable_attribute([austen:id, austen:footnote]); | ||||||
| 658 | $l->clean( $snippet ); | ||||||
| 659 | # returns: | ||||||
| 660 | # "My dear Mr. Bennet," said his lady to him |
||||||
| 661 | # one day, "have you heard that | ||||||
| 662 | # Netherfield Park is let at last?" | ||||||
| 663 | |||||||
| 664 | =cut | ||||||
| 665 | |||||||
| 666 | sub add_acceptable_attribute { | ||||||
| 667 | 4 | 4 | 1 | 1787 | my ( $self, $new_a, $args ) = @_; | ||
| 668 | 4 | 10 | my $acceptable = $self->{acceptable_a}; | ||||
| 669 | 4 | 100 | 14 | if ( ref($new_a) eq 'ARRAY' ) { | |||
| 670 | 1 | 3 | foreach my $a ( @{$new_a} ) { | ||||
| 1 | 3 | ||||||
| 671 | 2 | 8 | $self->add_acceptable_attribute( $a, $args ); | ||||
| 672 | } | ||||||
| 673 | } | ||||||
| 674 | else { | ||||||
| 675 | 3 | 10 | $acceptable->{$new_a} = 1; | ||||
| 676 | } | ||||||
| 677 | 4 | 10 | return 1; | ||||
| 678 | } | ||||||
| 679 | |||||||
| 680 | =head2 remove_acceptable_attribute | ||||||
| 681 | |||||||
| 682 | Removes an attribute (or, if given an array reference, multiple attributes) | ||||||
| 683 | from the "acceptable attributes" list maintained by the Laundry object. | ||||||
| 684 | |||||||
| 685 | $l->clean(q{ plover }); |
||||||
| 686 | # returns ' plover ' |
||||||
| 687 | $l->remove_acceptable_element( q{id} ); | ||||||
| 688 | $l->clean(q{ plover }); |
||||||
| 689 | # returns ' plover |
||||||
| 690 | |||||||
| 691 | =cut | ||||||
| 692 | |||||||
| 693 | sub remove_acceptable_attribute { | ||||||
| 694 | 4 | 4 | 1 | 8 | my ( $self, $new_a, $args ) = @_; | ||
| 695 | 4 | 6 | my $acceptable = $self->{acceptable_a}; | ||||
| 696 | 4 | 100 | 12 | if ( ref($new_a) eq 'ARRAY' ) { | |||
| 697 | 1 | 2 | foreach my $a ( @{$new_a} ) { | ||||
| 1 | 3 | ||||||
| 698 | 2 | 9 | $self->remove_acceptable_attribute( $a, $args ); | ||||
| 699 | } | ||||||
| 700 | } | ||||||
| 701 | else { | ||||||
| 702 | 3 | 9 | delete $acceptable->{$new_a}; | ||||
| 703 | } | ||||||
| 704 | 4 | 9 | return 1; | ||||
| 705 | } | ||||||
| 706 | |||||||
| 707 | sub _generate_tidy { | ||||||
| 708 | 8 | 8 | 8 | my $self = shift; | |||
| 709 | 8 | 8 | my $param = shift; | ||||
| 710 | 8 | 16 | $self->_generate_html_tidy; | ||||
| 711 | 8 | 50 | 40 | if ( !$self->{tidy} ) { | |||
| 712 | 8 | 18 | $self->_generate_html_tidy_libxml; | ||||
| 713 | } | ||||||
| 714 | 8 | 31 | return; | ||||
| 715 | } | ||||||
| 716 | |||||||
| 717 | sub _generate_html_tidy_libxml { | ||||||
| 718 | 8 | 8 | 11 | my $self = shift; | |||
| 719 | { | ||||||
| 720 | 8 | 8 | local $@; | ||||
| 8 | 8 | ||||||
| 721 | 8 | 13 | eval { | ||||
| 722 | 8 | 2696 | require HTML::Tidy::libXML; | ||||
| 723 | 0 | 0 | $self->{tidy} = HTML::Tidy::libXML->new(); | ||||
| 724 | 0 | 0 | $self->{tidy_head} = q{ | ||||
| 725 | |||||||
| 726 | "http://www.w3.org/TR/ html1/DTD/ html1-transitional.dtd"> | ||||||
| 727 | }; | ||||||
| 728 | 0 | 0 | $self->{tidy_foot} = q{ | ||||
| 729 | }; | ||||||
| 730 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy::libXML}; | ||||
| 731 | 0 | 0 | 1; | ||||
| 732 | }; | ||||||
| 733 | } | ||||||
| 734 | } | ||||||
| 735 | |||||||
| 736 | sub _generate_html_tidy { | ||||||
| 737 | 8 | 8 | 10 | my $self = shift; | |||
| 738 | { | ||||||
| 739 | 8 | 9 | local $@; | ||||
| 8 | 9 | ||||||
| 740 | 8 | 10 | eval { | ||||
| 741 | 8 | 3035 | require HTML::Tidy; | ||||
| 742 | 0 | 0 | $self->{tidy_ruleset} = $self->{ruleset}->tidy_ruleset; | ||||
| 743 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_inline} } ) { | |||
| 0 | 0 | ||||||
| 744 | 0 | 0 | $self->{tidy_ruleset}->{new_inline_tags} | ||||
| 745 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_inline} } ); | ||||
| 746 | } | ||||||
| 747 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_empty} } ) { | |||
| 0 | 0 | ||||||
| 748 | 0 | 0 | $self->{tidy_ruleset}->{new_empty_tags} | ||||
| 749 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_empty} } ); | ||||
| 750 | } | ||||||
| 751 | 0 | 0 | $self->{tidy} = HTML::Tidy->new( $self->{tidy_ruleset} ); | ||||
| 752 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy}; | ||||
| 753 | 0 | 0 | 1; | ||||
| 754 | }; | ||||||
| 755 | } | ||||||
| 756 | } | ||||||
| 757 | |||||||
| 758 | sub _reset_state { | ||||||
| 759 | 462 | 462 | 545 | my ($self) = @_; | |||
| 760 | 462 | 800 | @fragments = (); | ||||
| 761 | 462 | 509 | $unacceptable_count = 0; | ||||
| 762 | 462 | 451 | $local_unacceptable_count = 0; | ||||
| 763 | 462 | 485 | $in_cdata = 0; | ||||
| 764 | 462 | 439 | $cdata_dirty = 0; | ||||
| 765 | 462 | 603 | return; | ||||
| 766 | } | ||||||
| 767 | |||||||
| 768 | sub _tag_start_handler { | ||||||
| 769 | 493 | 493 | 730 | my ( $self, $tagname, $attr ) = @_; | |||
| 770 | 493 | 100 | 1107 | if ( !$self->_run_callbacks( q{start_tag}, \$tagname, $attr ) ) { | |||
| 771 | 1 | 7 | return; | ||||
| 772 | } | ||||||
| 773 | 492 | 100 | 987 | if ( !$in_cdata ) { | |||
| 774 | 487 | 570 | $cdata_dirty = 0; | ||||
| 775 | } | ||||||
| 776 | 492 | 477 | my @attributes; | ||||
| 777 | 492 | 514 | foreach my $k ( keys %{$attr} ) { | ||||
| 492 | 1335 | ||||||
| 778 | 259 | 100 | 749 | if ( $self->{acceptable_a}->{$k} ) { | |||
| 779 | 174 | 100 | 192 | if ( grep {/^$k$/} @{ $self->{uri_list}->{$tagname} } ) { | |||
| 151 | 969 | ||||||
| 174 | 499 | ||||||
| 780 | 58 | 217 | $self->_uri_handler( $tagname, \$k, \$attr->{$k}, | ||||
| 781 | $self->{base_uri} ); | ||||||
| 782 | } | ||||||
| 783 | |||||||
| 784 | # Allow uri handler to suppress insertion | ||||||
| 785 | 174 | 100 | 419 | if ($k) { | |||
| 786 | 157 | 549 | push @attributes, $k . q{="} . $attr->{$k} . q{"}; | ||||
| 787 | } | ||||||
| 788 | } | ||||||
| 789 | } | ||||||
| 790 | 492 | 943 | my $attributes = join q{ }, @attributes; | ||||
| 791 | 492 | 100 | 1185 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
| 792 | 376 | 100 | 775 | if ( $self->{empty_e}->{$tagname} ) { | |||
| 793 | 58 | 100 | 146 | if ($attributes) { | |||
| 794 | 19 | 32 | $attributes = $attributes . q{ }; | ||||
| 795 | } | ||||||
| 796 | 58 | 167 | push @fragments, "<$tagname $attributes/>"; | ||||
| 797 | } | ||||||
| 798 | else { | ||||||
| 799 | 318 | 100 | 1056 | if ($attributes) { | |||
| 800 | 122 | 234 | $attributes = q{ } . $attributes; | ||||
| 801 | } | ||||||
| 802 | 318 | 678 | push @fragments, "<$tagname$attributes>"; | ||||
| 803 | } | ||||||
| 804 | } | ||||||
| 805 | else { | ||||||
| 806 | 116 | 100 | 339 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
| 807 | 24 | 100 | 48 | if ($in_cdata) { | |||
| 808 | 3 | 4 | $local_unacceptable_count += 1; | ||||
| 809 | } | ||||||
| 810 | else { | ||||||
| 811 | 21 | 35 | $unacceptable_count += 1; | ||||
| 812 | } | ||||||
| 813 | } | ||||||
| 814 | } | ||||||
| 815 | 492 | 2809 | return; | ||||
| 816 | } | ||||||
| 817 | |||||||
| 818 | sub _tag_end_handler { | ||||||
| 819 | 467 | 467 | 652 | my ( $self, $tagname ) = @_; | |||
| 820 | 467 | 100 | 849 | if ( !$self->_run_callbacks( q{end_tag}, \$tagname ) ) { | |||
| 821 | 1 | 5 | return; | ||||
| 822 | } | ||||||
| 823 | 466 | 100 | 991 | if ( !$in_cdata ) { | |||
| 824 | 463 | 517 | $cdata_dirty = 0; | ||||
| 825 | } | ||||||
| 826 | 466 | 100 | 1039 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
| 827 | 346 | 100 | 857 | if ( !$self->{empty_e}->{$tagname} ) { | |||
| 828 | 316 | 639 | push @fragments, "$tagname>"; | ||||
| 829 | } | ||||||
| 830 | } | ||||||
| 831 | else { | ||||||
| 832 | 120 | 100 | 282 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
| 833 | 30 | 100 | 53 | if ($in_cdata) { | |||
| 834 | 1 | 2 | $local_unacceptable_count -= 1; | ||||
| 835 | 1 | 50 | 5 | $local_unacceptable_count = 0 | |||
| 836 | if ( $local_unacceptable_count < 0 ); | ||||||
| 837 | } | ||||||
| 838 | else { | ||||||
| 839 | 29 | 35 | $unacceptable_count -= 1; | ||||
| 840 | 29 | 100 | 87 | $unacceptable_count = 0 if ( $unacceptable_count < 0 ); | |||
| 841 | } | ||||||
| 842 | } | ||||||
| 843 | } | ||||||
| 844 | 466 | 1504 | return; | ||||
| 845 | } | ||||||
| 846 | |||||||
| 847 | sub _text_handler { | ||||||
| 848 | 172 | 172 | 310 | my ( $self, $text, $is_cdata ) = @_; | |||
| 849 | 172 | 100 | 100 | 481 | if ( $in_cdata && $local_unacceptable_count ) { | ||
| 850 | 1 | 3 | return; | ||||
| 851 | } | ||||||
| 852 | 171 | 100 | 426 | if ($unacceptable_count) { | |||
| 853 | 15 | 71 | return; | ||||
| 854 | } | ||||||
| 855 | 156 | 100 | 271 | if ($is_cdata) { | |||
| 856 | 13 | 22 | my $cp = $self->{cdata_parser}; | ||||
| 857 | 13 | 17 | $in_cdata = 1; | ||||
| 858 | 13 | 44 | $cp->parse($text); | ||||
| 859 | 13 | 100 | 28 | if ( !$local_unacceptable_count ) { | |||
| 860 | 11 | 45 | $cp->eof(); | ||||
| 861 | } | ||||||
| 862 | 13 | 17 | $cdata_dirty = 1; | ||||
| 863 | 13 | 13 | $in_cdata = 0; | ||||
| 864 | 13 | 46 | return; | ||||
| 865 | } | ||||||
| 866 | else { | ||||||
| 867 | 143 | 100 | 331 | if ( !$self->_run_callbacks( q{text}, \$text, $is_cdata ) ) { | |||
| 868 | 1 | 6 | return q{}; | ||||
| 869 | } | ||||||
| 870 | 142 | 486 | $text = encode_entities( $text, '<>&"' ); | ||||
| 871 | 142 | 8508 | $cdata_dirty = 0; | ||||
| 872 | } | ||||||
| 873 | 142 | 281 | push @fragments, $text; | ||||
| 874 | 142 | 619 | return; | ||||
| 875 | } | ||||||
| 876 | |||||||
| 877 | sub _uri_handler { | ||||||
| 878 | 58 | 58 | 104 | my ( $self, $tagname, $attr_ref, $value_ref, $base ) = @_; | |||
| 879 | 58 | 72 | my ( $attr, $value ) = ( ${$attr_ref}, ${$value_ref} ); | ||||
| 58 | 121 | ||||||
| 58 | 99 | ||||||
| 880 | 58 | 241 | $value =~ s/[`\x00-\x1f\x7f]+//g; | ||||
| 881 | 58 | 101 | $value =~ s/\ufffd//g; | ||||
| 882 | 58 | 286 | my $uri = URI->new($value); | ||||
| 883 | 58 | 69793 | $uri = $uri->canonical; | ||||
| 884 | 58 | 100 | 5703 | if ( !$self->_run_callbacks( q{uri}, $tagname, $attr, \$uri ) ) { | |||
| 885 | 1 | 3 | ${$attr_ref} = q{}; | ||||
| 1 | 4 | ||||||
| 886 | 1 | 4 | return undef; | ||||
| 887 | } | ||||||
| 888 | 57 | 100 | 66 | 332 | if ( $self->{allowed_schemes} and $uri->scheme ) { | ||
| 889 | 42 | 100 | 722 | unless ( $self->{allowed_schemes}->{ $uri->scheme } ) { | |||
| 890 | 16 | 196 | ${$attr_ref} = q{}; | ||||
| 16 | 31 | ||||||
| 891 | 16 | 65 | return undef; | ||||
| 892 | } | ||||||
| 893 | } | ||||||
| 894 | 41 | 100 | 688 | if ( $self->{base_uri} ) { | |||
| 895 | 8 | 63 | $uri = URI->new_abs( $uri->as_string, $self->{base_uri} ); | ||||
| 896 | } | ||||||
| 897 | 41 | 100 | 1760 | if ( $uri->scheme ) { # Not a local URI | |||
| 898 | 33 | 370 | my $host; | ||||
| 899 | { | ||||||
| 900 | 33 | 41 | local $@; | ||||
| 33 | 73 | ||||||
| 901 | 33 | 54 | eval { $host = $uri->host; }; | ||||
| 33 | 95 | ||||||
| 902 | } | ||||||
| 903 | 33 | 50 | 745 | if ($host) { | |||
| 904 | |||||||
| 905 | # We may need to manually unescape domain names | ||||||
| 906 | # to deal with issues like tinyarro.ws | ||||||
| 907 | 33 | 83 | my $utf8_host = $self->_decode_utf8($host); | ||||
| 908 | 33 | 76 | utf8::upgrade($utf8_host); | ||||
| 909 | 33 | 50 | 90 | if ( $uri->host ne $utf8_host ) { | |||
| 910 | |||||||
| 911 | # TODO: Optionally use Punycode in this case | ||||||
| 912 | |||||||
| 913 | 0 | 0 | 0 | 0 | if ( $uri->port and $uri->port == $uri->default_port ) { | ||
| 914 | 0 | 0 | $uri->port(undef); | ||||
| 915 | } | ||||||
| 916 | 0 | 0 | my $escaped_host = $self->_encode_utf8( $uri->host ); | ||||
| 917 | 0 | 0 | my $uri_str = $uri->canonical->as_string; | ||||
| 918 | 0 | 0 | $uri_str =~ s/$escaped_host/$utf8_host/; | ||||
| 919 | 0 | 0 | utf8::upgrade($uri_str); | ||||
| 920 | 0 | 0 | ${$value_ref} = $uri_str; | ||||
| 0 | 0 | ||||||
| 921 | 0 | 0 | return; | ||||
| 922 | } | ||||||
| 923 | } | ||||||
| 924 | } | ||||||
| 925 | 41 | 836 | ${$value_ref} = $uri->canonical->as_string; | ||||
| 41 | 2774 | ||||||
| 926 | 41 | 189 | return; | ||||
| 927 | } | ||||||
| 928 | |||||||
| 929 | sub _decode_utf8 { | ||||||
| 930 | 33 | 33 | 43 | my $self = shift; | |||
| 931 | 33 | 59 | my $orig = my $str = shift; | ||||
| 932 | 33 | 54 | $str =~ s/\%([0-9a-f]{2})/chr(hex($1))/egi; | ||||
| 0 | 0 | ||||||
| 933 | 33 | 50 | 154 | return $str if utf8::decode($str); | |||
| 934 | 0 | return $orig; | |||||
| 935 | } | ||||||
| 936 | |||||||
| 937 | sub _encode_utf8 { | ||||||
| 938 | 0 | 0 | my $self = shift; | ||||
| 939 | 0 | my $str = shift; | |||||
| 940 | 0 | my $highbit = qr/[^\w\$-_.+!*'(),]/; | |||||
| 941 | 0 | $str =~ s/($highbit)/ sprintf ("%%%02X", ord($1)) /ge; | |||||
| 0 | |||||||
| 942 | 0 | utf8::upgrade($str); | |||||
| 943 | 0 | return $str; | |||||
| 944 | } | ||||||
| 945 | |||||||
| 946 | =head1 SEE ALSO | ||||||
| 947 | |||||||
| 948 | There are a number of tools designed for sanitizing HTML, some of which | ||||||
| 949 | may be better suited than HTML::Laundry to particular circumstances. In | ||||||
| 950 | addition to L |
||||||
| 951 | L |
||||||
| 952 | solely for the purposes of sanitizing HTML from potential XSS attack vectors; | ||||||
| 953 | L |
||||||
| 954 | L |
||||||
| 955 | |||||||
| 956 | =head1 AUTHOR | ||||||
| 957 | |||||||
| 958 | Steve Cook, C<< |
||||||
| 959 | |||||||
| 960 | =head1 BUGS | ||||||
| 961 | |||||||
| 962 | Please report any bugs or feature requests on the GitHub page for this project, | ||||||
| 963 | http://github.com/snark/html-laundry. | ||||||
| 964 | |||||||
| 965 | =head1 ACKNOWLEDGMENTS | ||||||
| 966 | |||||||
| 967 | Thanks to Dave Cross and Vera Tobin. | ||||||
| 968 | |||||||
| 969 | =head1 SUPPORT | ||||||
| 970 | |||||||
| 971 | You can find documentation for this module with the perldoc command. | ||||||
| 972 | |||||||
| 973 | perldoc HTML::Laundry | ||||||
| 974 | |||||||
| 975 | =head1 COPYRIGHT & LICENSE | ||||||
| 976 | |||||||
| 977 | Copyright 2009 Six Apart, Ltd., all rights reserved. | ||||||
| 978 | |||||||
| 979 | This program is free software; you can redistribute it and/or modify it | ||||||
| 980 | under the same terms as Perl itself. | ||||||
| 981 | |||||||
| 982 | =cut | ||||||
| 983 | |||||||
| 984 | 1; # End of HTML::Laundry |