| blib/lib/HTML/StripScripts.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 367 | 383 | 95.8 |
| branch | 169 | 202 | 83.6 |
| condition | 44 | 55 | 80.0 |
| subroutine | 68 | 72 | 94.4 |
| pod | 31 | 31 | 100.0 |
| total | 679 | 743 | 91.3 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package HTML::StripScripts; | ||||||
| 2 | 10 | 10 | 426913 | use strict; | |||
| 10 | 26 | ||||||
| 10 | 573 | ||||||
| 3 | 10 | 10 | 59 | use warnings FATAL => 'all'; | |||
| 10 | 20 | ||||||
| 10 | 875 | ||||||
| 4 | |||||||
| 5 | 10 | 10 | 71 | use vars qw($VERSION); | |||
| 10 | 21 | ||||||
| 10 | 54299 | ||||||
| 6 | $VERSION = '1.05'; | ||||||
| 7 | |||||||
| 8 | =head1 NAME | ||||||
| 9 | |||||||
| 10 | HTML::StripScripts - Strip scripting constructs out of HTML | ||||||
| 11 | |||||||
| 12 | =head1 SYNOPSIS | ||||||
| 13 | |||||||
| 14 | use HTML::StripScripts; | ||||||
| 15 | |||||||
| 16 | my $hss = HTML::StripScripts->new({ Context => 'Inline' }); | ||||||
| 17 | |||||||
| 18 | $hss->input_start_document; | ||||||
| 19 | |||||||
| 20 | $hss->input_start(''); | ||||||
| 21 | $hss->input_text('hello, world!'); | ||||||
| 22 | $hss->input_end(''); | ||||||
| 23 | |||||||
| 24 | $hss->input_end_document; | ||||||
| 25 | |||||||
| 26 | print $hss->filtered_document; | ||||||
| 27 | |||||||
| 28 | =head1 DESCRIPTION | ||||||
| 29 | |||||||
| 30 | This module strips scripting constructs out of HTML, leaving as | ||||||
| 31 | much non-scripting markup in place as possible. This allows web | ||||||
| 32 | applications to display HTML originating from an untrusted source | ||||||
| 33 | without introducing XSS (cross site scripting) vulnerabilities. | ||||||
| 34 | |||||||
| 35 | You will probably use L |
||||||
| 36 | this module directly. | ||||||
| 37 | |||||||
| 38 | The process is based on whitelists of tags, attributes and attribute | ||||||
| 39 | values. This approach is the most secure against disguised scripting | ||||||
| 40 | constructs hidden in malicious HTML documents. | ||||||
| 41 | |||||||
| 42 | As well as removing scripting constructs, this module ensures that | ||||||
| 43 | there is a matching end for each start tag, and that the tags are | ||||||
| 44 | properly nested. | ||||||
| 45 | |||||||
| 46 | Previously, in order to customise the output, you needed to subclass | ||||||
| 47 | C |
||||||
| 48 | can be done through the C |
||||||
| 49 | examples/declaration/ and examples/tags/ for cases where subclassing is | ||||||
| 50 | necessary.) | ||||||
| 51 | |||||||
| 52 | The HTML document must be parsed into start tags, end tags and | ||||||
| 53 | text before it can be filtered by this module. Use either | ||||||
| 54 | L |
||||||
| 55 | if you want to input an unparsed HTML document. | ||||||
| 56 | |||||||
| 57 | See examples/direct/ for an example of how to feed tokens directly to | ||||||
| 58 | HTML::StripScripts. | ||||||
| 59 | |||||||
| 60 | =head1 CONSTRUCTORS | ||||||
| 61 | |||||||
| 62 | =over | ||||||
| 63 | |||||||
| 64 | =item new ( CONFIG ) | ||||||
| 65 | |||||||
| 66 | Creates a new C |
||||||
| 67 | particular filtering policy. If present, the CONFIG parameter | ||||||
| 68 | must be a hashref. The following keys are recognized (unrecognized | ||||||
| 69 | keys will be silently ignored). | ||||||
| 70 | |||||||
| 71 | $s = HTML::Stripscripts->new({ | ||||||
| 72 | Context => 'Document|Flow|Inline|NoTags', | ||||||
| 73 | BanList => [qw( br img )] | {br => '1', img => '1'}, | ||||||
| 74 | BanAllBut => [qw(p div span)], | ||||||
| 75 | AllowSrc => 0|1, | ||||||
| 76 | AllowHref => 0|1, | ||||||
| 77 | AllowRelURL => 0|1, | ||||||
| 78 | AllowMailto => 0|1, | ||||||
| 79 | EscapeFiltered => 0|1, | ||||||
| 80 | Rules => { See below for details }, | ||||||
| 81 | }); | ||||||
| 82 | |||||||
| 83 | =over | ||||||
| 84 | |||||||
| 85 | =item C |
||||||
| 86 | |||||||
| 87 | A string specifying the context in which the filtered document | ||||||
| 88 | will be used. This influences the set of tags that will be | ||||||
| 89 | allowed. | ||||||
| 90 | |||||||
| 91 | If present, the C |
||||||
| 92 | |||||||
| 93 | =over | ||||||
| 94 | |||||||
| 95 | =item C |
||||||
| 96 | |||||||
| 97 | If C |
||||||
| 98 | HTML document, including the C tag and C and C | ||||||
| 99 | sections. | ||||||
| 100 | |||||||
| 101 | =item C |
||||||
| 102 | |||||||
| 103 | If C |
||||||
| 104 | would expect to find in a document body are allowed, including | ||||||
| 105 | lists and tables but not including forms. | ||||||
| 106 | |||||||
| 107 | =item C |
||||||
| 108 | |||||||
| 109 | If C |
||||||
| 110 | and C are allowed. | ||||||
| 111 | |||||||
| 112 | =item C |
||||||
| 113 | |||||||
| 114 | If C |
||||||
| 115 | |||||||
| 116 | =back | ||||||
| 117 | |||||||
| 118 | The default C |
||||||
| 119 | |||||||
| 120 | =item C |
||||||
| 121 | |||||||
| 122 | If present, this option must be an arrayref or a hashref. Any tag that | ||||||
| 123 | would normally be allowed (because it presents no XSS hazard) will be | ||||||
| 124 | blocked if the lowercase name of the tag is in this list. | ||||||
| 125 | |||||||
| 126 | For example, in a guestbook application where C tags are used to |
||||||
| 127 | separate posts, you may wish to prevent posts from including C |
||||||
| 128 | tags, even though C is not an XSS risk. |
||||||
| 129 | |||||||
| 130 | =item C |
||||||
| 131 | |||||||
| 132 | If present, this option must be reference to an array holding a list of | ||||||
| 133 | lowercase tag names. This has the effect of adding all but the listed | ||||||
| 134 | tags to the ban list, so that only those tags listed will be allowed. | ||||||
| 135 | |||||||
| 136 | =item C |
||||||
| 137 | |||||||
| 138 | By default, the filter won't allow constructs that cause the browser to | ||||||
| 139 | fetch things automatically, such as C |
||||||
| 140 | If this option is present and true then those constructs will be | ||||||
| 141 | allowed. | ||||||
| 142 | |||||||
| 143 | =item C |
||||||
| 144 | |||||||
| 145 | By default, the filter won't allow constructs that cause the browser to | ||||||
| 146 | fetch things if the user clicks on something, such as the C |
||||||
| 147 | attribute in C tags. Set this option to a true value to allow this | ||||||
| 148 | type of construct. | ||||||
| 149 | |||||||
| 150 | =item C |
||||||
| 151 | |||||||
| 152 | By default, the filter won't allow relative URLs such as C<../foo.html> | ||||||
| 153 | in C |
||||||
| 154 | to allow them. C |
||||||
| 155 | for this to have any effect. | ||||||
| 156 | |||||||
| 157 | =item C |
||||||
| 158 | |||||||
| 159 | By default, C |
||||||
| 160 | a true value, then this construct will be allowed. This can be enabled | ||||||
| 161 | separately from AllowHref. | ||||||
| 162 | |||||||
| 163 | =item C |
||||||
| 164 | |||||||
| 165 | By default, any filtered tags are outputted as C<< >>. If | ||||||
| 166 | C |
||||||
| 167 | to HTML entities. | ||||||
| 168 | |||||||
| 169 | For instance: | ||||||
| 170 | |||||||
| 171 | --> <br> |
||||||
| 172 | |||||||
| 173 | =item C |
||||||
| 174 | |||||||
| 175 | The C |
||||||
| 176 | |||||||
| 177 | The focus is safety-first, so it is applied after all of the previous validation. | ||||||
| 178 | This means that you cannot all malicious data should already have been cleared. | ||||||
| 179 | |||||||
| 180 | Rules can be specified for tags and for attributes. Any tag or attribute | ||||||
| 181 | not explicitly listed will be handled by the default C<*> rules. | ||||||
| 182 | |||||||
| 183 | The following is a synopsis of all of the options that you can use to | ||||||
| 184 | configure rules. Below, an example is broken into sections and explained. | ||||||
| 185 | |||||||
| 186 | Rules => { | ||||||
| 187 | |||||||
| 188 | tag => 0 | 1 | sub { tag_callback } | ||||||
| 189 | | { | ||||||
| 190 | attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
| 191 | '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
| 192 | required => [qw(attrname attrname)], | ||||||
| 193 | tag => sub { tag_callback } | ||||||
| 194 | }, | ||||||
| 195 | |||||||
| 196 | '*' => 0 | 1 | sub { tag_callback } | ||||||
| 197 | | { | ||||||
| 198 | attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
| 199 | '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
| 200 | tag => sub { tag_callback } | ||||||
| 201 | } | ||||||
| 202 | |||||||
| 203 | } | ||||||
| 204 | |||||||
| 205 | EXAMPLE: | ||||||
| 206 | |||||||
| 207 | Rules => { | ||||||
| 208 | |||||||
| 209 | ########################## | ||||||
| 210 | ##### EXPLICIT RULES ##### | ||||||
| 211 | ########################## | ||||||
| 212 | |||||||
| 213 | ## Allow tags, reject |
||||||
| 214 | br => 1, | ||||||
| 215 | img => 0, | ||||||
| 216 | |||||||
| 217 | ## Send all tags to a sub |
||||||
| 218 | div => sub { tag_callback }, | ||||||
| 219 | |||||||
| 220 | ## Allow tags,and allow the 'cite' attribute |
||||||
| 221 | ## All other attributes are handled by the default C<*> | ||||||
| 222 | blockquote => { | ||||||
| 223 | cite => 1, | ||||||
| 224 | }, | ||||||
| 225 | |||||||
| 226 | ## Allow tags, and | ||||||
| 227 | a => { | ||||||
| 228 | |||||||
| 229 | ## Allow the 'title' attribute | ||||||
| 230 | title => 1, | ||||||
| 231 | |||||||
| 232 | ## Allow the 'href' attribute if it matches the regex | ||||||
| 233 | href => '^http://yourdomain.com' | ||||||
| 234 | OR href => qr{^http://yourdomain.com}, | ||||||
| 235 | |||||||
| 236 | ## 'style' attributes are handled by a sub | ||||||
| 237 | style => sub { attr_callback }, | ||||||
| 238 | |||||||
| 239 | ## All other attributes are rejected | ||||||
| 240 | '*' => 0, | ||||||
| 241 | |||||||
| 242 | ## Additionally, the tag should be handled by this sub | ||||||
| 243 | tag => sub { tag_callback}, | ||||||
| 244 | |||||||
| 245 | ## If the tag doesn't have these attributes, filter the tag | ||||||
| 246 | required => [qw(href title)], | ||||||
| 247 | |||||||
| 248 | }, | ||||||
| 249 | |||||||
| 250 | ########################## | ||||||
| 251 | ##### DEFAULT RULES ##### | ||||||
| 252 | ########################## | ||||||
| 253 | |||||||
| 254 | ## The default '*' rule - accepts all the same options as above. | ||||||
| 255 | ## If a tag or attribute is not mentioned above, then the default | ||||||
| 256 | ## rule is applied: | ||||||
| 257 | |||||||
| 258 | ## Reject all tags | ||||||
| 259 | '*' => 0, | ||||||
| 260 | |||||||
| 261 | ## Allow all tags and all attributes | ||||||
| 262 | '*' => 1, | ||||||
| 263 | |||||||
| 264 | ## Send all tags to the sub | ||||||
| 265 | '*' => sub { tag_callback }, | ||||||
| 266 | |||||||
| 267 | ## Allow all tags, reject all attributes | ||||||
| 268 | '*' => { '*' => 0 }, | ||||||
| 269 | |||||||
| 270 | ## Allow all tags, and | ||||||
| 271 | '*' => { | ||||||
| 272 | |||||||
| 273 | ## Allow the 'title' attribute | ||||||
| 274 | title => 1, | ||||||
| 275 | |||||||
| 276 | ## Allow the 'href' attribute if it matches the regex | ||||||
| 277 | href => '^http://yourdomain.com' | ||||||
| 278 | OR href => qr{^http://yourdomain.com}, | ||||||
| 279 | |||||||
| 280 | ## 'style' attributes are handled by a sub | ||||||
| 281 | style => sub { attr_callback }, | ||||||
| 282 | |||||||
| 283 | ## All other attributes are rejected | ||||||
| 284 | '*' => 0, | ||||||
| 285 | |||||||
| 286 | ## Additionally, all tags should be handled by this sub | ||||||
| 287 | tag => sub { tag_callback}, | ||||||
| 288 | |||||||
| 289 | }, | ||||||
| 290 | |||||||
| 291 | =over | ||||||
| 292 | |||||||
| 293 | =item Tag Callbacks | ||||||
| 294 | |||||||
| 295 | sub tag_callback { | ||||||
| 296 | my ($filter,$element) = (@_); | ||||||
| 297 | |||||||
| 298 | $element = { | ||||||
| 299 | tag => 'tag', | ||||||
| 300 | content => 'inner_html', | ||||||
| 301 | attr => { | ||||||
| 302 | attr_name => 'attr_value', | ||||||
| 303 | } | ||||||
| 304 | }; | ||||||
| 305 | return 0 | 1; | ||||||
| 306 | } | ||||||
| 307 | |||||||
| 308 | A tag callback accepts two parameters, the C<$filter> object and the C$element>. | ||||||
| 309 | It should return C<0> to completely ignore the tag and its content (which includes | ||||||
| 310 | any nested HTML tags), or C<1> to accept and output the tag. | ||||||
| 311 | |||||||
| 312 | The C<$element> is a hash ref containing the keys: | ||||||
| 313 | |||||||
| 314 | =item C |
||||||
| 315 | |||||||
| 316 | This is the tagname in lowercase, eg C, C , C |
||||||
| 317 | the tag value to an empty string, then the tag will not be outputted, but | ||||||
| 318 | the tag contents will. | ||||||
| 319 | |||||||
| 320 | =item C |
||||||
| 321 | |||||||
| 322 | This is the equivalent of DOM's innerHTML. It contains the text content | ||||||
| 323 | and any HTML tags contained within this element. You can change the content | ||||||
| 324 | or set it to an empty string so that it is not outputted. | ||||||
| 325 | |||||||
| 326 | =item C |
||||||
| 327 | |||||||
| 328 | C |
||||||
| 329 | |||||||
| 330 | =back | ||||||
| 331 | |||||||
| 332 | If for instance, you wanted to replace C<< >> tags with C<< >> tags, | ||||||
| 333 | you could do this: | ||||||
| 334 | |||||||
| 335 | sub b_callback { | ||||||
| 336 | my ($filter,$element) = @_; | ||||||
| 337 | $element->{tag} = 'span'; | ||||||
| 338 | $element->{attr}{style} = 'font-weight:bold'; | ||||||
| 339 | return 1; | ||||||
| 340 | } | ||||||
| 341 | |||||||
| 342 | =item Attribute Callbacks | ||||||
| 343 | |||||||
| 344 | sub attr_callback { | ||||||
| 345 | my ( $filter, $tag, $attr_name, $attr_val ) = @_; | ||||||
| 346 | return undef | '' | 'value'; | ||||||
| 347 | } | ||||||
| 348 | |||||||
| 349 | Attribute callbacks accept four parameters, the C<$filter> object, the C<$tag> | ||||||
| 350 | name, the C<$attr_name> and the C<$attr_value>. | ||||||
| 351 | |||||||
| 352 | It should return either C |
||||||
| 353 | used. An empty string keeps the attribute, but without a value. | ||||||
| 354 | |||||||
| 355 | =item C |
||||||
| 356 | |||||||
| 357 | It is not necessary to use C |
||||||
| 358 | via C |
||||||
| 359 | |||||||
| 360 | BanAllBut => [qw(p div span)] | ||||||
| 361 | |||||||
| 362 | The logic works as follows: | ||||||
| 363 | |||||||
| 364 | * If BanAllBut exists, then ban everything but the tags in the list | ||||||
| 365 | * Add to the ban list any elements in BanList | ||||||
| 366 | * Any tags mentioned explicitly in Rules (eg a => 0, br => 1) | ||||||
| 367 | are added or removed from the BanList | ||||||
| 368 | * A default rule of { '*' => 0 } would ban all tags except | ||||||
| 369 | those mentioned in Rules | ||||||
| 370 | * A default rule of { '*' => 1 } would allow all tags except | ||||||
| 371 | those disallowed in the ban list, or by explicit rules | ||||||
| 372 | |||||||
| 373 | =back | ||||||
| 374 | |||||||
| 375 | =cut | ||||||
| 376 | |||||||
| 377 | sub new { | ||||||
| 378 | 1487 | 1487 | 1 | 50381 | my ( $pkg, $cfg ) = @_; | ||
| 379 | |||||||
| 380 | 1487 | 66 | 11379 | my $self = bless {}, ref $pkg || $pkg; | |||
| 381 | 1487 | 4280 | $self->hss_init($cfg); | ||||
| 382 | 1487 | 5024 | return $self; | ||||
| 383 | } | ||||||
| 384 | |||||||
| 385 | =back | ||||||
| 386 | |||||||
| 387 | =head1 METHODS | ||||||
| 388 | |||||||
| 389 | This class provides the following methods: | ||||||
| 390 | |||||||
| 391 | =over | ||||||
| 392 | |||||||
| 393 | =item hss_init () | ||||||
| 394 | |||||||
| 395 | This method is called by new() and does the actual initialisation work | ||||||
| 396 | for the new HTML::StripScripts object. | ||||||
| 397 | |||||||
| 398 | =cut | ||||||
| 399 | |||||||
| 400 | sub hss_init { | ||||||
| 401 | 1487 | 1487 | 1 | 2208 | my ( $self, $cfg ) = @_; | ||
| 402 | 1487 | 100 | 3400 | $cfg ||= {}; | |||
| 403 | |||||||
| 404 | 1487 | 3758 | $self->{_hssCfg} = $cfg; | ||||
| 405 | |||||||
| 406 | 1487 | 4893 | $self->{_hssContext} = $self->init_context_whitelist; | ||||
| 407 | 1487 | 4195 | $self->{_hssAttrib} = $self->init_attrib_whitelist; | ||||
| 408 | 1487 | 3659 | $self->{_hssAttVal} = $self->init_attval_whitelist; | ||||
| 409 | 1487 | 5024 | $self->{_hssStyle} = $self->init_style_whitelist; | ||||
| 410 | 1487 | 3364 | $self->{_hssDeInter} = $self->init_deinter_whitelist; | ||||
| 411 | 1487 | 3822 | $self->{_hssBanList} = $self->_hss_prepare_ban_list($cfg); | ||||
| 412 | 1487 | 4767 | $self->{_hssRules} = $self->_hss_prepare_rules($cfg); | ||||
| 413 | } | ||||||
| 414 | |||||||
| 415 | =item input_start_document () | ||||||
| 416 | |||||||
| 417 | This method initializes the filter, and must be called once before | ||||||
| 418 | starting on each HTML document to be filtered. | ||||||
| 419 | |||||||
| 420 | =cut | ||||||
| 421 | |||||||
| 422 | sub input_start_document { | ||||||
| 423 | 1890 | 1890 | 1 | 117008 | my ( $self, $context ) = @_; | ||
| 424 | |||||||
| 425 | 1890 | 100 | 18711 | $self->{_hssStack} = [ { NAME => '', | |||
| 426 | CTX => $self->{_hssCfg}{Context} || 'Flow', | ||||||
| 427 | CONTENT => '', | ||||||
| 428 | } | ||||||
| 429 | ]; | ||||||
| 430 | 1890 | 4079 | $self->{_hssOutput} = ''; | ||||
| 431 | |||||||
| 432 | 1890 | 4184 | $self->output_start_document; | ||||
| 433 | } | ||||||
| 434 | |||||||
| 435 | =item input_start ( TEXT ) | ||||||
| 436 | |||||||
| 437 | Handles a start tag from the input document. TEXT must be the | ||||||
| 438 | full text of the tag, including angle-brackets. | ||||||
| 439 | |||||||
| 440 | =cut | ||||||
| 441 | |||||||
| 442 | sub input_start { | ||||||
| 443 | 3160 | 3160 | 1 | 12065 | my ( $self, $text ) = @_; | ||
| 444 | |||||||
| 445 | 3160 | 100 | 7691 | $self->_hss_accept_input_start($text) or $self->reject_start($text); | |||
| 446 | } | ||||||
| 447 | |||||||
| 448 | sub _hss_accept_input_start { | ||||||
| 449 | 3160 | 3160 | 17127 | my ( $self, $text ) = @_; | |||
| 450 | |||||||
| 451 | 3160 | 100 | 22110 | return 0 unless $text =~ m|^<([a-zA-Z0-9]+)\b(.*)>$|m; | |||
| 452 | 3153 | 11043 | my ( $tag, $attr ) = ( lc $1, $self->strip_nonprintable($2) ); | ||||
| 453 | |||||||
| 454 | 3153 | 50 | 10900 | return 0 if $self->{_hssSkipToEnd}; | |||
| 455 | 3153 | 50 | 33 | 21750 | if ( $tag eq 'script' or $tag eq 'style' ) { | ||
| 456 | 0 | 0 | $self->{_hssSkipToEnd} = $tag; | ||||
| 457 | 0 | 0 | return 0; | ||||
| 458 | } | ||||||
| 459 | |||||||
| 460 | 3153 | 100 | 8296 | return 0 if $self->_hss_tag_is_banned($tag); | |||
| 461 | |||||||
| 462 | 3119 | 7394 | my $allowed_attr = $self->{_hssAttrib}{$tag}; | ||||
| 463 | 3119 | 100 | 23106 | return 0 unless defined $allowed_attr; | |||
| 464 | |||||||
| 465 | 3117 | 100 | 7569 | return 0 unless $self->_hss_get_to_valid_context($tag); | |||
| 466 | |||||||
| 467 | 3114 | 8317 | my $default_filters = $self->{_hssRules}{'*'}; | ||||
| 468 | 3114 | 100 | 11896 | my $tag_filters = $self->{_hssRules}{$tag} | |||
| 469 | || $default_filters; | ||||||
| 470 | |||||||
| 471 | 3114 | 4409 | my %filtered_attr; | ||||
| 472 | 3114 | 26427 | while ( $attr | ||||
| 473 | =~ s#^\s*([\w\-]+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?## ) | ||||||
| 474 | { | ||||||
| 475 | 4365 | 10842 | my $key = lc $1; | ||||
| 476 | 4365 | 100 | 18540 | my $val = ( defined $2 ? $self->unquoted_to_canonical_form($2) | |||
| 100 | |||||||
| 100 | |||||||
| 477 | : defined $3 ? $self->quoted_to_canonical_form($3) | ||||||
| 478 | : defined $4 ? $self->quoted_to_canonical_form($4) | ||||||
| 479 | : '' | ||||||
| 480 | ); | ||||||
| 481 | |||||||
| 482 | 4365 | 9285 | my $value_class = $allowed_attr->{$key}; | ||||
| 483 | 4365 | 100 | 10402 | next unless defined $value_class; | |||
| 484 | |||||||
| 485 | 4364 | 10083 | my $attval_handler = $self->{_hssAttVal}{$value_class}; | ||||
| 486 | 4364 | 50 | 17098 | next unless defined $attval_handler; | |||
| 487 | |||||||
| 488 | 4364 | 4299 | my $attr_filter; | ||||
| 489 | 4364 | 100 | 10428 | if ($tag_filters) { | |||
| 490 | 4115 | 13420 | $attr_filter = | ||||
| 491 | $self->_hss_get_attr_filter( $default_filters, $tag_filters, | ||||||
| 492 | $key ); | ||||||
| 493 | |||||||
| 494 | # filter == 0 | ||||||
| 495 | 4115 | 100 | 13622 | next unless $attr_filter; | |||
| 496 | } | ||||||
| 497 | |||||||
| 498 | 3568 | 5259 | my $filtered_value = &{$attval_handler}( $self, $tag, $key, $val ); | ||||
| 3568 | 10105 | ||||||
| 499 | 3568 | 100 | 11221 | next unless defined $filtered_value; | |||
| 500 | |||||||
| 501 | # send value to filter if sub | ||||||
| 502 | 3555 | 100 | 100 | 22449 | if ( $tag_filters && ref $attr_filter ) { | ||
| 503 | 2392 | 7106 | $filtered_value | ||||
| 504 | = $attr_filter->( $self, $tag, $key, $filtered_value ); | ||||||
| 505 | 2392 | 100 | 16784 | next unless defined $filtered_value; | |||
| 506 | } | ||||||
| 507 | |||||||
| 508 | 3026 | 19755 | $filtered_attr{$key} = $filtered_value; | ||||
| 509 | |||||||
| 510 | } | ||||||
| 511 | |||||||
| 512 | # Check required attributes | ||||||
| 513 | 3114 | 100 | 12486 | if ( my $required = $tag_filters->{required} ) { | |||
| 514 | 3 | 6 | foreach my $key (@$required) { | ||||
| 515 | 5 | 100 | 66 | 40 | return 0 | ||
| 516 | unless defined $filtered_attr{$key} && length($filtered_attr{$key}); | ||||||
| 517 | } | ||||||
| 518 | } | ||||||
| 519 | |||||||
| 520 | # Check for callback | ||||||
| 521 | 3112 | 66 | 35163 | my $tag_callback = $tag_filters && $tag_filters->{tag} | |||
| 522 | || $default_filters->{tag}; | ||||||
| 523 | |||||||
| 524 | 3112 | 12487 | my $new_context = $self->{_hssContext}{ $self->{_hssStack}[0]{CTX} }{$tag}; | ||||
| 525 | |||||||
| 526 | 3112 | 37192 | my %stack_entry = ( NAME => $tag, | ||||
| 527 | ATTR => \%filtered_attr, | ||||||
| 528 | CTX => $new_context, | ||||||
| 529 | CALLBACK => $tag_callback, | ||||||
| 530 | CONTENT => '', | ||||||
| 531 | ); | ||||||
| 532 | 3112 | 100 | 8438 | if ( $new_context eq 'EMPTY' ) { | |||
| 533 | 1576 | 28076 | $self->output_stack_entry( \%stack_entry ); | ||||
| 534 | } | ||||||
| 535 | else { | ||||||
| 536 | 1536 | 1685 | unshift @{ $self->{_hssStack} }, \%stack_entry; | ||||
| 1536 | 4631 | ||||||
| 537 | |||||||
| 538 | } | ||||||
| 539 | |||||||
| 540 | 3112 | 27362 | return 1; | ||||
| 541 | } | ||||||
| 542 | |||||||
| 543 | =item input_end ( TEXT ) | ||||||
| 544 | |||||||
| 545 | Handles an end tag from the input document. TEXT must be the | ||||||
| 546 | full text of the end tag, including angle-brackets. | ||||||
| 547 | |||||||
| 548 | =cut | ||||||
| 549 | |||||||
| 550 | sub input_end { | ||||||
| 551 | 1508 | 1508 | 1 | 8189 | my ( $self, $text ) = @_; | ||
| 552 | |||||||
| 553 | 1508 | 100 | 4819 | $self->_hss_accept_input_end($text) or $self->reject_end($text); | |||
| 554 | } | ||||||
| 555 | |||||||
| 556 | sub _hss_accept_input_end { | ||||||
| 557 | 1508 | 1508 | 1876 | my ( $self, $text ) = @_; | |||
| 558 | |||||||
| 559 | 1508 | 100 | 7757 | return 0 unless $text =~ m#^(\w+)>$#; | |||
| 560 | 1505 | 3214 | my $tag = lc $1; | ||||
| 561 | |||||||
| 562 | 1505 | 50 | 3958 | if ( $self->{_hssSkipToEnd} ) { | |||
| 563 | 0 | 0 | 0 | if ( $self->{_hssSkipToEnd} eq $tag ) { | |||
| 564 | 0 | 0 | delete $self->{_hssSkipToEnd}; | ||||
| 565 | } | ||||||
| 566 | 0 | 0 | return 0; | ||||
| 567 | } | ||||||
| 568 | |||||||
| 569 | # Ignore a close without an open | ||||||
| 570 | 1505 | 100 | 1696 | return 0 unless grep { $_->{NAME} eq $tag } @{ $self->{_hssStack} }; | |||
| 3000 | 11015 | ||||||
| 1505 | 3863 | ||||||
| 571 | |||||||
| 572 | # Close open tags up to the matching open | ||||||
| 573 | 1478 | 3368 | my @close = (); | ||||
| 574 | |||||||
| 575 | 1478 | 1928 | while ( scalar @{ $self->{_hssStack} } ) { | ||||
| 1480 | 3984 | ||||||
| 576 | 1480 | 1827 | my $entry = shift @{ $self->{_hssStack} }; | ||||
| 1480 | 3611 | ||||||
| 577 | 1480 | 3942 | push @close, $entry; | ||||
| 578 | 1480 | 4512 | $self->output_stack_entry($entry); | ||||
| 579 | 1480 | 3342 | $entry->{CONTENT} = ''; | ||||
| 580 | 1480 | 100 | 4419 | last if $entry->{NAME} eq $tag; | |||
| 581 | } | ||||||
| 582 | |||||||
| 583 | # Reopen any we closed early if all that were closed are | ||||||
| 584 | # configured to be auto de-interleaved. | ||||||
| 585 | 1478 | 100 | 2287 | unless ( grep { !$self->{_hssDeInter}{ $_->{NAME} } } @close ) { | |||
| 1480 | 10024 | ||||||
| 586 | 89 | 185 | pop @close; | ||||
| 587 | 89 | 238 | unshift @{ $self->{_hssStack} }, @close; | ||||
| 89 | 190 | ||||||
| 588 | } | ||||||
| 589 | |||||||
| 590 | 1478 | 9255 | return 1; | ||||
| 591 | } | ||||||
| 592 | |||||||
| 593 | =item input_text ( TEXT ) | ||||||
| 594 | |||||||
| 595 | Handles some non-tag text from the input document. | ||||||
| 596 | |||||||
| 597 | =cut | ||||||
| 598 | |||||||
| 599 | sub input_text { | ||||||
| 600 | 382 | 382 | 1 | 1610 | my ( $self, $text ) = @_; | ||
| 601 | |||||||
| 602 | 382 | 50 | 1020 | return if $self->{_hssSkipToEnd}; | |||
| 603 | |||||||
| 604 | 382 | 1010 | $text = $self->strip_nonprintable($text); | ||||
| 605 | |||||||
| 606 | 382 | 100 | 1742 | if ( $text =~ /^(\s*)$/ ) { | |||
| 607 | 2 | 7 | $self->output_text($1); | ||||
| 608 | 2 | 5 | return; | ||||
| 609 | } | ||||||
| 610 | |||||||
| 611 | 380 | 100 | 932 | unless ( $self->_hss_get_to_valid_context('CDATA') ) { | |||
| 612 | 1 | 5 | $self->reject_text($text); | ||||
| 613 | 1 | 3 | return; | ||||
| 614 | } | ||||||
| 615 | |||||||
| 616 | 379 | 1152 | my $filtered = $self->filter_text( $self->text_to_canonical_form($text) ); | ||||
| 617 | 379 | 1284 | $self->output_text( $self->canonical_form_to_text($filtered) ); | ||||
| 618 | } | ||||||
| 619 | |||||||
| 620 | =item input_process ( TEXT ) | ||||||
| 621 | |||||||
| 622 | Handles a processing instruction from the input document. | ||||||
| 623 | |||||||
| 624 | =cut | ||||||
| 625 | |||||||
| 626 | sub input_process { | ||||||
| 627 | 1 | 1 | 1 | 10 | my ( $self, $text ) = @_; | ||
| 628 | |||||||
| 629 | 1 | 7 | $self->reject_process($text); | ||||
| 630 | } | ||||||
| 631 | |||||||
| 632 | =item input_comment ( TEXT ) | ||||||
| 633 | |||||||
| 634 | Handles an HTML comment from the input document. | ||||||
| 635 | |||||||
| 636 | =cut | ||||||
| 637 | |||||||
| 638 | sub input_comment { | ||||||
| 639 | 1 | 1 | 1 | 9 | my ( $self, $text ) = @_; | ||
| 640 | |||||||
| 641 | 1 | 4 | $self->reject_comment($text); | ||||
| 642 | } | ||||||
| 643 | |||||||
| 644 | =item input_declaration ( TEXT ) | ||||||
| 645 | |||||||
| 646 | Handles an declaration from the input document. | ||||||
| 647 | |||||||
| 648 | =cut | ||||||
| 649 | |||||||
| 650 | sub input_declaration { | ||||||
| 651 | 1 | 1 | 1 | 11 | my ( $self, $text ) = @_; | ||
| 652 | |||||||
| 653 | 1 | 3 | $self->reject_declaration($text); | ||||
| 654 | } | ||||||
| 655 | |||||||
| 656 | =item input_end_document () | ||||||
| 657 | |||||||
| 658 | Call this method to signal the end of the input document. | ||||||
| 659 | |||||||
| 660 | =cut | ||||||
| 661 | |||||||
| 662 | sub input_end_document { | ||||||
| 663 | 1890 | 1890 | 1 | 8945 | my ($self) = @_; | ||
| 664 | |||||||
| 665 | 1890 | 3165 | delete $self->{_hssSkipToEnd}; | ||||
| 666 | |||||||
| 667 | 1890 | 2456 | while ( @{ $self->{_hssStack} } > 1 ) { | ||||
| 1946 | 7671 | ||||||
| 668 | 56 | 99 | $self->output_stack_entry( shift @{ $self->{_hssStack} } ); | ||||
| 56 | 186 | ||||||
| 669 | } | ||||||
| 670 | |||||||
| 671 | 1890 | 4372 | $self->output_end_document; | ||||
| 672 | 1890 | 2330 | my $last_entry = shift @{ $self->{_hssStack} }; | ||||
| 1890 | 4011 | ||||||
| 673 | 1890 | 4052 | $self->{_hssOutput} = $last_entry->{CONTENT}; | ||||
| 674 | 1890 | 9422 | delete $self->{_hssStack}; | ||||
| 675 | |||||||
| 676 | } | ||||||
| 677 | |||||||
| 678 | =item filtered_document () | ||||||
| 679 | |||||||
| 680 | Returns the filtered document as a string. | ||||||
| 681 | |||||||
| 682 | =cut | ||||||
| 683 | |||||||
| 684 | sub filtered_document { | ||||||
| 685 | 1890 | 1890 | 1 | 5794 | my ($self) = @_; | ||
| 686 | 1890 | 12333 | $self->{_hssOutput}; | ||||
| 687 | } | ||||||
| 688 | |||||||
| 689 | =back | ||||||
| 690 | |||||||
| 691 | =cut | ||||||
| 692 | |||||||
| 693 | =head1 SUBCLASSING | ||||||
| 694 | |||||||
| 695 | The only reason for subclassing this module now is to add to the | ||||||
| 696 | list of accepted tags, attributes and styles (See | ||||||
| 697 | L"WHITELIST INITIALIZATION METHODS">). Everything else can be | ||||||
| 698 | achieved with L"Rules">. | ||||||
| 699 | |||||||
| 700 | The C |
||||||
| 701 | hashes and C |
||||||
| 702 | C<_hss>. The filter configuration can be set up by invoking the | ||||||
| 703 | hss_init() method, which takes the same arguments as new(). | ||||||
| 704 | |||||||
| 705 | =head1 OUTPUT METHODS | ||||||
| 706 | |||||||
| 707 | The filter outputs a stream of start tags, end tags, text, comments, | ||||||
| 708 | declarations and processing instructions, via the following C |
||||||
| 709 | methods. Subclasses may override these to intercept the filter output. | ||||||
| 710 | |||||||
| 711 | The default implementations of the C |
||||||
| 712 | text on to the output() method. The default implementation of the | ||||||
| 713 | output() method appends the text to a string, which can be fetched with | ||||||
| 714 | the filtered_document() method once processing is complete. | ||||||
| 715 | |||||||
| 716 | If the output() method or the individual C |
||||||
| 717 | overridden in a subclass, then filtered_document() will not work in | ||||||
| 718 | that subclass. | ||||||
| 719 | |||||||
| 720 | =over | ||||||
| 721 | |||||||
| 722 | =item output_start_document () | ||||||
| 723 | |||||||
| 724 | This method gets called once at the start of each HTML document passed | ||||||
| 725 | through the filter. The default implementation does nothing. | ||||||
| 726 | |||||||
| 727 | =cut | ||||||
| 728 | |||||||
| 729 | 3778 | 3778 | 1 | 6867 | sub output_start_document { } | ||
| 730 | |||||||
| 731 | =item output_end_document () | ||||||
| 732 | |||||||
| 733 | This method gets called once at the end of each HTML document passed | ||||||
| 734 | through the filter. The default implementation does nothing. | ||||||
| 735 | |||||||
| 736 | =cut | ||||||
| 737 | |||||||
| 738 | *output_end_document = \&output_start_document; | ||||||
| 739 | |||||||
| 740 | =item output_start ( TEXT ) | ||||||
| 741 | |||||||
| 742 | This method is used to output a filtered start tag. | ||||||
| 743 | |||||||
| 744 | =cut | ||||||
| 745 | |||||||
| 746 | 5108 | 5108 | 1 | 21565 | sub output_start { $_[0]->output( $_[1] ) } | ||
| 747 | |||||||
| 748 | =item output_end ( TEXT ) | ||||||
| 749 | |||||||
| 750 | This method is used to output a filtered end tag. | ||||||
| 751 | |||||||
| 752 | =cut | ||||||
| 753 | |||||||
| 754 | *output_end = \&output_start; | ||||||
| 755 | |||||||
| 756 | =item output_text ( TEXT ) | ||||||
| 757 | |||||||
| 758 | This method is used to output some filtered non-tag text. | ||||||
| 759 | |||||||
| 760 | =cut | ||||||
| 761 | |||||||
| 762 | *output_text = \&output_start; | ||||||
| 763 | |||||||
| 764 | =item output_declaration ( TEXT ) | ||||||
| 765 | |||||||
| 766 | This method is used to output a filtered declaration. | ||||||
| 767 | |||||||
| 768 | =cut | ||||||
| 769 | |||||||
| 770 | *output_declaration = \&output_start; | ||||||
| 771 | |||||||
| 772 | =item output_comment ( TEXT ) | ||||||
| 773 | |||||||
| 774 | This method is used to output a filtered HTML comment. | ||||||
| 775 | |||||||
| 776 | =cut | ||||||
| 777 | |||||||
| 778 | *output_comment = \&output_start; | ||||||
| 779 | |||||||
| 780 | =item output_process ( TEXT ) | ||||||
| 781 | |||||||
| 782 | This method is used to output a filtered processing instruction. | ||||||
| 783 | |||||||
| 784 | =cut | ||||||
| 785 | |||||||
| 786 | *output_process = \&output_start; | ||||||
| 787 | |||||||
| 788 | =item output ( TEXT ) | ||||||
| 789 | |||||||
| 790 | This method is invoked by all of the default C |
||||||
| 791 | default implementation appends the text to the string that the | ||||||
| 792 | filtered_document() method will return. | ||||||
| 793 | |||||||
| 794 | =cut | ||||||
| 795 | |||||||
| 796 | 5111 | 5111 | 1 | 25077 | sub output { $_[0]->{_hssStack}[0]{CONTENT} .= $_[1]; } | ||
| 797 | |||||||
| 798 | =item output_stack_entry ( TEXT ) | ||||||
| 799 | |||||||
| 800 | This method is invoked when a tag plus all text and nested HTML content | ||||||
| 801 | within the tag has been processed. It adds the tag plus its content | ||||||
| 802 | to the content for its parent tag. | ||||||
| 803 | |||||||
| 804 | =cut | ||||||
| 805 | |||||||
| 806 | sub output_stack_entry { | ||||||
| 807 | 3113 | 3113 | 1 | 5399 | my ( $self, $tag ) = @_; | ||
| 808 | |||||||
| 809 | 3113 | 4516 | my %entry; | ||||
| 810 | 3113 | 4091 | @entry{qw(tag attr content)} = @{$tag}{qw(NAME ATTR CONTENT)}; | ||||
| 3113 | 15606 | ||||||
| 811 | |||||||
| 812 | 3113 | 100 | 25025 | if ( my $tag_callback = $tag->{CALLBACK} ) { | |||
| 813 | 41 | 100 | 136 | $tag_callback->( $self, \%entry ) | |||
| 814 | or return; | ||||||
| 815 | } | ||||||
| 816 | |||||||
| 817 | 3112 | 6062 | my $tagname = $entry{tag}; | ||||
| 818 | 3112 | 8043 | my $filtered_attrs = $self->_hss_join_attribs( $entry{attr} ); | ||||
| 819 | |||||||
| 820 | 3112 | 100 | 11106 | if ( $tag->{CTX} eq 'EMPTY' ) { | |||
| 821 | 1576 | 50 | 8705 | $self->output_start("<$tagname$filtered_attrs />") | |||
| 822 | if $entry{tag}; | ||||||
| 823 | 1576 | 4975 | return; | ||||
| 824 | } | ||||||
| 825 | 1536 | 100 | 3442 | if ($tagname) { | |||
| 826 | 1535 | 9705 | $self->output_start("<$tagname$filtered_attrs>"); | ||||
| 827 | } | ||||||
| 828 | |||||||
| 829 | 1536 | 50 | 4122 | if ( defined $entry{content} ) { | |||
| 830 | 1536 | 4086 | $self->{_hssStack}[0]{CONTENT} .= $entry{content}; | ||||
| 831 | } | ||||||
| 832 | |||||||
| 833 | 1536 | 100 | 3083 | if ($tagname) { | |||
| 834 | 1535 | 4776 | $self->output_end("$tagname>"); | ||||
| 835 | } | ||||||
| 836 | } | ||||||
| 837 | |||||||
| 838 | =back | ||||||
| 839 | |||||||
| 840 | =head1 REJECT METHODS | ||||||
| 841 | |||||||
| 842 | When the filter encounters something in the input document which it | ||||||
| 843 | cannot transform into an acceptable construct, it invokes one of | ||||||
| 844 | the following C |
||||||
| 845 | document to take the place of the unacceptable construct. | ||||||
| 846 | |||||||
| 847 | The TEXT parameter is the full text of the unacceptable construct. | ||||||
| 848 | |||||||
| 849 | The default implementations of these methods output an HTML comment | ||||||
| 850 | containing the text C |
||||||
| 851 | is set to true, then the rejected text is HTML escaped instead. | ||||||
| 852 | |||||||
| 853 | Subclasses may override these methods, but should exercise caution. | ||||||
| 854 | The TEXT parameter is unfiltered input and may contain malicious | ||||||
| 855 | constructs. | ||||||
| 856 | |||||||
| 857 | =over | ||||||
| 858 | |||||||
| 859 | =item reject_start ( TEXT ) | ||||||
| 860 | |||||||
| 861 | =item reject_end ( TEXT ) | ||||||
| 862 | |||||||
| 863 | =item reject_text ( TEXT ) | ||||||
| 864 | |||||||
| 865 | =item reject_declaration ( TEXT ) | ||||||
| 866 | |||||||
| 867 | =item reject_comment ( TEXT ) | ||||||
| 868 | |||||||
| 869 | =item reject_process ( TEXT ) | ||||||
| 870 | |||||||
| 871 | =back | ||||||
| 872 | |||||||
| 873 | =cut | ||||||
| 874 | |||||||
| 875 | sub reject_start { | ||||||
| 876 | 76 | 100 | 76 | 1 | 599 | $_[0]->{_hssCfg}{EscapeFiltered} | |
| 877 | ? $_[0]->output_text( $_[0]->escape_html_metachars( $_[1] ) ) | ||||||
| 878 | : $_[0]->output_comment(''); | ||||||
| 879 | } | ||||||
| 880 | *reject_end = \&reject_start; | ||||||
| 881 | *reject_text = \&reject_start; | ||||||
| 882 | *reject_declaration = \&reject_start; | ||||||
| 883 | *reject_comment = \&reject_start; | ||||||
| 884 | *reject_process = \&reject_start; | ||||||
| 885 | |||||||
| 886 | =head1 WHITELIST INITIALIZATION METHODS | ||||||
| 887 | |||||||
| 888 | The filter refers to various whitelists to determine which constructs | ||||||
| 889 | are acceptable. To modify these whitelists, subclasses can override | ||||||
| 890 | the following methods. | ||||||
| 891 | |||||||
| 892 | Each method is called once at object initialization time, and must | ||||||
| 893 | return a reference to a nested data structure. These references are | ||||||
| 894 | installed into the object, and used whenever the filter needs to refer | ||||||
| 895 | to a whitelist. | ||||||
| 896 | |||||||
| 897 | The default implementations of these methods can be invoked as class | ||||||
| 898 | methods. | ||||||
| 899 | |||||||
| 900 | See examples/tags/ and examples/declaration/ for examples of how to | ||||||
| 901 | override these methods. | ||||||
| 902 | |||||||
| 903 | =over | ||||||
| 904 | |||||||
| 905 | =item init_context_whitelist () | ||||||
| 906 | |||||||
| 907 | Returns a reference to the C |
||||||
| 908 | which tags may appear at each point in the document, and which other | ||||||
| 909 | tags may be nested within them. | ||||||
| 910 | |||||||
| 911 | It is a hash, and the keys are context names, such as C |
||||||
| 912 | C |
||||||
| 913 | |||||||
| 914 | The values in the hash are hashrefs. The keys in these subhashes are | ||||||
| 915 | lowercase tag names, and the values are context names, specifying the | ||||||
| 916 | context that the tag provides to any other tags nested within it. | ||||||
| 917 | |||||||
| 918 | The special context C |
||||||
| 919 | nothing can be nested within that tag. | ||||||
| 920 | |||||||
| 921 | =cut | ||||||
| 922 | |||||||
| 923 | 10 | 10 | 132 | use vars qw(%_Context); | |||
| 10 | 36 | ||||||
| 10 | 3732 | ||||||
| 924 | |||||||
| 925 | BEGIN { | ||||||
| 926 | |||||||
| 927 | 10 | 10 | 295 | my %pre_content = ( 'br' => 'EMPTY', | |||
| 928 | 'span' => 'Inline', | ||||||
| 929 | 'tt' => 'Inline', | ||||||
| 930 | 'i' => 'Inline', | ||||||
| 931 | 'b' => 'Inline', | ||||||
| 932 | 'u' => 'Inline', | ||||||
| 933 | 's' => 'Inline', | ||||||
| 934 | 'strike' => 'Inline', | ||||||
| 935 | 'em' => 'Inline', | ||||||
| 936 | 'strong' => 'Inline', | ||||||
| 937 | 'dfn' => 'Inline', | ||||||
| 938 | 'code' => 'Inline', | ||||||
| 939 | 'q' => 'Inline', | ||||||
| 940 | 'samp' => 'Inline', | ||||||
| 941 | 'kbd' => 'Inline', | ||||||
| 942 | 'var' => 'Inline', | ||||||
| 943 | 'cite' => 'Inline', | ||||||
| 944 | 'abbr' => 'Inline', | ||||||
| 945 | 'acronym' => 'Inline', | ||||||
| 946 | 'ins' => 'Inline', | ||||||
| 947 | 'del' => 'Inline', | ||||||
| 948 | 'a' => 'Inline', | ||||||
| 949 | 'CDATA' => 'CDATA', | ||||||
| 950 | ); | ||||||
| 951 | |||||||
| 952 | 10 | 197 | my %inline = ( %pre_content, | ||||
| 953 | 'img' => 'EMPTY', | ||||||
| 954 | 'big' => 'Inline', | ||||||
| 955 | 'small' => 'Inline', | ||||||
| 956 | 'sub' => 'Inline', | ||||||
| 957 | 'sup' => 'Inline', | ||||||
| 958 | 'font' => 'Inline', | ||||||
| 959 | 'nobr' => 'Inline', | ||||||
| 960 | ); | ||||||
| 961 | |||||||
| 962 | 10 | 415 | my %flow = ( %inline, | ||||
| 963 | 'ins' => 'Flow', | ||||||
| 964 | 'del' => 'Flow', | ||||||
| 965 | 'div' => 'Flow', | ||||||
| 966 | 'p' => 'Inline', | ||||||
| 967 | 'h1' => 'Inline', | ||||||
| 968 | 'h2' => 'Inline', | ||||||
| 969 | 'h3' => 'Inline', | ||||||
| 970 | 'h4' => 'Inline', | ||||||
| 971 | 'h5' => 'Inline', | ||||||
| 972 | 'h6' => 'Inline', | ||||||
| 973 | 'ul' => 'list', | ||||||
| 974 | 'ol' => 'list', | ||||||
| 975 | 'menu' => 'list', | ||||||
| 976 | 'dir' => 'list', | ||||||
| 977 | 'dl' => 'dt_dd', | ||||||
| 978 | 'address' => 'Inline', | ||||||
| 979 | 'hr' => 'EMPTY', | ||||||
| 980 | 'pre' => 'pre.content', | ||||||
| 981 | 'blockquote' => 'Flow', | ||||||
| 982 | 'center' => 'Flow', | ||||||
| 983 | 'table' => 'table', | ||||||
| 984 | ); | ||||||
| 985 | |||||||
| 986 | 10 | 107 | my %table = ( 'caption' => 'Inline', | ||||
| 987 | 'thead' => 'tr_only', | ||||||
| 988 | 'tfoot' => 'tr_only', | ||||||
| 989 | 'tbody' => 'tr_only', | ||||||
| 990 | 'colgroup' => 'colgroup', | ||||||
| 991 | 'col' => 'EMPTY', | ||||||
| 992 | 'tr' => 'th_td', | ||||||
| 993 | ); | ||||||
| 994 | |||||||
| 995 | 10 | 35 | my %head = ( 'title' => 'NoTags', ); | ||||
| 996 | |||||||
| 997 | 10 | 1381 | %_Context = ( 'Document' => { 'html' => 'Html' }, | ||||
| 998 | 'Html' => { 'head' => 'Head', 'body' => 'Flow' }, | ||||||
| 999 | 'Head' => \%head, | ||||||
| 1000 | 'Inline' => \%inline, | ||||||
| 1001 | 'Flow' => \%flow, | ||||||
| 1002 | 'NoTags' => { 'CDATA' => 'CDATA' }, | ||||||
| 1003 | 'pre.content' => \%pre_content, | ||||||
| 1004 | 'table' => \%table, | ||||||
| 1005 | 'list' => { 'li' => 'Flow' }, | ||||||
| 1006 | 'dt_dd' => { 'dt' => 'Inline', 'dd' => 'Flow' }, | ||||||
| 1007 | 'tr_only' => { 'tr' => 'th_td' }, | ||||||
| 1008 | 'colgroup' => { 'col' => 'EMPTY' }, | ||||||
| 1009 | 'th_td' => { 'th' => 'Flow', 'td' => 'Flow' }, | ||||||
| 1010 | ); | ||||||
| 1011 | } | ||||||
| 1012 | |||||||
| 1013 | 1487 | 1487 | 1 | 4369 | sub init_context_whitelist { return \%_Context; } | ||
| 1014 | |||||||
| 1015 | =item init_attrib_whitelist () | ||||||
| 1016 | |||||||
| 1017 | Returns a reference to the C |
||||||
| 1018 | attributes each tag can have and the values that those attributes can | ||||||
| 1019 | take. | ||||||
| 1020 | |||||||
| 1021 | It is a hash, and the keys are lowercase tag names. | ||||||
| 1022 | |||||||
| 1023 | The values in the hash are hashrefs. The keys in these subhashes are | ||||||
| 1024 | lowercase attribute names, and the values are attribute value class names, | ||||||
| 1025 | which are short strings describing the type of values that the | ||||||
| 1026 | attribute can take, such as C |
||||||
| 1027 | |||||||
| 1028 | =cut | ||||||
| 1029 | |||||||
| 1030 | 10 | 10 | 64 | use vars qw(%_Attrib); | |||
| 10 | 18 | ||||||
| 10 | 6646 | ||||||
| 1031 | |||||||
| 1032 | BEGIN { | ||||||
| 1033 | |||||||
| 1034 | 10 | 10 | 47 | my %attr = ( 'style' => 'style' ); | |||
| 1035 | |||||||
| 1036 | 10 | 60 | my %font_attr = ( %attr, | ||||
| 1037 | 'size' => 'size', | ||||||
| 1038 | 'face' => 'wordlist', | ||||||
| 1039 | 'color' => 'color', | ||||||
| 1040 | ); | ||||||
| 1041 | |||||||
| 1042 | 10 | 46 | my %insdel_attr = ( %attr, | ||||
| 1043 | 'cite' => 'href', | ||||||
| 1044 | 'datetime' => 'text', | ||||||
| 1045 | ); | ||||||
| 1046 | |||||||
| 1047 | 10 | 661 | my %texta_attr = ( %attr, 'align' => 'word', ); | ||||
| 1048 | |||||||
| 1049 | 10 | 40 | my %cellha_attr = ( 'align' => 'word', | ||||
| 1050 | 'char' => 'word', | ||||||
| 1051 | 'charoff' => 'size', | ||||||
| 1052 | ); | ||||||
| 1053 | |||||||
| 1054 | 10 | 27 | my %cellva_attr = ( 'valign' => 'word', ); | ||||
| 1055 | |||||||
| 1056 | 10 | 183 | my %cellhv_attr = ( %attr, %cellha_attr, %cellva_attr ); | ||||
| 1057 | |||||||
| 1058 | 10 | 65 | my %col_attr = ( %attr, %cellhv_attr, | ||||
| 1059 | 'width' => 'size', | ||||||
| 1060 | 'span' => 'number', | ||||||
| 1061 | ); | ||||||
| 1062 | |||||||
| 1063 | 10 | 198 | my %thtd_attr = ( %attr, | ||||
| 1064 | 'abbr' => 'text', | ||||||
| 1065 | 'axis' => 'text', | ||||||
| 1066 | 'headers' => 'text', | ||||||
| 1067 | 'scope' => 'word', | ||||||
| 1068 | 'rowspan' => 'number', | ||||||
| 1069 | 'colspan' => 'number', | ||||||
| 1070 | %cellhv_attr, | ||||||
| 1071 | 'nowrap' => 'novalue', | ||||||
| 1072 | 'bgcolor' => 'color', | ||||||
| 1073 | 'width' => 'size', | ||||||
| 1074 | 'height' => 'size', | ||||||
| 1075 | 'bordercolor' => 'color', | ||||||
| 1076 | 'bordercolorlight' => 'color', | ||||||
| 1077 | 'bordercolordark' => 'color', | ||||||
| 1078 | ); | ||||||
| 1079 | |||||||
| 1080 | 10 | 1706 | %_Attrib = ( 'br' => { 'clear' => 'word' }, | ||||
| 1081 | 'em' => \%attr, | ||||||
| 1082 | 'strong' => \%attr, | ||||||
| 1083 | 'dfn' => \%attr, | ||||||
| 1084 | 'code' => \%attr, | ||||||
| 1085 | 'samp' => \%attr, | ||||||
| 1086 | 'kbd' => \%attr, | ||||||
| 1087 | 'var' => \%attr, | ||||||
| 1088 | 'cite' => \%attr, | ||||||
| 1089 | 'abbr' => \%attr, | ||||||
| 1090 | 'acronym' => \%attr, | ||||||
| 1091 | 'q' => { %attr, 'cite' => 'href' }, | ||||||
| 1092 | 'blockquote' => { %attr, 'cite' => 'href' }, | ||||||
| 1093 | 'sub' => \%attr, | ||||||
| 1094 | 'sup' => \%attr, | ||||||
| 1095 | 'tt' => \%attr, | ||||||
| 1096 | 'i' => \%attr, | ||||||
| 1097 | 'b' => \%attr, | ||||||
| 1098 | 'big' => \%attr, | ||||||
| 1099 | 'small' => \%attr, | ||||||
| 1100 | 'u' => \%attr, | ||||||
| 1101 | 's' => \%attr, | ||||||
| 1102 | 'strike' => \%attr, | ||||||
| 1103 | 'font' => \%font_attr, | ||||||
| 1104 | 'table' => { | ||||||
| 1105 | %attr, | ||||||
| 1106 | 'frame' => 'word', | ||||||
| 1107 | 'rules' => 'word', | ||||||
| 1108 | %texta_attr, | ||||||
| 1109 | 'bgcolor' => 'color', | ||||||
| 1110 | 'background' => 'src', | ||||||
| 1111 | 'width' => 'size', | ||||||
| 1112 | 'height' => 'size', | ||||||
| 1113 | 'cellspacing' => 'size', | ||||||
| 1114 | 'cellpadding' => 'size', | ||||||
| 1115 | 'border' => 'size', | ||||||
| 1116 | 'bordercolor' => 'color', | ||||||
| 1117 | 'bordercolorlight' => 'color', | ||||||
| 1118 | 'bordercolordark' => 'color', | ||||||
| 1119 | 'summary' => 'text', | ||||||
| 1120 | }, | ||||||
| 1121 | 'caption' => { %attr, 'align' => 'word', }, | ||||||
| 1122 | 'colgroup' => \%col_attr, | ||||||
| 1123 | 'col' => \%col_attr, | ||||||
| 1124 | 'thead' => \%cellhv_attr, | ||||||
| 1125 | 'tfoot' => \%cellhv_attr, | ||||||
| 1126 | 'tbody' => \%cellhv_attr, | ||||||
| 1127 | 'tr' => { | ||||||
| 1128 | %attr, | ||||||
| 1129 | bgcolor => 'color', | ||||||
| 1130 | %cellhv_attr, | ||||||
| 1131 | }, | ||||||
| 1132 | 'th' => \%thtd_attr, | ||||||
| 1133 | 'td' => \%thtd_attr, | ||||||
| 1134 | 'ins' => \%insdel_attr, | ||||||
| 1135 | 'del' => \%insdel_attr, | ||||||
| 1136 | 'a' => { %attr, href => 'href', }, | ||||||
| 1137 | 'h1' => \%texta_attr, | ||||||
| 1138 | 'h2' => \%texta_attr, | ||||||
| 1139 | 'h3' => \%texta_attr, | ||||||
| 1140 | 'h4' => \%texta_attr, | ||||||
| 1141 | 'h5' => \%texta_attr, | ||||||
| 1142 | 'h6' => \%texta_attr, | ||||||
| 1143 | 'p' => \%texta_attr, | ||||||
| 1144 | 'div' => \%texta_attr, | ||||||
| 1145 | 'span' => \%texta_attr, | ||||||
| 1146 | 'ul' => { | ||||||
| 1147 | %attr, | ||||||
| 1148 | 'type' => 'word', | ||||||
| 1149 | 'compact' => 'novalue', | ||||||
| 1150 | }, | ||||||
| 1151 | 'ol' => { %attr, | ||||||
| 1152 | 'type' => 'text', | ||||||
| 1153 | 'compact' => 'novalue', | ||||||
| 1154 | 'start' => 'number', | ||||||
| 1155 | }, | ||||||
| 1156 | 'li' => { %attr, | ||||||
| 1157 | 'type' => 'text', | ||||||
| 1158 | 'value' => 'number', | ||||||
| 1159 | }, | ||||||
| 1160 | 'dl' => { %attr, 'compact' => 'novalue' }, | ||||||
| 1161 | 'dt' => \%attr, | ||||||
| 1162 | 'dd' => \%attr, | ||||||
| 1163 | 'address' => \%attr, | ||||||
| 1164 | 'hr' => { | ||||||
| 1165 | %texta_attr, | ||||||
| 1166 | 'width' => 'size', | ||||||
| 1167 | 'size' => 'size', | ||||||
| 1168 | 'noshade' => 'novalue', | ||||||
| 1169 | }, | ||||||
| 1170 | 'pre' => { %attr, 'width' => 'size' }, | ||||||
| 1171 | 'center' => \%attr, | ||||||
| 1172 | 'nobr' => {}, | ||||||
| 1173 | 'img' => { | ||||||
| 1174 | 'src' => 'src', | ||||||
| 1175 | 'alt' => 'text', | ||||||
| 1176 | 'width' => 'size', | ||||||
| 1177 | 'height' => 'size', | ||||||
| 1178 | 'border' => 'size', | ||||||
| 1179 | 'hspace' => 'size', | ||||||
| 1180 | 'vspace' => 'size', | ||||||
| 1181 | 'align' => 'word', | ||||||
| 1182 | }, | ||||||
| 1183 | 'body' => { 'bgcolor' => 'color', | ||||||
| 1184 | 'background' => 'src', | ||||||
| 1185 | 'link' => 'color', | ||||||
| 1186 | 'vlink' => 'color', | ||||||
| 1187 | 'alink' => 'color', | ||||||
| 1188 | 'text' => 'color', | ||||||
| 1189 | }, | ||||||
| 1190 | 'head' => {}, | ||||||
| 1191 | 'title' => {}, | ||||||
| 1192 | 'html' => {}, | ||||||
| 1193 | ); | ||||||
| 1194 | } | ||||||
| 1195 | |||||||
| 1196 | 1487 | 1487 | 1 | 4684 | sub init_attrib_whitelist { return \%_Attrib; } | ||
| 1197 | |||||||
| 1198 | =item init_attval_whitelist () | ||||||
| 1199 | |||||||
| 1200 | Returns a reference to the C |
||||||
| 1201 | attribute value class names from the C |
||||||
| 1202 | subs to validate (and optionally transform) a particular attribute value. | ||||||
| 1203 | |||||||
| 1204 | The filter calls the attribute value validation subs with the | ||||||
| 1205 | following parameters: | ||||||
| 1206 | |||||||
| 1207 | =over | ||||||
| 1208 | |||||||
| 1209 | =item C |
||||||
| 1210 | |||||||
| 1211 | A reference to the filter object. | ||||||
| 1212 | |||||||
| 1213 | =item C |
||||||
| 1214 | |||||||
| 1215 | The lowercase name of the tag in which the attribute appears. | ||||||
| 1216 | |||||||
| 1217 | =item C |
||||||
| 1218 | |||||||
| 1219 | The name of the attribute. | ||||||
| 1220 | |||||||
| 1221 | =item C |
||||||
| 1222 | |||||||
| 1223 | The attribute value found in the input document, in canonical form | ||||||
| 1224 | (see L"CANONICAL FORM">). | ||||||
| 1225 | |||||||
| 1226 | =back | ||||||
| 1227 | |||||||
| 1228 | The validation sub can return undef to indicate that the attribute | ||||||
| 1229 | should be removed from the tag, or it can return the new value for | ||||||
| 1230 | the attribute, in canonical form. | ||||||
| 1231 | |||||||
| 1232 | =cut | ||||||
| 1233 | |||||||
| 1234 | 10 | 10 | 63 | use vars qw(%_AttVal); | |||
| 10 | 15 | ||||||
| 10 | 1601 | ||||||
| 1235 | |||||||
| 1236 | BEGIN { | ||||||
| 1237 | 10 | 10 | 823 | %_AttVal = ( 'style' => \&_hss_attval_style, | |||
| 1238 | 'size' => \&_hss_attval_size, | ||||||
| 1239 | 'number' => \&_hss_attval_number, | ||||||
| 1240 | 'color' => \&_hss_attval_color, | ||||||
| 1241 | 'text' => \&_hss_attval_text, | ||||||
| 1242 | 'word' => \&_hss_attval_word, | ||||||
| 1243 | 'wordlist' => \&_hss_attval_wordlist, | ||||||
| 1244 | 'wordlistq' => \&_hss_attval_wordlistq, | ||||||
| 1245 | 'href' => \&_hss_attval_href, | ||||||
| 1246 | 'src' => \&_hss_attval_src, | ||||||
| 1247 | 'stylesrc' => \&_hss_attval_stylesrc, | ||||||
| 1248 | 'novalue' => \&_hss_attval_novalue, | ||||||
| 1249 | ); | ||||||
| 1250 | } | ||||||
| 1251 | |||||||
| 1252 | 1487 | 1487 | 1 | 3882 | sub init_attval_whitelist { return \%_AttVal; } | ||
| 1253 | |||||||
| 1254 | =item init_style_whitelist () | ||||||
| 1255 | |||||||
| 1256 | Returns a reference to the C |