blib/lib/HTML/StripScripts.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 367 | 383 | 95.8 |
branch | 169 | 202 | 83.6 |
condition | 44 | 55 | 80.0 |
subroutine | 68 | 72 | 94.4 |
pod | 31 | 31 | 100.0 |
total | 679 | 743 | 91.3 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::StripScripts; | ||||||
2 | 10 | 10 | 426913 | use strict; | |||
10 | 26 | ||||||
10 | 573 | ||||||
3 | 10 | 10 | 59 | use warnings FATAL => 'all'; | |||
10 | 20 | ||||||
10 | 875 | ||||||
4 | |||||||
5 | 10 | 10 | 71 | use vars qw($VERSION); | |||
10 | 21 | ||||||
10 | 54299 | ||||||
6 | $VERSION = '1.05'; | ||||||
7 | |||||||
8 | =head1 NAME | ||||||
9 | |||||||
10 | HTML::StripScripts - Strip scripting constructs out of HTML | ||||||
11 | |||||||
12 | =head1 SYNOPSIS | ||||||
13 | |||||||
14 | use HTML::StripScripts; | ||||||
15 | |||||||
16 | my $hss = HTML::StripScripts->new({ Context => 'Inline' }); | ||||||
17 | |||||||
18 | $hss->input_start_document; | ||||||
19 | |||||||
20 | $hss->input_start(''); | ||||||
21 | $hss->input_text('hello, world!'); | ||||||
22 | $hss->input_end(''); | ||||||
23 | |||||||
24 | $hss->input_end_document; | ||||||
25 | |||||||
26 | print $hss->filtered_document; | ||||||
27 | |||||||
28 | =head1 DESCRIPTION | ||||||
29 | |||||||
30 | This module strips scripting constructs out of HTML, leaving as | ||||||
31 | much non-scripting markup in place as possible. This allows web | ||||||
32 | applications to display HTML originating from an untrusted source | ||||||
33 | without introducing XSS (cross site scripting) vulnerabilities. | ||||||
34 | |||||||
35 | You will probably use L |
||||||
36 | this module directly. | ||||||
37 | |||||||
38 | The process is based on whitelists of tags, attributes and attribute | ||||||
39 | values. This approach is the most secure against disguised scripting | ||||||
40 | constructs hidden in malicious HTML documents. | ||||||
41 | |||||||
42 | As well as removing scripting constructs, this module ensures that | ||||||
43 | there is a matching end for each start tag, and that the tags are | ||||||
44 | properly nested. | ||||||
45 | |||||||
46 | Previously, in order to customise the output, you needed to subclass | ||||||
47 | C |
||||||
48 | can be done through the C |
||||||
49 | examples/declaration/ and examples/tags/ for cases where subclassing is | ||||||
50 | necessary.) | ||||||
51 | |||||||
52 | The HTML document must be parsed into start tags, end tags and | ||||||
53 | text before it can be filtered by this module. Use either | ||||||
54 | L |
||||||
55 | if you want to input an unparsed HTML document. | ||||||
56 | |||||||
57 | See examples/direct/ for an example of how to feed tokens directly to | ||||||
58 | HTML::StripScripts. | ||||||
59 | |||||||
60 | =head1 CONSTRUCTORS | ||||||
61 | |||||||
62 | =over | ||||||
63 | |||||||
64 | =item new ( CONFIG ) | ||||||
65 | |||||||
66 | Creates a new C |
||||||
67 | particular filtering policy. If present, the CONFIG parameter | ||||||
68 | must be a hashref. The following keys are recognized (unrecognized | ||||||
69 | keys will be silently ignored). | ||||||
70 | |||||||
71 | $s = HTML::Stripscripts->new({ | ||||||
72 | Context => 'Document|Flow|Inline|NoTags', | ||||||
73 | BanList => [qw( br img )] | {br => '1', img => '1'}, | ||||||
74 | BanAllBut => [qw(p div span)], | ||||||
75 | AllowSrc => 0|1, | ||||||
76 | AllowHref => 0|1, | ||||||
77 | AllowRelURL => 0|1, | ||||||
78 | AllowMailto => 0|1, | ||||||
79 | EscapeFiltered => 0|1, | ||||||
80 | Rules => { See below for details }, | ||||||
81 | }); | ||||||
82 | |||||||
83 | =over | ||||||
84 | |||||||
85 | =item C |
||||||
86 | |||||||
87 | A string specifying the context in which the filtered document | ||||||
88 | will be used. This influences the set of tags that will be | ||||||
89 | allowed. | ||||||
90 | |||||||
91 | If present, the C |
||||||
92 | |||||||
93 | =over | ||||||
94 | |||||||
95 | =item C |
||||||
96 | |||||||
97 | If C |
||||||
98 | HTML document, including the C tag and C and C | ||||||
99 | sections. | ||||||
100 | |||||||
101 | =item C |
||||||
102 | |||||||
103 | If C |
||||||
104 | would expect to find in a document body are allowed, including | ||||||
105 | lists and tables but not including forms. | ||||||
106 | |||||||
107 | =item C |
||||||
108 | |||||||
109 | If C |
||||||
110 | and C are allowed. | ||||||
111 | |||||||
112 | =item C |
||||||
113 | |||||||
114 | If C |
||||||
115 | |||||||
116 | =back | ||||||
117 | |||||||
118 | The default C |
||||||
119 | |||||||
120 | =item C |
||||||
121 | |||||||
122 | If present, this option must be an arrayref or a hashref. Any tag that | ||||||
123 | would normally be allowed (because it presents no XSS hazard) will be | ||||||
124 | blocked if the lowercase name of the tag is in this list. | ||||||
125 | |||||||
126 | For example, in a guestbook application where C tags are used to |
||||||
127 | separate posts, you may wish to prevent posts from including C |
||||||
128 | tags, even though C is not an XSS risk. |
||||||
129 | |||||||
130 | =item C |
||||||
131 | |||||||
132 | If present, this option must be reference to an array holding a list of | ||||||
133 | lowercase tag names. This has the effect of adding all but the listed | ||||||
134 | tags to the ban list, so that only those tags listed will be allowed. | ||||||
135 | |||||||
136 | =item C |
||||||
137 | |||||||
138 | By default, the filter won't allow constructs that cause the browser to | ||||||
139 | fetch things automatically, such as C |
||||||
140 | If this option is present and true then those constructs will be | ||||||
141 | allowed. | ||||||
142 | |||||||
143 | =item C |
||||||
144 | |||||||
145 | By default, the filter won't allow constructs that cause the browser to | ||||||
146 | fetch things if the user clicks on something, such as the C |
||||||
147 | attribute in C tags. Set this option to a true value to allow this | ||||||
148 | type of construct. | ||||||
149 | |||||||
150 | =item C |
||||||
151 | |||||||
152 | By default, the filter won't allow relative URLs such as C<../foo.html> | ||||||
153 | in C |
||||||
154 | to allow them. C |
||||||
155 | for this to have any effect. | ||||||
156 | |||||||
157 | =item C |
||||||
158 | |||||||
159 | By default, C |
||||||
160 | a true value, then this construct will be allowed. This can be enabled | ||||||
161 | separately from AllowHref. | ||||||
162 | |||||||
163 | =item C |
||||||
164 | |||||||
165 | By default, any filtered tags are outputted as C<< >>. If | ||||||
166 | C |
||||||
167 | to HTML entities. | ||||||
168 | |||||||
169 | For instance: | ||||||
170 | |||||||
171 | --> <br> |
||||||
172 | |||||||
173 | =item C |
||||||
174 | |||||||
175 | The C |
||||||
176 | |||||||
177 | The focus is safety-first, so it is applied after all of the previous validation. | ||||||
178 | This means that you cannot all malicious data should already have been cleared. | ||||||
179 | |||||||
180 | Rules can be specified for tags and for attributes. Any tag or attribute | ||||||
181 | not explicitly listed will be handled by the default C<*> rules. | ||||||
182 | |||||||
183 | The following is a synopsis of all of the options that you can use to | ||||||
184 | configure rules. Below, an example is broken into sections and explained. | ||||||
185 | |||||||
186 | Rules => { | ||||||
187 | |||||||
188 | tag => 0 | 1 | sub { tag_callback } | ||||||
189 | | { | ||||||
190 | attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
191 | '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
192 | required => [qw(attrname attrname)], | ||||||
193 | tag => sub { tag_callback } | ||||||
194 | }, | ||||||
195 | |||||||
196 | '*' => 0 | 1 | sub { tag_callback } | ||||||
197 | | { | ||||||
198 | attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
199 | '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback}, | ||||||
200 | tag => sub { tag_callback } | ||||||
201 | } | ||||||
202 | |||||||
203 | } | ||||||
204 | |||||||
205 | EXAMPLE: | ||||||
206 | |||||||
207 | Rules => { | ||||||
208 | |||||||
209 | ########################## | ||||||
210 | ##### EXPLICIT RULES ##### | ||||||
211 | ########################## | ||||||
212 | |||||||
213 | ## Allow tags, reject tags |
||||||
214 | br => 1, | ||||||
215 | img => 0, | ||||||
216 | |||||||
217 | ## Send all tags to a sub |
||||||
218 | div => sub { tag_callback }, | ||||||
219 | |||||||
220 | ## Allow tags,and allow the 'cite' attribute |
||||||
221 | ## All other attributes are handled by the default C<*> | ||||||
222 | blockquote => { | ||||||
223 | cite => 1, | ||||||
224 | }, | ||||||
225 | |||||||
226 | ## Allow tags, and | ||||||
227 | a => { | ||||||
228 | |||||||
229 | ## Allow the 'title' attribute | ||||||
230 | title => 1, | ||||||
231 | |||||||
232 | ## Allow the 'href' attribute if it matches the regex | ||||||
233 | href => '^http://yourdomain.com' | ||||||
234 | OR href => qr{^http://yourdomain.com}, | ||||||
235 | |||||||
236 | ## 'style' attributes are handled by a sub | ||||||
237 | style => sub { attr_callback }, | ||||||
238 | |||||||
239 | ## All other attributes are rejected | ||||||
240 | '*' => 0, | ||||||
241 | |||||||
242 | ## Additionally, the tag should be handled by this sub | ||||||
243 | tag => sub { tag_callback}, | ||||||
244 | |||||||
245 | ## If the tag doesn't have these attributes, filter the tag | ||||||
246 | required => [qw(href title)], | ||||||
247 | |||||||
248 | }, | ||||||
249 | |||||||
250 | ########################## | ||||||
251 | ##### DEFAULT RULES ##### | ||||||
252 | ########################## | ||||||
253 | |||||||
254 | ## The default '*' rule - accepts all the same options as above. | ||||||
255 | ## If a tag or attribute is not mentioned above, then the default | ||||||
256 | ## rule is applied: | ||||||
257 | |||||||
258 | ## Reject all tags | ||||||
259 | '*' => 0, | ||||||
260 | |||||||
261 | ## Allow all tags and all attributes | ||||||
262 | '*' => 1, | ||||||
263 | |||||||
264 | ## Send all tags to the sub | ||||||
265 | '*' => sub { tag_callback }, | ||||||
266 | |||||||
267 | ## Allow all tags, reject all attributes | ||||||
268 | '*' => { '*' => 0 }, | ||||||
269 | |||||||
270 | ## Allow all tags, and | ||||||
271 | '*' => { | ||||||
272 | |||||||
273 | ## Allow the 'title' attribute | ||||||
274 | title => 1, | ||||||
275 | |||||||
276 | ## Allow the 'href' attribute if it matches the regex | ||||||
277 | href => '^http://yourdomain.com' | ||||||
278 | OR href => qr{^http://yourdomain.com}, | ||||||
279 | |||||||
280 | ## 'style' attributes are handled by a sub | ||||||
281 | style => sub { attr_callback }, | ||||||
282 | |||||||
283 | ## All other attributes are rejected | ||||||
284 | '*' => 0, | ||||||
285 | |||||||
286 | ## Additionally, all tags should be handled by this sub | ||||||
287 | tag => sub { tag_callback}, | ||||||
288 | |||||||
289 | }, | ||||||
290 | |||||||
291 | =over | ||||||
292 | |||||||
293 | =item Tag Callbacks | ||||||
294 | |||||||
295 | sub tag_callback { | ||||||
296 | my ($filter,$element) = (@_); | ||||||
297 | |||||||
298 | $element = { | ||||||
299 | tag => 'tag', | ||||||
300 | content => 'inner_html', | ||||||
301 | attr => { | ||||||
302 | attr_name => 'attr_value', | ||||||
303 | } | ||||||
304 | }; | ||||||
305 | return 0 | 1; | ||||||
306 | } | ||||||
307 | |||||||
308 | A tag callback accepts two parameters, the C<$filter> object and the C$element>. | ||||||
309 | It should return C<0> to completely ignore the tag and its content (which includes | ||||||
310 | any nested HTML tags), or C<1> to accept and output the tag. | ||||||
311 | |||||||
312 | The C<$element> is a hash ref containing the keys: | ||||||
313 | |||||||
314 | =item C |
||||||
315 | |||||||
316 | This is the tagname in lowercase, eg C, C , C. If you set |
||||||
317 | the tag value to an empty string, then the tag will not be outputted, but | ||||||
318 | the tag contents will. | ||||||
319 | |||||||
320 | =item C |
||||||
321 | |||||||
322 | This is the equivalent of DOM's innerHTML. It contains the text content | ||||||
323 | and any HTML tags contained within this element. You can change the content | ||||||
324 | or set it to an empty string so that it is not outputted. | ||||||
325 | |||||||
326 | =item C |
||||||
327 | |||||||
328 | C |
||||||
329 | |||||||
330 | =back | ||||||
331 | |||||||
332 | If for instance, you wanted to replace C<< >> tags with C<< >> tags, | ||||||
333 | you could do this: | ||||||
334 | |||||||
335 | sub b_callback { | ||||||
336 | my ($filter,$element) = @_; | ||||||
337 | $element->{tag} = 'span'; | ||||||
338 | $element->{attr}{style} = 'font-weight:bold'; | ||||||
339 | return 1; | ||||||
340 | } | ||||||
341 | |||||||
342 | =item Attribute Callbacks | ||||||
343 | |||||||
344 | sub attr_callback { | ||||||
345 | my ( $filter, $tag, $attr_name, $attr_val ) = @_; | ||||||
346 | return undef | '' | 'value'; | ||||||
347 | } | ||||||
348 | |||||||
349 | Attribute callbacks accept four parameters, the C<$filter> object, the C<$tag> | ||||||
350 | name, the C<$attr_name> and the C<$attr_value>. | ||||||
351 | |||||||
352 | It should return either C |
||||||
353 | used. An empty string keeps the attribute, but without a value. | ||||||
354 | |||||||
355 | =item C |
||||||
356 | |||||||
357 | It is not necessary to use C |
||||||
358 | via C |
||||||
359 | |||||||
360 | BanAllBut => [qw(p div span)] | ||||||
361 | |||||||
362 | The logic works as follows: | ||||||
363 | |||||||
364 | * If BanAllBut exists, then ban everything but the tags in the list | ||||||
365 | * Add to the ban list any elements in BanList | ||||||
366 | * Any tags mentioned explicitly in Rules (eg a => 0, br => 1) | ||||||
367 | are added or removed from the BanList | ||||||
368 | * A default rule of { '*' => 0 } would ban all tags except | ||||||
369 | those mentioned in Rules | ||||||
370 | * A default rule of { '*' => 1 } would allow all tags except | ||||||
371 | those disallowed in the ban list, or by explicit rules | ||||||
372 | |||||||
373 | =back | ||||||
374 | |||||||
375 | =cut | ||||||
376 | |||||||
377 | sub new { | ||||||
378 | 1487 | 1487 | 1 | 50381 | my ( $pkg, $cfg ) = @_; | ||
379 | |||||||
380 | 1487 | 66 | 11379 | my $self = bless {}, ref $pkg || $pkg; | |||
381 | 1487 | 4280 | $self->hss_init($cfg); | ||||
382 | 1487 | 5024 | return $self; | ||||
383 | } | ||||||
384 | |||||||
385 | =back | ||||||
386 | |||||||
387 | =head1 METHODS | ||||||
388 | |||||||
389 | This class provides the following methods: | ||||||
390 | |||||||
391 | =over | ||||||
392 | |||||||
393 | =item hss_init () | ||||||
394 | |||||||
395 | This method is called by new() and does the actual initialisation work | ||||||
396 | for the new HTML::StripScripts object. | ||||||
397 | |||||||
398 | =cut | ||||||
399 | |||||||
400 | sub hss_init { | ||||||
401 | 1487 | 1487 | 1 | 2208 | my ( $self, $cfg ) = @_; | ||
402 | 1487 | 100 | 3400 | $cfg ||= {}; | |||
403 | |||||||
404 | 1487 | 3758 | $self->{_hssCfg} = $cfg; | ||||
405 | |||||||
406 | 1487 | 4893 | $self->{_hssContext} = $self->init_context_whitelist; | ||||
407 | 1487 | 4195 | $self->{_hssAttrib} = $self->init_attrib_whitelist; | ||||
408 | 1487 | 3659 | $self->{_hssAttVal} = $self->init_attval_whitelist; | ||||
409 | 1487 | 5024 | $self->{_hssStyle} = $self->init_style_whitelist; | ||||
410 | 1487 | 3364 | $self->{_hssDeInter} = $self->init_deinter_whitelist; | ||||
411 | 1487 | 3822 | $self->{_hssBanList} = $self->_hss_prepare_ban_list($cfg); | ||||
412 | 1487 | 4767 | $self->{_hssRules} = $self->_hss_prepare_rules($cfg); | ||||
413 | } | ||||||
414 | |||||||
415 | =item input_start_document () | ||||||
416 | |||||||
417 | This method initializes the filter, and must be called once before | ||||||
418 | starting on each HTML document to be filtered. | ||||||
419 | |||||||
420 | =cut | ||||||
421 | |||||||
422 | sub input_start_document { | ||||||
423 | 1890 | 1890 | 1 | 117008 | my ( $self, $context ) = @_; | ||
424 | |||||||
425 | 1890 | 100 | 18711 | $self->{_hssStack} = [ { NAME => '', | |||
426 | CTX => $self->{_hssCfg}{Context} || 'Flow', | ||||||
427 | CONTENT => '', | ||||||
428 | } | ||||||
429 | ]; | ||||||
430 | 1890 | 4079 | $self->{_hssOutput} = ''; | ||||
431 | |||||||
432 | 1890 | 4184 | $self->output_start_document; | ||||
433 | } | ||||||
434 | |||||||
435 | =item input_start ( TEXT ) | ||||||
436 | |||||||
437 | Handles a start tag from the input document. TEXT must be the | ||||||
438 | full text of the tag, including angle-brackets. | ||||||
439 | |||||||
440 | =cut | ||||||
441 | |||||||
442 | sub input_start { | ||||||
443 | 3160 | 3160 | 1 | 12065 | my ( $self, $text ) = @_; | ||
444 | |||||||
445 | 3160 | 100 | 7691 | $self->_hss_accept_input_start($text) or $self->reject_start($text); | |||
446 | } | ||||||
447 | |||||||
448 | sub _hss_accept_input_start { | ||||||
449 | 3160 | 3160 | 17127 | my ( $self, $text ) = @_; | |||
450 | |||||||
451 | 3160 | 100 | 22110 | return 0 unless $text =~ m|^<([a-zA-Z0-9]+)\b(.*)>$|m; | |||
452 | 3153 | 11043 | my ( $tag, $attr ) = ( lc $1, $self->strip_nonprintable($2) ); | ||||
453 | |||||||
454 | 3153 | 50 | 10900 | return 0 if $self->{_hssSkipToEnd}; | |||
455 | 3153 | 50 | 33 | 21750 | if ( $tag eq 'script' or $tag eq 'style' ) { | ||
456 | 0 | 0 | $self->{_hssSkipToEnd} = $tag; | ||||
457 | 0 | 0 | return 0; | ||||
458 | } | ||||||
459 | |||||||
460 | 3153 | 100 | 8296 | return 0 if $self->_hss_tag_is_banned($tag); | |||
461 | |||||||
462 | 3119 | 7394 | my $allowed_attr = $self->{_hssAttrib}{$tag}; | ||||
463 | 3119 | 100 | 23106 | return 0 unless defined $allowed_attr; | |||
464 | |||||||
465 | 3117 | 100 | 7569 | return 0 unless $self->_hss_get_to_valid_context($tag); | |||
466 | |||||||
467 | 3114 | 8317 | my $default_filters = $self->{_hssRules}{'*'}; | ||||
468 | 3114 | 100 | 11896 | my $tag_filters = $self->{_hssRules}{$tag} | |||
469 | || $default_filters; | ||||||
470 | |||||||
471 | 3114 | 4409 | my %filtered_attr; | ||||
472 | 3114 | 26427 | while ( $attr | ||||
473 | =~ s#^\s*([\w\-]+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?## ) | ||||||
474 | { | ||||||
475 | 4365 | 10842 | my $key = lc $1; | ||||
476 | 4365 | 100 | 18540 | my $val = ( defined $2 ? $self->unquoted_to_canonical_form($2) | |||
100 | |||||||
100 | |||||||
477 | : defined $3 ? $self->quoted_to_canonical_form($3) | ||||||
478 | : defined $4 ? $self->quoted_to_canonical_form($4) | ||||||
479 | : '' | ||||||
480 | ); | ||||||
481 | |||||||
482 | 4365 | 9285 | my $value_class = $allowed_attr->{$key}; | ||||
483 | 4365 | 100 | 10402 | next unless defined $value_class; | |||
484 | |||||||
485 | 4364 | 10083 | my $attval_handler = $self->{_hssAttVal}{$value_class}; | ||||
486 | 4364 | 50 | 17098 | next unless defined $attval_handler; | |||
487 | |||||||
488 | 4364 | 4299 | my $attr_filter; | ||||
489 | 4364 | 100 | 10428 | if ($tag_filters) { | |||
490 | 4115 | 13420 | $attr_filter = | ||||
491 | $self->_hss_get_attr_filter( $default_filters, $tag_filters, | ||||||
492 | $key ); | ||||||
493 | |||||||
494 | # filter == 0 | ||||||
495 | 4115 | 100 | 13622 | next unless $attr_filter; | |||
496 | } | ||||||
497 | |||||||
498 | 3568 | 5259 | my $filtered_value = &{$attval_handler}( $self, $tag, $key, $val ); | ||||
3568 | 10105 | ||||||
499 | 3568 | 100 | 11221 | next unless defined $filtered_value; | |||
500 | |||||||
501 | # send value to filter if sub | ||||||
502 | 3555 | 100 | 100 | 22449 | if ( $tag_filters && ref $attr_filter ) { | ||
503 | 2392 | 7106 | $filtered_value | ||||
504 | = $attr_filter->( $self, $tag, $key, $filtered_value ); | ||||||
505 | 2392 | 100 | 16784 | next unless defined $filtered_value; | |||
506 | } | ||||||
507 | |||||||
508 | 3026 | 19755 | $filtered_attr{$key} = $filtered_value; | ||||
509 | |||||||
510 | } | ||||||
511 | |||||||
512 | # Check required attributes | ||||||
513 | 3114 | 100 | 12486 | if ( my $required = $tag_filters->{required} ) { | |||
514 | 3 | 6 | foreach my $key (@$required) { | ||||
515 | 5 | 100 | 66 | 40 | return 0 | ||
516 | unless defined $filtered_attr{$key} && length($filtered_attr{$key}); | ||||||
517 | } | ||||||
518 | } | ||||||
519 | |||||||
520 | # Check for callback | ||||||
521 | 3112 | 66 | 35163 | my $tag_callback = $tag_filters && $tag_filters->{tag} | |||
522 | || $default_filters->{tag}; | ||||||
523 | |||||||
524 | 3112 | 12487 | my $new_context = $self->{_hssContext}{ $self->{_hssStack}[0]{CTX} }{$tag}; | ||||
525 | |||||||
526 | 3112 | 37192 | my %stack_entry = ( NAME => $tag, | ||||
527 | ATTR => \%filtered_attr, | ||||||
528 | CTX => $new_context, | ||||||
529 | CALLBACK => $tag_callback, | ||||||
530 | CONTENT => '', | ||||||
531 | ); | ||||||
532 | 3112 | 100 | 8438 | if ( $new_context eq 'EMPTY' ) { | |||
533 | 1576 | 28076 | $self->output_stack_entry( \%stack_entry ); | ||||
534 | } | ||||||
535 | else { | ||||||
536 | 1536 | 1685 | unshift @{ $self->{_hssStack} }, \%stack_entry; | ||||
1536 | 4631 | ||||||
537 | |||||||
538 | } | ||||||
539 | |||||||
540 | 3112 | 27362 | return 1; | ||||
541 | } | ||||||
542 | |||||||
543 | =item input_end ( TEXT ) | ||||||
544 | |||||||
545 | Handles an end tag from the input document. TEXT must be the | ||||||
546 | full text of the end tag, including angle-brackets. | ||||||
547 | |||||||
548 | =cut | ||||||
549 | |||||||
550 | sub input_end { | ||||||
551 | 1508 | 1508 | 1 | 8189 | my ( $self, $text ) = @_; | ||
552 | |||||||
553 | 1508 | 100 | 4819 | $self->_hss_accept_input_end($text) or $self->reject_end($text); | |||
554 | } | ||||||
555 | |||||||
556 | sub _hss_accept_input_end { | ||||||
557 | 1508 | 1508 | 1876 | my ( $self, $text ) = @_; | |||
558 | |||||||
559 | 1508 | 100 | 7757 | return 0 unless $text =~ m#^(\w+)>$#; | |||
560 | 1505 | 3214 | my $tag = lc $1; | ||||
561 | |||||||
562 | 1505 | 50 | 3958 | if ( $self->{_hssSkipToEnd} ) { | |||
563 | 0 | 0 | 0 | if ( $self->{_hssSkipToEnd} eq $tag ) { | |||
564 | 0 | 0 | delete $self->{_hssSkipToEnd}; | ||||
565 | } | ||||||
566 | 0 | 0 | return 0; | ||||
567 | } | ||||||
568 | |||||||
569 | # Ignore a close without an open | ||||||
570 | 1505 | 100 | 1696 | return 0 unless grep { $_->{NAME} eq $tag } @{ $self->{_hssStack} }; | |||
3000 | 11015 | ||||||
1505 | 3863 | ||||||
571 | |||||||
572 | # Close open tags up to the matching open | ||||||
573 | 1478 | 3368 | my @close = (); | ||||
574 | |||||||
575 | 1478 | 1928 | while ( scalar @{ $self->{_hssStack} } ) { | ||||
1480 | 3984 | ||||||
576 | 1480 | 1827 | my $entry = shift @{ $self->{_hssStack} }; | ||||
1480 | 3611 | ||||||
577 | 1480 | 3942 | push @close, $entry; | ||||
578 | 1480 | 4512 | $self->output_stack_entry($entry); | ||||
579 | 1480 | 3342 | $entry->{CONTENT} = ''; | ||||
580 | 1480 | 100 | 4419 | last if $entry->{NAME} eq $tag; | |||
581 | } | ||||||
582 | |||||||
583 | # Reopen any we closed early if all that were closed are | ||||||
584 | # configured to be auto de-interleaved. | ||||||
585 | 1478 | 100 | 2287 | unless ( grep { !$self->{_hssDeInter}{ $_->{NAME} } } @close ) { | |||
1480 | 10024 | ||||||
586 | 89 | 185 | pop @close; | ||||
587 | 89 | 238 | unshift @{ $self->{_hssStack} }, @close; | ||||
89 | 190 | ||||||
588 | } | ||||||
589 | |||||||
590 | 1478 | 9255 | return 1; | ||||
591 | } | ||||||
592 | |||||||
593 | =item input_text ( TEXT ) | ||||||
594 | |||||||
595 | Handles some non-tag text from the input document. | ||||||
596 | |||||||
597 | =cut | ||||||
598 | |||||||
599 | sub input_text { | ||||||
600 | 382 | 382 | 1 | 1610 | my ( $self, $text ) = @_; | ||
601 | |||||||
602 | 382 | 50 | 1020 | return if $self->{_hssSkipToEnd}; | |||
603 | |||||||
604 | 382 | 1010 | $text = $self->strip_nonprintable($text); | ||||
605 | |||||||
606 | 382 | 100 | 1742 | if ( $text =~ /^(\s*)$/ ) { | |||
607 | 2 | 7 | $self->output_text($1); | ||||
608 | 2 | 5 | return; | ||||
609 | } | ||||||
610 | |||||||
611 | 380 | 100 | 932 | unless ( $self->_hss_get_to_valid_context('CDATA') ) { | |||
612 | 1 | 5 | $self->reject_text($text); | ||||
613 | 1 | 3 | return; | ||||
614 | } | ||||||
615 | |||||||
616 | 379 | 1152 | my $filtered = $self->filter_text( $self->text_to_canonical_form($text) ); | ||||
617 | 379 | 1284 | $self->output_text( $self->canonical_form_to_text($filtered) ); | ||||
618 | } | ||||||
619 | |||||||
620 | =item input_process ( TEXT ) | ||||||
621 | |||||||
622 | Handles a processing instruction from the input document. | ||||||
623 | |||||||
624 | =cut | ||||||
625 | |||||||
626 | sub input_process { | ||||||
627 | 1 | 1 | 1 | 10 | my ( $self, $text ) = @_; | ||
628 | |||||||
629 | 1 | 7 | $self->reject_process($text); | ||||
630 | } | ||||||
631 | |||||||
632 | =item input_comment ( TEXT ) | ||||||
633 | |||||||
634 | Handles an HTML comment from the input document. | ||||||
635 | |||||||
636 | =cut | ||||||
637 | |||||||
638 | sub input_comment { | ||||||
639 | 1 | 1 | 1 | 9 | my ( $self, $text ) = @_; | ||
640 | |||||||
641 | 1 | 4 | $self->reject_comment($text); | ||||
642 | } | ||||||
643 | |||||||
644 | =item input_declaration ( TEXT ) | ||||||
645 | |||||||
646 | Handles an declaration from the input document. | ||||||
647 | |||||||
648 | =cut | ||||||
649 | |||||||
650 | sub input_declaration { | ||||||
651 | 1 | 1 | 1 | 11 | my ( $self, $text ) = @_; | ||
652 | |||||||
653 | 1 | 3 | $self->reject_declaration($text); | ||||
654 | } | ||||||
655 | |||||||
656 | =item input_end_document () | ||||||
657 | |||||||
658 | Call this method to signal the end of the input document. | ||||||
659 | |||||||
660 | =cut | ||||||
661 | |||||||
662 | sub input_end_document { | ||||||
663 | 1890 | 1890 | 1 | 8945 | my ($self) = @_; | ||
664 | |||||||
665 | 1890 | 3165 | delete $self->{_hssSkipToEnd}; | ||||
666 | |||||||
667 | 1890 | 2456 | while ( @{ $self->{_hssStack} } > 1 ) { | ||||
1946 | 7671 | ||||||
668 | 56 | 99 | $self->output_stack_entry( shift @{ $self->{_hssStack} } ); | ||||
56 | 186 | ||||||
669 | } | ||||||
670 | |||||||
671 | 1890 | 4372 | $self->output_end_document; | ||||
672 | 1890 | 2330 | my $last_entry = shift @{ $self->{_hssStack} }; | ||||
1890 | 4011 | ||||||
673 | 1890 | 4052 | $self->{_hssOutput} = $last_entry->{CONTENT}; | ||||
674 | 1890 | 9422 | delete $self->{_hssStack}; | ||||
675 | |||||||
676 | } | ||||||
677 | |||||||
678 | =item filtered_document () | ||||||
679 | |||||||
680 | Returns the filtered document as a string. | ||||||
681 | |||||||
682 | =cut | ||||||
683 | |||||||
684 | sub filtered_document { | ||||||
685 | 1890 | 1890 | 1 | 5794 | my ($self) = @_; | ||
686 | 1890 | 12333 | $self->{_hssOutput}; | ||||
687 | } | ||||||
688 | |||||||
689 | =back | ||||||
690 | |||||||
691 | =cut | ||||||
692 | |||||||
693 | =head1 SUBCLASSING | ||||||
694 | |||||||
695 | The only reason for subclassing this module now is to add to the | ||||||
696 | list of accepted tags, attributes and styles (See | ||||||
697 | L"WHITELIST INITIALIZATION METHODS">). Everything else can be | ||||||
698 | achieved with L"Rules">. | ||||||
699 | |||||||
700 | The C |
||||||
701 | hashes and C |
||||||
702 | C<_hss>. The filter configuration can be set up by invoking the | ||||||
703 | hss_init() method, which takes the same arguments as new(). | ||||||
704 | |||||||
705 | =head1 OUTPUT METHODS | ||||||
706 | |||||||
707 | The filter outputs a stream of start tags, end tags, text, comments, | ||||||
708 | declarations and processing instructions, via the following C |
||||||
709 | methods. Subclasses may override these to intercept the filter output. | ||||||
710 | |||||||
711 | The default implementations of the C |
||||||
712 | text on to the output() method. The default implementation of the | ||||||
713 | output() method appends the text to a string, which can be fetched with | ||||||
714 | the filtered_document() method once processing is complete. | ||||||
715 | |||||||
716 | If the output() method or the individual C |
||||||
717 | overridden in a subclass, then filtered_document() will not work in | ||||||
718 | that subclass. | ||||||
719 | |||||||
720 | =over | ||||||
721 | |||||||
722 | =item output_start_document () | ||||||
723 | |||||||
724 | This method gets called once at the start of each HTML document passed | ||||||
725 | through the filter. The default implementation does nothing. | ||||||
726 | |||||||
727 | =cut | ||||||
728 | |||||||
729 | 3778 | 3778 | 1 | 6867 | sub output_start_document { } | ||
730 | |||||||
731 | =item output_end_document () | ||||||
732 | |||||||
733 | This method gets called once at the end of each HTML document passed | ||||||
734 | through the filter. The default implementation does nothing. | ||||||
735 | |||||||
736 | =cut | ||||||
737 | |||||||
738 | *output_end_document = \&output_start_document; | ||||||
739 | |||||||
740 | =item output_start ( TEXT ) | ||||||
741 | |||||||
742 | This method is used to output a filtered start tag. | ||||||
743 | |||||||
744 | =cut | ||||||
745 | |||||||
746 | 5108 | 5108 | 1 | 21565 | sub output_start { $_[0]->output( $_[1] ) } | ||
747 | |||||||
748 | =item output_end ( TEXT ) | ||||||
749 | |||||||
750 | This method is used to output a filtered end tag. | ||||||
751 | |||||||
752 | =cut | ||||||
753 | |||||||
754 | *output_end = \&output_start; | ||||||
755 | |||||||
756 | =item output_text ( TEXT ) | ||||||
757 | |||||||
758 | This method is used to output some filtered non-tag text. | ||||||
759 | |||||||
760 | =cut | ||||||
761 | |||||||
762 | *output_text = \&output_start; | ||||||
763 | |||||||
764 | =item output_declaration ( TEXT ) | ||||||
765 | |||||||
766 | This method is used to output a filtered declaration. | ||||||
767 | |||||||
768 | =cut | ||||||
769 | |||||||
770 | *output_declaration = \&output_start; | ||||||
771 | |||||||
772 | =item output_comment ( TEXT ) | ||||||
773 | |||||||
774 | This method is used to output a filtered HTML comment. | ||||||
775 | |||||||
776 | =cut | ||||||
777 | |||||||
778 | *output_comment = \&output_start; | ||||||
779 | |||||||
780 | =item output_process ( TEXT ) | ||||||
781 | |||||||
782 | This method is used to output a filtered processing instruction. | ||||||
783 | |||||||
784 | =cut | ||||||
785 | |||||||
786 | *output_process = \&output_start; | ||||||
787 | |||||||
788 | =item output ( TEXT ) | ||||||
789 | |||||||
790 | This method is invoked by all of the default C |
||||||
791 | default implementation appends the text to the string that the | ||||||
792 | filtered_document() method will return. | ||||||
793 | |||||||
794 | =cut | ||||||
795 | |||||||
796 | 5111 | 5111 | 1 | 25077 | sub output { $_[0]->{_hssStack}[0]{CONTENT} .= $_[1]; } | ||
797 | |||||||
798 | =item output_stack_entry ( TEXT ) | ||||||
799 | |||||||
800 | This method is invoked when a tag plus all text and nested HTML content | ||||||
801 | within the tag has been processed. It adds the tag plus its content | ||||||
802 | to the content for its parent tag. | ||||||
803 | |||||||
804 | =cut | ||||||
805 | |||||||
806 | sub output_stack_entry { | ||||||
807 | 3113 | 3113 | 1 | 5399 | my ( $self, $tag ) = @_; | ||
808 | |||||||
809 | 3113 | 4516 | my %entry; | ||||
810 | 3113 | 4091 | @entry{qw(tag attr content)} = @{$tag}{qw(NAME ATTR CONTENT)}; | ||||
3113 | 15606 | ||||||
811 | |||||||
812 | 3113 | 100 | 25025 | if ( my $tag_callback = $tag->{CALLBACK} ) { | |||
813 | 41 | 100 | 136 | $tag_callback->( $self, \%entry ) | |||
814 | or return; | ||||||
815 | } | ||||||
816 | |||||||
817 | 3112 | 6062 | my $tagname = $entry{tag}; | ||||
818 | 3112 | 8043 | my $filtered_attrs = $self->_hss_join_attribs( $entry{attr} ); | ||||
819 | |||||||
820 | 3112 | 100 | 11106 | if ( $tag->{CTX} eq 'EMPTY' ) { | |||
821 | 1576 | 50 | 8705 | $self->output_start("<$tagname$filtered_attrs />") | |||
822 | if $entry{tag}; | ||||||
823 | 1576 | 4975 | return; | ||||
824 | } | ||||||
825 | 1536 | 100 | 3442 | if ($tagname) { | |||
826 | 1535 | 9705 | $self->output_start("<$tagname$filtered_attrs>"); | ||||
827 | } | ||||||
828 | |||||||
829 | 1536 | 50 | 4122 | if ( defined $entry{content} ) { | |||
830 | 1536 | 4086 | $self->{_hssStack}[0]{CONTENT} .= $entry{content}; | ||||
831 | } | ||||||
832 | |||||||
833 | 1536 | 100 | 3083 | if ($tagname) { | |||
834 | 1535 | 4776 | $self->output_end("$tagname>"); | ||||
835 | } | ||||||
836 | } | ||||||
837 | |||||||
838 | =back | ||||||
839 | |||||||
840 | =head1 REJECT METHODS | ||||||
841 | |||||||
842 | When the filter encounters something in the input document which it | ||||||
843 | cannot transform into an acceptable construct, it invokes one of | ||||||
844 | the following C |
||||||
845 | document to take the place of the unacceptable construct. | ||||||
846 | |||||||
847 | The TEXT parameter is the full text of the unacceptable construct. | ||||||
848 | |||||||
849 | The default implementations of these methods output an HTML comment | ||||||
850 | containing the text C |
||||||
851 | is set to true, then the rejected text is HTML escaped instead. | ||||||
852 | |||||||
853 | Subclasses may override these methods, but should exercise caution. | ||||||
854 | The TEXT parameter is unfiltered input and may contain malicious | ||||||
855 | constructs. | ||||||
856 | |||||||
857 | =over | ||||||
858 | |||||||
859 | =item reject_start ( TEXT ) | ||||||
860 | |||||||
861 | =item reject_end ( TEXT ) | ||||||
862 | |||||||
863 | =item reject_text ( TEXT ) | ||||||
864 | |||||||
865 | =item reject_declaration ( TEXT ) | ||||||
866 | |||||||
867 | =item reject_comment ( TEXT ) | ||||||
868 | |||||||
869 | =item reject_process ( TEXT ) | ||||||
870 | |||||||
871 | =back | ||||||
872 | |||||||
873 | =cut | ||||||
874 | |||||||
875 | sub reject_start { | ||||||
876 | 76 | 100 | 76 | 1 | 599 | $_[0]->{_hssCfg}{EscapeFiltered} | |
877 | ? $_[0]->output_text( $_[0]->escape_html_metachars( $_[1] ) ) | ||||||
878 | : $_[0]->output_comment(''); | ||||||
879 | } | ||||||
880 | *reject_end = \&reject_start; | ||||||
881 | *reject_text = \&reject_start; | ||||||
882 | *reject_declaration = \&reject_start; | ||||||
883 | *reject_comment = \&reject_start; | ||||||
884 | *reject_process = \&reject_start; | ||||||
885 | |||||||
886 | =head1 WHITELIST INITIALIZATION METHODS | ||||||
887 | |||||||
888 | The filter refers to various whitelists to determine which constructs | ||||||
889 | are acceptable. To modify these whitelists, subclasses can override | ||||||
890 | the following methods. | ||||||
891 | |||||||
892 | Each method is called once at object initialization time, and must | ||||||
893 | return a reference to a nested data structure. These references are | ||||||
894 | installed into the object, and used whenever the filter needs to refer | ||||||
895 | to a whitelist. | ||||||
896 | |||||||
897 | The default implementations of these methods can be invoked as class | ||||||
898 | methods. | ||||||
899 | |||||||
900 | See examples/tags/ and examples/declaration/ for examples of how to | ||||||
901 | override these methods. | ||||||
902 | |||||||
903 | =over | ||||||
904 | |||||||
905 | =item init_context_whitelist () | ||||||
906 | |||||||
907 | Returns a reference to the C |
||||||
908 | which tags may appear at each point in the document, and which other | ||||||
909 | tags may be nested within them. | ||||||
910 | |||||||
911 | It is a hash, and the keys are context names, such as C |
||||||
912 | C |
||||||
913 | |||||||
914 | The values in the hash are hashrefs. The keys in these subhashes are | ||||||
915 | lowercase tag names, and the values are context names, specifying the | ||||||
916 | context that the tag provides to any other tags nested within it. | ||||||
917 | |||||||
918 | The special context C |
||||||
919 | nothing can be nested within that tag. | ||||||
920 | |||||||
921 | =cut | ||||||
922 | |||||||
923 | 10 | 10 | 132 | use vars qw(%_Context); | |||
10 | 36 | ||||||
10 | 3732 | ||||||
924 | |||||||
925 | BEGIN { | ||||||
926 | |||||||
927 | 10 | 10 | 295 | my %pre_content = ( 'br' => 'EMPTY', | |||
928 | 'span' => 'Inline', | ||||||
929 | 'tt' => 'Inline', | ||||||
930 | 'i' => 'Inline', | ||||||
931 | 'b' => 'Inline', | ||||||
932 | 'u' => 'Inline', | ||||||
933 | 's' => 'Inline', | ||||||
934 | 'strike' => 'Inline', | ||||||
935 | 'em' => 'Inline', | ||||||
936 | 'strong' => 'Inline', | ||||||
937 | 'dfn' => 'Inline', | ||||||
938 | 'code' => 'Inline', | ||||||
939 | 'q' => 'Inline', | ||||||
940 | 'samp' => 'Inline', | ||||||
941 | 'kbd' => 'Inline', | ||||||
942 | 'var' => 'Inline', | ||||||
943 | 'cite' => 'Inline', | ||||||
944 | 'abbr' => 'Inline', | ||||||
945 | 'acronym' => 'Inline', | ||||||
946 | 'ins' => 'Inline', | ||||||
947 | 'del' => 'Inline', | ||||||
948 | 'a' => 'Inline', | ||||||
949 | 'CDATA' => 'CDATA', | ||||||
950 | ); | ||||||
951 | |||||||
952 | 10 | 197 | my %inline = ( %pre_content, | ||||
953 | 'img' => 'EMPTY', | ||||||
954 | 'big' => 'Inline', | ||||||
955 | 'small' => 'Inline', | ||||||
956 | 'sub' => 'Inline', | ||||||
957 | 'sup' => 'Inline', | ||||||
958 | 'font' => 'Inline', | ||||||
959 | 'nobr' => 'Inline', | ||||||
960 | ); | ||||||
961 | |||||||
962 | 10 | 415 | my %flow = ( %inline, | ||||
963 | 'ins' => 'Flow', | ||||||
964 | 'del' => 'Flow', | ||||||
965 | 'div' => 'Flow', | ||||||
966 | 'p' => 'Inline', | ||||||
967 | 'h1' => 'Inline', | ||||||
968 | 'h2' => 'Inline', | ||||||
969 | 'h3' => 'Inline', | ||||||
970 | 'h4' => 'Inline', | ||||||
971 | 'h5' => 'Inline', | ||||||
972 | 'h6' => 'Inline', | ||||||
973 | 'ul' => 'list', | ||||||
974 | 'ol' => 'list', | ||||||
975 | 'menu' => 'list', | ||||||
976 | 'dir' => 'list', | ||||||
977 | 'dl' => 'dt_dd', | ||||||
978 | 'address' => 'Inline', | ||||||
979 | 'hr' => 'EMPTY', | ||||||
980 | 'pre' => 'pre.content', | ||||||
981 | 'blockquote' => 'Flow', | ||||||
982 | 'center' => 'Flow', | ||||||
983 | 'table' => 'table', | ||||||
984 | ); | ||||||
985 | |||||||
986 | 10 | 107 | my %table = ( 'caption' => 'Inline', | ||||
987 | 'thead' => 'tr_only', | ||||||
988 | 'tfoot' => 'tr_only', | ||||||
989 | 'tbody' => 'tr_only', | ||||||
990 | 'colgroup' => 'colgroup', | ||||||
991 | 'col' => 'EMPTY', | ||||||
992 | 'tr' => 'th_td', | ||||||
993 | ); | ||||||
994 | |||||||
995 | 10 | 35 | my %head = ( 'title' => 'NoTags', ); | ||||
996 | |||||||
997 | 10 | 1381 | %_Context = ( 'Document' => { 'html' => 'Html' }, | ||||
998 | 'Html' => { 'head' => 'Head', 'body' => 'Flow' }, | ||||||
999 | 'Head' => \%head, | ||||||
1000 | 'Inline' => \%inline, | ||||||
1001 | 'Flow' => \%flow, | ||||||
1002 | 'NoTags' => { 'CDATA' => 'CDATA' }, | ||||||
1003 | 'pre.content' => \%pre_content, | ||||||
1004 | 'table' => \%table, | ||||||
1005 | 'list' => { 'li' => 'Flow' }, | ||||||
1006 | 'dt_dd' => { 'dt' => 'Inline', 'dd' => 'Flow' }, | ||||||
1007 | 'tr_only' => { 'tr' => 'th_td' }, | ||||||
1008 | 'colgroup' => { 'col' => 'EMPTY' }, | ||||||
1009 | 'th_td' => { 'th' => 'Flow', 'td' => 'Flow' }, | ||||||
1010 | ); | ||||||
1011 | } | ||||||
1012 | |||||||
1013 | 1487 | 1487 | 1 | 4369 | sub init_context_whitelist { return \%_Context; } | ||
1014 | |||||||
1015 | =item init_attrib_whitelist () | ||||||
1016 | |||||||
1017 | Returns a reference to the C |
||||||
1018 | attributes each tag can have and the values that those attributes can | ||||||
1019 | take. | ||||||
1020 | |||||||
1021 | It is a hash, and the keys are lowercase tag names. | ||||||
1022 | |||||||
1023 | The values in the hash are hashrefs. The keys in these subhashes are | ||||||
1024 | lowercase attribute names, and the values are attribute value class names, | ||||||
1025 | which are short strings describing the type of values that the | ||||||
1026 | attribute can take, such as C |
||||||
1027 | |||||||
1028 | =cut | ||||||
1029 | |||||||
1030 | 10 | 10 | 64 | use vars qw(%_Attrib); | |||
10 | 18 | ||||||
10 | 6646 | ||||||
1031 | |||||||
1032 | BEGIN { | ||||||
1033 | |||||||
1034 | 10 | 10 | 47 | my %attr = ( 'style' => 'style' ); | |||
1035 | |||||||
1036 | 10 | 60 | my %font_attr = ( %attr, | ||||
1037 | 'size' => 'size', | ||||||
1038 | 'face' => 'wordlist', | ||||||
1039 | 'color' => 'color', | ||||||
1040 | ); | ||||||
1041 | |||||||
1042 | 10 | 46 | my %insdel_attr = ( %attr, | ||||
1043 | 'cite' => 'href', | ||||||
1044 | 'datetime' => 'text', | ||||||
1045 | ); | ||||||
1046 | |||||||
1047 | 10 | 661 | my %texta_attr = ( %attr, 'align' => 'word', ); | ||||
1048 | |||||||
1049 | 10 | 40 | my %cellha_attr = ( 'align' => 'word', | ||||
1050 | 'char' => 'word', | ||||||
1051 | 'charoff' => 'size', | ||||||
1052 | ); | ||||||
1053 | |||||||
1054 | 10 | 27 | my %cellva_attr = ( 'valign' => 'word', ); | ||||
1055 | |||||||
1056 | 10 | 183 | my %cellhv_attr = ( %attr, %cellha_attr, %cellva_attr ); | ||||
1057 | |||||||
1058 | 10 | 65 | my %col_attr = ( %attr, %cellhv_attr, | ||||
1059 | 'width' => 'size', | ||||||
1060 | 'span' => 'number', | ||||||
1061 | ); | ||||||
1062 | |||||||
1063 | 10 | 198 | my %thtd_attr = ( %attr, | ||||
1064 | 'abbr' => 'text', | ||||||
1065 | 'axis' => 'text', | ||||||
1066 | 'headers' => 'text', | ||||||
1067 | 'scope' => 'word', | ||||||
1068 | 'rowspan' => 'number', | ||||||
1069 | 'colspan' => 'number', | ||||||
1070 | %cellhv_attr, | ||||||
1071 | 'nowrap' => 'novalue', | ||||||
1072 | 'bgcolor' => 'color', | ||||||
1073 | 'width' => 'size', | ||||||
1074 | 'height' => 'size', | ||||||
1075 | 'bordercolor' => 'color', | ||||||
1076 | 'bordercolorlight' => 'color', | ||||||
1077 | 'bordercolordark' => 'color', | ||||||
1078 | ); | ||||||
1079 | |||||||
1080 | 10 | 1706 | %_Attrib = ( 'br' => { 'clear' => 'word' }, | ||||
1081 | 'em' => \%attr, | ||||||
1082 | 'strong' => \%attr, | ||||||
1083 | 'dfn' => \%attr, | ||||||
1084 | 'code' => \%attr, | ||||||
1085 | 'samp' => \%attr, | ||||||
1086 | 'kbd' => \%attr, | ||||||
1087 | 'var' => \%attr, | ||||||
1088 | 'cite' => \%attr, | ||||||
1089 | 'abbr' => \%attr, | ||||||
1090 | 'acronym' => \%attr, | ||||||
1091 | 'q' => { %attr, 'cite' => 'href' }, | ||||||
1092 | 'blockquote' => { %attr, 'cite' => 'href' }, | ||||||
1093 | 'sub' => \%attr, | ||||||
1094 | 'sup' => \%attr, | ||||||
1095 | 'tt' => \%attr, | ||||||
1096 | 'i' => \%attr, | ||||||
1097 | 'b' => \%attr, | ||||||
1098 | 'big' => \%attr, | ||||||
1099 | 'small' => \%attr, | ||||||
1100 | 'u' => \%attr, | ||||||
1101 | 's' => \%attr, | ||||||
1102 | 'strike' => \%attr, | ||||||
1103 | 'font' => \%font_attr, | ||||||
1104 | 'table' => { | ||||||
1105 | %attr, | ||||||
1106 | 'frame' => 'word', | ||||||
1107 | 'rules' => 'word', | ||||||
1108 | %texta_attr, | ||||||
1109 | 'bgcolor' => 'color', | ||||||
1110 | 'background' => 'src', | ||||||
1111 | 'width' => 'size', | ||||||
1112 | 'height' => 'size', | ||||||
1113 | 'cellspacing' => 'size', | ||||||
1114 | 'cellpadding' => 'size', | ||||||
1115 | 'border' => 'size', | ||||||
1116 | 'bordercolor' => 'color', | ||||||
1117 | 'bordercolorlight' => 'color', | ||||||
1118 | 'bordercolordark' => 'color', | ||||||
1119 | 'summary' => 'text', | ||||||
1120 | }, | ||||||
1121 | 'caption' => { %attr, 'align' => 'word', }, | ||||||
1122 | 'colgroup' => \%col_attr, | ||||||
1123 | 'col' => \%col_attr, | ||||||
1124 | 'thead' => \%cellhv_attr, | ||||||
1125 | 'tfoot' => \%cellhv_attr, | ||||||
1126 | 'tbody' => \%cellhv_attr, | ||||||
1127 | 'tr' => { | ||||||
1128 | %attr, | ||||||
1129 | bgcolor => 'color', | ||||||
1130 | %cellhv_attr, | ||||||
1131 | }, | ||||||
1132 | 'th' => \%thtd_attr, | ||||||
1133 | 'td' => \%thtd_attr, | ||||||
1134 | 'ins' => \%insdel_attr, | ||||||
1135 | 'del' => \%insdel_attr, | ||||||
1136 | 'a' => { %attr, href => 'href', }, | ||||||
1137 | 'h1' => \%texta_attr, | ||||||
1138 | 'h2' => \%texta_attr, | ||||||
1139 | 'h3' => \%texta_attr, | ||||||
1140 | 'h4' => \%texta_attr, | ||||||
1141 | 'h5' => \%texta_attr, | ||||||
1142 | 'h6' => \%texta_attr, | ||||||
1143 | 'p' => \%texta_attr, | ||||||
1144 | 'div' => \%texta_attr, | ||||||
1145 | 'span' => \%texta_attr, | ||||||
1146 | 'ul' => { | ||||||
1147 | %attr, | ||||||
1148 | 'type' => 'word', | ||||||
1149 | 'compact' => 'novalue', | ||||||
1150 | }, | ||||||
1151 | 'ol' => { %attr, | ||||||
1152 | 'type' => 'text', | ||||||
1153 | 'compact' => 'novalue', | ||||||
1154 | 'start' => 'number', | ||||||
1155 | }, | ||||||
1156 | 'li' => { %attr, | ||||||
1157 | 'type' => 'text', | ||||||
1158 | 'value' => 'number', | ||||||
1159 | }, | ||||||
1160 | 'dl' => { %attr, 'compact' => 'novalue' }, | ||||||
1161 | 'dt' => \%attr, | ||||||
1162 | 'dd' => \%attr, | ||||||
1163 | 'address' => \%attr, | ||||||
1164 | 'hr' => { | ||||||
1165 | %texta_attr, | ||||||
1166 | 'width' => 'size', | ||||||
1167 | 'size' => 'size', | ||||||
1168 | 'noshade' => 'novalue', | ||||||
1169 | }, | ||||||
1170 | 'pre' => { %attr, 'width' => 'size' }, | ||||||
1171 | 'center' => \%attr, | ||||||
1172 | 'nobr' => {}, | ||||||
1173 | 'img' => { | ||||||
1174 | 'src' => 'src', | ||||||
1175 | 'alt' => 'text', | ||||||
1176 | 'width' => 'size', | ||||||
1177 | 'height' => 'size', | ||||||
1178 | 'border' => 'size', | ||||||
1179 | 'hspace' => 'size', | ||||||
1180 | 'vspace' => 'size', | ||||||
1181 | 'align' => 'word', | ||||||
1182 | }, | ||||||
1183 | 'body' => { 'bgcolor' => 'color', | ||||||
1184 | 'background' => 'src', | ||||||
1185 | 'link' => 'color', | ||||||
1186 | 'vlink' => 'color', | ||||||
1187 | 'alink' => 'color', | ||||||
1188 | 'text' => 'color', | ||||||
1189 | }, | ||||||
1190 | 'head' => {}, | ||||||
1191 | 'title' => {}, | ||||||
1192 | 'html' => {}, | ||||||
1193 | ); | ||||||
1194 | } | ||||||
1195 | |||||||
1196 | 1487 | 1487 | 1 | 4684 | sub init_attrib_whitelist { return \%_Attrib; } | ||
1197 | |||||||
1198 | =item init_attval_whitelist () | ||||||
1199 | |||||||
1200 | Returns a reference to the C |
||||||
1201 | attribute value class names from the C |
||||||
1202 | subs to validate (and optionally transform) a particular attribute value. | ||||||
1203 | |||||||
1204 | The filter calls the attribute value validation subs with the | ||||||
1205 | following parameters: | ||||||
1206 | |||||||
1207 | =over | ||||||
1208 | |||||||
1209 | =item C |
||||||
1210 | |||||||
1211 | A reference to the filter object. | ||||||
1212 | |||||||
1213 | =item C |
||||||
1214 | |||||||
1215 | The lowercase name of the tag in which the attribute appears. | ||||||
1216 | |||||||
1217 | =item C |
||||||
1218 | |||||||
1219 | The name of the attribute. | ||||||
1220 | |||||||
1221 | =item C |
||||||
1222 | |||||||
1223 | The attribute value found in the input document, in canonical form | ||||||
1224 | (see L"CANONICAL FORM">). | ||||||
1225 | |||||||
1226 | =back | ||||||
1227 | |||||||
1228 | The validation sub can return undef to indicate that the attribute | ||||||
1229 | should be removed from the tag, or it can return the new value for | ||||||
1230 | the attribute, in canonical form. | ||||||
1231 | |||||||
1232 | =cut | ||||||
1233 | |||||||
1234 | 10 | 10 | 63 | use vars qw(%_AttVal); | |||
10 | 15 | ||||||
10 | 1601 | ||||||
1235 | |||||||
1236 | BEGIN { | ||||||
1237 | 10 | 10 | 823 | %_AttVal = ( 'style' => \&_hss_attval_style, | |||
1238 | 'size' => \&_hss_attval_size, | ||||||
1239 | 'number' => \&_hss_attval_number, | ||||||
1240 | 'color' => \&_hss_attval_color, | ||||||
1241 | 'text' => \&_hss_attval_text, | ||||||
1242 | 'word' => \&_hss_attval_word, | ||||||
1243 | 'wordlist' => \&_hss_attval_wordlist, | ||||||
1244 | 'wordlistq' => \&_hss_attval_wordlistq, | ||||||
1245 | 'href' => \&_hss_attval_href, | ||||||
1246 | 'src' => \&_hss_attval_src, | ||||||
1247 | 'stylesrc' => \&_hss_attval_stylesrc, | ||||||
1248 | 'novalue' => \&_hss_attval_novalue, | ||||||
1249 | ); | ||||||
1250 | } | ||||||
1251 | |||||||
1252 | 1487 | 1487 | 1 | 3882 | sub init_attval_whitelist { return \%_AttVal; } | ||
1253 | |||||||
1254 | =item init_style_whitelist () | ||||||
1255 | |||||||
1256 | Returns a reference to the C |