blib/lib/HTML/Laundry.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 396 | 487 | 81.3 |
branch | 131 | 160 | 81.8 |
condition | 11 | 21 | 52.3 |
subroutine | 51 | 53 | 96.2 |
pod | 18 | 18 | 100.0 |
total | 607 | 739 | 82.1 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | ######################################################## | ||||||
2 | # Copyright © 2009 Six Apart, Ltd. | ||||||
3 | |||||||
4 | package HTML::Laundry; | ||||||
5 | |||||||
6 | 15 | 15 | 48448 | use strict; | |||
15 | 37 | ||||||
15 | 646 | ||||||
7 | 15 | 15 | 96 | use warnings; | |||
15 | 30 | ||||||
15 | 516 | ||||||
8 | |||||||
9 | 15 | 15 | 408 | use 5.008; | |||
15 | 66 | ||||||
15 | 612 | ||||||
10 | 15 | 15 | 13898 | use version; our $VERSION = 0.0103; | |||
15 | 38786 | ||||||
15 | 111 | ||||||
11 | |||||||
12 | =head1 NAME | ||||||
13 | |||||||
14 | HTML::Laundry - Perl module to clean HTML by the piece | ||||||
15 | |||||||
16 | =head1 VERSION | ||||||
17 | |||||||
18 | Version 0.0103 | ||||||
19 | |||||||
20 | =head1 SYNOPSIS | ||||||
21 | |||||||
22 | #!/usr/bin/perl -w | ||||||
23 | use strict; | ||||||
24 | use HTML::Laundry; | ||||||
25 | my $laundry = HTML::Laundry->new(); | ||||||
26 | my $snippet = q{ | ||||||
27 | |||||||
28 | If your gloves are sterilized |
||||||
29 | Rinse your mouth with Listerine | ||||||
30 | Blow disinfectant in her eyes" |
||||||
31 | -- X-Ray Spex, Germ-Free Adolescents | ||||||
32 | |||||||
33 | }; | ||||||
34 | my $germfree = $laundry->clean($snippet); | ||||||
35 | # $germfree is now: | ||||||
36 | # "You may get to touch her |
||||||
37 | # If your gloves are sterilized |
||||||
38 | # Rinse your mouth with Listerine |
||||||
39 | # Blow disinfectant in her eyes" |
||||||
40 | # -- X-Ray Spex, Germ-Free Adolescents | ||||||
41 | |||||||
42 | =head1 DESCRIPTION | ||||||
43 | |||||||
44 | HTML::Laundry is an L |
||||||
45 | meant for small pieces of HTML, such as user comments, Atom feed entries, | ||||||
46 | and the like, rather than full pages. Laundry takes these and returns clean, | ||||||
47 | sanitary, UTF-8-based XHTML. The parser's behavior may be changed with | ||||||
48 | callbacks, and the whitelist of acceptable elements and attributes may be | ||||||
49 | updated on the fly. | ||||||
50 | |||||||
51 | A snippet is cleaned several ways: | ||||||
52 | |||||||
53 | =over 4 | ||||||
54 | |||||||
55 | =item * Normalized, using HTML::Parser: attributes and elements will be | ||||||
56 | lowercased, empty elements such as and will be forced into |
||||||
57 | the empty tag syntax if needed, and unknown attributes and elements will be | ||||||
58 | stripped. | ||||||
59 | |||||||
60 | =item * Sanitized, using an extensible whitelist of valid attributes and | ||||||
61 | elements based on Mark Pilgrim and Aaron Swartz's work on C |
||||||
62 | and attributes which are known to be possible attack vectors are removed. | ||||||
63 | |||||||
64 | =item * Tidied, using L |
||||||
65 | (as available): unclosed tags will be closed and the output generally | ||||||
66 | neatened; future version may also use tidying to deal with character encoding | ||||||
67 | issues. | ||||||
68 | |||||||
69 | =item * Optionally rebased, to turn relative URLs in attributes into | ||||||
70 | absolute ones. | ||||||
71 | |||||||
72 | =back | ||||||
73 | |||||||
74 | HTML::Laundry provides mechanisms to extend the list of known allowed | ||||||
75 | (and disallowed) tags, along with callback methods to allow scripts using | ||||||
76 | HTML::Laundry to extend the behavior in various ways. Future versions | ||||||
77 | may provide additional options for altering the rules used to clean | ||||||
78 | snippets. | ||||||
79 | |||||||
80 | Out of the box, HTML::Laundry does not currently know about the tag | ||||||
81 | and its children. For santizing full HTML pages, consider using L |
||||||
82 | or L |
||||||
83 | |||||||
84 | =cut | ||||||
85 | |||||||
86 | require HTML::Laundry::Rules; | ||||||
87 | require HTML::Laundry::Rules::Default; | ||||||
88 | |||||||
89 | require HTML::Parser; | ||||||
90 | 15 | 15 | 15896 | use HTML::Entities qw(encode_entities encode_entities_numeric); | |||
15 | 131611 | ||||||
15 | 2143 | ||||||
91 | 15 | 15 | 14694 | use URI; | |||
15 | 73022 | ||||||
15 | 591 | ||||||
92 | 15 | 15 | 134 | use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8); | |||
15 | 39 | ||||||
15 | 1244 | ||||||
93 | 15 | 15 | 13013 | use URI::Split qw(); | |||
15 | 9057 | ||||||
15 | 388 | ||||||
94 | 15 | 15 | 94 | use Scalar::Util 'blessed'; | |||
15 | 28 | ||||||
15 | 1251 | ||||||
95 | 15 | 15 | 13616 | use Switch; | |||
15 | 587376 | ||||||
15 | 98 | ||||||
96 | |||||||
97 | my @fragments; | ||||||
98 | my $unacceptable_count; | ||||||
99 | my $local_unacceptable_count; | ||||||
100 | my $cdata_dirty; | ||||||
101 | my $in_cdata; | ||||||
102 | my $tag_leading_whitespace = qr/ | ||||||
103 | (?<=<) # Left bracket followed by | ||||||
104 | \s* # any amount of whitespace | ||||||
105 | (\/?) # optionally with a forward slash | ||||||
106 | \s* # and then more whitespace | ||||||
107 | /x; | ||||||
108 | |||||||
109 | =head1 FUNCTIONS | ||||||
110 | |||||||
111 | =head2 new | ||||||
112 | |||||||
113 | Create an HTML::Laundry object. | ||||||
114 | |||||||
115 | my $l = HTML::Laundry->new(); | ||||||
116 | |||||||
117 | Takes an optional anonymous hash of arguments: | ||||||
118 | |||||||
119 | =over 4 | ||||||
120 | |||||||
121 | =item * base_url | ||||||
122 | |||||||
123 | This turns relative URIs, as in C<>, into | ||||||
124 | absolute URIs, as for use in feed parsing. | ||||||
125 | |||||||
126 | my $l = HTML::Laundry->new({ base_uri => 'http://example.com/foo/' }); | ||||||
127 | |||||||
128 | |||||||
129 | =item * notidy | ||||||
130 | |||||||
131 | Disable use of HTML::Tidy or HTML::Tidy::libXML, even if | ||||||
132 | they are available on your system. | ||||||
133 | |||||||
134 | my $l = HTML::Laundry->new({ notidy => 1 }); | ||||||
135 | |||||||
136 | =back | ||||||
137 | |||||||
138 | =cut | ||||||
139 | |||||||
140 | sub new { | ||||||
141 | 25 | 25 | 1 | 5967 | my $self = {}; | ||
142 | 25 | 56 | my $class = shift; | ||||
143 | 25 | 42 | my $args = shift; | ||||
144 | |||||||
145 | 25 | 100 | 204 | if ( blessed $args ) { | |||
100 | |||||||
146 | 1 | 50 | 6 | if ( $args->isa('HTML::Laundry::Rules') ) { | |||
147 | 1 | 4 | $args = { rules => $args }; | ||||
148 | } | ||||||
149 | else { | ||||||
150 | 0 | 0 | $args = {}; | ||||
151 | } | ||||||
152 | } | ||||||
153 | elsif ( ref $args ne 'HASH' ) { | ||||||
154 | 4 | 6 | my $rules; | ||||
155 | { | ||||||
156 | 4 | 4 | local $@; | ||||
4 | 4 | ||||||
157 | 4 | 8 | eval { | ||||
158 | 4 | 100 | 61 | $args->isa('HTML::Laundry::Rules') | |||
159 | and $rules = $args->new; | ||||||
160 | }; | ||||||
161 | } | ||||||
162 | 4 | 100 | 14 | if ($rules) { | |||
163 | 1 | 11 | $args = { rules => $args }; | ||||
164 | } | ||||||
165 | else { | ||||||
166 | 3 | 6 | $args = {}; | ||||
167 | } | ||||||
168 | } | ||||||
169 | |||||||
170 | 25 | 203 | $self->{tidy} = undef; | ||||
171 | 25 | 65 | $self->{tidy_added_inline} = {}; | ||||
172 | 25 | 53 | $self->{tidy_added_empty} = {}; | ||||
173 | 25 | 51 | $self->{base_uri} = q{}; | ||||
174 | 25 | 56 | bless $self, $class; | ||||
175 | 25 | 90 | $self->clear_callback('start_tag'); | ||||
176 | 25 | 78 | $self->clear_callback('end_tag'); | ||||
177 | 25 | 61 | $self->clear_callback('uri'); | ||||
178 | 25 | 60 | $self->clear_callback('text'); | ||||
179 | 25 | 62 | $self->clear_callback('output'); | ||||
180 | $self->{parser} = HTML::Parser->new( | ||||||
181 | api_version => 3, | ||||||
182 | utf8_mode => 1, | ||||||
183 | 481 | 481 | 1223 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
184 | 453 | 453 | 1026 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
185 | 25 | 139 | 396 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
139 | 375 | ||||||
186 | empty_element_tags => 1, | ||||||
187 | marked_sections => 1, | ||||||
188 | ); | ||||||
189 | $self->{cdata_parser} = HTML::Parser->new( | ||||||
190 | api_version => 3, | ||||||
191 | utf8_mode => 1, | ||||||
192 | 5 | 5 | 13 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
193 | 5 | 5 | 12 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
194 | 25 | 14 | 2708 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext' ], | |||
14 | 39 | ||||||
195 | empty_element_tags => 1, | ||||||
196 | unbroken_text => 1, | ||||||
197 | marked_sections => 0, | ||||||
198 | ); | ||||||
199 | 25 | 1652 | $self->initialize($args); | ||||
200 | |||||||
201 | 25 | 100 | 152 | if ( !$args->{notidy} ) { | |||
202 | 8 | 18 | $self->_generate_tidy; | ||||
203 | } | ||||||
204 | 25 | 87 | return $self; | ||||
205 | } | ||||||
206 | |||||||
207 | =head2 initialize | ||||||
208 | |||||||
209 | Instantiates the Laundry object properties based on an | ||||||
210 | HTML::Laundry::Rules module. | ||||||
211 | |||||||
212 | =cut | ||||||
213 | |||||||
214 | sub initialize { | ||||||
215 | 25 | 25 | 1 | 62 | my ( $self, $args ) = @_; | ||
216 | |||||||
217 | # Set defaults | ||||||
218 | 25 | 53 | $self->{tidy_added_tags} = undef; | ||||
219 | 25 | 50 | $self->{tidy_empty_tags} = undef; | ||||
220 | 25 | 46 | $self->{trim_trailing_whitespace} = 1; | ||||
221 | 25 | 46 | $self->{trim_tag_whitespace} = 0; | ||||
222 | 25 | 100 | 88 | $self->{base_uri} = URI->new( $args->{base_uri} ) | |||
223 | if $args->{base_uri}; | ||||||
224 | 25 | 3530 | my $rules = $args->{rules}; | ||||
225 | 25 | 66 | 215 | $rules ||= HTML::Laundry::Rules::Default->new(); | |||
226 | |||||||
227 | 25 | 85 | $self->{ruleset} = $rules; | ||||
228 | |||||||
229 | # Initialize based on ruleset | ||||||
230 | 25 | 140 | $self->{acceptable_a} = $rules->acceptable_a(); | ||||
231 | 25 | 135 | $self->{acceptable_e} = $rules->acceptable_e(); | ||||
232 | 25 | 153 | $self->{empty_e} = $rules->empty_e(); | ||||
233 | 25 | 148 | $self->{unacceptable_e} = $rules->unacceptable_e(); | ||||
234 | 25 | 144 | $self->{uri_list} = $rules->uri_list(); | ||||
235 | 25 | 143 | $self->{allowed_schemes} = $rules->allowed_schemes(); | ||||
236 | 25 | 139 | $rules->finalize_initialization($self); | ||||
237 | |||||||
238 | 25 | 38 | return; | ||||
239 | } | ||||||
240 | |||||||
241 | =head2 add_callback | ||||||
242 | |||||||
243 | Adds a callback of type "start_tag", "end_tag", "text", "uri", or "output" to | ||||||
244 | the appropriate internal array. | ||||||
245 | |||||||
246 | $l->add_callback('start_tag', sub { | ||||||
247 | my ($laundry, $tagref, $attrhashref) = @_; | ||||||
248 | # Now, perform actions and return | ||||||
249 | }); | ||||||
250 | |||||||
251 | start_tag, end_tag, text, and uri callbacks that return false values will | ||||||
252 | suppress the return value of the element they are processing; this allows | ||||||
253 | additional checks to be done (for instance, images can be allowed only from | ||||||
254 | whitelisted source domains). | ||||||
255 | |||||||
256 | =cut | ||||||
257 | |||||||
258 | sub add_callback { | ||||||
259 | 21 | 21 | 1 | 4696 | my ( $self, $action, $ref ) = @_; | ||
260 | 21 | 50 | 65 | return if ( ref($ref) ne 'CODE' ); | |||
261 | 21 | 29 | switch ($action) { | ||||
21 | 27 | ||||||
21 | 60 | ||||||
0 | 0 | ||||||
262 | 21 | 100 | 306 | case q{start_tag} { | |||
4 | 43 | ||||||
263 | 4 | 5 | push @{ $self->{start_tag_callback} }, $ref; | ||||
4 | 8 | ||||||
264 | 4 | 20 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
265 | 17 | 100 | 259 | case q{end_tag} { | |||
4 | 44 | ||||||
266 | 4 | 7 | push @{ $self->{end_tag_callback} }, $ref; | ||||
4 | 9 | ||||||
267 | 4 | 23 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
268 | 13 | 100 | 176 | case q{text} { | |||
6 | 70 | ||||||
269 | 6 | 10 | push @{ $self->{text_callback} }, $ref; | ||||
6 | 14 | ||||||
270 | 6 | 32 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
271 | 7 | 100 | 96 | case q{uri} { | |||
4 | 60 | ||||||
272 | 4 | 7 | push @{ $self->{uri_callback} }, $ref; | ||||
4 | 17 | ||||||
273 | 4 | 396 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
274 | 3 | 50 | 37 | case q{output} { | |||
3 | 36 | ||||||
275 | 3 | 4 | push @{ $self->{output_callback} }, $ref; | ||||
3 | 10 | ||||||
276 | 3 | 18 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
277 | } | ||||||
278 | 21 | 55 | return; | ||||
279 | } | ||||||
280 | |||||||
281 | =head2 clear_callback | ||||||
282 | |||||||
283 | Removes all callbacks of given type. | ||||||
284 | |||||||
285 | $l->clear_callback('start_tag'); | ||||||
286 | |||||||
287 | =cut | ||||||
288 | |||||||
289 | sub clear_callback { | ||||||
290 | 139 | 139 | 1 | 11105 | my ( $self, $action ) = @_; | ||
291 | 139 | 156 | switch ($action) { | ||||
139 | 163 | ||||||
139 | 366 | ||||||
0 | 0 | ||||||
292 | 139 | 100 | 1816 | case q{start_tag} { | |||
27 | 402 | ||||||
293 | 27 | 493 | 227 | $self->{start_tag_callback} = [ sub { 1; } ]; | |||
493 | 1072 | ||||||
294 | 27 | 220 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
295 | 112 | 100 | 1416 | case q{end_tag} { | |||
27 | 328 | ||||||
296 | 27 | 467 | 145 | $self->{end_tag_callback} = [ sub { 1; } ]; | |||
467 | 645 | ||||||
297 | 27 | 149 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
298 | 85 | 100 | 907 | case q{text} { | |||
29 | 290 | ||||||
299 | 29 | 143 | 178 | $self->{text_callback} = [ sub { 1; } ]; | |||
143 | 226 | ||||||
300 | 29 | 164 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
301 | 56 | 100 | 601 | case q{uri} { | |||
29 | 341 | ||||||
302 | 29 | 58 | 147 | $self->{uri_callback} = [ sub { 1; } ]; | |||
58 | 97 | ||||||
303 | 29 | 172 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
304 | 27 | 50 | 284 | case q{output} { | |||
27 | 266 | ||||||
305 | 27 | 462 | 160 | $self->{output_callback} = [ sub { 1; } ]; | |||
462 | 625 | ||||||
306 | 27 | 166 | } | ||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
307 | } | ||||||
308 | 139 | 407 | return; | ||||
309 | } | ||||||
310 | |||||||
311 | =head2 clean | ||||||
312 | |||||||
313 | Cleans a snippet of HTML, using the ruleset and object creation options given | ||||||
314 | to the Laundry object. The snippet should be passed as a scalar. | ||||||
315 | |||||||
316 | $output1 = $l->clean( ' The X-rays were penetrating' ); |
||||||
317 | $output2 = $l->clean( $snippet ); | ||||||
318 | |||||||
319 | =cut | ||||||
320 | |||||||
321 | sub clean { | ||||||
322 | 462 | 462 | 1 | 128097 | my ( $self, $chunk, $args ) = @_; | ||
323 | 462 | 996 | $self->_reset_state(); | ||||
324 | 462 | 50 | 1108 | if ( $self->{trim_tag_whitespace} ) { | |||
325 | 0 | 0 | $chunk =~ s/$tag_leading_whitespace/$1/gs; | ||||
326 | } | ||||||
327 | 462 | 661 | my $p = $self->{parser}; | ||||
328 | 462 | 561 | my $cp = $self->{cdata_parser}; | ||||
329 | 462 | 3875 | $p->parse($chunk); | ||||
330 | 462 | 100 | 33 | 1511 | if ( !$in_cdata && !$unacceptable_count ) { | ||
331 | 461 | 1171 | $p->eof(); | ||||
332 | } | ||||||
333 | 462 | 50 | 33 | 1097 | if ( $in_cdata && !$local_unacceptable_count ) { | ||
334 | 0 | 0 | $cp->eof(); | ||||
335 | } | ||||||
336 | 462 | 889 | my $output = $self->gen_output; | ||||
337 | 462 | 1159 | $cp->eof(); # Clear buffer if we haven't already | ||||
338 | 462 | 100 | 733 | if ($cdata_dirty) { # Overkill to get out of CDATA parser state | |||
339 | $self->{parser} = HTML::Parser->new( | ||||||
340 | api_version => 3, | ||||||
341 | start_h => | ||||||
342 | 7 | 7 | 18 | [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
343 | 9 | 9 | 21 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
344 | 4 | 19 | 40 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
19 | 41 | ||||||
345 | empty_element_tags => 1, | ||||||
346 | marked_sections => 1, | ||||||
347 | ); | ||||||
348 | } | ||||||
349 | else { | ||||||
350 | 458 | 1045 | $p->eof(); # Clear buffer if we haven't already | ||||
351 | } | ||||||
352 | 462 | 2298 | return $output; | ||||
353 | } | ||||||
354 | |||||||
355 | =head2 base_uri | ||||||
356 | |||||||
357 | Used to get or set the base_uri property, used in URI rebasing. | ||||||
358 | |||||||
359 | my $base_uri = $l->base_uri; # returns current base_uri | ||||||
360 | $l->base_uri(q{http://example.com}); # return 'http://example.com' | ||||||
361 | $l->base_uri(''); # unsets base_uri | ||||||
362 | |||||||
363 | =cut | ||||||
364 | |||||||
365 | sub base_uri { | ||||||
366 | 3 | 3 | 1 | 7 | my ( $self, $new_base ) = @_; | ||
367 | 3 | 100 | 66 | 15 | if ( defined $new_base and !ref $new_base ) { | ||
368 | 2 | 4 | $self->{base_uri} = $new_base; | ||||
369 | } | ||||||
370 | 3 | 15 | return $self->{base_uri}; | ||||
371 | } | ||||||
372 | |||||||
373 | sub _run_callbacks { | ||||||
374 | 1623 | 1623 | 1747 | my $self = shift; | |||
375 | 1623 | 1795 | my $action = shift; | ||||
376 | 1623 | 50 | 3036 | return unless $action; | |||
377 | 1623 | 2375 | my $type = $action . q{_callback}; | ||||
378 | 1623 | 1563 | for my $callback ( @{ $self->{$type} } ) { | ||||
1623 | 3557 | ||||||
379 | 1650 | 2944 | my $result = $callback->( $self, @_ ); | ||||
380 | 1650 | 100 | 18914 | return unless $result; | |||
381 | } | ||||||
382 | 1619 | 4238 | return 1; | ||||
383 | } | ||||||
384 | |||||||
385 | =head2 gen_output | ||||||
386 | |||||||
387 | Used to generate the final, XHTML output from the internal stack of text and | ||||||
388 | tag tokens. Generally meant to be used internally, but potentially useful for | ||||||
389 | callbacks that require a snapshot of what the output would look like | ||||||
390 | before the cleaning process is complete. | ||||||
391 | |||||||
392 | my $xhtml = $l->gen_output; | ||||||
393 | |||||||
394 | =cut | ||||||
395 | |||||||
396 | sub gen_output { | ||||||
397 | 462 | 462 | 1 | 525 | my $self = shift; | ||
398 | 462 | 50 | 907 | if ( !$self->_run_callbacks( q{output}, \@fragments ) ) { | |||
399 | 0 | 0 | return q{}; | ||||
400 | } | ||||||
401 | 462 | 946 | my $output = join '', @fragments; | ||||
402 | 462 | 50 | 1011 | if ( $self->{tidy} ) { | |||
403 | 0 | 0 | 0 | if ( $self->{tidy_engine} eq q{HTML::Tidy} ) { | |||
0 | |||||||
404 | 0 | 0 | $output = $self->{tidy}->clean($output); | ||||
405 | 0 | 0 | $self->{tidy}->clear_messages; | ||||
406 | } | ||||||
407 | elsif ( $self->{tidy_engine} eq q{HTML::Tidy::libXML} ) { | ||||||
408 | 0 | 0 | my $clean | ||||
409 | = $self->{tidy} | ||||||
410 | ->clean( $self->{tidy_head} . $output . $self->{tidy_foot}, | ||||||
411 | 'UTF-8', 1 ); | ||||||
412 | 0 | 0 | $output = substr( $clean, length $self->{tidy_head} ); | ||||
413 | 0 | 0 | $output = substr( $output, 0, -1 * length $self->{tidy_foot} ); | ||||
414 | } | ||||||
415 | } | ||||||
416 | 462 | 50 | 915 | if ( $self->{trim_trailing_whitespace} ) { | |||
417 | 462 | 1181 | $output =~ s/\s+$//; | ||||
418 | } | ||||||
419 | 462 | 837 | return $output; | ||||
420 | } | ||||||
421 | |||||||
422 | =head2 empty_elements | ||||||
423 | |||||||
424 | Returns a list of the Laundry object's known empty elements: elements such | ||||||
425 | as or which must not contain any children. |
||||||
426 | |||||||
427 | =cut | ||||||
428 | |||||||
429 | sub empty_elements { | ||||||
430 | 0 | 0 | 1 | 0 | my ( $self, $listref ) = @_; | ||
431 | 0 | 0 | 0 | if ($listref) { | |||
432 | 0 | 0 | my @list = @{$listref}; | ||||
0 | 0 | ||||||
433 | 0 | 0 | my %empty = map { ( $_, 1 ) } @list; | ||||
0 | 0 | ||||||
434 | 0 | 0 | $self->{empty_e} = \%empty; | ||||
435 | } | ||||||
436 | 0 | 0 | return keys %{ $self->{empty_e} }; | ||||
0 | 0 | ||||||
437 | } | ||||||
438 | |||||||
439 | =head2 remove_empty_element | ||||||
440 | |||||||
441 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
442 | the "empty elements" list maintained by the Laundry object. | ||||||
443 | |||||||
444 | $l->remove_empty_element(['img', 'br']); # Let's break XHTML! | ||||||
445 | |||||||
446 | This will not affect the acceptable/unacceptable status of the elements. | ||||||
447 | |||||||
448 | =cut | ||||||
449 | |||||||
450 | sub remove_empty_element { | ||||||
451 | 4 | 4 | 1 | 442 | my ( $self, $new_e, $args ) = @_; | ||
452 | 4 | 8 | my $empty = $self->{empty_e}; | ||||
453 | 4 | 100 | 12 | if ( ref($new_e) eq 'ARRAY' ) { | |||
454 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
1 | 3 | ||||||
455 | 2 | 9 | $self->remove_empty_element( $e, $args ); | ||||
456 | } | ||||||
457 | } | ||||||
458 | else { | ||||||
459 | 3 | 6 | delete $empty->{$new_e}; | ||||
460 | } | ||||||
461 | 4 | 10 | return 1; | ||||
462 | } | ||||||
463 | |||||||
464 | =head2 acceptable_elements | ||||||
465 | |||||||
466 | Returns a list of the Laundry object's known acceptable elements, which will | ||||||
467 | not be stripped during the sanitizing process. | ||||||
468 | |||||||
469 | =cut | ||||||
470 | |||||||
471 | sub acceptable_elements { | ||||||
472 | 4 | 4 | 1 | 517 | my ( $self, $listref ) = @_; | ||
473 | 4 | 100 | 19 | if ( ref($listref) eq 'ARRAY' ) { | |||
474 | 1 | 2 | my @list = @{$listref}; | ||||
1 | 5 | ||||||
475 | 1 | 5 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
5 | 12 | ||||||
476 | 1 | 5 | $self->{acceptable_e} = \%acceptable; | ||||
477 | } | ||||||
478 | 4 | 22 | return keys %{ $self->{acceptable_e} }; | ||||
4 | 98 | ||||||
479 | } | ||||||
480 | |||||||
481 | =head2 add_acceptable_element | ||||||
482 | |||||||
483 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
484 | "acceptable elements" list maintained by the Laundry object. Items added in | ||||||
485 | this manner will automatically be removed from the "unacceptable elements" | ||||||
486 | list if they are present. | ||||||
487 | |||||||
488 | $l->add_acceptable_element('style'); | ||||||
489 | |||||||
490 | Elements which are empty may be flagged as such with an optional argument. | ||||||
491 | If this flag is set, all elements provided by the call will be added to | ||||||
492 | the "empty element" list. | ||||||
493 | |||||||
494 | $l->add_acceptable_element(['applet', 'script'], { empty => 1 }); | ||||||
495 | |||||||
496 | =cut | ||||||
497 | |||||||
498 | sub add_acceptable_element { | ||||||
499 | 10 | 10 | 1 | 1624 | my ( $self, $new_e, $args ) = @_; | ||
500 | 10 | 16 | my $acceptable = $self->{acceptable_e}; | ||||
501 | 10 | 16 | my $empty = $self->{empty_e}; | ||||
502 | 10 | 11 | my $unacceptable = $self->{unacceptable_e}; | ||||
503 | 10 | 100 | 24 | if ( ref($new_e) eq 'ARRAY' ) { | |||
504 | 2 | 3 | foreach my $e ( @{$new_e} ) { | ||||
2 | 6 | ||||||
505 | 4 | 12 | $self->add_acceptable_element( $e, $args ); | ||||
506 | } | ||||||
507 | } | ||||||
508 | else { | ||||||
509 | 8 | 21 | $acceptable->{$new_e} = 1; | ||||
510 | 8 | 100 | 25 | if ( $args->{empty} ) { | |||
50 | |||||||
511 | 4 | 11 | $empty->{$new_e} = 1; | ||||
512 | 4 | 50 | 14 | if ( $self->{tidy} ) { | |||
513 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
514 | 0 | 0 | $self->{tidy_added_empty}->{$new_e} = 1; | ||||
515 | 0 | 0 | $self->_generate_tidy; | ||||
516 | } | ||||||
517 | } | ||||||
518 | elsif ( $self->{tidy} ) { | ||||||
519 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
520 | 0 | 0 | $self->_generate_tidy; | ||||
521 | } | ||||||
522 | 8 | 14 | delete $unacceptable->{$new_e}; | ||||
523 | |||||||
524 | } | ||||||
525 | 10 | 24 | return 1; | ||||
526 | } | ||||||
527 | |||||||
528 | =head2 remove_acceptable_element | ||||||
529 | |||||||
530 | Removes an element (or, if given an array reference, multiple elements) to the | ||||||
531 | "acceptable elements" list maintained by the Laundry object. These items | ||||||
532 | (although not their child elements) will now be stripped during parsing. | ||||||
533 | |||||||
534 | $l->remove_acceptable_element(['img', 'h1', 'h2']); | ||||||
535 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
536 | # returns 'The Day the World Turned Day-Glo' | ||||||
537 | |||||||
538 | =cut | ||||||
539 | |||||||
540 | sub remove_acceptable_element { | ||||||
541 | 16 | 16 | 1 | 33 | my ( $self, $new_e, $args ) = @_; | ||
542 | 16 | 32 | my $acceptable = $self->{acceptable_e}; | ||||
543 | 16 | 100 | 34 | if ( ref($new_e) eq 'ARRAY' ) { | |||
544 | 2 | 5 | foreach my $e ( @{$new_e} ) { | ||||
2 | 5 | ||||||
545 | 4 | 13 | $self->remove_acceptable_element( $e, $args ); | ||||
546 | } | ||||||
547 | } | ||||||
548 | else { | ||||||
549 | 14 | 32 | delete $acceptable->{$new_e}; | ||||
550 | } | ||||||
551 | 16 | 32 | return 1; | ||||
552 | } | ||||||
553 | |||||||
554 | =head2 unacceptable_elements | ||||||
555 | |||||||
556 | Returns a list of the Laundry object's unacceptable elements, which will be | ||||||
557 | stripped -- B |
||||||
558 | |||||||
559 | =cut | ||||||
560 | |||||||
561 | sub unacceptable_elements { | ||||||
562 | 3 | 3 | 1 | 6 | my ( $self, $listref ) = @_; | ||
563 | 3 | 100 | 11 | if ( ref($listref) eq 'ARRAY' ) { | |||
564 | 1 | 3 | my @list = @{$listref}; | ||||
1 | 5 | ||||||
565 | 5 | 12 | my %unacceptable | ||||
566 | 1 | 3 | = map { $self->remove_acceptable_element($_); ( $_, 1 ); } @list; | ||||
5 | 16 | ||||||
567 | 1 | 4 | $self->{unacceptable_e} = \%unacceptable; | ||||
568 | } | ||||||
569 | 3 | 7 | return keys %{ $self->{unacceptable_e} }; | ||||
3 | 15 | ||||||
570 | } | ||||||
571 | |||||||
572 | =head2 add_unacceptable_element | ||||||
573 | |||||||
574 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
575 | "unacceptable elements" list maintained by the Laundry object. | ||||||
576 | |||||||
577 | $l->add_unacceptable_element(['h1', 'h2']); | ||||||
578 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
579 | # returns null string | ||||||
580 | |||||||
581 | =cut | ||||||
582 | |||||||
583 | sub add_unacceptable_element { | ||||||
584 | 4 | 4 | 1 | 1642 | my ( $self, $new_e, $args ) = @_; | ||
585 | 4 | 8 | my $unacceptable = $self->{unacceptable_e}; | ||||
586 | 4 | 100 | 12 | if ( ref($new_e) eq 'ARRAY' ) { | |||
587 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
1 | 3 | ||||||
588 | 2 | 12 | $self->add_unacceptable_element( $e, $args ); | ||||
589 | } | ||||||
590 | } | ||||||
591 | else { | ||||||
592 | 3 | 9 | $self->remove_acceptable_element($new_e); | ||||
593 | 3 | 6 | $unacceptable->{$new_e} = 1; | ||||
594 | } | ||||||
595 | 4 | 8 | return 1; | ||||
596 | } | ||||||
597 | |||||||
598 | =head2 remove_unacceptable_element | ||||||
599 | |||||||
600 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
601 | the "unacceptable elements" list maintained by the Laundry object. Note that | ||||||
602 | this does not automatically add the element to the acceptable_element list. | ||||||
603 | |||||||
604 | $l->clean(q{}); | ||||||
605 | # returns null string | ||||||
606 | $l->remove_unacceptable_element( q{script} ); | ||||||
607 | $l->clean(q{}); | ||||||
608 | # returns "alert('!')" | ||||||
609 | |||||||
610 | =cut | ||||||
611 | |||||||
612 | sub remove_unacceptable_element { | ||||||
613 | 4 | 4 | 1 | 7 | my ( $self, $new_e, $args ) = @_; | ||
614 | 4 | 7 | my $unacceptable = $self->{unacceptable_e}; | ||||
615 | 4 | 100 | 11 | if ( ref($new_e) eq 'ARRAY' ) { | |||
616 | 1 | 2 | foreach my $a ( @{$new_e} ) { | ||||
1 | 2 | ||||||
617 | 2 | 11 | $self->remove_unacceptable_element( $a, $args ); | ||||
618 | } | ||||||
619 | } | ||||||
620 | else { | ||||||
621 | 3 | 7 | delete $unacceptable->{$new_e}; | ||||
622 | } | ||||||
623 | 4 | 9 | return 1; | ||||
624 | } | ||||||
625 | |||||||
626 | =head2 acceptable_attributes | ||||||
627 | |||||||
628 | Returns a list of the Laundry object's known acceptable attributes, which will | ||||||
629 | not be stripped during the sanitizing process. | ||||||
630 | |||||||
631 | =cut | ||||||
632 | |||||||
633 | sub acceptable_attributes { | ||||||
634 | 3 | 3 | 1 | 6 | my ( $self, $listref ) = @_; | ||
635 | 3 | 100 | 11 | if ( ref($listref) eq 'ARRAY' ) { | |||
636 | 1 | 2 | my @list = @{$listref}; | ||||
1 | 5 | ||||||
637 | 1 | 2 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
3 | 9 | ||||||
638 | 1 | 4 | $self->{acceptable_a} = \%acceptable; | ||||
639 | } | ||||||
640 | 3 | 18 | return keys %{ $self->{acceptable_a} }; | ||||
3 | 41 | ||||||
641 | } | ||||||
642 | |||||||
643 | =head2 add_acceptable_attribute | ||||||
644 | |||||||
645 | Adds an attribute (or, if given an array reference, multiple attributes) to the | ||||||
646 | "acceptable attributes" list maintained by the Laundry object. | ||||||
647 | |||||||
648 | my $snippet = q{ "My dear Mr. Bennet," said his lady to |
||||||
649 | him one day, "have you heard that | ||||||
650 | Netherfield Park is let at last?" | ||||||
651 | }; | ||||||
652 | $l->clean( $snippet ); | ||||||
653 | # returns: | ||||||
654 | # "My dear Mr. Bennet," said his lady to him one day, |
||||||
655 | # "have you heard that Netherfield Park is let at | ||||||
656 | # last?" | ||||||
657 | $l->add_acceptable_attribute([austen:id, austen:footnote]); | ||||||
658 | $l->clean( $snippet ); | ||||||
659 | # returns: | ||||||
660 | # "My dear Mr. Bennet," said his lady to him |
||||||
661 | # one day, "have you heard that | ||||||
662 | # Netherfield Park is let at last?" | ||||||
663 | |||||||
664 | =cut | ||||||
665 | |||||||
666 | sub add_acceptable_attribute { | ||||||
667 | 4 | 4 | 1 | 1787 | my ( $self, $new_a, $args ) = @_; | ||
668 | 4 | 10 | my $acceptable = $self->{acceptable_a}; | ||||
669 | 4 | 100 | 14 | if ( ref($new_a) eq 'ARRAY' ) { | |||
670 | 1 | 3 | foreach my $a ( @{$new_a} ) { | ||||
1 | 3 | ||||||
671 | 2 | 8 | $self->add_acceptable_attribute( $a, $args ); | ||||
672 | } | ||||||
673 | } | ||||||
674 | else { | ||||||
675 | 3 | 10 | $acceptable->{$new_a} = 1; | ||||
676 | } | ||||||
677 | 4 | 10 | return 1; | ||||
678 | } | ||||||
679 | |||||||
680 | =head2 remove_acceptable_attribute | ||||||
681 | |||||||
682 | Removes an attribute (or, if given an array reference, multiple attributes) | ||||||
683 | from the "acceptable attributes" list maintained by the Laundry object. | ||||||
684 | |||||||
685 | $l->clean(q{ plover }); |
||||||
686 | # returns ' plover ' |
||||||
687 | $l->remove_acceptable_element( q{id} ); | ||||||
688 | $l->clean(q{ plover }); |
||||||
689 | # returns ' plover |
||||||
690 | |||||||
691 | =cut | ||||||
692 | |||||||
693 | sub remove_acceptable_attribute { | ||||||
694 | 4 | 4 | 1 | 8 | my ( $self, $new_a, $args ) = @_; | ||
695 | 4 | 6 | my $acceptable = $self->{acceptable_a}; | ||||
696 | 4 | 100 | 12 | if ( ref($new_a) eq 'ARRAY' ) { | |||
697 | 1 | 2 | foreach my $a ( @{$new_a} ) { | ||||
1 | 3 | ||||||
698 | 2 | 9 | $self->remove_acceptable_attribute( $a, $args ); | ||||
699 | } | ||||||
700 | } | ||||||
701 | else { | ||||||
702 | 3 | 9 | delete $acceptable->{$new_a}; | ||||
703 | } | ||||||
704 | 4 | 9 | return 1; | ||||
705 | } | ||||||
706 | |||||||
707 | sub _generate_tidy { | ||||||
708 | 8 | 8 | 8 | my $self = shift; | |||
709 | 8 | 8 | my $param = shift; | ||||
710 | 8 | 16 | $self->_generate_html_tidy; | ||||
711 | 8 | 50 | 40 | if ( !$self->{tidy} ) { | |||
712 | 8 | 18 | $self->_generate_html_tidy_libxml; | ||||
713 | } | ||||||
714 | 8 | 31 | return; | ||||
715 | } | ||||||
716 | |||||||
717 | sub _generate_html_tidy_libxml { | ||||||
718 | 8 | 8 | 11 | my $self = shift; | |||
719 | { | ||||||
720 | 8 | 8 | local $@; | ||||
8 | 8 | ||||||
721 | 8 | 13 | eval { | ||||
722 | 8 | 2696 | require HTML::Tidy::libXML; | ||||
723 | 0 | 0 | $self->{tidy} = HTML::Tidy::libXML->new(); | ||||
724 | 0 | 0 | $self->{tidy_head} = q{ | ||||
725 | |||||||
726 | "http://www.w3.org/TR/ html1/DTD/ html1-transitional.dtd"> | ||||||
727 | }; | ||||||
728 | 0 | 0 | $self->{tidy_foot} = q{ | ||||
729 | }; | ||||||
730 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy::libXML}; | ||||
731 | 0 | 0 | 1; | ||||
732 | }; | ||||||
733 | } | ||||||
734 | } | ||||||
735 | |||||||
736 | sub _generate_html_tidy { | ||||||
737 | 8 | 8 | 10 | my $self = shift; | |||
738 | { | ||||||
739 | 8 | 9 | local $@; | ||||
8 | 9 | ||||||
740 | 8 | 10 | eval { | ||||
741 | 8 | 3035 | require HTML::Tidy; | ||||
742 | 0 | 0 | $self->{tidy_ruleset} = $self->{ruleset}->tidy_ruleset; | ||||
743 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_inline} } ) { | |||
0 | 0 | ||||||
744 | 0 | 0 | $self->{tidy_ruleset}->{new_inline_tags} | ||||
745 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_inline} } ); | ||||
746 | } | ||||||
747 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_empty} } ) { | |||
0 | 0 | ||||||
748 | 0 | 0 | $self->{tidy_ruleset}->{new_empty_tags} | ||||
749 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_empty} } ); | ||||
750 | } | ||||||
751 | 0 | 0 | $self->{tidy} = HTML::Tidy->new( $self->{tidy_ruleset} ); | ||||
752 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy}; | ||||
753 | 0 | 0 | 1; | ||||
754 | }; | ||||||
755 | } | ||||||
756 | } | ||||||
757 | |||||||
758 | sub _reset_state { | ||||||
759 | 462 | 462 | 545 | my ($self) = @_; | |||
760 | 462 | 800 | @fragments = (); | ||||
761 | 462 | 509 | $unacceptable_count = 0; | ||||
762 | 462 | 451 | $local_unacceptable_count = 0; | ||||
763 | 462 | 485 | $in_cdata = 0; | ||||
764 | 462 | 439 | $cdata_dirty = 0; | ||||
765 | 462 | 603 | return; | ||||
766 | } | ||||||
767 | |||||||
768 | sub _tag_start_handler { | ||||||
769 | 493 | 493 | 730 | my ( $self, $tagname, $attr ) = @_; | |||
770 | 493 | 100 | 1107 | if ( !$self->_run_callbacks( q{start_tag}, \$tagname, $attr ) ) { | |||
771 | 1 | 7 | return; | ||||
772 | } | ||||||
773 | 492 | 100 | 987 | if ( !$in_cdata ) { | |||
774 | 487 | 570 | $cdata_dirty = 0; | ||||
775 | } | ||||||
776 | 492 | 477 | my @attributes; | ||||
777 | 492 | 514 | foreach my $k ( keys %{$attr} ) { | ||||
492 | 1335 | ||||||
778 | 259 | 100 | 749 | if ( $self->{acceptable_a}->{$k} ) { | |||
779 | 174 | 100 | 192 | if ( grep {/^$k$/} @{ $self->{uri_list}->{$tagname} } ) { | |||
151 | 969 | ||||||
174 | 499 | ||||||
780 | 58 | 217 | $self->_uri_handler( $tagname, \$k, \$attr->{$k}, | ||||
781 | $self->{base_uri} ); | ||||||
782 | } | ||||||
783 | |||||||
784 | # Allow uri handler to suppress insertion | ||||||
785 | 174 | 100 | 419 | if ($k) { | |||
786 | 157 | 549 | push @attributes, $k . q{="} . $attr->{$k} . q{"}; | ||||
787 | } | ||||||
788 | } | ||||||
789 | } | ||||||
790 | 492 | 943 | my $attributes = join q{ }, @attributes; | ||||
791 | 492 | 100 | 1185 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
792 | 376 | 100 | 775 | if ( $self->{empty_e}->{$tagname} ) { | |||
793 | 58 | 100 | 146 | if ($attributes) { | |||
794 | 19 | 32 | $attributes = $attributes . q{ }; | ||||
795 | } | ||||||
796 | 58 | 167 | push @fragments, "<$tagname $attributes/>"; | ||||
797 | } | ||||||
798 | else { | ||||||
799 | 318 | 100 | 1056 | if ($attributes) { | |||
800 | 122 | 234 | $attributes = q{ } . $attributes; | ||||
801 | } | ||||||
802 | 318 | 678 | push @fragments, "<$tagname$attributes>"; | ||||
803 | } | ||||||
804 | } | ||||||
805 | else { | ||||||
806 | 116 | 100 | 339 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
807 | 24 | 100 | 48 | if ($in_cdata) { | |||
808 | 3 | 4 | $local_unacceptable_count += 1; | ||||
809 | } | ||||||
810 | else { | ||||||
811 | 21 | 35 | $unacceptable_count += 1; | ||||
812 | } | ||||||
813 | } | ||||||
814 | } | ||||||
815 | 492 | 2809 | return; | ||||
816 | } | ||||||
817 | |||||||
818 | sub _tag_end_handler { | ||||||
819 | 467 | 467 | 652 | my ( $self, $tagname ) = @_; | |||
820 | 467 | 100 | 849 | if ( !$self->_run_callbacks( q{end_tag}, \$tagname ) ) { | |||
821 | 1 | 5 | return; | ||||
822 | } | ||||||
823 | 466 | 100 | 991 | if ( !$in_cdata ) { | |||
824 | 463 | 517 | $cdata_dirty = 0; | ||||
825 | } | ||||||
826 | 466 | 100 | 1039 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
827 | 346 | 100 | 857 | if ( !$self->{empty_e}->{$tagname} ) { | |||
828 | 316 | 639 | push @fragments, "$tagname>"; | ||||
829 | } | ||||||
830 | } | ||||||
831 | else { | ||||||
832 | 120 | 100 | 282 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
833 | 30 | 100 | 53 | if ($in_cdata) { | |||
834 | 1 | 2 | $local_unacceptable_count -= 1; | ||||
835 | 1 | 50 | 5 | $local_unacceptable_count = 0 | |||
836 | if ( $local_unacceptable_count < 0 ); | ||||||
837 | } | ||||||
838 | else { | ||||||
839 | 29 | 35 | $unacceptable_count -= 1; | ||||
840 | 29 | 100 | 87 | $unacceptable_count = 0 if ( $unacceptable_count < 0 ); | |||
841 | } | ||||||
842 | } | ||||||
843 | } | ||||||
844 | 466 | 1504 | return; | ||||
845 | } | ||||||
846 | |||||||
847 | sub _text_handler { | ||||||
848 | 172 | 172 | 310 | my ( $self, $text, $is_cdata ) = @_; | |||
849 | 172 | 100 | 100 | 481 | if ( $in_cdata && $local_unacceptable_count ) { | ||
850 | 1 | 3 | return; | ||||
851 | } | ||||||
852 | 171 | 100 | 426 | if ($unacceptable_count) { | |||
853 | 15 | 71 | return; | ||||
854 | } | ||||||
855 | 156 | 100 | 271 | if ($is_cdata) { | |||
856 | 13 | 22 | my $cp = $self->{cdata_parser}; | ||||
857 | 13 | 17 | $in_cdata = 1; | ||||
858 | 13 | 44 | $cp->parse($text); | ||||
859 | 13 | 100 | 28 | if ( !$local_unacceptable_count ) { | |||
860 | 11 | 45 | $cp->eof(); | ||||
861 | } | ||||||
862 | 13 | 17 | $cdata_dirty = 1; | ||||
863 | 13 | 13 | $in_cdata = 0; | ||||
864 | 13 | 46 | return; | ||||
865 | } | ||||||
866 | else { | ||||||
867 | 143 | 100 | 331 | if ( !$self->_run_callbacks( q{text}, \$text, $is_cdata ) ) { | |||
868 | 1 | 6 | return q{}; | ||||
869 | } | ||||||
870 | 142 | 486 | $text = encode_entities( $text, '<>&"' ); | ||||
871 | 142 | 8508 | $cdata_dirty = 0; | ||||
872 | } | ||||||
873 | 142 | 281 | push @fragments, $text; | ||||
874 | 142 | 619 | return; | ||||
875 | } | ||||||
876 | |||||||
877 | sub _uri_handler { | ||||||
878 | 58 | 58 | 104 | my ( $self, $tagname, $attr_ref, $value_ref, $base ) = @_; | |||
879 | 58 | 72 | my ( $attr, $value ) = ( ${$attr_ref}, ${$value_ref} ); | ||||
58 | 121 | ||||||
58 | 99 | ||||||
880 | 58 | 241 | $value =~ s/[`\x00-\x1f\x7f]+//g; | ||||
881 | 58 | 101 | $value =~ s/\ufffd//g; | ||||
882 | 58 | 286 | my $uri = URI->new($value); | ||||
883 | 58 | 69793 | $uri = $uri->canonical; | ||||
884 | 58 | 100 | 5703 | if ( !$self->_run_callbacks( q{uri}, $tagname, $attr, \$uri ) ) { | |||
885 | 1 | 3 | ${$attr_ref} = q{}; | ||||
1 | 4 | ||||||
886 | 1 | 4 | return undef; | ||||
887 | } | ||||||
888 | 57 | 100 | 66 | 332 | if ( $self->{allowed_schemes} and $uri->scheme ) { | ||
889 | 42 | 100 | 722 | unless ( $self->{allowed_schemes}->{ $uri->scheme } ) { | |||
890 | 16 | 196 | ${$attr_ref} = q{}; | ||||
16 | 31 | ||||||
891 | 16 | 65 | return undef; | ||||
892 | } | ||||||
893 | } | ||||||
894 | 41 | 100 | 688 | if ( $self->{base_uri} ) { | |||
895 | 8 | 63 | $uri = URI->new_abs( $uri->as_string, $self->{base_uri} ); | ||||
896 | } | ||||||
897 | 41 | 100 | 1760 | if ( $uri->scheme ) { # Not a local URI | |||
898 | 33 | 370 | my $host; | ||||
899 | { | ||||||
900 | 33 | 41 | local $@; | ||||
33 | 73 | ||||||
901 | 33 | 54 | eval { $host = $uri->host; }; | ||||
33 | 95 | ||||||
902 | } | ||||||
903 | 33 | 50 | 745 | if ($host) { | |||
904 | |||||||
905 | # We may need to manually unescape domain names | ||||||
906 | # to deal with issues like tinyarro.ws | ||||||
907 | 33 | 83 | my $utf8_host = $self->_decode_utf8($host); | ||||
908 | 33 | 76 | utf8::upgrade($utf8_host); | ||||
909 | 33 | 50 | 90 | if ( $uri->host ne $utf8_host ) { | |||
910 | |||||||
911 | # TODO: Optionally use Punycode in this case | ||||||
912 | |||||||
913 | 0 | 0 | 0 | 0 | if ( $uri->port and $uri->port == $uri->default_port ) { | ||
914 | 0 | 0 | $uri->port(undef); | ||||
915 | } | ||||||
916 | 0 | 0 | my $escaped_host = $self->_encode_utf8( $uri->host ); | ||||
917 | 0 | 0 | my $uri_str = $uri->canonical->as_string; | ||||
918 | 0 | 0 | $uri_str =~ s/$escaped_host/$utf8_host/; | ||||
919 | 0 | 0 | utf8::upgrade($uri_str); | ||||
920 | 0 | 0 | ${$value_ref} = $uri_str; | ||||
0 | 0 | ||||||
921 | 0 | 0 | return; | ||||
922 | } | ||||||
923 | } | ||||||
924 | } | ||||||
925 | 41 | 836 | ${$value_ref} = $uri->canonical->as_string; | ||||
41 | 2774 | ||||||
926 | 41 | 189 | return; | ||||
927 | } | ||||||
928 | |||||||
929 | sub _decode_utf8 { | ||||||
930 | 33 | 33 | 43 | my $self = shift; | |||
931 | 33 | 59 | my $orig = my $str = shift; | ||||
932 | 33 | 54 | $str =~ s/\%([0-9a-f]{2})/chr(hex($1))/egi; | ||||
0 | 0 | ||||||
933 | 33 | 50 | 154 | return $str if utf8::decode($str); | |||
934 | 0 | return $orig; | |||||
935 | } | ||||||
936 | |||||||
937 | sub _encode_utf8 { | ||||||
938 | 0 | 0 | my $self = shift; | ||||
939 | 0 | my $str = shift; | |||||
940 | 0 | my $highbit = qr/[^\w\$-_.+!*'(),]/; | |||||
941 | 0 | $str =~ s/($highbit)/ sprintf ("%%%02X", ord($1)) /ge; | |||||
0 | |||||||
942 | 0 | utf8::upgrade($str); | |||||
943 | 0 | return $str; | |||||
944 | } | ||||||
945 | |||||||
946 | =head1 SEE ALSO | ||||||
947 | |||||||
948 | There are a number of tools designed for sanitizing HTML, some of which | ||||||
949 | may be better suited than HTML::Laundry to particular circumstances. In | ||||||
950 | addition to L |
||||||
951 | L |
||||||
952 | solely for the purposes of sanitizing HTML from potential XSS attack vectors; | ||||||
953 | L |
||||||
954 | L |
||||||
955 | |||||||
956 | =head1 AUTHOR | ||||||
957 | |||||||
958 | Steve Cook, C<< |
||||||
959 | |||||||
960 | =head1 BUGS | ||||||
961 | |||||||
962 | Please report any bugs or feature requests on the GitHub page for this project, | ||||||
963 | http://github.com/snark/html-laundry. | ||||||
964 | |||||||
965 | =head1 ACKNOWLEDGMENTS | ||||||
966 | |||||||
967 | Thanks to Dave Cross and Vera Tobin. | ||||||
968 | |||||||
969 | =head1 SUPPORT | ||||||
970 | |||||||
971 | You can find documentation for this module with the perldoc command. | ||||||
972 | |||||||
973 | perldoc HTML::Laundry | ||||||
974 | |||||||
975 | =head1 COPYRIGHT & LICENSE | ||||||
976 | |||||||
977 | Copyright 2009 Six Apart, Ltd., all rights reserved. | ||||||
978 | |||||||
979 | This program is free software; you can redistribute it and/or modify it | ||||||
980 | under the same terms as Perl itself. | ||||||
981 | |||||||
982 | =cut | ||||||
983 | |||||||
984 | 1; # End of HTML::Laundry |