blib/lib/HTML/Laundry.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 358 | 417 | 85.8 |
branch | 131 | 160 | 81.8 |
condition | 11 | 21 | 52.3 |
subroutine | 50 | 52 | 96.1 |
pod | 18 | 18 | 100.0 |
total | 568 | 668 | 85.0 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | ######################################################## | ||||||
2 | # Copyright © 2009 Six Apart, Ltd. | ||||||
3 | |||||||
4 | package HTML::Laundry; | ||||||
5 | |||||||
6 | 15 | 15 | 31872 | use strict; | |||
15 | 20 | ||||||
15 | 452 | ||||||
7 | 15 | 15 | 64 | use warnings; | |||
15 | 21 | ||||||
15 | 457 | ||||||
8 | |||||||
9 | 15 | 15 | 318 | use 5.008; | |||
15 | 39 | ||||||
10 | 15 | 15 | 6817 | use version; our $VERSION = 0.0105; | |||
15 | 22598 | ||||||
15 | 86 | ||||||
11 | |||||||
12 | =head1 NAME | ||||||
13 | |||||||
14 | HTML::Laundry - Perl module to clean HTML by the piece | ||||||
15 | |||||||
16 | =head1 VERSION | ||||||
17 | |||||||
18 | Version 0.0102 | ||||||
19 | |||||||
20 | =head1 SYNOPSIS | ||||||
21 | |||||||
22 | #!/usr/bin/perl -w | ||||||
23 | use strict; | ||||||
24 | use HTML::Laundry; | ||||||
25 | my $laundry = HTML::Laundry->new(); | ||||||
26 | my $snippet = q{ | ||||||
27 | |||||||
28 | If your gloves are sterilized |
||||||
29 | Rinse your mouth with Listerine | ||||||
30 | Blow disinfectant in her eyes" |
||||||
31 | -- X-Ray Spex, Germ-Free Adolescents | ||||||
32 | |||||||
33 | }; | ||||||
34 | my $germfree = $laundry->clean($snippet); | ||||||
35 | # $germfree is now: | ||||||
36 | # "You may get to touch her |
||||||
37 | # If your gloves are sterilized |
||||||
38 | # Rinse your mouth with Listerine |
||||||
39 | # Blow disinfectant in her eyes" |
||||||
40 | # -- X-Ray Spex, Germ-Free Adolescents | ||||||
41 | |||||||
42 | =head1 DESCRIPTION | ||||||
43 | |||||||
44 | HTML::Laundry is an L |
||||||
45 | meant for small pieces of HTML, such as user comments, Atom feed entries, | ||||||
46 | and the like, rather than full pages. Laundry takes these and returns clean, | ||||||
47 | sanitary, UTF-8-based XHTML. The parser's behavior may be changed with | ||||||
48 | callbacks, and the whitelist of acceptable elements and attributes may be | ||||||
49 | updated on the fly. | ||||||
50 | |||||||
51 | A snippet is cleaned several ways: | ||||||
52 | |||||||
53 | =over 4 | ||||||
54 | |||||||
55 | =item * Normalized, using HTML::Parser: attributes and elements will be | ||||||
56 | lowercased, empty elements such as and will be forced into |
||||||
57 | the empty tag syntax if needed, and unknown attributes and elements will be | ||||||
58 | stripped. | ||||||
59 | |||||||
60 | =item * Sanitized, using an extensible whitelist of valid attributes and | ||||||
61 | elements based on Mark Pilgrim and Aaron Swartz's work on C |
||||||
62 | and attributes which are known to be possible attack vectors are removed. | ||||||
63 | |||||||
64 | =item * Tidied, using L |
||||||
65 | (as available): unclosed tags will be closed and the output generally | ||||||
66 | neatened; future version may also use tidying to deal with character encoding | ||||||
67 | issues. | ||||||
68 | |||||||
69 | =item * Optionally rebased, to turn relative URLs in attributes into | ||||||
70 | absolute ones. | ||||||
71 | |||||||
72 | =back | ||||||
73 | |||||||
74 | HTML::Laundry provides mechanisms to extend the list of known allowed | ||||||
75 | (and disallowed) tags, along with callback methods to allow scripts using | ||||||
76 | HTML::Laundry to extend the behavior in various ways. Future versions | ||||||
77 | may provide additional options for altering the rules used to clean | ||||||
78 | snippets. | ||||||
79 | |||||||
80 | Out of the box, HTML::Laundry does not currently know about the tag | ||||||
81 | and its children. For santizing full HTML pages, consider using L |
||||||
82 | or L |
||||||
83 | |||||||
84 | =cut | ||||||
85 | |||||||
86 | require HTML::Laundry::Rules; | ||||||
87 | require HTML::Laundry::Rules::Default; | ||||||
88 | |||||||
89 | require HTML::Parser; | ||||||
90 | 15 | 15 | 7889 | use HTML::Entities qw(encode_entities encode_entities_numeric); | |||
15 | 56786 | ||||||
15 | 1318 | ||||||
91 | 15 | 15 | 7494 | use URI; | |||
15 | 47955 | ||||||
15 | 476 | ||||||
92 | 15 | 15 | 98 | use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8); | |||
15 | 22 | ||||||
15 | 942 | ||||||
93 | 15 | 15 | 6985 | use URI::Split qw(); | |||
15 | 6873 | ||||||
15 | 319 | ||||||
94 | 15 | 15 | 73 | use Scalar::Util 'blessed'; | |||
15 | 17 | ||||||
15 | 48707 | ||||||
95 | |||||||
96 | my @fragments; | ||||||
97 | my $unacceptable_count; | ||||||
98 | my $local_unacceptable_count; | ||||||
99 | my $cdata_dirty; | ||||||
100 | my $in_cdata; | ||||||
101 | my $tag_leading_whitespace = qr/ | ||||||
102 | (?<=<) # Left bracket followed by | ||||||
103 | \s* # any amount of whitespace | ||||||
104 | (\/?) # optionally with a forward slash | ||||||
105 | \s* # and then more whitespace | ||||||
106 | /x; | ||||||
107 | |||||||
108 | =head1 FUNCTIONS | ||||||
109 | |||||||
110 | =head2 new | ||||||
111 | |||||||
112 | Create an HTML::Laundry object. | ||||||
113 | |||||||
114 | my $l = HTML::Laundry->new(); | ||||||
115 | |||||||
116 | Takes an optional anonymous hash of arguments: | ||||||
117 | |||||||
118 | =over 4 | ||||||
119 | |||||||
120 | =item * base_url | ||||||
121 | |||||||
122 | This turns relative URIs, as in C<>, into | ||||||
123 | absolute URIs, as for use in feed parsing. | ||||||
124 | |||||||
125 | my $l = HTML::Laundry->new({ base_uri => 'http://example.com/foo/' }); | ||||||
126 | |||||||
127 | |||||||
128 | =item * notidy | ||||||
129 | |||||||
130 | Disable use of HTML::Tidy or HTML::Tidy::libXML, even if | ||||||
131 | they are available on your system. | ||||||
132 | |||||||
133 | my $l = HTML::Laundry->new({ notidy => 1 }); | ||||||
134 | |||||||
135 | =back | ||||||
136 | |||||||
137 | =cut | ||||||
138 | |||||||
139 | sub new { | ||||||
140 | 25 | 25 | 1 | 5032 | my $self = {}; | ||
141 | 25 | 40 | my $class = shift; | ||||
142 | 25 | 28 | my $args = shift; | ||||
143 | |||||||
144 | 25 | 100 | 280 | if ( blessed $args ) { | |||
100 | |||||||
145 | 1 | 50 | 6 | if ( $args->isa('HTML::Laundry::Rules') ) { | |||
146 | 1 | 3 | $args = { rules => $args }; | ||||
147 | } | ||||||
148 | else { | ||||||
149 | 0 | 0 | $args = {}; | ||||
150 | } | ||||||
151 | } | ||||||
152 | elsif ( ref $args ne 'HASH' ) { | ||||||
153 | 4 | 3 | my $rules; | ||||
154 | { | ||||||
155 | 4 | 4 | local $@; | ||||
4 | 5 | ||||||
156 | 4 | 4 | eval { | ||||
157 | 4 | 100 | 49 | $args->isa('HTML::Laundry::Rules') | |||
158 | and $rules = $args->new; | ||||||
159 | }; | ||||||
160 | } | ||||||
161 | 4 | 100 | 12 | if ($rules) { | |||
162 | 1 | 8 | $args = { rules => $args }; | ||||
163 | } | ||||||
164 | else { | ||||||
165 | 3 | 6 | $args = {}; | ||||
166 | } | ||||||
167 | } | ||||||
168 | |||||||
169 | 25 | 53 | $self->{tidy} = undef; | ||||
170 | 25 | 41 | $self->{tidy_added_inline} = {}; | ||||
171 | 25 | 44 | $self->{tidy_added_empty} = {}; | ||||
172 | 25 | 146 | $self->{base_uri} = q{}; | ||||
173 | 25 | 35 | bless $self, $class; | ||||
174 | 25 | 65 | $self->clear_callback('start_tag'); | ||||
175 | 25 | 44 | $self->clear_callback('end_tag'); | ||||
176 | 25 | 44 | $self->clear_callback('uri'); | ||||
177 | 25 | 39 | $self->clear_callback('text'); | ||||
178 | 25 | 40 | $self->clear_callback('output'); | ||||
179 | $self->{parser} = HTML::Parser->new( | ||||||
180 | api_version => 3, | ||||||
181 | utf8_mode => 1, | ||||||
182 | 481 | 481 | 930 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
183 | 453 | 453 | 925 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
184 | 25 | 139 | 316 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
139 | 299 | ||||||
185 | empty_element_tags => 1, | ||||||
186 | marked_sections => 1, | ||||||
187 | ); | ||||||
188 | $self->{cdata_parser} = HTML::Parser->new( | ||||||
189 | api_version => 3, | ||||||
190 | utf8_mode => 1, | ||||||
191 | 5 | 5 | 8 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
192 | 5 | 5 | 9 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
193 | 25 | 14 | 2035 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext' ], | |||
14 | 25 | ||||||
194 | empty_element_tags => 1, | ||||||
195 | unbroken_text => 1, | ||||||
196 | marked_sections => 0, | ||||||
197 | ); | ||||||
198 | 25 | 1244 | $self->initialize($args); | ||||
199 | |||||||
200 | 25 | 100 | 68 | if ( !$args->{notidy} ) { | |||
201 | 8 | 13 | $self->_generate_tidy; | ||||
202 | } | ||||||
203 | 25 | 74 | return $self; | ||||
204 | } | ||||||
205 | |||||||
206 | =head2 initialize | ||||||
207 | |||||||
208 | Instantiates the Laundry object properties based on an | ||||||
209 | HTML::Laundry::Rules module. | ||||||
210 | |||||||
211 | =cut | ||||||
212 | |||||||
213 | sub initialize { | ||||||
214 | 25 | 25 | 1 | 36 | my ( $self, $args ) = @_; | ||
215 | |||||||
216 | # Set defaults | ||||||
217 | 25 | 35 | $self->{tidy_added_tags} = undef; | ||||
218 | 25 | 35 | $self->{tidy_empty_tags} = undef; | ||||
219 | 25 | 33 | $self->{trim_trailing_whitespace} = 1; | ||||
220 | 25 | 30 | $self->{trim_tag_whitespace} = 0; | ||||
221 | $self->{base_uri} = URI->new( $args->{base_uri} ) | ||||||
222 | 25 | 100 | 98 | if $args->{base_uri}; | |||
223 | 25 | 2891 | my $rules = $args->{rules}; | ||||
224 | 25 | 66 | 205 | $rules ||= HTML::Laundry::Rules::Default->new(); | |||
225 | |||||||
226 | 25 | 73 | $self->{ruleset} = $rules; | ||||
227 | |||||||
228 | # Initialize based on ruleset | ||||||
229 | 25 | 102 | $self->{acceptable_a} = $rules->acceptable_a(); | ||||
230 | 25 | 101 | $self->{acceptable_e} = $rules->acceptable_e(); | ||||
231 | 25 | 102 | $self->{empty_e} = $rules->empty_e(); | ||||
232 | 25 | 141 | $self->{unacceptable_e} = $rules->unacceptable_e(); | ||||
233 | 25 | 90 | $self->{uri_list} = $rules->uri_list(); | ||||
234 | 25 | 102 | $self->{allowed_schemes} = $rules->allowed_schemes(); | ||||
235 | 25 | 91 | $rules->finalize_initialization($self); | ||||
236 | |||||||
237 | 25 | 38 | return; | ||||
238 | } | ||||||
239 | |||||||
240 | =head2 add_callback | ||||||
241 | |||||||
242 | Adds a callback of type "start_tag", "end_tag", "text", "uri", or "output" to | ||||||
243 | the appropriate internal array. | ||||||
244 | |||||||
245 | $l->add_callback('start_tag', sub { | ||||||
246 | my ($laundry, $tagref, $attrhashref) = @_; | ||||||
247 | # Now, perform actions and return | ||||||
248 | }); | ||||||
249 | |||||||
250 | start_tag, end_tag, text, and uri callbacks that return false values will | ||||||
251 | suppress the return value of the element they are processing; this allows | ||||||
252 | additional checks to be done (for instance, images can be allowed only from | ||||||
253 | whitelisted source domains). | ||||||
254 | |||||||
255 | =cut | ||||||
256 | |||||||
257 | sub add_callback { | ||||||
258 | 21 | 21 | 1 | 4541 | my ( $self, $action, $ref ) = @_; | ||
259 | 21 | 50 | 46 | return if ( ref($ref) ne 'CODE' ); | |||
260 | 21 | 100 | 72 | if ($action eq q{start_tag}) { | |||
100 | |||||||
100 | |||||||
100 | |||||||
50 | |||||||
261 | 4 | 3 | push @{ $self->{start_tag_callback} }, $ref; | ||||
4 | 7 | ||||||
262 | } elsif ($action eq q{end_tag}) { | ||||||
263 | 4 | 4 | push @{ $self->{end_tag_callback} }, $ref; | ||||
4 | 8 | ||||||
264 | } elsif ($action eq q{text}) { | ||||||
265 | 6 | 5 | push @{ $self->{text_callback} }, $ref; | ||||
6 | 13 | ||||||
266 | } elsif ($action eq q{uri}) { | ||||||
267 | 4 | 4 | push @{ $self->{uri_callback} }, $ref; | ||||
4 | 6 | ||||||
268 | } elsif ($action eq q{output}) { | ||||||
269 | 3 | 5 | push @{ $self->{output_callback} }, $ref; | ||||
3 | 6 | ||||||
270 | } | ||||||
271 | 21 | 32 | return; | ||||
272 | } | ||||||
273 | |||||||
274 | =head2 clear_callback | ||||||
275 | |||||||
276 | Removes all callbacks of given type. | ||||||
277 | |||||||
278 | $l->clear_callback('start_tag'); | ||||||
279 | |||||||
280 | =cut | ||||||
281 | |||||||
282 | sub clear_callback { | ||||||
283 | 139 | 139 | 1 | 11155 | my ( $self, $action ) = @_; | ||
284 | 139 | 100 | 421 | if ($action eq q{start_tag}) { | |||
100 | |||||||
100 | |||||||
100 | |||||||
50 | |||||||
285 | 27 | 493 | 236 | $self->{start_tag_callback} = [ sub { 1; } ]; | |||
493 | 491 | ||||||
286 | } elsif ($action eq q{end_tag}) { | ||||||
287 | 27 | 467 | 92 | $self->{end_tag_callback} = [ sub { 1; } ]; | |||
467 | 448 | ||||||
288 | } elsif ($action eq q{text}) { | ||||||
289 | 29 | 143 | 107 | $self->{text_callback} = [ sub { 1; } ]; | |||
143 | 145 | ||||||
290 | } elsif ($action eq q{uri}) { | ||||||
291 | 29 | 58 | 99 | $self->{uri_callback} = [ sub { 1; } ]; | |||
58 | 70 | ||||||
292 | } elsif ($action eq q{output}) { | ||||||
293 | 27 | 462 | 87 | $self->{output_callback} = [ sub { 1; } ]; | |||
462 | 430 | ||||||
294 | } | ||||||
295 | 139 | 230 | return; | ||||
296 | } | ||||||
297 | |||||||
298 | =head2 clean | ||||||
299 | |||||||
300 | Cleans a snippet of HTML, using the ruleset and object creation options given | ||||||
301 | to the Laundry object. The snippet should be passed as a scalar. | ||||||
302 | |||||||
303 | $output1 = $l->clean( ' The X-rays were penetrating' ); |
||||||
304 | $output2 = $l->clean( $snippet ); | ||||||
305 | |||||||
306 | =cut | ||||||
307 | |||||||
308 | sub clean { | ||||||
309 | 462 | 462 | 1 | 134162 | my ( $self, $chunk, $args ) = @_; | ||
310 | 462 | 900 | $self->_reset_state(); | ||||
311 | 462 | 50 | 960 | if ( $self->{trim_tag_whitespace} ) { | |||
312 | 0 | 0 | $chunk =~ s/$tag_leading_whitespace/$1/gs; | ||||
313 | } | ||||||
314 | 462 | 443 | my $p = $self->{parser}; | ||||
315 | 462 | 400 | my $cp = $self->{cdata_parser}; | ||||
316 | 462 | 3265 | $p->parse($chunk); | ||||
317 | 462 | 100 | 33 | 1291 | if ( !$in_cdata && !$unacceptable_count ) { | ||
318 | 461 | 999 | $p->eof(); | ||||
319 | } | ||||||
320 | 462 | 50 | 33 | 799 | if ( $in_cdata && !$local_unacceptable_count ) { | ||
321 | 0 | 0 | $cp->eof(); | ||||
322 | } | ||||||
323 | 462 | 661 | my $output = $self->gen_output; | ||||
324 | 462 | 895 | $cp->eof(); # Clear buffer if we haven't already | ||||
325 | 462 | 100 | 627 | if ($cdata_dirty) { # Overkill to get out of CDATA parser state | |||
326 | $self->{parser} = HTML::Parser->new( | ||||||
327 | api_version => 3, | ||||||
328 | start_h => | ||||||
329 | 7 | 7 | 11 | [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
330 | 9 | 9 | 15 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
331 | 4 | 19 | 34 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
19 | 26 | ||||||
332 | empty_element_tags => 1, | ||||||
333 | marked_sections => 1, | ||||||
334 | ); | ||||||
335 | } | ||||||
336 | else { | ||||||
337 | 458 | 647 | $p->eof(); # Clear buffer if we haven't already | ||||
338 | } | ||||||
339 | 462 | 2090 | return $output; | ||||
340 | } | ||||||
341 | |||||||
342 | =head2 base_uri | ||||||
343 | |||||||
344 | Used to get or set the base_uri property, used in URI rebasing. | ||||||
345 | |||||||
346 | my $base_uri = $l->base_uri; # returns current base_uri | ||||||
347 | $l->base_uri(q{http://example.com}); # return 'http://example.com' | ||||||
348 | $l->base_uri(''); # unsets base_uri | ||||||
349 | |||||||
350 | =cut | ||||||
351 | |||||||
352 | sub base_uri { | ||||||
353 | 3 | 3 | 1 | 4 | my ( $self, $new_base ) = @_; | ||
354 | 3 | 100 | 66 | 17 | if ( defined $new_base and !ref $new_base ) { | ||
355 | 2 | 4 | $self->{base_uri} = $new_base; | ||||
356 | } | ||||||
357 | 3 | 12 | return $self->{base_uri}; | ||||
358 | } | ||||||
359 | |||||||
360 | sub _run_callbacks { | ||||||
361 | 1623 | 1623 | 1345 | my $self = shift; | |||
362 | 1623 | 1252 | my $action = shift; | ||||
363 | 1623 | 50 | 2258 | return unless $action; | |||
364 | 1623 | 1802 | my $type = $action . q{_callback}; | ||||
365 | 1623 | 1175 | for my $callback ( @{ $self->{$type} } ) { | ||||
1623 | 2773 | ||||||
366 | 1650 | 2194 | my $result = $callback->( $self, @_ ); | ||||
367 | 1650 | 100 | 13391 | return unless $result; | |||
368 | } | ||||||
369 | 1619 | 2724 | return 1; | ||||
370 | } | ||||||
371 | |||||||
372 | =head2 gen_output | ||||||
373 | |||||||
374 | Used to generate the final, XHTML output from the internal stack of text and | ||||||
375 | tag tokens. Generally meant to be used internally, but potentially useful for | ||||||
376 | callbacks that require a snapshot of what the output would look like | ||||||
377 | before the cleaning process is complete. | ||||||
378 | |||||||
379 | my $xhtml = $l->gen_output; | ||||||
380 | |||||||
381 | =cut | ||||||
382 | |||||||
383 | sub gen_output { | ||||||
384 | 462 | 462 | 1 | 458 | my $self = shift; | ||
385 | 462 | 50 | 648 | if ( !$self->_run_callbacks( q{output}, \@fragments ) ) { | |||
386 | 0 | 0 | return q{}; | ||||
387 | } | ||||||
388 | 462 | 804 | my $output = join '', @fragments; | ||||
389 | 462 | 50 | 846 | if ( $self->{tidy} ) { | |||
390 | 0 | 0 | 0 | if ( $self->{tidy_engine} eq q{HTML::Tidy} ) { | |||
0 | |||||||
391 | 0 | 0 | $output = $self->{tidy}->clean($output); | ||||
392 | 0 | 0 | $self->{tidy}->clear_messages; | ||||
393 | } | ||||||
394 | elsif ( $self->{tidy_engine} eq q{HTML::Tidy::libXML} ) { | ||||||
395 | my $clean | ||||||
396 | = $self->{tidy} | ||||||
397 | ->clean( $self->{tidy_head} . $output . $self->{tidy_foot}, | ||||||
398 | 0 | 0 | 'UTF-8', 1 ); | ||||
399 | 0 | 0 | $output = substr( $clean, length $self->{tidy_head} ); | ||||
400 | 0 | 0 | $output = substr( $output, 0, -1 * length $self->{tidy_foot} ); | ||||
401 | } | ||||||
402 | } | ||||||
403 | 462 | 50 | 706 | if ( $self->{trim_trailing_whitespace} ) { | |||
404 | 462 | 1256 | $output =~ s/\s+$//; | ||||
405 | } | ||||||
406 | 462 | 584 | return $output; | ||||
407 | } | ||||||
408 | |||||||
409 | =head2 empty_elements | ||||||
410 | |||||||
411 | Returns a list of the Laundry object's known empty elements: elements such | ||||||
412 | as or which must not contain any children. |
||||||
413 | |||||||
414 | =cut | ||||||
415 | |||||||
416 | sub empty_elements { | ||||||
417 | 0 | 0 | 1 | 0 | my ( $self, $listref ) = @_; | ||
418 | 0 | 0 | 0 | if ($listref) { | |||
419 | 0 | 0 | my @list = @{$listref}; | ||||
0 | 0 | ||||||
420 | 0 | 0 | my %empty = map { ( $_, 1 ) } @list; | ||||
0 | 0 | ||||||
421 | 0 | 0 | $self->{empty_e} = \%empty; | ||||
422 | } | ||||||
423 | 0 | 0 | return keys %{ $self->{empty_e} }; | ||||
0 | 0 | ||||||
424 | } | ||||||
425 | |||||||
426 | =head2 remove_empty_element | ||||||
427 | |||||||
428 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
429 | the "empty elements" list maintained by the Laundry object. | ||||||
430 | |||||||
431 | $l->remove_empty_element(['img', 'br']); # Let's break XHTML! | ||||||
432 | |||||||
433 | This will not affect the acceptable/unacceptable status of the elements. | ||||||
434 | |||||||
435 | =cut | ||||||
436 | |||||||
437 | sub remove_empty_element { | ||||||
438 | 4 | 4 | 1 | 475 | my ( $self, $new_e, $args ) = @_; | ||
439 | 4 | 7 | my $empty = $self->{empty_e}; | ||||
440 | 4 | 100 | 13 | if ( ref($new_e) eq 'ARRAY' ) { | |||
441 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
1 | 4 | ||||||
442 | 2 | 9 | $self->remove_empty_element( $e, $args ); | ||||
443 | } | ||||||
444 | } | ||||||
445 | else { | ||||||
446 | 3 | 6 | delete $empty->{$new_e}; | ||||
447 | } | ||||||
448 | 4 | 8 | return 1; | ||||
449 | } | ||||||
450 | |||||||
451 | =head2 acceptable_elements | ||||||
452 | |||||||
453 | Returns a list of the Laundry object's known acceptable elements, which will | ||||||
454 | not be stripped during the sanitizing process. | ||||||
455 | |||||||
456 | =cut | ||||||
457 | |||||||
458 | sub acceptable_elements { | ||||||
459 | 4 | 4 | 1 | 346 | my ( $self, $listref ) = @_; | ||
460 | 4 | 100 | 18 | if ( ref($listref) eq 'ARRAY' ) { | |||
461 | 1 | 3 | my @list = @{$listref}; | ||||
1 | 4 | ||||||
462 | 1 | 4 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
5 | 11 | ||||||
463 | 1 | 4 | $self->{acceptable_e} = \%acceptable; | ||||
464 | } | ||||||
465 | 4 | 17 | return keys %{ $self->{acceptable_e} }; | ||||
4 | 77 | ||||||
466 | } | ||||||
467 | |||||||
468 | =head2 add_acceptable_element | ||||||
469 | |||||||
470 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
471 | "acceptable elements" list maintained by the Laundry object. Items added in | ||||||
472 | this manner will automatically be removed from the "unacceptable elements" | ||||||
473 | list if they are present. | ||||||
474 | |||||||
475 | $l->add_acceptable_element('style'); | ||||||
476 | |||||||
477 | Elements which are empty may be flagged as such with an optional argument. | ||||||
478 | If this flag is set, all elements provided by the call will be added to | ||||||
479 | the "empty element" list. | ||||||
480 | |||||||
481 | $l->add_acceptable_element(['applet', 'script'], { empty => 1 }); | ||||||
482 | |||||||
483 | =cut | ||||||
484 | |||||||
485 | sub add_acceptable_element { | ||||||
486 | 10 | 10 | 1 | 1204 | my ( $self, $new_e, $args ) = @_; | ||
487 | 10 | 13 | my $acceptable = $self->{acceptable_e}; | ||||
488 | 10 | 9 | my $empty = $self->{empty_e}; | ||||
489 | 10 | 11 | my $unacceptable = $self->{unacceptable_e}; | ||||
490 | 10 | 100 | 28 | if ( ref($new_e) eq 'ARRAY' ) { | |||
491 | 2 | 2 | foreach my $e ( @{$new_e} ) { | ||||
2 | 5 | ||||||
492 | 4 | 10 | $self->add_acceptable_element( $e, $args ); | ||||
493 | } | ||||||
494 | } | ||||||
495 | else { | ||||||
496 | 8 | 14 | $acceptable->{$new_e} = 1; | ||||
497 | 8 | 100 | 22 | if ( $args->{empty} ) { | |||
50 | |||||||
498 | 4 | 9 | $empty->{$new_e} = 1; | ||||
499 | 4 | 50 | 14 | if ( $self->{tidy} ) { | |||
500 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
501 | 0 | 0 | $self->{tidy_added_empty}->{$new_e} = 1; | ||||
502 | 0 | 0 | $self->_generate_tidy; | ||||
503 | } | ||||||
504 | } | ||||||
505 | elsif ( $self->{tidy} ) { | ||||||
506 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
507 | 0 | 0 | $self->_generate_tidy; | ||||
508 | } | ||||||
509 | 8 | 13 | delete $unacceptable->{$new_e}; | ||||
510 | |||||||
511 | } | ||||||
512 | 10 | 17 | return 1; | ||||
513 | } | ||||||
514 | |||||||
515 | =head2 remove_acceptable_element | ||||||
516 | |||||||
517 | Removes an element (or, if given an array reference, multiple elements) to the | ||||||
518 | "acceptable elements" list maintained by the Laundry object. These items | ||||||
519 | (although not their child elements) will now be stripped during parsing. | ||||||
520 | |||||||
521 | $l->remove_acceptable_element(['img', 'h1', 'h2']); | ||||||
522 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
523 | # returns 'The Day the World Turned Day-Glo' | ||||||
524 | |||||||
525 | =cut | ||||||
526 | |||||||
527 | sub remove_acceptable_element { | ||||||
528 | 16 | 16 | 1 | 23 | my ( $self, $new_e, $args ) = @_; | ||
529 | 16 | 17 | my $acceptable = $self->{acceptable_e}; | ||||
530 | 16 | 100 | 29 | if ( ref($new_e) eq 'ARRAY' ) { | |||
531 | 2 | 4 | foreach my $e ( @{$new_e} ) { | ||||
2 | 7 | ||||||
532 | 4 | 11 | $self->remove_acceptable_element( $e, $args ); | ||||
533 | } | ||||||
534 | } | ||||||
535 | else { | ||||||
536 | 14 | 23 | delete $acceptable->{$new_e}; | ||||
537 | } | ||||||
538 | 16 | 21 | return 1; | ||||
539 | } | ||||||
540 | |||||||
541 | =head2 unacceptable_elements | ||||||
542 | |||||||
543 | Returns a list of the Laundry object's unacceptable elements, which will be | ||||||
544 | stripped -- B |
||||||
545 | |||||||
546 | =cut | ||||||
547 | |||||||
548 | sub unacceptable_elements { | ||||||
549 | 3 | 3 | 1 | 3 | my ( $self, $listref ) = @_; | ||
550 | 3 | 100 | 8 | if ( ref($listref) eq 'ARRAY' ) { | |||
551 | 1 | 2 | my @list = @{$listref}; | ||||
1 | 3 | ||||||
552 | my %unacceptable | ||||||
553 | 1 | 2 | = map { $self->remove_acceptable_element($_); ( $_, 1 ); } @list; | ||||
5 | 6 | ||||||
5 | 9 | ||||||
554 | 1 | 4 | $self->{unacceptable_e} = \%unacceptable; | ||||
555 | } | ||||||
556 | 3 | 3 | return keys %{ $self->{unacceptable_e} }; | ||||
3 | 14 | ||||||
557 | } | ||||||
558 | |||||||
559 | =head2 add_unacceptable_element | ||||||
560 | |||||||
561 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
562 | "unacceptable elements" list maintained by the Laundry object. | ||||||
563 | |||||||
564 | $l->add_unacceptable_element(['h1', 'h2']); | ||||||
565 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
566 | # returns null string | ||||||
567 | |||||||
568 | =cut | ||||||
569 | |||||||
570 | sub add_unacceptable_element { | ||||||
571 | 4 | 4 | 1 | 483 | my ( $self, $new_e, $args ) = @_; | ||
572 | 4 | 3 | my $unacceptable = $self->{unacceptable_e}; | ||||
573 | 4 | 100 | 9 | if ( ref($new_e) eq 'ARRAY' ) { | |||
574 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
1 | 3 | ||||||
575 | 2 | 5 | $self->add_unacceptable_element( $e, $args ); | ||||
576 | } | ||||||
577 | } | ||||||
578 | else { | ||||||
579 | 3 | 4 | $self->remove_acceptable_element($new_e); | ||||
580 | 3 | 4 | $unacceptable->{$new_e} = 1; | ||||
581 | } | ||||||
582 | 4 | 4 | return 1; | ||||
583 | } | ||||||
584 | |||||||
585 | =head2 remove_unacceptable_element | ||||||
586 | |||||||
587 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
588 | the "unacceptable elements" list maintained by the Laundry object. Note that | ||||||
589 | this does not automatically add the element to the acceptable_element list. | ||||||
590 | |||||||
591 | $l->clean(q{}); | ||||||
592 | # returns null string | ||||||
593 | $l->remove_unacceptable_element( q{script} ); | ||||||
594 | $l->clean(q{}); | ||||||
595 | # returns "alert('!')" | ||||||
596 | |||||||
597 | =cut | ||||||
598 | |||||||
599 | sub remove_unacceptable_element { | ||||||
600 | 4 | 4 | 1 | 4 | my ( $self, $new_e, $args ) = @_; | ||
601 | 4 | 5 | my $unacceptable = $self->{unacceptable_e}; | ||||
602 | 4 | 100 | 10 | if ( ref($new_e) eq 'ARRAY' ) { | |||
603 | 1 | 2 | foreach my $a ( @{$new_e} ) { | ||||
1 | 2 | ||||||
604 | 2 | 7 | $self->remove_unacceptable_element( $a, $args ); | ||||
605 | } | ||||||
606 | } | ||||||
607 | else { | ||||||
608 | 3 | 4 | delete $unacceptable->{$new_e}; | ||||
609 | } | ||||||
610 | 4 | 6 | return 1; | ||||
611 | } | ||||||
612 | |||||||
613 | =head2 acceptable_attributes | ||||||
614 | |||||||
615 | Returns a list of the Laundry object's known acceptable attributes, which will | ||||||
616 | not be stripped during the sanitizing process. | ||||||
617 | |||||||
618 | =cut | ||||||
619 | |||||||
620 | sub acceptable_attributes { | ||||||
621 | 3 | 3 | 1 | 5 | my ( $self, $listref ) = @_; | ||
622 | 3 | 100 | 7 | if ( ref($listref) eq 'ARRAY' ) { | |||
623 | 1 | 1 | my @list = @{$listref}; | ||||
1 | 4 | ||||||
624 | 1 | 2 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
3 | 8 | ||||||
625 | 1 | 2 | $self->{acceptable_a} = \%acceptable; | ||||
626 | } | ||||||
627 | 3 | 10 | return keys %{ $self->{acceptable_a} }; | ||||
3 | 36 | ||||||
628 | } | ||||||
629 | |||||||
630 | =head2 add_acceptable_attribute | ||||||
631 | |||||||
632 | Adds an attribute (or, if given an array reference, multiple attributes) to the | ||||||
633 | "acceptable attributes" list maintained by the Laundry object. | ||||||
634 | |||||||
635 | my $snippet = q{ "My dear Mr. Bennet," said his lady to |
||||||
636 | him one day, "have you heard that | ||||||
637 | Netherfield Park is let at last?" | ||||||
638 | }; | ||||||
639 | $l->clean( $snippet ); | ||||||
640 | # returns: | ||||||
641 | # "My dear Mr. Bennet," said his lady to him one day, |
||||||
642 | # "have you heard that Netherfield Park is let at | ||||||
643 | # last?" | ||||||
644 | $l->add_acceptable_attribute([austen:id, austen:footnote]); | ||||||
645 | $l->clean( $snippet ); | ||||||
646 | # returns: | ||||||
647 | # "My dear Mr. Bennet," said his lady to him |
||||||
648 | # one day, "have you heard that | ||||||
649 | # Netherfield Park is let at last?" | ||||||
650 | |||||||
651 | =cut | ||||||
652 | |||||||
653 | sub add_acceptable_attribute { | ||||||
654 | 4 | 4 | 1 | 1148 | my ( $self, $new_a, $args ) = @_; | ||
655 | 4 | 5 | my $acceptable = $self->{acceptable_a}; | ||||
656 | 4 | 100 | 11 | if ( ref($new_a) eq 'ARRAY' ) { | |||
657 | 1 | 2 | foreach my $a ( @{$new_a} ) { | ||||
1 | 2 | ||||||
658 | 2 | 5 | $self->add_acceptable_attribute( $a, $args ); | ||||
659 | } | ||||||
660 | } | ||||||
661 | else { | ||||||
662 | 3 | 5 | $acceptable->{$new_a} = 1; | ||||
663 | } | ||||||
664 | 4 | 5 | return 1; | ||||
665 | } | ||||||
666 | |||||||
667 | =head2 remove_acceptable_attribute | ||||||
668 | |||||||
669 | Removes an attribute (or, if given an array reference, multiple attributes) | ||||||
670 | from the "acceptable attributes" list maintained by the Laundry object. | ||||||
671 | |||||||
672 | $l->clean(q{ plover }); |
||||||
673 | # returns ' plover ' |
||||||
674 | $l->remove_acceptable_element( q{id} ); | ||||||
675 | $l->clean(q{ plover }); |
||||||
676 | # returns ' plover |
||||||
677 | |||||||
678 | =cut | ||||||
679 | |||||||
680 | sub remove_acceptable_attribute { | ||||||
681 | 4 | 4 | 1 | 7 | my ( $self, $new_a, $args ) = @_; | ||
682 | 4 | 4 | my $acceptable = $self->{acceptable_a}; | ||||
683 | 4 | 100 | 8 | if ( ref($new_a) eq 'ARRAY' ) { | |||
684 | 1 | 2 | foreach my $a ( @{$new_a} ) { | ||||
1 | 2 | ||||||
685 | 2 | 5 | $self->remove_acceptable_attribute( $a, $args ); | ||||
686 | } | ||||||
687 | } | ||||||
688 | else { | ||||||
689 | 3 | 3 | delete $acceptable->{$new_a}; | ||||
690 | } | ||||||
691 | 4 | 5 | return 1; | ||||
692 | } | ||||||
693 | |||||||
694 | sub _generate_tidy { | ||||||
695 | 8 | 8 | 8 | my $self = shift; | |||
696 | 8 | 5 | my $param = shift; | ||||
697 | 8 | 13 | $self->_generate_html_tidy; | ||||
698 | 8 | 50 | 38 | if ( !$self->{tidy} ) { | |||
699 | 8 | 18 | $self->_generate_html_tidy_libxml; | ||||
700 | } | ||||||
701 | 8 | 25 | return; | ||||
702 | } | ||||||
703 | |||||||
704 | sub _generate_html_tidy_libxml { | ||||||
705 | 8 | 8 | 10 | my $self = shift; | |||
706 | { | ||||||
707 | 8 | 7 | local $@; | ||||
8 | 7 | ||||||
708 | 8 | 11 | eval { | ||||
709 | 8 | 1414 | require HTML::Tidy::libXML; | ||||
710 | 0 | 0 | $self->{tidy} = HTML::Tidy::libXML->new(); | ||||
711 | 0 | 0 | $self->{tidy_head} = q{ | ||||
712 | |||||||
713 | "http://www.w3.org/TR/ html1/DTD/ html1-transitional.dtd"> | ||||||
714 | }; | ||||||
715 | 0 | 0 | $self->{tidy_foot} = q{ | ||||
716 | }; | ||||||
717 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy::libXML}; | ||||
718 | 0 | 0 | 1; | ||||
719 | }; | ||||||
720 | } | ||||||
721 | } | ||||||
722 | |||||||
723 | sub _generate_html_tidy { | ||||||
724 | 8 | 8 | 7 | my $self = shift; | |||
725 | { | ||||||
726 | 8 | 5 | local $@; | ||||
8 | 9 | ||||||
727 | 8 | 9 | eval { | ||||
728 | 8 | 1658 | require HTML::Tidy; | ||||
729 | 0 | 0 | $self->{tidy_ruleset} = $self->{ruleset}->tidy_ruleset; | ||||
730 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_inline} } ) { | |||
0 | 0 | ||||||
731 | $self->{tidy_ruleset}->{new_inline_tags} | ||||||
732 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_inline} } ); | ||||
0 | 0 | ||||||
733 | } | ||||||
734 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_empty} } ) { | |||
0 | 0 | ||||||
735 | $self->{tidy_ruleset}->{new_empty_tags} | ||||||
736 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_empty} } ); | ||||
0 | 0 | ||||||
737 | } | ||||||
738 | 0 | 0 | $self->{tidy} = HTML::Tidy->new( $self->{tidy_ruleset} ); | ||||
739 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy}; | ||||
740 | 0 | 0 | 1; | ||||
741 | }; | ||||||
742 | } | ||||||
743 | } | ||||||
744 | |||||||
745 | sub _reset_state { | ||||||
746 | 462 | 462 | 406 | my ($self) = @_; | |||
747 | 462 | 709 | @fragments = (); | ||||
748 | 462 | 372 | $unacceptable_count = 0; | ||||
749 | 462 | 423 | $local_unacceptable_count = 0; | ||||
750 | 462 | 373 | $in_cdata = 0; | ||||
751 | 462 | 325 | $cdata_dirty = 0; | ||||
752 | 462 | 457 | return; | ||||
753 | } | ||||||
754 | |||||||
755 | sub _tag_start_handler { | ||||||
756 | 493 | 493 | 547 | my ( $self, $tagname, $attr ) = @_; | |||
757 | 493 | 100 | 910 | if ( !$self->_run_callbacks( q{start_tag}, \$tagname, $attr ) ) { | |||
758 | 1 | 7 | return; | ||||
759 | } | ||||||
760 | 492 | 100 | 781 | if ( !$in_cdata ) { | |||
761 | 487 | 442 | $cdata_dirty = 0; | ||||
762 | } | ||||||
763 | 492 | 355 | my @attributes; | ||||
764 | 492 | 360 | foreach my $k ( keys %{$attr} ) { | ||||
492 | 1162 | ||||||
765 | 259 | 100 | 569 | if ( $self->{acceptable_a}->{$k} ) { | |||
766 | 174 | 100 | 130 | if ( grep {/^$k$/} @{ $self->{uri_list}->{$tagname} } ) { | |||
151 | 719 | ||||||
174 | 401 | ||||||
767 | $self->_uri_handler( $tagname, \$k, \$attr->{$k}, | ||||||
768 | 58 | 177 | $self->{base_uri} ); | ||||
769 | } | ||||||
770 | |||||||
771 | # Allow uri handler to suppress insertion | ||||||
772 | 174 | 100 | 298 | if ($k) { | |||
773 | 157 | 393 | push @attributes, $k . q{="} . $attr->{$k} . q{"}; | ||||
774 | } | ||||||
775 | } | ||||||
776 | } | ||||||
777 | 492 | 752 | my $attributes = join q{ }, @attributes; | ||||
778 | 492 | 100 | 902 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
779 | 376 | 100 | 591 | if ( $self->{empty_e}->{$tagname} ) { | |||
780 | 58 | 100 | 105 | if ($attributes) { | |||
781 | 19 | 23 | $attributes = $attributes . q{ }; | ||||
782 | } | ||||||
783 | 58 | 118 | push @fragments, "<$tagname $attributes/>"; | ||||
784 | } | ||||||
785 | else { | ||||||
786 | 318 | 100 | 516 | if ($attributes) { | |||
787 | 122 | 172 | $attributes = q{ } . $attributes; | ||||
788 | } | ||||||
789 | 318 | 566 | push @fragments, "<$tagname$attributes>"; | ||||
790 | } | ||||||
791 | } | ||||||
792 | else { | ||||||
793 | 116 | 100 | 232 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
794 | 24 | 100 | 39 | if ($in_cdata) { | |||
795 | 3 | 3 | $local_unacceptable_count += 1; | ||||
796 | } | ||||||
797 | else { | ||||||
798 | 21 | 48 | $unacceptable_count += 1; | ||||
799 | } | ||||||
800 | } | ||||||
801 | } | ||||||
802 | 492 | 2313 | return; | ||||
803 | } | ||||||
804 | |||||||
805 | sub _tag_end_handler { | ||||||
806 | 467 | 467 | 504 | my ( $self, $tagname ) = @_; | |||
807 | 467 | 100 | 658 | if ( !$self->_run_callbacks( q{end_tag}, \$tagname ) ) { | |||
808 | 1 | 5 | return; | ||||
809 | } | ||||||
810 | 466 | 100 | 701 | if ( !$in_cdata ) { | |||
811 | 463 | 380 | $cdata_dirty = 0; | ||||
812 | } | ||||||
813 | 466 | 100 | 804 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
814 | 346 | 100 | 600 | if ( !$self->{empty_e}->{$tagname} ) { | |||
815 | 316 | 535 | push @fragments, "$tagname>"; | ||||
816 | } | ||||||
817 | } | ||||||
818 | else { | ||||||
819 | 120 | 100 | 217 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
820 | 30 | 100 | 36 | if ($in_cdata) { | |||
821 | 1 | 2 | $local_unacceptable_count -= 1; | ||||
822 | 1 | 50 | 3 | $local_unacceptable_count = 0 | |||
823 | if ( $local_unacceptable_count < 0 ); | ||||||
824 | } | ||||||
825 | else { | ||||||
826 | 29 | 28 | $unacceptable_count -= 1; | ||||
827 | 29 | 100 | 55 | $unacceptable_count = 0 if ( $unacceptable_count < 0 ); | |||
828 | } | ||||||
829 | } | ||||||
830 | } | ||||||
831 | 466 | 984 | return; | ||||
832 | } | ||||||
833 | |||||||
834 | sub _text_handler { | ||||||
835 | 172 | 172 | 241 | my ( $self, $text, $is_cdata ) = @_; | |||
836 | 172 | 100 | 100 | 378 | if ( $in_cdata && $local_unacceptable_count ) { | ||
837 | 1 | 3 | return; | ||||
838 | } | ||||||
839 | 171 | 100 | 258 | if ($unacceptable_count) { | |||
840 | 15 | 41 | return; | ||||
841 | } | ||||||
842 | 156 | 100 | 214 | if ($is_cdata) { | |||
843 | 13 | 15 | my $cp = $self->{cdata_parser}; | ||||
844 | 13 | 7 | $in_cdata = 1; | ||||
845 | 13 | 29 | $cp->parse($text); | ||||
846 | 13 | 100 | 21 | if ( !$local_unacceptable_count ) { | |||
847 | 11 | 31 | $cp->eof(); | ||||
848 | } | ||||||
849 | 13 | 13 | $cdata_dirty = 1; | ||||
850 | 13 | 6 | $in_cdata = 0; | ||||
851 | 13 | 33 | return; | ||||
852 | } | ||||||
853 | else { | ||||||
854 | 143 | 100 | 246 | if ( !$self->_run_callbacks( q{text}, \$text, $is_cdata ) ) { | |||
855 | 1 | 6 | return q{}; | ||||
856 | } | ||||||
857 | 142 | 386 | $text = encode_entities( $text, '<>&"' ); | ||||
858 | 142 | 7032 | $cdata_dirty = 0; | ||||
859 | } | ||||||
860 | 142 | 177 | push @fragments, $text; | ||||
861 | 142 | 490 | return; | ||||
862 | } | ||||||
863 | |||||||
864 | sub _uri_handler { | ||||||
865 | 58 | 58 | 76 | my ( $self, $tagname, $attr_ref, $value_ref, $base ) = @_; | |||
866 | 58 | 47 | my ( $attr, $value ) = ( ${$attr_ref}, ${$value_ref} ); | ||||
58 | 65 | ||||||
58 | 71 | ||||||
867 | 58 | 120 | $value =~ s/[`\x00-\x1f\x7f]+//g; | ||||
868 | 58 | 71 | $value =~ s/\ufffd//g; | ||||
869 | 58 | 205 | my $uri = URI->new($value); | ||||
870 | 58 | 45308 | $uri = $uri->canonical; | ||||
871 | 58 | 100 | 3526 | if ( !$self->_run_callbacks( q{uri}, $tagname, $attr, \$uri ) ) { | |||
872 | 1 | 1 | ${$attr_ref} = q{}; | ||||
1 | 2 | ||||||
873 | 1 | 2 | return undef; | ||||
874 | } | ||||||
875 | 57 | 100 | 66 | 270 | if ( $self->{allowed_schemes} and $uri->scheme ) { | ||
876 | 42 | 100 | 513 | unless ( $self->{allowed_schemes}->{ $uri->scheme } ) { | |||
877 | 16 | 131 | ${$attr_ref} = q{}; | ||||
16 | 23 | ||||||
878 | 16 | 43 | return undef; | ||||
879 | } | ||||||
880 | } | ||||||
881 | 41 | 100 | 451 | if ( $self->{base_uri} ) { | |||
882 | 8 | 57 | $uri = URI->new_abs( $uri->as_string, $self->{base_uri} ); | ||||
883 | } | ||||||
884 | 41 | 100 | 6726 | if ( $uri->scheme ) { # Not a local URI | |||
885 | 33 | 265 | my $host; | ||||
886 | { | ||||||
887 | 33 | 33 | local $@; | ||||
33 | 28 | ||||||
888 | 33 | 43 | eval { $host = $uri->host; }; | ||||
33 | 70 | ||||||
889 | } | ||||||
890 | 33 | 50 | 550 | if ($host) { | |||
891 | |||||||
892 | # We may need to manually unescape domain names | ||||||
893 | # to deal with issues like tinyarro.ws | ||||||
894 | 33 | 68 | my $utf8_host = $self->_decode_utf8($host); | ||||
895 | 33 | 52 | utf8::upgrade($utf8_host); | ||||
896 | 33 | 50 | 51 | if ( $uri->host ne $utf8_host ) { | |||
897 | |||||||
898 | # TODO: Optionally use Punycode in this case | ||||||
899 | |||||||
900 | 0 | 0 | 0 | 0 | if ( $uri->port and $uri->port == $uri->default_port ) { | ||
901 | 0 | 0 | $uri->port(undef); | ||||
902 | } | ||||||
903 | 0 | 0 | my $escaped_host = $self->_encode_utf8( $uri->host ); | ||||
904 | 0 | 0 | my $uri_str = $uri->canonical->as_string; | ||||
905 | 0 | 0 | $uri_str =~ s/$escaped_host/$utf8_host/; | ||||
906 | 0 | 0 | utf8::upgrade($uri_str); | ||||
907 | 0 | 0 | ${$value_ref} = $uri_str; | ||||
0 | 0 | ||||||
908 | 0 | 0 | return; | ||||
909 | } | ||||||
910 | } | ||||||
911 | } | ||||||
912 | 41 | 598 | ${$value_ref} = $uri->canonical->as_string; | ||||
41 | 2006 | ||||||
913 | 41 | 141 | return; | ||||
914 | } | ||||||
915 | |||||||
916 | sub _decode_utf8 { | ||||||
917 | 33 | 33 | 31 | my $self = shift; | |||
918 | 33 | 37 | my $orig = my $str = shift; | ||||
919 | 33 | 30 | $str =~ s/\%([0-9a-f]{2})/chr(hex($1))/egi; | ||||
0 | 0 | ||||||
920 | 33 | 50 | 165 | return $str if utf8::decode($str); | |||
921 | 0 | return $orig; | |||||
922 | } | ||||||
923 | |||||||
924 | sub _encode_utf8 { | ||||||
925 | 0 | 0 | my $self = shift; | ||||
926 | 0 | my $str = shift; | |||||
927 | 0 | my $highbit = qr/[^\w\$-_.+!*'(),]/; | |||||
928 | 0 | $str =~ s/($highbit)/ sprintf ("%%%02X", ord($1)) /ge; | |||||
0 | |||||||
929 | 0 | utf8::upgrade($str); | |||||
930 | 0 | return $str; | |||||
931 | } | ||||||
932 | |||||||
933 | =head1 SEE ALSO | ||||||
934 | |||||||
935 | There are a number of tools designed for sanitizing HTML, some of which | ||||||
936 | may be better suited than HTML::Laundry to particular circumstances. In | ||||||
937 | addition to L |
||||||
938 | L |
||||||
939 | solely for the purposes of sanitizing HTML from potential XSS attack vectors; | ||||||
940 | L |
||||||
941 | L |
||||||
942 | |||||||
943 | =head1 AUTHOR | ||||||
944 | |||||||
945 | Steve Cook, C<< |
||||||
946 | |||||||
947 | =head1 BUGS | ||||||
948 | |||||||
949 | Please report any bugs or feature requests on the GitHub page for this project, | ||||||
950 | http://github.com/snark/html-laundry. | ||||||
951 | |||||||
952 | =head1 ACKNOWLEDGMENTS | ||||||
953 | |||||||
954 | Thanks to Dave Cross and Vera Tobin. | ||||||
955 | |||||||
956 | =head1 SUPPORT | ||||||
957 | |||||||
958 | You can find documentation for this module with the perldoc command. | ||||||
959 | |||||||
960 | perldoc HTML::Laundry | ||||||
961 | |||||||
962 | =head1 COPYRIGHT & LICENSE | ||||||
963 | |||||||
964 | Copyright 2009 Six Apart, Ltd., all rights reserved. | ||||||
965 | |||||||
966 | This program is free software; you can redistribute it and/or modify it | ||||||
967 | under the same terms as Perl itself. | ||||||
968 | |||||||
969 | =cut | ||||||
970 | |||||||
971 | 1; # End of HTML::Laundry |