File Coverage

blib/lib/Catalyst/Plugin/HTML/Scrubber.pm
Criterion Covered Total %
statement 80 86 93.0
branch 53 70 75.7
condition 11 12 91.6
subroutine 13 13 100.0
pod 1 3 33.3
total 158 184 85.8


line stmt bran cond sub pod time code
1             package Catalyst::Plugin::HTML::Scrubber;
2             $Catalyst::Plugin::HTML::Scrubber::VERSION = '0.09';
3 5     5   11409680 use Moose;
  5         14  
  5         46  
4 5     5   42114 use namespace::autoclean;
  5         16  
  5         55  
5              
6             with 'Catalyst::ClassData';
7              
8 5     5   535 use MRO::Compat;
  5         14  
  5         173  
9 5     5   3406 use HTML::Scrubber;
  5         17497  
  5         6763  
10              
11             __PACKAGE__->mk_classdata('_scrubber');
12              
13             sub setup {
14 4     4 1 1579566 my $c = shift;
15              
16 4         38 my $conf = $c->config->{scrubber};
17 4 50       725 if (ref $conf eq 'ARRAY') {
    100          
18 0         0 $c->_scrubber(HTML::Scrubber->new(@$conf));
19             } elsif (ref $conf eq 'HASH') {
20             $c->config->{scrubber}{auto} = 1
21 3 50       18 unless defined $c->config->{scrubber}{auto};
22 3         315 $c->_scrubber(HTML::Scrubber->new(@{$conf->{params}}));
  3         99  
23             } else {
24 1         14 $c->_scrubber(HTML::Scrubber->new());
25             }
26              
27 4         989 return $c->maybe::next::method(@_);
28             }
29              
30             sub execute {
31 147     147 0 2982870 my $c = shift;
32              
33 147         863 $c->maybe::next::method(@_);
34              
35 147         78762 my $conf = $c->config->{scrubber};
36              
37             # There are two ways to configure the plugin, it seems; giving a hashref
38             # of params under `scrubber`, with any params intended for HTML::Scrubber
39             # under the vaguely-named `params` key, or an arrayref of params intended
40             # to be passed straight to HTML::Scrubber - save html_scrub() from knowing
41             # about that by abstracting that nastyness away:
42 147 100 100     14015 if (ref $conf ne 'HASH' || $conf->{auto}) {
43 111 100       642 $c->html_scrub(ref($conf) eq 'HASH' ? $conf : {});
44             }
45             }
46              
47             sub html_scrub {
48 113     113 0 601 my ($c, $conf) = @_;
49              
50             # Firstly, if an entry in ignore_urls matches, then we don't want to
51             # scrub anything for this request...
52 113 100       413 return if ($c->_req_path_exempt_from_scrubbing($conf));
53              
54             # If there's body_data - for e.g. a POSTed JSON body that was decoded -
55             # then we need to walk through it, scrubbing as appropriate; don't call
56             # body_data unless the content type is one there's a data handler for
57             # though, otherwise we'll trigger an exception (see GH#4)
58 101 100       298 if (exists $c->req->data_handlers->{ $c->req->content_type }) {
59 83 50       15097 if (my $body_data = $c->request->body_data) {
60 83         28680 $c->_scrub_recurse($conf, $c->request->body_data);
61             }
62             }
63              
64             # And if Catalyst::Controller::REST is in use so we have $req->data,
65             # then scrub that too
66 101 100       6380 if ($c->request->can('data')) {
67 9         427 my $data = $c->request->data;
68 9 50       479 if ($data) {
69 9         268 $c->_scrub_recurse($conf, $c->request->data);
70             }
71             }
72              
73             # Normal query/POST body parameters:
74 101         3375 $c->_scrub_recurse($conf, $c->request->parameters);
75              
76             }
77              
78             # Recursively scrub param values...
79             sub _scrub_recurse {
80 241     241   13038 my ($c, $conf, $data) = @_;
81              
82             # If the thing we've got is a hashref, walk over its keys, checking
83             # whether we should ignore, otherwise, do the needful
84 241 100       785 if (ref $data eq 'HASH') {
    50          
    0          
85 217         741 for my $key (keys %$data) {
86 268 100       873 if (!$c->_should_scrub_param($conf, $key, $data->{$key})) {
87 60         291 next;
88             }
89              
90             # OK, it's fine to fettle with this key - if its value is
91             # a ref, recurse, otherwise, scrub
92 208 100       643 if (my $ref = ref $data->{$key}) {
93             $c->_scrub_recurse($conf, $data->{$key})
94 48 50       251 if defined $data->{$key};
95             } else {
96             # Alright, non-ref value, so scrub it
97             # FIXME why did we have to have this ref-ref handling fun?
98             #$_ = $c->_scrubber->scrub($_) for (ref($$value) ? @{$$value} : $$value);
99             $data->{$key} = $c->_scrub_value($conf, $data->{$key})
100 160 50       618 if defined $data->{$key};
101             }
102             }
103             } elsif (ref $data eq 'ARRAY') {
104 24         72 for (@$data) {
105 48 50       94 if (ref $_) {
106 0         0 $c->_scrub_recurse($conf, $_);
107             } else {
108 48 50       132 $_ = $c->_scrub_value($conf, $_) if defined $_;
109             }
110             }
111             } elsif (ref $data eq 'CODE') {
112 0         0 $c->log->debug("Can't scrub a coderef!");
113             } else {
114             # Note that at this point we don't know what the param was called
115             # as we'll have called ourself with the value, but that's fine as
116             # name-based checks were already done; we do need to pass the
117             # value ($data) along to allow value-based ignore_values to work.
118 0 0       0 $data = $c->_scrub_value($conf, $data)
119             if $c->_should_scrub_param($conf, '', $data);
120             }
121             }
122              
123              
124             # Wrap HTML::Scrubber's scrub() so we can decode HTML entities if needed
125             sub _scrub_value {
126 208     208   522 my ($c, $conf, $value) = @_;
127              
128 208 50       464 return $value unless defined $value;
129            
130 208         716 $value = $c->_scrubber->scrub($value);
131              
132 208 100       22995 if ($conf->{no_encode_entities}) {
133 12         79 $value = HTML::Entities::decode_entities($value);
134             }
135 208         997 return $value;
136             }
137              
138             sub _should_scrub_param {
139 268     268   623 my ($c, $conf, $param, $value) = @_;
140             # If we only want to operate on certain params, do that checking
141             # now...
142 268 100 100     1194 if ($conf && $conf->{ignore_params}) {
143 228         709 my $ignore_params = $c->config->{scrubber}{ignore_params};
144 228 50       20454 if (ref $ignore_params ne 'ARRAY') {
145 0         0 $ignore_params = [ $ignore_params ];
146             }
147 228         503 for my $ignore_param (@$ignore_params) {
148 420 100       891 if (ref $ignore_param eq 'Regexp') {
149 228 100       1635 return if $param =~ $ignore_param;
150             } else {
151 192 100       551 return if $param eq $ignore_param;
152             }
153             }
154             }
155              
156             # For cases where there are legitimate values that HTML::Scrubber will
157             # munge... one example was an API where e.g. `<:100' would be eaten.
158             # To allow any param where a `<` is not followed by a `>` in the same
159             # param you could use qr{<[^]+$} or similar.
160 220 100 100     838 if ($conf && $conf->{ignore_values}) {
161 108         155 my $ignore_values = $conf->{ignore_values};
162 108 50       277 if (ref $ignore_values ne 'ARRAY') {
163 0         0 $ignore_values = [ $ignore_values ];
164             }
165 108         219 for my $ignore_value (@$ignore_values) {
166 108 100 66     795 if (defined $value && $value =~ $ignore_value) {
167 12         28 return;
168             }
169             }
170             }
171              
172             # If we've not bailed above, we didn't match any ignore_params
173             # entries, or didn't have any, so we do want to scrub
174 208         571 return 1;
175             }
176              
177              
178             sub _req_path_exempt_from_scrubbing {
179 113     113   219 my ($c, $conf) = @_;
180 113 100       400 return unless exists $conf->{ignore_paths};
181              
182 78         209 my $req_path = $c->req->path;
183 78 50       7485 $req_path = "/$req_path" unless $req_path =~ m{^/};
184 78         120 for my $ignore (@{ $conf->{ignore_paths} }) {
  78         213  
185 150 100       305 if (ref $ignore eq 'Regexp') {
186 72 100       400 return 1 if $req_path =~ $ignore;
187             } else {
188 78 100       207 return 1 if $req_path eq $ignore;
189             }
190             }
191             }
192              
193             # Incredibly nasty monkey-patch to rewind filehandle before parsing - see
194             # https://github.com/perl-catalyst/catalyst-runtime/pull/186
195             # First, get the default handlers hashref:
196             my $default_data_handlers = Catalyst->default_data_handlers();
197              
198             # Wrap the coderef for application/json in one that rewinds the filehandle
199             # first:
200             my $orig_json_handler = $default_data_handlers->{'application/json'};
201             $default_data_handlers->{'application/json'} = sub {
202             $_[0]->seek(0,0); # rewind $fh arg
203             $orig_json_handler->(@_);
204             };
205              
206              
207             {
208             # and now replace the original default_data_handlers() with a version that
209             # returns our modified handlers
210 5     5   45 no warnings 'redefine';
  5         11  
  5         591  
211             *Catalyst::default_data_handlers = sub {
212 4     4   12550 return $default_data_handlers;
213             };
214             }
215              
216             __PACKAGE__->meta->make_immutable;
217              
218             1;
219             __END__
220              
221              
222             =head1 NAME
223              
224             Catalyst::Plugin::HTML::Scrubber - Catalyst plugin for scrubbing/sanitizing incoming parameters
225              
226             =head1 SYNOPSIS
227              
228             use Catalyst qw[HTML::Scrubber];
229              
230             MyApp->config(
231             scrubber => {
232             auto => 1, # automatically run on request
233              
234             # Exempt certain parameter names from scrubbing
235             ignore_params => [ qr/_html$/, 'article_body' ],
236              
237             # Don't scrub at all for certain URL paths:
238             ignore_paths => [
239             '/foo',
240             qr{^/foo/.+},
241             ],
242              
243             # HTML::Scrubber will HTML-encode some chars, e.g. angle
244             # brackets. If you don't want that, enable this setting and
245             # the scrubbed values will be unencoded.
246             no_decode_entities => 0,
247            
248             # The following are options to HTML::Scrubber
249             params => [
250             default => 0,
251             comment => 0,
252             script => 0,
253             process => 0,
254             allow => [qw [ br hr b a h1]],
255             ],
256             },
257             );
258              
259             =head1 DESCRIPTION
260              
261             On request, sanitize HTML tags in all params (with the ability to exempt
262             some if needed), to protect against XSS (cross-site scripting) attacks and
263             other unwanted things.
264              
265              
266             =head1 EXTENDED METHODS
267              
268             =over 4
269              
270             =item setup
271              
272             See SYNOPSIS for how to configure the plugin, both with its own configuration
273             (e.g. whether to automatically run, whether to exempt certain fields) and
274             passing on any options from L<HTML::Scrubber> to control exactly what
275             scrubbing happens.
276              
277             =item dispatch
278              
279             Sanitize HTML tags in all parameters (unless `ignore_params` exempts them) -
280             this includes normal POST params, and serialised data (e.g. a POSTed JSON body)
281             accessed via `$c->req->body_data` or `$c->req->data`.
282              
283             =back
284              
285             =head1 SEE ALSO
286              
287             L<Catalyst>, L<HTML::Scrubber>.
288              
289             =head1 AUTHOR
290              
291             Hideo Kimura, << <hide@hide-k.net> >> original author
292              
293             David Precious (BIGPRESH), C<< <davidp@preshweb.co.uk> >> maintainer since 2023-07-17
294              
295             =head1 COPYRIGHT AND LICENSE
296              
297             Copyright (C) 2005 by Hideo Kimura
298              
299             This library is free software; you can redistribute it and/or modify
300             it under the same terms as Perl itself.
301              
302             =cut