File Coverage

blib/lib/HTML/StripScripts/Regex.pm

Criterion	Covered	Total	%
statement	26	27	96.3
branch	13	14	92.8
condition			n/a
subroutine	5	5	100.0
pod	1	1	100.0
total	45	47	95.7

line	stmt	bran	sub	pod	time	code
1						package HTML::StripScripts::Regex;
2	3		3		92302	use strict;
	3				9
	3				120
3	3		3		17	use warnings;
	3				8
	3				163
4						our $VERSION = '0.02';
5
6						=head1 NAME
7
8						HTML::StripScripts::Regex - XSS filter using a regular expression
9
10						=head1 VERSION
11
12						Version 0.02
13
14						=head1 SYNOPSIS
15
16						This class subclasses L, and adds an input method
17						based on a regular expression. See L.
18
19						use HTML::StripScripts::Regex;
20
21						my $hss = HTML::StripScripts::Regex->new({ Context => 'Inline' });
22
23						$hss->input("hello, world!");
24
25						print $hss->filtered_document;
26
27						Using a regular expression to parse HTML is error prone and inefficient
28						for large documents. If L is available then
29						L should be used in preference to this module.
30
31						=head1 METHODS
32
33						This subclass adds the following methods to those of L.
34
35						=over
36
37						=item input ( TEXT )
38
39						Parses an HTML document and runs it through the filter. TEXT must be the
40						entire HTML document to be filtered, as a single flat string.
41
42						=cut
43
44	3		3		3664	use HTML::StripScripts;
	3				48066
	3				148
45	3		3		41	use base qw(HTML::StripScripts);
	3				5
	3				1538
46
47						sub input {
48	124		124	1	108137	my ($self, $text) = @_;
49
50	124				413	$self->input_start_document;
51
52	124				3128	while ( $text =~ m[
53
54						# or constructs,
55						# in which everything between the tags counts as
56						# CDATA.
57						(?: <(script\|style).?> (.?) ) \|
58
59						# An HTML comment
60						( ) \|
61
62						# A processing instruction
63						( <\?.*?> ) \|
64
65						# A declaration
66						( <\!.*?> ) \|
67
68						# A start tag
69						( <[a-z0-9]+\b(?:[^>'"]\|"[^"]"\|'[^']')*> ) \|
70
71						# An end tag
72						( ) \|
73
74						# Some non-tag text. We eat '<' only if it's
75						# the first character, since a '<' as the
76						# first character can't be the start of a well
77						# formed tag or one of the patterns above would
78						# have matched.
79						( .[^<]* )
80
81						]igsx ) {
82
83	452	100			31879	if ( defined $1 ) {
		100
		100
		100
		100
		100
		50
84	2				9	$self->input_start("<$1>");
85	2				57	$self->input_text($2);
86	2				17	$self->input_end("");
87						}
88						elsif ( defined $3 ) {
89	7				42	$self->input_comment($3);
90						}
91						elsif ( defined $4 ) {
92	3				20	$self->input_process($4);
93						}
94						elsif ( defined $5 ) {
95	4				27	$self->input_declaration($5);
96						}
97						elsif ( defined $6 ) {
98	182				522	$self->input_start($6);
99						}
100						elsif ( defined $7 ) {
101	56				458	$self->input_end($7);
102						}
103						elsif ( defined $8 ) {
104	198				787	$self->input_text($8);
105						}
106						else {
107	0				0	die 'regex failed to act as expected';
108						}
109
110						}
111
112	124				10123	$self->input_end_document;
113						}
114
115						=back
116
117						=head1 SUBCLASSING
118
119						The C class is subclassable, in exactly the same
120						way as C. See L for
121						details.
122
123						=head1 SEE ALSO
124
125						L, L, L
126
127						=head1 AUTHOR
128
129						Nick Cleaton, C<< >>
130
131						=head1 COPYRIGHT & LICENSE
132
133						Copyright 2009 Nick Cleaton, all rights reserved.
134
135						This program is free software; you can redistribute it and/or modify it under
136						the same terms as Perl itself.
137
138						=cut
139
140						1;