File Coverage

/.cpan/build/HTML-CGIChecker-0.90-Tc_Jis/CGIChecker.pm
Criterion Covered Total %
statement 200 207 96.6
branch 54 68 79.4
condition 9 9 100.0
subroutine 19 19 100.0
pod 0 2 0.0
total 282 305 92.4


line stmt bran cond sub pod time code
1              
2             package HTML::CGIChecker;
3              
4 12     12   71 use strict;
  12         25  
  12         599  
5 12     12   848 use Carp;
  12         22  
  12         1071  
6              
7             BEGIN {
8 12     12   66 use vars qw ($VERSION @ISA);
  12         23  
  12         869  
9 12     12   24 $VERSION = 0.90;
10 12         34553 @ISA = ();
11             }
12              
13             =head1 NAME
14              
15             B
16              
17              
18             =head1 SYNOPSIS
19              
20             use HTML::CGIChecker;
21            
22             $feedback = '
23            
One column

24             " Arrays & variables "
25            
26             Dough > Hi, how are you ?
27            
28             And now some Perl code:
29            
 
30             print "";
31            
32             ';
33              
34             # create the $checker object
35            
36             $checker = new HTML::CGIChecker (
37             mode => 'allow',
38             allowclasses => [ qw( tables images ) ],
39             allowtags => [ qw ( B I A U STRONG BR HR ) ],
40             jscript => 0,
41             html => 0,
42             pre => 1,
43             debug => 0,
44             err_tag => 'Tag {tag} is not allowed in {element}.'
45             );
46              
47             # Now you can use it to check any string using its checkHTML()
48             # method. It "remembers" its configuration, so you can reuse it.
49              
50             ($checked_feedback, $Warnings) =
51             $checker->checkHTML ($feedback);
52              
53             # Process the results ...
54              
55             if ($checked_feedback) {
56             # save $checked_feedback to the database ....
57             }
58             else {
59             # print the warnings ...
60             print join ("\n", @{$Warnings});
61             }
62              
63             The example above produces no warning messages and returns
64             $feedback checked and properly HTML escaped. The only HTML "error" -
65             the unescaped "E" bracket on the fourth line - is autocorrected.
66             One warning message was overriden by a customized version. Potential
67             warnings would not be HTML formatted and HTML safe, because
68             the 'html' parameter is not true.
69              
70              
71             =head1 MOTIVATION
72              
73             Almost every modern website needs some way to get feedback from its users
74             in form of comments that are also visible to other visitors. It is convenient
75             to allow the users to use a limited set of HTML tags in their posts to
76             embold text, create hyperlinks or even include images.
77              
78             The problem araises when the user posts HTML code that breaks the page on
79             which the post is displayed. You must check the
80             posts for dangerous HTML errors and javascripts to prevent a malicious user
81             to render the rest of the page unusable. This module has been created
82             to fulfill this function and also to provide some extra features.
83              
84             Typical HTML validators do not suit well for the above mentioned purpose,
85             because they are way too strict and do not scale well. A small and fast
86             checker that also allows a programmer to deny and allow tags on an individual
87             basis comes as a solution of this problem. Another problem one has to solve
88             while creating a web site that allows HTML user posts is to escape these
89             posts correctly before storing them to the database and displaying them
90             to other users.
91              
92             The currently available module HTML::QuickCheck that should fill the same
93             purpose does not offer some crucial features:
94              
95             Checking of B - this problem can be fatal, because the
96             common typo when one forgets to close quotes in for example a HREF parameter
97             almost always totally corrupts the rest of a page.
98              
99             B of the right parts of the posts - ie. of the non-HTML parts.
100              
101             Denying/allowing of B.
102              
103             Denying/allowing of B, applets, styles, forms and other similar
104             functionality that requires a programmer to be able to deny/allow tags
105             or entire tags classes on an individual basis.
106              
107             Support for the special formatting B
. Please note that the 
108             PRE tag has special meaning for this module.
109             Everything that is placed inside PRE block is automatically HTML
110             escaped. The users can use this behaviour to easily post for
111             example code snippets that contain unescaped HTML brackets. All they
112             need is to place the snippet inside a PRE block. They do not need to
113             worry about escaping of the brackets.
114              
115             Ability to B that are returned
116             to the user in case when a problematic HTML in his post is detected.
117              
118             Autocorrection of some "common" errors, for example of chat messages
119             containing unescaped HTML brackets - "peter E how are you ?".
120             Both unmatched opening and closing HTML brackets are autocorrected.
121              
122             Proper detection of some table closing tags problems that can break the page
123             in some browsers.
124              
125             Conversion of images to appropriate hyperlinks.
126              
127             Automatic prepending of "http://" to URLs which do not start with "http://".
128              
129              
130             =head1 DESCRIPTION
131              
132             HTML::CGIChecker is a module for web developers to parse HTML and to detect
133             HTML code that could break a page in some way.
134             This module is not a HTML validator, but it allows one to check the HTML
135             code that users post to a web application, for example to a discussion
136             board, to prevent them to post a piece of code that would render the rest
137             of a page it is displayed on unusable.
138              
139             Using it one also can deny javascripts, images, tables or any other
140             tags on an individual basis. It also can check for correct quoting
141             and correct URLs.
142              
143             The module can autocorrect some common bad users' behaviour, for
144             example the use of unescaped HTML brackets in a chat room, etc.
145              
146             It is easy to use and very useful in any CGI application in which
147             you want its users to be able to use HTML in their posts to some
148             customizable extent. It is object oriented and designed to be easily
149             extensible.
150              
151             B, for validation you need an other
152             solution. This module does not care about correctness of the parsed HTML code
153             at all. All it does care about is whether the HTML code could break a page.
154             HTML tags that are not paired correctly or that cannot be rendered at all
155             can pass this checker. All the names of elements and attributes are not case
156             sensitive.
157              
158             The checker object is created by calling new() constructor of HTML::CGIChecker
159             class.
160              
161              
162             $checker = new HTML::CGIChecker (
163             mode => 'allow',
164             ....
165             );
166              
167              
168             Then you can use the checkHTML() instance method to perform a check on
169             a string using the settings this object has been configured with.
170              
171             ($checked_string, $Warnings) =
172             $checker->checkHTML ($string);
173              
174              
175             B
176              
177             Creates and returns a new checker object that can be configured with
178             parameters that are described below. Default configuration allows only
179             a few harmless inline tags to be used in the HTML code:
180              
181             B I A U STRONG BR
182             EM CITE VAR ABBR Q DFN CODE SUB SUP SAMP KBD ACRONYM
183              
184             Other tags except the special PRE tag are not allowed.
185             Javascripts are by default also not allowed.
186              
187             The various parameters are passed in as a list of B value>
188             pairs. List of these parameters together with their default values follows:
189              
190             mode => 'allow'
191             allowclasses => []
192             allowtags => [ qw (
193             B I A U STRONG BR EM CITE VAR ABBR Q DFN CODE
194             SUB SUP SAMP KBD ACRONYM
195             ) ]
196             denyclasses => [ keys (%tagclasses) ]
197             denytags => [ qw ( FONT ) ]
198             jscript => 0
199             html => 0
200             pre => 1
201             img_to_link => 0
202             check_http => 1
203             debug => 0
204             nonpairtags => [ qw (
205             IMG HR BR INPUT META AREA COL BASE LINK PARAM
206             ) ]
207             check_attribs => {}
208             err_tag => 'Tag {tag} is not allowed in {element}.'
209             err_javascript => 'Javascript is not allowed in {element}.'
210             err_quote => 'Missing quote in {element}.'
211             err_notclosed => 'Pair tag {tag} was not closed.'
212             err_notopened => 'Pair tag {tag} was not opened.'
213              
214              
215             B
216              
217             Two modes are available: allow (default) and deny.
218              
219             B: Error is raised if any tag that is not explicitely
220             allowed is found.
221              
222             B: Error is raised if an explicitely denied tag is found,
223             any other tags are allowed.
224              
225              
226             B
227              
228             These parameters apply only in the 'allow' mode.
229             Here you can specify the tags you allow the user to use.
230             Allowtags must be a reference to an array of tag names.
231             Allowclasses must be a refernce to an array of class names.
232             Tag class (tag group) is a set of tags that can be allowed or denied all
233             at once by allowing or denying the class. These classes are available:
234              
235             base FRAMESET FRAME HTML BODY HEAD TITLE BASE
236             STYLE SCRIPT META NOSCRIPT NOFRAMES
237             externals APPLET OBJECT LINK IFRAME PARAM
238             forms FORM TEXTAREA SELECT INPUT BUTTON LABEL
239             FIELDSET LEGEND OPTGROUP
240             tables TABLE TR TD TBODY THEAD TFOOT TH COLGROUP
241             COL CAPTION
242             lists UL OL LI DL DT DD
243             images IMG MAP AREA
244             heading H1 H2 H3 H4 H5 H6 H7 H8
245              
246             By default only the above mentioned harmless inline tags are allowed.
247             By default no classes are allowed.
248              
249             B, B
250              
251             These parameters apply only in the 'deny' mode.
252             They work similar to the allowclasses and allowtags
253             parameters. By default B above listed classes plus the FONT tag are
254             denied. All other tags are by default allowed in this mode.
255              
256             B
257              
258             This option disables javascript inside HTML elements.
259             You also must ensure that the SCRIPT tag is not allowed to block
260             the javascript completely.
261              
262             0: javascript is not allowed
263             1: javascript is allowed
264             Default: 0
265              
266              
267             B
268              
269             0: messages will not be in HTML format nor HTML escaped -
270             useful for the command line mode
271             1: all warning messages will be in HTML versions and also
272             HTML escaped
273             Default: 0
274              
275             B
 
276              
277             0: users will not be allowed to use the special PRE tag
278             1: users will be allowed to use the special PRE tag
279             Default: 1
280              
281             B
282              
283             0: do not alter images
284             1: convert all images to appropriate links to these
285             images: ----> url
286             Default: 0
287              
288             B
289              
290             0: do not alter URLs
291             1: prepend "http://" to URLs that do not start
292             with "http://", "ftp://" or "mailto:"
293             Default: 1
294            
295             Note: the URLs are recognized only in
296             HREF and SRC parameters.
297              
298             B
299              
300             0: debugging to STDERR is disabled
301             1: debugging to STDERR is enabled
302             Default: 0
303              
304             B
305              
306             The tags that are processed as non-pair can be specified here
307             via a reference to an anonymous array.
308             By default these tags are processed as non-pair:
309              
310             IMG HR BR INPUT META AREA COL BASE LINK PARAM
311              
312             B
313              
314             You also can use the check_attribs parameter to allow the user to use
315             only a limited set of attributes in an element. The parameter is a
316             hash reference, that consists of key->value pairs, in which the key is
317             name of an element, and the value is a reference to an array of attributes.
318             For each element specified in this hash, the user will only be allowed
319             to use the specified attributes.
320              
321             For example, if you define following hash reference:
322              
323             check_attribs => {
324             img => [ 'src', 'width', 'height', 'alt' ]
325             }
326              
327             then the user will be allowed to use ONLY the specified attributes in
328             the element. Any other elements are not affected and the user
329             will be allowed to use any attributes in them. Names of the elements
330             and of the attributes are not case sensitive.
331              
332             B can be redefined by setting these parameters:
333              
334             err_tag = 'Tag {tag} is not allowed in {element}.'
335             err_javascript = 'Javascript is not allowed in {element}.'
336             err_quote = 'Missing quote in {element}.'
337             err_notclosed = 'Pair tag {tag} was not closed.'
338             err_notopened = 'Pair tag {tag} was not opened.'
339              
340             Messages displayed above are the defaults. Special tokens {tag} and {element}
341             are replaced by the appropriate values. You can redefine these messages to
342             localize them.
343              
344              
345             B
346              
347              
348             ($checked_string, $Warnings) =
349             $checker->checkHTML ($string);
350              
351              
352             This method accepts only one parameter - the actual string to check.
353              
354             If the string contains anything dangerous or not allowed then this method
355             returns an undefined value and a reference to an array of warning messages
356             that describe the problems that were detected.
357              
358             If the string is safe then checked and escaped version of the
359             string is returned together with an reference to an empty array.
360              
361             Please note the warning messages are not returned as an array, but as a
362             reference to an array, that must be B when you use it as an
363             array. Usual way to print all the warnings is using the join() function:
364              
365             print join ("
\n", @{$Warnings});
366              
367              
368             =head1 SUPPORT
369              
370             No official support is provided, but I welcome any comments, patches
371             and suggestions on my email. If you suggest a new feature, please justify
372             how it will help the purpose of this module - to provide B checking
373             for HTML code that breaks pages.
374              
375              
376             =head1 BUGS
377              
378             I am aware of no bugs. But remember, this is NOT a validator - bad HTML may
379             and will pass it. Please let me know if you find any chunk of code that passes
380             it and also breaks a page.
381              
382              
383             =head1 AVAILABILITY
384              
385             http://www.geocities.com/tripiecz/
386              
387              
388             =head1 AUTHOR
389              
390             B, tripiecz@yahoo.com
391              
392             Prague, the Czech republic
393              
394              
395             =head1 LICENSE
396              
397             HTML::CGIChecker - A Perl module to detect dangerous HTML code
398              
399             Copyright (C) 2000 Tomas Styblo (tripiecz@yahoo.com)
400              
401             This module is free software; you can redistribute it and/or modify it
402             under the terms of either:
403              
404             a) the GNU General Public License as published by the Free Software
405             Foundation; either version 1, or (at your option) any later version,
406             or
407              
408             b) the "Artistic License" which comes with this module.
409              
410             This program is distributed in the hope that it will be useful,
411             but WITHOUT ANY WARRANTY; without even the implied warranty of
412             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either
413             the GNU General Public License or the Artistic License for more details.
414              
415             You should have received a copy of the Artistic License with this
416             module, in the file Artistic. If not, I'll be glad to provide one.
417              
418             You should have received a copy of the GNU General Public License
419             along with this program; if not, write to the Free Software
420             Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
421             USA
422              
423              
424             =head1 SEE ALSO
425            
426             perl(1).
427              
428              
429             =cut
430              
431             # public:
432             sub new;
433             sub checkHTML;
434              
435             # private:
436             sub _pair_compile;
437             sub _process_element;
438             sub _http;
439             sub _imgtolink;
440             sub _quoting;
441             sub _javascript;
442             sub _pair_saveinfo;
443             sub _allowed_tags;
444             sub _parse_error;
445             sub _pair_check;
446             sub _html_escape;
447              
448 12     12   0 END { }
449              
450             sub new {
451 12     12 0 28 my $class = shift;
452            
453 12 50       72 croak ('HTML::CGIChecker - even number of parameters expected.')
454             if (@_ % 2);
455            
456 12         21 my %tagclasses;
457 12         65 $tagclasses{'base'} =
458             [ qw ( FRAMESET FRAME HTML BODY HEAD TITLE BASE
459             STYLE SCRIPT META NOSCRIPT NOFRAMES ) ];
460 12         43 $tagclasses{'externals'} = [ qw ( APPLET OBJECT LINK IFRAME PARAM ) ];
461 12         47 $tagclasses{'forms'} = [ qw (
462             FORM TEXTAREA SELECT INPUT BUTTON LABEL FIELDSET LEGEND
463             OPTGROUP
464             ) ];
465 12         45 $tagclasses{'tables'} =
466             [ qw ( TABLE TR TD TBODY THEAD TFOOT TH COLGROUP COL CAPTION ) ];
467 12         38 $tagclasses{'lists'} = [ qw ( UL OL LI DL DD DT ) ];
468 12         29 $tagclasses{'images'} = [ qw ( IMG MAP AREA ) ];
469 12         131 $tagclasses{'heading'} = [ qw ( H1 H2 H3 H4 H5 H6 H7 H8 ) ];
470              
471             # set the defaults
472 12         569 my $self = {
473             check => '',
474             mode => 'allow',
475             denyclasses => [ keys (%tagclasses) ],
476             denytags => [ qw ( FONT ) ],
477             allowclasses => [],
478             allowtags => [ qw (
479             B I A U STRONG BR EM CITE VAR ABBR Q DFN CODE
480             SUB SUP SAMP KBD ACRONYM
481             ) ],
482             jscript => 0,
483             html => 0,
484             pre => 1,
485             img_to_link => 0,
486             check_http => 1,
487             debug => 0,
488             nonpairtags => [ qw (
489             IMG HR BR INPUT META AREA COL BASE LINK PARAM
490             ) ],
491             check_attribs => {},
492             err_tag => 'Tag {tag} is not allowed in {element}.',
493             err_javascript => 'Javascript is not allowed in {element}.',
494             err_quote => 'Missing quote in {element}.',
495             err_notclosed => 'Pair tag {tag} was not closed.',
496             err_notopened => 'Pair tag {tag} was not opened.',
497             err_attrib => 'Attribute {attrib} is not allowed in {tag}.'
498             };
499              
500 12         46 bless ($self, $class);
501              
502             # get parameters, overiding the defaults
503 12         52 for (my $i = 0; $i <= $#_; $i += 2) {
504 16 50       129 exists ( $self->{lc($_[$i])} ) or
505             croak ('Invalid parameter ' . $_[$i] . '.');
506 16         71 $self->{lc($_[$i])} = $_[($i + 1)];
507             }
508              
509             # convert to uppercase
510 12         20 map ($_ = uc($_), @{$self->{'allowtags'}});
  12         111  
511 12         25 map ($_ = uc($_), @{$self->{'denytags'}});
  12         33  
512              
513 12         23 my $class;
514 12         34 foreach $class (keys(%tagclasses)) {
515 84         97 map ($_ = uc($_), @{$tagclasses{$class}});
  84         378  
516             }
517            
518 12         27 map ($_ = uc($_), @{$self->{'nonpairtags'}});
  12         63  
519            
520 12         22 my $element;
521 12         17 foreach $element (keys(%{$self->{'check_attribs'}})) {
  12         38  
522 1         1 map ($_ = uc($_), @{$self->{'check_attribs'}->{$element}});
  1         3  
523 1 50       3 if ($element ne uc($element)) {
524 1         2 my $ucelement = uc($element);
525 1         2 $self->{'check_attribs'}->{$ucelement} =
526             $self->{'check_attribs'}->{$element};
527 1         3 delete($self->{'check_attribs'}->{$element});
528             }
529             }
530              
531             # compiles allowed/denied tag data
532 12         49 $self->_pair_compile (\%tagclasses);
533              
534 12         97 return $self;
535             }
536              
537              
538             sub checkHTML {
539 12     12 0 22 my $self = shift;
540 12         27 my $in = shift;
541 12         18 my $out; # processed input to return
542 12         19 my $premode = 0; # indicates we are in a PRE block
543            
544 12         28 $self->{'_errors'} = [];
545 12         27 $self->{'_opentags'} = {};
546 12         24 $self->{'_closetags'} = {};
547              
548 12         239 my @tokens = split(/(<[^<]*?>)/s, $in);
549 12         28 my $token;
550 12         22 foreach $token (@tokens) {
551 204 100       527 if (index($token, '<') != 0) {
552             # This token is not a HTML element.
553             # Jump to the next token.
554 108         206 $out .= $self->_html_escape($token);
555 108         295 next;
556             }
557            
558             # This token is a HTML element.
559 96         158 $self->{'_element'} = $token; # element = the whole <.*> thing
560 96         415 ($self->{'_tag'}) = $token =~ m|^<\s*(/?\w+)|s; # name of this tag
561 96 100       235 next if (not $self->{'_tag'});
562 95         248 $self->{'_tag'} = uc($self->{'_tag'});
563            
564             # generetes the abstag
565 95 100       206 if (index($self->{'_tag'}, '/') == 0) {
566 42         91 $self->{'_abstag'} = substr ($self->{'_tag'}, 1);
567             }
568             else {
569 53         91 $self->{'_abstag'} = $self->{'_tag'};
570             }
571            
572             # the PRE feature
573             # end this pass if PRE is allowed and we are in a PRE block
574            
575 95 50       201 if ($self->{'pre'}) {
576 95 100 100     616 if (not $premode and $self->{'_tag'} eq 'PRE') {
    100 100        
577 4         9 $out .= $self->{'_element'};
578 4         5 $premode = 1;
579 4         8 $self->_pair_saveinfo();
580 4         8 next;
581             }
582             elsif ($premode and $self->{'_tag'} eq '/PRE') {
583 3         13 $out .= $self->{'_element'};
584 3         10 $premode = 0;
585 3         9 $self->_pair_saveinfo();
586 3         6 next;
587             }
588            
589 88 100       175 if ($premode) {
590 17         47 $out .= $self->_html_escape($self->{'_element'});
591 17         33 next;
592             }
593             }
594            
595             # If PRE is allowed, then we get to this point only if the current
596             # element is not inside a PRE block.
597             #
598             # This method calls all the methods that process the current element.
599             # Override this method in your subclass to add your own methods.
600 71         146 $self->_process_element();
601            
602             # Current element is pushed to the end of the output buffer.
603 71         186 $out .= $self->{'_element'};
604             }
605             # END of the loop
606            
607 12         29 $self->{'_element'} = '';
608 12         41 $self->{'_tag'} = '';
609 12         29 $self->{'_abstag'} = '';
610            
611 12         46 $self->_pair_check ($self->{'_opentags'}, $self->{'err_notclosed'});
612 12         49 $self->_pair_check ($self->{'_closetags'}, $self->{'err_notopened'});
613              
614             # We make a copy of the array of errors to prevent memory leakage
615             # that can occur if a bad programmer does not destroy the reference
616             # to this array after he is finished with it. If we do not make this copy
617             # then the whole object would not be freed in this case.
618            
619 12         19 my @ret_errors = @{$self->{'_errors'}};
  12         35  
620              
621 12 100       50 if (@ret_errors) {
622 6         62 return (undef, \@ret_errors);
623             }
624             else {
625 6         44 return ($out, \@ret_errors);
626             }
627             }
628              
629             ##############################################################################
630             ##################### PRIVATE METHODS ##########################
631             ##############################################################################
632              
633             # "compiles" the array of allowed/denied tags
634             # resolves tag classes and merges them with individualy specified tags
635              
636             sub _pair_compile {
637 12     12   25 my $self = shift;
638 12         15 my $tagclasses = shift; # reference to predefined tag classes
639 12         21 my @allow;
640             my @deny;
641            
642 12 100       54 if ($self->{'mode'} eq "allow") {
    50          
643             # allow mode = error if any not explicitely allowed tag is found
644 11         17 my $allowclass;
645 11         14 foreach $allowclass (@{$self->{'allowclasses'}}) {
  11         36  
646 2         3 push(@allow,@{$tagclasses->{$allowclass}})
  2         8  
647             }
648 11         33 my $tag;
649 11         14 foreach $tag (@{$self->{'allowtags'}}) {
  11         27  
650 198 50       706 push (@allow, $tag) if (not grep (($_ eq $tag), @allow));
651             }
652 11 50       46 $self->{'debug'} and
653             warn('HTML::CGIChecker DEBUG: @allow = '.join(",", @allow));
654             }
655             elsif ($self->{'mode'} eq "deny") {
656             # deny mode = error if any denied tag is found, everything other is OK
657 1         1 my $denyclass;
658 1         2 foreach $denyclass (@{$self->{'denyclasses'}}) {
  1         2  
659 7         7 push (@deny, @{$tagclasses->{$denyclass}})
  7         26  
660             }
661 1         3 my $tag;
662 1         1 foreach $tag (@{$self->{'denytags'}}) {
  1         2  
663 1 50       13 push (@deny, $tag) if (not grep (($_ eq $tag), @deny));
664             }
665 1 50       5 $self->{'debug'} and
666             warn('HTML::CGIChecker DEBUG: @deny = '.join(",", @deny));
667             }
668             else {
669 0         0 croak('HTML::CGIChecker - mode has to be either "allow" or "deny".')
670             }
671              
672             # store refs to the compiled arrays
673 12         27 $self->{'_allow'} = \@allow;
674 12         34 $self->{'_deny'} = \@deny;
675             }
676              
677              
678             # Processes the current element.
679             # NOTE: The order in which these methods are called matters !
680              
681             sub _process_element {
682 71     71   86 my $self = shift;
683 71         143 $self->_http();
684 71         133 $self->_imgtolink();
685 71         124 $self->_quoting();
686 71         137 $self->_javascript();
687 71         133 $self->_check_attribs();
688 71         123 $self->_pair_saveinfo();
689 71         153 $self->_allowed_tags();
690             }
691              
692              
693             # implements the check_http option
694              
695             sub _http {
696 71     71   73 my $self = shift;
697 71 50       163 if ($self->{'check_http'}) {
698 71 100       523 $self->{'_element'} =~ s!
699             ^<
700             (.+) # 1
701             (href|src)\s*=\s* # 2; MSIE allows the spaces ..
702             (")? # 3 "
703             (.*?) # 4
704             (".*>|\s.*>|>)$ # 5 "
705             !<$1$2=$3http://$4$5!six if
706             ($self->{'_element'} !~
707             m!
708             (http://
709             |mailto:
710             |ftp://
711             |telnet://
712             |file://
713             )
714             !xis);
715             }
716             }
717              
718              
719             # implements the img_to_link option
720              
721             sub _imgtolink {
722 71     71   81 my $self = shift;
723 71 100       203 if ($self->{'img_to_link'}) {
724 5         68 $self->{'_element'} =~ s!
725             ^<
726             img.+src\s*=\s* # MSIE allows the spaces ..
727             "? # "
728             (.*?) # $1
729             (".*>|\s.*>|>)$ # "
730             !$1!six;
731             }
732             }
733              
734              
735             # checks the element for correct quoting
736              
737             sub _quoting {
738 71     71   79 my $self = shift;
739 71 50       210 if ($self->{'_element'} =~ tr/"/"/ % 2) {
740 0         0 push (@{$self->{'_errors'}},
  0         0  
741             $self->_parse_error ($self->{'err_quote'}));
742             }
743             }
744              
745              
746             # checks the element for a javascript
747              
748             sub _javascript {
749 71     71   86 my $self = shift;
750 71 50       159 if (not $self->{'jscript'}) {
751 71 100 100     403 if ($self->{'_element'} =~ /javascript/i or
752             $self->{'_element'} =~ /\son\w+\s*=/i) {
753 2         3 push (@{$self->{'_errors'}},
  2         5  
754             $self->_parse_error ($self->{'err_javascript'}));
755             }
756             }
757             }
758              
759              
760             sub _check_attribs {
761 71     71   82 my $self = shift;
762 71         100 my $element = $self->{'_element'};
763            
764 71 100       221 if (ref($self->{'check_attribs'}->{$self->{'_abstag'}})) {
765 1         9 $element =~ s/^<\w+\s*//s;
766 1         3 $element =~ s|/?>$||;
767 1         4 my @pairs = split(/\s+/, $element);
768 1         2 my $pair;
769 1         1 foreach $pair (@pairs) {
770 3         7 my ($attrib) = split(/=/, $pair);
771 3         4 $attrib = uc($attrib);
772 3 100       3 if (not grep($attrib eq $_,
  3         16  
773             @{$self->{'check_attribs'}->{$self->{'_abstag'}}})) {
774 1         1 $self->{'_attrib'} = $attrib;
775 1         2 push (@{$self->{'_errors'}},
  1         3  
776             $self->_parse_error ($self->{'err_attrib'}));
777             }
778             }
779             }
780             }
781              
782              
783             # Works with counters of tags, so we can later say if there are some tags that
784             # are not correctly either closed or opened.
785             # This implementation prevents a user to place a closing tag before the opening
786             # one and thus make the checker to think it was correctly closed.
787              
788             sub _pair_saveinfo {
789 78     78   89 my $self = shift;
790            
791 78 100       188 if (index($self->{'_tag'}, '/') == 0) {
  43 100       265  
792             # this is a closing tag
793 35 50       60 if (not grep ( ($_ eq $self->{'_abstag'}),
  35         201  
794             @{$self->{'nonpairtags'}})) {
795 35 100       110 if ($self->{'_opentags'}->{$self->{'_abstag'}}) {
796 33         97 $self->{'_opentags'}->{$self->{'_abstag'}}--;
797             }
798             else {
799 2         8 $self->{'_closetags'}->{$self->{'_abstag'}}++;
800             }
801             }
802             }
803             elsif (not grep ( ($_ eq $self->{'_abstag'}),
804             @{$self->{'nonpairtags'}})) {
805             # this is an opening tag
806 35         113 $self->{'_opentags'}->{$self->{'_abstag'}}++;
807             }
808             }
809              
810              
811             # checks if a given tag is allowed
812              
813             sub _allowed_tags {
814 71     71   86 my $self = shift;
815 71 100       175 if ($self->{'mode'} eq "allow") {
816 8         20 grep ( ($_ eq $self->{'_abstag'}), @{$self->{'_allow'}}) or
  61         415  
817 61 100       64 push (@{$self->{'_errors'}},
818             $self->_parse_error ($self->{'err_tag'}));
819             } else {
820 6         16 grep ( ($_ eq $self->{'_abstag'}), @{$self->{'_deny'}}) and
  10         134  
821 10 100       11 push (@{$self->{'_errors'}},
822             $self->_parse_error ($self->{'err_tag'}));
823             }
824             }
825              
826              
827             # processes the error messages
828              
829             sub _parse_error {
830 21     21   26 my $self = shift;
831 21         25 my ($error) = @_;
832 21         35 my $element = $self->{'_element'};
833 21         29 my $tag = $self->{'_tag'};
834 21         32 my $attrib = $self->{'_attrib'};
835              
836 21 50       44 if ($self->{'html'}) {
837 0         0 $element = $self->_html_escape ($element);
838 0         0 $tag = '<' . $tag . '>';
839 0         0 $element = '' . $element . '';
840 0         0 $attrib = '' . $attrib . '';
841             }
842 21         37 else { $tag = "<$tag>" }
843              
844 21         64 $error =~ s/{tag}/$tag/;
845 21         48 $error =~ s/{element}/$element/;
846 21         33 $error =~ s/{attrib}/$attrib/;
847 21         377 $error =~ s/\n+|\r+|\s+/ /g;
848 21         66 return $error;
849             }
850              
851              
852             # raises an error if a badly paired tag is found
853              
854             sub _pair_check {
855 24     24   34 my $self = shift;
856 24         36 my ($Tags, $error) = @_;
857 24         32 my ($count);
858            
859 24         36 while ( ($self->{'_tag'}, $count) = each (%{$Tags})) {
  54         244  
860 30 100       80 if ($count) {
861 4         6 push (@{$self->{'_errors'}},
  4         21  
862             $self->_parse_error ($error));
863             }
864             }
865             }
866              
867              
868             # Escapes some dangerous characters.
869             # Ampersand "&" is escaped only if it is not part of a HTML entity.
870             # Therefore, users can post HTML entities. Ampersands that are part
871             # of an ordinary text are still properly escaped.
872             # Thanks to godless@hermes.slipstream.com for this idea.
873              
874             sub _html_escape {
875 125     125   159 my $self = shift;
876 125         176 my ($in) = @_;
877              
878 125         169 for ($in) {
879 125         150 s/&(?!\w+;)/&/g;
880 125         149 s/>/>/g;
881 125         162 s/
882 125         274 s/"/"/g;
883             }
884 125         257 return $in;
885             }
886              
887              
888             1;