File Coverage

blib/lib/Regexp/IgnoreHTML.pm
Criterion Covered Total %
statement 39 39 100.0
branch 9 10 90.0
condition 7 12 58.3
subroutine 4 4 100.0
pod 3 3 100.0
total 62 68 91.1


line stmt bran cond sub pod time code
1             package Regexp::IgnoreHTML;
2 1     1   1343 use Regexp::Ignore;
  1         4  
  1         537  
3             our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class
4              
5             ########################
6             # new
7             ########################
8             sub new {
9 6     6 1 2573 my $proto = shift;
10 6   33     43 my $class = ref($proto) || $proto;
11 6         43 my $self = $class->SUPER::new(@_);
12             # by default it does not add spaces
13 6         18 $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = 0;
14 6         14 return $self;
15             } # of new
16              
17             #############################################
18             # space_after_non_text_characteristics_html
19             #############################################
20             sub space_after_non_text_characteristics_html {
21 3314     3314 1 4675 my $self = shift;
22 3314 100       6455 if (@_) { $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = shift }
  6         12  
23 3314         47492 return $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML};
24             } # of space_after_non_text_characteristics_html
25              
26             ###########################################
27             #
28             #
29             #
30             ########################
31             # get_tokens
32             ########################
33             sub get_tokens {
34 6     6 1 10 my $self = shift;
35              
36 6         12 my $tokens = [];
37 6         9 my $flags = [];
38 6         10 my $index = 0;
39             # we should create tokens from the TEXT.
40 6         23 my $text = $self->text();
41              
42             # the regular expression will try to match:
43             # - HTML remarks - all the remark will be matched.
44             # - HTML tags
45 6         24 my $re1 = qr/(<\!\-\-[\s\S]+?\-\->)|(<\/?[^\>]*?>)/is;
46            
47 6         8 my $re2;
48 6 100       14 if ($self->space_after_non_text_characteristics_html()) {
49             # if the tag that we found is one of the following, we do not
50             # put space after it: B, BASEFONT, BIG, BLINK, CITE, CODE, EM,
51             # FONT, I, KBD, PLAINTEXT, S, SMALL, STRIKE, STRONG, SUB, SUP,
52             # TT, U, VAR, A, SPAN, WBR
53 3         8 $re2 = '<\!\-\-.+?\-\->|'.
54             '\<\!\[[^\]]*?\]\>|'.
55             '<\/?\s*B(\s[^>]*?>|\s*>)|'.
56             '<\/?\s*BASEFONT(\s[^>]*?>|\s*>)|'.
57             '<\/?\s*BIG(\s[^>]*?>|\s*>)|'.
58             '<\/?\s*BLINK(\s[^>]*?>|\s*>)|'.
59             '<\/?\s*CITE(\s[^>]*?>|\s*>)|'.
60             '<\/?\s*CODE(\s[^>]*?>|\s*>)|'.
61             '<\/?\s*EM(\s[^>]*?>|\s*>)|'.
62             '<\/?\s*FONT(\s[^>]*?>|\s*>)|'.
63             '<\/?\s*I(\s[^>]*?>|\s*>)|'.
64             '<\/?\s*KBD(\s[^>]*?>|\s*>)|'.
65             '<\/?\s*PLAINTEXT(\s[^>]*?>|\s*>)|'.
66             '<\/?\s*S(\s[^>]*?>|\s*>)|'.
67             '<\/?\s*SMALL(\s[^>]*?>|\s*>)|'.
68             '<\/?\s*STRIKE(\s[^>]*?>|\s*>)|'.
69             '<\/?\s*STRONG(\s[^>]*?>|\s*>)|'.
70             '<\/?\s*SUB(\s[^>]*?>|\s*>)|'.
71             '<\/?\s*SUP(\s[^>]*?>|\s*>)|'.
72             '<\/?\s*TT(\s[^>]*?>|\s*>)|'.
73             '<\/?\s*U(\s[^>]*?>|\s*>)|'.
74             '<\/?\s*VAR(\s[^>]*?>|\s*>)|'.
75             '<\/?\s*A(\s[^>]*?>|\s*>)|'.
76             '<\/?\s*SPAN(\s[^>]*?>|\s*>)|'.
77             '<\/?\s*WBR(\s[^>]*?>|\s*>)|'.
78             '<\/?\s*[OVWXP]\:[^>]*?>';
79 3         338 $re2 = qr/$re2/is;
80             }
81            
82 6   66     114 while (defined($text) && $text =~ /$re1/) {
83 3302 100       7949 if ($`) { # if there is a text before, take it as clean
84 960         5684 $tokens->[$index] = $`;
85 960         1204 $flags->[$index] = 1; # the text before the match is clean.
86 960         1160 $index++; # increment the index
87             }
88 3302         6915 $tokens->[$index] = $&;
89 3302         4917 $flags->[$index] = 0; # the match itself is unwanted.
90 3302         24253 $text = $'; # update the original text to after the match.
91 3302         3799 $index++; # increment the index again
92             # check if we should add space after the text
93 3302 100 100     8084 if ($self->space_after_non_text_characteristics_html() &&
94             $tokens->[$index - 1] !~ /$re2/) { # this tag is not text
95             # characteristic tag
96             # we add a space token after this tag
97 423         687 $tokens->[$index] = " ";
98 423         521 $flags->[$index] = 1;
99 423         3651 $index++;
100             }
101             }
102              
103             # if we had no match, check if there is still something in the
104             # $text. this will be also a clean text.
105 6 50 33     32 if (defined($text) && $text) {
106 6         15 $tokens->[$index] = $text;
107 6         9 $flags->[$index] = 1;
108             }
109             # return the two lists
110 6         59 return ($tokens, $flags);
111             } # of get_tokens
112              
113             1; # make perl happy
114              
115             __END__