line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Regexp::IgnoreHTML; |
2
|
1
|
|
|
1
|
|
1343
|
use Regexp::Ignore; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
537
|
|
3
|
|
|
|
|
|
|
our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
######################## |
6
|
|
|
|
|
|
|
# new |
7
|
|
|
|
|
|
|
######################## |
8
|
|
|
|
|
|
|
sub new { |
9
|
6
|
|
|
6
|
1
|
2573
|
my $proto = shift; |
10
|
6
|
|
33
|
|
|
43
|
my $class = ref($proto) || $proto; |
11
|
6
|
|
|
|
|
43
|
my $self = $class->SUPER::new(@_); |
12
|
|
|
|
|
|
|
# by default it does not add spaces |
13
|
6
|
|
|
|
|
18
|
$self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = 0; |
14
|
6
|
|
|
|
|
14
|
return $self; |
15
|
|
|
|
|
|
|
} # of new |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
############################################# |
18
|
|
|
|
|
|
|
# space_after_non_text_characteristics_html |
19
|
|
|
|
|
|
|
############################################# |
20
|
|
|
|
|
|
|
sub space_after_non_text_characteristics_html { |
21
|
3314
|
|
|
3314
|
1
|
4675
|
my $self = shift; |
22
|
3314
|
100
|
|
|
|
6455
|
if (@_) { $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = shift } |
|
6
|
|
|
|
|
12
|
|
23
|
3314
|
|
|
|
|
47492
|
return $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML}; |
24
|
|
|
|
|
|
|
} # of space_after_non_text_characteristics_html |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
########################################### |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
# |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
######################## |
31
|
|
|
|
|
|
|
# get_tokens |
32
|
|
|
|
|
|
|
######################## |
33
|
|
|
|
|
|
|
sub get_tokens { |
34
|
6
|
|
|
6
|
1
|
10
|
my $self = shift; |
35
|
|
|
|
|
|
|
|
36
|
6
|
|
|
|
|
12
|
my $tokens = []; |
37
|
6
|
|
|
|
|
9
|
my $flags = []; |
38
|
6
|
|
|
|
|
10
|
my $index = 0; |
39
|
|
|
|
|
|
|
# we should create tokens from the TEXT. |
40
|
6
|
|
|
|
|
23
|
my $text = $self->text(); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# the regular expression will try to match: |
43
|
|
|
|
|
|
|
# - HTML remarks - all the remark will be matched. |
44
|
|
|
|
|
|
|
# - HTML tags |
45
|
6
|
|
|
|
|
24
|
my $re1 = qr/(<\!\-\-[\s\S]+?\-\->)|(<\/?[^\>]*?>)/is; |
46
|
|
|
|
|
|
|
|
47
|
6
|
|
|
|
|
8
|
my $re2; |
48
|
6
|
100
|
|
|
|
14
|
if ($self->space_after_non_text_characteristics_html()) { |
49
|
|
|
|
|
|
|
# if the tag that we found is one of the following, we do not |
50
|
|
|
|
|
|
|
# put space after it: B, BASEFONT, BIG, BLINK, CITE, CODE, EM, |
51
|
|
|
|
|
|
|
# FONT, I, KBD, PLAINTEXT, S, SMALL, STRIKE, STRONG, SUB, SUP, |
52
|
|
|
|
|
|
|
# TT, U, VAR, A, SPAN, WBR |
53
|
3
|
|
|
|
|
8
|
$re2 = '<\!\-\-.+?\-\->|'. |
54
|
|
|
|
|
|
|
'\<\!\[[^\]]*?\]\>|'. |
55
|
|
|
|
|
|
|
'<\/?\s*B(\s[^>]*?>|\s*>)|'. |
56
|
|
|
|
|
|
|
'<\/?\s*BASEFONT(\s[^>]*?>|\s*>)|'. |
57
|
|
|
|
|
|
|
'<\/?\s*BIG(\s[^>]*?>|\s*>)|'. |
58
|
|
|
|
|
|
|
'<\/?\s*BLINK(\s[^>]*?>|\s*>)|'. |
59
|
|
|
|
|
|
|
'<\/?\s*CITE(\s[^>]*?>|\s*>)|'. |
60
|
|
|
|
|
|
|
'<\/?\s*CODE(\s[^>]*?>|\s*>)|'. |
61
|
|
|
|
|
|
|
'<\/?\s*EM(\s[^>]*?>|\s*>)|'. |
62
|
|
|
|
|
|
|
'<\/?\s*FONT(\s[^>]*?>|\s*>)|'. |
63
|
|
|
|
|
|
|
'<\/?\s*I(\s[^>]*?>|\s*>)|'. |
64
|
|
|
|
|
|
|
'<\/?\s*KBD(\s[^>]*?>|\s*>)|'. |
65
|
|
|
|
|
|
|
'<\/?\s*PLAINTEXT(\s[^>]*?>|\s*>)|'. |
66
|
|
|
|
|
|
|
'<\/?\s*S(\s[^>]*?>|\s*>)|'. |
67
|
|
|
|
|
|
|
'<\/?\s*SMALL(\s[^>]*?>|\s*>)|'. |
68
|
|
|
|
|
|
|
'<\/?\s*STRIKE(\s[^>]*?>|\s*>)|'. |
69
|
|
|
|
|
|
|
'<\/?\s*STRONG(\s[^>]*?>|\s*>)|'. |
70
|
|
|
|
|
|
|
'<\/?\s*SUB(\s[^>]*?>|\s*>)|'. |
71
|
|
|
|
|
|
|
'<\/?\s*SUP(\s[^>]*?>|\s*>)|'. |
72
|
|
|
|
|
|
|
'<\/?\s*TT(\s[^>]*?>|\s*>)|'. |
73
|
|
|
|
|
|
|
'<\/?\s*U(\s[^>]*?>|\s*>)|'. |
74
|
|
|
|
|
|
|
'<\/?\s*VAR(\s[^>]*?>|\s*>)|'. |
75
|
|
|
|
|
|
|
'<\/?\s*A(\s[^>]*?>|\s*>)|'. |
76
|
|
|
|
|
|
|
'<\/?\s*SPAN(\s[^>]*?>|\s*>)|'. |
77
|
|
|
|
|
|
|
'<\/?\s*WBR(\s[^>]*?>|\s*>)|'. |
78
|
|
|
|
|
|
|
'<\/?\s*[OVWXP]\:[^>]*?>'; |
79
|
3
|
|
|
|
|
338
|
$re2 = qr/$re2/is; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
6
|
|
66
|
|
|
114
|
while (defined($text) && $text =~ /$re1/) { |
83
|
3302
|
100
|
|
|
|
7949
|
if ($`) { # if there is a text before, take it as clean |
84
|
960
|
|
|
|
|
5684
|
$tokens->[$index] = $`; |
85
|
960
|
|
|
|
|
1204
|
$flags->[$index] = 1; # the text before the match is clean. |
86
|
960
|
|
|
|
|
1160
|
$index++; # increment the index |
87
|
|
|
|
|
|
|
} |
88
|
3302
|
|
|
|
|
6915
|
$tokens->[$index] = $&; |
89
|
3302
|
|
|
|
|
4917
|
$flags->[$index] = 0; # the match itself is unwanted. |
90
|
3302
|
|
|
|
|
24253
|
$text = $'; # update the original text to after the match. |
91
|
3302
|
|
|
|
|
3799
|
$index++; # increment the index again |
92
|
|
|
|
|
|
|
# check if we should add space after the text |
93
|
3302
|
100
|
100
|
|
|
8084
|
if ($self->space_after_non_text_characteristics_html() && |
94
|
|
|
|
|
|
|
$tokens->[$index - 1] !~ /$re2/) { # this tag is not text |
95
|
|
|
|
|
|
|
# characteristic tag |
96
|
|
|
|
|
|
|
# we add a space token after this tag |
97
|
423
|
|
|
|
|
687
|
$tokens->[$index] = " "; |
98
|
423
|
|
|
|
|
521
|
$flags->[$index] = 1; |
99
|
423
|
|
|
|
|
3651
|
$index++; |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# if we had no match, check if there is still something in the |
104
|
|
|
|
|
|
|
# $text. this will be also a clean text. |
105
|
6
|
50
|
33
|
|
|
32
|
if (defined($text) && $text) { |
106
|
6
|
|
|
|
|
15
|
$tokens->[$index] = $text; |
107
|
6
|
|
|
|
|
9
|
$flags->[$index] = 1; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
# return the two lists |
110
|
6
|
|
|
|
|
59
|
return ($tokens, $flags); |
111
|
|
|
|
|
|
|
} # of get_tokens |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
1; # make perl happy |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
__END__ |