File Coverage

blib/lib/Regexp/IgnoreHTML.pm

Criterion	Covered	Total	%
statement	39	39	100.0
branch	9	10	90.0
condition	7	12	58.3
subroutine	4	4	100.0
pod	3	3	100.0
total	62	68	91.1

line	stmt	bran	cond	sub	pod	time	code
1							package Regexp::IgnoreHTML;
2	1			1		1343	use Regexp::Ignore;
	1					4
	1					537
3							our @ISA = ("Regexp::Ignore"); # inherit from Regexp::Ignore class
4
5							########################
6							# new
7							########################
8							sub new {
9	6			6	1	2573	my $proto = shift;
10	6		33			43	my $class = ref($proto) \|\| $proto;
11	6					43	my $self = $class->SUPER::new(@_);
12							# by default it does not add spaces
13	6					18	$self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = 0;
14	6					14	return $self;
15							} # of new
16
17							#############################################
18							# space_after_non_text_characteristics_html
19							#############################################
20							sub space_after_non_text_characteristics_html {
21	3314			3314	1	4675	my $self = shift;
22	3314	100				6455	if (@_) { $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML} = shift }
	6					12
23	3314					47492	return $self->{SPACE_AFTER_NON_TEXT_CHARACTERISTICS_HTML};
24							} # of space_after_non_text_characteristics_html
25
26							###########################################
27							#
28							#
29							#
30							########################
31							# get_tokens
32							########################
33							sub get_tokens {
34	6			6	1	10	my $self = shift;
35
36	6					12	my $tokens = [];
37	6					9	my $flags = [];
38	6					10	my $index = 0;
39							# we should create tokens from the TEXT.
40	6					23	my $text = $self->text();
41
42							# the regular expression will try to match:
43							# - HTML remarks - all the remark will be matched.
44							# - HTML tags
45	6					24	my $re1 = qr/(<\!\-\-[\s\S]+?\-\->)\|(<\/?[^\>]*?>)/is;
46
47	6					8	my $re2;
48	6	100				14	if ($self->space_after_non_text_characteristics_html()) {
49							# if the tag that we found is one of the following, we do not
50							# put space after it: B, BASEFONT, BIG, BLINK, CITE, CODE, EM,
51							# FONT, I, KBD, PLAINTEXT, S, SMALL, STRIKE, STRONG, SUB, SUP,
52							# TT, U, VAR, A, SPAN, WBR
53	3					8	$re2 = '<\!\-\-.+?\-\->\|'.
54							'\<\!\[[^\]]*?\]\>\|'.
55							'<\/?\sB(\s[^>]?>\|\s*>)\|'.
56							'<\/?\sBASEFONT(\s[^>]?>\|\s*>)\|'.
57							'<\/?\sBIG(\s[^>]?>\|\s*>)\|'.
58							'<\/?\sBLINK(\s[^>]?>\|\s*>)\|'.
59							'<\/?\sCITE(\s[^>]?>\|\s*>)\|'.
60							'<\/?\sCODE(\s[^>]?>\|\s*>)\|'.
61							'<\/?\sEM(\s[^>]?>\|\s*>)\|'.
62							'<\/?\sFONT(\s[^>]?>\|\s*>)\|'.
63							'<\/?\sI(\s[^>]?>\|\s*>)\|'.
64							'<\/?\sKBD(\s[^>]?>\|\s*>)\|'.
65							'<\/?\sPLAINTEXT(\s[^>]?>\|\s*>)\|'.
66							'<\/?\sS(\s[^>]?>\|\s*>)\|'.
67							'<\/?\sSMALL(\s[^>]?>\|\s*>)\|'.
68							'<\/?\sSTRIKE(\s[^>]?>\|\s*>)\|'.
69							'<\/?\sSTRONG(\s[^>]?>\|\s*>)\|'.
70							'<\/?\sSUB(\s[^>]?>\|\s*>)\|'.
71							'<\/?\sSUP(\s[^>]?>\|\s*>)\|'.
72							'<\/?\sTT(\s[^>]?>\|\s*>)\|'.
73							'<\/?\sU(\s[^>]?>\|\s*>)\|'.
74							'<\/?\sVAR(\s[^>]?>\|\s*>)\|'.
75							'<\/?\sA(\s[^>]?>\|\s*>)\|'.
76							'<\/?\sSPAN(\s[^>]?>\|\s*>)\|'.
77							'<\/?\sWBR(\s[^>]?>\|\s*>)\|'.
78							'<\/?\s[OVWXP]\:[^>]?>';
79	3					338	$re2 = qr/$re2/is;
80							}
81
82	6		66			114	while (defined($text) && $text =~ /$re1/) {
83	3302	100				7949	if ($`) { # if there is a text before, take it as clean
84	960					5684	$tokens->[$index] = $`;
85	960					1204	$flags->[$index] = 1; # the text before the match is clean.
86	960					1160	$index++; # increment the index
87							}
88	3302					6915	$tokens->[$index] = $&;
89	3302					4917	$flags->[$index] = 0; # the match itself is unwanted.
90	3302					24253	$text = $'; # update the original text to after the match.
91	3302					3799	$index++; # increment the index again
92							# check if we should add space after the text
93	3302	100	100			8084	if ($self->space_after_non_text_characteristics_html() &&
94							$tokens->[$index - 1] !~ /$re2/) { # this tag is not text
95							# characteristic tag
96							# we add a space token after this tag
97	423					687	$tokens->[$index] = " ";
98	423					521	$flags->[$index] = 1;
99	423					3651	$index++;
100							}
101							}
102
103							# if we had no match, check if there is still something in the
104							# $text. this will be also a clean text.
105	6	50	33			32	if (defined($text) && $text) {
106	6					15	$tokens->[$index] = $text;
107	6					9	$flags->[$index] = 1;
108							}
109							# return the two lists
110	6					59	return ($tokens, $flags);
111							} # of get_tokens
112
113							1; # make perl happy
114
115							__END__