File Coverage

blib/lib/HTML/SimpleParse.pm

Criterion	Covered	Total	%
statement	63	70	90.0
branch	30	38	78.9
condition	5	6	83.3
subroutine	14	16	87.5
pod	14	14	100.0
total	126	144	87.5

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::SimpleParse;
2
3	1			1		8177	use strict;
	1					2
	1					38
4	1			1		5	use vars qw($VERSION $FIX_CASE);
	1					1
	1					4086
5
6							$VERSION = '0.12';
7							my $debug = 0;
8
9							sub new {
10	4			4	1	1067	my $pack = shift;
11
12	4					22	my $self = bless {
13							'text' => shift(),
14							'tree' => [],
15							@_
16							}, $pack;
17
18	4	100	100			35	$self->parse() if defined $self->{'text'} and length $self->{'text'};
19	4					11	return $self;
20							}
21
22							sub text {
23	0			0	1	0	my $self = shift;
24	0	0				0	$self->{'text'} = shift if @_;
25	0					0	return $self->{'text'};
26							}
27
28	2			2	1	7	sub tree { @{$_[0]->{'tree'}} }
	2					14
29
30							sub parse {
31							# Much of this is a dumbed-down version of HTML::Parser::parse.
32
33	2			2	1	5	my $self = shift;
34	2					4	my $text = \ $self->{'text'};
35	2					5	my $tree = $self->{'tree'};
36
37							# Parse html text in $$text. The strategy is to remove complete
38							# tokens from the beginning of $$text until we can't decide whether
39							# it is a token or not, or the $$text is empty.
40
41	2					5	@$tree = ();
42	2					4	while (1) {
43	58					65	my ($content, $type);
44
45							# First we try to pull off any plain text (anything before a "<" char)
46	58	100				275	if ($$text =~ /\G([^<]+)/gcs) {
		100
		100
		100
		100
47	26					43	$content = $1; $type = 'text';
	26					31
48
49							# Then, SSI, comments, and markup declarations (usually )
50							# ssi:
51							# comment:
52							# markup:
53							} elsif ($$text =~ /\G<(!--(\#?).*?--)>/gcs) {
54	4	100				13	$type = ($2 ? 'ssi' : 'comment');
55	4					6	$content = $1;
56
57							} elsif ($$text =~ /\G<(!.*?)>/gcs) {
58	2					4	$type = 'markup';
59	2					4	$content = $1;
60
61							# Then, look for an end tag
62							} elsif ($$text =~ m\|\G<(/[a-zA-Z][a-zA-Z0-9\.\-]\s)>\|gcs) {
63	12					18	$content = $1; $type = 'endtag';
	12					18
64
65							# Then, finally we look for a start tag
66							# We know the first char is <, make sure there's a >
67							} elsif ($$text =~ /\G<(.*?)>/gcs) {
68	12					23	$content = $1; $type = 'starttag';
	12					15
69
70							} else {
71							# the string is exhausted, or there's no > in it.
72	2	50				14	push @$tree, {
73							'content' => substr($$text, pos $$text),
74							'type' => 'text',
75							} unless pos($$text) eq length($$text);
76	2					4	last;
77							}
78
79	56	100				292	push @$tree, {
80							'content' => $content,
81							'type' => $type,
82							'offset' => ($type eq 'text' ?
83							pos($$text) - length($content) :
84							pos($$text) - length($content) - 2),
85							};
86							}
87
88	2					4	$self;
89							}
90
91
92							$FIX_CASE = 1;
93							sub parse_args {
94	12			12	1	3950	my $self = shift; # Not needed here
95	12					19	my $str = shift;
96	12	100	66			48	my $fix_case = ((ref $self and exists $self->{fix_case}) ? $self->{fix_case} : $FIX_CASE);
97	12					15	my @returns;
98
99							# Make sure we start searching at the beginning of the string
100	12					32	pos($str) = 0;
101
102	12					20	while (1) {
103	33	100				93	next if $str =~ m/\G\s+/gc; # Get rid of leading whitespace
104
105	32	100				145	if ( $str =~ m/\G
		100
106							([\w.-]+)\s=\s # the key
107							(?:
108							"([^\"\\]* (?: \\.[^\"\\]* )* )"\s* # quoted string, with possible whitespace inside,
109							\| # or
110							'([^\'\\]* (?: \\.[^\'\\]* )* )'\s* # quoted string, with possible whitespace inside,
111							\| # or
112							([^\s>])\s # anything else, without whitespace or >
113							)/gcx ) {
114
115	16					45	my ($key, $val) = ($1, $+);
116	16					31	$val =~ s/\\(.)/$1/gs;
117	16	50				57	push @returns, ($fix_case==1 ? uc($key) : $fix_case==-1 ? lc($key) : $key), $val;
		100
118
119							} elsif ( $str =~ m,\G/?([\w.-]+)\s*,gc ) {
120	4	0				18	push @returns, ($fix_case==1 ? uc($1) : $fix_case==-1 ? lc($1) : $1 ), undef;
		50
121							} else {
122	12					17	last;
123							}
124							}
125
126	12					72	return @returns;
127							}
128
129
130							sub execute {
131	28			28	1	33	my $self = shift;
132	28					28	my $ref = shift;
133	28					46	my $method = "output_$ref->{type}";
134	28	50				45	warn "calling $self->$method(...)" if $debug;
135	28					66	return $self->$method($ref->{content});
136							}
137
138							sub get_output {
139	1			1	1	7	my $self = shift;
140	1					2	my ($method, $out) = ('', '');
141	1					5	foreach ($self->tree) {
142	28					53	$out .= $self->execute($_);
143							}
144	1					6	return $out;
145							}
146
147
148							sub output {
149	0			0	1	0	my $self = shift;
150	0					0	my $method;
151	0					0	foreach ($self->tree) {
152	0					0	print $self->execute($_);
153							}
154							}
155
156	13			13	1	36	sub output_text { $_[1]; }
157	1			1	1	5	sub output_comment { "<$_[1]>"; }
158	6			6	1	18	sub output_endtag { "<$_[1]>"; }
159	6			6	1	19	sub output_starttag { "<$_[1]>"; }
160	1			1	1	4	sub output_markup { "<$_[1]>"; }
161	1			1	1	5	sub output_ssi { "<$_[1]>"; }
162
163							1;
164							__END__