File Coverage

blib/lib/HTML/TokeParser.pm

Criterion	Covered	Total	%
statement	85	90	94.4
branch	39	50	78.0
condition	14	17	82.3
subroutine	10	10	100.0
pod	5	5	100.0
total	153	172	88.9

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::TokeParser;
2
3	1			1		490	use strict;
	1					7
	1					59
4
5							require HTML::PullParser;
6							our @ISA = qw(HTML::PullParser);
7							our $VERSION = '3.80';
8
9	1			1		5	use Carp ();
	1					2
	1					30
10	1			1		432	use HTML::Entities qw(decode_entities);
	1					3
	1					62
11	1			1		458	use HTML::Tagset ();
	1					1286
	1					992
12
13							my %ARGS =
14							(
15							start => "'S',tagname,attr,attrseq,text",
16							end => "'E',tagname,text",
17							text => "'T',text,is_cdata",
18							process => "'PI',token0,text",
19							comment => "'C',text",
20							declaration => "'D',text",
21
22							# options that default on
23							unbroken_text => 1,
24							);
25
26
27							sub new
28							{
29	10			10	1	6788	my $class = shift;
30	10					18	my %cnf;
31
32	10	50				29	if (@_ == 1) {
33	10	100				30	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
34	10					28	%cnf = ($type => $_[0]);
35							}
36							else {
37	0	0				0	unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1);
		0
38	0					0	%cnf = @_;
39							}
40
41	10		50			61	my $textify = delete $cnf{textify} \|\| {img => "alt", applet => "alt"};
42
43	10		100			63	my $self = $class->SUPER::new(%ARGS, %cnf) \|\| return undef;
44
45	9					20	$self->{textify} = $textify;
46	9					30	$self;
47							}
48
49
50							sub get_tag
51							{
52	43			43	1	2245	my $self = shift;
53	43					50	my $token;
54	43					53	while (1) {
55	117		100			215	$token = $self->get_token \|\| return undef;
56	115					163	my $type = shift @$token;
57	115	100	100			296	next unless $type eq "S" \|\| $type eq "E";
58	57	100				106	substr($token->[0], 0, 0) = "/" if $type eq "E";
59	57	100				116	return $token unless @_;
60	21					39	for (@_) {
61	24	100				61	return $token if $token->[0] eq $_;
62							}
63							}
64							}
65
66
67							sub _textify {
68	10			10		19	my($self, $token) = @_;
69	10					15	my $tag = $token->[1];
70	10	100				35	return undef unless exists $self->{textify}{$tag};
71
72	1					30	my $alt = $self->{textify}{$tag};
73	1					17	my $text;
74	1	50				5	if (ref($alt)) {
75	0					0	$text = &$alt(@$token);
76							} else {
77	1		50			6	$text = $token->[2]{$alt \|\| "alt"};
78	1	50				4	$text = "[\U$tag]" unless defined $text;
79							}
80	1					5	return $text;
81							}
82
83
84							sub get_text
85							{
86	5			5	1	9	my $self = shift;
87	5					10	my @text;
88	5					12	while (my $token = $self->get_token) {
89	19					32	my $type = $token->[0];
90	19	100				67	if ($type eq "T") {
		100
91	9					14	my $text = $token->[1];
92	9	50				44	decode_entities($text) unless $token->[2];
93	9					31	push(@text, $text);
94							} elsif ($type =~ /^[SE]$/) {
95	9					29	my $tag = $token->[1];
96	9	100				20	if ($type eq "S") {
97	5	100				12	if (defined(my $text = _textify($self, $token))) {
98	1					3	push(@text, $text);
99	1					5	next;
100							}
101							} else {
102	4					10	$tag = "/$tag";
103							}
104	8	100	100			40	if (!@_ \|\| grep $_ eq $tag, @_) {
105	5					23	$self->unget_token($token);
106	5					11	last;
107							}
108							push(@text, " ")
109	3	100	66			22	if $tag eq "br" \|\| !$HTML::Tagset::isPhraseMarkup{$token->[1]};
110							}
111							}
112	5					20	join("", @text);
113							}
114
115
116							sub get_trimmed_text
117							{
118	4			4	1	37	my $self = shift;
119	4					14	my $text = $self->get_text(@_);
120	4					25	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	4					22
	4					17
121	4					14	$text;
122							}
123
124							sub get_phrase {
125	3			3	1	691	my $self = shift;
126	3					5	my @text;
127	3					8	while (my $token = $self->get_token) {
128	19					29	my $type = $token->[0];
129	19	100				51	if ($type eq "T") {
		50
130	9					13	my $text = $token->[1];
131	9	50				34	decode_entities($text) unless $token->[2];
132	9					27	push(@text, $text);
133							} elsif ($type =~ /^[SE]$/) {
134	10					16	my $tag = $token->[1];
135	10	100				18	if ($type eq "S") {
136	5	50				9	if (defined(my $text = _textify($self, $token))) {
137	0					0	push(@text, $text);
138	0					0	next;
139							}
140							}
141	10	100				29	if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
142	3					9	$self->unget_token($token);
143	3					6	last;
144							}
145	7	100				24	push(@text, " ") if $tag eq "br";
146							}
147							}
148	3					10	my $text = join("", @text);
149	3					10	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	3					12
	3					17
150	3					9	$text;
151							}
152
153							1;
154
155
156							__END__