File Coverage

blib/lib/HTML/TokeParser.pm

Criterion	Covered	Total	%
statement	85	90	94.4
branch	39	50	78.0
condition	14	17	82.3
subroutine	10	10	100.0
pod	5	5	100.0
total	153	172	88.9

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::TokeParser;
2
3	1			1		4214	use strict;
	1					2
	1					50
4
5							require HTML::PullParser;
6							our @ISA = qw(HTML::PullParser);
7							our $VERSION = '3.83';
8
9	1			1		8	use Carp ();
	1					1
	1					24
10	1			1		459	use HTML::Entities qw(decode_entities);
	1					2
	1					70
11	1			1		415	use HTML::Tagset ();
	1					1018
	1					1060
12
13							my %ARGS =
14							(
15							start => "'S',tagname,attr,attrseq,text",
16							end => "'E',tagname,text",
17							text => "'T',text,is_cdata",
18							process => "'PI',token0,text",
19							comment => "'C',text",
20							declaration => "'D',text",
21
22							# options that default on
23							unbroken_text => 1,
24							);
25
26
27							sub new
28							{
29	10			10	1	278989	my $class = shift;
30	10					17	my %cnf;
31
32	10	50				28	if (@_ == 1) {
33	10	100				26	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
34	10					28	%cnf = ($type => $_[0]);
35							}
36							else {
37	0	0				0	unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1);
		0
38	0					0	%cnf = @_;
39							}
40
41	10		50			53	my $textify = delete $cnf{textify} \|\| {img => "alt", applet => "alt"};
42
43	10		100			56	my $self = $class->SUPER::new(%ARGS, %cnf) \|\| return undef;
44
45	9					21	$self->{textify} = $textify;
46	9					31	$self;
47							}
48
49
50							sub get_tag
51							{
52	43			43	1	2507	my $self = shift;
53	43					45	my $token;
54	43					44	while (1) {
55	117		100			162	$token = $self->get_token \|\| return undef;
56	115					133	my $type = shift @$token;
57	115	100	100			213	next unless $type eq "S" \|\| $type eq "E";
58	57	100				84	substr($token->[0], 0, 0) = "/" if $type eq "E";
59	57	100				97	return $token unless @_;
60	21					27	for (@_) {
61	24	100				47	return $token if $token->[0] eq $_;
62							}
63							}
64							}
65
66
67							sub _textify {
68	10			10		21	my($self, $token) = @_;
69	10					17	my $tag = $token->[1];
70	10	100				79	return undef unless exists $self->{textify}{$tag};
71
72	1					2	my $alt = $self->{textify}{$tag};
73	1					1	my $text;
74	1	50				4	if (ref($alt)) {
75	0					0	$text = &$alt(@$token);
76							} else {
77	1		50			3	$text = $token->[2]{$alt \|\| "alt"};
78	1	50				3	$text = "[\U$tag]" unless defined $text;
79							}
80	1					4	return $text;
81							}
82
83
84							sub get_text
85							{
86	5			5	1	7	my $self = shift;
87	5					8	my @text;
88	5					27	while (my $token = $self->get_token) {
89	19					28	my $type = $token->[0];
90	19	100				54	if ($type eq "T") {
		100
91	9					13	my $text = $token->[1];
92	9	50				78	decode_entities($text) unless $token->[2];
93	9					37	push(@text, $text);
94							} elsif ($type =~ /^[SE]$/) {
95	9					15	my $tag = $token->[1];
96	9	100				14	if ($type eq "S") {
97	5	100				12	if (defined(my $text = _textify($self, $token))) {
98	1					1	push(@text, $text);
99	1					4	next;
100							}
101							} else {
102	4					8	$tag = "/$tag";
103							}
104	8	100	100			39	if (!@_ \|\| grep $_ eq $tag, @_) {
105	5					15	$self->unget_token($token);
106	5					8	last;
107							}
108							push(@text, " ")
109	3	100	66			25	if $tag eq "br" \|\| !$HTML::Tagset::isPhraseMarkup{$token->[1]};
110							}
111							}
112	5					22	join("", @text);
113							}
114
115
116							sub get_trimmed_text
117							{
118	4			4	1	18	my $self = shift;
119	4					13	my $text = $self->get_text(@_);
120	4					17	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	4					23
	4					18
121	4					14	$text;
122							}
123
124							sub get_phrase {
125	3			3	1	1005	my $self = shift;
126	3					6	my @text;
127	3					10	while (my $token = $self->get_token) {
128	19					32	my $type = $token->[0];
129	19	100				62	if ($type eq "T") {
		50
130	9					16	my $text = $token->[1];
131	9	50				49	decode_entities($text) unless $token->[2];
132	9					34	push(@text, $text);
133							} elsif ($type =~ /^[SE]$/) {
134	10					19	my $tag = $token->[1];
135	10	100				28	if ($type eq "S") {
136	5	50				12	if (defined(my $text = _textify($self, $token))) {
137	0					0	push(@text, $text);
138	0					0	next;
139							}
140							}
141	10	100				33	if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
142	3					12	$self->unget_token($token);
143	3					9	last;
144							}
145	7	100				29	push(@text, " ") if $tag eq "br";
146							}
147							}
148	3					13	my $text = join("", @text);
149	3					13	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	3					14
	3					20
150	3					13	$text;
151							}
152
153							1;
154
155
156							__END__