File Coverage

blib/lib/Mail/Pyzor/Digest/StripHtml.pm

Criterion	Covered	Total	%
statement	9	40	22.5
branch	0	10	0.0
condition			n/a
subroutine	3	7	42.8
pod	1	1	100.0
total	13	58	22.4

line	stmt	bran	sub	pod	time	code
1						package Mail::Pyzor::Digest::StripHtml;
2
3						# Copyright 2018 cPanel, LLC.
4						# All rights reserved.
5						# http://cpanel.net
6						#
7						# This is free software; you can redistribute it and/or modify it under the
8						# Apache 2.0 license.
9
10	1		1		202915	use strict;
	1				10
	1				32
11	1		1		5	use warnings;
	1				2
	1				34
12
13						=encoding utf-8
14
15						=head1 NAME
16
17						Mail::Pyzor::Digest::StripHtml
18
19						=head1 SYNOPSIS
20
21						my $stripped = Mail::Pyzor::Digest::StripHtml::strip($html);
22
23						=head1 DESCRIPTION
24
25						This module attempts to duplicate pyzor’s HTML-stripping logic.
26
27						=head1 ACCURACY
28
29						This library cannot achieve 100%, bug-for-bug parity with pyzor
30						because to do so would require duplicating Python’s own HTML parsing
31						library. Since that library’s output has changed over time, and those
32						changes in turn affect pyzor, it’s literally impossible to arrive at
33						a single, fully-compatible reimplementation.
34
35						That said, all known divergences between pyzor and this library involve
36						invalid HTML as input.
37
38						Please open bug reports for any divergences you identify, particularly
39						if the input is valid HTML.
40
41						=cut
42
43						#----------------------------------------------------------------------
44
45	1		1		632	use HTML::Parser ();
	1				5944
	1				582
46
47						#----------------------------------------------------------------------
48
49						=head1 FUNCTIONS
50
51						=head2 $stripped = strip( $HTML )
52
53						Give it some HTML, and it’ll give back the stripped text.
54
55						In B, the stripping consists of removing tags as well as
56						CscriptE> and CstyleE> elements; however, it also
57						removes HTML entities.
58
59						This tries very hard to duplicate pyzor’s behavior with invalid HTML.
60
61						=cut
62
63						sub strip {
64	0		0	1		my ($html) = @_;
65
66	0					$html =~ s<\A\s+><>;
67	0					$html =~ s<\s+\z><>;
68
69	0					my $p = HTML::Parser->new( api_version => 3 );
70
71	0					my @pieces;
72
73	0					my $accumulate = 1;
74
75						$p->handler(
76						start => sub {
77	0		0			my ($tagname) = @_;
78
79	0	0				$accumulate = 0 if $tagname eq 'script';
80	0	0				$accumulate = 0 if $tagname eq 'style';
81
82	0					return;
83						},
84	0					'tagname',
85						);
86
87						$p->handler(
88						end => sub {
89	0		0			$accumulate = 1;
90	0					return;
91						}
92	0					);
93
94						$p->handler(
95						text => sub {
96	0		0			my ($copy) = @_;
97
98	0	0				return if !$accumulate;
99
100						# pyzor’s HTML parser discards HTML entities. On top of that,
101						# we need to match, as closely as possible, pyzor’s handling of
102						# invalid HTML entities … which is a function of Python’s
103						# standard HTML parsing library. This will probably never be
104						# fully compatible with the pyzor, but we can get it close.
105
106						# The original is:
107						#
108						# re.compile('&#(?:[0-9]+\|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
109						#
110						# The parsing loop then “backs up” one byte if the last
111						# character isn’t a “;”. We use a look-ahead assertion to
112						# mimic that behavior.
113	0					$copy =~ s<\&\# (?:[0-9]+ \| [xX][0-9a-fA-F]+) (?: ; \| \z \| (?=[^0-9a-fA-F]) )>< >gx;
114
115						# The original is:
116						#
117						# re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
118						#
119						# We again use a look-ahead assertion to mimic Python.
120	0					$copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; \| \z \| (?=[^a-zA-Z0-9]) )>< >gx;
121
122						# Python’s HTMLParser aborts its parsing loop when it encounters
123						# an invalid numeric reference.
124	0					$copy =~ s<\&\#
125						(?:
126						[^0-9xX] # anything but the expected first char
127						\|
128						[0-9]+[a-fA-F] # hex within decimal
129						\|
130						[xX][^0-9a-fA-F]
131						)
132						(.*)
133						><
134	0	0				( -1 == index($1, ';') ) ? q<> : '&#'
135						>exs;
136
137						# Python’s HTMLParser treats invalid entities as incomplete
138	0					$copy =~ s<(\&\#?)><$1 >gx;
139
140	0					$copy =~ s<\A\s+><>;
141	0					$copy =~ s<\s+\z><>;
142
143	0	0				push @pieces, \$copy if length $copy;
144						},
145	0					'text,tagname',
146						);
147
148	0					$p->parse($html);
149	0					$p->eof();
150
151	0					my $payload = join( q< >, map { $$_ } @pieces );
	0
152
153						# Convert all sequences of whitespace OTHER THAN non-breaking spaces to
154						# plain spaces.
155	0					$payload =~ s<[^\S\x{a0}]+>< >g;
156
157	0					return $payload;
158						}
159
160						1;