File Coverage

blib/lib/Mail/Pyzor/Digest/StripHtml.pm

Criterion	Covered	Total	%
statement	9	40	22.5
branch	0	10	0.0
condition			n/a
subroutine	3	7	42.8
pod	1	1	100.0
total	13	58	22.4

line	stmt	bran	sub	pod	time	code
1						package Mail::Pyzor::Digest::StripHtml;
2
3						# Copyright 2018 cPanel, LLC.
4						# All rights reserved.
5						# http://cpanel.net
6						#
7						# <@LICENSE>
8						# Licensed to the Apache Software Foundation (ASF) under one or more
9						# contributor license agreements. See the NOTICE file distributed with
10						# this work for additional information regarding copyright ownership.
11						# The ASF licenses this file to you under the Apache License, Version 2.0
12						# (the "License"); you may not use this file except in compliance with
13						# the License. You may obtain a copy of the License at:
14						#
15						# http://www.apache.org/licenses/LICENSE-2.0
16						#
17						# Unless required by applicable law or agreed to in writing, software
18						# distributed under the License is distributed on an "AS IS" BASIS,
19						# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20						# See the License for the specific language governing permissions and
21						# limitations under the License.
22						#
23						#
24
25	1		1		179787	use strict;
	1				9
	1				27
26	1		1		4	use warnings;
	1				2
	1				38
27
28						=encoding utf-8
29
30						=head1 NAME
31
32						Mail::Pyzor::Digest::StripHtml
33
34						=head1 SYNOPSIS
35
36						my $stripped = Mail::Pyzor::Digest::StripHtml::strip($html);
37
38						=head1 DESCRIPTION
39
40						This module attempts to duplicate pyzor’s HTML-stripping logic.
41
42						=head1 ACCURACY
43
44						This library cannot achieve 100%, bug-for-bug parity with pyzor
45						because to do so would require duplicating Python’s own HTML parsing
46						library. Since that library’s output has changed over time, and those
47						changes in turn affect pyzor, it’s literally impossible to arrive at
48						a single, fully-compatible reimplementation.
49
50						That said, all known divergences between pyzor and this library involve
51						invalid HTML as input.
52
53						Please open bug reports for any divergences you identify, particularly
54						if the input is valid HTML.
55
56						=cut
57
58						#----------------------------------------------------------------------
59
60	1		1		497	use HTML::Parser ();
	1				4836
	1				456
61
62						our $VERSION = '0.06_01'; # TRIAL
63						$VERSION =~ tr/_//d;
64
65						#----------------------------------------------------------------------
66
67						=head1 FUNCTIONS
68
69						=head2 $stripped = strip( $HTML )
70
71						Give it some HTML, and it’ll give back the stripped text.
72
73						In B<general>, the stripping consists of removing tags as well as
74						C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
75						removes HTML entities.
76
77						This tries very hard to duplicate pyzor’s behavior with invalid HTML.
78
79						=cut
80
81						sub strip {
82	0		0	1		my ($html) = @_;
83
84	0					$html =~ s<\A\s+><>;
85	0					$html =~ s<\s+\z><>;
86
87	0					my $p = HTML::Parser->new( api_version => 3 );
88
89	0					my @pieces;
90
91	0					my $accumulate = 1;
92
93						$p->handler(
94						start => sub {
95	0		0			my ($tagname) = @_;
96
97	0	0				$accumulate = 0 if $tagname eq 'script';
98	0	0				$accumulate = 0 if $tagname eq 'style';
99
100	0					return;
101						},
102	0					'tagname',
103						);
104
105						$p->handler(
106						end => sub {
107	0		0			$accumulate = 1;
108	0					return;
109						}
110	0					);
111
112						$p->handler(
113						text => sub {
114	0		0			my ($copy) = @_;
115
116	0	0				return if !$accumulate;
117
118						# pyzor’s HTML parser discards HTML entities. On top of that,
119						# we need to match, as closely as possible, pyzor’s handling of
120						# invalid HTML entities … which is a function of Python’s
121						# standard HTML parsing library. This will probably never be
122						# fully compatible with the pyzor, but we can get it close.
123
124						# The original is:
125						#
126						# re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
127						#
128						# The parsing loop then “backs up” one byte if the last
129						# character isn’t a “;”. We use a look-ahead assertion to
130						# mimic that behavior.
131	0					$copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
132
133						# The original is:
134						#
135						# re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
136						#
137						# We again use a look-ahead assertion to mimic Python.
138	0					$copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
139
140						# Python’s HTMLParser aborts its parsing loop when it encounters
141						# an invalid numeric reference.
142	0					$copy =~ s<\&\#
143						(?:
144						[^0-9xX] # anything but the expected first char
145						|
146						[0-9]+[a-fA-F] # hex within decimal
147						|
148						[xX][^0-9a-fA-F]
149						)
150						(.*)
151						><
152	0	0				( -1 == index($1, ';') ) ? q<> : '&#'
153						>exs;
154
155						# Python’s HTMLParser treats invalid entities as incomplete
156	0					$copy =~ s<(\&\#?)><$1 >gx;
157
158	0					$copy =~ s<\A\s+><>;
159	0					$copy =~ s<\s+\z><>;
160
161	0	0				push @pieces, \$copy if length $copy;
162						},
163	0					'text,tagname',
164						);
165
166	0					$p->parse($html);
167	0					$p->eof();
168
169	0					my $payload = join( q< >, map { $$_ } @pieces );
	0
170
171						# Convert all sequences of whitespace OTHER THAN non-breaking spaces to
172						# plain spaces.
173	0					$payload =~ s<[^\S\x{a0}]+>< >g;
174
175	0					return $payload;
176						}
177
178						1;