File Coverage

blib/lib/Markdown/Perl/HTML.pm

Criterion	Covered	Total	%
statement	60	62	96.7
branch	17	20	85.0
condition	7	10	70.0
subroutine	14	14	100.0
pod	0	7	0.0
total	98	113	86.7

line	stmt	bran	cond	sub	pod	time	code
1							package Markdown::Perl::HTML;
2
3	33			33		22598	use strict;
	33					72
	33					1497
4	33			33		226	use warnings;
	33					76
	33					2199
5	33			33		211	use utf8;
	33					59
	33					224
6	33			33		1442	use feature ':5.24';
	33					72
	33					6548
7
8	33			33		345	use Carp 'cluck';
	33					72
	33					2557
9	33			33		946	use English;
	33					16484
	33					275
10	33			33		36001	use Exporter 'import';
	33					95
	33					155334
11
12							our @EXPORT_OK = qw(decode_entities html_escape http_escape remove_disallowed_tags
13							parse_attributes);
14							our %EXPORT_TAGS = (all => \@EXPORT_OK);
15
16							our $VERSION = 0.01;
17
18							our %html_entities;
19
20							# Parse the __DATA__ section into the %html_entities hash.
21							# This is done lazily to reduce the loading time of the program if the table is
22							# not needed.
23							#
24							# TODO: add a :full_init import option to the main package, to force the early
25							# execution of this method.
26							sub parse_entities {
27	16819	100		16819	0	291299	return if %html_entities;
28	5					30	local $INPUT_RECORD_SEPARATOR = undef;
29	5					119037	%html_entities = eval ; ## no critic (ProhibitStringyEval);
30	5	50				717	confess $EVAL_ERROR if $EVAL_ERROR;
31	5	50				122	close DATA or cluck 'Can’t close DATA file handle in lazy parsing of HTML entities mapping';
32	5					169	return;
33							}
34
35							# Decode HTML entities like & in the given string. Do it in-place if called
36							# in void-context otherwise returns a modified string.
37							# This is mostly equivalent to HTML::Entities::decode_entities but with our more
38							# complete list of entities, probably slower, and handling the replacement of
39							# 0x0 characters by 0xfffd.
40
41							# Currently, CounterClockwiseContourIntegral is the longest entity name, at 31
42							# char. We limit our search to 40 char to limit possible backtracking in case of
43							# failures (even if the spec does not say anything about it).
44							my $numeric_entity_re = qr/(?:\#(?: (? [0-9]{1,7} ) \| [xX] (? [a-fA-F0-9]{1,6} ) ))/x;
45							my $entity_re = qr/& (?: ${numeric_entity_re} \| (? [a-zA-Z0-9]{2,40} ) ) ; /x;
46
47							sub convert_entity {
48							# the `\|\| 0xfffd` part is so that � is correctly replaced by the 0xfffd
49							# "replacement" character (as is being done on the whole string at the
50							# beginning of its processing).
51							return
52							exists $+{dec} ? chr($+{dec} \|\| 0xfffd)
53							: exists $+{hex} ? chr(hex($+{hex}) \|\| 0xfffd)
54	16849	100	100	16849	0	215348	: (parse_entities(), $html_entities{$+{named}}) // "&$+{named};";
		100	50
			66
55							}
56
57							sub decode_entities {
58	114338	100		114338	0	2152889	return $_[0] =~ s/${entity_re}/&convert_entity/egr if defined wantarray;
	7					19
59	114325					696971	$_[0] =~ s/${entity_re}/&convert_entity/eg;
	16842					54288
60	114325					238621	return;
61							}
62
63							# There are four characters that are escaped in the html output (although the
64							# spec never really says so because they claim that they care only about parsing).
65							my %char_to_html_entity = (
66							'"' => '"',
67							"'" => ''',
68							'&' => '&',
69							'<' => '<',
70							'>' => '>'
71							);
72
73							# html_escape($str_to_escape, $character_class_to_use_for_escaping)
74							sub html_escape {
75							# TODO: I believe that this is re-compiled at each call because of the local
76							# variable. The regex should be built once in the Markdown::Perl object.
77	163299			163299	0	1133784	$_[0] =~ s/([$_[1]])/$char_to_html_entity{$1}/eg;
	81048					395159
78	163299					371078	return;
79							}
80
81							sub http_escape {
82	11650			11650	0	54564	utf8::encode($_[0]);
83							# TODO: there are more characters that need to be escaped here, but it’s not
84							# extremely clear which ones (the spec does a show of pretending to care only
85							# only about parsing, but the conformance test suite expect something quite
86							# specific. See: https://spec.commonmark.org/0.31.2/#example-502
87	11650					28194	$_[0] =~ s/([ \\\[\]\x80-\xff`""])/sprintf('%%%02X', ord($1))/ge;
	145					859
88	11650					22609	return;
89							}
90
91							# remove_disallowed_tags($html, \@tag_list)
92							# modify $html in place.
93							sub remove_disallowed_tags {
94							# TODO: check if GitHub also deactivates closing tags or only opening ones.
95							# Note: this is a dummy approach, based on the fact that this method is called
96							# only in the context of HTML blocks or inline HTML and so, something looking
97							# like an HTML tag is probably an HTML tag.
98	17235			17235	0	46219	for my $t (@{$_[1]}) {
	17235					54862
99	32976					416123	$_[0] =~ s/<(\Q$t\E)/<$1/gi;
100							}
101	17235					47610	return;
102							}
103
104							# This parses the attributes of the directive syntax. E.g. it receives the
105							# attribute part in the :name[content]{attributes} syntax, where each part can
106							# be missing.
107							sub parse_attributes {
108	13			13	0	62	my ($attributes) = @_;
109	13					47	my %out;
110	13					69	my $id_re = qr/(?: \# (? \w+ ))/x;
111	13					56	my $class_re = qr/(?: \. (? \w+ ))/x;
112	13					59	my $key_value_re = qr/(?: (? \w+ ) \s* = \s* (? \w+ ))/x;
113	13					75	pos($attributes) = 0; # so that it’s set even if we don’t match once to avoid a warning later.
114	13					705	while ($attributes =~ m/\G\s* (?: ${id_re} \| ${class_re} \| ${key_value_re} ) /gcx) {
115	16	100	66			336	if (defined ($+{id}) && !exists($out{id})) {
		100
		50
116	4					56	$out{id} = $+{id};
117							} elsif (defined ($+{class})) {
118	9					29	push(@{$out{class}}, $+{class});
	9					180
119							} elsif (defined ($+{key})) {
120	3					9	push(@{$out{keys}}, [$+{key}, $+{value}]);
	3					83
121							} else {
122	0					0	pos($attributes) = $LAST_MATCH_START[0];
123	0					0	last;
124							}
125							}
126	13	100				116	if ($attributes !~ m/\G \s* $/x) {
127	4					33	$out{junk} = substr($attributes, pos($attributes));
128							}
129
130	13					117	return %out;
131							}
132
133							1;
134
135							__DATA__