File Coverage

blib/lib/Markdown/Perl/Inlines.pm

Criterion	Covered	Total	%
statement	402	403	99.7
branch	189	194	97.4
condition	61	63	96.8
subroutine	37	37	100.0
pod	0	20	0.0
total	689	717	96.0

line	stmt	bran	cond	sub	pod	time	code
1							# Package to process the inline structure of Markdown.
2
3							package Markdown::Perl::Inlines;
4
5	31			31		266	use strict;
	31					80
	31					1996
6	31			31		354	use warnings;
	31					90
	31					2780
7	31			31		231	use utf8;
	31					500
	31					739
8	31			31		1588	use feature ':5.24';
	31					96
	31					8859
9
10	31			31		279	use Carp;
	31					67
	31					2928
11	31			31		203	use English;
	31					60
	31					290
12	31			31		48527	use List::MoreUtils 'first_index', 'last_index';
	31					72
	31					409
13	31			31		82500	use List::Util 'min';
	31					110
	31					3350
14	31			31		218	use List::Util 1.45 'uniq';
	31					5302
	31					3173
15	31			31		53693	use Markdown::Perl::InlineTree ':all';
	31					190
	31					34123
16	31			31		335	use Markdown::Perl::Util 'normalize_label';
	31					76
	31					2184
17	31			31		224	use Markdown::Perl::HTML 'remove_disallowed_tags';
	31					72
	31					976216
18
19							our $VERSION = 0.01;
20
21							# Everywhere here, $that is a Markdown::Perl instance that we carry everywhere
22							# because it contains the options that we are using.
23							sub render {
24	44565			44565	0	134819	my ($that, $linkrefs, @lines) = @_;
25
26	44565					187085	my $text = join("\n", @lines);
27	44565					170264	my $tree = find_code_and_tag_runs($that, $text);
28
29							# At this point, @runs contains only 'text', 'code', or 'link' elements, that
30							# can’t have any children (yet).
31
32	44565			92575		360970	$tree->map(sub { process_char_escaping($that, $_) });
	92575					242637
33
34							# At this point, @runs can also contain 'literal' elements, that don’t have
35							# children.
36
37	44565					279416	process_links($that, $linkrefs, $tree);
38
39							# This removes the spurious white-space at the beginning and end of lines and
40							# also inserts hard line break as required.
41	44565					180617	process_whitespaces($that, $tree);
42
43							# Now, there are more link elements and they can have children instead of
44							# content.
45
46	44565	100				172248	if ($that->get_use_extended_autolinks) {
47	24308			58738		204132	$tree->map(sub { create_extended_autolinks($that, $_) });
	58738					162030
48	24308			59516		202975	$tree->map(sub { create_extended_email_autolinks($that, $_) });
	59516					171131
49							}
50
51	44565					207861	process_styles($that, $tree);
52
53							# At this point we have added the emphasis, strong emphasis, etc. in the tree.
54
55							$tree->apply(
56							sub {
57	180326			180326		546336	$_->escape_content($that->get_html_escaped_characters,
58							$that->get_html_escaped_code_characters);
59	44565					364958	});
60
61	44565					333352	my $out = $tree->render_html();
62
63	44565					535839	return $out;
64							}
65
66							# TODO: share these regex with Perl.pm (but note that we are not matching the
67							# open and close < > characters here).
68							my $html_tag_name_re = qr/[a-zA-Z][-a-zA-Z0-9]*/;
69							my $html_attribute_name_re = qr/[a-zA-Z_:][-a-zA-Z0-9_.:]*/;
70							my $html_space_re = qr/\n[ \t]\|[ \t][ \t]\n?[ \t]*/; # Spaces, tabs, and up to one line ending.
71							my $opt_html_space_re = qr/[ \t]\n?[ \t]/; # Optional spaces.
72							my $html_attribute_value_re = qr/ [^ \t\n"'=<>`]+ \| '[^']' \| "[^"]" /x;
73							my $html_attribute_re =
74							qr/ ${html_space_re} ${html_attribute_name_re} (?: ${opt_html_space_re} = ${opt_html_space_re} ${html_attribute_value_re} )? /x;
75
76							my $html_open_tag_re = qr/ ${html_tag_name_re} ${html_attribute_re}* ${opt_html_space_re} \/? /x;
77							my $html_close_tag_re = qr/ \/ ${html_tag_name_re} ${opt_html_space_re} /x;
78							my $html_comment_re = qr/!--\|!---\|!--.*?--/s;
79							my $html_proc_re = qr/\?.*?\?/s;
80							my $html_decl_re = qr/![a-zA-Z].*?/s;
81							my $html_cdata_re = qr/!\[CDATA\[.*?\]\]/s;
82
83							my $html_tag_re =
84							qr/ ${html_open_tag_re} \| ${html_close_tag_re} \| ${html_comment_re} \| ${html_proc_re} \| ${html_decl_re} \| ${html_cdata_re}/x;
85
86							# Bug: there is a bug in that backslash escapes don’t work inside autolinks. But
87							# we can turn our autolinks into full-links later (where the escape should
88							# work). However, the spec does not test this corner case so we’re fine.
89
90							sub find_code_and_tag_runs {
91	44565			44565	0	142591	my ($that, $text) = @_;
92
93	44565					223453	my $tree = Markdown::Perl::InlineTree->new();
94
95							# We match code-spans and autolinks first as they bind strongest. Raw HTML
96							# should be here too, but we don’t support it yet.
97							# https://spec.commonmark.org/0.30/#code-spans
98							# TODO: https://spec.commonmark.org/0.30/#autolinks
99							# TODO: https://spec.commonmark.org/0.30/#raw-html
100							# while ($text =~ m/(?\`+)\|(?\<)/g) {
101							# We are manually handling the backslash escaping here because they are not
102							# interpreted inside code blocks. We will then process all the others
103							# afterward.
104	44565					897216	while ($text =~ m/(? (?:\\\\)*) (?: (?\`+) \| < )/gx) {
105							my ($start_before, $start_after) =
106	50146					518960	($LAST_MATCH_START[0] + length($+{backslashes}), $LAST_MATCH_END[0]);
107	50146	100				336467	if ($+{code}) {
108	18908					88163	my $fence = $+{code};
109							# We’re searching for a fence of the same length, without any backtick
110							# before or after.
111	18908	100				562232	if ($text =~ m/(?
112	3803					18520	my ($end_before, $end_after) = ($LAST_MATCH_START[0], $LAST_MATCH_END[0]);
113	3803	100				24068	$tree->push(new_text(substr($text, 0, $start_before)))
114							if $start_before > 0;
115	3803					41252	$tree->push(new_code(substr($text, $start_after, ($end_before - $start_after))));
116	3803					84465	substr $text, 0, $end_after, ''; # This resets pos($text) as we want it to.
117							} # in the else clause, pos($text) == $start_after (because of the /c modifier).
118							} else {
119							# We matched a single < character.
120	31238					125152	my $re = $that->get_autolinks_regex;
121	31238					107428	my $email_re = $that->get_autolinks_email_regex;
122							# We’re not using /gc in these to regex because this confuses the ProhibitUnusedCapture
123							# PerlCritic policy. Anyway, as we are always resetting pos() in case of
124							# successful match, it’s not important to update it.
125	31238	100				734421	if ($text =~ m/\G(?${re})>/) {
		100
		100
126	8130	100				53752	$tree->push(new_text(substr($text, 0, $start_before)))
127							if $start_before > 0;
128	8130					93943	$tree->push(new_link($+{link}, type => 'autolink', target => $+{link}));
129	8130					164084	substr $text, 0, $+[0], ''; # This resets pos($text) as we want it to.
130							} elsif ($text =~ m/\G(?${email_re})>/) {
131	4	50				19	$tree->push(new_text(substr($text, 0, $start_before)))
132							if $start_before > 0;
133							# TODO: we have a bug in to_source_text, that will assume that the
134							# mailto: was part of the source text.
135	4					69	$tree->push(new_link($+{link}, type => 'autolink', target => 'mailto:'.$+{link}));
136	4					41	substr $text, 0, $+[0], ''; # This resets pos($text) as we want it to.
137							} elsif ($text =~ m/\G(?:${html_tag_re})>/) {
138							# This resets pos($text) as we want it to.
139	14487	100				100846	$tree->push(new_text(substr($text, 0, $start_before, '')))
140							if $start_before > 0;
141	14487					73460	my $html = substr($text, 0, $LAST_MATCH_END[0] - $start_before, '');
142	14487					91192	remove_disallowed_tags($html, $that->get_disallowed_html_tags);
143	14487					51270	$tree->push(new_html($html));
144							}
145							}
146							}
147	44565	100				200815	$tree->push(new_text($text)) if $text;
148
149	44565					149524	return $tree;
150							}
151
152							sub process_char_escaping {
153	92575			92575	0	199848	my ($that, $node) = @_;
154
155							# This is executed after
156	92575	100	100			550402	if ($node->{type} eq 'code' \|\| $node->{type} eq 'link') {
		100
		50
157							# At this stage, link nodes are only autolinks, in which back-slash escaping
158							# is not processed.
159	11937					37833	return $node;
160							} elsif ($node->{type} eq 'text') {
161							# TODO: with the current code for map, this could just be @nodes.
162	66151					219547	my $new_tree = Markdown::Perl::InlineTree->new();
163							# TODO: make this regex configurable (the set of characters that can be
164							# escaped). Note that the regex has to be updated in Perl.pm unescape_char
165							# method too.
166	66151					328181	while ($node->{content} =~ m/\\(\p{PosixPunct})/g) {
167							# Literal parsing is OK here (even if we will later create label reference
168							# which distinguish between escaped and non-escaped literals) because we
169							# can always invert it (and it makes the rest of the processing be much
170							# simpler because we don’t need to check whether we have escaped text or
171							# not).
172	157	100				8063	$new_tree->push(new_text(substr $node->{content}, 0, $LAST_MATCH_START[0]))
173							if $LAST_MATCH_START[0] > 0;
174	157					562	$new_tree->push(new_literal($1));
175	157					1132	substr $node->{content}, 0, $LAST_MATCH_END[0], ''; # This resets pos($node->{content}) as we want it to.
176							}
177	66151	100				289608	$new_tree->push($node) if $node->{content};
178	66151					233300	return $new_tree;
179							} elsif ($node->{type} eq 'html') {
180	14487					46218	return $node;
181							} else {
182	0					0	confess 'Unexpected node type in process_char_escaping: '.$node->{type};
183							}
184							}
185
186							# We find all the links in the tree.
187							#
188							# We are mostly implementing the recommended algorithm from
189							# https://spec.commonmark.org/0.31.2/#phase-2-inline-structure
190							# Except that we don’t do the inline parsing at this stage.
191							#
192							# Overall, this methods implement this whole section of the spec:
193							# https://spec.commonmark.org/0.30/#links
194							sub process_links {
195	44565			44565	0	134433	my ($that, $linkrefs, $tree) = @_;
196
197	44565					116304	my @open_link;
198	44565					98224	for (my $i = 0; $i < @{$tree->{children}}; $i++) {
	139067					481832
199	94502					181364	my $n = $tree->{children}[$i];
200	94502	100				260843	next if $n->{type} ne 'text';
201	67950					777026	while ($n->{content} =~ m/(?!?\[)\|\]/g) {
202	92099					407996	my @pos = ($i, $LAST_MATCH_START[0], $LAST_MATCH_END[0]);
203	92099	100				552131	if ($+{open}) {
204	41600	100				131814	my $type = $pos[2] - $pos[1] > 1 ? 'img' : 'link';
205	41600					458402	push @open_link, {type => $type, active => 1, pos => \@pos};
206							} else {
207	50499	100				327620	next unless @open_link;
208	30305					47977	my %open = %{pop @open_link};
	30305					143511
209	30305	100				163010	next unless $open{active};
210	30248					102244	my @text_span = ($open{pos}[0], $open{pos}[2], $pos[0], $pos[1]);
211	30248					73752	my $cur_pos = pos($n->{content});
212	30248					121535	my %target =
213							find_link_destination_and_title($that, $linkrefs, $tree, $pos[0], $pos[2], @text_span);
214	30248					116618	pos($n->{content}) = $cur_pos;
215	30248	100				445096	next unless %target;
216	2208					8086	my $text_tree = $tree->extract(@text_span);
217							my (undef, $dest_node_index) =
218	2208					18448	$tree->extract($open{pos}[0], $open{pos}[1], $open{pos}[0] + 1, 1);
219	2208					16055	my $link = new_link($text_tree, type => $open{type}, %target);
220	2208					9781	$tree->insert($dest_node_index, $link);
221
222	2208	100				23014	if ($open{type} eq 'link') {
223	1394					3843	for (@open_link) {
224	283	100				1296	$_->{active} = 0 if $_->{type} eq 'link';
225							}
226							}
227	2208					4712	$i = $dest_node_index;
228	2208					34196	last; # same as a next OUTER, but without the need to define the OUTER label.
229							}
230							}
231							}
232	44565					136699	return;
233							}
234
235							# @text_span is the span of the link definition text, used in case we have a
236							# collapsed link reference call.
237							sub find_link_destination_and_title {
238	30248			30248	0	97568	my ($that, $linkrefs, $tree, $child_start, $text_start, @text_span) = @_;
239							# We assume that the beginning of the link destination must be just after the
240							# link text and in the same child, as there can be no other constructs
241							# in-between.
242
243	30248					55071	my $cur_child = $child_start;
244	30248					64933	my $n = $tree->{children}[$cur_child];
245							confess 'Unexpected link destination search in a non-text element: '.$n->{type}
246	30248	50				97916	unless $n->{type} eq 'text';
247	30248					97285	pos($n->{content}) = $text_start;
248	30248					180302	$n->{content} =~ m/ \G (? [ \t\n]+ )? (?: (? \( ) \| (? \[\]? ) )? /x;
249	30248					104558	my @start = ($child_start, $text_start, $child_start, $LAST_MATCH_END[0]);
250
251	30248					165419	my $has_space = exists $+{space};
252	30248					73844	my $type;
253	30248	100				195364	if (exists $+{inline}) {
		100
254	3948					13339	$type = 'inline';
255							} elsif (exists $+{reference}) {
256	912	100				6390	if ($+{reference} eq '[') {
257	632					1806	$type = 'reference';
258							} else {
259	280					802	$type = 'collapsed';
260							}
261							} else {
262	25388					51046	$type = 'shortcut';
263							}
264
265	30248					176522	my $mode = $that->get_allow_spaces_in_links;
266	30248	100				75016	if ($has_space) {
267							# 'reference' mode is the mode that emulates Markdown.pl which is kind of
268							# weird and probably does not intend this exact behavior (at most one space
269							# then an optional new line and, if you have it, then any number of spaces).
270	2127	100	100			16254	if ( $mode eq 'reference'
			100
			100
271							&& ($type eq 'reference' \|\| $type eq 'collapsed')
272							&& $+{space} =~ m/^ ?(?:\n[ \t]*)?$/) {
273							# ok, do nothing
274							} else {
275							# We have forbidden spaces, so we treat this as a tentative shortcut link.
276	2118					4790	$type = 'shortcut';
277							}
278							}
279
280	30248	100				99176	if ($type eq 'inline') {
		100
281	3887					18917	my @target = parse_inline_link($tree, @start);
282	3887	100				20129	return @target if @target;
283							# pass-through intended if we can’t parse a valid target, we will try a
284							# shortcut link.
285							} elsif ($type eq 'reference') {
286	564					2733	my %target = parse_reference_link($that, $linkrefs, $tree, @start);
287	564	100				3583	return %target if exists $target{target};
288							# no pass-through here if this was a valid reference link syntax. This is
289							# not fully specified by the spec but matches what the reference
290							# implementation does.
291	519	100				3237	return if %target;
292							# Otherwise, pass-through.
293							}
294							# This is either a collapsed or a shortcut reference link (or something that
295							# might be one).
296	27809					129908	my $ref = $tree->span_to_source_text(@text_span, UNESCAPE_LITERAL);
297	27809	100				140268	$ref = normalize_label($ref) if $ref;
298	27809	100				95063	if (my $l = get_linkref($that, $linkrefs, $ref)) {
299	149	100				603	$tree->extract(@start) if $type eq 'collapsed';
300	149					293	return %{$l};
	149					1139
301							}
302	27660					125752	return;
303							}
304
305							sub parse_inline_link {
306	3887			3887	0	13379	my ($tree, @start) = @_; # ($child_start, $text_start, $child_start, $text_start + 1);
307							# @start points to before and after the '(' character opening the link.
308
309							# $cur_child is advanced through the tree while we parse the link destination
310							# and title, it always point to the node that we are currently looking into
311							# (the one containing the end of the element that was previously found).
312							# $n is the node at index $cur_child.
313	3887					11777	my $cur_child = $start[0];
314	3887					9755	my $n = $tree->{children}[$cur_child];
315
316	3887					12867	pos($n->{content}) = $start[3];
317	3887					31856	$n->{content} =~ m/\G[ \t]\n?[ \t]/;
318	3887					11497	my $search_start = $LAST_MATCH_END[0];
319
320							# TODO: first check if we have a destination between <>, that may have already
321							# been matched as an autolink or as a closing HTML tag :-(
322
323	3887					8728	my @target;
324	3887					8874	my $ok_to_have_title = 1;
325
326	3887					29435	my $has_bracket =
327							$tree->find_in_text(qr/
328
329							# We have this variable early because we may be filling it soon if the link
330							# destination was already parsed as an autolink or an html element.
331	3887					18921	my $target = '';
332
333	3887	100	100			53969	if ($has_bracket) {
		100	100
		100	66
334	44	100				317	if (my @end_target = $tree->find_in_text(qr/>/, $cur_child, $search_start + 1)) {
335	20					76	@target = ($cur_child, $search_start + 1, $end_target[0], $end_target[1]);
336	20	100				122	return if $tree->find_in_text(qr/<\|\n/, @target);
337							}
338							} elsif (
339							length($n->{content}) <= $search_start
340	320					4444	&& @{$tree->{children}} > $cur_child + 1
341							&& ( $tree->{children}[$cur_child + 1]{type} eq 'html'
342							\|\| $tree->{children}[$cur_child + 1]{type} eq 'link')
343							) {
344							# The element inside was already parsed as an autolink or an html element,
345							# we use it as-is for the link destination. However, we need at least one
346							# element after in the tree for this to be valid (otherwise we know that the
347							# syntax can’t be a real tree, so we return from here).
348	105	100				228	return if @{$tree->{children}} <= $cur_child + 2;
	105					529
349	98					466	@target = ($cur_child + 1, 0, $cur_child + 2, 0);
350	98					345	my $link_node = $tree->{children}[$cur_child + 1];
351	98	100				564	if ($link_node->{type} eq 'html') {
352	64					234	$target = $link_node->{content};
353	64					570	$target =~ s/^<\|>$//g;
354							} else {
355	34					156	$target = $link_node->{target};
356							}
357	98	100				473	return if $target =~ m/\n/; # No new lines in link targets are allowed.
358							} elsif (
359							my @end_target = $tree->find_in_text_with_balanced_content(
360							qr/$/, qr/$/, qr/[ [:cntrl:]]/,
361							$cur_child, $search_start)
362							) {
363	3037					11042	@target = ($cur_child, $search_start, $end_target[0], $end_target[1]);
364							}
365	3876	100				15869	if (@target) {
366							# We can’t extract the target just yet, because the parsing can still fail
367							# in which case we must not modify the tree.
368	3151					5648	$cur_child = $target[2];
369	3151					7843	$n = $tree->{children}[$cur_child];
370							# On the next line, [1] and not [2] because if there was a control character
371							# we will fail the whole method. So we restart the search before the end
372							# condition of the find... method above.
373	3151	100				13139	pos($n->{content}) = $target[3] + ($has_bracket ? 1 : 0);
374	3151					15169	$n->{content} =~ m/\G[ \t]\n?[ \t]/;
375	3151					9156	$search_start = $LAST_MATCH_END[0];
376	3151					12579	$ok_to_have_title = $LAST_MATCH_END[0] != $LAST_MATCH_START[0]; # target and title must be separated.
377							}
378
379							# The first character of the title must be ", ', or ( and so can’t be another
380							# inline construct. As such, using a normal regex is fine (and not an
381							# InlineTree method).
382	3876					12574	pos($n->{content}) = $search_start;
383	3876					8430	my @end_title;
384	3876	100				31759	if ($n->{content} =~ m/\G"/gc) {
		100
		100
385	42					358	@end_title = $tree->find_in_text(qr/"/, $cur_child, $search_start + 1);
386							} elsif ($n->{content} =~ m/\G'/gc) {
387	4					28	@end_title = $tree->find_in_text(qr/'/, $cur_child, $search_start + 1);
388							} elsif ($n->{content} =~ m/\G\(/gc) {
389	59					476	@end_title = $tree->find_balanced_in_text(qr/$/, qr/$/, $cur_child, $search_start + 1);
390							}
391	3876					32176	my @title;
392	3876	100				15258	if (@end_title) {
393	55	100				228	return unless $ok_to_have_title;
394	46					155	@title = ($cur_child, $search_start + 1, $end_title[0], $end_title[1]);
395	46					95	$cur_child = $end_title[0];
396	46					111	$n = $tree->{children}[$cur_child];
397	46					142	pos($n->{content}) = $end_title[2]; # This time, we look after the closing character.
398	46					240	$n->{content} =~ m/\G[ \t]\n?[ \t]/;
399	46					122	$search_start = $LAST_MATCH_END[0];
400							}
401
402							# TODO: call a new InlineTree method to turn (child, offset_at_end) into
403							# (child + 1, 0). This needs to be called also at the beginning of this
404							# method.
405	3867					9961	pos($n->{content}) = $search_start;
406	3867	100				20601	return unless $n->{content} =~ m/\G\)/;
407
408							# Now we have a valid title, we can start to rewrite the tree (beginning from
409							# the end, to not alter the node index before we touch them).
410							{
411	2014					3870	my @last_item = (@title, @target, @start);
	2014					8089
412							# We remove the spaces after the last item and also the closing paren.
413	2014					10741	$tree->extract($last_item[2], $last_item[3], $cur_child, $search_start + 1);
414							}
415
416	2014					4820	my $title;
417	2014	100				9413	if (@title) {
418	24					109	my $title_tree = $tree->extract(@title);
419	24					107	$title = $title_tree->to_source_text();
420	24					303	my @last_item = (@target, @start);
421	24					101	$tree->extract($last_item[2], $last_item[3], $title[0], $title[1]);
422							}
423
424	2014	50				5397	if (@target) {
425	2014					8168	my $target_tree = $tree->extract(@target);
426	2014	100				14387	$target = $target_tree->to_source_text() unless $target;
427	2014					34796	$tree->extract($start[2], $start[3], $target[0], $target[1]);
428							}
429
430	2014					8605	$tree->extract(@start);
431
432	2014	100				14188	return (target => $target, ($title ? (title => $title) : ()));
433							}
434
435							sub parse_reference_link {
436	564			564	0	2161	my ($that, $linkrefs, $tree, @start) = @_; # ($child_start, $text_start, $child_start, $text_start + 1);
437
438	564					1260	my $cur_child = $start[0];
439	564					1661	my $n = $tree->{children}[$cur_child];
440
441	564					1371	my $ref_start = $start[3];
442
443	564	100				4380	if (my @end_ref = $tree->find_in_text(qr/]/, $cur_child, $start[3])) {
444	425					2947	my $ref =
445							normalize_label($tree->span_to_source_text(@start[2, 3], @end_ref[0, 1], UNESCAPE_LITERAL));
446	425	100				4362	if (my $l = get_linkref($that, $linkrefs, $ref)) {
447	45					314	$tree->extract(@start[0, 1], @end_ref[0, 2]);
448	45					160	return %{$l};
	45					3416
449							} else {
450							# TODO: we should only return this if the span was indeed a valid
451							# reference link label (not longer than 1000 characters mostly).
452							# This is used to notice that we had a proper reference link syntax and
453							# not fallback to trying a shortcut link.
454	380					3094	return (ignored_valid_value => 1);
455							}
456							}
457	139					617	return;
458							}
459
460							# Returns a hashref with (title and dest) or undef.
461							sub get_linkref {
462	28234			28234	0	93808	my ($that, $linkrefs, $ref) = @_;
463	28234	100				147177	if (exists $linkrefs->{$ref}) {
		100
464	188					1031	return $linkrefs->{$ref};
465							} elsif (exists $that->{hooks}{resolve_link_ref}) {
466	7					45	return $that->{hooks}{resolve_link_ref}->($ref);
467							}
468	28039					98569	return;
469							}
470
471							# This methods remove line break at the beginning and end of lines (inside text
472							# nodes only), and add hard line breaks as required.
473							#
474							# $not_root is set when we recurse inside sub-tree, to indicate that the first
475							# and last node of the the tree are not, in fact, the beginning and and of the
476							# paragraph.
477							sub process_whitespaces {
478	46773			46773	0	135746	my ($that, $tree, $not_root) = @_;
479
480	46773					100160	for (my $i = 0; $i < @{$tree->{children}}; $i++) {
	144680					375077
481	97907					183555	my $n = $tree->{children}[$i];
482	97907	100				261785	process_whitespaces($that, $n->{subtree}, 1) if exists $n->{subtree};
483	97907	100				325594	next unless $n->{type} eq 'text';
484							# TODO: add tests for the fact that we don’t want hard break at the end of a
485							# paragraph.
486	69147					110420	my $re;
487	69147	100				258089	if ($that->get_two_spaces_hard_line_breaks) {
488	48529					206030	$re = qr/(?: {2,}\|\\)\n(?=.) */s;
489							} else {
490	20618					78478	$re = qr/\\\n(?=.) */s;
491							}
492	69147					740599	my @hard_breaks = split($re, $n->{content}, -1);
493	69147					259389	for (my $j = 0; $j < @hard_breaks; $j++) {
494							# $hard_breaks[$j] = '' unless defined($hard_breaks[$j]);
495	79280	100	100			515981	$hard_breaks[$j] =~ s/^ +// if !$not_root && $i == 0 && $j == 0;
			100
496	79280					280800	$hard_breaks[$j] =~ s/(\n\|\r) +/$1/g;
497							$hard_breaks[$j] =~ s/ +$//gm
498	79280	100	100			196753	if !$not_root && $i == $#{$tree->{children}} && $j == $#hard_breaks;
	76885		100			464480
499	79280	100				226964	if ($j == 0) {
500	69147					362741	$n->{content} = $hard_breaks[0];
501							} else {
502	10133					39299	$tree->insert($i + 1, new_html(' '), new_text("\n".$hard_breaks[$j]));
503	10133					57921	$i += 2;
504							}
505							}
506							}
507	46773					127547	return;
508							}
509
510							# This methods adds "style", that is it parses the emphasis (* and _) and also
511							# strike-through (~). To do so, we process each level of the tree independently
512							# because a style-run can’t cross another HTML construct (but it can span over
513							# it).
514							#
515							# We first find all the possible delimiters and insert them in the tree instead
516							# of their text. And then decide whether they are actually opening, closing, or
517							# neither.
518							#
519							# This methods implement all of:
520							# https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis
521							sub process_styles {
522	46773			46773	0	105907	my ($that, $tree) = @_;
523
524							# We recurse first as there are less children to iterate over than after.
525	46773					90648	for my $c (@{$tree->{children}}) {
	46773					131588
526	118954	100				314053	process_styles($that, $c->{subtree}) if exists $c->{subtree};
527							}
528
529							# TODO: only search for characters that are actually used by our current
530							# options.
531	46773					84771	my $current_child = 0;
532	46773					104880	my @delimiters;
533	46773					148641	my $delim = delim_characters($that);
534	46773					104081	my %max_delim_run_length = %{$that->get_inline_delimiters_max_run_length};
	46773					173601
535	46773					1162411	while (my @match = $tree->find_in_text(qr/([${delim}])\1*/, $current_child, 0)) {
536							# We extract the delimiter run into a new node, that will be at $index.
537	40877					180550	my ($delim_tree, $index) = $tree->extract($match[0], $match[1], $match[0], $match[2]);
538							# We use the type literal so that if we do nothing with the delimiter it
539							# will be rendered correctly. We keep track of which literals might be
540							# delimiters using the @delimiters array.
541	40877					138920	$delim_tree->{children}[0]{type} = 'literal';
542	40877					140991	$tree->insert($index, $delim_tree);
543	40877					129525	my $d = classify_delimiter($that, $tree, $index);
544	40877	100	100			173903	if (!exists $max_delim_run_length{$d->{delim}}
545							\|\| $d->{len} <= $max_delim_run_length{$d->{delim}}) {
546	40875					93229	push @delimiters, $d;
547							}
548	40877					481000	$current_child = $index + 1;
549							}
550
551	46773					217537	match_delimiters($that, $tree, @delimiters);
552	46773					189065	return;
553							}
554
555							# Decides whether the delimiter run at the given index in the tree can open or
556							# close emphasis (or any other style).
557							sub classify_delimiter {
558	40877			40877	0	96904	my ($that, $tree, $index) = @_;
559	40877					131058	my $pred_type = classify_flank($that, $tree, $index, 'left');
560	40877					120580	my $succ_type = classify_flank($that, $tree, $index, 'right');
561	40877		100			300593	my $is_left = $succ_type ne 'space' && ($succ_type ne 'punct' \|\| $pred_type ne 'none');
562	40877		100			184097	my $is_right = $pred_type ne 'space' && ($pred_type ne 'punct' \|\| $succ_type ne 'none');
563	40877					105173	my $len = length($tree->{children}[$index]{content});
564	40877					129471	my $delim = substr $tree->{children}[$index]{content}, 0, 1;
565	40877					71707	my $can_open = 0;
566	40877					70052	my $can_close = 0;
567							# This is implementing the first 8 rules (out of 17...) of
568							# https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis
569							# The rules are more complex for '_' than for '*' because it is assuming that
570							# underscores can appear within word. So we apply the star rules to all other
571							# delimiters (that is, we only check for underscore here). Currently our only
572							# other delimiter is '~'.
573							# TODO: add an option to decide which rule to apply per delimiter.
574	40877	100				105186	if ($delim eq '_') {
575	8503		100			50995	$can_open = $is_left && (!$is_right \|\| $pred_type eq 'punct');
576	8503		100			41771	$can_close = $is_right && (!$is_left \|\| $succ_type eq 'punct');
577							} else {
578	32374					52508	$can_open = $is_left;
579	32374					52517	$can_close = $is_right;
580							}
581							return {
582	40877					319421	index => $index,
583							can_open => $can_open,
584							can_close => $can_close,
585							len => $len,
586							delim => $delim,
587							orig_len => $len
588							};
589							}
590
591							# Computes whether the type of the "flank" of the delimiter run at the given
592							# index in the tree (looking either at the "left" or "right" side). This returns
593							# one of 'none', 'punct', or 'space' following the rule given in
594							# https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis.
595							# The purpose is to decide whether the delimiter run is left flanking and/or
596							# right flanking (that decision happens in classify_delimiter).
597							sub classify_flank {
598	81754			81754	0	200244	my ($that, $tree, $index, $side) = @_;
599	81754	100	100			251284	return 'space' if $index == 0 && $side eq 'left';
600	78398	100	100			121092	return 'space' if $index == $#{$tree->{children}} && $side eq 'right';
	78398					242719
601	75402	100				222158	my $node = $tree->{children}[$index + ($side eq 'left' ? -1 : 1)];
602							# If the node before the delimiters is not text, let’s assume that we had some
603							# punctuation characters that delimited it.
604	75402	100	100			263231	return 'punct' if $node->{type} ne 'text' && $node->{type} ne 'literal';
605	71519	100				289621	my $space_re = $side eq 'left' ? qr/\s$/u : qr/^\s/u;
606	71519	100				502217	return 'space' if $node->{content} =~ m/${space_re}/;
607	63533	100				215999	my $punct_re = $side eq 'left' ? qr/[\p{Punct}\p{Symbol}]$/u : qr/^[\p{Punct}\p{Symbol}]/u;
608	63533	100				437383	return 'punct' if $node->{content} =~ m/${punct_re}/;
609	23523					89141	return 'none';
610							}
611
612							# We match the pair of delimiters together as much as we can, following the
613							# rules of the CommonMark spec.
614							sub match_delimiters {
615	46773			46773	0	121395	my ($that, $tree, @delimiters) = @_;
616
617	46773					138162	for (my $close_index = 1; $close_index < @delimiters; $close_index++) {
618	21049					39414	my %c = %{$delimiters[$close_index]};
	21049					134928
619	21049	100				98123	next if !$c{can_close};
620							# We have a closing delimiter, now we backtrack and find the tighter match
621							# for this closing delimiter. This is because "foo _bar baz_" will only
622							# match the * (that comes first) but "foo bar*"" will match the second
623							# and third star, that are the tightest match. This is for rule 15 and 16 of
624							# https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis
625							# We also apply rules 9 and 10 here. Rules 1-8 have already been computed in
626							# classify_delimiter.
627							my $open_index =
628	15572	100	100	15572		108732	last_index { $_->{can_open} && $_->{delim} eq $c{delim} && valid_rules_9_10($_, \%c) }
629	16128					164883	@delimiters[0 .. $close_index - 1];
630							# TODO: here there are a lot of optimization that we could apply, based on
631							# the "process emphasis" method from the spec (like removing our closing
632							# delimiter if it is not an opener, and keeping track of the fact that
633							# we have no delimiter in the 0..close_index-1 range that can match a
634							# delimiter of the same type as %c).
635							# This does not seem very important for reasonable inputs. So, instead, we
636							# just check the next potential closer.
637	16128	100				110159	next if $open_index == -1;
638
639	10237					48891	$close_index = apply_delimiters($that, $tree, \@delimiters, $open_index, $close_index);
640							}
641
642	46773					110316	return;
643							}
644
645							# Given a tree, its delimiters and the index of two delimiters, rewrite the
646							# tree with the style applied by these delimiters (we’re assuming here that they
647							# are of a matching type).
648							#
649							# The delimiter may not be consumed entirely (but we will consume as much as
650							# possible).
651							sub apply_delimiters {
652	10237			10237	0	46032	my ($that, $tree, $delimiters, $open_index, $close_index) = @_;
653	10237					17050	my %o = %{$delimiters->[$open_index]};
	10237					61037
654	10237					35396	my %c = %{$delimiters->[$close_index]};
	10237					54613
655
656							# We rewrite the tree in between our two delimiters.
657							# TODO: maybe we need a splice method in InlineTree.
658	10237					36554	my @styled_subnodes = splice @{$tree->{children}}, $o{index} + 1, $c{index} - $o{index} - 1;
	10237					77512
659	10237					42713	my $styled_tree = Markdown::Perl::InlineTree->new();
660	10237					49374	$styled_tree->push(@styled_subnodes);
661							# With our current algorithm in match_delimiters we know that there is no
662							# reasons to recurse (because the closing delimiter here was the first
663							# closing delimiter with a matching opener.)
664							# my @styled_delimiters = map { $_->{index} -= $o{index} + 1; $_ } splice @{$delimiters},
665							# $open_index + 1, $close_index - $open_index - 1;
666							# match_delimiters($that, $styled_tree, @styled_delimiters);
667	10237					17548	splice @{$delimiters}, $open_index + 1, $close_index - $open_index - 1;
	10237					28915
668
669							# And now we rebuild our own tree around the new one.
670	10237					40054	my $len = min($o{len}, $c{len}, max_delim_length($that, $o{delim}));
671	10237					73560	my $styled_node = new_style($styled_tree, tag => delim_to_html_tag($that, $o{delim} x $len));
672	10237					28600	my $style_start = $o{index};
673	10237					18003	my $style_length = 2;
674	10237					20534	$close_index = $open_index + 1;
675	10237	100				29505	if ($len < $o{len}) {
676	337					1802	substr($tree->{children}[$o{index}]{content}, $o{len} - $len) = ''; ## no critic (ProhibitLvalueSubstr)
677	337					896	$delimiters->[$open_index]{len} -= $len;
678	337					688	$style_start++;
679	337					693	$style_length--;
680							} else {
681	9900					16966	splice @{$delimiters}, $open_index, 1;
	9900					24127
682	9900					19197	$close_index--;
683							}
684	10237	100				30963	if ($len < $c{len}) {
685							# The closing node is now just after the opening one.
686	409					2214	substr($tree->{children}[$o{index} + 1]{content}, $c{len} - $len) = ''; ## no critic (ProhibitLvalueSubstr)
687	409					969	$delimiters->[$close_index]{len} -= $len;
688	409					758	$style_length--;
689							} else {
690	9828					17603	splice @{$delimiters}, $close_index, 1; # We remove our closing delimiter.
	9828					21541
691							}
692	10237					26928	splice @{$tree->{children}}, $style_start, $style_length, $styled_node;
	10237					50915
693	10237					43663	for my $i ($close_index .. $#{$delimiters}) {
	10237					45841
694	7849					24694	$delimiters->[$i]{index} -= $c{index} - $o{index} - 2 + $style_length;
695							}
696	10237	100				91983	return $open_index - ($len < $o{len} ? 0 : 1);
697							}
698
699							# Returns true if the given delimiters can be an open/close pair without
700							# breaking rules 9 and 10 of
701							# https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis.
702							sub valid_rules_9_10 {
703	11436			11436	0	32703	my ($o, $c) = @_;
704							# TODO: BUG: there is a probable bug here in that the length of the delimiter
705							# to consider is not its current length but the length of the original span
706							# of which it was a part.
707							return
708							(!$o->{can_close} && !$c->{can_open})
709							\|\| (($o->{orig_len} + $c->{orig_len}) % 3 != 0)
710	11436		66			117054	\|\| ($o->{orig_len} % 3 == 0 && $c->{orig_len} % 3 == 0);
711							}
712
713							# TODO: use ^ and ˇ to represent sup and sub
714							# TODO: add support for MathML in some way.
715							sub delim_to_html_tag {
716	10237			10237	0	41499	my ($that, $delim) = @_;
717							# TODO: sort what to do if a given delimiter does not have a variant with
718							# two characters (we must backtrack somewhere in match_delimiters probably).
719							# TODO: add support for when the value in the map is ".foo"
720							# instead of just "foo".
721	10237					26321	return $that->get_inline_delimiters()->{$delim};
722							}
723
724							# Return the list of characters that can be delimiters (using the regex
725							# character class syntax).
726							sub delim_characters {
727	46773			46773	0	109908	my ($that) = @_;
728							# TODO: memo-ize this function inside $that (but clear it when the options
729							# change).
730	46773					77824	my @c = map { substr $_, 0, 1 } keys %{$that->get_inline_delimiters()};
	237842					639635
	46773					177318
731	46773					505709	return join('', uniq @c);
732							}
733
734							# Returns the max defined delim
735							sub max_delim_length {
736	10237			10237	0	26934	my ($that, $delim) = @_;
737							# TODO: memo-ize this function
738							# We assume that the $delim is in the map because it reached this point and
739							# also that the map can contains only delimiters not repeated or repeated
740							# once.
741	10237	100				53829	return exists $that->get_inline_delimiters()->{$delim x 2} ? 2 : 1;
742							}
743
744							sub create_extended_autolinks {
745	58738			58738	0	130875	my ($that, $n) = @_;
746	58738	100				193254	if ($n->{type} ne 'text') {
747	18330					52602	return $n;
748							}
749
750	40408					78219	my @nodes;
751
752							# TODO: technically we should forbid the presence of _ in the last two parts
753							# of the domain, according to the gfm spec.
754							## no critic (ProhibitComplexRegexes)
755	40408					387634	while (
756							$n->{content} =~ m/
757							(? ^ \| [ \t\n*_~\(] ) # The link must start after a whitespace or some specific delimiters.
758							(?
759							(?: (?https?:\/\/) \| www\. ) # It must start by a scheme or the string wwww.
760							[-_a-zA-Z0-9]+ (?: \. [-_a-zA-Z0-9]+ )* # Then there must be something that looks like a domain
761							(?: \/ [^ \t\n<]*? )? # Some characters are forbidden in the link.
762							)
763							[?!.,:_~] (?: [ \t\n<] \| $) # We remove some punctuation from the end of the link.
764							/x
765							## use critic
766							) {
767	1303					12790	my $url = $+{url};
768	1303					25069	my $match_start = $LAST_MATCH_START[0] + length($LAST_PAREN_MATCH{prefix});
769	1303					4429	my $match_end = $match_start + length($url);
770	1303					17699	my $has_scheme = exists $LAST_PAREN_MATCH{scheme};
771	1303	100				6003	if ($url =~ m/\)+$/) {
772	8					37	my $nb_final_closing_parens = $LAST_MATCH_END[0] - $LAST_MATCH_START[0];
773	8					23	my $open = 0;
774	8					94	() = $url =~ m/ $ (?{$open++}) \| $ (?{$open--}) /gx;
	8					58
	12					57
775	8					52	my $remove = min($nb_final_closing_parens, -$open);
776	8	100				46	if ($remove > 0) {
777	3					8	$match_end -= $remove;
778	3					13	substr $url, -$remove, $remove, '';
779							}
780							}
781	1303	100				4958	if ($url =~ m/\&[a-zA-Z0-9]+;$/) {
782	1					6	my $len = $LAST_MATCH_END[0] - $LAST_MATCH_START[0];
783	1					4	$match_end -= $len;
784	1					8	substr $url, -$len, $len, '';
785							}
786	1303	100				4373	if ($match_start > 0) {
787	726					3783	push @nodes, new_text(substr $n->{content}, 0, $match_start);
788							}
789	1303	100				6517	my $scheme = $has_scheme ? '' : $that->get_default_extended_autolinks_scheme.'://';
790	1303					7447	push @nodes,
791							new_link($url, type => 'autolink', target => $scheme.$url, debug => 'extended autolink');
792	1303					6677	$n = new_text(substr $n->{content}, $match_end);
793							}
794	40408	100				134336	push @nodes, $n if length($n->{content}) > 0;
795	40408					138354	return @nodes;
796							}
797
798							sub create_extended_email_autolinks {
799	59516			59516	0	115290	my ($that, $n) = @_;
800	59516	100				157521	if ($n->{type} ne 'text') {
801	19633					52610	return $n;
802							}
803
804	39883					63616	my @nodes;
805
806							# TODO: We’re not handling links with prefix protocol (mailto: or xmpp:) but
807							# these are not tested by the spec present in the current repo (although they
808							# are documented online).
809							## no critic (ProhibitComplexRegexes)
810	39883					129502	while (
811							$n->{content} =~ m/
812							(? ^ \| [ \t\n*_~\(] ) # The link must start after a whitespace or some specific delimiters.
813							(?
814							(? mailto:\/\/ )?
815							[-_.+a-zA-Z0-9]+ @ [-_a-zA-Z0-9]+ (?: \. [-_a-zA-Z0-9]+ )+ (?<= [a-zA-Z0-9] )
816							)
817							(?: [ \t\n.<] \| $ ) # We remove some punctuation from the end of the link.
818							/x
819							## use critic
820							) {
821	5					64	my $email = $+{email};
822	5					49	my $match_start = $LAST_MATCH_START[0] + length($LAST_PAREN_MATCH{prefix});
823	5					18	my $match_end = $match_start + length($email);
824	5					32	my $has_scheme = exists $LAST_PAREN_MATCH{scheme};
825	5	100				25	if ($match_start > 0) {
826	1					8	push @nodes, new_text(substr $n->{content}, 0, $match_start);
827							}
828	5	50				20	my $scheme = $has_scheme ? '' : 'mailto:';
829	5					42	push @nodes,
830							new_link(
831							$email,
832							type => 'autolink',
833							target => $scheme.$email,
834							debug => 'extended autolink');
835	5					37	$n = new_text(substr $n->{content}, $match_end);
836							}
837	39883	100				119384	push @nodes, $n if length($n->{content}) > 0;
838	39883					119584	return @nodes;
839							}
840
841							1;