File Coverage

blib/lib/Markdown/Perl/Inlines.pm
Criterion Covered Total %
statement 402 403 99.7
branch 189 194 97.4
condition 61 63 96.8
subroutine 37 37 100.0
pod 0 20 0.0
total 689 717 96.0


line stmt bran cond sub pod time code
1             # Package to process the inline structure of Markdown.
2              
3             package Markdown::Perl::Inlines;
4              
5 31     31   266 use strict;
  31         80  
  31         1996  
6 31     31   354 use warnings;
  31         90  
  31         2780  
7 31     31   231 use utf8;
  31         500  
  31         739  
8 31     31   1588 use feature ':5.24';
  31         96  
  31         8859  
9              
10 31     31   279 use Carp;
  31         67  
  31         2928  
11 31     31   203 use English;
  31         60  
  31         290  
12 31     31   48527 use List::MoreUtils 'first_index', 'last_index';
  31         72  
  31         409  
13 31     31   82500 use List::Util 'min';
  31         110  
  31         3350  
14 31     31   218 use List::Util 1.45 'uniq';
  31         5302  
  31         3173  
15 31     31   53693 use Markdown::Perl::InlineTree ':all';
  31         190  
  31         34123  
16 31     31   335 use Markdown::Perl::Util 'normalize_label';
  31         76  
  31         2184  
17 31     31   224 use Markdown::Perl::HTML 'remove_disallowed_tags';
  31         72  
  31         976216  
18              
19             our $VERSION = 0.01;
20              
21             # Everywhere here, $that is a Markdown::Perl instance that we carry everywhere
22             # because it contains the options that we are using.
23             sub render {
24 44565     44565 0 134819 my ($that, $linkrefs, @lines) = @_;
25              
26 44565         187085 my $text = join("\n", @lines);
27 44565         170264 my $tree = find_code_and_tag_runs($that, $text);
28              
29             # At this point, @runs contains only 'text', 'code', or 'link' elements, that
30             # can’t have any children (yet).
31              
32 44565     92575   360970 $tree->map(sub { process_char_escaping($that, $_) });
  92575         242637  
33              
34             # At this point, @runs can also contain 'literal' elements, that don’t have
35             # children.
36              
37 44565         279416 process_links($that, $linkrefs, $tree);
38              
39             # This removes the spurious white-space at the beginning and end of lines and
40             # also inserts hard line break as required.
41 44565         180617 process_whitespaces($that, $tree);
42              
43             # Now, there are more link elements and they can have children instead of
44             # content.
45              
46 44565 100       172248 if ($that->get_use_extended_autolinks) {
47 24308     58738   204132 $tree->map(sub { create_extended_autolinks($that, $_) });
  58738         162030  
48 24308     59516   202975 $tree->map(sub { create_extended_email_autolinks($that, $_) });
  59516         171131  
49             }
50              
51 44565         207861 process_styles($that, $tree);
52              
53             # At this point we have added the emphasis, strong emphasis, etc. in the tree.
54              
55             $tree->apply(
56             sub {
57 180326     180326   546336 $_->escape_content($that->get_html_escaped_characters,
58             $that->get_html_escaped_code_characters);
59 44565         364958 });
60              
61 44565         333352 my $out = $tree->render_html();
62              
63 44565         535839 return $out;
64             }
65              
66             # TODO: share these regex with Perl.pm (but note that we are not matching the
67             # open and close < > characters here).
68             my $html_tag_name_re = qr/[a-zA-Z][-a-zA-Z0-9]*/;
69             my $html_attribute_name_re = qr/[a-zA-Z_:][-a-zA-Z0-9_.:]*/;
70             my $html_space_re = qr/\n[ \t]*|[ \t][ \t]*\n?[ \t]*/; # Spaces, tabs, and up to one line ending.
71             my $opt_html_space_re = qr/[ \t]*\n?[ \t]*/; # Optional spaces.
72             my $html_attribute_value_re = qr/ [^ \t\n"'=<>`]+ | '[^']*' | "[^"]*" /x;
73             my $html_attribute_re =
74             qr/ ${html_space_re} ${html_attribute_name_re} (?: ${opt_html_space_re} = ${opt_html_space_re} ${html_attribute_value_re} )? /x;
75              
76             my $html_open_tag_re = qr/ ${html_tag_name_re} ${html_attribute_re}* ${opt_html_space_re} \/? /x;
77             my $html_close_tag_re = qr/ \/ ${html_tag_name_re} ${opt_html_space_re} /x;
78             my $html_comment_re = qr/!--|!---|!--.*?--/s;
79             my $html_proc_re = qr/\?.*?\?/s;
80             my $html_decl_re = qr/![a-zA-Z].*?/s;
81             my $html_cdata_re = qr/!\[CDATA\[.*?\]\]/s;
82              
83             my $html_tag_re =
84             qr/ ${html_open_tag_re} | ${html_close_tag_re} | ${html_comment_re} | ${html_proc_re} | ${html_decl_re} | ${html_cdata_re}/x;
85              
86             # Bug: there is a bug in that backslash escapes don’t work inside autolinks. But
87             # we can turn our autolinks into full-links later (where the escape should
88             # work). However, the spec does not test this corner case so we’re fine.
89              
90             sub find_code_and_tag_runs {
91 44565     44565 0 142591 my ($that, $text) = @_;
92              
93 44565         223453 my $tree = Markdown::Perl::InlineTree->new();
94              
95             # We match code-spans and autolinks first as they bind strongest. Raw HTML
96             # should be here too, but we don’t support it yet.
97             # https://spec.commonmark.org/0.30/#code-spans
98             # TODO: https://spec.commonmark.org/0.30/#autolinks
99             # TODO: https://spec.commonmark.org/0.30/#raw-html
100             # while ($text =~ m/(?\`+)|(?\<)/g) {
101             # We are manually handling the backslash escaping here because they are not
102             # interpreted inside code blocks. We will then process all the others
103             # afterward.
104 44565         897216 while ($text =~ m/(? (?:\\\\)*) (?: (?\`+) | < )/gx) {
105             my ($start_before, $start_after) =
106 50146         518960 ($LAST_MATCH_START[0] + length($+{backslashes}), $LAST_MATCH_END[0]);
107 50146 100       336467 if ($+{code}) {
108 18908         88163 my $fence = $+{code};
109             # We’re searching for a fence of the same length, without any backtick
110             # before or after.
111 18908 100       562232 if ($text =~ m/(?
112 3803         18520 my ($end_before, $end_after) = ($LAST_MATCH_START[0], $LAST_MATCH_END[0]);
113 3803 100       24068 $tree->push(new_text(substr($text, 0, $start_before)))
114             if $start_before > 0;
115 3803         41252 $tree->push(new_code(substr($text, $start_after, ($end_before - $start_after))));
116 3803         84465 substr $text, 0, $end_after, ''; # This resets pos($text) as we want it to.
117             } # in the else clause, pos($text) == $start_after (because of the /c modifier).
118             } else {
119             # We matched a single < character.
120 31238         125152 my $re = $that->get_autolinks_regex;
121 31238         107428 my $email_re = $that->get_autolinks_email_regex;
122             # We’re not using /gc in these to regex because this confuses the ProhibitUnusedCapture
123             # PerlCritic policy. Anyway, as we are always resetting pos() in case of
124             # successful match, it’s not important to update it.
125 31238 100       734421 if ($text =~ m/\G(?${re})>/) {
    100          
    100          
126 8130 100       53752 $tree->push(new_text(substr($text, 0, $start_before)))
127             if $start_before > 0;
128 8130         93943 $tree->push(new_link($+{link}, type => 'autolink', target => $+{link}));
129 8130         164084 substr $text, 0, $+[0], ''; # This resets pos($text) as we want it to.
130             } elsif ($text =~ m/\G(?${email_re})>/) {
131 4 50       19 $tree->push(new_text(substr($text, 0, $start_before)))
132             if $start_before > 0;
133             # TODO: we have a bug in to_source_text, that will assume that the
134             # mailto: was part of the source text.
135 4         69 $tree->push(new_link($+{link}, type => 'autolink', target => 'mailto:'.$+{link}));
136 4         41 substr $text, 0, $+[0], ''; # This resets pos($text) as we want it to.
137             } elsif ($text =~ m/\G(?:${html_tag_re})>/) {
138             # This resets pos($text) as we want it to.
139 14487 100       100846 $tree->push(new_text(substr($text, 0, $start_before, '')))
140             if $start_before > 0;
141 14487         73460 my $html = substr($text, 0, $LAST_MATCH_END[0] - $start_before, '');
142 14487         91192 remove_disallowed_tags($html, $that->get_disallowed_html_tags);
143 14487         51270 $tree->push(new_html($html));
144             }
145             }
146             }
147 44565 100       200815 $tree->push(new_text($text)) if $text;
148              
149 44565         149524 return $tree;
150             }
151              
152             sub process_char_escaping {
153 92575     92575 0 199848 my ($that, $node) = @_;
154              
155             # This is executed after
156 92575 100 100     550402 if ($node->{type} eq 'code' || $node->{type} eq 'link') {
    100          
    50          
157             # At this stage, link nodes are only autolinks, in which back-slash escaping
158             # is not processed.
159 11937         37833 return $node;
160             } elsif ($node->{type} eq 'text') {
161             # TODO: with the current code for map, this could just be @nodes.
162 66151         219547 my $new_tree = Markdown::Perl::InlineTree->new();
163             # TODO: make this regex configurable (the set of characters that can be
164             # escaped). Note that the regex has to be updated in Perl.pm unescape_char
165             # method too.
166 66151         328181 while ($node->{content} =~ m/\\(\p{PosixPunct})/g) {
167             # Literal parsing is OK here (even if we will later create label reference
168             # which distinguish between escaped and non-escaped literals) because we
169             # can always invert it (and it makes the rest of the processing be much
170             # simpler because we don’t need to check whether we have escaped text or
171             # not).
172 157 100       8063 $new_tree->push(new_text(substr $node->{content}, 0, $LAST_MATCH_START[0]))
173             if $LAST_MATCH_START[0] > 0;
174 157         562 $new_tree->push(new_literal($1));
175 157         1132 substr $node->{content}, 0, $LAST_MATCH_END[0], ''; # This resets pos($node->{content}) as we want it to.
176             }
177 66151 100       289608 $new_tree->push($node) if $node->{content};
178 66151         233300 return $new_tree;
179             } elsif ($node->{type} eq 'html') {
180 14487         46218 return $node;
181             } else {
182 0         0 confess 'Unexpected node type in process_char_escaping: '.$node->{type};
183             }
184             }
185              
186             # We find all the links in the tree.
187             #
188             # We are mostly implementing the recommended algorithm from
189             # https://spec.commonmark.org/0.31.2/#phase-2-inline-structure
190             # Except that we don’t do the inline parsing at this stage.
191             #
192             # Overall, this methods implement this whole section of the spec:
193             # https://spec.commonmark.org/0.30/#links
194             sub process_links {
195 44565     44565 0 134433 my ($that, $linkrefs, $tree) = @_;
196              
197 44565         116304 my @open_link;
198 44565         98224 for (my $i = 0; $i < @{$tree->{children}}; $i++) {
  139067         481832  
199 94502         181364 my $n = $tree->{children}[$i];
200 94502 100       260843 next if $n->{type} ne 'text';
201 67950         777026 while ($n->{content} =~ m/(?!?\[)|\]/g) {
202 92099         407996 my @pos = ($i, $LAST_MATCH_START[0], $LAST_MATCH_END[0]);
203 92099 100       552131 if ($+{open}) {
204 41600 100       131814 my $type = $pos[2] - $pos[1] > 1 ? 'img' : 'link';
205 41600         458402 push @open_link, {type => $type, active => 1, pos => \@pos};
206             } else {
207 50499 100       327620 next unless @open_link;
208 30305         47977 my %open = %{pop @open_link};
  30305         143511  
209 30305 100       163010 next unless $open{active};
210 30248         102244 my @text_span = ($open{pos}[0], $open{pos}[2], $pos[0], $pos[1]);
211 30248         73752 my $cur_pos = pos($n->{content});
212 30248         121535 my %target =
213             find_link_destination_and_title($that, $linkrefs, $tree, $pos[0], $pos[2], @text_span);
214 30248         116618 pos($n->{content}) = $cur_pos;
215 30248 100       445096 next unless %target;
216 2208         8086 my $text_tree = $tree->extract(@text_span);
217             my (undef, $dest_node_index) =
218 2208         18448 $tree->extract($open{pos}[0], $open{pos}[1], $open{pos}[0] + 1, 1);
219 2208         16055 my $link = new_link($text_tree, type => $open{type}, %target);
220 2208         9781 $tree->insert($dest_node_index, $link);
221              
222 2208 100       23014 if ($open{type} eq 'link') {
223 1394         3843 for (@open_link) {
224 283 100       1296 $_->{active} = 0 if $_->{type} eq 'link';
225             }
226             }
227 2208         4712 $i = $dest_node_index;
228 2208         34196 last; # same as a next OUTER, but without the need to define the OUTER label.
229             }
230             }
231             }
232 44565         136699 return;
233             }
234              
235             # @text_span is the span of the link definition text, used in case we have a
236             # collapsed link reference call.
237             sub find_link_destination_and_title {
238 30248     30248 0 97568 my ($that, $linkrefs, $tree, $child_start, $text_start, @text_span) = @_;
239             # We assume that the beginning of the link destination must be just after the
240             # link text and in the same child, as there can be no other constructs
241             # in-between.
242              
243 30248         55071 my $cur_child = $child_start;
244 30248         64933 my $n = $tree->{children}[$cur_child];
245             confess 'Unexpected link destination search in a non-text element: '.$n->{type}
246 30248 50       97916 unless $n->{type} eq 'text';
247 30248         97285 pos($n->{content}) = $text_start;
248 30248         180302 $n->{content} =~ m/ \G (? [ \t\n]+ )? (?: (? \( ) | (? \[\]? ) )? /x;
249 30248         104558 my @start = ($child_start, $text_start, $child_start, $LAST_MATCH_END[0]);
250              
251 30248         165419 my $has_space = exists $+{space};
252 30248         73844 my $type;
253 30248 100       195364 if (exists $+{inline}) {
    100          
254 3948         13339 $type = 'inline';
255             } elsif (exists $+{reference}) {
256 912 100       6390 if ($+{reference} eq '[') {
257 632         1806 $type = 'reference';
258             } else {
259 280         802 $type = 'collapsed';
260             }
261             } else {
262 25388         51046 $type = 'shortcut';
263             }
264              
265 30248         176522 my $mode = $that->get_allow_spaces_in_links;
266 30248 100       75016 if ($has_space) {
267             # 'reference' mode is the mode that emulates Markdown.pl which is kind of
268             # weird and probably does not intend this exact behavior (at most one space
269             # then an optional new line and, if you have it, then any number of spaces).
270 2127 100 100     16254 if ( $mode eq 'reference'
      100        
      100        
271             && ($type eq 'reference' || $type eq 'collapsed')
272             && $+{space} =~ m/^ ?(?:\n[ \t]*)?$/) {
273             # ok, do nothing
274             } else {
275             # We have forbidden spaces, so we treat this as a tentative shortcut link.
276 2118         4790 $type = 'shortcut';
277             }
278             }
279              
280 30248 100       99176 if ($type eq 'inline') {
    100          
281 3887         18917 my @target = parse_inline_link($tree, @start);
282 3887 100       20129 return @target if @target;
283             # pass-through intended if we can’t parse a valid target, we will try a
284             # shortcut link.
285             } elsif ($type eq 'reference') {
286 564         2733 my %target = parse_reference_link($that, $linkrefs, $tree, @start);
287 564 100       3583 return %target if exists $target{target};
288             # no pass-through here if this was a valid reference link syntax. This is
289             # not fully specified by the spec but matches what the reference
290             # implementation does.
291 519 100       3237 return if %target;
292             # Otherwise, pass-through.
293             }
294             # This is either a collapsed or a shortcut reference link (or something that
295             # might be one).
296 27809         129908 my $ref = $tree->span_to_source_text(@text_span, UNESCAPE_LITERAL);
297 27809 100       140268 $ref = normalize_label($ref) if $ref;
298 27809 100       95063 if (my $l = get_linkref($that, $linkrefs, $ref)) {
299 149 100       603 $tree->extract(@start) if $type eq 'collapsed';
300 149         293 return %{$l};
  149         1139  
301             }
302 27660         125752 return;
303             }
304              
305             sub parse_inline_link {
306 3887     3887 0 13379 my ($tree, @start) = @_; # ($child_start, $text_start, $child_start, $text_start + 1);
307             # @start points to before and after the '(' character opening the link.
308              
309             # $cur_child is advanced through the tree while we parse the link destination
310             # and title, it always point to the node that we are currently looking into
311             # (the one containing the end of the element that was previously found).
312             # $n is the node at index $cur_child.
313 3887         11777 my $cur_child = $start[0];
314 3887         9755 my $n = $tree->{children}[$cur_child];
315              
316 3887         12867 pos($n->{content}) = $start[3];
317 3887         31856 $n->{content} =~ m/\G[ \t]*\n?[ \t]*/;
318 3887         11497 my $search_start = $LAST_MATCH_END[0];
319              
320             # TODO: first check if we have a destination between <>, that may have already
321             # been matched as an autolink or as a closing HTML tag :-(
322              
323 3887         8728 my @target;
324 3887         8874 my $ok_to_have_title = 1;
325              
326 3887         29435 my $has_bracket =
327             $tree->find_in_text(qr/
328              
329             # We have this variable early because we may be filling it soon if the link
330             # destination was already parsed as an autolink or an html element.
331 3887         18921 my $target = '';
332              
333 3887 100 100     53969 if ($has_bracket) {
    100 100        
    100 66        
334 44 100       317 if (my @end_target = $tree->find_in_text(qr/>/, $cur_child, $search_start + 1)) {
335 20         76 @target = ($cur_child, $search_start + 1, $end_target[0], $end_target[1]);
336 20 100       122 return if $tree->find_in_text(qr/<|\n/, @target);
337             }
338             } elsif (
339             length($n->{content}) <= $search_start
340 320         4444 && @{$tree->{children}} > $cur_child + 1
341             && ( $tree->{children}[$cur_child + 1]{type} eq 'html'
342             || $tree->{children}[$cur_child + 1]{type} eq 'link')
343             ) {
344             # The element inside was already parsed as an autolink or an html element,
345             # we use it as-is for the link destination. However, we need at least one
346             # element after in the tree for this to be valid (otherwise we know that the
347             # syntax can’t be a real tree, so we return from here).
348 105 100       228 return if @{$tree->{children}} <= $cur_child + 2;
  105         529  
349 98         466 @target = ($cur_child + 1, 0, $cur_child + 2, 0);
350 98         345 my $link_node = $tree->{children}[$cur_child + 1];
351 98 100       564 if ($link_node->{type} eq 'html') {
352 64         234 $target = $link_node->{content};
353 64         570 $target =~ s/^<|>$//g;
354             } else {
355 34         156 $target = $link_node->{target};
356             }
357 98 100       473 return if $target =~ m/\n/; # No new lines in link targets are allowed.
358             } elsif (
359             my @end_target = $tree->find_in_text_with_balanced_content(
360             qr/\(/, qr/\)/, qr/[ [:cntrl:]]/,
361             $cur_child, $search_start)
362             ) {
363 3037         11042 @target = ($cur_child, $search_start, $end_target[0], $end_target[1]);
364             }
365 3876 100       15869 if (@target) {
366             # We can’t extract the target just yet, because the parsing can still fail
367             # in which case we must not modify the tree.
368 3151         5648 $cur_child = $target[2];
369 3151         7843 $n = $tree->{children}[$cur_child];
370             # On the next line, [1] and not [2] because if there was a control character
371             # we will fail the whole method. So we restart the search before the end
372             # condition of the find... method above.
373 3151 100       13139 pos($n->{content}) = $target[3] + ($has_bracket ? 1 : 0);
374 3151         15169 $n->{content} =~ m/\G[ \t]*\n?[ \t]*/;
375 3151         9156 $search_start = $LAST_MATCH_END[0];
376 3151         12579 $ok_to_have_title = $LAST_MATCH_END[0] != $LAST_MATCH_START[0]; # target and title must be separated.
377             }
378              
379             # The first character of the title must be ", ', or ( and so can’t be another
380             # inline construct. As such, using a normal regex is fine (and not an
381             # InlineTree method).
382 3876         12574 pos($n->{content}) = $search_start;
383 3876         8430 my @end_title;
384 3876 100       31759 if ($n->{content} =~ m/\G"/gc) {
    100          
    100          
385 42         358 @end_title = $tree->find_in_text(qr/"/, $cur_child, $search_start + 1);
386             } elsif ($n->{content} =~ m/\G'/gc) {
387 4         28 @end_title = $tree->find_in_text(qr/'/, $cur_child, $search_start + 1);
388             } elsif ($n->{content} =~ m/\G\(/gc) {
389 59         476 @end_title = $tree->find_balanced_in_text(qr/\(/, qr/\)/, $cur_child, $search_start + 1);
390             }
391 3876         32176 my @title;
392 3876 100       15258 if (@end_title) {
393 55 100       228 return unless $ok_to_have_title;
394 46         155 @title = ($cur_child, $search_start + 1, $end_title[0], $end_title[1]);
395 46         95 $cur_child = $end_title[0];
396 46         111 $n = $tree->{children}[$cur_child];
397 46         142 pos($n->{content}) = $end_title[2]; # This time, we look after the closing character.
398 46         240 $n->{content} =~ m/\G[ \t]*\n?[ \t]*/;
399 46         122 $search_start = $LAST_MATCH_END[0];
400             }
401              
402             # TODO: call a new InlineTree method to turn (child, offset_at_end) into
403             # (child + 1, 0). This needs to be called also at the beginning of this
404             # method.
405 3867         9961 pos($n->{content}) = $search_start;
406 3867 100       20601 return unless $n->{content} =~ m/\G\)/;
407              
408             # Now we have a valid title, we can start to rewrite the tree (beginning from
409             # the end, to not alter the node index before we touch them).
410             {
411 2014         3870 my @last_item = (@title, @target, @start);
  2014         8089  
412             # We remove the spaces after the last item and also the closing paren.
413 2014         10741 $tree->extract($last_item[2], $last_item[3], $cur_child, $search_start + 1);
414             }
415              
416 2014         4820 my $title;
417 2014 100       9413 if (@title) {
418 24         109 my $title_tree = $tree->extract(@title);
419 24         107 $title = $title_tree->to_source_text();
420 24         303 my @last_item = (@target, @start);
421 24         101 $tree->extract($last_item[2], $last_item[3], $title[0], $title[1]);
422             }
423              
424 2014 50       5397 if (@target) {
425 2014         8168 my $target_tree = $tree->extract(@target);
426 2014 100       14387 $target = $target_tree->to_source_text() unless $target;
427 2014         34796 $tree->extract($start[2], $start[3], $target[0], $target[1]);
428             }
429              
430 2014         8605 $tree->extract(@start);
431              
432 2014 100       14188 return (target => $target, ($title ? (title => $title) : ()));
433             }
434              
435             sub parse_reference_link {
436 564     564 0 2161 my ($that, $linkrefs, $tree, @start) = @_; # ($child_start, $text_start, $child_start, $text_start + 1);
437              
438 564         1260 my $cur_child = $start[0];
439 564         1661 my $n = $tree->{children}[$cur_child];
440              
441 564         1371 my $ref_start = $start[3];
442              
443 564 100       4380 if (my @end_ref = $tree->find_in_text(qr/]/, $cur_child, $start[3])) {
444 425         2947 my $ref =
445             normalize_label($tree->span_to_source_text(@start[2, 3], @end_ref[0, 1], UNESCAPE_LITERAL));
446 425 100       4362 if (my $l = get_linkref($that, $linkrefs, $ref)) {
447 45         314 $tree->extract(@start[0, 1], @end_ref[0, 2]);
448 45         160 return %{$l};
  45         3416  
449             } else {
450             # TODO: we should only return this if the span was indeed a valid
451             # reference link label (not longer than 1000 characters mostly).
452             # This is used to notice that we had a proper reference link syntax and
453             # not fallback to trying a shortcut link.
454 380         3094 return (ignored_valid_value => 1);
455             }
456             }
457 139         617 return;
458             }
459              
460             # Returns a hashref with (title and dest) or undef.
461             sub get_linkref {
462 28234     28234 0 93808 my ($that, $linkrefs, $ref) = @_;
463 28234 100       147177 if (exists $linkrefs->{$ref}) {
    100          
464 188         1031 return $linkrefs->{$ref};
465             } elsif (exists $that->{hooks}{resolve_link_ref}) {
466 7         45 return $that->{hooks}{resolve_link_ref}->($ref);
467             }
468 28039         98569 return;
469             }
470              
471             # This methods remove line break at the beginning and end of lines (inside text
472             # nodes only), and add hard line breaks as required.
473             #
474             # $not_root is set when we recurse inside sub-tree, to indicate that the first
475             # and last node of the the tree are not, in fact, the beginning and and of the
476             # paragraph.
477             sub process_whitespaces {
478 46773     46773 0 135746 my ($that, $tree, $not_root) = @_;
479              
480 46773         100160 for (my $i = 0; $i < @{$tree->{children}}; $i++) {
  144680         375077  
481 97907         183555 my $n = $tree->{children}[$i];
482 97907 100       261785 process_whitespaces($that, $n->{subtree}, 1) if exists $n->{subtree};
483 97907 100       325594 next unless $n->{type} eq 'text';
484             # TODO: add tests for the fact that we don’t want hard break at the end of a
485             # paragraph.
486 69147         110420 my $re;
487 69147 100       258089 if ($that->get_two_spaces_hard_line_breaks) {
488 48529         206030 $re = qr/(?: {2,}|\\)\n(?=.) */s;
489             } else {
490 20618         78478 $re = qr/\\\n(?=.) */s;
491             }
492 69147         740599 my @hard_breaks = split($re, $n->{content}, -1);
493 69147         259389 for (my $j = 0; $j < @hard_breaks; $j++) {
494             # $hard_breaks[$j] = '' unless defined($hard_breaks[$j]);
495 79280 100 100     515981 $hard_breaks[$j] =~ s/^ +// if !$not_root && $i == 0 && $j == 0;
      100        
496 79280         280800 $hard_breaks[$j] =~ s/(\n|\r) +/$1/g;
497             $hard_breaks[$j] =~ s/ +$//gm
498 79280 100 100     196753 if !$not_root && $i == $#{$tree->{children}} && $j == $#hard_breaks;
  76885   100     464480  
499 79280 100       226964 if ($j == 0) {
500 69147         362741 $n->{content} = $hard_breaks[0];
501             } else {
502 10133         39299 $tree->insert($i + 1, new_html('
'), new_text("\n".$hard_breaks[$j]));
503 10133         57921 $i += 2;
504             }
505             }
506             }
507 46773         127547 return;
508             }
509              
510             # This methods adds "style", that is it parses the emphasis (* and _) and also
511             # strike-through (~). To do so, we process each level of the tree independently
512             # because a style-run can’t cross another HTML construct (but it can span over
513             # it).
514             #
515             # We first find all the possible delimiters and insert them in the tree instead
516             # of their text. And then decide whether they are actually opening, closing, or
517             # neither.
518             #
519             # This methods implement all of:
520             # https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis
521             sub process_styles {
522 46773     46773 0 105907 my ($that, $tree) = @_;
523              
524             # We recurse first as there are less children to iterate over than after.
525 46773         90648 for my $c (@{$tree->{children}}) {
  46773         131588  
526 118954 100       314053 process_styles($that, $c->{subtree}) if exists $c->{subtree};
527             }
528              
529             # TODO: only search for characters that are actually used by our current
530             # options.
531 46773         84771 my $current_child = 0;
532 46773         104880 my @delimiters;
533 46773         148641 my $delim = delim_characters($that);
534 46773         104081 my %max_delim_run_length = %{$that->get_inline_delimiters_max_run_length};
  46773         173601  
535 46773         1162411 while (my @match = $tree->find_in_text(qr/([${delim}])\1*/, $current_child, 0)) {
536             # We extract the delimiter run into a new node, that will be at $index.
537 40877         180550 my ($delim_tree, $index) = $tree->extract($match[0], $match[1], $match[0], $match[2]);
538             # We use the type literal so that if we do nothing with the delimiter it
539             # will be rendered correctly. We keep track of which literals might be
540             # delimiters using the @delimiters array.
541 40877         138920 $delim_tree->{children}[0]{type} = 'literal';
542 40877         140991 $tree->insert($index, $delim_tree);
543 40877         129525 my $d = classify_delimiter($that, $tree, $index);
544 40877 100 100     173903 if (!exists $max_delim_run_length{$d->{delim}}
545             || $d->{len} <= $max_delim_run_length{$d->{delim}}) {
546 40875         93229 push @delimiters, $d;
547             }
548 40877         481000 $current_child = $index + 1;
549             }
550              
551 46773         217537 match_delimiters($that, $tree, @delimiters);
552 46773         189065 return;
553             }
554              
555             # Decides whether the delimiter run at the given index in the tree can open or
556             # close emphasis (or any other style).
557             sub classify_delimiter {
558 40877     40877 0 96904 my ($that, $tree, $index) = @_;
559 40877         131058 my $pred_type = classify_flank($that, $tree, $index, 'left');
560 40877         120580 my $succ_type = classify_flank($that, $tree, $index, 'right');
561 40877   100     300593 my $is_left = $succ_type ne 'space' && ($succ_type ne 'punct' || $pred_type ne 'none');
562 40877   100     184097 my $is_right = $pred_type ne 'space' && ($pred_type ne 'punct' || $succ_type ne 'none');
563 40877         105173 my $len = length($tree->{children}[$index]{content});
564 40877         129471 my $delim = substr $tree->{children}[$index]{content}, 0, 1;
565 40877         71707 my $can_open = 0;
566 40877         70052 my $can_close = 0;
567             # This is implementing the first 8 rules (out of 17...) of
568             # https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis
569             # The rules are more complex for '_' than for '*' because it is assuming that
570             # underscores can appear within word. So we apply the star rules to all other
571             # delimiters (that is, we only check for underscore here). Currently our only
572             # other delimiter is '~'.
573             # TODO: add an option to decide which rule to apply per delimiter.
574 40877 100       105186 if ($delim eq '_') {
575 8503   100     50995 $can_open = $is_left && (!$is_right || $pred_type eq 'punct');
576 8503   100     41771 $can_close = $is_right && (!$is_left || $succ_type eq 'punct');
577             } else {
578 32374         52508 $can_open = $is_left;
579 32374         52517 $can_close = $is_right;
580             }
581             return {
582 40877         319421 index => $index,
583             can_open => $can_open,
584             can_close => $can_close,
585             len => $len,
586             delim => $delim,
587             orig_len => $len
588             };
589             }
590              
591             # Computes whether the type of the "flank" of the delimiter run at the given
592             # index in the tree (looking either at the "left" or "right" side). This returns
593             # one of 'none', 'punct', or 'space' following the rule given in
594             # https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis.
595             # The purpose is to decide whether the delimiter run is left flanking and/or
596             # right flanking (that decision happens in classify_delimiter).
597             sub classify_flank {
598 81754     81754 0 200244 my ($that, $tree, $index, $side) = @_;
599 81754 100 100     251284 return 'space' if $index == 0 && $side eq 'left';
600 78398 100 100     121092 return 'space' if $index == $#{$tree->{children}} && $side eq 'right';
  78398         242719  
601 75402 100       222158 my $node = $tree->{children}[$index + ($side eq 'left' ? -1 : 1)];
602             # If the node before the delimiters is not text, let’s assume that we had some
603             # punctuation characters that delimited it.
604 75402 100 100     263231 return 'punct' if $node->{type} ne 'text' && $node->{type} ne 'literal';
605 71519 100       289621 my $space_re = $side eq 'left' ? qr/\s$/u : qr/^\s/u;
606 71519 100       502217 return 'space' if $node->{content} =~ m/${space_re}/;
607 63533 100       215999 my $punct_re = $side eq 'left' ? qr/[\p{Punct}\p{Symbol}]$/u : qr/^[\p{Punct}\p{Symbol}]/u;
608 63533 100       437383 return 'punct' if $node->{content} =~ m/${punct_re}/;
609 23523         89141 return 'none';
610             }
611              
612             # We match the pair of delimiters together as much as we can, following the
613             # rules of the CommonMark spec.
614             sub match_delimiters {
615 46773     46773 0 121395 my ($that, $tree, @delimiters) = @_;
616              
617 46773         138162 for (my $close_index = 1; $close_index < @delimiters; $close_index++) {
618 21049         39414 my %c = %{$delimiters[$close_index]};
  21049         134928  
619 21049 100       98123 next if !$c{can_close};
620             # We have a closing delimiter, now we backtrack and find the tighter match
621             # for this closing delimiter. This is because "*foo _bar* baz_" will only
622             # match the * (that comes first) but "*foo *bar*"" will match the second
623             # and third star, that are the tightest match. This is for rule 15 and 16 of
624             # https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis
625             # We also apply rules 9 and 10 here. Rules 1-8 have already been computed in
626             # classify_delimiter.
627             my $open_index =
628 15572 100 100 15572   108732 last_index { $_->{can_open} && $_->{delim} eq $c{delim} && valid_rules_9_10($_, \%c) }
629 16128         164883 @delimiters[0 .. $close_index - 1];
630             # TODO: here there are a lot of optimization that we could apply, based on
631             # the "process emphasis" method from the spec (like removing our closing
632             # delimiter if it is not an opener, and keeping track of the fact that
633             # we have no delimiter in the 0..close_index-1 range that can match a
634             # delimiter of the same type as %c).
635             # This does not seem very important for reasonable inputs. So, instead, we
636             # just check the next potential closer.
637 16128 100       110159 next if $open_index == -1;
638              
639 10237         48891 $close_index = apply_delimiters($that, $tree, \@delimiters, $open_index, $close_index);
640             }
641              
642 46773         110316 return;
643             }
644              
645             # Given a tree, its delimiters and the index of two delimiters, rewrite the
646             # tree with the style applied by these delimiters (we’re assuming here that they
647             # are of a matching type).
648             #
649             # The delimiter may not be consumed entirely (but we will consume as much as
650             # possible).
651             sub apply_delimiters {
652 10237     10237 0 46032 my ($that, $tree, $delimiters, $open_index, $close_index) = @_;
653 10237         17050 my %o = %{$delimiters->[$open_index]};
  10237         61037  
654 10237         35396 my %c = %{$delimiters->[$close_index]};
  10237         54613  
655              
656             # We rewrite the tree in between our two delimiters.
657             # TODO: maybe we need a splice method in InlineTree.
658 10237         36554 my @styled_subnodes = splice @{$tree->{children}}, $o{index} + 1, $c{index} - $o{index} - 1;
  10237         77512  
659 10237         42713 my $styled_tree = Markdown::Perl::InlineTree->new();
660 10237         49374 $styled_tree->push(@styled_subnodes);
661             # With our current algorithm in match_delimiters we know that there is no
662             # reasons to recurse (because the closing delimiter here was the first
663             # closing delimiter with a matching opener.)
664             # my @styled_delimiters = map { $_->{index} -= $o{index} + 1; $_ } splice @{$delimiters},
665             # $open_index + 1, $close_index - $open_index - 1;
666             # match_delimiters($that, $styled_tree, @styled_delimiters);
667 10237         17548 splice @{$delimiters}, $open_index + 1, $close_index - $open_index - 1;
  10237         28915  
668              
669             # And now we rebuild our own tree around the new one.
670 10237         40054 my $len = min($o{len}, $c{len}, max_delim_length($that, $o{delim}));
671 10237         73560 my $styled_node = new_style($styled_tree, tag => delim_to_html_tag($that, $o{delim} x $len));
672 10237         28600 my $style_start = $o{index};
673 10237         18003 my $style_length = 2;
674 10237         20534 $close_index = $open_index + 1;
675 10237 100       29505 if ($len < $o{len}) {
676 337         1802 substr($tree->{children}[$o{index}]{content}, $o{len} - $len) = ''; ## no critic (ProhibitLvalueSubstr)
677 337         896 $delimiters->[$open_index]{len} -= $len;
678 337         688 $style_start++;
679 337         693 $style_length--;
680             } else {
681 9900         16966 splice @{$delimiters}, $open_index, 1;
  9900         24127  
682 9900         19197 $close_index--;
683             }
684 10237 100       30963 if ($len < $c{len}) {
685             # The closing node is now just after the opening one.
686 409         2214 substr($tree->{children}[$o{index} + 1]{content}, $c{len} - $len) = ''; ## no critic (ProhibitLvalueSubstr)
687 409         969 $delimiters->[$close_index]{len} -= $len;
688 409         758 $style_length--;
689             } else {
690 9828         17603 splice @{$delimiters}, $close_index, 1; # We remove our closing delimiter.
  9828         21541  
691             }
692 10237         26928 splice @{$tree->{children}}, $style_start, $style_length, $styled_node;
  10237         50915  
693 10237         43663 for my $i ($close_index .. $#{$delimiters}) {
  10237         45841  
694 7849         24694 $delimiters->[$i]{index} -= $c{index} - $o{index} - 2 + $style_length;
695             }
696 10237 100       91983 return $open_index - ($len < $o{len} ? 0 : 1);
697             }
698              
699             # Returns true if the given delimiters can be an open/close pair without
700             # breaking rules 9 and 10 of
701             # https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis.
702             sub valid_rules_9_10 {
703 11436     11436 0 32703 my ($o, $c) = @_;
704             # TODO: BUG: there is a probable bug here in that the length of the delimiter
705             # to consider is not its current length but the length of the original span
706             # of which it was a part.
707             return
708             (!$o->{can_close} && !$c->{can_open})
709             || (($o->{orig_len} + $c->{orig_len}) % 3 != 0)
710 11436   66     117054 || ($o->{orig_len} % 3 == 0 && $c->{orig_len} % 3 == 0);
711             }
712              
713             # TODO: use ^ and ˇ to represent sup and sub
714             # TODO: add support for MathML in some way.
715             sub delim_to_html_tag {
716 10237     10237 0 41499 my ($that, $delim) = @_;
717             # TODO: sort what to do if a given delimiter does not have a variant with
718             # two characters (we must backtrack somewhere in match_delimiters probably).
719             # TODO: add support for when the value in the map is ".foo"
720             # instead of just "foo".
721 10237         26321 return $that->get_inline_delimiters()->{$delim};
722             }
723              
724             # Return the list of characters that can be delimiters (using the regex
725             # character class syntax).
726             sub delim_characters {
727 46773     46773 0 109908 my ($that) = @_;
728             # TODO: memo-ize this function inside $that (but clear it when the options
729             # change).
730 46773         77824 my @c = map { substr $_, 0, 1 } keys %{$that->get_inline_delimiters()};
  237842         639635  
  46773         177318  
731 46773         505709 return join('', uniq @c);
732             }
733              
734             # Returns the max defined delim
735             sub max_delim_length {
736 10237     10237 0 26934 my ($that, $delim) = @_;
737             # TODO: memo-ize this function
738             # We assume that the $delim is in the map because it reached this point and
739             # also that the map can contains only delimiters not repeated or repeated
740             # once.
741 10237 100       53829 return exists $that->get_inline_delimiters()->{$delim x 2} ? 2 : 1;
742             }
743              
744             sub create_extended_autolinks {
745 58738     58738 0 130875 my ($that, $n) = @_;
746 58738 100       193254 if ($n->{type} ne 'text') {
747 18330         52602 return $n;
748             }
749              
750 40408         78219 my @nodes;
751              
752             # TODO: technically we should forbid the presence of _ in the last two parts
753             # of the domain, according to the gfm spec.
754             ## no critic (ProhibitComplexRegexes)
755 40408         387634 while (
756             $n->{content} =~ m/
757             (? ^ | [ \t\n*_~\(] ) # The link must start after a whitespace or some specific delimiters.
758             (?
759             (?: (?https?:\/\/) | www\. ) # It must start by a scheme or the string wwww.
760             [-_a-zA-Z0-9]+ (?: \. [-_a-zA-Z0-9]+ )* # Then there must be something that looks like a domain
761             (?: \/ [^ \t\n<]*? )? # Some characters are forbidden in the link.
762             )
763             [?!.,:*_~]* (?: [ \t\n<] | $) # We remove some punctuation from the end of the link.
764             /x
765             ## use critic
766             ) {
767 1303         12790 my $url = $+{url};
768 1303         25069 my $match_start = $LAST_MATCH_START[0] + length($LAST_PAREN_MATCH{prefix});
769 1303         4429 my $match_end = $match_start + length($url);
770 1303         17699 my $has_scheme = exists $LAST_PAREN_MATCH{scheme};
771 1303 100       6003 if ($url =~ m/\)+$/) {
772 8         37 my $nb_final_closing_parens = $LAST_MATCH_END[0] - $LAST_MATCH_START[0];
773 8         23 my $open = 0;
774 8         94 () = $url =~ m/ \( (?{$open++}) | \) (?{$open--}) /gx;
  8         58  
  12         57  
775 8         52 my $remove = min($nb_final_closing_parens, -$open);
776 8 100       46 if ($remove > 0) {
777 3         8 $match_end -= $remove;
778 3         13 substr $url, -$remove, $remove, '';
779             }
780             }
781 1303 100       4958 if ($url =~ m/\&[a-zA-Z0-9]+;$/) {
782 1         6 my $len = $LAST_MATCH_END[0] - $LAST_MATCH_START[0];
783 1         4 $match_end -= $len;
784 1         8 substr $url, -$len, $len, '';
785             }
786 1303 100       4373 if ($match_start > 0) {
787 726         3783 push @nodes, new_text(substr $n->{content}, 0, $match_start);
788             }
789 1303 100       6517 my $scheme = $has_scheme ? '' : $that->get_default_extended_autolinks_scheme.'://';
790 1303         7447 push @nodes,
791             new_link($url, type => 'autolink', target => $scheme.$url, debug => 'extended autolink');
792 1303         6677 $n = new_text(substr $n->{content}, $match_end);
793             }
794 40408 100       134336 push @nodes, $n if length($n->{content}) > 0;
795 40408         138354 return @nodes;
796             }
797              
798             sub create_extended_email_autolinks {
799 59516     59516 0 115290 my ($that, $n) = @_;
800 59516 100       157521 if ($n->{type} ne 'text') {
801 19633         52610 return $n;
802             }
803              
804 39883         63616 my @nodes;
805              
806             # TODO: We’re not handling links with prefix protocol (mailto: or xmpp:) but
807             # these are not tested by the spec present in the current repo (although they
808             # are documented online).
809             ## no critic (ProhibitComplexRegexes)
810 39883         129502 while (
811             $n->{content} =~ m/
812             (? ^ | [ \t\n*_~\(] ) # The link must start after a whitespace or some specific delimiters.
813             (?
814             (? mailto:\/\/ )?
815             [-_.+a-zA-Z0-9]+ @ [-_a-zA-Z0-9]+ (?: \. [-_a-zA-Z0-9]+ )+ (?<= [a-zA-Z0-9] )
816             )
817             (?: [ \t\n.<] | $ ) # We remove some punctuation from the end of the link.
818             /x
819             ## use critic
820             ) {
821 5         64 my $email = $+{email};
822 5         49 my $match_start = $LAST_MATCH_START[0] + length($LAST_PAREN_MATCH{prefix});
823 5         18 my $match_end = $match_start + length($email);
824 5         32 my $has_scheme = exists $LAST_PAREN_MATCH{scheme};
825 5 100       25 if ($match_start > 0) {
826 1         8 push @nodes, new_text(substr $n->{content}, 0, $match_start);
827             }
828 5 50       20 my $scheme = $has_scheme ? '' : 'mailto:';
829 5         42 push @nodes,
830             new_link(
831             $email,
832             type => 'autolink',
833             target => $scheme.$email,
834             debug => 'extended autolink');
835 5         37 $n = new_text(substr $n->{content}, $match_end);
836             }
837 39883 100       119384 push @nodes, $n if length($n->{content}) > 0;
838 39883         119584 return @nodes;
839             }
840              
841             1;