File Coverage

blib/lib/Markdown/Perl/HTML.pm
Criterion Covered Total %
statement 60 62 96.7
branch 17 20 85.0
condition 7 10 70.0
subroutine 14 14 100.0
pod 0 7 0.0
total 98 113 86.7


line stmt bran cond sub pod time code
1             package Markdown::Perl::HTML;
2              
3 33     33   22598 use strict;
  33         72  
  33         1497  
4 33     33   226 use warnings;
  33         76  
  33         2199  
5 33     33   211 use utf8;
  33         59  
  33         224  
6 33     33   1442 use feature ':5.24';
  33         72  
  33         6548  
7              
8 33     33   345 use Carp 'cluck';
  33         72  
  33         2557  
9 33     33   946 use English;
  33         16484  
  33         275  
10 33     33   36001 use Exporter 'import';
  33         95  
  33         155334  
11              
12             our @EXPORT_OK = qw(decode_entities html_escape http_escape remove_disallowed_tags
13             parse_attributes);
14             our %EXPORT_TAGS = (all => \@EXPORT_OK);
15              
16             our $VERSION = 0.01;
17              
18             our %html_entities;
19              
20             # Parse the __DATA__ section into the %html_entities hash.
21             # This is done lazily to reduce the loading time of the program if the table is
22             # not needed.
23             #
24             # TODO: add a :full_init import option to the main package, to force the early
25             # execution of this method.
26             sub parse_entities {
27 16819 100   16819 0 291299 return if %html_entities;
28 5         30 local $INPUT_RECORD_SEPARATOR = undef;
29 5         119037 %html_entities = eval ; ## no critic (ProhibitStringyEval);
30 5 50       717 confess $EVAL_ERROR if $EVAL_ERROR;
31 5 50       122 close DATA or cluck 'Can’t close DATA file handle in lazy parsing of HTML entities mapping';
32 5         169 return;
33             }
34              
35             # Decode HTML entities like & in the given string. Do it in-place if called
36             # in void-context otherwise returns a modified string.
37             # This is mostly equivalent to HTML::Entities::decode_entities but with our more
38             # complete list of entities, probably slower, and handling the replacement of
39             # 0x0 characters by 0xfffd.
40              
41             # Currently, CounterClockwiseContourIntegral is the longest entity name, at 31
42             # char. We limit our search to 40 char to limit possible backtracking in case of
43             # failures (even if the spec does not say anything about it).
44             my $numeric_entity_re = qr/(?:\#(?: (? [0-9]{1,7} ) | [xX] (? [a-fA-F0-9]{1,6} ) ))/x;
45             my $entity_re = qr/& (?: ${numeric_entity_re} | (? [a-zA-Z0-9]{2,40} ) ) ; /x;
46              
47             sub convert_entity {
48             # the `|| 0xfffd` part is so that � is correctly replaced by the 0xfffd
49             # "replacement" character (as is being done on the whole string at the
50             # beginning of its processing).
51             return
52             exists $+{dec} ? chr($+{dec} || 0xfffd)
53             : exists $+{hex} ? chr(hex($+{hex}) || 0xfffd)
54 16849 100 100 16849 0 215348 : (parse_entities(), $html_entities{$+{named}}) // "&$+{named};";
    100 50        
      66        
55             }
56              
57             sub decode_entities {
58 114338 100   114338 0 2152889 return $_[0] =~ s/${entity_re}/&convert_entity/egr if defined wantarray;
  7         19  
59 114325         696971 $_[0] =~ s/${entity_re}/&convert_entity/eg;
  16842         54288  
60 114325         238621 return;
61             }
62              
63             # There are four characters that are escaped in the html output (although the
64             # spec never really says so because they claim that they care only about parsing).
65             my %char_to_html_entity = (
66             '"' => '"',
67             "'" => ''',
68             '&' => '&',
69             '<' => '<',
70             '>' => '>'
71             );
72              
73             # html_escape($str_to_escape, $character_class_to_use_for_escaping)
74             sub html_escape {
75             # TODO: I believe that this is re-compiled at each call because of the local
76             # variable. The regex should be built once in the Markdown::Perl object.
77 163299     163299 0 1133784 $_[0] =~ s/([$_[1]])/$char_to_html_entity{$1}/eg;
  81048         395159  
78 163299         371078 return;
79             }
80              
81             sub http_escape {
82 11650     11650 0 54564 utf8::encode($_[0]);
83             # TODO: there are more characters that need to be escaped here, but it’s not
84             # extremely clear which ones (the spec does a show of pretending to care only
85             # only about parsing, but the conformance test suite expect something quite
86             # specific. See: https://spec.commonmark.org/0.31.2/#example-502
87 11650         28194 $_[0] =~ s/([ \\\[\]\x80-\xff`""])/sprintf('%%%02X', ord($1))/ge;
  145         859  
88 11650         22609 return;
89             }
90              
91             # remove_disallowed_tags($html, \@tag_list)
92             # modify $html in place.
93             sub remove_disallowed_tags {
94             # TODO: check if GitHub also deactivates closing tags or only opening ones.
95             # Note: this is a dummy approach, based on the fact that this method is called
96             # only in the context of HTML blocks or inline HTML and so, something looking
97             # like an HTML tag is probably an HTML tag.
98 17235     17235 0 46219 for my $t (@{$_[1]}) {
  17235         54862  
99 32976         416123 $_[0] =~ s/<(\Q$t\E)/<$1/gi;
100             }
101 17235         47610 return;
102             }
103              
104             # This parses the attributes of the directive syntax. E.g. it receives the
105             # attribute part in the :name[content]{attributes} syntax, where each part can
106             # be missing.
107             sub parse_attributes {
108 13     13 0 62 my ($attributes) = @_;
109 13         47 my %out;
110 13         69 my $id_re = qr/(?: \# (? \w+ ))/x;
111 13         56 my $class_re = qr/(?: \. (? \w+ ))/x;
112 13         59 my $key_value_re = qr/(?: (? \w+ ) \s* = \s* (? \w+ ))/x;
113 13         75 pos($attributes) = 0; # so that it’s set even if we don’t match once to avoid a warning later.
114 13         705 while ($attributes =~ m/\G\s* (?: ${id_re} | ${class_re} | ${key_value_re} ) /gcx) {
115 16 100 66     336 if (defined ($+{id}) && !exists($out{id})) {
    100          
    50          
116 4         56 $out{id} = $+{id};
117             } elsif (defined ($+{class})) {
118 9         29 push(@{$out{class}}, $+{class});
  9         180  
119             } elsif (defined ($+{key})) {
120 3         9 push(@{$out{keys}}, [$+{key}, $+{value}]);
  3         83  
121             } else {
122 0         0 pos($attributes) = $LAST_MATCH_START[0];
123 0         0 last;
124             }
125             }
126 13 100       116 if ($attributes !~ m/\G \s* $/x) {
127 4         33 $out{junk} = substr($attributes, pos($attributes));
128             }
129              
130 13         117 return %out;
131             }
132              
133             1;
134              
135             __DATA__