File Coverage

blib/lib/Pod/Markdown.pm

Criterion	Covered	Total	%
statement	395	403	98.0
branch	124	134	92.5
condition	33	43	76.7
subroutine	99	101	98.0
pod	11	59	18.6
total	662	740	89.4

line	stmt	bran	cond	sub	pod	time	code
1							# vim: set ts=2 sts=2 sw=2 expandtab smarttab:
2							#
3							# This file is part of Pod-Markdown
4							#
5							# This software is copyright (c) 2011 by Randy Stauner.
6							#
7							# This is free software; you can redistribute it and/or modify it under
8							# the same terms as the Perl 5 programming language system itself.
9							#
10	29			29		2759549	use 5.008;
	29					114
11	29			29		154	use strict;
	29					83
	29					987
12	29			29		145	use warnings;
	29					63
	29					3481
13
14							package Pod::Markdown;
15							# git description: v3.300-3-gb01c18d
16
17							our $AUTHORITY = 'cpan:RWSTAUNER';
18							# ABSTRACT: Convert POD to Markdown
19							$Pod::Markdown::VERSION = '3.400';
20	29			29		19160	use Pod::Simple 3.27 (); # detected_encoding and keep_encoding bug fix
	29					1473089
	29					1291
21	29			29		15271	use parent qw(Pod::Simple::Methody);
	29					9380
	29					199
22	29			29		45411	use Encode ();
	29					524309
	29					1234
23	29			29		15265	use URI::Escape ();
	29					59880
	29					7997
24
25							our %URL_PREFIXES = (
26							sco => 'http://search.cpan.org/perldoc?',
27							metacpan => 'https://metacpan.org/pod/',
28							man => 'http://man.he.net/man',
29							);
30							$URL_PREFIXES{perldoc} = $URL_PREFIXES{metacpan};
31
32							our $LOCAL_MODULE_RE = qr/^(Local::\|\w?_\w)/;
33
34							## no critic
35							#{
36							our $HAS_HTML_ENTITIES;
37
38							# Stolen from Pod::Simple::XHTML 3.28. {{{
39
40							BEGIN {
41	29			29		2369	$HAS_HTML_ENTITIES = eval "require HTML::Entities; 1";
42							}
43
44							my %entities = (
45							q{>} => 'gt',
46							q{<} => 'lt',
47							q{'} => '#39',
48							q{"} => 'quot',
49							q{&} => 'amp',
50							);
51
52							sub encode_entities {
53	238			238	0	608	my $self = shift;
54	238					527	my $ents = $self->html_encode_chars;
55	238	100				898	return HTML::Entities::encode_entities( $_[0], $ents ) if $HAS_HTML_ENTITIES;
56	49	100				108	if (defined $ents) {
57	48					127	$ents =~ s,(?
58	48					241	$ents =~ s,(?
59							} else {
60	1					5	$ents = join '', keys %entities;
61							}
62	49					81	my $str = $_[0];
63	49		66			465	$str =~ s/([$ents])/'&' . ($entities{$1} \|\| sprintf '#x%X', ord $1) . ';'/ge;
	23					174
64	49					189	return $str;
65							}
66
67							# }}}
68
69							# Add a few very common ones for consistency and readability
70							# (in case HTML::Entities isn't available).
71							%entities = (
72							# Pod::Markdown has always required 5.8 so unicode_to_native will be available.
73							chr(utf8::unicode_to_native(0xA0)) => 'nbsp',
74							chr(utf8::unicode_to_native(0xA9)) => 'copy',
75							%entities
76							);
77
78							sub __entity_encode_ord_he {
79	12			12		188	my $chr = chr $_[0];
80							# Skip the encode_entities() logic and go straight for the substitution
81							# since we already have the char we know we want replaced.
82							# Both the hash and the function are documented as exportable (so should be reliable).
83	12		33			81	return $HTML::Entities::char2entity{ $chr } \|\| HTML::Entities::num_entity( $chr );
84							}
85							sub __entity_encode_ord_basic {
86	7		33	7		215	return '&' . ($entities{chr $_[0]} \|\| sprintf '#x%X', $_[0]) . ';';
87							}
88
89							# From HTML::Entities 3.69
90							my $DEFAULT_ENTITY_CHARS = '^\n\r\t !\#\$%\(-;=?-~';
91
92							#}
93							## use critic
94
95							# Use hash for simple "exists" check in `new` (much more accurate than `->can`).
96							my %attributes = map { ($_ => 1) }
97							qw(
98							html_encode_chars
99							match_encoding
100							output_encoding
101							local_module_re
102							local_module_url_prefix
103							man_url_prefix
104							perldoc_url_prefix
105							perldoc_fragment_format
106							markdown_fragment_format
107							include_meta_tags
108							escape_url
109							);
110
111
112							sub new {
113	317			317	1	6174593	my $class = shift;
114	317					1468	my %args = @_;
115
116	317					1727	my $self = $class->SUPER::new();
117	317					11946	$self->preserve_whitespace(1);
118	317					2386	$self->nbsp_for_S(1);
119	317					2395	$self->accept_targets(qw( markdown html ));
120	317					7490	$self->escape_url(1);
121
122							# Default to the global, but allow it to be overwritten in args.
123	317					2509	$self->local_module_re($LOCAL_MODULE_RE);
124
125	317					1954	for my $type ( qw( perldoc man ) ){
126	634					1145	my $attr = $type . '_url_prefix';
127							# Initialize to the alias.
128	634					1992	$self->$attr($type);
129							}
130
131	317					1314	while( my ($attr, $val) = each %args ){
132							# NOTE: Checking exists on a private var means we don't allow Pod::Simple
133							# attributes to be set this way. It's not very consistent, but I think
134							# I'm ok with that for now since there probably aren't many Pod::Simple attributes
135							# being changed besides `output_*` which feel like API rather than attributes.
136							# We'll see.
137							# This is currently backward-compatible as we previously just put the attribute
138							# into the private stash so anything unknown was silently ignored.
139							# We could open this up to `$self->can($attr)` in the future if that seems better
140							# but it tricked me when I was testing a misspelled attribute name
141							# which also happened to be a Pod::Simple method.
142
143	924	100				4570	exists $attributes{ $attr } or
144							# Provide a more descriptive message than "Can't locate object method".
145							warn("Unknown argument to ${class}->new(): '$attr'"), next;
146
147							# Call setter.
148	922					2359	$self->$attr($val);
149							}
150
151							# TODO: call from the setters.
152	317					2243	$self->_prepare_fragment_formats;
153
154	317	100	100			777	if(defined $self->local_module_url_prefix && $self->local_module_url_prefix eq '' && !$self->escape_url) {
			100
155	1					22	warn("turning escape_url with an empty local_module_url_prefix is not recommended as relative URLs could be confused for IPv6 addresses");
156							}
157
158	317					1337	return $self;
159							}
160
161							for my $type ( qw( local_module perldoc man ) ){
162							my $attr = $type . '_url_prefix';
163	29			29		290	no strict 'refs'; ## no critic
	29					71
	29					249403
164							*$attr = sub {
165	1373			1373		23546	my $self = shift;
166	1373	100				2602	if (@_) {
167	877		100			4480	$self->{$attr} = $URL_PREFIXES{ $_[0] } \|\| $_[0];
168							}
169							else {
170	496					1901	return $self->{$attr};
171							}
172							}
173							}
174
175							## Attribute accessors ##
176
177
178							sub html_encode_chars {
179	310			310	1	454	my $self = shift;
180	310					681	my $stash = $self->_private;
181
182							# Setter.
183	310	100				741	if( @_ ){
184							# If false ('', 0, undef), disable.
185	72	50				164	if( !$_[0] ){
186	0					0	delete $stash->{html_encode_chars};
187	0					0	$stash->{encode_amp} = 1;
188	0					0	$stash->{encode_lt} = 1;
189							}
190							else {
191							# Special case boolean '1' to mean "all".
192							# If we have HTML::Entities, undef will use the default.
193							# Without it, we need to specify so that we use the same list (for consistency).
194	72	100				361	$stash->{html_encode_chars} = $_[0] eq '1' ? ($HAS_HTML_ENTITIES ? undef : $DEFAULT_ENTITY_CHARS) : $_[0];
		100
195
196							# If [char] doesn't get encoded, we need to do it ourselves.
197	72					218	$stash->{encode_amp} = ($self->encode_entities('&') eq '&');
198	72					2265	$stash->{encode_lt} = ($self->encode_entities('<') eq '<');
199							}
200	72					1613	return;
201							}
202
203							# Getter.
204	238					529	return $stash->{html_encode_chars};
205							}
206
207
208							# I prefer ro-accessors (immutability!) but it can be confusing
209							# to not support the same API as other Pod::Simple classes.
210
211							# NOTE: Pod::Simple::_accessorize is not a documented public API.
212							# Skip any that have already been defined.
213							__PACKAGE__->_accessorize(grep { !__PACKAGE__->can($_) } keys %attributes);
214
215							sub _prepare_fragment_formats {
216	317			317		604	my ($self) = @_;
217
218	317					1273	foreach my $attr ( keys %attributes ){
219	3487	100				10392	next unless $attr =~ /^(\w+)_fragment_format/;
220	634					1515	my $type = $1;
221	634					1765	my $format = $self->$attr;
222
223							# If one was provided.
224	634	100				3871	if( $format ){
225							# If the attribute is a coderef just use it.
226	510	100				1467	next if ref($format) eq 'CODE';
227							}
228							# Else determine a default.
229							else {
230	124	100				324	if( $type eq 'perldoc' ){
231							# Choose a default that matches the destination url.
232	61					157	my $target = $self->perldoc_url_prefix;
233	61					137	foreach my $alias ( qw( metacpan sco ) ){
234	122	100				482	if( $target eq $URL_PREFIXES{ $alias } ){
235	52					101	$format = $alias;
236							}
237							}
238							# This seems like a reasonable fallback.
239	61		100			220	$format \|\|= 'pod_simple_xhtml';
240							}
241							else {
242	63					124	$format = $type;
243							}
244							}
245
246							# The short name should become a method name with the prefix prepended.
247	170					319	my $prefix = 'format_fragment_';
248	170					1061	$format =~ s/^$prefix//;
249	170	50				857	die "Unknown fragment format '$format'"
250							unless $self->can($prefix . $format);
251
252							# Save it.
253	170					461	$self->$attr($format);
254							}
255
256	317					974	return;
257							}
258
259							## Backward compatible API ##
260
261							# For backward compatibility (previously based on Pod::Parser):
262							# While Pod::Simple provides a parse_from_file() method
263							# it's primarily for Pod::Parser compatibility.
264							# When called without an output handle it will print to STDOUT
265							# but the old Pod::Markdown never printed to a handle
266							# so we don't want to start now.
267							sub parse_from_file {
268	11			11	1	23	my ($self, $file) = @_;
269
270							# TODO: Check that all dependent cpan modules use the Pod::Simple API
271							# then add a deprecation warning here to avoid confusion.
272
273	11					57	$self->output_string(\($self->{_as_markdown_}));
274	11					767	$self->parse_file($file);
275							}
276
277							# Likewise, though Pod::Simple doesn't define this method at all.
278	10			10	0	593	sub parse_from_filehandle { shift->parse_from_file(@_) }
279
280
281							## Document state ##
282
283							sub _private {
284	10186			10186		14870	my ($self) = @_;
285							$self->{_Pod_Markdown_} \|\|= {
286	10186		100			35434	indent => 0,
287							stacks => [],
288							states => [{}],
289							link => [],
290							encode_amp => 1,
291							encode_lt => 1,
292							};
293							}
294
295							sub _increase_indent {
296	67	50		67		154	++$_[0]->_private->{indent} >= 1
297							or die 'Invalid state: indent < 0';
298							}
299							sub _decrease_indent {
300	67	50		67		143	--$_[0]->_private->{indent} >= 0
301							or die 'Invalid state: indent < 0';
302							}
303
304							sub _new_stack {
305	1043			1043		2988	push @{ $_[0]->_private->{stacks} }, [];
	1043					2250
306	1043					1721	push @{ $_[0]->_private->{states} }, {};
	1043					1980
307							}
308
309							sub _last_string {
310	18			18		25	$_[0]->_private->{stacks}->[-1][-1];
311							}
312
313							sub _pop_stack_text {
314	740			740		1043	$_[0]->_private->{last_state} = pop @{ $_[0]->_private->{states} };
	740					1355
315	740					1125	join '', @{ pop @{ $_[0]->_private->{stacks} } };
	740					1058
	740					1314
316							}
317
318							sub _stack_state {
319	455			455		802	$_[0]->_private->{states}->[-1];
320							}
321
322							sub _save {
323	1404			1404		2789	my ($self, $text) = @_;
324	1404					1828	push @{ $self->_private->{stacks}->[-1] }, $text;
	1404					2335
325							# return $text; # DEBUG
326							}
327
328							sub _save_line {
329	518			518		1055	my ($self, $text) = @_;
330
331	518					1189	$text = $self->_process_escapes($text);
332
333	518					1598	$self->_save($text . $/);
334							}
335
336							# For paragraphs, etc.
337							sub _save_block {
338	453			453		812	my ($self, $text) = @_;
339
340	453					898	$self->_stack_state->{blocks}++;
341
342	453					1117	$self->_save_line($self->_indent($text) . $/);
343							}
344
345							## Formatting ##
346
347							sub _chomp_all {
348	345			345		735	my ($self, $text) = @_;
349	345					1465	1 while chomp $text;
350	345					1169	return $text;
351							}
352
353							sub _indent {
354	496			496		886	my ($self, $text) = @_;
355	496					809	my $level = $self->_private->{indent};
356
357	496	100				1069	if( $level ){
358	34					148	my $indent = ' ' x ($level * 4);
359
360							# Capture text on the line so that we don't indent blank lines (/^\x20{4}$/).
361	34					301	$text =~ s/^(.+)/$indent$1/mg;
362							}
363
364	496					2088	return $text;
365							}
366
367							# as_markdown() exists solely for backward compatibility
368							# and requires having called parse_from_file() to be useful.
369
370
371							sub as_markdown {
372	11			11	0	415	my ($parser, %args) = @_;
373	11					15	my @header;
374							# Don't add meta tags again if we've already done it.
375	11	100	100			34	if( $args{with_meta} && !$parser->include_meta_tags ){
376	3					19	@header = $parser->_build_markdown_head;
377							}
378	11					58	return join("\n" x 2, @header, $parser->{_as_markdown_});
379							}
380
381							sub _build_markdown_head {
382	9			9		10	my $parser = shift;
383	9					14	my $data = $parser->_private;
384							return join "\n",
385	12					53	map { qq![[meta \l$_="$data->{$_}"]]! }
386	9					16	grep { defined $data->{$_} }
	18					31
387							qw( Title Author );
388							}
389
390							## Escaping ##
391
392							# http://daringfireball.net/projects/markdown/syntax#backslash
393							# Markdown provides backslash escapes for the following characters:
394							#
395							# \ backslash
396							# ` backtick
397							# * asterisk
398							# _ underscore
399							# {} curly braces
400							# [] square brackets
401							# () parentheses
402							# # hash mark
403							# + plus sign
404							# - minus sign (hyphen)
405							# . dot
406							# ! exclamation mark
407
408							# However some of those only need to be escaped in certain places:
409							# * Backslashes do need to be escaped or they may be swallowed by markdown.
410							# * Word-surrounding characters (/[`_]/) do* need to be escaped mid-word
411							# because the markdown spec explicitly allows mid-word emphasis.
412							# * I don't actually see anything that curly braces are used for.
413							# * Escaping square brackets is enough to avoid accidentally
414							# creating links and images (so we don't need to escape plain parentheses
415							# or exclamation points as that would generate a lot of unnecesary noise).
416							# Parentheses will be escaped in urls (&end_L) to avoid premature termination.
417							# * We don't need a backslash for every hash mark or every hyphen found mid-word,
418							# just the ones that start a line (likewise for plus and dot).
419							# (Those will all be handled by _escape_paragraph_markdown).
420
421
422							# Backslash escape markdown characters to avoid having them interpreted.
423							sub _escape_inline_markdown {
424	484			484		894	local $_ = $_[1];
425
426							# s/([\\`*_{}\[\]()#+-.!])/\\$1/g; # See comments above.
427	484					1535	s/([\\`*_\[\]])/\\$1/g;
428
429	484					1203	return $_;
430							}
431
432							# Escape markdown characters that would be interpreted
433							# at the start of a line.
434							sub _escape_paragraph_markdown {
435	355			355		605	local $_ = $_[1];
436
437							# Escape headings, horizontal rules, (unordered) lists, and blockquotes.
438	355					1054	s/^([-+#>])/\\$1/mg;
439
440							# Markdown doesn't support backslash escapes for equal signs
441							# even though they can be used to underline a header.
442							# So use html to escape them to avoid having them interpreted.
443	355					681	s/^([=])/sprintf '&#x%x;', ord($1)/mge;
	1					10
444
445							# Escape the dots that would wrongfully create numbered lists.
446	355					674	s/^( (?:>\s+)? \d+ ) (\.\x20)/$1\\$2/xgm;
447
448	355					672	return $_;
449							}
450
451
452							# Additionally Markdown allows inline html so we need to escape things that look like it.
453							# While _some_ Markdown processors handle backslash-escaped html,
454							# [Daring Fireball](http://daringfireball.net/projects/markdown/syntax) states distinctly:
455							# > In HTML, there are two characters that demand special treatment: < and &...
456							# > If you want to use them as literal characters, you must escape them as entities, e.g. <, and &.
457
458							# It goes on to say:
459							# > Markdown allows you to use these characters naturally,
460							# > taking care of all the necessary escaping for you.
461							# > If you use an ampersand as part of an HTML entity,
462							# > it remains unchanged; otherwise it will be translated into &.
463							# > Similarly, because Markdown supports inline HTML,
464							# > if you use angle brackets as delimiters for HTML tags, Markdown will treat them as such.
465
466							# In order to only encode the occurrences that require it (something that
467							# could be interpreted as an entity) we escape them all so that we can do the
468							# suffix test later after the string is complete (since we don't know what
469							# strings might come after this one).
470
471							my %_escape =
472							map {
473							my ($k, $v) = split /:/;
474							# Put the "code" marker before the char instead of after so that it doesn't
475							# get confused as the $2 (which is what requires us to entity-encode it).
476							# ( "XsX", "XcsX", "X(c?)sX" )
477							my ($s, $code, $re) = map { "\0$_$v\0" } '', map { ($_, '('.$_.'?)') } 'c';
478
479							(
480							$k => $s,
481							$k.'_code' => $code,
482							$k.'_re' => qr/$re/,
483							)
484							}
485							qw( amp:& lt:< );
486
487							# Make the values of this private var available to the tests.
488	1			1		573081	sub __escape_sequences { %_escape }
489
490
491							# HTML-entity encode any characters configured by the user.
492							# If that doesn't include [&<] then we escape those chars so we can decide
493							# later if we will entity-encode them or put them back verbatim.
494							sub _encode_or_escape_entities {
495	484			484		701	my $self = $_[0];
496	484					940	my $stash = $self->_private;
497	484					855	local $_ = $_[1];
498
499	484	100				1110	if( $stash->{encode_amp} ){
		50
500	462	100				863	if( exists($stash->{html_encode_chars}) ){
501							# Escape all amps for later processing.
502							# Pass intermediate strings to entity encoder so that it doesn't
503							# process any of the characters of our escape sequences.
504							# Use -1 to get "as many fields as possible" so that we keep leading and
505							# trailing (possibly empty) fields.
506	38					183	$_ = join $_escape{amp}, map { $self->encode_entities($_) } split /&/, $_, -1;
	57					533
507							}
508							else {
509	424					926	s/&/$_escape{amp}/g;
510							}
511							}
512							elsif( exists($stash->{html_encode_chars}) ){
513	22					72	$_ = $self->encode_entities($_);
514							}
515
516							s/
517	484	100				3054	if $stash->{encode_lt};
518
519	484					1090	return $_;
520							}
521
522							# From Markdown.pl version 1.0.1 line 1172 (_DoAutoLinks).
523							my $EMAIL_MARKER = qr{
524							# < # Opening token is in parent regexp.
525							(?:mailto:)?
526							(
527							[-.\w]+
528							\@
529							[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
530							)
531							>
532							}x;
533
534							# Process any escapes we put in the text earlier,
535							# now that the text is complete (end of a block).
536							sub _process_escapes {
537	518			518		811	my $self = $_[0];
538	518					842	my $stash = $self->_private;
539	518					827	local $_ = $_[1];
540
541							# The patterns below are taken from Markdown.pl 1.0.1 _EncodeAmpsAndAngles().
542							# In this case we only want to encode the ones that Markdown won't.
543							# This is overkill but produces nicer looking text (less escaped entities).
544							# If it proves insufficent then we'll just encode them all.
545
546							# $1: If the escape was in a code sequence, simply replace the original.
547							# $2: If the unescaped value would be followed by characters
548							# that could be interpreted as html, entity-encode it.
549							# else: The character is safe to leave bare.
550
551							# Neither currently allows $2 to contain '0' so bool tests are sufficient.
552
553	518	100				1138	if( $stash->{encode_amp} ){
554							# Encode & if succeeded by chars that look like an html entity.
555	483					4800	s,$_escape{amp_re}((?:#?[xX]?(?:[0-9a-fA-F]+\|\w+);)?),
556	90	100				583	$1 ? '&'.$2 : $2 ? '&'.$2 : '&',egos;
		100
557							}
558
559	518	100				1802	if( $stash->{encode_lt} ){
560							# Encode < if succeeded by chars that look like an html tag.
561							# Leave email addresses () for Markdown to process.
562	483					5518	s,$_escape{lt_re}((?=$EMAIL_MARKER)\|(?:[a-z/?\$!])?),
563	72	100				414	$1 ? '<'.$2 : $2 ? '<'.$2 : '<',egos;
		100
564							}
565
566	518					1270	return $_;
567							}
568
569
570							## Parsing ##
571
572							sub handle_text {
573	567			567	0	7568	my $self = $_[0];
574	567					1069	my $stash = $self->_private;
575	567					1045	local $_ = $_[1];
576
577							# Unless we're in a code span, verbatim block, or formatted region.
578	567	100				1321	unless( $stash->{no_escape} ){
579
580							# We could, in theory, alter what gets escaped according to context
581							# (for example, escape square brackets (but not parens) inside link text).
582							# The markdown produced might look slightly nicer but either way you're
583							# at the whim of the markdown processor to interpret things correctly.
584							# For now just escape everything.
585
586							# Don't let literal characters be interpreted as markdown.
587	484					1083	$_ = $self->_escape_inline_markdown($_);
588
589							# Entity-encode (or escape for later processing) necessary/desired chars.
590	484					1155	$_ = $self->_encode_or_escape_entities($_);
591
592							}
593							# If this _is_ a code section, do limited/specific handling.
594							else {
595							# Always escaping these chars ensures that we won't mangle the text
596							# in the unlikely event that a sequence matching our escape occurred in the
597							# input stream (since we're going to escape it and then unescape it).
598	83	100				459	s/&/$_escape{amp_code}/gos if $stash->{encode_amp};
599	83	100				339	s/{encode_lt};
600							}
601
602	567					1231	$self->_save($_);
603							}
604
605							sub start_Document {
606	303			303	0	138868	my ($self) = @_;
607	303					925	$self->_new_stack;
608							}
609
610							sub end_Document {
611	303			303	0	34290	my ($self) = @_;
612	303					1013	$self->_check_search_header;
613	303					458	my $end = pop @{ $self->_private->{stacks} };
	303					512
614
615	303	50				487	@{ $self->_private->{stacks} } == 0
	303					497
616							or die 'Document ended with stacks remaining';
617
618	303					1579	my @doc = $self->_chomp_all(join('', @$end)) . $/;
619
620	303	100				1003	if( $self->include_meta_tags ){
621	6					39	unshift @doc, $self->_build_markdown_head, ($/ x 2);
622							}
623
624	303	100				2275	if( my $encoding = $self->_get_output_encoding ){
625							# Do the check outside the loop(s) for efficiency.
626	66	100				622	my $ents = $HAS_HTML_ENTITIES ? \&__entity_encode_ord_he : \&__entity_encode_ord_basic;
627							# Iterate indices to avoid copying large strings.
628	66					196	for my $i ( 0 .. $#doc ){
629	66					90	print { $self->{output_fh} } Encode::encode($encoding, $doc[$i], $ents);
	66					482
630							}
631							}
632							else {
633	237					1350	print { $self->{output_fh} } @doc;
	237					1330
634							}
635							}
636
637							sub _get_output_encoding {
638	303			303		544	my ($self) = @_;
639
640							# If 'match_encoding' is set we need to return an encoding.
641							# If pod has no =encoding, Pod::Simple will guess if it sees a high-bit char.
642							# If there are no high-bit chars, encoding is undef.
643							# Use detected_encoding() rather than encoding() because if Pod::Simple
644							# can't use whatever encoding was specified, we probably can't either.
645							# Fallback to 'o_e' if no match is found. This gives the user the choice,
646							# since otherwise there would be no reason to specify 'o_e' and 'm_e'.
647							# Fallback to UTF-8 since it is a reasonable default these days.
648
649	303	100	100			711	return $self->detected_encoding \|\| $self->output_encoding \|\| 'UTF-8'
650							if $self->match_encoding;
651
652							# If output encoding wasn't specified, return false.
653	289					1906	return $self->output_encoding;
654							}
655
656							## Blocks ##
657
658							sub start_Verbatim {
659	17			17	0	6396	my ($self) = @_;
660	17					60	$self->_new_stack;
661	17					64	$self->_private->{no_escape} = 1;
662							}
663
664							sub end_Verbatim {
665	17			17	0	175	my ($self) = @_;
666
667	17					106	my $text = $self->_pop_stack_text;
668
669	17					46	$text = $self->_indent_verbatim($text);
670
671	17					54	$self->_private->{no_escape} = 0;
672
673							# Verbatim blocks do not generate a separate "Para" event.
674	17					39	$self->_save_block($text);
675							}
676
677							sub _indent_verbatim {
678	17			17		33	my ($self, $paragraph) = @_;
679
680							# NOTE: Pod::Simple expands the tabs for us (as suggested by perlpodspec).
681							# Pod::Simple also has a 'strip_verbatim_indent' attribute
682							# but it doesn't sound like it gains us anything over this method.
683
684							# POD verbatim can start with any number of spaces (or tabs)
685							# markdown should be 4 spaces (or a tab)
686							# so indent any paragraphs so that all lines start with at least 4 spaces
687	17					62	my @lines = split /\n/, $paragraph;
688	17					31	my $indent = ' ' x 4;
689	17					39	foreach my $line ( @lines ){
690	32	100				123	next unless $line =~ m/^( +)/;
691							# find the smallest indentation
692	31	100				111	$indent = $1 if length($1) < length($indent);
693							}
694	17	100				55	if( (my $smallest = length($indent)) < 4 ){
695							# invert to get what needs to be prepended
696	11					31	$indent = ' ' x (4 - $smallest);
697
698							# Prepend indent to each line.
699							# We could check /\S/ to only indent non-blank lines,
700							# but it's backward compatible to respect the whitespace.
701							# Additionally, both pod and markdown say they ignore blank lines
702							# so it shouldn't hurt to leave them in.
703	11	100				27	$paragraph = join "\n", map { length($_) ? $indent . $_ : '' } @lines;
	23					90
704							}
705
706	17					47	return $paragraph;
707							}
708
709							sub start_Para {
710	355			355	0	187791	$_[0]->_new_stack;
711							}
712
713							sub end_Para {
714	355			355	0	4781	my ($self) = @_;
715	355					664	my $text = $self->_pop_stack_text;
716
717	355					920	$text = $self->_escape_paragraph_markdown($text);
718
719	355					799	$self->_save_block($text);
720							}
721
722
723							## Headings ##
724
725	51			51	0	8454	sub start_head1 { $_[0]->_start_head(1) }
726	51			51	0	602	sub end_head1 { $_[0]->_end_head(1) }
727	11			11	0	4820	sub start_head2 { $_[0]->_start_head(2) }
728	11			11	0	150	sub end_head2 { $_[0]->_end_head(2) }
729	1			1	0	508	sub start_head3 { $_[0]->_start_head(3) }
730	1			1	0	7	sub end_head3 { $_[0]->_end_head(3) }
731	0			0	0	0	sub start_head4 { $_[0]->_start_head(4) }
732	0			0	0	0	sub end_head4 { $_[0]->_end_head(4) }
733
734							sub _check_search_header {
735	366			366		574	my ($self) = @_;
736							# Save the text since the last heading if we want it for metadata.
737	366	100				725	if( my $last = $self->_private->{search_header} ){
738	18					25	for( $self->_private->{$last} = $self->_last_string ){
739	18					31	s/\A\s+//;
740	18					78	s/\s+\z//;
741							}
742							}
743							}
744							sub _start_head {
745	63			63		209	my ($self) = @_;
746	63					196	$self->_check_search_header;
747	63					125	$self->_new_stack;
748							}
749
750							sub _end_head {
751	63			63		122	my ($self, $num) = @_;
752	63					129	my $h = '#' x $num;
753
754	63					142	my $text = $self->_pop_stack_text;
755							$self->_private->{search_header} =
756	63	100				339	$text =~ /NAME/ ? 'Title'
		100
757							: $text =~ /AUTHOR/ ? 'Author'
758							: undef;
759
760							# TODO: option for $h suffix
761							# TODO: put a name="" if $self->{embed_anchor_tags}; ?
762							# https://rt.cpan.org/Ticket/Display.html?id=57776
763	63					249	$self->_save_block(join(' ', $h, $text));
764							}
765
766							## Lists ##
767
768							# With Pod::Simple->parse_empty_lists(1) there could be an over_empty event,
769							# but what would you do with that?
770
771							sub _start_list {
772	22			22		57	my ($self) = @_;
773	22					78	$self->_new_stack;
774
775							# Nest again b/c start_item will pop this to look for preceding content.
776	22					72	$self->_increase_indent;
777	22					54	$self->_new_stack;
778							}
779
780							sub _end_list {
781	22			22		51	my ($self) = @_;
782	22					67	$self->_handle_between_item_content;
783
784							# Finish the list.
785
786							# All the child elements should be blocks,
787							# but don't end with a double newline.
788	22					100	my $text = $self->_chomp_all($self->_pop_stack_text);
789
790	22					98	$_[0]->_save_line($text . $/);
791							}
792
793							sub _handle_between_item_content {
794	65			65		133	my ($self) = @_;
795
796							# This might be empty (if the list item had no additional content).
797	65	100				170	if( my $text = $self->_pop_stack_text ){
798							# Else it's a sub-document.
799							# If there are blocks we need to separate with blank lines.
800	21	100				51	if( $self->_private->{last_state}->{blocks} ){
801	16					61	$text = $/ . $text;
802							}
803							# If not, we can condense the text.
804							# In this module's history there was a patch contributed to specifically
805							# produce "huddled" lists so we'll try to maintain that functionality.
806							else {
807	5					18	$text = $self->_chomp_all($text) . $/;
808							}
809	21					112	$self->_save($text)
810							}
811
812	65					162	$self->_decrease_indent;
813							}
814
815							sub _start_item {
816	43			43		98	my ($self) = @_;
817	43					127	$self->_handle_between_item_content;
818	43					99	$self->_new_stack;
819							}
820
821							sub _end_item {
822	43			43		99	my ($self, $marker) = @_;
823	43					127	my $text = $self->_pop_stack_text;
824	43	100	66			440	$self->_save_line($self->_indent($marker .
825							# Add a space only if there is text after the marker.
826							(defined($text) && length($text) ? ' ' . $text : '')
827							));
828
829							# Store any possible contents in a new stack (like a sub-document).
830	43					144	$self->_increase_indent;
831	43					99	$self->_new_stack;
832							}
833
834	7			7	0	2572	sub start_over_bullet { $_[0]->_start_list }
835	7			7	0	1050	sub end_over_bullet { $_[0]->_end_list }
836
837	18			18	0	8647	sub start_item_bullet { $_[0]->_start_item }
838	18			18	0	258	sub end_item_bullet { $_[0]->_end_item('-') }
839
840	9			9	0	4151	sub start_over_number { $_[0]->_start_list }
841	9			9	0	2494	sub end_over_number { $_[0]->_end_list }
842
843							sub start_item_number {
844	18			18	0	9587	$_[0]->_start_item;
845							# It seems like this should be a stack,
846							# but from testing it appears that the corresponding 'end' event
847							# comes right after the text (it doesn't surround any embedded content).
848							# See t/nested.t which shows start-item, text, end-item, para, start-item....
849	18					52	$_[0]->_private->{item_number} = $_[1]->{number};
850							}
851
852							sub end_item_number {
853	18			18	0	249	my ($self) = @_;
854	18					42	$self->_end_item($self->_private->{item_number} . '.');
855							}
856
857							# Markdown doesn't support definition lists
858							# so do regular (unordered) lists with indented paragraphs.
859	6			6	0	2791	sub start_over_text { $_[0]->_start_list }
860	6			6	0	1273	sub end_over_text { $_[0]->_end_list }
861
862	7			7	0	3225	sub start_item_text { $_[0]->_start_item }
863	7			7	0	151	sub end_item_text { $_[0]->_end_item('-')}
864
865
866							# perlpodspec equates an over/back region with no items to a blockquote.
867							sub start_over_block {
868							# NOTE: We don't actually need to indent for a blockquote.
869	3			3	0	1743	$_[0]->_new_stack;
870							}
871
872							sub end_over_block {
873	3			3	0	844	my ($self) = @_;
874
875							# Chomp first to avoid prefixing a blank line with a `>`.
876	3					10	my $text = $self->_chomp_all($self->_pop_stack_text);
877
878							# NOTE: Paragraphs will already be escaped.
879
880							# I don't really like either of these implementations
881							# but the join/map/split seems a little better and benches a little faster.
882							# You would lose the last newline but we've already chomped.
883							#$text =~ s{^(.)?}{'>' . (defined($1) && length($1) ? (' ' . $1) : '')}mge;
884	3	100				52	$text = join $/, map { length($_) ? '> ' . $_ : '>' } split qr-$/-, $text;
	22					68
885
886	3					16	$self->_save_block($text);
887							}
888
889							## Custom Formats ##
890
891							sub start_for {
892	10			10	0	4211	my ($self, $attr) = @_;
893	10					34	$self->_new_stack;
894
895	10	100				40	if( $attr->{target} eq 'html' ){
896							# Use another stack so we can indent
897							# (not syntactily necessary but seems appropriate).
898	2					8	$self->_new_stack;
899	2					7	$self->_increase_indent;
900	2					5	$self->_private->{no_escape} = 1;
901							# Mark this so we know to undo it.
902	2					5	$self->_stack_state->{for_html} = 1;
903							}
904							}
905
906							sub end_for {
907	10			10	0	1051	my ($self) = @_;
908							# Data gets saved as a block (which will handle indents),
909							# but if there was html we'll alter this, so chomp and save a block again.
910	10					26	my $text = $self->_chomp_all($self->_pop_stack_text);
911
912	10	100				19	if( $self->_private->{last_state}->{for_html} ){
913	2					6	$self->_private->{no_escape} = 0;
914							# Save it to the next stack up so we can pop it again (we made two stacks).
915	2					8	$self->_save($text);
916	2					8	$self->_decrease_indent;
917	2					6	$text = join "\n", ' ', $self->_chomp_all($self->_pop_stack_text), ' ';
918							}
919
920	10					24	$self->_save_block($text);
921							}
922
923							# Data events will be emitted for any formatted regions that have been enabled
924							# (by default, `markdown` and `html`).
925
926							sub start_Data {
927	5			5	0	649	my ($self) = @_;
928							# TODO: limit this to what's in attr?
929	5					12	$self->_private->{no_escape}++;
930	5					11	$self->_new_stack;
931							}
932
933							sub end_Data {
934	5			5	0	52	my ($self) = @_;
935	5					12	my $text = $self->_pop_stack_text;
936	5					11	$self->_private->{no_escape}--;
937	5					12	$self->_save_block($text);
938							}
939
940							## Codes ##
941
942	96			96	0	1150	sub start_B { $_[0]->_save('**') }
943	48			48	0	726	sub end_B { $_[0]->start_B() }
944
945	46			46	0	526	sub start_I { $_[0]->_save('_') }
946	23			23	0	276	sub end_I { $_[0]->start_I() }
947
948							sub start_C {
949	61			61	0	1151	my ($self) = @_;
950	61					174	$self->_new_stack;
951	61					148	$self->_private->{no_escape}++;
952							}
953
954							sub end_C {
955	61			61	0	677	my ($self) = @_;
956	61					124	$self->_private->{no_escape}--;
957	61					178	$self->_save( $self->_wrap_code_span($self->_pop_stack_text) );
958							}
959
960							# Use code spans for F<>.
961	4			4	0	67	sub start_F { shift->start_C(@_); }
962	4			4	0	36	sub end_F { shift ->end_C(@_); }
963
964							sub start_L {
965	93			93	0	1475	my ($self, $flags) = @_;
966	93					233	$self->_new_stack;
967	93					126	push @{ $self->_private->{link} }, $flags;
	93					169
968							}
969
970							sub end_L {
971	93			93	0	997	my ($self) = @_;
972	93	50				109	my $flags = pop @{ $self->_private->{link} }
	93					160
973							or die 'Invalid state: link end with no link start';
974
975	93					144	my ($type, $to, $section) = @{$flags}{qw( type to section )};
	93					247
976
977	93	50				487	my $url = (
		100
		100
978							$type eq 'url' ? $to
979							: $type eq 'man' ? $self->format_man_url($to, $section)
980							: $type eq 'pod' ? $self->format_perldoc_url($to, $section)
981							: undef
982							);
983
984	93					254	my $text = $self->_pop_stack_text;
985
986							# NOTE: I don't think the perlpodspec says what to do with L<\|blah>
987							# but it seems like a blank link text just doesn't make sense
988	93	100				216	if( !length($text) ){
989	4	100				13	$text =
		100
990							$section ?
991							$to ? sprintf('"%s" in %s', $section, $to)
992							: ('"' . $section . '"')
993							: $to;
994							}
995
996							# FIXME: What does Pod::Simple::X?HTML do for this?
997							# if we don't know how to handle the url just print the pod back out
998	93	50				309	if (!$url) {
999	0					0	$self->_save(sprintf 'L<%s>', $flags->{raw});
1000	0					0	return;
1001							}
1002
1003							# In the url we need to escape quotes and parentheses lest markdown
1004							# break the url (cut it short and/or wrongfully interpret a title).
1005
1006							# Backslash escapes do not work for the space and quotes.
1007							# URL-encoding the space is not sufficient
1008							# (the quotes confuse some parsers and produce invalid html).
1009							# I've arbitratily chosen HTML encoding to hide them from markdown
1010							# while mangling the url as litle as possible.
1011	93					721	$url =~ s/([ '"])/sprintf '&#x%x;', ord($1)/ge;
	11					60
1012
1013							# We also need to double any backslashes that may be present
1014							# (lest they be swallowed up) and stop parens from breaking the url.
1015	93					484	$url =~ s/([\\()])/\\$1/g;
1016
1017							# TODO: put section name in title if not the same as $text
1018	93					467	$self->_save('[' . $text . '](' . $url . ')');
1019							}
1020
1021							sub start_X {
1022	1			1	0	14	$_[0]->_new_stack;
1023							}
1024
1025							sub end_X {
1026	1			1	0	8	my ($self) = @_;
1027	1					2	my $text = $self->_pop_stack_text;
1028							# TODO: mangle $text?
1029							# TODO: put if configured
1030							}
1031
1032							# A code span can be delimited by multiple backticks (and a space)
1033							# similar to pod codes (C<< code >>), so ensure we use a big enough
1034							# delimiter to not have it broken by embedded backticks.
1035							sub _wrap_code_span {
1036	61			61		175	my ($self, $arg) = @_;
1037	61					108	my $longest = 0;
1038	61					261	while( $arg =~ /([`]+)/g ){
1039	5					16	my $len = length($1);
1040	5	100				31	$longest = $len if $longest < $len;
1041							}
1042	61					180	my $delim = '`' x ($longest + 1);
1043	61	100				278	my $pad = $longest > 0 ? ' ' : '';
1044	61					230	return $delim . $pad . $arg . $pad . $delim;
1045							}
1046
1047							## Link Formatting (TODO: Move this to another module) ##
1048
1049
1050							sub format_man_url {
1051	6			6	1	13	my ($self, $to) = @_;
1052	6					14	my ($page, $part) = ($to =~ /^ ([^(]+) (?: $ (\S+) $ )? /x);
1053	6		50			98	return $self->man_url_prefix . ($part \|\| 1) . '/' . ($page \|\| $to);
			33
1054							}
1055
1056
1057							sub format_perldoc_url {
1058	70			70	1	154	my ($self, $name, $section) = @_;
1059
1060	70					155	my $url_prefix = $self->perldoc_url_prefix;
1061	70	100	100			283	if (
			100
1062							defined($name)
1063							&& $self->is_local_module($name)
1064							&& defined($self->local_module_url_prefix)
1065							) {
1066	5					10	$url_prefix = $self->local_module_url_prefix;
1067							}
1068
1069	70					1508	my $url = '';
1070
1071							# If the link is to another module (external link).
1072	70	100				138	if ($name) {
1073	59	100				1136	$url = $url_prefix . ($self->escape_url ? URI::Escape::uri_escape($name) : $name);
1074							}
1075
1076							# See https://rt.cpan.org/Ticket/Display.html?id=57776
1077							# for a discussion on the need to mangle the section.
1078	70	100				3495	if ($section){
1079
1080	42	100				923	my $method = $url
1081							# If we already have a prefix on the url it's external.
1082							? $self->perldoc_fragment_format
1083							# Else an internal link points to this markdown doc.
1084							: $self->markdown_fragment_format;
1085
1086	42	100				251	$method = 'format_fragment_' . $method
1087							unless ref($method);
1088
1089							{
1090							# Set topic to enable code refs to be simple.
1091	42					59	local $_ = $section;
	42					56
1092	42					138	$section = $self->$method($section);
1093							}
1094
1095	42					139	$url .= '#' . $section;
1096							}
1097
1098	70					355	return $url;
1099							}
1100
1101
1102							# TODO: simple, pandoc, etc?
1103
1104							sub format_fragment_markdown {
1105	3			3	1	43	my ($self, $section) = @_;
1106
1107							# If this is an internal link (to another section in this doc)
1108							# we can't be sure what the heading id's will look like
1109							# (it depends on what is rendering the markdown to html)
1110							# but we can try to follow popular conventions.
1111
1112							# http://johnmacfarlane.net/pandoc/demo/example9/pandocs-markdown.html#header-identifiers-in-html-latex-and-context
1113							#$section =~ s/(?![-_.])[[:punct:]]//g;
1114							#$section =~ s/\s+/-/g;
1115	3					6	$section =~ s/\W+/-/g;
1116	3					98	$section =~ s/-+$//;
1117	3					5	$section =~ s/^-+//;
1118	3					6	$section = lc $section;
1119							#$section =~ s/^[^a-z]+//;
1120	3		50			10	$section \|\|= 'section';
1121
1122	3					6	return $section;
1123							}
1124
1125
1126							{
1127							# From Pod::Simple::XHTML 3.28.
1128							# The strings gets passed through encode_entities() before idify().
1129							# If we don't do it here the substitutions below won't operate consistently.
1130
1131							sub format_fragment_pod_simple_xhtml {
1132	9			9	1	18	my ($self, $t) = @_;
1133
1134							# encode_entities {
1135							# We need to use the defaults in case html_encode_chars has been customized
1136							# (since the purpose is to match what external sources are doing).
1137
1138	9					20	local $self->_private->{html_encode_chars};
1139	9					34	$t = $self->encode_entities($t);
1140							# }
1141
1142							# idify {
1143	9					345	for ($t) {
1144	9					17	s/<[^>]+>//g; # Strip HTML.
1145	9					85	s/&[^;]+;//g; # Strip entities.
1146	9					71	s/^\s+//; s/\s+$//; # Strip white space.
	9					73
1147	9					77	s/^([^a-zA-Z]+)$/pod$1/; # Prepend "pod" if no valid chars.
1148	9					82	s/^[^a-zA-Z]+//; # First char must be a letter.
1149	9					70	s/[^-a-zA-Z0-9_:.]+/-/g; # All other chars must be valid.
1150	9					114	s/[-:.]+$//; # Strip trailing punctuation.
1151							}
1152							# }
1153
1154	9					77	return $t;
1155							}
1156							}
1157
1158
1159							sub format_fragment_pod_simple_html {
1160	9			9	1	24	my ($self, $section) = @_;
1161
1162							# From Pod::Simple::HTML 3.28.
1163
1164							# section_name_tidy {
1165	9					17	$section =~ s/^\s+//;
1166	9					115	$section =~ s/\s+$//;
1167	9					117	$section =~ tr/ /_/;
1168	9					189	$section =~ tr/\x00-\x1F\x80-\x9F//d if 'A' eq chr(65); # drop crazy characters
1169
1170							#$section = $self->unicode_escape_url($section);
1171							# unicode_escape_url {
1172	9					18	$section =~ s/([^\x00-\xFF])/'('.ord($1).')'/eg;
	0					0
1173							# Turn char 1234 into "(1234)"
1174							# }
1175
1176	9	50				24	$section = '_' unless length $section;
1177	9					21	return $section;
1178							# }
1179							}
1180
1181
1182	9			9	1	28	sub format_fragment_metacpan { shift->format_fragment_pod_simple_xhtml(@_); }
1183	9			9	1	27	sub format_fragment_sco { shift->format_fragment_pod_simple_html(@_); }
1184
1185
1186							sub is_local_module {
1187	59			59	1	118	my ($self, $name) = @_;
1188
1189	59					197	return ($name =~ $self->local_module_re);
1190							}
1191
1192							1;
1193
1194							__END__