File Coverage

blib/lib/HTML/WikiConverter/Markdown.pm
Criterion Covered Total %
statement 21 191 10.9
branch 0 84 0.0
condition 0 76 0.0
subroutine 7 35 20.0
pod 0 5 0.0
total 28 391 7.1


line stmt bran cond sub pod time code
1             package HTML::WikiConverter::Markdown;
2              
3 1     1   194362 use warnings;
  1         3  
  1         45  
4 1     1   6 use strict;
  1         3  
  1         41  
5              
6 1     1   6 use base 'HTML::WikiConverter';
  1         16  
  1         817  
7             our $VERSION = '0.06';
8              
9 1     1   1555 use Params::Validate ':types';
  1         17181  
  1         226  
10 1     1   1222 use HTML::Entities;
  1         9020  
  1         101  
11 1     1   964 use HTML::Tagset;
  1         1455  
  1         38  
12 1     1   969 use URI;
  1         9403  
  1         4602  
13              
14             =head1 NAME
15              
16             HTML::WikiConverter::Markdown - Convert HTML to Markdown markup
17              
18             =head1 SYNOPSIS
19              
20             use HTML::WikiConverter;
21             my $wc = new HTML::WikiConverter( dialect => 'Markdown' );
22             print $wc->html2wiki( $html );
23              
24             =head1 DESCRIPTION
25              
26             This module contains rules for converting HTML into Markdown markup.
27             You should not use this module directly; HTML::WikiConverter is the
28             entry point for html->wiki conversion (eg, see synopsis above). See
29             L<HTML::WikiConverter> for additional usage details.
30              
31             =head1 ATTRIBUTES
32              
33             In addition to the regular set of attributes recognized by the
34             L<HTML::WikiConverter> constructor, this dialect also accepts the
35             following attributes that can be passed into the C<new()>
36             constructor. See L<HTML::WikiConverter/ATTRIBUTES> for usage details.
37              
38             =head2 header_style
39              
40             Possible values: C<'setext'>, C<'atx'>. Determines how headers
41             C<h1>-C<h6> will be formatted. See
42             L<http://daringfireball.net/projects/markdown/syntax#header> for more
43             information. Default is C<'atx'>.
44              
45             =head2 link_style
46              
47             Possible values: C<'inline'>, C<'reference'>. See
48             L<http://daringfireball.net/projects/markdown/syntax#link> for more
49             information. Default is C<'reference'>.
50              
51             =head2 force_inline_anchor_links
52              
53             Possible values: C<0>, C<1>. If enabled, links to anchors within the
54             same page (eg, C<#some-anchor>) will always produce inline Markdown
55             links, even under reference link style. This might be useful for
56             building tables of contents. Default is C<0>.
57              
58             =head2 image_style
59              
60             Possible values: C<'inline'>, C<'reference'>. See
61             L<http://daringfireball.net/projects/markdown/syntax#img> for more
62             information. Default is C<'reference'>.
63              
64             =head2 image_tag_fallback
65              
66             Possible values: C<0>, C<1>. Markdown's image markup does not
67             support image dimensions. If C<image_tag_fallback> is enabled, image
68             tags containing dimensional information (ie, width or height) will not
69             be converted into Markdown markup. Rather, they will be roughly
70             preserved in their HTML form. Default is C<1>.
71              
72             =head2 unordered_list_style
73              
74             Possible values: C<'asterisk'>, C<'plus'>, C<'dash'>. See
75             L<http://daringfireball.net/projects/markdown/syntax#list> for more
76             information. Default is C<'asterisk'>.
77              
78             =head2 ordered_list_style
79              
80             Possible values: C<'sequential'>, C<'one-dot'>. Markdown supports two
81             different markups for ordered lists. Sequential style gives each list
82             element its own ordinal number (ie, C<'1.'>, C<'2.'>, C<'3.'>,
83             etc.). One-dot style gives each list element the same ordinal number
84             (ie, C<'1.'>). See
85             L<http://daringfireball.net/projects/markdown/syntax#list> for more
86             information. Default is C<'sequential'>.
87              
88             =head2 md_extra
89              
90             Possible values: C<0>, C<1>. Support MarkDown Extra
91             L<https://github.com/jmcmanus/pagedown-extra> extensions. Default is C<0>.
92              
93             This support is incomplete.
94              
95             # Tables Supported
96             # Fenced Code Blocks
97             # Definition Lists Supported
98             # Footnotes
99             # Special Attributes
100             # SmartyPants
101             # Newlines
102             # Strikethrough
103              
104             =cut
105              
106             sub attributes {
107             {
108 0     0 0   header_style => { default => 'atx', type => SCALAR },
109             link_style => { default => 'reference', type => SCALAR },
110             force_inline_anchor_links => { default => 0, type => BOOLEAN },
111             image_style => { default => 'reference', type => SCALAR },
112             image_tag_fallback => { default => 1, type => BOOLEAN },
113             unordered_list_style => { default => 'asterisk', type => SCALAR },
114             ordered_list_style => { default => 'sequential', type => SCALAR },
115              
116             # Requires H::WC version 0.67
117             p_strict => { default => 0 },
118             md_extra => { default => 0, type => BOOLEAN },
119             };
120             }
121              
122             my @common_attrs = qw/ id class lang dir title style /;
123              
124             # Hack to accommodate bug #43997 - multiline code blocks
125             my $code_block_prefix = 'bqwegsdfbwegadfbnsdfbahwerfgkjnsdfbohqw34t927398y5jnwrteb8uq34inb';
126              
127             sub rules {
128 0     0 0   my $self = shift;
129              
130 0           my %rules = (
131             hr => { replace => "\n\n----\n\n" },
132             br => { preserve => 1, empty => 1, end => \&_br_end },
133             p => {
134             block => 1,
135             trim => 'both',
136             line_format => 'multi',
137             line_prefix => \&_p_prefix
138             },
139             blockquote => {
140             block => 1,
141             trim => 'both',
142             line_format => 'blocks',
143             line_prefix => '> '
144             },
145             ul => { block => 1, line_format => 'multi' },
146             ol => { alias => 'ul' },
147             li => { start => \&_li_start, trim => 'leading' },
148              
149             i => { start => '_', end => '_' },
150             em => { alias => 'i' },
151             b => { start => '**', end => '**' },
152             strong => { alias => 'b' },
153             code => { start => \&_code_delim, end => \&_code_delim },
154             code_block => { line_prefix => $code_block_prefix, block => 1 },
155              
156             a => { replace => \&_link },
157             img => { replace => \&_img },
158             div => { block => 1, line_format => 'blocks' },
159             pre => { line_prefix => "\t", block => 1, line_format => 'blocks' },
160             );
161              
162 0           for ( 1 .. 6 ) {
163 0           $rules{"h$_"} = {
164             start => \&_header_start,
165             end => \&_header_end,
166             trim => 'both',
167             block => 1
168             };
169             }
170              
171 0           for (qw/ table caption tr th td /) {
172 0           $rules{$_} = {
173             preserve => 1,
174             attrs => \@common_attrs,
175             start => "\n",
176             end => "\n",
177             line_format => 'multi'
178             };
179             }
180              
181             # MarkDown Extra https://github.com/jmcmanus/pagedown-extra
182             # Tables Supported
183             # Fenced Code Blocks
184             # Definition Lists Supported
185             # Footnotes
186             # Special Attributes
187             # SmartyPants
188             # Newlines
189             # Strikethrough
190 0 0         if ( $self->md_extra ) {
191 0           $rules{dt} = { start => "\n", end => "\n", trim => 'both', };
192 0           $rules{dd} = { start => ": ", end => "\n", trim => 'both', };
193 0           delete( $rules{table} );
194 0           delete( $rules{caption} );
195 0           $rules{tr} = { start => "\n", end => "|", trim => 'both' };
196 0           $rules{td} = { start => "|", trim => 'both' };
197 0           $rules{th} = { alias => 'td' };
198 0           $rules{thead} = { end => "\n|-|", trim => 'both' };
199             # need an extra line here as some lists can contain complex block structures.
200 0           $rules{ul} = { block => 1, line_format => 'blocks' };
201 0           $rules{li} = { start => \&_li_start, blocks => 1, trim => 'leading' };
202             }
203              
204 0           return \%rules;
205             }
206              
207             sub _br_end {
208 0     0     my ( $self, $node, $rules ) = @_;
209 0           return "\n";
210             }
211              
212             sub _header_start {
213 0     0     my ( $self, $node, $rules ) = @_;
214 0 0         return '' unless $self->header_style eq 'atx';
215              
216 0           ( my $level = $node->tag ) =~ s/\D//g;
217 0 0         return unless $level;
218              
219 0           my $hr = ('#') x $level;
220 0           return "$hr ";
221             }
222              
223             sub _header_end {
224 0     0     my ( $self, $node, $rules ) = @_;
225 0           my $anchor = '';
226              
227 0 0         if ( $node->id() ) {
228 0           $anchor = "\t{#" . $node->id() . "}";
229             }
230              
231 0 0         return $anchor unless $self->header_style eq 'setext';
232 0           ( my $level = $node->tag ) =~ s/\D//g;
233 0 0         return $anchor unless $level;
234              
235 0 0         my $symbol = $level == 1 ? '=' : '-';
236 0           my $len = length $self->get_elem_contents($node);
237 0           my $bar = ($symbol) x $len;
238 0           return "$anchor\n$bar\n";
239             }
240              
241             sub _link {
242 0     0     my ( $self, $node, $rules ) = @_;
243              
244 0   0       my $url = $self->_abs2rel( $node->attr('href') || '' );
245 0           my $text = $self->get_elem_contents($node);
246 0   0       my $title = $node->attr('title') || '';
247              
248 0           my $style = $self->link_style;
249 0 0 0       $style = 'inline' if $url =~ /^\#/ and $self->force_inline_anchor_links;
250              
251 0 0         if ( $url eq $text ) {
    0          
    0          
252 0           return sprintf "<%s>", $url;
253             }
254             elsif ( $style eq 'inline' ) {
255 0 0         return sprintf "[%s](%s \"%s\")", $text, $url, $title if $title;
256 0           return sprintf "[%s](%s)", $text, $url;
257             }
258             elsif ( $style eq 'reference' ) {
259 0           my $id = $self->_next_link_id;
260 0           $self->_add_link( { id => $id, url => $url, title => $title } );
261 0           return sprintf "[%s][%s]", $text, $id;
262             }
263             }
264              
265 0     0     sub _last_link_id { shift->_attr( { internal => 1 }, _last_link_id => @_ ) }
266              
267 0     0     sub _links { shift->_attr( { internal => 1 }, _links => @_ ) }
268              
269             sub _next_link_id {
270 0     0     my $self = shift;
271 0   0       my $next_id = ( $self->_last_link_id || 0 ) + 1;
272 0           $self->_last_link_id($next_id);
273 0           return $next_id;
274             }
275              
276             sub _add_link {
277 0     0     my ( $self, $link ) = @_;
278 0 0         $self->_links( [ @{ $self->_links || [] }, $link ] );
  0            
279             }
280              
281             sub _img {
282 0     0     my ( $self, $node, $rules ) = @_;
283              
284 0   0       my $url = $node->attr('src') || '';
285 0   0       my $text = $node->attr('alt') || '';
286 0   0       my $title = $node->attr('title') || '';
287 0   0       my $width = $node->attr('width') || '';
288 0   0       my $height = $node->attr('height') || '';
289              
290 0 0 0       if ( $width || $height and $self->image_tag_fallback ) {
    0 0        
    0          
291 0           return "<img " . $self->get_attr_str( $node, qw/ src width height alt /, @common_attrs ) . " />";
292             }
293             elsif ( $self->image_style eq 'inline' ) {
294 0 0         return sprintf "![%s](%s \"%s\")", $text, $url, $title if $title;
295 0           return sprintf "![%s](%s)", $text, $url;
296             }
297             elsif ( $self->image_style eq 'reference' ) {
298 0           my $id = $self->_next_link_id;
299 0           $self->_add_link( { id => $id, url => $url, title => $title } );
300 0           return sprintf "![%s][%s]", $text, $id;
301             }
302             }
303              
304             sub _li_start {
305 0     0     my ( $self, $node, $rules ) = @_;
306 0           my @parent_lists = $node->look_up( _tag => qr/ul|ol/ );
307              
308 0           my $prefix = (' ') x ( @parent_lists - 1 );
309              
310 0           my $bullet = '';
311 0 0 0       $bullet = $self->_ul_li_start if $node->parent and $node->parent->tag eq 'ul';
312 0 0 0       $bullet = $self->_ol_li_start( $node->parent )
313             if $node->parent and $node->parent->tag eq 'ol';
314 0           return "\n$prefix$bullet ";
315             }
316              
317             sub _ul_li_start {
318 0     0     my $self = shift;
319 0           my $style = $self->unordered_list_style;
320 0 0         return '*' if $style eq 'asterisk';
321 0 0         return '+' if $style eq 'plus';
322 0 0         return '-' if $style eq 'dash';
323 0           die "no such unordered list style: '$style'";
324             }
325              
326             my %ol_count = ();
327              
328             sub _ol_li_start {
329 0     0     my ( $self, $ol ) = @_;
330 0           my $style = $self->ordered_list_style;
331              
332 0 0         if ( $style eq 'one-dot' ) {
    0          
333 0           return '1.';
334             }
335             elsif ( $style eq 'sequential' ) {
336 0           my $count = ++$ol_count{$ol};
337 0           return "$count.";
338             }
339             else {
340 0           die "no such ordered list style: $style";
341             }
342             }
343              
344             sub _p_prefix {
345 0     0     my ( $wc, $node, $rules ) = @_;
346 0 0         return $node->look_up( _tag => 'li' ) ? ' ' : '';
347             }
348              
349             sub preprocess_node {
350 0     0 0   my ( $self, $node ) = @_;
351 0 0 0       return unless $node->tag and $node->parent and $node->parent->tag;
      0        
352              
353 0 0         if ( $node->tag eq 'blockquote' ) {
    0          
354 0           my @non_phrasal_children = grep { !$self->_is_phrase_tag( $_->tag ) } $node->content_list;
  0            
355 0 0         unless (@non_phrasal_children)
356             { # ie, we have things like <blockquote>blah blah blah</blockquote>, without a <p> or something
357 0           $self->_envelop_children( $node, HTML::Element->new('p') );
358             }
359             }
360             elsif ( $node->tag eq '~text' ) {
361 0           $self->_escape_text($node);
362              
363             # bug #43998
364 0 0 0       $self->_decode_entities_in_code($node)
365             if $node->parent->tag eq 'code'
366             or $node->parent->tag eq 'code_block';
367             }
368             }
369              
370             sub preprocess_tree {
371 0     0 0   my ( $self, $root ) = @_;
372 0           foreach my $node ( $root->descendants ) {
373              
374             # bug #43997 - multiline code blocks
375 0 0         if ( $self->_text_is_within_code_pre($node) ) {
376 0           $self->_convert_to_code_block($node);
377             }
378             }
379             }
380              
381             sub _text_is_within_code_pre {
382 0     0     my ( $self, $node ) = @_;
383 0 0 0       return unless $node->parent->parent and $node->parent->parent->tag;
384              
385             # Must be <code><pre>...</pre></code> (or <pre><code>...</code></pre>)
386 0   0       my $code_pre = $node->parent->tag eq 'code' && $node->parent->parent->tag eq 'pre';
387 0   0       my $pre_code = $node->parent->tag eq 'pre' && $node->parent->parent->tag eq 'code';
388 0 0 0       return unless $code_pre or $pre_code;
389              
390             # Can't be any other nodes in a code block
391 0 0 0       return if $node->left or $node->right;
392 0 0 0       return if $node->parent->left or $node->parent->right;
393              
394 0           return 1;
395             }
396              
397             sub _convert_to_code_block {
398 0     0     my ( $self, $node ) = @_;
399 0           $node->parent->parent->replace_with_content->delete;
400 0           $node->parent->tag("code_block");
401             }
402              
403             sub _envelop_children {
404 0     0     my ( $self, $node, $new_child ) = @_;
405              
406 0           my @children = $node->detach_content;
407 0           $node->push_content($new_child);
408 0           $new_child->push_content(@children);
409             }
410              
411             # special handling for: ` _ # . [ !
412             my @escapes = qw( \\ * { } _ ` );
413              
414             my %backslash_escapes = (
415             '\\' => [ '0923fjhtml2wikiescapedbackslash', "\\\\" ],
416             '*' => [ '0923fjhtml2wikiescapedasterisk', "\\*" ],
417             '{' => [ '0923fjhtml2wikiescapedopenbrace', "\\{" ],
418             '}' => [ '0923fjhtml2wikiescapedclosebrace', "\\}" ],
419             '_' => [ '0923fjhtml2wikiescapedunderscore', "\\_" ],
420             '`' => [ '0923fjhtml2wikiescapedbacktick', "\\`" ],
421             );
422              
423             sub _escape_text {
424 0     0     my ( $self, $node ) = @_;
425 0   0       my $text = $node->attr('text') || '';
426              
427             #
428             # (bug #43998)
429             # Only backslash-escape backticks that don't occur within <code>
430             # tags. Those within <code> tags are left alone and the backticks to
431             # signal a <code> tag get upgraded to a double-backtick by
432             # _code_delim().
433             #
434             # (bug #43993)
435             # Likewise, only backslash-escape underscores that occur outside
436             # <code> tags.
437             #
438              
439 0   0       my $inside_code
440             = $node->look_up( _tag => 'code' )
441             || $node->look_up( _tag => 'code_block' )
442             || $node->look_up( _tag => 'pre' );
443              
444 0 0         if ( not $inside_code ) {
445 0           my $escapes = join '', @escapes;
446 0           $text =~ s/([\Q$escapes\E])/$backslash_escapes{$1}->[0]/g;
447 0           $text =~ s/^([\d]+)\./$1\\./;
448 0           $text =~ s/^\#/\\#/;
449 0           $text =~ s/\!\[/\\![/g;
450 0           $text =~ s/\]\[/]\\[/g;
451              
452 0           $node->attr( text => $text );
453             }
454             }
455              
456             # bug #43998
457             sub _code_delim {
458 0     0     my ( $self, $node, $rules ) = @_;
459 0           my $contents = $self->get_elem_contents($node);
460 0 0         return $contents =~ /\`/ ? '``' : '`';
461             }
462              
463             # bug #43996
464             sub _decode_entities_in_code {
465 0     0     my ( $self, $node ) = @_;
466 0   0       my $text = $node->attr('text') || '';
467 0 0         return unless $text;
468              
469 0           HTML::Entities::_decode_entities( $text, { 'amp' => '&', 'lt' => '<', 'gt' => '>' } );
470 0           $node->attr( text => $text );
471             }
472              
473             sub postprocess_output {
474 0     0 0   my ( $self, $outref ) = @_;
475 0           $$outref =~ s/\Q$code_block_prefix\E/ /gm;
476 0           $self->_unescape_text($outref);
477 0           $self->_add_references($outref);
478             }
479              
480             sub _unescape_text {
481 0     0     my ( $self, $outref ) = @_;
482 0           foreach my $escape ( values %backslash_escapes ) {
483 0           $$outref =~ s/$escape->[0]/$escape->[1]/g;
484             }
485             }
486              
487             sub _add_references {
488 0     0     my ( $self, $outref ) = @_;
489 0 0         my @links = @{ $self->_links || [] };
  0            
490 0 0         return unless @links;
491              
492 0           my $links = '';
493 0           foreach my $link (@links) {
494 0   0       my $id = $link->{id} || '';
495 0   0       my $url = $link->{url} || '';
496 0   0       my $title = $link->{title} || '';
497 0 0         if ($title) {
498 0           $links .= sprintf " [%s]: %s \"%s\"\n", $id, $url, $title;
499             }
500             else {
501 0           $links .= sprintf " [%s]: %s\n", $id, $url;
502             }
503             }
504              
505 0           $self->_links( [] );
506 0           $self->_last_link_id(0);
507              
508 0           $$outref .= "\n\n$links";
509 0           $$outref =~ s/\s+$//gs;
510             }
511              
512             sub _is_phrase_tag {
513 0   0 0     my $tag = pop || '';
514 0   0       return $HTML::Tagset::isPhraseMarkup{$tag} || $tag eq '~text';
515             }
516              
517             sub _abs2rel {
518 0     0     my ( $self, $uri ) = @_;
519 0 0         return $uri unless $self->base_uri;
520 0           return URI->new($uri)->rel( $self->base_uri )->as_string;
521             }
522              
523             =head1 AUTHOR
524              
525             David J. Iberri, C<< <diberri at cpan.org> >>
526              
527             =head1 BUGS
528              
529             Please report any bugs or feature requests to
530             C<bug-html-wikiconverter-markdown at rt.cpan.org>, or through the web interface at
531             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter-Markdown>.
532             I will be notified, and then you'll automatically be notified of progress on
533             your bug as I make changes.
534              
535             =head1 SUPPORT
536              
537             You can find documentation for this module with the perldoc command.
538              
539             perldoc HTML::WikiConverter::Markdown
540              
541             You can also look for information at:
542              
543             =over 4
544              
545             =item * AnnoCPAN: Annotated CPAN documentation
546              
547             L<http://annocpan.org/dist/HTML-WikiConverter-Markdown>
548              
549             =item * CPAN Ratings
550              
551             L<http://cpanratings.perl.org/d/HTML-WikiConverter-Markdown>
552              
553             =item * RT: CPAN's request tracker
554              
555             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-WikiConverter-Markdown>
556              
557             =item * Search CPAN
558              
559             L<http://search.cpan.org/dist/HTML-WikiConverter-Markdown>
560              
561             =back
562              
563             =head1 COPYRIGHT & LICENSE
564              
565             Copyright 2006 David J. Iberri, all rights reserved.
566              
567             This program is free software; you can redistribute it and/or modify it
568             under the same terms as Perl itself.
569              
570             =cut
571              
572             1;
573