File Coverage

blib/lib/MojoMojo/Formatter/TOC.pm
Criterion Covered Total %
statement 41 41 100.0
branch 8 14 57.1
condition 5 7 71.4
subroutine 9 9 100.0
pod 4 4 100.0
total 67 75 89.3


line stmt bran cond sub pod time code
1             package MojoMojo::Formatter::TOC;
2              
3 38     38   52308 use parent qw/MojoMojo::Formatter/;
  38         314  
  38         209  
4              
5 38     38   2820 use HTML::Entities;
  38         9553  
  38         1919  
6 38     38   1110 use Encode;
  38         15570  
  38         15494  
7              
8 38     38   15472 eval "use HTML::Toc;use HTML::TocInsertor;";
  38     38   82853  
  38         1026  
  38         17558  
  38         338230  
  38         717  
9             my $eval_res = $@;
10              
11             =head2 module_loaded
12              
13             Return true if the module is loaded.
14              
15             =cut
16              
17 303 50   303 1 1491 sub module_loaded { $eval_res ? 0 : 1 }
18              
19             =head1 NAME
20              
21             MojoMojo::Formatter::TOC - generate table of contents
22              
23             =head1 DESCRIPTION
24              
25             This formatter will replace C<{{toc}}> with a table of contents, using
26             HTML::GenToc. If you don't want an element to be included in the TOC,
27             make it have C<class="notoc">
28              
29             =head1 METHODS
30              
31             =head2 format_content_order
32              
33             The TOC formatter expects HTML input so it needs to run after the main
34             formatter. Since comment-type formatters (order 91) could add a heading
35             for the comment section, the TOC formatter will run with a priority of 95.
36              
37             =cut
38              
39 868     868 1 2653 sub format_content_order { 95 }
40              
41             =head2 format_content
42              
43             Calls the formatter. Takes a ref to the content as well as the context object.
44             The syntax for the TOC plugin invocation is:
45              
46             {{toc M- }} # start from Header level M
47             {{toc -N }} # stop at Header level N
48             {{toc M-N }} # process only header levels M..N
49              
50             where M is the minimum heading level to include in the TOC, and N is the
51             maximum level (depth). For example, suppose you only have one H1 on the page
52             so it doesn't make sense to add it to the TOC; also, assume you and don't want
53             to include any headers smaller than H3. The {{toc}} markup to achieve that would be:
54              
55             {{toc 2-3}}
56              
57             Defaults to 1-6.
58              
59             =cut
60              
61             sub format_content {
62 132     132 1 54176 my ( $class, $content ) = @_;
63 132 50       591 return unless $class->module_loaded;
64 132         609 my $toc_params_RE = qr/\s+ (\d+)? \s* - \s* (\d+)?/x;
65 132         1936 while (
66             # replace the {{toc ..}} markup tag and parse potential parameters
67             $$content =~ s[
68             \{\{ toc (?:$toc_params_RE)? \s* \/? }}
69             ][<div class="toc">\n<!--mojomojoTOCwillgohere-->\n</div>]ix) {
70 12         33 my ($toc_h_min, $toc_h_max);
71 12   100     62 $toc_h_min = $1 || 1;
72 12   100     54 $toc_h_max = $2 || 9; # in practice, there are no more than 6 heading levels
73 12 50       46 $toc_h_min = 9 if $toc_h_min > 9; # prevent TocGenerator error for headings >= 10
74 12 50 33     60 $toc_h_max = 9 if $toc_h_max > 9 or $toc_h_max < $toc_h_min; # {{toc 3-1}} is wrong; make it {{toc 3-9}} instead
75              
76 12         68 my $toc = HTML::Toc->new();
77 12         663 my $tocInsertor = HTML::TocInsertor->new();
78              
79 12         1914 $toc->setOptions({
80             header => '', # by default, \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
81             footer => '',
82             insertionPoint => 'replace <!--mojomojoTOCwillgohere-->',
83             doLinkToId => 0,
84             levelToToc => "[$toc_h_min-$toc_h_max]",
85             templateAnchorName => \&assembleAnchorName,
86             });
87              
88             # http://search.cpan.org/dist/HTML-Toc/Toc.pod#HTML::TocInsertor::insert()
89 12         261 $tocInsertor->insert($toc, $$content, {output => $content});
90              
91 12         18101 return 1;
92             }
93             }
94              
95             =head2 SEO-friendly anchors
96              
97             Anchors should be generated with SEO- (and human-) friendly names, i.e. out of the entire
98             token text, instead of being numeric or reduced to the first word(s) of the token.
99             In the spirit of http://seo2.0.onreact.com/top-10-fatal-url-design-mistakes, compare:
100              
101             http://beachfashion.com/photos/Pamela_Anderson#In_red_swimsuit_in_Baywatch
102             vs.
103             http://beachfashion.com/photos/Pamela_Anderson#in
104              
105             "Which one speaks your language more, which one will you rather click?"
106              
107             The anchor names generated are compliant with XHTML1.0 Strict. Also, per the
108             HTML 4.01 spec, anchor names should be restricted to ASCII characters and
109             anchors that differ only in case may not appear in the same document. In
110             particular, an anchor name may be defined only once in a document (logically,
111             because otherwise the user agent wouldn't know which #foo to scroll to).
112             This is currently a problem with L<HTML::Toc> v1.11, which doesn't have
113             support for passing the already existing anchors to the C<templateAnchorName>
114             sub.
115              
116             =head2 assembleAnchorName
117              
118             http://search.cpan.org/dist/HTML-Toc/Toc.pod#templateAnchorName
119            
120             =cut
121              
122             sub assembleAnchorName {
123 49     49 1 61192 my ($aFile, $aGroupId, $aLevel, $aNode, $text, $children) = @_;
124              
125 49 50       246 if ($text !~ /^\s*$/) {
126             # generate a SEO-friendly anchor right from the token content
127             # The allowed character set is limited first by the URI specification for fragments, http://tools.ietf.org/html/rfc3986#section-2: characters
128             # then by the limitations of the values of 'id' and 'name' attributes: http://www.w3.org/TR/REC-html40/types.html#type-name
129             # Eventually, the only punctuation allowed in id values is [_.:-]
130             # Unicode characters with code points > 0x7E (e.g. Chinese characters) are allowed (test "<h1 id="????">header</h1>" at http://validator.w3.org/#validate_by_input+with_options), except for smart quotes (!), see http://www.w3.org/Search/Mail/Public/search?type-index=www-validator&index-type=t&keywords=[VE][122]+smart+quotes&search=Search+Mail+Archives
131             # However, that contradicts the HTML 4.01 spec: "Anchor names should be restricted to ASCII characters." - http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
132             # ...and the [A-Za-z] class of letters mentioned at http://www.w3.org/TR/REC-html40/types.html#type-name
133             # Finally, note that pod2html fails miserably to generate XHTML-compliant anchor links. See http://validator.w3.org/check?uri=http%3A%2F%2Fsearch.cpan.org%2Fdist%2FCatalyst-Runtime%2Flib%2FCatalyst%2FRequest.pm&charset=(detect+automatically)&doctype=XHTML+1.0+Transitional&group=0&user-agent=W3C_Validator%2F1.606
134 49         217 $text =~ s/\s/_/g;
135 49         226 decode_entities($text); # we need to replace [#&;] only when they are NOT part of an HTML entity. decode_entities saves us from crafting a nasty regexp
136 49         158 $text = encode('utf-8', $text); # convert to UTF-8 because we need to output the UTF-8 bytes
137 49         2650 $text =~ s/([^A-Za-z0-9_:.-])/sprintf('.%02X', ord($1))/eg; # MediaWiki also uses the period, see http://en.wikipedia.org/wiki/Hierarchies#Ethics.2C_behavioral_psychology.2C_philosophies_of_identity
  69         203  
138 49 100       163 $text = 'L'.$text if $text =~ /\A\W/; # "ID and NAME tokens must begin with a letter ([A-Za-z])" -- http://www.w3.org/TR/html4/types.html#type-name
139             }
140 49 50       142 $text = 'id' if $text eq '';
141              
142             # check if the anchor already exists; if so, add a number
143             # NOTE: there is no way currently to do this easily in HTML-Toc-1.10.
144              
145             #my $anch_num = 1;
146             #my $word_name = $name;
147             ## Reference: http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
148             ## Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document.
149             #while (grep {lc $_ eq lc $name} keys %{$args{anchors}}) {
150             # # FIXME (in caller sub): to avoid the grep above, the $args{anchors} hash
151             # # should have as key the lowercased anchor name, and as value its actual value (instead of '1')
152             # $name = $word_name . "_$anch_num";
153             # $anch_num++;
154             #}
155              
156 49         136 return $text;
157             }
158              
159              
160             =head1 SEE ALSO
161              
162             L<MojoMojo> and L<Module::Pluggable::Ordered>.
163              
164             =head1 AUTHORS
165              
166             Dan Dascalescu, L<http://dandascalescu.com>
167              
168             =head1 LICENSE
169              
170             This library is free software. You can redistribute it and/or modify
171             it under the same terms as Perl itself.
172              
173             =cut
174              
175             1;