File Coverage

blib/lib/HTML/WikiConverter.pm
Criterion Covered Total %
statement 312 403 77.4
branch 134 198 67.6
condition 53 84 63.1
subroutine 70 86 81.4
pod 7 21 33.3
total 576 792 72.7


line stmt bran cond sub pod time code
1             package HTML::WikiConverter;
2 2     2   47486 use warnings;
  2         4  
  2         82  
3 2     2   13 use strict;
  2         3  
  2         72  
4              
5 2     2   2677 use Params::Validate ':all';
  2         26997  
  2         465  
6 2     2   2135 use HTML::WikiConverter::Normalizer;
  2         8  
  2         86  
7 2     2   6554 use HTML::TreeBuilder;
  2         28557  
  2         33  
8 2     2   100 use HTML::Entities;
  2         4  
  2         172  
9 2     2   13 use HTML::Tagset;
  2         3  
  2         53  
10 2     2   13 use File::Spec;
  2         3  
  2         56  
11 2     2   2348 use DirHandle;
  2         5404  
  2         62  
12 2     2   5626 use Encode;
  2         41938  
  2         254  
13 2     2   25 use Carp;
  2         4  
  2         136  
14              
15 2     2   1828 use URI::Escape;
  2         3038  
  2         159  
16 2     2   2106 use URI;
  2         6260  
  2         14475  
17              
18             our $VERSION = '0.68';
19             our $AUTOLOAD;
20              
21             =head1 NAME
22              
23             HTML::WikiConverter - Convert HTML to wiki markup
24              
25             =head1 SYNOPSIS
26              
27             use HTML::WikiConverter;
28             my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' );
29             print $wc->html2wiki( html => '<b>text</b>' ), "\n\n";
30              
31             # A more complete example
32              
33             my $html = qq(
34             <p><i>Italic</i>, <b>bold</b>, <span style="font-weight:bold">also bold</span>, etc.</p>
35             );
36              
37             my @dialects = HTML::WikiConverter->available_dialects;
38             foreach my $dialect ( @dialects ) {
39             my $wc = new HTML::WikiConverter( dialect => $dialect );
40             my $wiki = $wc->html2wiki( html => $html );
41             printf "The %s dialect gives:\n\n%s\n\n", $dialect, $wiki;
42             }
43              
44             =head1 DESCRIPTION
45              
46             C<HTML::WikiConverter> is an HTML to wiki converter. It can convert
47             HTML source into a variety of wiki markups, called wiki
48             "dialects". The following dialects are supported:
49              
50             DokuWiki
51             Kwiki
52             MediaWiki
53             MoinMoin
54             Oddmuse
55             PbWiki
56             PhpWiki
57             PmWiki
58             SlipSlap
59             TikiWiki
60             UseMod
61             WakkaWiki
62             WikkaWiki
63              
64             Note that while dialects usually produce satisfactory wiki markup, not
65             all features of all dialects are supported. Consult individual
66             dialects' documentation for details of supported features. Suggestions
67             for improvements, especially in the form of patches, are very much
68             appreciated.
69              
70             =head1 METHODS
71              
72             =head2 new
73              
74             my $wc = new HTML::WikiConverter( dialect => $dialect, %attrs );
75              
76             Returns a converter for the specified wiki dialect. Croaks if
77             C<$dialect> is not provided or its dialect module is not installed on
78             your system. Additional attributes may be specified in C<%attrs>; see
79             L</"ATTRIBUTES"> for a complete list.
80              
81             =cut
82              
83             sub new {
84 10     10 1 486937 my $pkg = shift;
85 10 100       53 return $pkg->__new_dialect(@_) if $pkg eq __PACKAGE__;
86              
87 5         16 my $self = bless { }, $pkg;
88 5         27 $self->__load_attribute_specs();
89 5         32 $self->__setup(@_);
90 4         28 return $self;
91             }
92              
93             sub __new_dialect {
94 5     5   26 my( $pkg, %opts ) = @_;
95 5 50       18 croak "Required 'dialect' parameter is missing" unless $opts{dialect};
96 5         24 my @dialect_classes = ( __PACKAGE__.'::'.$opts{dialect}, $opts{dialect} );
97 5         9 foreach my $dialect_class ( @dialect_classes ) {
98 10 100 66 1   701 return $dialect_class->new( %opts ) if eval "use $dialect_class; 1" or $dialect_class->isa($pkg);
  1     1   484  
  0     1   0  
  0     1   0  
  1     1   662  
  0     1   0  
  0     1   0  
  1     1   732  
  0     1   0  
  0     1   0  
  1         313  
  0         0  
  0         0  
  1         374  
  0         0  
  0         0  
  1         337  
  0         0  
  0         0  
  1         465  
  0         0  
  0         0  
  1         549  
  0         0  
  0         0  
  1         460  
  0         0  
  0         0  
  1         352  
  0         0  
  0         0  
99             }
100 0         0 my $dc_list = join ', ', @dialect_classes;
101 0         0 croak "Dialect '$opts{dialect}' could not be loaded (tried $dc_list). Error: $@";
102             }
103              
104             sub __setup {
105 5     5   8 my $self = shift;
106 5         25 $self->__setup_attributes(@_);
107 4         23 $self->__setup_rules();
108             }
109              
110             sub __setup_attributes {
111 5     5   7 my $self = shift;
112 5         23 $self->__attrs( {} );
113 5         28 $self->__load_and_validate_attributes(@_);
114             }
115              
116             sub __setup_rules {
117 23     23   35 my $self = shift;
118 23         115 $self->__load_rules();
119 23         88 $self->__validate_rules();
120             }
121              
122 33     33   126 sub __original_attrs { shift->_attr( { internal => 1 }, __original_attrs => @_ ) }
123 583     583   1732 sub __attrs { shift->_attr( { internal => 1 }, __attrs => @_ ) }
124 131     131   413 sub __attrs_changed { shift->_attr( { internal => 1 }, __attrs_changed => @_ ) }
125 203     203   649 sub __root { shift->_attr( { internal => 1 }, __root => @_ ) }
126 515     515   1787 sub __rules { shift->_attr( { internal => 1 }, __rules => @_ ) }
127 631     631   2306 sub __attribute_specs { shift->_attr( { internal => 1 }, __attribute_specs => @_ ) }
128              
129             # Unsupported attributes
130 0     0 0 0 sub base_url { shift->__no_such( attribute => qw/ base_url base_uri / ) }
131 0     0 0 0 sub wiki_url { shift->__no_such( attribute => qw/ wiki_url wiki_uri / ) }
132              
133             sub __no_such {
134 0     0   0 my( $self, $thing, $that, $this ) = @_;
135 0         0 croak "'$that' is not a valid $thing. Perhaps you meant '$this'?";
136             }
137              
138             # Pass '{internal=>1}' as first arg for params that aren't attributes
139             sub _attr {
140 2688 100   2688   8193 my( $self, $opts, $param, @value ) = ref $_[1] eq 'HASH' ? @_ : ( +shift, {}, @_ );
141 2688 100       5720 my $store = $opts->{internal} ? $self : $self->__attrs;
142              
143 2688 100       6069 if( @value ) {
144 296 100       376 eval { validate_pos( @value, $self->__attribute_specs->{$param} ) unless $opts->{internal} };
  296         826  
145 296 50       647 $self->__attribute_error($@) if $@;
146 296         558 $store->{$param} = $value[0];
147 296 100       1585 $self->__attrs_changed(1) if !$opts->{internal};
148             }
149              
150 2688 100       16339 return defined $store->{$param} ? $store->{$param} : '';
151             }
152              
153             # Attribute accessors and mutators
154             sub AUTOLOAD {
155 546     546   1712 my $self = shift;
156 546         2433 ( my $attr = $AUTOLOAD ) =~ s/.*://;
157 546 100       1291 return $self->_attr( $attr => @_ ) if exists $self->__attribute_specs->{$attr};
158 1         195 croak "Can't locate method '$attr' in package ".ref($self);
159             }
160              
161             # So AUTOLOAD doesn't intercept calls to destruction method
162 0     0   0 sub DESTROY { }
163              
164             sub __slurp {
165 0     0   0 my( $self, $file ) = @_;
166 0         0 eval "use File::Slurp;";
167 0 0       0 return $self->__simple_slurp($file) if $@;
168 0         0 return scalar File::Slurp::read_file($file);
169             }
170              
171             sub __simple_slurp {
172 0     0   0 my( $self, $file ) = @_;
173 0 0       0 open my $fh, $file or croak "can't open file $file for reading: $!";
174 0         0 my $text = do { local $/; <$fh> };
  0         0  
  0         0  
175 0         0 close $fh;
176 0         0 return $text;
177             }
178              
179             =head2 html2wiki
180              
181             $wiki = $wc->html2wiki( $html, %attrs );
182             $wiki = $wc->html2wiki( html => $html, %attrs );
183             $wiki = $wc->html2wiki( file => $file, %attrs );
184             $wiki = $wc->html2wiki( uri => $uri, %attrs );
185              
186             Converts HTML source to wiki markup for the current dialect. Accepts
187             either an HTML string C<$html>, an file C<$file>, or a URI <$uri> to
188             read from.
189              
190             Attributes assigned in C<%attrs> (see L</"ATTRIBUTES">) will augment
191             or override previously assigned attributes for the duration of the
192             C<html2wiki()> call.
193              
194             =cut
195              
196             sub html2wiki {
197 24     24 1 278295 my $self = shift;
198              
199             # Assumes that if @_ is odd-numbered, its first element is html
200 24 100       152 my %args = @_ % 2 ? ( html => +shift, @_ ) : @_;
201              
202 24         101 my %common_arg_errors = ( url => 'uri', base_url => 'base_uri', wiki_url => 'wiki_uri' );
203 24         190 while( my( $bad, $good ) = each %common_arg_errors ) {
204 72 50       276 $self->__no_such( 'argument to html2wiki()', $bad, $good ) if exists $args{$bad};
205             }
206              
207 24         59 my @input_sources = grep { exists $args{$_} } qw/ html file uri /;
  72         169  
208 24 50       69 croak "missing 'html', 'file', or 'uri' argument to html2wiki" unless @input_sources;
209 24 50       68 croak "more than one of 'html', 'file', or 'uri' provided, but only one input source allowed" if @input_sources > 1;
210              
211 24   100     95 my $html = delete $args{html} || '';
212 24   50     132 my $file = delete $args{file} || '';
213 24   100     92 my $uri = delete $args{uri} || '';
214              
215 24 50 33     66 $html = $self->__slurp($file) if $file && $self->slurp;
216 24 100       64 $html = $self->__fetch_html_from_uri($uri) if $uri; # may set 'user_agent' attrib, so call before storing attribs
217 24 50 33     172 $html = "<html>$html</html>" if $html and $self->wrap_in_html;
218              
219 24         62 $self->__original_attrs( { %{ $self->__attrs } } );
  24         55  
220 24         165 $self->$_( $args{$_} ) foreach keys %args;
221 24 100       66 $self->__setup_rules() if $self->__attrs_changed;
222              
223             # Decode into Perl's internal form
224 24         138 $html = decode( $self->encoding, $html );
225              
226 24         1640 my $tree = new HTML::TreeBuilder();
227 24         6047 $tree->store_comments(1);
228 24         324 $tree->p_strict( $self->p_strict );
229 24         273 $tree->implicit_body_p_tag(1);
230 24         1038 $tree->ignore_unknown(0); # <ruby> et al
231              
232             # Parse the HTML string or file
233 24 50       207 if( $html ) {
234 24         80 $self->given_html( $html );
235 24         324 $tree->parse($html);
236 24         41060 $tree->eof();
237             } else {
238 0         0 $self->given_html( $self->__slurp($file) );
239 0         0 $tree->parse_file($file);
240             }
241              
242             # Preprocess, save tree and parsed HTML
243 24         3017 $self->__root( $tree );
244 24         98 $self->__preprocess_tree();
245              
246 23         607 $self->__root->deobjectify_text();
247 23         991 $self->parsed_html( $tree->as_HTML(undef, ' ', {}) );
248 23         83 $self->__root->objectify_text();
249              
250             # Convert and preprocess
251 23         1943 my $output = $self->__wikify($tree);
252 23         97 $self->__postprocess_output(\$output);
253              
254             # Avoid leaks
255 23         104 $tree->delete();
256              
257             # Return to original encoding
258 23         1743 $output = encode( $self->encoding, $output );
259              
260 23 100       1039 if( $self->__attrs_changed ) {
261 9         11 $self->__attrs( { %{ $self->__original_attrs } } );
  9         25  
262 9         37 $self->__setup_rules();
263 9         44 $self->__attrs_changed(0);
264             }
265              
266 23         285 return $output;
267             }
268              
269             sub __wikify {
270 140     140   212 my( $self, $node ) = @_;
271              
272             # Concatenate adjacent text nodes
273 140         358 $node->normalize_content();
274              
275 140 100       2054 if( $node->tag eq '~text' ) {
    100          
276 36         279 return $node->attr('text');
277             } elsif( $node->tag eq '~comment' ) {
278 1         12 return '<!--' . $node->attr('text') . '-->';
279             } else {
280 103         1510 my $rules = $self->rules_for_tag( $node->tag );
281              
282 103 100       290 return $self->__subst($rules->{replace}, $node, $rules) if exists $rules->{replace};
283              
284             # Set private preserve rules
285 98 100       284 if( $rules->{preserve} ) {
    50          
286 4 50       24 $rules->{__start} = \&__preserve_start,
287             $rules->{__end} = $rules->{empty} ? undef : '</'.$node->tag.'>';
288             } elsif( $rules->{passthrough} ) {
289 0         0 $rules->{__start} = '';
290 0         0 $rules->{__end} = '';
291             }
292              
293             # Recurse
294 98         307 my $output = $self->get_elem_contents($node);
295              
296             # Unspecified tags have their whitespace preserved (this allows
297             # 'html' and 'body' tags [among others] to keep formatting when
298             # inner tags like 'pre' need to preserve whitespace).
299 98   100     389 my $trim = $rules->{trim} || 'none';
300 98 100 66     425 $output =~ s/^\s+// if $trim eq 'both' or $trim eq 'leading';
301 98 100 66     463 $output =~ s/\s+$// if $trim eq 'both' or $trim eq 'trailing';
302              
303 98   100     331 my $lf = $rules->{line_format} || 'none';
304 98 100       213 $output =~ s/^\s*\n/\n/gm if $lf ne 'none';
305 98 50       389 if( $lf eq 'blocks' ) {
    100          
    50          
    50          
306 0         0 $output =~ s/\n{3,}/\n\n/g;
307             } elsif( $lf eq 'multi' ) {
308 3         9 $output =~ s/\n{2,}/\n/g;
309             } elsif( $lf eq 'single' ) {
310 0         0 $output =~ s/\n+/ /g;
311             } elsif( $lf eq 'none' ) {
312             # Do nothing
313             }
314              
315             # Substitutions
316 98 100       219 $output =~ s/^/$self->__subst($rules->{line_prefix}, $node, $rules)/gem if $rules->{line_prefix};
  4         12  
317 98 100       198 $output = $self->__subst($rules->{__start}, $node, $rules).$output if $rules->{__start};
318 98 100       196 $output = $output.$self->__subst($rules->{__end}, $node, $rules) if $rules->{__end};
319 98 100       238 $output = $self->__subst($rules->{start}, $node, $rules).$output if $rules->{start};
320 98 100       240 $output = $output.$self->__subst($rules->{end}, $node, $rules) if $rules->{end};
321            
322             # If the current element is a block and is contained within
323             # another block element, then we will not block the current
324             # element by default. However, if the current element is a block
325             # and is contained within another block element that specifies a
326             # line_format of 'blocks', then we will block the current element.
327 98 100 66     265 $output = "\n\n$output\n\n" if $rules->{block} &&
      66        
328             ( ! $self->elem_search_lineage( $node, { block => 1 } ) or
329             $self->elem_search_lineage( $node, { line_format => 'blocks' } ) );
330              
331             # ...but they are put on their own line
332 98 100 100     328 $output = "\n$output" if $rules->{block} and $node->parent->look_up( _tag => $node->tag ) and $trim ne 'none';
      66        
333              
334 98         1411 return $output;
335             }
336             }
337              
338             # Deprecated. Instead use elem_search_lineage( $node, { block => 1 } ).
339             sub elem_within_block {
340 0     0 0 0 my( $self, $node ) = @_;
341 0         0 foreach my $n ( $node->lineage ) {
342 0 0 0     0 return $n if $self->rules_for_tag($n->tag || '')->{block};
343             }
344 0         0 return 0;
345             }
346              
347             =head2 elem_search_lineage
348              
349             my $ancestor = $wc->elem_search_lineage( $node, \%rules );
350              
351             Searches the lineage of C<$node> and returns the first ancestor node
352             that has rules matching those specified in C<%rules>, or C<undef> if
353             no matching node is found.
354              
355             For example, to find out whether C<$node> has an ancestor with rules
356             matching C<{ block =E<gt>1 }>, one could use:
357              
358             if( $wc->elem_search_lineage( $node, { block => 1 } ) ) {
359             # do something
360             }
361              
362             =cut
363              
364             sub elem_search_lineage {
365 15     15 1 27 my( $self, $node, $search_rules ) = @_;
366              
367 15         60 foreach my $n ( $node->lineage ) {
368 30         323 my $rules = $self->rules_for_tag( $n->tag );
369              
370 30         50 my $matched = 1;
371 30         106 while( my($k,$v) = each %$search_rules ) {
372 30   100     116 my $rule_value = $rules->{$k} || '';
373 30 100       146 $matched = 0 unless $v eq $rule_value;
374             }
375              
376 30 100       131 return $n if $matched;
377             }
378              
379 14         76 return undef;
380             }
381              
382             sub __subst {
383 51     51   93 my( $self, $subst, $node, $rules ) = @_;
384 51 100       185 return ref $subst eq 'CODE' ? $subst->( $self, $node, $rules ) : $subst;
385             }
386              
387             sub __preserve_start {
388 4     4   9 my( $self, $node, $rules ) = @_;
389              
390 4         10 my $tag = $node->tag;
391 4 50       35 my @attrs = exists $rules->{attributes} ? @{$rules->{attributes}} : ( );
  0         0  
392 4         18 my $attr_str = $self->get_attr_str( $node, @attrs );
393 4 50       14 my $slash = $rules->{empty} ? ' /' : '';
394              
395 4 50       7 return '<'.$tag.' '.$attr_str.$slash.'>' if $attr_str;
396 4         19 return '<'.$tag.$slash.'>';
397             }
398              
399             # Maps a tag name to its URI attribute
400             my %rel2abs = ( a => 'href', img => 'src' );
401              
402             my %allowedEmptyTag = ( %HTML::Tagset::emptyElement, '~comment' => 1, '~text' => 1 );
403             my %isKnownTag = %HTML::Tagset::isKnown;
404              
405             sub __preprocess_tree {
406 24     24   37 my $self = shift;
407              
408 24         59 $self->__root->objectify_text();
409              
410 24         9209 $self->preprocess_tree($self->__root);
411              
412 24 50       129 HTML::WikiConverter::Normalizer->new->normalize($self->__root) if $self->normalize;
413              
414 23 50       213 my %strip_tag = map { $_ => 1 } @{ $self->strip_tags || [] };
  71         184  
  23         126  
415 23         276 my %passthrough_naked_tags = map { $_ => 1 } $self->__passthrough_naked_tags;
  0         0  
416              
417 23         59 foreach my $node ( $self->__root->descendents ) {
418 110 100       5498 $node->tag('') unless $node->tag;
419 110 100       915 $node->delete, next if $strip_tag{$node->tag};
420 94 50 33     776 $node->replace_with_content->delete, next if $passthrough_naked_tags{$node->tag} and !$node->all_external_attr_names;
421 94         721 $self->__rm_invalid_text($node);
422 94 100 100     421 $node->delete, next if $self->strip_empty_tags and !$allowedEmptyTag{$node->tag} and $self->__elem_is_empty($node);
      100        
423 84 100 100     345 $self->__encode_entities($node) if $node->tag eq '~text' and $self->escape_entities;
424 84 100 100     1095 $self->__rel2abs($node) if $self->base_uri and exists $rel2abs{$node->tag};
425 84         11651 $self->preprocess_node($node);
426             }
427              
428             # Reobjectify in case preprocessing added new text
429 23         169 $self->__root->objectify_text();
430              
431 23 100       764 $self->preprocess->( $self->__root ) if ref $self->preprocess;
432             }
433              
434             sub __passthrough_naked_tags {
435 23     23   42 my $self = shift;
436              
437 23         28 my @tags;
438 23 50       105 if( ref $self->passthrough_naked_tags eq 'ARRAY' ) {
    50          
439 0         0 @tags = @{ $self->passthrough_naked_tags };
  0         0  
440             } elsif( $self->passthrough_naked_tags ) {
441 0         0 @tags = $self->__default_passthrough_naked_tags;
442             } else {
443 23         36 @tags = ( );
444             }
445              
446 23         106 return @tags;
447             }
448              
449             # (bug #28402)
450 0     0   0 sub __default_passthrough_naked_tags { qw/ tbody thead span div font / }
451              
452             sub __elem_is_empty {
453 26     26   293 my( $self, $node ) = @_;
454 26         64 my $content = $self->get_elem_contents($node);
455 26 100 66     145 my $has_nonwhitespace = $content && length $content ? $content =~ /\S/ : 0;
456 26         148 return !$has_nonwhitespace;
457             }
458              
459             sub __fetch_html_from_uri {
460 1     1   3 my( $self, $uri ) = @_;
461 1         9 my $ua = $self->__user_agent;
462 1         7 my $res = $ua->get($uri);
463 1 50       628118 croak "request for <$uri> failed" unless $res->is_success;
464 1   50     30 my $encoding = $self->encoding || $self->__guess_encoding($res) || 'utf-8';
465 1         8 my $html = encode( $self->encoding, decode( $encoding, $res->content ) );
466 1         2403 return $html;
467             }
468              
469             sub __guess_encoding {
470 0     0   0 my( $self, $res ) = @_;
471 0 0       0 carp "LWP::Charset is not installed but is required for determining the charset claimed by the content at the requested URI", return
472             unless eval "use LWP::Charset; 1";
473 0         0 return LWP::Charset::getCharset($res);
474             }
475              
476             sub __user_agent {
477 1     1   4 my $self = shift;
478 1 50       12 $self->user_agent( $self->__default_user_agent ) unless $self->user_agent;
479 1         9 return $self->user_agent;
480             }
481              
482             sub __default_user_agent {
483 1 50   1   289 croak "LWP is not installed but is required for fetching URIs" unless eval "use LWP::UserAgent; 1";
  1     1   13  
  1         5  
  1         19  
484 1         10 return LWP::UserAgent->new( agent => shift->__default_ua_string );
485             }
486              
487 1     1   11 sub __default_ua_string { "html2wiki/$VERSION" }
488              
489             # Encodes high-bit and control chars in node's text to HTML entities.
490             sub __encode_entities {
491 19     19   76 my( $self, $node ) = @_;
492 19 50       66 my $text = defined $node->attr('text') ? $node->attr('text') : '';
493 19         441 encode_entities( $text, '<>&' );
494 19         1254 $node->attr( text => $text );
495             }
496              
497             # Convert relative to absolute URIs
498             sub __rel2abs {
499 5     5   67 my( $self, $node ) = @_;
500 5         16 my $attr = $rel2abs{$node->tag};
501 5 50       43 return unless $node->attr($attr); # don't add attribute if it's not already there
502 5         77 $node->attr( $attr => uri_unescape( URI->new_abs( $node->attr($attr), $self->base_uri )->as_string ) );
503             }
504              
505             # Removes text nodes directly inside container elements.
506             my %containers = map { $_ => 1 } qw/ table tbody tr ul ol dl menu /;
507              
508             sub __rm_invalid_text {
509 94     94   140 my( $self, $node ) = @_;
510 94 50       221 my $tag = defined $node->tag ? $node->tag : '';
511 94 50       1467 if( $containers{$tag} ) {
512 0         0 $_->delete for grep { $_->tag eq '~text' } $node->content_list;
  0         0  
513             }
514             }
515              
516             sub strip_aname {
517 0     0 0 0 my( $self, $node ) = @_;
518 0 0       0 return if $node->attr('href');
519 0         0 $node->replace_with_content->delete();
520             }
521              
522             sub caption2para {
523 0     0 0 0 my( $self, $node ) = @_;
524 0         0 my $table = $node->parent;
525 0         0 $node->detach();
526 0         0 $table->preinsert($node);
527 0         0 $node->tag('p');
528             }
529              
530 24     24 0 46 sub preprocess_tree { }
531 84     84 0 178 sub preprocess_node { }
532              
533             sub __postprocess_output {
534 23     23   38 my( $self, $outref ) = @_;
535 23         143 $$outref =~ s/\n[\s^\n]+\n/\n\n/g; # XXX this is causing bug 14527
536 23         122 $$outref =~ s/\n{2,}/\n\n/g;
537 23         75 $$outref =~ s/^\n+//;
538 23         96 $$outref =~ s/\s+$//;
539 23         56 $$outref =~ s/[ \t]+$//gm;
540 23         69 $self->postprocess_output($outref);
541             }
542              
543 23     23 0 42 sub postprocess_output { }
544              
545 0     0 0 0 sub attributes { {} }
546              
547             sub __load_attribute_specs {
548 5     5   19 my $self = shift;
549              
550             # Get default attribute specs
551 5         29 my $default_specs = $self->__default_attribute_specs;
552              
553             # Get dialect attribute specs
554 5         24 my @dialect_specs = $self->attributes;
555 5 50 33     64 my $dialect_specs = @dialect_specs == 1 && ref $dialect_specs[0] eq 'HASH' ? $dialect_specs[0] : {@dialect_specs};
556              
557 5         53 my %attr_specs = %$default_specs;
558 5         29 while( my( $attr, $spec ) = each %$dialect_specs ) {
559 13         47 $attr_specs{$attr} = $spec;
560             }
561              
562 5         31 $self->__attribute_specs( \%attr_specs );
563             }
564              
565             sub __load_and_validate_attributes {
566 5     5   8 my $self = shift;
567              
568 5         8 my %attrs = eval { validate( @_, $self->__attribute_specs ) };
  5         13  
569 5 100       317 $self->__attribute_error($@) if $@;
570              
571 4         17 while( my( $attr, $value ) = each %attrs ) {
572 64         434 $self->$attr($value);
573             }
574             }
575              
576             sub __attribute_error {
577 1     1   2 my( $self, $error ) = @_;
578             # Validating attributes failed, so we don't have access to the
579             # 'dialect' attribute; obtain it from the package name instead
580 1         3 ( my $dialect = ref $self ) =~ s/.*://;
581              
582 1 50       10 $error = sprintf "The attribute '%s' does not exist in the dialect '%s'.", $1, $dialect
583             if $error =~ /not listed in the validation options\: (\w+)/;
584              
585 1         172 croak $error;
586             }
587              
588 0     0 0 0 sub rules { {} }
589              
590             sub __load_rules {
591 23     23   38 my $self = shift;
592 23         80 $self->__rules( $self->rules );
593             }
594              
595             # Rules for validating rules
596             my %meta_rules = (
597             trim => { range => [ qw/ none both leading trailing / ] },
598             line_format => { range => [ qw/ none single multi blocks / ] },
599             replace => { singleton => 1 },
600             alias => { singleton => 1 },
601             attributes => { depends => [ qw/ preserve / ] },
602             empty => { depends => [ qw/ preserve / ] },
603             passthrough => { singleton => 1 },
604             );
605              
606             sub __validate_rules {
607 23     23   33 my $self = shift;
608              
609 23         27 foreach my $tag ( keys %{ $self->__rules } ) {
  23         40  
610 185         351 my $rules = $self->__rules->{$tag};
611              
612 185         501 foreach my $opt ( keys %$rules ) {
613 280 100       793 my $spec = $meta_rules{$opt} or next;
614              
615 96   100     242 my $singleton = $spec->{singleton} || 0;
616 96 50       246 my @disallows = ref $spec->{disallows} eq 'ARRAY' ? @{ $spec->{disallows} } : ( );
  0         0  
617 96 50       187 my @depends = ref $spec->{depends} eq 'ARRAY' ? @{ $spec->{depends} } : ( );
  0         0  
618 96 100       193 my @range = ref $spec->{range} eq 'ARRAY' ? @{ $spec->{range} } : ( );
  30         76  
619 96         144 my %range = map { $_ => 1 } @range;
  120         247  
620              
621 96 50 66     423 $self->__rule_error( $tag, "'$opt' cannot be combined with any other option" )
622             if $singleton and keys %$rules != 1;
623              
624             $rules->{$_} && $self->__rule_error( $tag, "'$opt' cannot be combined with '$_'" )
625 96   0     177 foreach @disallows;
626              
627             ! $rules->{$_} && $self->__rule_error( $tag, "'$opt' must be combined with '$_'" )
628 96   0     137 foreach @depends;
629              
630 96 50 66     520 $self->__rule_error( $tag, "Unknown '$opt' value '$rules->{$opt}'. '$opt' must be one of ", join(', ', map { "'$_'" } @range) )
  0         0  
631             if @range and ! exists $range{$rules->{$opt}};
632             }
633             }
634             }
635              
636             sub __rule_error {
637 0     0   0 my( $self, $tag, @msg ) = @_;
638 0         0 my $dialect = ref $self;
639 0         0 croak @msg, " in tag '$tag', dialect '$dialect'.\n";
640             }
641              
642             sub get_elem_contents {
643 128     128 0 187 my( $self, $node ) = @_;
644 128         344 my $str = join '', map { $self->__wikify($_) } $node->content_list;
  117         817  
645 128 50       959 return defined $str ? $str : '';
646             }
647              
648             sub get_wiki_page {
649 5     5 0 111 my( $self, $uri ) = @_;
650 5 50       30 my @wiki_uris = ref $self->wiki_uri eq 'ARRAY' ? @{$self->wiki_uri} : $self->wiki_uri;
  5         23  
651 5         20 foreach my $wiki_uri ( @wiki_uris ) {
652 12         34 my $page = $self->__extract_wiki_page( $uri, $wiki_uri );
653 12 100       451 return $page if $page;
654             }
655              
656 2         7 return undef;
657             }
658              
659             sub __extract_wiki_page {
660 12     12   20 my( $self, $uri, $wiki_uri ) = @_;
661 12 50       24 return undef unless $wiki_uri;
662              
663 12 50       39 if( ref $wiki_uri eq 'Regexp' ) {
    100          
664 0 0       0 return $uri =~ $wiki_uri ? $1 : undef;
665             } elsif( ref $wiki_uri eq 'CODE' ) {
666 3         11 return $wiki_uri->( $self, URI->new($uri) );
667             } else {
668             # Ensure $wiki_uri is absolute
669 9         42 $wiki_uri = URI->new_abs( $wiki_uri, $self->base_uri )->as_string;
670              
671 9 100       871 return undef unless index( $uri, $wiki_uri ) == 0;
672 2 50       10 return undef unless length $uri > length $wiki_uri;
673 2         10 return substr( $uri, length $wiki_uri );
674             }
675             }
676              
677             # Adapted from Kwiki source
678             my $UPPER = '\p{UppercaseLetter}';
679             my $LOWER = '\p{LowercaseLetter}';
680             my $WIKIWORD = "$UPPER$LOWER\\p{Number}\\p{ConnectorPunctuation}";
681              
682 0     0 0 0 sub is_camel_case { return $_[1] =~ /(?:[$UPPER](?=[$WIKIWORD]*[$UPPER])(?=[$WIKIWORD]*[$LOWER])[$WIKIWORD]+)/ }
683              
684             sub get_attr_str {
685 4     4 0 8 my( $self, $node, @attrs ) = @_;
686 4         9 my %attrs = map { $_ => $node->attr($_) } @attrs;
  0         0  
687 4         9 my $str = join ' ', map { $_.'="'.encode_entities($attrs{$_}).'"' } grep { $attrs{$_} } @attrs;
  0         0  
  0         0  
688              
689             # (bug #19046) partial fix: attributes must be contained on a single line
690 4 50       9 $str =~ s/[\n\r]/ /g if $str;
691              
692 4 50       16 return defined $str ? $str : '';
693             }
694              
695             =head2 given_html
696              
697             my $html = $wc->given_html;
698              
699             Returns the HTML passed to or fetched (ie, from a file or URI) by the
700             last C<html2wiki()> method call. Useful for debugging.
701              
702             =cut
703              
704 24     24 1 93 sub given_html { shift->_attr( { internal => 1 }, __given_html => @_ ) }
705              
706             =head2 parsed_html
707              
708             my $parsed_html = $wc->parsed_html;
709              
710             Returns a string containing the post-processed HTML from the last
711             C<html2wiki> call. Post-processing includes parsing by
712             L<HTML::TreeBuilder>, CSS normalization by
713             L<HTML::WikiConverter::Normalizer>, and calls to the C<preprocess> and
714             C<preprocess_tree> dialect methods.
715              
716             =cut
717              
718 23     23 1 18412 sub parsed_html { shift->_attr( { internal => 1 }, __parsed_html => @_ ) }
719              
720             =head2 available_dialects
721              
722             my @dialects = HTML::WikiConverter->available_dialects;
723              
724             Returns a list of all available dialects by searching the directories
725             in C<@INC> for C<HTML::WikiConverter::> modules.
726              
727             =cut
728              
729             sub available_dialects {
730 0     0 1 0 my @dialects;
731              
732             my %seen;
733 0         0 for my $inc ( @INC ) {
734 0         0 my $dir = File::Spec->catfile( $inc, 'HTML', 'WikiConverter' );
735 0 0       0 my $dh = DirHandle->new( $dir ) or next;
736 0         0 while ( my $f = $dh->read ) {
737 0 0       0 next unless $f =~ /^(\w+)\.pm$/;
738 0         0 my $dialect = $1;
739              
740 0 0       0 next if $seen{$dialect}++;
741 0 0 0     0 next if $dialect eq 'Normalizer' or $dialect eq 'WebApp';
742              
743 0         0 push @dialects, $dialect;
744             }
745             }
746              
747 0 0       0 return wantarray ? sort @dialects : @dialects;
748             }
749              
750             =head2 rules_for_tag
751              
752             my $rules = $wc->rules_for_tag( $tag );
753              
754             Returns the rules that will be used for converting elements of the
755             given tag. Follows C<alias> references. Note that the rules used for a
756             particular tag may depend on the current set of attributes being used.
757              
758             =cut
759              
760             sub rules_for_tag {
761 133     133 1 895 my( $self, $tag ) = @_;
762 133         294 my $rules = $self->__rules_for_tag($tag);
763 133 100       422 return $rules->{alias} ? $self->__rules_for_tag( $rules->{alias} ) : $rules;
764             }
765              
766             sub __rules_for_tag {
767 142     142   195 my( $self, $tag ) = @_;
768 142 100       276 return $self->__rules->{$tag} if $self->__rules->{$tag};
769 91 50 66     255 return $self->__rules->{UNKNOWN} if $self->__rules->{UNKNOWN} and !$isKnownTag{$tag};
770 91         337 return { };
771             }
772              
773             =head1 ATTRIBUTES
774              
775             You may configure C<HTML::WikiConverter> using a number of
776             attributes. These may be passed as arguments to the C<new>
777             constructor, or can be called as object methods on an H::WC object.
778              
779             Some dialects allow other attributes in addition to those below, and
780             may override the attributes' default values. Consult the dialect's
781             documentation for details.
782              
783             =head2 base_uri
784              
785             URI to use for converting relative URIs to absolute ones. This
786             effectively ensures that the C<src> and C<href> attributes of image
787             and anchor tags, respectively, are absolute before converting the HTML
788             to wiki markup, which is necessary for wiki dialects that handle
789             internal and external links separately. Relative URIs are only
790             converted to absolute ones if the C<base_uri> argument is
791             present. Defaults to C<undef>.
792              
793             =head2 dialect
794              
795             (Required) Dialect to use for converting HTML into wiki markup. See
796             the L</"DESCRIPTION"> section above for a list of dialects. C<new()>
797             will fail if the dialect given is not installed on your system. Use
798             C<available_dialects()> to list installed dialects.
799              
800             =head2 encoding
801              
802             Specifies the encoding used by the HTML to be converted. Also
803             determines the encoding of the wiki markup returned by the
804             C<html2wiki> method. Defaults to C<"utf8">.
805              
806             =head2 escape_entities
807              
808             Passing C<escape_entities> a true value uses L<HTML::Entities> to
809             encode potentially unsafe 'E<lt>', 'E<gt>', and 'E<amp>' characters.
810             Defaults to true.
811              
812             =head2 p_strict
813              
814             Boolean indicating whether L<HTML::TreeBuilder> will use strict
815             handling of paragraph tags when parsing HTML input. (This corresponds
816             to the C<p_strict> method in the L<HTML::TreeBuilder> module.) Enabled
817             by default.
818              
819             =head2 passthrough_naked_tags
820              
821             Boolean indicating whether tags with no attributes ("naked" tags)
822             should be removed and replaced with their content. By default, this
823             only applies to non-semantic tags such as E<lt>spanE<gt>,
824             E<lt>divE<gt>, etc., but does not apply to semantic tags such as
825             E<lt>strongE<gt>, E<lt>addressE<gt>, etc. To override this behavior
826             and specify the tags that should be considered for passthrough,
827             provide this attribute with a reference to an array of tag names.
828             Defaults to false, but you'll probably want to enable it.
829              
830             =head2 preprocess
831              
832             Code reference that gets invoked after HTML is parsed but before it is
833             converted into wiki markup. The callback is passed two arguments: the
834             C<HTML::WikiConverter> object and a L<HTML::Element> pointing to the
835             root node of the HTML tree created by L<HTML::TreeBuilder>.
836              
837             =head2 slurp
838              
839             Boolean that, if enabled, bypasses C<HTML::Parser>'s incremental
840             parsing (thus I<slurping> the file in all at once) of files when
841             reading HTML files. If L<File::Slurp> is installed, its C<read_file()>
842             function will be used to perform slurping; otherwise, a common Perl
843             idiom will be used for slurping instead. This option is only used if
844             you call C<html2wiki()> with the C<file> argument.
845              
846             =head2 strip_empty_tags
847              
848             Strips elements containing no content (unless those elements
849             legitimately contain no content, such as is the case for C<br> and
850             C<img> tags, for example). Defaults to false.
851              
852             =head2 strip_tags
853              
854             A reference to an array of tags to be removed from the HTML input
855             prior to conversion to wiki markup. Tag names are the same as those
856             used in L<HTML::Element>. Defaults to C<[ '~comment', 'head',
857             'script', 'style' ]>.
858              
859             =head2 user_agent
860              
861             Specifies the L<LWP::UserAgent> object to be used when fetching the
862             URI passed to C<html2wiki()>. If unspecified and C<html2wiki()> is
863             passed a URI, a default user agent will be created.
864              
865             =head2 wiki_uri
866              
867             Takes a URI, regular expression, or coderef (or a reference to an
868             array of elements of these types) used to determine which links are to
869             wiki pages: a link whose C<href> parameter matches C<wiki_uri> will be
870             treated as a link to a wiki page. In addition, C<wiki_uri> will be
871             used to extract the title of the wiki page. The way this is done
872             depends on whether the C<wiki_uri> has been set to a string, regexp,
873             or coderef. The default is C<undef>, meaning that all links will be
874             treated as external links by default.
875              
876             If C<wiki_uri> is a string, it is interpreted as a URI template, and
877             it will be assumed that URIs to wiki pages are created by joining
878             C<wiki_uri> with the wiki page title. For example, the English
879             Wikipedia might use C<"http://en.wikipedia.org/wiki/"> as the value of
880             C<wiki_uri>. Ward's wiki might use C<"http://c2.com/cgi/wiki?">. These
881             examples use an absolute C<wiki_uri>, but a relative URI can be used
882             as well; an absolute URI will be created based on the value of
883             C<base_uri>. For example, the Wikipedia example above can be rewritten
884             using C<base_uri> of C<"http://en.wikipedia.org"> and a C<wiki_uri> of
885             C<"/wiki/">.
886              
887             C<wiki_uri> can also be a regexp that matches URIs to wiki pages and
888             also extracts the page title from them. For example, the English
889             Wikipedia might use
890             C<qr~http://en\.wikipedia\.org/w/index\.php\?title\=([^&]+)~>.
891              
892             C<wiki_uri> can also be a coderef that takes the current
893             C<HTML::WikiConverter> object and a L<URI> object. It should return
894             the title of the wiki page extracted from the URI, or C<undef> if the
895             URI doesn't represent a link to a wiki page.
896              
897             As mentioned above, the C<wiki_uri> attribute can either take a single
898             URI/regexp/coderef element or it may be assigned a reference to an
899             array of any number of these elements. This is useful for wikis that
900             have different ways of creating links to wiki pages. For example, the
901             English Wikipedia might use:
902              
903             my $wc = new HTML::WikiConverter(
904             dialect => 'MediaWiki',
905             wiki_uri => [
906             'http://en.wikipiedia.org/wiki/',
907             sub { pop->query_param('title') } # requires URI::QueryParam
908             ]
909             );
910              
911             =head2 wrap_in_html
912              
913             Helps L<HTML::TreeBuilder> parse HTML fragments by wrapping HTML in
914             C<E<lt>htmlE<gt>> and C<E<lt>/htmlE<gt>> before passing it through
915             C<html2wiki>. Boolean, enabled by default.
916              
917             =cut
918              
919             sub __default_attribute_specs { {
920 5     5   130 base_uri => { type => SCALAR, default => '' },
921             dialect => { type => SCALAR, optional => 0 },
922             encoding => { type => SCALAR, default => 'utf-8' },
923             escape_entities => { type => BOOLEAN, default => 1 },
924             normalize => { type => BOOLEAN, default => 1 },
925             p_strict => { type => BOOLEAN, default => 1 },
926             preprocess => { type => CODEREF | UNDEF, default => undef },
927             strip_empty_tags => { type => BOOLEAN, default => 0 },
928             slurp => { type => BOOLEAN, default => 0 },
929             strip_tags => { type => ARRAYREF, default => [ qw/ ~comment head script style / ] },
930             passthrough_naked_tags => { type => ARRAYREF | BOOLEAN, default => 0 },
931             user_agent => { type => OBJECT | UNDEF, default => undef },
932             wiki_uri => { type => SCALAR | ARRAYREF, default => '' },
933             wrap_in_html => { type => BOOLEAN, default => 1 },
934             } }
935              
936             =head1 ADDING A DIALECT
937              
938             Consult L<HTML::WikiConverter::Dialects> for documentation on how to
939             write your own dialect module for C<HTML::WikiConverter>. Or if you're
940             not up to the task, drop me an email and I'll have a go at it when I
941             get a spare moment.
942              
943             =head1 SEE ALSO
944              
945             L<HTML::Tree>, L<Convert::Wiki>
946              
947             =head1 AUTHOR
948              
949             David J. Iberri, C<< <diberri@cpan.org> >>
950              
951             =head1 BUGS
952              
953             Please report any bugs or feature requests to
954             C<bug-html-wikiconverter at rt.cpan.org>, or through the web interface at
955             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter>.
956             I will be notified, and then you'll automatically be notified of progress on
957             your bug as I make changes.
958              
959             =head1 SUPPORT
960              
961             You can find documentation for this module with the perldoc command.
962              
963             perldoc HTML::WikiConverter
964              
965             You can also look for information at:
966              
967             =over 4
968              
969             =item * AnnoCPAN: Annotated CPAN documentation
970              
971             L<http://annocpan.org/dist/HTML-WikiConverter>
972              
973             =item * CPAN Ratings
974              
975             L<http://cpanratings.perl.org/d/HTML-WikiConverter>
976              
977             =item * RT: CPAN's request tracker
978              
979             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-WikiConverter>
980              
981             =item * Search CPAN
982              
983             L<http://search.cpan.org/dist/HTML-WikiConverter>
984              
985             =back
986              
987             =head1 ACKNOWLEDGEMENTS
988              
989             Thanks to Tatsuhiko Miyagawa for suggesting
990             L<Bundle::HTMLWikiConverter> as well as providing code for the
991             C<available_dialects()> class method.
992              
993             My thanks also goes to Martin Kudlvasr for catching (and fixing!) a
994             bug in the logic of how HTML files were processed.
995              
996             Big thanks to Dave Schaefer for the PbWiki dialect and for the idea
997             behind the new C<attributes()> implementation.
998              
999             =head1 COPYRIGHT & LICENSE
1000              
1001             Copyright (c) David J. Iberri, all rights reserved.
1002              
1003             This program is free software; you can redistribute it and/or modify it
1004             under the same terms as Perl itself.
1005              
1006             =cut
1007              
1008             1;