line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::WikiConverter; |
2
|
2
|
|
|
2
|
|
47486
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
82
|
|
3
|
2
|
|
|
2
|
|
13
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
72
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
2677
|
use Params::Validate ':all'; |
|
2
|
|
|
|
|
26997
|
|
|
2
|
|
|
|
|
465
|
|
6
|
2
|
|
|
2
|
|
2135
|
use HTML::WikiConverter::Normalizer; |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
86
|
|
7
|
2
|
|
|
2
|
|
6554
|
use HTML::TreeBuilder; |
|
2
|
|
|
|
|
28557
|
|
|
2
|
|
|
|
|
33
|
|
8
|
2
|
|
|
2
|
|
100
|
use HTML::Entities; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
172
|
|
9
|
2
|
|
|
2
|
|
13
|
use HTML::Tagset; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
53
|
|
10
|
2
|
|
|
2
|
|
13
|
use File::Spec; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
56
|
|
11
|
2
|
|
|
2
|
|
2348
|
use DirHandle; |
|
2
|
|
|
|
|
5404
|
|
|
2
|
|
|
|
|
62
|
|
12
|
2
|
|
|
2
|
|
5626
|
use Encode; |
|
2
|
|
|
|
|
41938
|
|
|
2
|
|
|
|
|
254
|
|
13
|
2
|
|
|
2
|
|
25
|
use Carp; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
136
|
|
14
|
|
|
|
|
|
|
|
15
|
2
|
|
|
2
|
|
1828
|
use URI::Escape; |
|
2
|
|
|
|
|
3038
|
|
|
2
|
|
|
|
|
159
|
|
16
|
2
|
|
|
2
|
|
2106
|
use URI; |
|
2
|
|
|
|
|
6260
|
|
|
2
|
|
|
|
|
14475
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.68'; |
19
|
|
|
|
|
|
|
our $AUTOLOAD; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 NAME |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
HTML::WikiConverter - Convert HTML to wiki markup |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 SYNOPSIS |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
use HTML::WikiConverter; |
28
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' ); |
29
|
|
|
|
|
|
|
print $wc->html2wiki( html => '<b>text</b>' ), "\n\n"; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# A more complete example |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $html = qq( |
34
|
|
|
|
|
|
|
<p><i>Italic</i>, <b>bold</b>, <span style="font-weight:bold">also bold</span>, etc.</p> |
35
|
|
|
|
|
|
|
); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my @dialects = HTML::WikiConverter->available_dialects; |
38
|
|
|
|
|
|
|
foreach my $dialect ( @dialects ) { |
39
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( dialect => $dialect ); |
40
|
|
|
|
|
|
|
my $wiki = $wc->html2wiki( html => $html ); |
41
|
|
|
|
|
|
|
printf "The %s dialect gives:\n\n%s\n\n", $dialect, $wiki; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head1 DESCRIPTION |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
C<HTML::WikiConverter> is an HTML to wiki converter. It can convert |
47
|
|
|
|
|
|
|
HTML source into a variety of wiki markups, called wiki |
48
|
|
|
|
|
|
|
"dialects". The following dialects are supported: |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
DokuWiki |
51
|
|
|
|
|
|
|
Kwiki |
52
|
|
|
|
|
|
|
MediaWiki |
53
|
|
|
|
|
|
|
MoinMoin |
54
|
|
|
|
|
|
|
Oddmuse |
55
|
|
|
|
|
|
|
PbWiki |
56
|
|
|
|
|
|
|
PhpWiki |
57
|
|
|
|
|
|
|
PmWiki |
58
|
|
|
|
|
|
|
SlipSlap |
59
|
|
|
|
|
|
|
TikiWiki |
60
|
|
|
|
|
|
|
UseMod |
61
|
|
|
|
|
|
|
WakkaWiki |
62
|
|
|
|
|
|
|
WikkaWiki |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Note that while dialects usually produce satisfactory wiki markup, not |
65
|
|
|
|
|
|
|
all features of all dialects are supported. Consult individual |
66
|
|
|
|
|
|
|
dialects' documentation for details of supported features. Suggestions |
67
|
|
|
|
|
|
|
for improvements, especially in the form of patches, are very much |
68
|
|
|
|
|
|
|
appreciated. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 METHODS |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 new |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( dialect => $dialect, %attrs ); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Returns a converter for the specified wiki dialect. Croaks if |
77
|
|
|
|
|
|
|
C<$dialect> is not provided or its dialect module is not installed on |
78
|
|
|
|
|
|
|
your system. Additional attributes may be specified in C<%attrs>; see |
79
|
|
|
|
|
|
|
L</"ATTRIBUTES"> for a complete list. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=cut |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub new { |
84
|
10
|
|
|
10
|
1
|
486937
|
my $pkg = shift; |
85
|
10
|
100
|
|
|
|
53
|
return $pkg->__new_dialect(@_) if $pkg eq __PACKAGE__; |
86
|
|
|
|
|
|
|
|
87
|
5
|
|
|
|
|
16
|
my $self = bless { }, $pkg; |
88
|
5
|
|
|
|
|
27
|
$self->__load_attribute_specs(); |
89
|
5
|
|
|
|
|
32
|
$self->__setup(@_); |
90
|
4
|
|
|
|
|
28
|
return $self; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub __new_dialect { |
94
|
5
|
|
|
5
|
|
26
|
my( $pkg, %opts ) = @_; |
95
|
5
|
50
|
|
|
|
18
|
croak "Required 'dialect' parameter is missing" unless $opts{dialect}; |
96
|
5
|
|
|
|
|
24
|
my @dialect_classes = ( __PACKAGE__.'::'.$opts{dialect}, $opts{dialect} ); |
97
|
5
|
|
|
|
|
9
|
foreach my $dialect_class ( @dialect_classes ) { |
98
|
10
|
100
|
66
|
1
|
|
701
|
return $dialect_class->new( %opts ) if eval "use $dialect_class; 1" or $dialect_class->isa($pkg); |
|
1
|
|
|
1
|
|
484
|
|
|
0
|
|
|
1
|
|
0
|
|
|
0
|
|
|
1
|
|
0
|
|
|
1
|
|
|
1
|
|
662
|
|
|
0
|
|
|
1
|
|
0
|
|
|
0
|
|
|
1
|
|
0
|
|
|
1
|
|
|
1
|
|
732
|
|
|
0
|
|
|
1
|
|
0
|
|
|
0
|
|
|
1
|
|
0
|
|
|
1
|
|
|
|
|
313
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
374
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
337
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
465
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
549
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
460
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
352
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
99
|
|
|
|
|
|
|
} |
100
|
0
|
|
|
|
|
0
|
my $dc_list = join ', ', @dialect_classes; |
101
|
0
|
|
|
|
|
0
|
croak "Dialect '$opts{dialect}' could not be loaded (tried $dc_list). Error: $@"; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub __setup { |
105
|
5
|
|
|
5
|
|
8
|
my $self = shift; |
106
|
5
|
|
|
|
|
25
|
$self->__setup_attributes(@_); |
107
|
4
|
|
|
|
|
23
|
$self->__setup_rules(); |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub __setup_attributes { |
111
|
5
|
|
|
5
|
|
7
|
my $self = shift; |
112
|
5
|
|
|
|
|
23
|
$self->__attrs( {} ); |
113
|
5
|
|
|
|
|
28
|
$self->__load_and_validate_attributes(@_); |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
sub __setup_rules { |
117
|
23
|
|
|
23
|
|
35
|
my $self = shift; |
118
|
23
|
|
|
|
|
115
|
$self->__load_rules(); |
119
|
23
|
|
|
|
|
88
|
$self->__validate_rules(); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
33
|
|
|
33
|
|
126
|
sub __original_attrs { shift->_attr( { internal => 1 }, __original_attrs => @_ ) } |
123
|
583
|
|
|
583
|
|
1732
|
sub __attrs { shift->_attr( { internal => 1 }, __attrs => @_ ) } |
124
|
131
|
|
|
131
|
|
413
|
sub __attrs_changed { shift->_attr( { internal => 1 }, __attrs_changed => @_ ) } |
125
|
203
|
|
|
203
|
|
649
|
sub __root { shift->_attr( { internal => 1 }, __root => @_ ) } |
126
|
515
|
|
|
515
|
|
1787
|
sub __rules { shift->_attr( { internal => 1 }, __rules => @_ ) } |
127
|
631
|
|
|
631
|
|
2306
|
sub __attribute_specs { shift->_attr( { internal => 1 }, __attribute_specs => @_ ) } |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Unsupported attributes |
130
|
0
|
|
|
0
|
0
|
0
|
sub base_url { shift->__no_such( attribute => qw/ base_url base_uri / ) } |
131
|
0
|
|
|
0
|
0
|
0
|
sub wiki_url { shift->__no_such( attribute => qw/ wiki_url wiki_uri / ) } |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub __no_such { |
134
|
0
|
|
|
0
|
|
0
|
my( $self, $thing, $that, $this ) = @_; |
135
|
0
|
|
|
|
|
0
|
croak "'$that' is not a valid $thing. Perhaps you meant '$this'?"; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Pass '{internal=>1}' as first arg for params that aren't attributes |
139
|
|
|
|
|
|
|
sub _attr { |
140
|
2688
|
100
|
|
2688
|
|
8193
|
my( $self, $opts, $param, @value ) = ref $_[1] eq 'HASH' ? @_ : ( +shift, {}, @_ ); |
141
|
2688
|
100
|
|
|
|
5720
|
my $store = $opts->{internal} ? $self : $self->__attrs; |
142
|
|
|
|
|
|
|
|
143
|
2688
|
100
|
|
|
|
6069
|
if( @value ) { |
144
|
296
|
100
|
|
|
|
376
|
eval { validate_pos( @value, $self->__attribute_specs->{$param} ) unless $opts->{internal} }; |
|
296
|
|
|
|
|
826
|
|
145
|
296
|
50
|
|
|
|
647
|
$self->__attribute_error($@) if $@; |
146
|
296
|
|
|
|
|
558
|
$store->{$param} = $value[0]; |
147
|
296
|
100
|
|
|
|
1585
|
$self->__attrs_changed(1) if !$opts->{internal}; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
2688
|
100
|
|
|
|
16339
|
return defined $store->{$param} ? $store->{$param} : ''; |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# Attribute accessors and mutators |
154
|
|
|
|
|
|
|
sub AUTOLOAD { |
155
|
546
|
|
|
546
|
|
1712
|
my $self = shift; |
156
|
546
|
|
|
|
|
2433
|
( my $attr = $AUTOLOAD ) =~ s/.*://; |
157
|
546
|
100
|
|
|
|
1291
|
return $self->_attr( $attr => @_ ) if exists $self->__attribute_specs->{$attr}; |
158
|
1
|
|
|
|
|
195
|
croak "Can't locate method '$attr' in package ".ref($self); |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# So AUTOLOAD doesn't intercept calls to destruction method |
162
|
0
|
|
|
0
|
|
0
|
sub DESTROY { } |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub __slurp { |
165
|
0
|
|
|
0
|
|
0
|
my( $self, $file ) = @_; |
166
|
0
|
|
|
|
|
0
|
eval "use File::Slurp;"; |
167
|
0
|
0
|
|
|
|
0
|
return $self->__simple_slurp($file) if $@; |
168
|
0
|
|
|
|
|
0
|
return scalar File::Slurp::read_file($file); |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub __simple_slurp { |
172
|
0
|
|
|
0
|
|
0
|
my( $self, $file ) = @_; |
173
|
0
|
0
|
|
|
|
0
|
open my $fh, $file or croak "can't open file $file for reading: $!"; |
174
|
0
|
|
|
|
|
0
|
my $text = do { local $/; <$fh> }; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
175
|
0
|
|
|
|
|
0
|
close $fh; |
176
|
0
|
|
|
|
|
0
|
return $text; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 html2wiki |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( $html, %attrs ); |
182
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( html => $html, %attrs ); |
183
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( file => $file, %attrs ); |
184
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( uri => $uri, %attrs ); |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Converts HTML source to wiki markup for the current dialect. Accepts |
187
|
|
|
|
|
|
|
either an HTML string C<$html>, an file C<$file>, or a URI <$uri> to |
188
|
|
|
|
|
|
|
read from. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Attributes assigned in C<%attrs> (see L</"ATTRIBUTES">) will augment |
191
|
|
|
|
|
|
|
or override previously assigned attributes for the duration of the |
192
|
|
|
|
|
|
|
C<html2wiki()> call. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub html2wiki { |
197
|
24
|
|
|
24
|
1
|
278295
|
my $self = shift; |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# Assumes that if @_ is odd-numbered, its first element is html |
200
|
24
|
100
|
|
|
|
152
|
my %args = @_ % 2 ? ( html => +shift, @_ ) : @_; |
201
|
|
|
|
|
|
|
|
202
|
24
|
|
|
|
|
101
|
my %common_arg_errors = ( url => 'uri', base_url => 'base_uri', wiki_url => 'wiki_uri' ); |
203
|
24
|
|
|
|
|
190
|
while( my( $bad, $good ) = each %common_arg_errors ) { |
204
|
72
|
50
|
|
|
|
276
|
$self->__no_such( 'argument to html2wiki()', $bad, $good ) if exists $args{$bad}; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
24
|
|
|
|
|
59
|
my @input_sources = grep { exists $args{$_} } qw/ html file uri /; |
|
72
|
|
|
|
|
169
|
|
208
|
24
|
50
|
|
|
|
69
|
croak "missing 'html', 'file', or 'uri' argument to html2wiki" unless @input_sources; |
209
|
24
|
50
|
|
|
|
68
|
croak "more than one of 'html', 'file', or 'uri' provided, but only one input source allowed" if @input_sources > 1; |
210
|
|
|
|
|
|
|
|
211
|
24
|
|
100
|
|
|
95
|
my $html = delete $args{html} || ''; |
212
|
24
|
|
50
|
|
|
132
|
my $file = delete $args{file} || ''; |
213
|
24
|
|
100
|
|
|
92
|
my $uri = delete $args{uri} || ''; |
214
|
|
|
|
|
|
|
|
215
|
24
|
50
|
33
|
|
|
66
|
$html = $self->__slurp($file) if $file && $self->slurp; |
216
|
24
|
100
|
|
|
|
64
|
$html = $self->__fetch_html_from_uri($uri) if $uri; # may set 'user_agent' attrib, so call before storing attribs |
217
|
24
|
50
|
33
|
|
|
172
|
$html = "<html>$html</html>" if $html and $self->wrap_in_html; |
218
|
|
|
|
|
|
|
|
219
|
24
|
|
|
|
|
62
|
$self->__original_attrs( { %{ $self->__attrs } } ); |
|
24
|
|
|
|
|
55
|
|
220
|
24
|
|
|
|
|
165
|
$self->$_( $args{$_} ) foreach keys %args; |
221
|
24
|
100
|
|
|
|
66
|
$self->__setup_rules() if $self->__attrs_changed; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# Decode into Perl's internal form |
224
|
24
|
|
|
|
|
138
|
$html = decode( $self->encoding, $html ); |
225
|
|
|
|
|
|
|
|
226
|
24
|
|
|
|
|
1640
|
my $tree = new HTML::TreeBuilder(); |
227
|
24
|
|
|
|
|
6047
|
$tree->store_comments(1); |
228
|
24
|
|
|
|
|
324
|
$tree->p_strict( $self->p_strict ); |
229
|
24
|
|
|
|
|
273
|
$tree->implicit_body_p_tag(1); |
230
|
24
|
|
|
|
|
1038
|
$tree->ignore_unknown(0); # <ruby> et al |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# Parse the HTML string or file |
233
|
24
|
50
|
|
|
|
207
|
if( $html ) { |
234
|
24
|
|
|
|
|
80
|
$self->given_html( $html ); |
235
|
24
|
|
|
|
|
324
|
$tree->parse($html); |
236
|
24
|
|
|
|
|
41060
|
$tree->eof(); |
237
|
|
|
|
|
|
|
} else { |
238
|
0
|
|
|
|
|
0
|
$self->given_html( $self->__slurp($file) ); |
239
|
0
|
|
|
|
|
0
|
$tree->parse_file($file); |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Preprocess, save tree and parsed HTML |
243
|
24
|
|
|
|
|
3017
|
$self->__root( $tree ); |
244
|
24
|
|
|
|
|
98
|
$self->__preprocess_tree(); |
245
|
|
|
|
|
|
|
|
246
|
23
|
|
|
|
|
607
|
$self->__root->deobjectify_text(); |
247
|
23
|
|
|
|
|
991
|
$self->parsed_html( $tree->as_HTML(undef, ' ', {}) ); |
248
|
23
|
|
|
|
|
83
|
$self->__root->objectify_text(); |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Convert and preprocess |
251
|
23
|
|
|
|
|
1943
|
my $output = $self->__wikify($tree); |
252
|
23
|
|
|
|
|
97
|
$self->__postprocess_output(\$output); |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# Avoid leaks |
255
|
23
|
|
|
|
|
104
|
$tree->delete(); |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# Return to original encoding |
258
|
23
|
|
|
|
|
1743
|
$output = encode( $self->encoding, $output ); |
259
|
|
|
|
|
|
|
|
260
|
23
|
100
|
|
|
|
1039
|
if( $self->__attrs_changed ) { |
261
|
9
|
|
|
|
|
11
|
$self->__attrs( { %{ $self->__original_attrs } } ); |
|
9
|
|
|
|
|
25
|
|
262
|
9
|
|
|
|
|
37
|
$self->__setup_rules(); |
263
|
9
|
|
|
|
|
44
|
$self->__attrs_changed(0); |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
23
|
|
|
|
|
285
|
return $output; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub __wikify { |
270
|
140
|
|
|
140
|
|
212
|
my( $self, $node ) = @_; |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# Concatenate adjacent text nodes |
273
|
140
|
|
|
|
|
358
|
$node->normalize_content(); |
274
|
|
|
|
|
|
|
|
275
|
140
|
100
|
|
|
|
2054
|
if( $node->tag eq '~text' ) { |
|
|
100
|
|
|
|
|
|
276
|
36
|
|
|
|
|
279
|
return $node->attr('text'); |
277
|
|
|
|
|
|
|
} elsif( $node->tag eq '~comment' ) { |
278
|
1
|
|
|
|
|
12
|
return '<!--' . $node->attr('text') . '-->'; |
279
|
|
|
|
|
|
|
} else { |
280
|
103
|
|
|
|
|
1510
|
my $rules = $self->rules_for_tag( $node->tag ); |
281
|
|
|
|
|
|
|
|
282
|
103
|
100
|
|
|
|
290
|
return $self->__subst($rules->{replace}, $node, $rules) if exists $rules->{replace}; |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# Set private preserve rules |
285
|
98
|
100
|
|
|
|
284
|
if( $rules->{preserve} ) { |
|
|
50
|
|
|
|
|
|
286
|
4
|
50
|
|
|
|
24
|
$rules->{__start} = \&__preserve_start, |
287
|
|
|
|
|
|
|
$rules->{__end} = $rules->{empty} ? undef : '</'.$node->tag.'>'; |
288
|
|
|
|
|
|
|
} elsif( $rules->{passthrough} ) { |
289
|
0
|
|
|
|
|
0
|
$rules->{__start} = ''; |
290
|
0
|
|
|
|
|
0
|
$rules->{__end} = ''; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Recurse |
294
|
98
|
|
|
|
|
307
|
my $output = $self->get_elem_contents($node); |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Unspecified tags have their whitespace preserved (this allows |
297
|
|
|
|
|
|
|
# 'html' and 'body' tags [among others] to keep formatting when |
298
|
|
|
|
|
|
|
# inner tags like 'pre' need to preserve whitespace). |
299
|
98
|
|
100
|
|
|
389
|
my $trim = $rules->{trim} || 'none'; |
300
|
98
|
100
|
66
|
|
|
425
|
$output =~ s/^\s+// if $trim eq 'both' or $trim eq 'leading'; |
301
|
98
|
100
|
66
|
|
|
463
|
$output =~ s/\s+$// if $trim eq 'both' or $trim eq 'trailing'; |
302
|
|
|
|
|
|
|
|
303
|
98
|
|
100
|
|
|
331
|
my $lf = $rules->{line_format} || 'none'; |
304
|
98
|
100
|
|
|
|
213
|
$output =~ s/^\s*\n/\n/gm if $lf ne 'none'; |
305
|
98
|
50
|
|
|
|
389
|
if( $lf eq 'blocks' ) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
306
|
0
|
|
|
|
|
0
|
$output =~ s/\n{3,}/\n\n/g; |
307
|
|
|
|
|
|
|
} elsif( $lf eq 'multi' ) { |
308
|
3
|
|
|
|
|
9
|
$output =~ s/\n{2,}/\n/g; |
309
|
|
|
|
|
|
|
} elsif( $lf eq 'single' ) { |
310
|
0
|
|
|
|
|
0
|
$output =~ s/\n+/ /g; |
311
|
|
|
|
|
|
|
} elsif( $lf eq 'none' ) { |
312
|
|
|
|
|
|
|
# Do nothing |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
# Substitutions |
316
|
98
|
100
|
|
|
|
219
|
$output =~ s/^/$self->__subst($rules->{line_prefix}, $node, $rules)/gem if $rules->{line_prefix}; |
|
4
|
|
|
|
|
12
|
|
317
|
98
|
100
|
|
|
|
198
|
$output = $self->__subst($rules->{__start}, $node, $rules).$output if $rules->{__start}; |
318
|
98
|
100
|
|
|
|
196
|
$output = $output.$self->__subst($rules->{__end}, $node, $rules) if $rules->{__end}; |
319
|
98
|
100
|
|
|
|
238
|
$output = $self->__subst($rules->{start}, $node, $rules).$output if $rules->{start}; |
320
|
98
|
100
|
|
|
|
240
|
$output = $output.$self->__subst($rules->{end}, $node, $rules) if $rules->{end}; |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# If the current element is a block and is contained within |
323
|
|
|
|
|
|
|
# another block element, then we will not block the current |
324
|
|
|
|
|
|
|
# element by default. However, if the current element is a block |
325
|
|
|
|
|
|
|
# and is contained within another block element that specifies a |
326
|
|
|
|
|
|
|
# line_format of 'blocks', then we will block the current element. |
327
|
98
|
100
|
66
|
|
|
265
|
$output = "\n\n$output\n\n" if $rules->{block} && |
|
|
|
66
|
|
|
|
|
328
|
|
|
|
|
|
|
( ! $self->elem_search_lineage( $node, { block => 1 } ) or |
329
|
|
|
|
|
|
|
$self->elem_search_lineage( $node, { line_format => 'blocks' } ) ); |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# ...but they are put on their own line |
332
|
98
|
100
|
100
|
|
|
328
|
$output = "\n$output" if $rules->{block} and $node->parent->look_up( _tag => $node->tag ) and $trim ne 'none'; |
|
|
|
66
|
|
|
|
|
333
|
|
|
|
|
|
|
|
334
|
98
|
|
|
|
|
1411
|
return $output; |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# Deprecated. Instead use elem_search_lineage( $node, { block => 1 } ). |
339
|
|
|
|
|
|
|
sub elem_within_block { |
340
|
0
|
|
|
0
|
0
|
0
|
my( $self, $node ) = @_; |
341
|
0
|
|
|
|
|
0
|
foreach my $n ( $node->lineage ) { |
342
|
0
|
0
|
0
|
|
|
0
|
return $n if $self->rules_for_tag($n->tag || '')->{block}; |
343
|
|
|
|
|
|
|
} |
344
|
0
|
|
|
|
|
0
|
return 0; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=head2 elem_search_lineage |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
my $ancestor = $wc->elem_search_lineage( $node, \%rules ); |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
Searches the lineage of C<$node> and returns the first ancestor node |
352
|
|
|
|
|
|
|
that has rules matching those specified in C<%rules>, or C<undef> if |
353
|
|
|
|
|
|
|
no matching node is found. |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
For example, to find out whether C<$node> has an ancestor with rules |
356
|
|
|
|
|
|
|
matching C<{ block =E<gt>1 }>, one could use: |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
if( $wc->elem_search_lineage( $node, { block => 1 } ) ) { |
359
|
|
|
|
|
|
|
# do something |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=cut |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
sub elem_search_lineage { |
365
|
15
|
|
|
15
|
1
|
27
|
my( $self, $node, $search_rules ) = @_; |
366
|
|
|
|
|
|
|
|
367
|
15
|
|
|
|
|
60
|
foreach my $n ( $node->lineage ) { |
368
|
30
|
|
|
|
|
323
|
my $rules = $self->rules_for_tag( $n->tag ); |
369
|
|
|
|
|
|
|
|
370
|
30
|
|
|
|
|
50
|
my $matched = 1; |
371
|
30
|
|
|
|
|
106
|
while( my($k,$v) = each %$search_rules ) { |
372
|
30
|
|
100
|
|
|
116
|
my $rule_value = $rules->{$k} || ''; |
373
|
30
|
100
|
|
|
|
146
|
$matched = 0 unless $v eq $rule_value; |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
30
|
100
|
|
|
|
131
|
return $n if $matched; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
|
379
|
14
|
|
|
|
|
76
|
return undef; |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
sub __subst { |
383
|
51
|
|
|
51
|
|
93
|
my( $self, $subst, $node, $rules ) = @_; |
384
|
51
|
100
|
|
|
|
185
|
return ref $subst eq 'CODE' ? $subst->( $self, $node, $rules ) : $subst; |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
sub __preserve_start { |
388
|
4
|
|
|
4
|
|
9
|
my( $self, $node, $rules ) = @_; |
389
|
|
|
|
|
|
|
|
390
|
4
|
|
|
|
|
10
|
my $tag = $node->tag; |
391
|
4
|
50
|
|
|
|
35
|
my @attrs = exists $rules->{attributes} ? @{$rules->{attributes}} : ( ); |
|
0
|
|
|
|
|
0
|
|
392
|
4
|
|
|
|
|
18
|
my $attr_str = $self->get_attr_str( $node, @attrs ); |
393
|
4
|
50
|
|
|
|
14
|
my $slash = $rules->{empty} ? ' /' : ''; |
394
|
|
|
|
|
|
|
|
395
|
4
|
50
|
|
|
|
7
|
return '<'.$tag.' '.$attr_str.$slash.'>' if $attr_str; |
396
|
4
|
|
|
|
|
19
|
return '<'.$tag.$slash.'>'; |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# Maps a tag name to its URI attribute |
400
|
|
|
|
|
|
|
my %rel2abs = ( a => 'href', img => 'src' ); |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
my %allowedEmptyTag = ( %HTML::Tagset::emptyElement, '~comment' => 1, '~text' => 1 ); |
403
|
|
|
|
|
|
|
my %isKnownTag = %HTML::Tagset::isKnown; |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
sub __preprocess_tree { |
406
|
24
|
|
|
24
|
|
37
|
my $self = shift; |
407
|
|
|
|
|
|
|
|
408
|
24
|
|
|
|
|
59
|
$self->__root->objectify_text(); |
409
|
|
|
|
|
|
|
|
410
|
24
|
|
|
|
|
9209
|
$self->preprocess_tree($self->__root); |
411
|
|
|
|
|
|
|
|
412
|
24
|
50
|
|
|
|
129
|
HTML::WikiConverter::Normalizer->new->normalize($self->__root) if $self->normalize; |
413
|
|
|
|
|
|
|
|
414
|
23
|
50
|
|
|
|
213
|
my %strip_tag = map { $_ => 1 } @{ $self->strip_tags || [] }; |
|
71
|
|
|
|
|
184
|
|
|
23
|
|
|
|
|
126
|
|
415
|
23
|
|
|
|
|
276
|
my %passthrough_naked_tags = map { $_ => 1 } $self->__passthrough_naked_tags; |
|
0
|
|
|
|
|
0
|
|
416
|
|
|
|
|
|
|
|
417
|
23
|
|
|
|
|
59
|
foreach my $node ( $self->__root->descendents ) { |
418
|
110
|
100
|
|
|
|
5498
|
$node->tag('') unless $node->tag; |
419
|
110
|
100
|
|
|
|
915
|
$node->delete, next if $strip_tag{$node->tag}; |
420
|
94
|
50
|
33
|
|
|
776
|
$node->replace_with_content->delete, next if $passthrough_naked_tags{$node->tag} and !$node->all_external_attr_names; |
421
|
94
|
|
|
|
|
721
|
$self->__rm_invalid_text($node); |
422
|
94
|
100
|
100
|
|
|
421
|
$node->delete, next if $self->strip_empty_tags and !$allowedEmptyTag{$node->tag} and $self->__elem_is_empty($node); |
|
|
|
100
|
|
|
|
|
423
|
84
|
100
|
100
|
|
|
345
|
$self->__encode_entities($node) if $node->tag eq '~text' and $self->escape_entities; |
424
|
84
|
100
|
100
|
|
|
1095
|
$self->__rel2abs($node) if $self->base_uri and exists $rel2abs{$node->tag}; |
425
|
84
|
|
|
|
|
11651
|
$self->preprocess_node($node); |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# Reobjectify in case preprocessing added new text |
429
|
23
|
|
|
|
|
169
|
$self->__root->objectify_text(); |
430
|
|
|
|
|
|
|
|
431
|
23
|
100
|
|
|
|
764
|
$self->preprocess->( $self->__root ) if ref $self->preprocess; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
sub __passthrough_naked_tags { |
435
|
23
|
|
|
23
|
|
42
|
my $self = shift; |
436
|
|
|
|
|
|
|
|
437
|
23
|
|
|
|
|
28
|
my @tags; |
438
|
23
|
50
|
|
|
|
105
|
if( ref $self->passthrough_naked_tags eq 'ARRAY' ) { |
|
|
50
|
|
|
|
|
|
439
|
0
|
|
|
|
|
0
|
@tags = @{ $self->passthrough_naked_tags }; |
|
0
|
|
|
|
|
0
|
|
440
|
|
|
|
|
|
|
} elsif( $self->passthrough_naked_tags ) { |
441
|
0
|
|
|
|
|
0
|
@tags = $self->__default_passthrough_naked_tags; |
442
|
|
|
|
|
|
|
} else { |
443
|
23
|
|
|
|
|
36
|
@tags = ( ); |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
23
|
|
|
|
|
106
|
return @tags; |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# (bug #28402) |
450
|
0
|
|
|
0
|
|
0
|
sub __default_passthrough_naked_tags { qw/ tbody thead span div font / } |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
sub __elem_is_empty { |
453
|
26
|
|
|
26
|
|
293
|
my( $self, $node ) = @_; |
454
|
26
|
|
|
|
|
64
|
my $content = $self->get_elem_contents($node); |
455
|
26
|
100
|
66
|
|
|
145
|
my $has_nonwhitespace = $content && length $content ? $content =~ /\S/ : 0; |
456
|
26
|
|
|
|
|
148
|
return !$has_nonwhitespace; |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub __fetch_html_from_uri { |
460
|
1
|
|
|
1
|
|
3
|
my( $self, $uri ) = @_; |
461
|
1
|
|
|
|
|
9
|
my $ua = $self->__user_agent; |
462
|
1
|
|
|
|
|
7
|
my $res = $ua->get($uri); |
463
|
1
|
50
|
|
|
|
628118
|
croak "request for <$uri> failed" unless $res->is_success; |
464
|
1
|
|
50
|
|
|
30
|
my $encoding = $self->encoding || $self->__guess_encoding($res) || 'utf-8'; |
465
|
1
|
|
|
|
|
8
|
my $html = encode( $self->encoding, decode( $encoding, $res->content ) ); |
466
|
1
|
|
|
|
|
2403
|
return $html; |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
sub __guess_encoding { |
470
|
0
|
|
|
0
|
|
0
|
my( $self, $res ) = @_; |
471
|
0
|
0
|
|
|
|
0
|
carp "LWP::Charset is not installed but is required for determining the charset claimed by the content at the requested URI", return |
472
|
|
|
|
|
|
|
unless eval "use LWP::Charset; 1"; |
473
|
0
|
|
|
|
|
0
|
return LWP::Charset::getCharset($res); |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub __user_agent { |
477
|
1
|
|
|
1
|
|
4
|
my $self = shift; |
478
|
1
|
50
|
|
|
|
12
|
$self->user_agent( $self->__default_user_agent ) unless $self->user_agent; |
479
|
1
|
|
|
|
|
9
|
return $self->user_agent; |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
sub __default_user_agent { |
483
|
1
|
50
|
|
1
|
|
289
|
croak "LWP is not installed but is required for fetching URIs" unless eval "use LWP::UserAgent; 1"; |
|
1
|
|
|
1
|
|
13
|
|
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
19
|
|
484
|
1
|
|
|
|
|
10
|
return LWP::UserAgent->new( agent => shift->__default_ua_string ); |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
1
|
|
|
1
|
|
11
|
sub __default_ua_string { "html2wiki/$VERSION" } |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
# Encodes high-bit and control chars in node's text to HTML entities. |
490
|
|
|
|
|
|
|
sub __encode_entities { |
491
|
19
|
|
|
19
|
|
76
|
my( $self, $node ) = @_; |
492
|
19
|
50
|
|
|
|
66
|
my $text = defined $node->attr('text') ? $node->attr('text') : ''; |
493
|
19
|
|
|
|
|
441
|
encode_entities( $text, '<>&' ); |
494
|
19
|
|
|
|
|
1254
|
$node->attr( text => $text ); |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# Convert relative to absolute URIs |
498
|
|
|
|
|
|
|
sub __rel2abs { |
499
|
5
|
|
|
5
|
|
67
|
my( $self, $node ) = @_; |
500
|
5
|
|
|
|
|
16
|
my $attr = $rel2abs{$node->tag}; |
501
|
5
|
50
|
|
|
|
43
|
return unless $node->attr($attr); # don't add attribute if it's not already there |
502
|
5
|
|
|
|
|
77
|
$node->attr( $attr => uri_unescape( URI->new_abs( $node->attr($attr), $self->base_uri )->as_string ) ); |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# Removes text nodes directly inside container elements. |
506
|
|
|
|
|
|
|
my %containers = map { $_ => 1 } qw/ table tbody tr ul ol dl menu /; |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
sub __rm_invalid_text { |
509
|
94
|
|
|
94
|
|
140
|
my( $self, $node ) = @_; |
510
|
94
|
50
|
|
|
|
221
|
my $tag = defined $node->tag ? $node->tag : ''; |
511
|
94
|
50
|
|
|
|
1467
|
if( $containers{$tag} ) { |
512
|
0
|
|
|
|
|
0
|
$_->delete for grep { $_->tag eq '~text' } $node->content_list; |
|
0
|
|
|
|
|
0
|
|
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub strip_aname { |
517
|
0
|
|
|
0
|
0
|
0
|
my( $self, $node ) = @_; |
518
|
0
|
0
|
|
|
|
0
|
return if $node->attr('href'); |
519
|
0
|
|
|
|
|
0
|
$node->replace_with_content->delete(); |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
sub caption2para { |
523
|
0
|
|
|
0
|
0
|
0
|
my( $self, $node ) = @_; |
524
|
0
|
|
|
|
|
0
|
my $table = $node->parent; |
525
|
0
|
|
|
|
|
0
|
$node->detach(); |
526
|
0
|
|
|
|
|
0
|
$table->preinsert($node); |
527
|
0
|
|
|
|
|
0
|
$node->tag('p'); |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
|
530
|
24
|
|
|
24
|
0
|
46
|
sub preprocess_tree { } |
531
|
84
|
|
|
84
|
0
|
178
|
sub preprocess_node { } |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
sub __postprocess_output { |
534
|
23
|
|
|
23
|
|
38
|
my( $self, $outref ) = @_; |
535
|
23
|
|
|
|
|
143
|
$$outref =~ s/\n[\s^\n]+\n/\n\n/g; # XXX this is causing bug 14527 |
536
|
23
|
|
|
|
|
122
|
$$outref =~ s/\n{2,}/\n\n/g; |
537
|
23
|
|
|
|
|
75
|
$$outref =~ s/^\n+//; |
538
|
23
|
|
|
|
|
96
|
$$outref =~ s/\s+$//; |
539
|
23
|
|
|
|
|
56
|
$$outref =~ s/[ \t]+$//gm; |
540
|
23
|
|
|
|
|
69
|
$self->postprocess_output($outref); |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
|
543
|
23
|
|
|
23
|
0
|
42
|
sub postprocess_output { } |
544
|
|
|
|
|
|
|
|
545
|
0
|
|
|
0
|
0
|
0
|
sub attributes { {} } |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
sub __load_attribute_specs { |
548
|
5
|
|
|
5
|
|
19
|
my $self = shift; |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
# Get default attribute specs |
551
|
5
|
|
|
|
|
29
|
my $default_specs = $self->__default_attribute_specs; |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# Get dialect attribute specs |
554
|
5
|
|
|
|
|
24
|
my @dialect_specs = $self->attributes; |
555
|
5
|
50
|
33
|
|
|
64
|
my $dialect_specs = @dialect_specs == 1 && ref $dialect_specs[0] eq 'HASH' ? $dialect_specs[0] : {@dialect_specs}; |
556
|
|
|
|
|
|
|
|
557
|
5
|
|
|
|
|
53
|
my %attr_specs = %$default_specs; |
558
|
5
|
|
|
|
|
29
|
while( my( $attr, $spec ) = each %$dialect_specs ) { |
559
|
13
|
|
|
|
|
47
|
$attr_specs{$attr} = $spec; |
560
|
|
|
|
|
|
|
} |
561
|
|
|
|
|
|
|
|
562
|
5
|
|
|
|
|
31
|
$self->__attribute_specs( \%attr_specs ); |
563
|
|
|
|
|
|
|
} |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
sub __load_and_validate_attributes { |
566
|
5
|
|
|
5
|
|
8
|
my $self = shift; |
567
|
|
|
|
|
|
|
|
568
|
5
|
|
|
|
|
8
|
my %attrs = eval { validate( @_, $self->__attribute_specs ) }; |
|
5
|
|
|
|
|
13
|
|
569
|
5
|
100
|
|
|
|
317
|
$self->__attribute_error($@) if $@; |
570
|
|
|
|
|
|
|
|
571
|
4
|
|
|
|
|
17
|
while( my( $attr, $value ) = each %attrs ) { |
572
|
64
|
|
|
|
|
434
|
$self->$attr($value); |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
sub __attribute_error { |
577
|
1
|
|
|
1
|
|
2
|
my( $self, $error ) = @_; |
578
|
|
|
|
|
|
|
# Validating attributes failed, so we don't have access to the |
579
|
|
|
|
|
|
|
# 'dialect' attribute; obtain it from the package name instead |
580
|
1
|
|
|
|
|
3
|
( my $dialect = ref $self ) =~ s/.*://; |
581
|
|
|
|
|
|
|
|
582
|
1
|
50
|
|
|
|
10
|
$error = sprintf "The attribute '%s' does not exist in the dialect '%s'.", $1, $dialect |
583
|
|
|
|
|
|
|
if $error =~ /not listed in the validation options\: (\w+)/; |
584
|
|
|
|
|
|
|
|
585
|
1
|
|
|
|
|
172
|
croak $error; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
0
|
|
|
0
|
0
|
0
|
sub rules { {} } |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
sub __load_rules { |
591
|
23
|
|
|
23
|
|
38
|
my $self = shift; |
592
|
23
|
|
|
|
|
80
|
$self->__rules( $self->rules ); |
593
|
|
|
|
|
|
|
} |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# Rules for validating rules |
596
|
|
|
|
|
|
|
my %meta_rules = ( |
597
|
|
|
|
|
|
|
trim => { range => [ qw/ none both leading trailing / ] }, |
598
|
|
|
|
|
|
|
line_format => { range => [ qw/ none single multi blocks / ] }, |
599
|
|
|
|
|
|
|
replace => { singleton => 1 }, |
600
|
|
|
|
|
|
|
alias => { singleton => 1 }, |
601
|
|
|
|
|
|
|
attributes => { depends => [ qw/ preserve / ] }, |
602
|
|
|
|
|
|
|
empty => { depends => [ qw/ preserve / ] }, |
603
|
|
|
|
|
|
|
passthrough => { singleton => 1 }, |
604
|
|
|
|
|
|
|
); |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
sub __validate_rules { |
607
|
23
|
|
|
23
|
|
33
|
my $self = shift; |
608
|
|
|
|
|
|
|
|
609
|
23
|
|
|
|
|
27
|
foreach my $tag ( keys %{ $self->__rules } ) { |
|
23
|
|
|
|
|
40
|
|
610
|
185
|
|
|
|
|
351
|
my $rules = $self->__rules->{$tag}; |
611
|
|
|
|
|
|
|
|
612
|
185
|
|
|
|
|
501
|
foreach my $opt ( keys %$rules ) { |
613
|
280
|
100
|
|
|
|
793
|
my $spec = $meta_rules{$opt} or next; |
614
|
|
|
|
|
|
|
|
615
|
96
|
|
100
|
|
|
242
|
my $singleton = $spec->{singleton} || 0; |
616
|
96
|
50
|
|
|
|
246
|
my @disallows = ref $spec->{disallows} eq 'ARRAY' ? @{ $spec->{disallows} } : ( ); |
|
0
|
|
|
|
|
0
|
|
617
|
96
|
50
|
|
|
|
187
|
my @depends = ref $spec->{depends} eq 'ARRAY' ? @{ $spec->{depends} } : ( ); |
|
0
|
|
|
|
|
0
|
|
618
|
96
|
100
|
|
|
|
193
|
my @range = ref $spec->{range} eq 'ARRAY' ? @{ $spec->{range} } : ( ); |
|
30
|
|
|
|
|
76
|
|
619
|
96
|
|
|
|
|
144
|
my %range = map { $_ => 1 } @range; |
|
120
|
|
|
|
|
247
|
|
620
|
|
|
|
|
|
|
|
621
|
96
|
50
|
66
|
|
|
423
|
$self->__rule_error( $tag, "'$opt' cannot be combined with any other option" ) |
622
|
|
|
|
|
|
|
if $singleton and keys %$rules != 1; |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
$rules->{$_} && $self->__rule_error( $tag, "'$opt' cannot be combined with '$_'" ) |
625
|
96
|
|
0
|
|
|
177
|
foreach @disallows; |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
! $rules->{$_} && $self->__rule_error( $tag, "'$opt' must be combined with '$_'" ) |
628
|
96
|
|
0
|
|
|
137
|
foreach @depends; |
629
|
|
|
|
|
|
|
|
630
|
96
|
50
|
66
|
|
|
520
|
$self->__rule_error( $tag, "Unknown '$opt' value '$rules->{$opt}'. '$opt' must be one of ", join(', ', map { "'$_'" } @range) ) |
|
0
|
|
|
|
|
0
|
|
631
|
|
|
|
|
|
|
if @range and ! exists $range{$rules->{$opt}}; |
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
sub __rule_error { |
637
|
0
|
|
|
0
|
|
0
|
my( $self, $tag, @msg ) = @_; |
638
|
0
|
|
|
|
|
0
|
my $dialect = ref $self; |
639
|
0
|
|
|
|
|
0
|
croak @msg, " in tag '$tag', dialect '$dialect'.\n"; |
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub get_elem_contents { |
643
|
128
|
|
|
128
|
0
|
187
|
my( $self, $node ) = @_; |
644
|
128
|
|
|
|
|
344
|
my $str = join '', map { $self->__wikify($_) } $node->content_list; |
|
117
|
|
|
|
|
817
|
|
645
|
128
|
50
|
|
|
|
959
|
return defined $str ? $str : ''; |
646
|
|
|
|
|
|
|
} |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
sub get_wiki_page { |
649
|
5
|
|
|
5
|
0
|
111
|
my( $self, $uri ) = @_; |
650
|
5
|
50
|
|
|
|
30
|
my @wiki_uris = ref $self->wiki_uri eq 'ARRAY' ? @{$self->wiki_uri} : $self->wiki_uri; |
|
5
|
|
|
|
|
23
|
|
651
|
5
|
|
|
|
|
20
|
foreach my $wiki_uri ( @wiki_uris ) { |
652
|
12
|
|
|
|
|
34
|
my $page = $self->__extract_wiki_page( $uri, $wiki_uri ); |
653
|
12
|
100
|
|
|
|
451
|
return $page if $page; |
654
|
|
|
|
|
|
|
} |
655
|
|
|
|
|
|
|
|
656
|
2
|
|
|
|
|
7
|
return undef; |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
sub __extract_wiki_page { |
660
|
12
|
|
|
12
|
|
20
|
my( $self, $uri, $wiki_uri ) = @_; |
661
|
12
|
50
|
|
|
|
24
|
return undef unless $wiki_uri; |
662
|
|
|
|
|
|
|
|
663
|
12
|
50
|
|
|
|
39
|
if( ref $wiki_uri eq 'Regexp' ) { |
|
|
100
|
|
|
|
|
|
664
|
0
|
0
|
|
|
|
0
|
return $uri =~ $wiki_uri ? $1 : undef; |
665
|
|
|
|
|
|
|
} elsif( ref $wiki_uri eq 'CODE' ) { |
666
|
3
|
|
|
|
|
11
|
return $wiki_uri->( $self, URI->new($uri) ); |
667
|
|
|
|
|
|
|
} else { |
668
|
|
|
|
|
|
|
# Ensure $wiki_uri is absolute |
669
|
9
|
|
|
|
|
42
|
$wiki_uri = URI->new_abs( $wiki_uri, $self->base_uri )->as_string; |
670
|
|
|
|
|
|
|
|
671
|
9
|
100
|
|
|
|
871
|
return undef unless index( $uri, $wiki_uri ) == 0; |
672
|
2
|
50
|
|
|
|
10
|
return undef unless length $uri > length $wiki_uri; |
673
|
2
|
|
|
|
|
10
|
return substr( $uri, length $wiki_uri ); |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
} |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
# Adapted from Kwiki source |
678
|
|
|
|
|
|
|
my $UPPER = '\p{UppercaseLetter}'; |
679
|
|
|
|
|
|
|
my $LOWER = '\p{LowercaseLetter}'; |
680
|
|
|
|
|
|
|
my $WIKIWORD = "$UPPER$LOWER\\p{Number}\\p{ConnectorPunctuation}"; |
681
|
|
|
|
|
|
|
|
682
|
0
|
|
|
0
|
0
|
0
|
sub is_camel_case { return $_[1] =~ /(?:[$UPPER](?=[$WIKIWORD]*[$UPPER])(?=[$WIKIWORD]*[$LOWER])[$WIKIWORD]+)/ } |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
sub get_attr_str { |
685
|
4
|
|
|
4
|
0
|
8
|
my( $self, $node, @attrs ) = @_; |
686
|
4
|
|
|
|
|
9
|
my %attrs = map { $_ => $node->attr($_) } @attrs; |
|
0
|
|
|
|
|
0
|
|
687
|
4
|
|
|
|
|
9
|
my $str = join ' ', map { $_.'="'.encode_entities($attrs{$_}).'"' } grep { $attrs{$_} } @attrs; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
# (bug #19046) partial fix: attributes must be contained on a single line |
690
|
4
|
50
|
|
|
|
9
|
$str =~ s/[\n\r]/ /g if $str; |
691
|
|
|
|
|
|
|
|
692
|
4
|
50
|
|
|
|
16
|
return defined $str ? $str : ''; |
693
|
|
|
|
|
|
|
} |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=head2 given_html |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
my $html = $wc->given_html; |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
Returns the HTML passed to or fetched (ie, from a file or URI) by the |
700
|
|
|
|
|
|
|
last C<html2wiki()> method call. Useful for debugging. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=cut |
703
|
|
|
|
|
|
|
|
704
|
24
|
|
|
24
|
1
|
93
|
sub given_html { shift->_attr( { internal => 1 }, __given_html => @_ ) } |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=head2 parsed_html |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
my $parsed_html = $wc->parsed_html; |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
Returns a string containing the post-processed HTML from the last |
711
|
|
|
|
|
|
|
C<html2wiki> call. Post-processing includes parsing by |
712
|
|
|
|
|
|
|
L<HTML::TreeBuilder>, CSS normalization by |
713
|
|
|
|
|
|
|
L<HTML::WikiConverter::Normalizer>, and calls to the C<preprocess> and |
714
|
|
|
|
|
|
|
C<preprocess_tree> dialect methods. |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
=cut |
717
|
|
|
|
|
|
|
|
718
|
23
|
|
|
23
|
1
|
18412
|
sub parsed_html { shift->_attr( { internal => 1 }, __parsed_html => @_ ) } |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=head2 available_dialects |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
my @dialects = HTML::WikiConverter->available_dialects; |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
Returns a list of all available dialects by searching the directories |
725
|
|
|
|
|
|
|
in C<@INC> for C<HTML::WikiConverter::> modules. |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=cut |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
sub available_dialects { |
730
|
0
|
|
|
0
|
1
|
0
|
my @dialects; |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
my %seen; |
733
|
0
|
|
|
|
|
0
|
for my $inc ( @INC ) { |
734
|
0
|
|
|
|
|
0
|
my $dir = File::Spec->catfile( $inc, 'HTML', 'WikiConverter' ); |
735
|
0
|
0
|
|
|
|
0
|
my $dh = DirHandle->new( $dir ) or next; |
736
|
0
|
|
|
|
|
0
|
while ( my $f = $dh->read ) { |
737
|
0
|
0
|
|
|
|
0
|
next unless $f =~ /^(\w+)\.pm$/; |
738
|
0
|
|
|
|
|
0
|
my $dialect = $1; |
739
|
|
|
|
|
|
|
|
740
|
0
|
0
|
|
|
|
0
|
next if $seen{$dialect}++; |
741
|
0
|
0
|
0
|
|
|
0
|
next if $dialect eq 'Normalizer' or $dialect eq 'WebApp'; |
742
|
|
|
|
|
|
|
|
743
|
0
|
|
|
|
|
0
|
push @dialects, $dialect; |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
|
747
|
0
|
0
|
|
|
|
0
|
return wantarray ? sort @dialects : @dialects; |
748
|
|
|
|
|
|
|
} |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=head2 rules_for_tag |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
my $rules = $wc->rules_for_tag( $tag ); |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
Returns the rules that will be used for converting elements of the |
755
|
|
|
|
|
|
|
given tag. Follows C<alias> references. Note that the rules used for a |
756
|
|
|
|
|
|
|
particular tag may depend on the current set of attributes being used. |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=cut |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
sub rules_for_tag { |
761
|
133
|
|
|
133
|
1
|
895
|
my( $self, $tag ) = @_; |
762
|
133
|
|
|
|
|
294
|
my $rules = $self->__rules_for_tag($tag); |
763
|
133
|
100
|
|
|
|
422
|
return $rules->{alias} ? $self->__rules_for_tag( $rules->{alias} ) : $rules; |
764
|
|
|
|
|
|
|
} |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
sub __rules_for_tag { |
767
|
142
|
|
|
142
|
|
195
|
my( $self, $tag ) = @_; |
768
|
142
|
100
|
|
|
|
276
|
return $self->__rules->{$tag} if $self->__rules->{$tag}; |
769
|
91
|
50
|
66
|
|
|
255
|
return $self->__rules->{UNKNOWN} if $self->__rules->{UNKNOWN} and !$isKnownTag{$tag}; |
770
|
91
|
|
|
|
|
337
|
return { }; |
771
|
|
|
|
|
|
|
} |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
You may configure C<HTML::WikiConverter> using a number of |
776
|
|
|
|
|
|
|
attributes. These may be passed as arguments to the C<new> |
777
|
|
|
|
|
|
|
constructor, or can be called as object methods on an H::WC object. |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Some dialects allow other attributes in addition to those below, and |
780
|
|
|
|
|
|
|
may override the attributes' default values. Consult the dialect's |
781
|
|
|
|
|
|
|
documentation for details. |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=head2 base_uri |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
URI to use for converting relative URIs to absolute ones. This |
786
|
|
|
|
|
|
|
effectively ensures that the C<src> and C<href> attributes of image |
787
|
|
|
|
|
|
|
and anchor tags, respectively, are absolute before converting the HTML |
788
|
|
|
|
|
|
|
to wiki markup, which is necessary for wiki dialects that handle |
789
|
|
|
|
|
|
|
internal and external links separately. Relative URIs are only |
790
|
|
|
|
|
|
|
converted to absolute ones if the C<base_uri> argument is |
791
|
|
|
|
|
|
|
present. Defaults to C<undef>. |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
=head2 dialect |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
(Required) Dialect to use for converting HTML into wiki markup. See |
796
|
|
|
|
|
|
|
the L</"DESCRIPTION"> section above for a list of dialects. C<new()> |
797
|
|
|
|
|
|
|
will fail if the dialect given is not installed on your system. Use |
798
|
|
|
|
|
|
|
C<available_dialects()> to list installed dialects. |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
=head2 encoding |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Specifies the encoding used by the HTML to be converted. Also |
803
|
|
|
|
|
|
|
determines the encoding of the wiki markup returned by the |
804
|
|
|
|
|
|
|
C<html2wiki> method. Defaults to C<"utf8">. |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=head2 escape_entities |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
Passing C<escape_entities> a true value uses L<HTML::Entities> to |
809
|
|
|
|
|
|
|
encode potentially unsafe 'E<lt>', 'E<gt>', and 'E<amp>' characters. |
810
|
|
|
|
|
|
|
Defaults to true. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=head2 p_strict |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
Boolean indicating whether L<HTML::TreeBuilder> will use strict |
815
|
|
|
|
|
|
|
handling of paragraph tags when parsing HTML input. (This corresponds |
816
|
|
|
|
|
|
|
to the C<p_strict> method in the L<HTML::TreeBuilder> module.) Enabled |
817
|
|
|
|
|
|
|
by default. |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 passthrough_naked_tags |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Boolean indicating whether tags with no attributes ("naked" tags) |
822
|
|
|
|
|
|
|
should be removed and replaced with their content. By default, this |
823
|
|
|
|
|
|
|
only applies to non-semantic tags such as E<lt>spanE<gt>, |
824
|
|
|
|
|
|
|
E<lt>divE<gt>, etc., but does not apply to semantic tags such as |
825
|
|
|
|
|
|
|
E<lt>strongE<gt>, E<lt>addressE<gt>, etc. To override this behavior |
826
|
|
|
|
|
|
|
and specify the tags that should be considered for passthrough, |
827
|
|
|
|
|
|
|
provide this attribute with a reference to an array of tag names. |
828
|
|
|
|
|
|
|
Defaults to false, but you'll probably want to enable it. |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=head2 preprocess |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
Code reference that gets invoked after HTML is parsed but before it is |
833
|
|
|
|
|
|
|
converted into wiki markup. The callback is passed two arguments: the |
834
|
|
|
|
|
|
|
C<HTML::WikiConverter> object and a L<HTML::Element> pointing to the |
835
|
|
|
|
|
|
|
root node of the HTML tree created by L<HTML::TreeBuilder>. |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=head2 slurp |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
Boolean that, if enabled, bypasses C<HTML::Parser>'s incremental |
840
|
|
|
|
|
|
|
parsing (thus I<slurping> the file in all at once) of files when |
841
|
|
|
|
|
|
|
reading HTML files. If L<File::Slurp> is installed, its C<read_file()> |
842
|
|
|
|
|
|
|
function will be used to perform slurping; otherwise, a common Perl |
843
|
|
|
|
|
|
|
idiom will be used for slurping instead. This option is only used if |
844
|
|
|
|
|
|
|
you call C<html2wiki()> with the C<file> argument. |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=head2 strip_empty_tags |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Strips elements containing no content (unless those elements |
849
|
|
|
|
|
|
|
legitimately contain no content, such as is the case for C<br> and |
850
|
|
|
|
|
|
|
C<img> tags, for example). Defaults to false. |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=head2 strip_tags |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
A reference to an array of tags to be removed from the HTML input |
855
|
|
|
|
|
|
|
prior to conversion to wiki markup. Tag names are the same as those |
856
|
|
|
|
|
|
|
used in L<HTML::Element>. Defaults to C<[ '~comment', 'head', |
857
|
|
|
|
|
|
|
'script', 'style' ]>. |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
=head2 user_agent |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
Specifies the L<LWP::UserAgent> object to be used when fetching the |
862
|
|
|
|
|
|
|
URI passed to C<html2wiki()>. If unspecified and C<html2wiki()> is |
863
|
|
|
|
|
|
|
passed a URI, a default user agent will be created. |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head2 wiki_uri |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
Takes a URI, regular expression, or coderef (or a reference to an |
868
|
|
|
|
|
|
|
array of elements of these types) used to determine which links are to |
869
|
|
|
|
|
|
|
wiki pages: a link whose C<href> parameter matches C<wiki_uri> will be |
870
|
|
|
|
|
|
|
treated as a link to a wiki page. In addition, C<wiki_uri> will be |
871
|
|
|
|
|
|
|
used to extract the title of the wiki page. The way this is done |
872
|
|
|
|
|
|
|
depends on whether the C<wiki_uri> has been set to a string, regexp, |
873
|
|
|
|
|
|
|
or coderef. The default is C<undef>, meaning that all links will be |
874
|
|
|
|
|
|
|
treated as external links by default. |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
If C<wiki_uri> is a string, it is interpreted as a URI template, and |
877
|
|
|
|
|
|
|
it will be assumed that URIs to wiki pages are created by joining |
878
|
|
|
|
|
|
|
C<wiki_uri> with the wiki page title. For example, the English |
879
|
|
|
|
|
|
|
Wikipedia might use C<"http://en.wikipedia.org/wiki/"> as the value of |
880
|
|
|
|
|
|
|
C<wiki_uri>. Ward's wiki might use C<"http://c2.com/cgi/wiki?">. These |
881
|
|
|
|
|
|
|
examples use an absolute C<wiki_uri>, but a relative URI can be used |
882
|
|
|
|
|
|
|
as well; an absolute URI will be created based on the value of |
883
|
|
|
|
|
|
|
C<base_uri>. For example, the Wikipedia example above can be rewritten |
884
|
|
|
|
|
|
|
using C<base_uri> of C<"http://en.wikipedia.org"> and a C<wiki_uri> of |
885
|
|
|
|
|
|
|
C<"/wiki/">. |
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
C<wiki_uri> can also be a regexp that matches URIs to wiki pages and |
888
|
|
|
|
|
|
|
also extracts the page title from them. For example, the English |
889
|
|
|
|
|
|
|
Wikipedia might use |
890
|
|
|
|
|
|
|
C<qr~http://en\.wikipedia\.org/w/index\.php\?title\=([^&]+)~>. |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
C<wiki_uri> can also be a coderef that takes the current |
893
|
|
|
|
|
|
|
C<HTML::WikiConverter> object and a L<URI> object. It should return |
894
|
|
|
|
|
|
|
the title of the wiki page extracted from the URI, or C<undef> if the |
895
|
|
|
|
|
|
|
URI doesn't represent a link to a wiki page. |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
As mentioned above, the C<wiki_uri> attribute can either take a single |
898
|
|
|
|
|
|
|
URI/regexp/coderef element or it may be assigned a reference to an |
899
|
|
|
|
|
|
|
array of any number of these elements. This is useful for wikis that |
900
|
|
|
|
|
|
|
have different ways of creating links to wiki pages. For example, the |
901
|
|
|
|
|
|
|
English Wikipedia might use: |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( |
904
|
|
|
|
|
|
|
dialect => 'MediaWiki', |
905
|
|
|
|
|
|
|
wiki_uri => [ |
906
|
|
|
|
|
|
|
'http://en.wikipiedia.org/wiki/', |
907
|
|
|
|
|
|
|
sub { pop->query_param('title') } # requires URI::QueryParam |
908
|
|
|
|
|
|
|
] |
909
|
|
|
|
|
|
|
); |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
=head2 wrap_in_html |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
Helps L<HTML::TreeBuilder> parse HTML fragments by wrapping HTML in |
914
|
|
|
|
|
|
|
C<E<lt>htmlE<gt>> and C<E<lt>/htmlE<gt>> before passing it through |
915
|
|
|
|
|
|
|
C<html2wiki>. Boolean, enabled by default. |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
=cut |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
sub __default_attribute_specs { { |
920
|
5
|
|
|
5
|
|
130
|
base_uri => { type => SCALAR, default => '' }, |
921
|
|
|
|
|
|
|
dialect => { type => SCALAR, optional => 0 }, |
922
|
|
|
|
|
|
|
encoding => { type => SCALAR, default => 'utf-8' }, |
923
|
|
|
|
|
|
|
escape_entities => { type => BOOLEAN, default => 1 }, |
924
|
|
|
|
|
|
|
normalize => { type => BOOLEAN, default => 1 }, |
925
|
|
|
|
|
|
|
p_strict => { type => BOOLEAN, default => 1 }, |
926
|
|
|
|
|
|
|
preprocess => { type => CODEREF | UNDEF, default => undef }, |
927
|
|
|
|
|
|
|
strip_empty_tags => { type => BOOLEAN, default => 0 }, |
928
|
|
|
|
|
|
|
slurp => { type => BOOLEAN, default => 0 }, |
929
|
|
|
|
|
|
|
strip_tags => { type => ARRAYREF, default => [ qw/ ~comment head script style / ] }, |
930
|
|
|
|
|
|
|
passthrough_naked_tags => { type => ARRAYREF | BOOLEAN, default => 0 }, |
931
|
|
|
|
|
|
|
user_agent => { type => OBJECT | UNDEF, default => undef }, |
932
|
|
|
|
|
|
|
wiki_uri => { type => SCALAR | ARRAYREF, default => '' }, |
933
|
|
|
|
|
|
|
wrap_in_html => { type => BOOLEAN, default => 1 }, |
934
|
|
|
|
|
|
|
} } |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=head1 ADDING A DIALECT |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
Consult L<HTML::WikiConverter::Dialects> for documentation on how to |
939
|
|
|
|
|
|
|
write your own dialect module for C<HTML::WikiConverter>. Or if you're |
940
|
|
|
|
|
|
|
not up to the task, drop me an email and I'll have a go at it when I |
941
|
|
|
|
|
|
|
get a spare moment. |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=head1 SEE ALSO |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
L<HTML::Tree>, L<Convert::Wiki> |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=head1 AUTHOR |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
David J. Iberri, C<< <diberri@cpan.org> >> |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
=head1 BUGS |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
954
|
|
|
|
|
|
|
C<bug-html-wikiconverter at rt.cpan.org>, or through the web interface at |
955
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter>. |
956
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
957
|
|
|
|
|
|
|
your bug as I make changes. |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
=head1 SUPPORT |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
perldoc HTML::WikiConverter |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
You can also look for information at: |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
=over 4 |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
L<http://annocpan.org/dist/HTML-WikiConverter> |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=item * CPAN Ratings |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/HTML-WikiConverter> |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-WikiConverter> |
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
=item * Search CPAN |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
L<http://search.cpan.org/dist/HTML-WikiConverter> |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=back |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
Thanks to Tatsuhiko Miyagawa for suggesting |
990
|
|
|
|
|
|
|
L<Bundle::HTMLWikiConverter> as well as providing code for the |
991
|
|
|
|
|
|
|
C<available_dialects()> class method. |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
My thanks also goes to Martin Kudlvasr for catching (and fixing!) a |
994
|
|
|
|
|
|
|
bug in the logic of how HTML files were processed. |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
Big thanks to Dave Schaefer for the PbWiki dialect and for the idea |
997
|
|
|
|
|
|
|
behind the new C<attributes()> implementation. |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
Copyright (c) David J. Iberri, all rights reserved. |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
1004
|
|
|
|
|
|
|
under the same terms as Perl itself. |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
=cut |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
1; |