| blib/lib/Pod/HTML2Pod.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 307 | 540 | 56.8 |
| branch | 77 | 256 | 30.0 |
| condition | 14 | 92 | 15.2 |
| subroutine | 40 | 42 | 95.2 |
| pod | 1 | 30 | 3.3 |
| total | 439 | 960 | 45.7 |
| line | stmt | bran | cond | sub | pod | time | code | ||
|---|---|---|---|---|---|---|---|---|---|
| 1 | |||||||||
| 2 | require 5; | ||||||||
| 3 | # Time-stamp: "2004-12-29 18:41:19 AST" | ||||||||
| 4 | |||||||||
| 5 | package Pod::HTML2Pod; | ||||||||
| 6 | 2 | 2 | 27892 | use strict; | |||||
| 2 | 4 | ||||||||
| 2 | 72 | ||||||||
| 7 | 2 | 2 | 2387 | use integer; # haul aaaaaaaaaass! | |||||
| 2 | 22 | ||||||||
| 2 | 12 | ||||||||
| 8 | 2 | 2 | 2746 | use UNIVERSAL (); | |||||
| 2 | 31 | ||||||||
| 2 | 47 | ||||||||
| 9 | 2 | 2 | 15 | use Carp (); | |||||
| 2 | 3 | ||||||||
| 2 | 46 | ||||||||
| 10 | 2 | 2 | 2510 | use HTML::TreeBuilder 3.01 (); | |||||
| 2 | 125915 | ||||||||
| 2 | 62 | ||||||||
| 11 | 2 | 2 | 25 | use HTML::Element 3.05 (); | |||||
| 2 | 27 | ||||||||
| 2 | 33 | ||||||||
| 12 | 2 | 2 | 9 | use HTML::Tagset (); # presumably used by HTML::TreeBuilder anyhow | |||||
| 2 | 6 | ||||||||
| 2 | 28 | ||||||||
| 13 | 2 | 2 | 11 | use HTML::Entities (); # presumably used by HTML::Parser anyhow | |||||
| 2 | 5 | ||||||||
| 2 | 44 | ||||||||
| 14 | 2 | 39414 | use vars qw($Debug $VERSION %Phrasal %Char2ent | ||||||
| 15 | 2 | 2 | 9 | $nbsp $E_slash $E_vbar $counter); | |||||
| 2 | 5 | ||||||||
| 16 | |||||||||
| 17 | $VERSION = '4.05'; | ||||||||
| 18 | $Debug = 0 unless defined $Debug; | ||||||||
| 19 | |||||||||
| 20 | =head1 NAME | ||||||||
| 21 | |||||||||
| 22 | Pod::HTML2Pod -- translate HTML into POD | ||||||||
| 23 | |||||||||
| 24 | =head1 SYNOPSIS | ||||||||
| 25 | |||||||||
| 26 | # Use the program 'html2pod' that comes in this dist, or: | ||||||||
| 27 | use Pod::HTML2Pod; | ||||||||
| 28 | print Pod::HTML2Pod::convert( | ||||||||
| 29 | 'file' => 'my_stuff.html', # input file | ||||||||
| 30 | 'a_href' => 1, # try converting links | ||||||||
| 31 | ); | ||||||||
| 32 | |||||||||
| 33 | =head1 DESCRIPTION | ||||||||
| 34 | |||||||||
| 35 | Larry Wall once said (1999-08-27, on the C |
||||||||
| 36 | do believe): "The whole point of pod is to get people to document stuff | ||||||||
| 37 | they wouldn't document in any other form." | ||||||||
| 38 | |||||||||
| 39 | To that end, I wrote this module so that people who are unpracticed | ||||||||
| 40 | with POD but in a hurry to simply document their programs or modules, | ||||||||
| 41 | could write their documentation in simple HTML, and convert that to | ||||||||
| 42 | POD. That's what this module does. | ||||||||
| 43 | |||||||||
| 44 | Specifically, this module bends over backwards to try to turn even | ||||||||
| 45 | vaguely plausable HTML into POD -- and when in doubt, it simply ignores | ||||||||
| 46 | things that it doesn't know about, or can't render. | ||||||||
| 47 | |||||||||
| 48 | =head1 FUNCTIONS | ||||||||
| 49 | |||||||||
| 50 | This module provides one documented function, which it does not export: | ||||||||
| 51 | |||||||||
| 52 | =over | ||||||||
| 53 | |||||||||
| 54 | =item Pod::HTML2Pod::convert( ...options... ) | ||||||||
| 55 | |||||||||
| 56 | =back | ||||||||
| 57 | |||||||||
| 58 | This returns a single scalar value containing the converted POD text, | ||||||||
| 59 | with some comments after the end. | ||||||||
| 60 | |||||||||
| 61 | This function takes options: | ||||||||
| 62 | |||||||||
| 63 | =over | ||||||||
| 64 | |||||||||
| 65 | =item 'file' => FILENAME, | ||||||||
| 66 | |||||||||
| 67 | Specifies that the HTML code is to be read from the filename given. | ||||||||
| 68 | |||||||||
| 69 | =item 'handle' => *HANDLE, | ||||||||
| 70 | |||||||||
| 71 | Specifies that the HTML code is to be read from the open filehandle | ||||||||
| 72 | given (e.g., C<$fh_obj>, C<*HANDLE>, C<*HANDLE{IO}>, etc.) If you | ||||||||
| 73 | specify this, but fail to specify an actual handle object, inscrutible | ||||||||
| 74 | errors may result. | ||||||||
| 75 | |||||||||
| 76 | =item 'content' => STRING, | ||||||||
| 77 | |||||||||
| 78 | Specifies that the HTML code is in the string given. (Alternately, | ||||||||
| 79 | pass a reference to the scalar: C<'content' =E |
||||||||
| 80 | |||||||||
| 81 | =item 'tree' => OBJ, | ||||||||
| 82 | |||||||||
| 83 | Specifies that the HTML document is contained in the given | ||||||||
| 84 | HTML::TreeBuilder object (or HTML::Element object, at least). | ||||||||
| 85 | |||||||||
| 86 | =item 'a_name' => BOOLEAN, | ||||||||
| 87 | |||||||||
| 88 | Specifies whether you want to try converting C |
||||||||
| 89 | elements. By default this is off -- i.e., such elements are ignored. | ||||||||
| 90 | |||||||||
| 91 | =item 'a_href' => BOOLEAN, | ||||||||
| 92 | |||||||||
| 93 | Specifies whether you want to try converting C |
||||||||
| 94 | elements. By default this is off -- i.e., such elements are ignored. | ||||||||
| 95 | If on, bear in mind that relative URLs cannot be properly converted to | ||||||||
| 96 | POD -- any relative URLs will be complained about in comments after | ||||||||
| 97 | the end of the document. Normal absolute URLs will be treated as best | ||||||||
| 98 | they can be. Note that URLs beginning "pod:..." will be turned into | ||||||||
| 99 | POD links to whatever follows; that is, "pod:Getopt::Std" is turned | ||||||||
| 100 | into C |
||||||||
| 101 | |||||||||
| 102 | =item 'debug' => INTEGER, | ||||||||
| 103 | |||||||||
| 104 | Puts Pod::HTML2Pod into verbose debug mode for the duration of | ||||||||
| 105 | processing this this HTML document. INTEGER can be 0 for no debug | ||||||||
| 106 | output, 1 for a moderate amount that will cause the HTML syntax tree | ||||||||
| 107 | to be be dumped at the start of the conversion, and 2 for that plus a | ||||||||
| 108 | dump of the intermediate POD doctree, plus a few more inscrutible | ||||||||
| 109 | diagnostic messages. Looking at the trees dumped might be helpful in | ||||||||
| 110 | making sense of error messages that refer to a particular node in the | ||||||||
| 111 | parse tree. | ||||||||
| 112 | |||||||||
| 113 | =item | ||||||||
| 114 | |||||||||
| 115 | =back | ||||||||
| 116 | |||||||||
| 117 | =head1 GUIDELINES | ||||||||
| 118 | |||||||||
| 119 | Don't write crappy HTML and expect this module to understand it. | ||||||||
| 120 | |||||||||
| 121 | Don't take the output of C |
||||||||
| 122 | you think it'd be neat to try it. You'll just learn really unpleasant | ||||||||
| 123 | things about C |
||||||||
| 124 | it to improve C |
||||||||
| 125 | |||||||||
| 126 | However, I |
||||||||
| 127 | bearing in mind these simple truths: | ||||||||
| 128 | |||||||||
| 129 | POD can't do tables, images, forms, imagemaps, layers, CSS, embedded | ||||||||
| 130 | Java applets or any other kind of object, FONT, or BLINK. So don't | ||||||||
| 131 | try to do any of these things. | ||||||||
| 132 | |||||||||
| 133 | Use C |
||||||||
| 134 | |||||||||
| 135 | If you want to have a block of literal example code, put it in a | ||||||||
| 136 | C |
||||||||
| 137 | |||||||||
| 138 | Keep things simple. | ||||||||
| 139 | |||||||||
| 140 | Remember: Just because it comes I |
||||||||
| 141 | it's happy normal pod. You can do lots of things in HTML that will | ||||||||
| 142 | produce POD that is strange but technically legal (like having huge | ||||||||
| 143 | and complex content in a C |
||||||||
| 144 | perldoc scream bloody murder about nroff macros stretched past their | ||||||||
| 145 | limit. | ||||||||
| 146 | |||||||||
| 147 | Try to avoid using a WYSIWYG HTML editor, as they often produce scary | ||||||||
| 148 | source. Ditto for taking selecting "Save as... HTML" in your word | ||||||||
| 149 | processor. You can always try it, but look at the HTML to survey the | ||||||||
| 150 | damage before you try converting it to POD. | ||||||||
| 151 | |||||||||
| 152 | Always look at the POD that's been output by HTML2Pod -- never just | ||||||||
| 153 | blindly include it. | ||||||||
| 154 | |||||||||
| 155 | Consider starting from this template: | ||||||||
| 156 | |||||||||
| 157 | |||||||||
| 158 | |||||||||
| 159 | |
||||||||
| 160 | |||||||||
| 161 | |||||||||
| 162 | |||||||||
| 163 | NAME |
||||||||
| 164 | |||||||||
| 165 | Things::Stuff -- does some things with stuff | ||||||||
| 166 | |||||||||
| 167 | SYNOPSIS |
||||||||
| 168 | |||||||||
| 169 | |
||||||||
| 170 | use HTML::Stuff; | ||||||||
| 171 | do some more stuff; | ||||||||
| 172 | la la la la la; | ||||||||
| 173 | oogah; | ||||||||
| 174 | |||||||||
| 175 | |||||||||
| 176 | DESCRIPTION |
||||||||
| 177 | |||||||||
| 178 | This module does things with stuff. It exports these functions: | ||||||||
| 179 | |||||||||
| 180 | |
||||||||
| 181 | thingify( ... ) |
||||||||
| 182 | |
||||||||
| 183 | |||||||||
| 184 | destuffulate( ... ) |
||||||||
| 185 | |
||||||||
| 186 | It will throw a fatal exception if applied to things. |
||||||||
| 187 | So don't do that. |
||||||||
| 188 | |||||||||
| 189 | enthinction( ... ) |
||||||||
| 190 | |
||||||||
| 191 | involving "thing" and "stuff". Mostly. | ||||||||
| 192 | |||||||||
| 193 | |||||||||
| 194 | |||||||||
| 195 | Caveats and WYA's |
||||||||
| 196 | |||||||||
| 197 | Things to be wary of: | ||||||||
| 198 | |||||||||
| 199 | |
||||||||
| 200 | |
||||||||
| 201 | |
||||||||
| 202 | Don't forget about that stuff. Gotta keep an eye on that. |
||||||||
| 203 | |||||||||
| 204 | |||||||||
| 205 | BUGS |
||||||||
| 206 | |||||||||
| 207 | Stuff is hard. | ||||||||
| 208 | |||||||||
| 209 | SEE ALSO |
||||||||
| 210 | |||||||||
| 211 | Class::Classless, | ||||||||
| 212 | strict, | ||||||||
| 213 | |||||||||
| 214 | >Lingua::EN::Numbers::Ordinate, | ||||||||
| 215 | perlvar, | ||||||||
| 216 | |||||||||
| 217 | |||||||||
| 219 | |||||||||
| 220 | COPYRIGHT |
||||||||
| 221 | |||||||||
| 222 | Copyright 2000, Joey Jo-Jo Jr. Shabadoo. | ||||||||
| 223 | |||||||||
| 224 | |||||||||
| 225 | This library is free software; you can redistribute it and/or modify |
||||||||
| 226 | it under the same terms as Perl itself. | ||||||||
| 227 | |||||||||
| 228 | AUTHOR |
||||||||
| 229 | Joey Jo-Jo Jr. Shabadoo, jojojo@shabadoo.int |
||||||||
| 230 | |||||||||
| 231 | |||||||||
| 232 | |||||||||
| 233 | =head1 BUG REPORTS | ||||||||
| 234 | |||||||||
| 235 | If you do find a case where this converter misinterprets what you | ||||||||
| 236 | consider straightforward HTML (which you should really really have run | ||||||||
| 237 | thru an HTML syntax checker, by the way!), report it to me as a bug, at | ||||||||
| 238 | C |
||||||||
| 239 | |||||||||
| 240 | Be sure to include the entire document that causes the error -- then | ||||||||
| 241 | specify exactly what you consider the error to be. | ||||||||
| 242 | |||||||||
| 243 | =head1 BUGS AND CAVEATS | ||||||||
| 244 | |||||||||
| 245 | * Doesn't try to turn "smart quotes" characters into simple " and '. | ||||||||
| 246 | Maybe should? | ||||||||
| 247 | |||||||||
| 248 | * Fails to turn | ||||||||
| 249 | |||||||||
| 250 | foo thing bar baz quux | ||||||||
| 251 | |||||||||
| 252 | into | ||||||||
| 253 | |||||||||
| 254 | foo S |
||||||||
| 255 | |||||||||
| 256 | I.e., currently just turns C< >'s into normal spaces. | ||||||||
| 257 | |||||||||
| 258 | * Numeric entities (C |
||||||||
| 259 | are not understood by some older POD converters. | ||||||||
| 260 | |||||||||
| 261 | * No HTML that you provide will turn into C |
||||||||
| 262 | |||||||||
| 263 | * Currently maps | ||||||||
| 264 | |||||||||
| 265 | bar | ||||||||
| 266 | |||||||||
| 267 | to | ||||||||
| 268 | |||||||||
| 269 | X |
||||||||
| 270 | |||||||||
| 271 | but is this correct? | ||||||||
| 272 | |||||||||
| 273 | =head1 SEE ALSO | ||||||||
| 274 | |||||||||
| 275 | L |
||||||||
| 276 | |||||||||
| 277 | And HTML Tidy, at C |
||||||||
| 278 | |||||||||
| 279 | =head1 COPYRIGHT | ||||||||
| 280 | |||||||||
| 281 | Copyright (c) 2000 Sean M. Burke. All rights reserved. | ||||||||
| 282 | |||||||||
| 283 | This library is free software; you can redistribute it and/or modify | ||||||||
| 284 | it under the same terms as Perl itself. | ||||||||
| 285 | |||||||||
| 286 | =head1 AUTHOR | ||||||||
| 287 | |||||||||
| 288 | Sean M. Burke C |
||||||||
| 289 | |||||||||
| 290 | =cut | ||||||||
| 291 | |||||||||
| 292 | # TODO: test whether anchors and references to them actually work | ||||||||
| 293 | # in extremis? (see what recent pod2html versions do to them?) | ||||||||
| 294 | |||||||||
| 295 | #-------------------------------------------------------------------------- | ||||||||
| 296 | |||||||||
| 297 | sub convert { | ||||||||
| 298 | 1 | 50 | 1 | 1 | 607 | Carp::croak(__PACKAGE__ . '::convert needs parameters!')unless @_; | |||
| 299 | 1 | 50 | 19 | Carp::croak( | |||||
| 300 | "odd number of elements in options to " . __PACKAGE__ . "::convert") | ||||||||
| 301 | if @_ % 2; | ||||||||
| 302 | |||||||||
| 303 | 1 | 6 | my %o = @_; | ||||||
| 304 | 1 | 3 | local($Debug) = $Debug; | ||||||
| 305 | 1 | 50 | 5 | if(exists $o{'debug'}) { $Debug = $o{'debug'} } | |||||
| 0 | 0 | ||||||||
| 306 | |||||||||
| 307 | 1 | 11 | my $tree = HTML::TreeBuilder->new(); | ||||||
| 308 | |||||||||
| 309 | 1 | 318 | $tree->ignore_ignorable_whitespace(1); | ||||||
| 310 | |||||||||
| 311 | 1 | 11 | my $comments = [ __PACKAGE__ . ' conversion notes:' ]; | ||||||
| 312 | |||||||||
| 313 | 1 | 50 | 4 | if(exists $o{'tree'}) { | |||||
| 314 | 0 | 0 | $tree->delete; # never mind that one | ||||||
| 315 | 0 | 0 | $tree = $o{'tree'}; | ||||||
| 316 | 0 | 0 | 0 | die "but the 'tree' value is undef" unless defined $tree; | |||||
| 317 | 0 | 0 | 0 | die "but the 'tree' value isn't an object" unless ref $tree; | |||||
| 318 | 0 | 0 | 0 | die "but the 'tree' value object's class isn't based on HTML::Element" | |||||
| 319 | unless $tree->isa('HTML::Element'); | ||||||||
| 320 | 0 | 0 | $tree = $tree->clone; | ||||||
| 321 | |||||||||
| 322 | } else { | ||||||||
| 323 | |||||||||
| 324 | 1 | 50 | 5 | if(exists $o{'file'}) { | |||||
| 325 | 0 | 0 | 0 | die "File $o{'file'} doesn't exist" unless -e $o{'file'}; | |||||
| 326 | 0 | 0 | local(*IN); | ||||||
| 327 | 0 | 0 | 0 | open(IN, "<$o{'file'}") or die "Can't open $o{'file'}: $!"; | |||||
| 328 | 0 | 0 | $o{'handle'} = *IN{IO}; | ||||||
| 329 | 0 | 0 | ++$o{'_close_after'}; | ||||||
| 330 | 0 | 0 | 0 | print "Input from $o{'file'} ($o{'handle'})\n" if $Debug; | |||||
| 331 | 0 | 0 | push @$comments, "#From file $o{'file'}"; | ||||||
| 332 | } | ||||||||
| 333 | |||||||||
| 334 | 1 | 50 | 4 | if(exists $o{'handle'}) { | |||||
| 335 | 0 | 0 | local $/; | ||||||
| 336 | 0 | 0 | my $fh = $o{'handle'}; | ||||||
| 337 | 0 | 0 | my $x; | ||||||
| 338 | 0 | 0 | $x = <$fh>; | ||||||
| 339 | 0 | 0 | 0 | close($fh) if $o{'_close_after'}; | |||||
| 340 | 0 | 0 | $o{'content'} = \$x; | ||||||
| 341 | 0 | 0 | 0 | print "Input from handle ($o{'handle'})\n" if $Debug; | |||||
| 342 | } | ||||||||
| 343 | |||||||||
| 344 | 1 | 50 | 5 | if(exists $o{'content'}) { | |||||
| 345 | 1 | 3 | my($content_r, $is_copy); | ||||||
| 346 | 1 | 50 | 8 | if(!defined $o{'content'}) { # undef content? | |||||
| 50 | |||||||||
| 347 | 0 | 0 | die "content is undef"; | ||||||
| 348 | } elsif(ref $o{'content'}) { # scalar ref | ||||||||
| 349 | 0 | 0 | 0 | die "content only accepts scalars or scalar refs" | |||||
| 350 | unless ref $o{'content'} eq 'SCALAR'; | ||||||||
| 351 | 0 | 0 | $content_r = $o{'content'}; | ||||||
| 352 | 0 | 0 | $is_copy = 0; | ||||||
| 353 | } else { # simple scalar | ||||||||
| 354 | 1 | 2 | $content_r = \$o{'content'}; | ||||||
| 355 | 1 | 3 | $is_copy = 1; | ||||||
| 356 | } | ||||||||
| 357 | |||||||||
| 358 | # Nativize newlines, if possible and if need be. | ||||||||
| 359 | # Otherwise PREs will be hard to reckon. | ||||||||
| 360 | 1 | 50 | 15 | if("\n" ne "\cm" and "\n" ne "\cm\cj" and "\n" ne "\cj") { | |||||
| 361 | print "I don't recognize what \"\\n\" means on this system!" if $Debug; | ||||||||
| 362 | 0 | 0 | } elsif($$content_r =~ m/(\cm\cj|\cm|\cj)/) { | ||||||
| 363 | 0 | 0 | my $nl = $1; | ||||||
| 364 | 0 | 0 | 0 | if($nl eq "\n") { | |||||
| 365 | # no-op | ||||||||
| 366 | 0 | 0 | 0 | print "# Already in native newline format\n" if $Debug; | |||||
| 367 | } else { | ||||||||
| 368 | 0 | 0 | 0 | unless($is_copy) { | |||||
| 369 | 0 | 0 | my $x = $$content_r; | ||||||
| 370 | 0 | 0 | $content_r = \$x; # copy | ||||||
| 371 | 0 | 0 | $is_copy = 1; | ||||||
| 372 | } | ||||||||
| 373 | 0 | 0 | 0 | if($nl eq "\cm") { | |||||
| 0 | |||||||||
| 0 | |||||||||
| 374 | 0 | 0 | $$content_r =~ tr/\cm/\n/; | ||||||
| 375 | 0 | 0 | 0 | print "# Nativizing newlines from \\cm to \\n\n" if $Debug; | |||||
| 376 | } elsif($nl eq "\cj") { | ||||||||
| 377 | 0 | 0 | $$content_r =~ tr/\cj/\n/; | ||||||
| 378 | 0 | 0 | 0 | print "# Nativizing newlines from \\cj to \\n\n" if $Debug; | |||||
| 379 | } elsif($nl eq "\cm\cj") { | ||||||||
| 380 | 0 | 0 | $$content_r =~ tr/\cj//d; | ||||||
| 381 | 0 | 0 | $$content_r =~ tr/\cm/\n/ unless "\cm" eq "\n"; | ||||||
| 382 | 0 | 0 | 0 | print "# Nativizing newlines from \\cm\\cj to \\n\n" if $Debug; | |||||
| 383 | } | ||||||||
| 384 | } | ||||||||
| 385 | } | ||||||||
| 386 | |||||||||
| 387 | 1 | 5 | push @$comments, | ||||||
| 388 | '# ' . length($$content_r) . ' bytes of input'; | ||||||||
| 389 | 1 | 31 | $tree->parse($$content_r); | ||||||
| 390 | 1 | 3086 | $tree->eof; | ||||||
| 391 | 1 | 127 | delete $o{'content'}; | ||||||
| 392 | } else { | ||||||||
| 393 | 0 | 0 | die "No input source specified?"; | ||||||
| 394 | } | ||||||||
| 395 | } | ||||||||
| 396 | |||||||||
| 397 | { | ||||||||
| 398 | # The BODY is all we need. Discard the rest. | ||||||||
| 399 | 1 | 50 | 3 | my $body = $tree->find_by_tag_name('body') || die "No BODY in tree?"; | |||||
| 1 | 31 | ||||||||
| 400 | 1 | 49 | $body->detach; | ||||||
| 401 | 1 | 19 | $tree->delete; | ||||||
| 402 | 1 | 38 | $tree = $body; | ||||||
| 403 | } | ||||||||
| 404 | |||||||||
| 405 | 1 | 50 | 116 | push @$comments, scalar(localtime) . ' ' . ($ENV{'USER'} || ''); | |||||
| 406 | 1 | 7 | $tree->attr('_pod_comments', $comments); | ||||||
| 407 | |||||||||
| 408 | # More options: | ||||||||
| 409 | 1 | 50 | 16 | if($o{'a_name'}) { | |||||
| 410 | 0 | 0 | $tree->attr('_a_name', 1); | ||||||
| 411 | 0 | 0 | push @$comments, " Will try to render "; | ||||||
| 412 | } else { | ||||||||
| 413 | 1 | 3 | push @$comments, | ||||||
| 414 | " No a_name switch not specified, so will not try to render "; | ||||||||
| 415 | } | ||||||||
| 416 | 1 | 50 | 3 | if($o{'a_href'}) { | |||||
| 417 | 0 | 0 | $tree->attr('_a_href', 1); | ||||||
| 418 | 0 | 0 | push @$comments, " Will try to render "; | ||||||
| 419 | } else { | ||||||||
| 420 | 1 | 3 | push @$comments, | ||||||
| 421 | " No a_href switch not specified, so will not try to render "; | ||||||||
| 422 | } | ||||||||
| 423 | |||||||||
| 424 | 1 | 4 | twist_tree($tree); | ||||||
| 425 | |||||||||
| 426 | 1 | 4 | my $rendering_r = tree_as_pod($tree); | ||||||
| 427 | 1 | 5 | $tree->delete; | ||||||
| 428 | 1 | 81 | return $$rendering_r; | ||||||
| 429 | } | ||||||||
| 430 | |||||||||
| 431 | ########################################################################### | ||||||||
| 432 | # | ||||||||
| 433 | # The code below this point is not happy nice readable undocumented code. | ||||||||
| 434 | # It is angry cryptic code, of the sort that you will find little use in | ||||||||
| 435 | # reading. | ||||||||
| 436 | # | ||||||||
| 437 | # When I first thought of writing this module, several years ago, I had | ||||||||
| 438 | # noble dreams that I could write some sort of universal markup-language | ||||||||
| 439 | # mixmaster, which would only need be fed some information about the | ||||||||
| 440 | # source language and the target language, and a few simple facts about | ||||||||
| 441 | # what constructs are equivalent (that HTML "h1" is POD "head1", for | ||||||||
| 442 | # example), and then magic would happen, and documents would be converted. | ||||||||
| 443 | # | ||||||||
| 444 | # Well, I've not yet found that mixmaster, so I've had to write some | ||||||||
| 445 | # very spooky crusty strange code. It seems to work rather well when fed | ||||||||
| 446 | # simple HTML, and seems to degrade gracefully when fed too-complex HTML. | ||||||||
| 447 | # | ||||||||
| 448 | # The code can be used as-is, but it's not conceivably adaptable to other | ||||||||
| 449 | # tasks, or even easily maintainable, regrettably. However, as HTML or | ||||||||
| 450 | # POD are not likely to mutate significantly any time soon, I think | ||||||||
| 451 | # substantial maintenance will not be needed -- just minor tweaking or | ||||||||
| 452 | # bugfixes on my part. | ||||||||
| 453 | # | ||||||||
| 454 | ########################################################################### | ||||||||
| 455 | # SO STOP READING NOW, IF YOU VALUE YOUR SANITY | ||||||||
| 456 | ########################################################################### | ||||||||
| 457 | # | ||||||||
| 458 | # Stay away! | ||||||||
| 459 | # STAY AWAY! | ||||||||
| 460 | # Stay away! | ||||||||
| 461 | # You might end up like me! | ||||||||
| 462 | # | ||||||||
| 463 | # It's the pain | ||||||||
| 464 | # that keeps us alive, | ||||||||
| 465 | # but that beauty is all that we need to survive. | ||||||||
| 466 | # | ||||||||
| 467 | # That damned beauty is all that we need to survive. | ||||||||
| 468 | # | ||||||||
| 469 | # -- David Byrne, "They Are In Love" | ||||||||
| 470 | # | ||||||||
| 471 | ########################################################################### | ||||||||
| 472 | |||||||||
| 473 | # Initialization code: | ||||||||
| 474 | |||||||||
| 475 | # TODO: replace this with a hardwired table? | ||||||||
| 476 | %Phrasal = %HTML::Tagset::isPhraseMarkup; | ||||||||
| 477 | delete @Phrasal{'br', 'hr'}; | ||||||||
| 478 | for (qw(~literal ~texticle)) { $Phrasal{$_} = 1 } | ||||||||
| 479 | $counter = 0 unless defined $counter; | ||||||||
| 480 | |||||||||
| 481 | $Debug = 2 unless defined $Debug; | ||||||||
| 482 | |||||||||
| 483 | # Fill out Char2ent: | ||||||||
| 484 | { | ||||||||
| 485 | die "\%HTML::Entities::char2entity is empty?" | ||||||||
| 486 | unless keys %HTML::Entities::char2entity; | ||||||||
| 487 | |||||||||
| 488 | my($c,$e); | ||||||||
| 489 | while(($c,$e) = each(%HTML::Entities::char2entity)) { | ||||||||
| 490 | if($e =~ m{^(\d+);$}s) { | ||||||||
| 491 | $Char2ent{$c} = "E<$1>"; | ||||||||
| 492 | #print "num $e => E<$1>\n"; | ||||||||
| 493 | # { => E<123> | ||||||||
| 494 | } elsif($e =~ m{^&([^;]+);$}s) { | ||||||||
| 495 | $Char2ent{$c} = "E<$1>"; | ||||||||
| 496 | #print "eng $e => E<$1>\n"; | ||||||||
| 497 | # é => E |
||||||||
| 498 | } else { | ||||||||
| 499 | warn "Unknown thingy in %HTML::Entities::char2entity: $e" | ||||||||
| 500 | # if $^W; | ||||||||
| 501 | } | ||||||||
| 502 | } | ||||||||
| 503 | |||||||||
| 504 | # Points of difference between HTML entities and POD entities: | ||||||||
| 505 | |||||||||
| 506 | $Char2ent{"\xA0"} = "E<160>"; # there is no E |
||||||||
| 507 | |||||||||
| 508 | $Char2ent{"\xAB"} = "E |
||||||||
| 509 | $Char2ent{"\xBB"} = "E |
||||||||
| 510 | # Altho new POD processors also know E |
||||||||
| 511 | |||||||||
| 512 | # Old POD processors don't know these two -- so leave numeric | ||||||||
| 513 | # $Char2ent{'/'} = 'E |
||||||||
| 514 | # $Char2ent{'|'} = 'E |
||||||||
| 515 | } | ||||||||
| 516 | |||||||||
| 517 | # Set up some initial values we'll need later. | ||||||||
| 518 | unless(defined $nbsp) { | ||||||||
| 519 | my $nb = ' '; | ||||||||
| 520 | HTML::Entities::decode_entities($nb); | ||||||||
| 521 | if(!defined $nb) { | ||||||||
| 522 | die " decodes to undef?"; | ||||||||
| 523 | } elsif($nb eq '') { | ||||||||
| 524 | die " decodes to empty-string?"; | ||||||||
| 525 | } elsif($nb eq ' ') { | ||||||||
| 526 | die " doesn't decode?"; | ||||||||
| 527 | } elsif($nb eq ' ') { | ||||||||
| 528 | $nbsp = undef; | ||||||||
| 529 | } else { | ||||||||
| 530 | $nbsp = $nb; | ||||||||
| 531 | } | ||||||||
| 532 | } | ||||||||
| 533 | |||||||||
| 534 | unless(defined $E_slash) { | ||||||||
| 535 | my $x = '/'; | ||||||||
| 536 | encode_entities_harder($x); | ||||||||
| 537 | if(!defined $x or !length $x) { | ||||||||
| 538 | die "'/' encodes to nothing??"; | ||||||||
| 539 | } elsif($x eq '/') { | ||||||||
| 540 | # no-op | ||||||||
| 541 | } elsif($x =~ m{^E<[^>]+>$}s) { | ||||||||
| 542 | $E_slash = $x; | ||||||||
| 543 | } else { | ||||||||
| 544 | die "'/' encodes as $x?!"; | ||||||||
| 545 | } | ||||||||
| 546 | } | ||||||||
| 547 | |||||||||
| 548 | unless(defined $E_vbar) { | ||||||||
| 549 | my $x = '|'; | ||||||||
| 550 | encode_entities_harder($x); | ||||||||
| 551 | if(!defined $x or !length $x) { | ||||||||
| 552 | die "'|' encodes to nothing??"; | ||||||||
| 553 | } elsif($x eq '|') { | ||||||||
| 554 | # no-op | ||||||||
| 555 | } elsif($x =~ m{^E<[^>]+>$}s) { | ||||||||
| 556 | $E_vbar = $x; | ||||||||
| 557 | } else { | ||||||||
| 558 | die "'|' encodes as $x?!"; | ||||||||
| 559 | } | ||||||||
| 560 | } | ||||||||
| 561 | |||||||||
| 562 | # Last chance to save your sanity: stop reading now... | ||||||||
| 563 | |||||||||
| 564 | #-------------------------------------------------------------------------- | ||||||||
| 565 | |||||||||
| 566 | # TODO: make all P's go byebye once we've texticulated? | ||||||||
| 567 | |||||||||
| 568 | sub twist_tree { | ||||||||
| 569 | 1 | 1 | 0 | 3 | my $tree = $_[0]; | ||||
| 570 | |||||||||
| 571 | 1 | 5 | html_node_name($tree); | ||||||
| 572 | |||||||||
| 573 | 1 | 5 | delete_unknowns($tree); | ||||||
| 574 | |||||||||
| 575 | 1 | 4 | special_splice_div($tree); | ||||||
| 576 | |||||||||
| 577 | 1 | 50 | 14 | print("Input tree:\n"), $tree->dump, sleep(0) if $Debug; | |||||
| 578 | |||||||||
| 579 | 1 | 9 | prune_by_tag_name( $tree, | ||||||
| 580 | [qw~ script style ~], | ||||||||
| 581 | [qw~ map style isindex select textarea del input embed bgsound basefont ~], | ||||||||
| 582 | ); | ||||||||
| 583 | |||||||||
| 584 | 1 | 13 | splice_by_tag_name($tree, | ||||||
| 585 | [qw~ | ||||||||
| 586 | big small acronym sub sup multicol | ||||||||
| 587 | applet param object | ||||||||
| 588 | table tr caption col thead tbody tfoot colgroup | ||||||||
| 589 | noscript center font bdo fieldset ins | ||||||||
| 590 | form label legend button link layer object | ||||||||
| 591 | span abbr blink strike wbr | ||||||||
| 592 | frame frameset ilayer layer nolayer | ||||||||
| 593 | address nobr | ||||||||
| 594 | ~], | ||||||||
| 595 | ); | ||||||||
| 596 | |||||||||
| 597 | 1 | 17 | remap_tags($tree, {qw~ | ||||||
| 598 | td p | ||||||||
| 599 | th p | ||||||||
| 600 | em i | ||||||||
| 601 | strong b | ||||||||
| 602 | cite i | ||||||||
| 603 | code code | ||||||||
| 604 | tt code | ||||||||
| 605 | kbd code | ||||||||
| 606 | samp code | ||||||||
| 607 | var i | ||||||||
| 608 | dfn b | ||||||||
| 609 | listing pre | ||||||||
| 610 | plaintext pre | ||||||||
| 611 | xmp pre | ||||||||
| 612 | dd p | ||||||||
| 613 | ~}); | ||||||||
| 614 | # CODE for C<> | ||||||||
| 615 | # I for I<> | ||||||||
| 616 | # B for B<> | ||||||||
| 617 | |||||||||
| 618 | # TODO: Warn of cases where heading has too-complex text in it? | ||||||||
| 619 | |||||||||
| 620 | 1 | 6 | p_unnest($tree); | ||||||
| 621 | |||||||||
| 622 | 1 | 32 | pre_render($tree); | ||||||
| 623 | 1 | 4 | q_render($tree); | ||||||
| 624 | |||||||||
| 625 | 1 | 3 | images_render($tree); | ||||||
| 626 | 1 | 4 | hr_render($tree); | ||||||
| 627 | 1 | 3 | br_render($tree); | ||||||
| 628 | 1 | 4 | lists_render($tree); | ||||||
| 629 | #wrangle_body_children($tree); | ||||||||
| 630 | |||||||||
| 631 | 1 | 4 | literalize_text_under($tree); | ||||||
| 632 | |||||||||
| 633 | 1 | 4 | winge_about_phrasal_paradoxes($tree); | ||||||
| 634 | |||||||||
| 635 | 1 | 4 | texticulate($tree); | ||||||
| 636 | 1 | 3 | promote_some_secondary_children($tree); | ||||||
| 637 | 1 | 5 | goodify_p_elements($tree); | ||||||
| 638 | |||||||||
| 639 | 1 | 3 | render_headings($tree); # busts up the headings | ||||||
| 640 | |||||||||
| 641 | 1 | 4 | a_tweak($tree); | ||||||
| 642 | #bust_up($tree, qw~h1 h2 h3 h4 h5 h6 p~); | ||||||||
| 643 | |||||||||
| 644 | 1 | 4 | pod_node_name($tree); | ||||||
| 645 | 1 | 50 | 8 | $tree->dump, sleep(0) if $Debug > 1; | |||||
| 646 | 1 | 2 | return; | ||||||
| 647 | } | ||||||||
| 648 | |||||||||
| 649 | #========================================================================== | ||||||||
| 650 | # Subs below here are in no particular order. Ahwell. | ||||||||
| 651 | |||||||||
| 652 | sub a_tweak { | ||||||||
| 653 | |||||||||
| 654 | #Scratch: | ||||||||
| 655 | 1 | 1 | 0 | 2 | my($a_name, $parent, $grandparent, $gptag, @cl, $text); | ||||
| 656 | |||||||||
| 657 | 1 | 4 | foreach my $a ($_[0]->find_by_tag_name('a')) { | ||||||
| 658 | # The configuration we're after looks like this: | ||||||||
| 659 | # @0.0 |
||||||||
| 660 | # <~texticle -pod-id="~texticle_1" id="``G55"> @0.0.0 | ||||||||
| 661 | # @0.0.0.0 | ||||||||
| 662 | # NAME @0.0.0.0.0 | ||||||||
| 663 | 0 | 0 | $a_name = $a->attr('name'); | ||||||
| 664 | 0 | 0 | 0 | next unless defined $a_name; | |||||
| 665 | |||||||||
| 666 | 0 | 0 | 0 | $parent = $a->parent || next; | |||||
| 667 | 0 | 0 | 0 | next unless $parent->tag eq '~texticle'; | |||||
| 668 | 0 | 0 | 0 | $grandparent = $parent->parent || next; | |||||
| 669 | 0 | 0 | $gptag = $grandparent->tag; | ||||||
| 670 | 0 | 0 | 0 | 0 | next unless $gptag eq 'h1' or $gptag eq 'h2' or $gptag eq 'item'; | ||||
| 0 | |||||||||
| 671 | 0 | 0 | 0 | 0 | next unless $parent->content_list == 1 | ||||
| 672 | and $grandparent->content_list == 1; # only child of an only child | ||||||||
| 673 | 0 | 0 | @cl = $a->content_list; # with one child, a texticle | ||||||
| 674 | 0 | 0 | 0 | 0 | next unless @cl == 1 and ref $cl[0] and $cl[0]->tag eq '~literal'; | ||||
| 0 | |||||||||
| 675 | 0 | 0 | $text = $cl[0]->attr('text'); | ||||||
| 676 | 0 | 0 | 0 | next unless defined $text; | |||||
| 677 | 0 | 0 | $text =~ s/^\s+//s; | ||||||
| 678 | 0 | 0 | $text =~ s/\s+$//s; | ||||||
| 679 | 0 | 0 | 0 | if($a_name eq $text) { | |||||
| 680 | 0 | 0 | $a->replace_with_content; | ||||||
| 681 | 0 | 0 | 0 | print "a_tweak applies to ", $a->attr('id'), "\n" if $Debug > 1 | |||||
| 682 | } else { | ||||||||
| 683 | 0 | 0 | 0 | print "a_tweak can't apply to ", | |||||
| 684 | $a->attr('id'), ": [$a_name] ne [$text]\n" | ||||||||
| 685 | if $Debug > 1; | ||||||||
| 686 | # hack can't apply | ||||||||
| 687 | } | ||||||||
| 688 | } | ||||||||
| 689 | |||||||||
| 690 | 1 | 41 | return; | ||||||
| 691 | } | ||||||||
| 692 | |||||||||
| 693 | sub p_unnest { | ||||||||
| 694 | 1 | 1 | 0 | 2 | my $tree = $_[0]; | ||||
| 695 | # Now, p's can't nest in HTML, but once we've spliced out and remapped | ||||||||
| 696 | # things, we can end up with p's containing p's in our parse tree: | ||||||||
| 697 | #
|
||||||||
| 698 | # = Foo Bar Baz |
||||||||
| 699 | 1 | 4 | foreach my $p (reverse $tree->find_by_tag_name('p')) { | ||||||
| 700 | 0 | 0 | 0 | if($p->parent->tag eq 'p') { | |||||
| 701 | 0 | 0 | my @c = $p->detach_content; | ||||||
| 702 | 0 | 0 | $p->replace_with( | ||||||
| 703 | HTML::Element->new( 'br', | ||||||||
| 704 | 'id', '``G' . ++$counter), | ||||||||
| 705 | @c, | ||||||||
| 706 | HTML::Element->new( 'br', | ||||||||
| 707 | 'id', '``G' . ++$counter), | ||||||||
| 708 | ); | ||||||||
| 709 | } | ||||||||
| 710 | } | ||||||||
| 711 | } | ||||||||
| 712 | |||||||||
| 713 | #========================================================================== | ||||||||
| 714 | |||||||||
| 715 | sub delete_unknowns { | ||||||||
| 716 | 1 | 1 | 0 | 3 | my $tree = $_[0]; | ||||
| 717 | 1 | 3 | my $map_r = $tree->tagname_map; | ||||||
| 718 | 1 | 104 | delete @$map_r{keys %HTML::Tagset::isKnown}; | ||||||
| 719 | 1 | 8 | my($tag, $elements); | ||||||
| 720 | 1 | 6 | while(($tag,$elements) = each %$map_r) { | ||||||
| 721 | 0 | 0 | commentate($tree, join ", ", | ||||||
| 722 | "# Unknown \"$tag\" elements deleted: ", | ||||||||
| 723 | map $_->attr('id'), @$elements | ||||||||
| 724 | ); | ||||||||
| 725 | 0 | 0 | foreach my $e (@$elements) { $e->replace_with_content } | ||||||
| 0 | 0 | ||||||||
| 726 | } | ||||||||
| 727 | 1 | 3 | return; | ||||||
| 728 | } | ||||||||
| 729 | |||||||||
| 730 | #========================================================================== | ||||||||
| 731 | sub special_splice_div { | ||||||||
| 732 | 1 | 1 | 0 | 6 | foreach my $div ($_[0]->find_by_tag_name('div', 'iframe')) { | ||||
| 733 | 0 | 0 | $div->replace_with( | ||||||
| 734 | HTML::Element->new( 'br', | ||||||||
| 735 | 'id', '``G' . ++$counter), | ||||||||
| 736 | $div->content_list(), | ||||||||
| 737 | HTML::Element->new( 'br', | ||||||||
| 738 | 'id', '``G' . ++$counter), | ||||||||
| 739 | ); | ||||||||
| 740 | } | ||||||||
| 741 | 1 | 36 | return; | ||||||
| 742 | } | ||||||||
| 743 | |||||||||
| 744 | #========================================================================== | ||||||||
| 745 | |||||||||
| 746 | sub winge_about_phrasal_paradoxes { | ||||||||
| 747 | 1 | 1 | 0 | 3 | my $tree = $_[0]; | ||||
| 748 | 1 | 1 | my @non_phrasal_children; | ||||||
| 749 | 1 | 12 | foreach my $p (reverse $tree->find_by_tag_name(keys %Phrasal)) { | ||||||
| 750 | 1 | 238 | @non_phrasal_children = (); | ||||||
| 751 | 1 | 5 | foreach my $c ($p->content_list) { | ||||||
| 752 | 0 | 0 | 0 | 0 | push @non_phrasal_children, $c | ||||
| 753 | if ref $c and not $Phrasal{$c->tag}; | ||||||||
| 754 | } | ||||||||
| 755 | 1 | 50 | 56 | if(@non_phrasal_children) { | |||||
| 756 | 0 | 0 | my $tag = $p->tag; | ||||||
| 757 | 0 | 0 | commentate( $tree, | ||||||
| 758 | join '', | ||||||||
| 759 | " Deleting phrasal \"$tag\" element (", | ||||||||
| 760 | $p->attr('id'), | ||||||||
| 761 | ") because it has super-phrasal elements (", | ||||||||
| 762 | join(", ", | ||||||||
| 763 | map $_->attr('id'), @non_phrasal_children | ||||||||
| 764 | ), ") as children.", | ||||||||
| 765 | ) | ||||||||
| 766 | ; | ||||||||
| 767 | 0 | 0 | $p->replace_with_content; | ||||||
| 768 | } | ||||||||
| 769 | } | ||||||||
| 770 | 1 | 5 | return; | ||||||
| 771 | } | ||||||||
| 772 | |||||||||
| 773 | #========================================================================== | ||||||||
| 774 | |||||||||
| 775 | sub commentate { | ||||||||
| 776 | 0 | 0 | 0 | 0 | my $tree = shift; | ||||
| 777 | 0 | 0 | push @{ $tree->attr('_pod_comments') }, @_; | ||||||
| 0 | 0 | ||||||||
| 778 | 0 | 0 | return; | ||||||
| 779 | } | ||||||||
| 780 | |||||||||
| 781 | #========================================================================== | ||||||||
| 782 | |||||||||
| 783 | sub html_node_name { | ||||||||
| 784 | 1 | 1 | 0 | 8 | my $map_r = $_[0]->tagname_map; | ||||
| 785 | |||||||||
| 786 | 1 | 28 | my($name, $nodes); | ||||||
| 787 | 1 | 11 | while(($name, $nodes) = each %$map_r) { | ||||||
| 788 | 2 | 34 | my $counter = 0; | ||||||
| 789 | 2 | 5 | foreach my $node (@$nodes) { | ||||||
| 790 | 2 | 3 | ++$counter; | ||||||
| 791 | 2 | 33 | 14 | $node->attr('id', | |||||
| 792 | $node->attr('id') || ( '`' . $name . '_' . $counter ) | ||||||||
| 793 | ) | ||||||||
| 794 | ; | ||||||||
| 795 | } | ||||||||
| 796 | } | ||||||||
| 797 | |||||||||
| 798 | 1 | 27 | return; | ||||||
| 799 | } | ||||||||
| 800 | |||||||||
| 801 | sub pod_node_name { | ||||||||
| 802 | 1 | 1 | 0 | 4 | my $map_r = $_[0]->tagname_map; | ||||
| 803 | |||||||||
| 804 | 1 | 34 | my($name, $nodes); | ||||||
| 805 | 1 | 6 | while(($name, $nodes) = each %$map_r) { | ||||||
| 806 | 4 | 46 | my $counter = 0; | ||||||
| 807 | 4 | 7 | foreach my $node (@$nodes) { | ||||||
| 808 | 4 | 4 | ++$counter; | ||||||
| 809 | 4 | 21 | $node->attr('-pod-id', | ||||||
| 810 | $name . '_' . $counter | ||||||||
| 811 | ) | ||||||||
| 812 | ; | ||||||||
| 813 | } | ||||||||
| 814 | } | ||||||||
| 815 | |||||||||
| 816 | 1 | 16 | return; | ||||||
| 817 | } | ||||||||
| 818 | |||||||||
| 819 | #========================================================================== | ||||||||
| 820 | |||||||||
| 821 | sub render_headings { | ||||||||
| 822 | 1 | 1 | 0 | 2 | my $tree = $_[0]; | ||||
| 823 | 1 | 4 | my $map_r = $tree->tagname_map; | ||||||
| 824 | 1 | 47 | my @levels = sort grep m/^h[1-9]+$/s, keys %$map_r; | ||||||
| 825 | 1 | 3 | my @headings; | ||||||
| 826 | |||||||||
| 827 | 1 | 50 | 4 | if(@levels == 0) { # no headings!?! | |||||
| 828 | # TODO: insert something? | ||||||||
| 829 | } else { | ||||||||
| 830 | 1 | 50 | 4 | print "# Highest heading level: $levels[0] Making that =head1\n" | |||||
| 831 | if $Debug; | ||||||||
| 832 | 1 | 2 | foreach my $h (@{$map_r->{shift @levels}}) { | ||||||
| 1 | 3 | ||||||||
| 833 | 1 | 1 | push @headings, $h; | ||||||
| 834 | 1 | 4 | $h->attr('was-tag', $h->tag); | ||||||
| 835 | 1 | 26 | $h->attr('_tag', 'h1'); | ||||||
| 836 | } | ||||||||
| 837 | # And, for any sub-primary levels... | ||||||||
| 838 | 1 | 0 | 33 | 15 | print "# Lower levels: @levels. Making those =head2\n" | ||||
| 839 | if @levels and $Debug; | ||||||||
| 840 | 1 | 3 | foreach my $h (map @{$map_r->{$_}}, @levels) { | ||||||
| 0 | 0 | ||||||||
| 841 | 0 | 0 | push @headings, $h; | ||||||
| 842 | 0 | 0 | $h->attr('was-tag', $h->tag); | ||||||
| 843 | 0 | 0 | $h->attr('_tag', 'h2'); | ||||||
| 844 | } | ||||||||
| 845 | } | ||||||||
| 846 | |||||||||
| 847 | 1 | 3 | foreach my $h (@headings) { | ||||||
| 848 | 1 | 50 | 9 | if($h->parent->is_inside('h1', 'h2')) { | |||||
| 849 | # Don't put headings inside other headings. It's just stupid. | ||||||||
| 850 | 0 | 0 | $h->replace_with_content; | ||||||
| 851 | 0 | 0 | undef($h); | ||||||
| 852 | } | ||||||||
| 853 | } | ||||||||
| 854 | |||||||||
| 855 | 1 | 33 | foreach my $h (grep defined($_), @headings) { | ||||||
| 856 | 1 | 4 | my @c = $h->content_list; | ||||||
| 857 | 1 | 50 | 10 | if(!@c) { | |||||
| 50 | |||||||||
| 858 | 0 | 0 | $h->delete; | ||||||
| 859 | } elsif($c[0]->tag ne '~texticle') { | ||||||||
| 860 | 0 | 0 | $h->replace_with_content; | ||||||
| 861 | # Don't have things other than texticles in headings | ||||||||
| 862 | } else { | ||||||||
| 863 | 1 | 50 | 10 | if(@c > 1) { | |||||
| 864 | # promote all but the first element | ||||||||
| 865 | 0 | 0 | $h->detach_content; | ||||||
| 866 | 0 | 0 | $h->push_content(shift @c); | ||||||
| 867 | 0 | 0 | $h->postinsert(@c); | ||||||
| 868 | # SHOULD HAVE HAPPENED ANYWAY. | ||||||||
| 869 | } | ||||||||
| 870 | # else @c is just one element, a texticle -- which is ideal. | ||||||||
| 871 | 1 | 50 | 3 | commentate($tree, | |||||
| 872 | "# Icky: heading " . $h->attr('id') | ||||||||
| 873 | . " not immediately under body." | ||||||||
| 874 | ) unless $h->parent eq $tree; | ||||||||
| 875 | } | ||||||||
| 876 | } | ||||||||
| 877 | |||||||||
| 878 | 1 | 13 | return; | ||||||
| 879 | } | ||||||||
| 880 | |||||||||
| 881 | #-------------------------------------------------------------------------- | ||||||||
| 882 | |||||||||
| 883 | sub goodify_p_elements { | ||||||||
| 884 | 1 | 1 | 0 | 5 | foreach my $x ($_[0], $_[0]->find_by_tag_name('over', 'item')) { | ||||
| 885 | 1 | 44 | my $dirty; | ||||||
| 886 | 1 | 4 | my @children = $x->content_list; | ||||||
| 887 | |||||||||
| 888 | 1 | 9 | for(my $i = 0; $i < @children; ++$i) { | ||||||
| 889 | 1 | 50 | 3 | if($children[$i]->tag eq 'p') { | |||||
| 890 | 0 | 0 | my $p = $children[$i]; | ||||||
| 891 | 0 | 0 | my @p_content = $p->detach_content; | ||||||
| 892 | 0 | 0 | $p->delete; | ||||||
| 893 | 0 | 0 | $dirty = 1; | ||||||
| 894 | |||||||||
| 895 | # Replace the p in the list with its content, and update $i: | ||||||||
| 896 | 0 | 0 | splice @children, $i, 1, @p_content; | ||||||
| 897 | 0 | 0 | $i += scalar(@p_content) - 1; | ||||||
| 898 | # Properly, | ||||||||
| 899 | # Leaves $i alone if @p_content == 1. | ||||||||
| 900 | # Decrements $i if @p_content == 0. | ||||||||
| 901 | # Adds to $i appropriately for other sizes of @p_content. | ||||||||
| 902 | } | ||||||||
| 903 | } | ||||||||
| 904 | |||||||||
| 905 | 1 | 50 | 12 | if($dirty) { | |||||
| 906 | 0 | 0 | $x->detach_content; | ||||||
| 907 | 0 | 0 | $x->push_content(@children); | ||||||
| 908 | } | ||||||||
| 909 | } | ||||||||
| 910 | |||||||||
| 911 | 1 | 2 | my @c; | ||||||
| 912 | # /Try/ to delete all p's | ||||||||
| 913 | 1 | 4 | foreach my $p ($_[0]->find_by_tag_name('p')) { | ||||||
| 914 | 0 | 0 | @c = $p->content_list; | ||||||
| 915 | 0 | 0 | 0 | if(!@c) { | |||||
| 0 | |||||||||
| 916 | 0 | 0 | $p->delete; # always right? | ||||||
| 917 | |||||||||
| 918 | 0 | 0 | } elsif(@c == grep {; $_->tag eq '~texticle'} @c) { | ||||||
| 919 | #all texticles! | ||||||||
| 920 | 0 | 0 | $p->replace_with_content; | ||||||
| 921 | } else { | ||||||||
| 922 | 0 | 0 | |||||||
| 923 | "# Odd: content of p (", | ||||||||
| 924 | $p->attr('id'), | ||||||||
| 925 | ") is not all texticles: [", | ||||||||
| 926 | join(' ', map $_->tag, @c), "]\n" | ||||||||
| 927 | ; | ||||||||
| 928 | # Shouldn't happen, I think. | ||||||||
| 929 | } | ||||||||
| 930 | } | ||||||||
| 931 | |||||||||
| 932 | 1 | 40 | return; | ||||||
| 933 | } | ||||||||
| 934 | |||||||||
| 935 | #-------------------------------------------------------------------------- | ||||||||
| 936 | |||||||||
| 937 | sub promote_some_secondary_children { | ||||||||
| 938 | 1 | 1 | 0 | 9 | foreach my $x (reverse($_[0]->find_by_tag_name('item', 'h1' .. 'h6'))) { | ||||
| 939 | 1 | 66 | my @c = $x->content_list; | ||||||
| 940 | 1 | 50 | 20 | if(@c > 1) { | |||||
| 941 | # Take all children after the first, and move them up to | ||||||||
| 942 | # being right sisters of this node. | ||||||||
| 943 | 0 | 0 | 0 | ||||||
| 944 | "# Promote_some_secondary_children applies to ", | ||||||||
| 945 | $x->attr('id'), | ||||||||
| 946 | ": (", | ||||||||
| 947 | join(", ", map $_->attr('id'), @c), ")\n" if $Debug; | ||||||||
| 948 | 0 | 0 | $x->detach_content; | ||||||
| 949 | 0 | 0 | $x->push_content(shift @c); | ||||||
| 950 | 0 | 0 | $x->postinsert(@c); | ||||||
| 951 | #print "Done\n" if $Debug; | ||||||||
| 952 | } | ||||||||
| 953 | } | ||||||||
| 954 | #print "Returning\n" if $Debug; | ||||||||
| 955 | 1 | 23 | return; | ||||||
| 956 | } | ||||||||
| 957 | |||||||||
| 958 | sub literalize_text_under { | ||||||||
| 959 | # Traverse tree, turning text segments into ~literal pseudoelements | ||||||||
| 960 | 2 | 2 | 0 | 4 | my $node = $_[0]; | ||||
| 961 | 2 | 4 | my(@children, $dirty); | ||||||
| 962 | 2 | 9 | foreach my $c (@children = $node->content_list) { | ||||||
| 963 | 2 | 100 | 16 | if(ref $c) { | |||||
| 964 | 1 | 14 | literalize_text_under($c); | ||||||
| 965 | } else { | ||||||||
| 966 | 1 | 2 | $dirty = 1; | ||||||
| 967 | 1 | 15 | $c = HTML::Element->new('~literal', 'text' => $c, | ||||||
| 968 | 'id', '``G' . ++$counter); | ||||||||
| 969 | } | ||||||||
| 970 | } | ||||||||
| 971 | 2 | 100 | 49 | if($dirty) { | |||||
| 972 | 1 | 5 | $node->detach_content; | ||||||
| 973 | 1 | 13 | $node->push_content(@children); | ||||||
| 974 | } | ||||||||
| 975 | 2 | 18 | return; | ||||||
| 976 | } | ||||||||
| 977 | |||||||||
| 978 | #-------------------------------------------------------------------------- | ||||||||
| 979 | |||||||||
| 980 | sub texticulate { | ||||||||
| 981 | # group ~literals and phrasals into texticles | ||||||||
| 982 | # -- maximally high-and-merged phrasal/text groups | ||||||||
| 983 | 3 | 3 | 0 | 5 | my $node = $_[0]; | ||||
| 984 | 3 | 4 | my $dirty; | ||||||
| 985 | 3 | 9 | my(@children) = $node->content_list; | ||||||
| 986 | |||||||||
| 987 | #foreach my $c (@children) { | ||||||||
| 988 | # texticulate($c); | ||||||||
| 989 | #} | ||||||||
| 990 | |||||||||
| 991 | #print "Applying to $node = ", $node->tag, "\n"; | ||||||||
| 992 | |||||||||
| 993 | 3 | 100 | 19 | if(! $Phrasal{$node->tag}) { | |||||
| 994 | # Only non-phrasals can have texticles as children! | ||||||||
| 995 | 2 | 14 | my $last_tag; | ||||||
| 996 | 2 | 8 | for(my $i = 0; $i < @children; $i++) { | ||||||
| 997 | 2 | 9 | texticulate($children[$i]); # RECURSE! | ||||||
| 998 | 2 | 100 | 18 | next unless $Phrasal{$children[$i]->tag}; | |||||
| 999 | |||||||||
| 1000 | 1 | 50 | 33 | 18 | if($i == 0 | ||||
| 0 | |||||||||
| 1001 | or | ||||||||
| 1002 | !$Phrasal{ | ||||||||
| 1003 | $last_tag = $children[$i - 1]->tag | ||||||||
| 1004 | } | ||||||||
| 1005 | ) { | ||||||||
| 1006 | # start a new texticle group | ||||||||
| 1007 | 1 | 2 | $dirty = 1; | ||||||
| 1008 | 1 | 2 | my $old = $children[$i]; | ||||||
| 1009 | 1 | 7 | $children[$i] = HTML::Element->new('~texticle', | ||||||
| 1010 | 'id', '``G' . ++$counter); | ||||||||
| 1011 | 1 | 59 | $children[$i]->push_content($old); # and demote the phrasal to under it | ||||||
| 1012 | } elsif($last_tag eq '~texticle') { | ||||||||
| 1013 | # move this under preceding texticle | ||||||||
| 1014 | 0 | 0 | $dirty = 1; | ||||||
| 1015 | 0 | 0 | $children[$i - 1]->push_content( splice @children, $i, 1 ); | ||||||
| 1016 | 0 | 0 | --$i; | ||||||
| 1017 | } else { | ||||||||
| 1018 | 0 | 0 | die "SPORK 1231233312!"; | ||||||
| 1019 | } | ||||||||
| 1020 | } | ||||||||
| 1021 | |||||||||
| 1022 | #if(0) { | ||||||||
| 1023 | # foreach my $c (@children) { | ||||||||
| 1024 | # # Now fold the texticular content up | ||||||||
| 1025 | # if($c->tag eq '~texticle') { | ||||||||
| 1026 | # $c->attr('~folded' => [$c->detach_content]); | ||||||||
| 1027 | # } | ||||||||
| 1028 | # } | ||||||||
| 1029 | #} | ||||||||
| 1030 | } | ||||||||
| 1031 | |||||||||
| 1032 | # Now delete all br's! | ||||||||
| 1033 | # (Would it be better to delete BRs only adjacent to a texticle?) | ||||||||
| 1034 | 3 | 49 | for(my $i = 0; $i < @children; $i++) { | ||||||
| 1035 | 2 | 50 | 7 | if($children[$i]->tag eq 'br') { | |||||
| 1036 | 0 | 0 | splice @children, $i, 1; | ||||||
| 1037 | 0 | 0 | --$i; | ||||||
| 1038 | 0 | 0 | $dirty = 1; | ||||||
| 1039 | } | ||||||||
| 1040 | } | ||||||||
| 1041 | # So, the only purpose/effect of BRs is that they serve as barriers | ||||||||
| 1042 | # to unifying adjacent phrasal elements under a common texticle. | ||||||||
| 1043 | # Once we've unified things, we just delete them from the tree. | ||||||||
| 1044 | |||||||||
| 1045 | 3 | 100 | 23 | if($dirty) { | |||||
| 1046 | 1 | 3 | $node->detach_content; | ||||||
| 1047 | 1 | 8 | $node->push_content(@children); | ||||||
| 1048 | } | ||||||||
| 1049 | } | ||||||||
| 1050 | |||||||||
| 1051 | #========================================================================== | ||||||||
| 1052 | |||||||||
| 1053 | sub remap_tags { | ||||||||
| 1054 | 1 | 1 | 0 | 3 | my($tree, $hr) = @_; | ||||
| 1055 | 1 | 50 | 33 | 11 | die unless $hr and ref($hr) eq 'HASH'; | ||||
| 1056 | 1 | 133 | my($recursor, $tag); | ||||||
| 1057 | $recursor = sub { | ||||||||
| 1058 | 2 | 2 | 10 | foreach my $c ($_[0]->content_list) { | |||||
| 1059 | 2 | 100 | 67 | if(ref $c) { | |||||
| 1060 | 1 | 50 | 33 | 4 | if(($tag = $c->tag) and defined $tag and exists $hr->{$tag}) { | ||||
| 33 | |||||||||
| 1061 | 0 | 0 | $c->attr('_tag', $hr->{$tag}); | ||||||
| 1062 | } | ||||||||
| 1063 | 1 | 19 | $recursor->($c); # recurse! | ||||||
| 1064 | } | ||||||||
| 1065 | } | ||||||||
| 1066 | 2 | 31 | return; | ||||||
| 1067 | 1 | 10 | }; | ||||||
| 1068 | |||||||||
| 1069 | 1 | 3 | $recursor->($tree); # Run the recursion. | ||||||
| 1070 | |||||||||
| 1071 | 1 | 2 | undef $recursor; # So the lambda's refcount can hit 0, and can GC. | ||||||
| 1072 | 1 | 6 | return; | ||||||
| 1073 | } | ||||||||
| 1074 | |||||||||
| 1075 | #-------------------------------------------------------------------------- | ||||||||
| 1076 | |||||||||
| 1077 | sub wrangle_body_children { | ||||||||
| 1078 | 0 | 0 | 0 | 0 | my $tree = $_[0]; | ||||
| 1079 | 0 | 0 | my @children = $tree->content_list; | ||||||
| 1080 | 0 | 0 | my $dirty = 0; | ||||||
| 1081 | |||||||||
| 1082 | 0 | 0 | my $c; | ||||||
| 1083 | 0 | 0 | $tree->normalize_content; # NB: doesn't recurse | ||||||
| 1084 | |||||||||
| 1085 | 0 | 0 | for(my $i = 0; $i < @children; ++$i) { | ||||||
| 1086 | 0 | 0 | my $c = $children[$i]; | ||||||
| 1087 | 0 | 0 | 0 | if(!ref($c)) { | |||||
| 1088 | # put under a new p | ||||||||
| 1089 | 0 | 0 | $dirty = 1; | ||||||
| 1090 | ( | ||||||||
| 1091 | 0 | 0 | $children[$i] = HTML::Element->new('p', 'superimplicit' => 1, | ||||||
| 1092 | 'id', '``G' . ++$counter | ||||||||
| 1093 | ) | ||||||||
| 1094 | )->push_content($c); | ||||||||
| 1095 | #} elsif($c->tag eq 'hr') { | ||||||||
| 1096 | # # do anything special? | ||||||||
| 1097 | } | ||||||||
| 1098 | } | ||||||||
| 1099 | |||||||||
| 1100 | 0 | 0 | 0 | if($dirty) { | |||||
| 1101 | 0 | 0 | $tree->detach_content; | ||||||
| 1102 | 0 | 0 | $tree->push_content(@children); | ||||||
| 1103 | } | ||||||||
| 1104 | |||||||||
| 1105 | 0 | 0 | return; | ||||||
| 1106 | } | ||||||||
| 1107 | |||||||||
| 1108 | #-------------------------------------------------------------------------- | ||||||||
| 1109 | |||||||||
| 1110 | sub lists_render { # Recursive. | ||||||||
| 1111 | 2 | 2 | 0 | 19 | my $node = $_[0]; | ||||
| 1112 | 2 | 4 | my $tag; | ||||||
| 1113 | 2 | 50 | 33 | 6 | if(($tag = $node->tag) eq 'ul' or $tag eq 'menu') { | ||||
| 50 | |||||||||
| 50 | |||||||||
| 50 | |||||||||
| 1114 | 0 | 0 | $node->attr('was-tag', $tag); | ||||||
| 1115 | 0 | 0 | $node->attr('_tag', 'over'); | ||||||
| 1116 | 0 | 0 | foreach my $c ($node->content_list) { | ||||||
| 1117 | 0 | 0 | 0 | 0 | next unless ref($c) and $c->tag eq 'li'; | ||||
| 1118 | 0 | 0 | $c->attr('_tag', 'item'); | ||||||
| 1119 | 0 | 0 | $c->unshift_content('* '); | ||||||
| 1120 | # TODO: support bullet types other than this? | ||||||||
| 1121 | } | ||||||||
| 1122 | |||||||||
| 1123 | } elsif($tag eq 'ol') { | ||||||||
| 1124 | 0 | 0 | $node->attr('was-tag', $tag); | ||||||
| 1125 | 0 | 0 | $node->attr('_tag', 'over'); | ||||||
| 1126 | 0 | 0 | my $x = 0; | ||||||
| 1127 | 0 | 0 | foreach my $c ($node->content_list) { | ||||||
| 1128 | 0 | 0 | 0 | 0 | next unless ref($c) and $c->tag eq 'li'; | ||||
| 1129 | 0 | 0 | $c->attr('_tag', 'item'); | ||||||
| 1130 | 0 | 0 | $c->unshift_content(++$x . '. '); | ||||||
| 1131 | # TODO: support number styles other than this? | ||||||||
| 1132 | } | ||||||||
| 1133 | |||||||||
| 1134 | } elsif($tag eq 'dl') { | ||||||||
| 1135 | 0 | 0 | $node->attr('was-tag', $tag); | ||||||
| 1136 | 0 | 0 | $node->attr('_tag', 'over'); | ||||||
| 1137 | 0 | 0 | my $tag; | ||||||
| 1138 | 0 | 0 | foreach my $c ($node->content_list) { | ||||||
| 1139 | 0 | 0 | 0 | next unless ref($c); | |||||
| 1140 | 0 | 0 | 0 | if(($tag = $c->tag) eq 'dt') { | |||||
| 0 | |||||||||
| 1141 | 0 | 0 | $c->attr('was-tag', $tag); | ||||||
| 1142 | 0 | 0 | $c->attr('_tag', 'item'); | ||||||
| 1143 | } elsif($tag eq 'dd') { | ||||||||
| 1144 | 0 | 0 | $c->attr('was-tag', $tag); | ||||||
| 1145 | 0 | 0 | $c->attr('_tag', 'item'); | ||||||
| 1146 | # Altho really, earlier on, we will have turned all dd's into p's! | ||||||||
| 1147 | # This code is here just in case we decide that that wasn't | ||||||||
| 1148 | # such a hot idea. | ||||||||
| 1149 | # Instead of turning dd's into items, consider replacing with | ||||||||
| 1150 | # content, with a br on each side? Or too late for that? | ||||||||
| 1151 | } | ||||||||
| 1152 | # else just moooove along | ||||||||
| 1153 | } | ||||||||
| 1154 | |||||||||
| 1155 | } elsif($tag eq 'blockquote') { # not really a list, but hey. | ||||||||
| 1156 | 0 | 0 | $node->attr('was-tag', $tag); | ||||||
| 1157 | 0 | 0 | $node->attr('_tag', 'over'); | ||||||
| 1158 | } | ||||||||
| 1159 | |||||||||
| 1160 | # In any case, recurse... | ||||||||
| 1161 | 2 | 36 | foreach my $c ($node->content_list) { | ||||||
| 1162 | 2 | 100 | 19 | lists_render($c) if ref $c; | |||||
| 1163 | } | ||||||||
| 1164 | } | ||||||||
| 1165 | |||||||||
| 1166 | #-------------------------------------------------------------------------- | ||||||||
| 1167 | |||||||||
| 1168 | sub br_render { | ||||||||
| 1169 | # render BRs. | ||||||||
| 1170 | |||||||||
| 1171 | # TODO: anything necessary? | ||||||||
| 1172 | |||||||||
| 1173 | 1 | 1 | 0 | 2 | return; | ||||
| 1174 | } | ||||||||
| 1175 | |||||||||
| 1176 | |||||||||
| 1177 | sub hr_render { | ||||||||
| 1178 | 1 | 1 | 0 | 2 | my $tree = $_[0]; | ||||
| 1179 | 1 | 2 | my $alt; | ||||||
| 1180 | 1 | 4 | foreach my $hr ($tree->find_by_tag_name('hr')) { | ||||||
| 1181 | 0 | 0 | 0 | if($hr->parent->tag eq 'body') { | |||||
| 1182 | # Special sauce. SPECIAL SAUCE! | ||||||||
| 1183 | 0 | 0 | $hr->attr('_tag', 'p'); | ||||||
| 1184 | 0 | 0 | $hr->attr('was-tag', 'hr'); | ||||||
| 1185 | 0 | 0 | $hr->push_content('----'); | ||||||
| 1186 | } else { | ||||||||
| 1187 | 0 | 0 | $hr->replace_with( | ||||||
| 1188 | $hr->new('br', 'was-tag' => 'hr', 'id' => '``G' . ++$counter), | ||||||||
| 1189 | '----', | ||||||||
| 1190 | $hr->new('br', 'was-tag' => 'hr', 'id' => '``G' . ++$counter), | ||||||||
| 1191 | ); | ||||||||
| 1192 | } | ||||||||
| 1193 | } | ||||||||
| 1194 | 1 | 29 | return; | ||||||
| 1195 | } | ||||||||
| 1196 | |||||||||
| 1197 | |||||||||
| 1198 | sub pre_render { | ||||||||
| 1199 | 1 | 1 | 0 | 2 | my $tree = $_[0]; | ||||
| 1200 | 1 | 4 | foreach my $p ($tree->find_by_tag_name('pre')) { | ||||||
| 1201 | # Delete left or right ignorable WS nodes... | ||||||||
| 1202 | { | ||||||||
| 1203 | 0 | 0 | my $left = $p->left; | ||||||
| 0 | 0 | ||||||||
| 1204 | #print "Left of $p is $left\n"; | ||||||||
| 1205 | 0 | 0 | 0 | 0 | if(defined $left and !ref $left and $left =~ m<^\s*$>s) { | ||||
| 0 | |||||||||
| 1206 | # all nil or WS. | ||||||||
| 1207 | #print "Delendum left at", $p->attr('id') || $p->address, "!\n"; | ||||||||
| 1208 | 0 | 0 | $p->parent->splice_content($p->pindex - 1, 1); # delete preceding WS. | ||||||
| 1209 | } | ||||||||
| 1210 | } | ||||||||
| 1211 | { | ||||||||
| 1212 | 0 | 0 | my $right = $p->right; | ||||||
| 0 | 0 | ||||||||
| 1213 | 0 | 0 | 0 | 0 | if(defined $right and !ref $right and $right =~ m<^\s*$>s) { | ||||
| 0 | |||||||||
| 1214 | # all nil or WS. | ||||||||
| 1215 | #print "Delendum right at", $p->attr('id') || $p->address, "!\n"; | ||||||||
| 1216 | 0 | 0 | $p->parent->splice_content($p->pindex + 1, 1); # delete following WS. | ||||||
| 1217 | } | ||||||||
| 1218 | } | ||||||||
| 1219 | |||||||||
| 1220 | # Now acually render, simply... | ||||||||
| 1221 | 0 | 0 | my $text_content = $p->as_text; | ||||||
| 1222 | 0 | 0 | 0 | unless($text_content =~ m/\S+/) { | |||||
| 1223 | 0 | 0 | $p->delete; | ||||||
| 1224 | 0 | 0 | next; | ||||||
| 1225 | } | ||||||||
| 1226 | |||||||||
| 1227 | 0 | 0 | $text_content =~ s/^\n+//s; # Kill leading newlines | ||||||
| 1228 | 0 | 0 | $text_content =~ s/\n+$//s; # Kill trailing newlines | ||||||
| 1229 | |||||||||
| 1230 | 0 | 0 | my $left = $p->left; | ||||||
| 1231 | 0 | 0 | 0 | 0 | if($left and ref($left) and $left->tag eq 'pre') { | ||||
| 0 | |||||||||
| 1232 | # prepend to the immediately preceding pre's content | ||||||||
| 1233 | 0 | 0 | ${ | ||||||
| 1234 | 0 | 0 | $left->attr('~pre_content_r') | ||||||
| 1235 | } .= "\n" . $text_content; | ||||||||
| 1236 | 0 | 0 | $p->delete; | ||||||
| 1237 | } else { | ||||||||
| 1238 | 0 | 0 | $p->delete_content; | ||||||
| 1239 | 0 | 0 | $p->attr('~pre_content_r', \$text_content); | ||||||
| 1240 | #print "Pre content [[",$text_content,"]]\n"; | ||||||||
| 1241 | } | ||||||||
| 1242 | } | ||||||||
| 1243 | 1 | 26 | return; | ||||||
| 1244 | } | ||||||||
| 1245 | |||||||||
| 1246 | sub q_render { | ||||||||
| 1247 | 1 | 1 | 0 | 2 | my $tree = $_[0]; | ||||
| 1248 | 1 | 4 | foreach my $q ($tree->find_by_tag_name('q')) { | ||||||
| 1249 | 0 | 0 | $q->push_content('"'); | ||||||
| 1250 | 0 | 0 | $q->unshift_content('"'); | ||||||
| 1251 | 0 | 0 | $q->replace_with_content; | ||||||
| 1252 | } | ||||||||
| 1253 | 1 | 25 | return; | ||||||
| 1254 | } | ||||||||
| 1255 | |||||||||
| 1256 | sub images_render { | ||||||||
| 1257 | 1 | 1 | 0 | 13 | my $tree = $_[0]; | ||||
| 1258 | 1 | 15 | foreach my $img ($tree->find_by_tag_name('img')) { | ||||||
| 1259 | 0 | 0 | my $alt; | ||||||
| 1260 | 0 | 0 | 0 | if(defined($alt = $img->attr('alt'))) { | |||||
| 1261 | 0 | 0 | $img->replace_with($alt); | ||||||
| 1262 | } else { | ||||||||
| 1263 | 0 | 0 | 0 | $img->replace_with( | |||||
| 1264 | $Debug ? | ||||||||
| 1265 | ('[IMAGE' . $img->attr('id') . ']') : | ||||||||
| 1266 | '[IMAGE]' | ||||||||
| 1267 | ); | ||||||||
| 1268 | #?? $img->delete; | ||||||||
| 1269 | } | ||||||||
| 1270 | } | ||||||||
| 1271 | 1 | 27 | return; | ||||||
| 1272 | } | ||||||||
| 1273 | |||||||||
| 1274 | #-------------------------------------------------------------------------- | ||||||||
| 1275 | |||||||||
| 1276 | sub prune_by_tag_name { | ||||||||
| 1277 | 1 | 1 | 0 | 2 | my($tree, @o) = @_; | ||||
| 1278 | 1 | 3 | foreach my $o (@o) { | ||||||
| 1279 | 2 | 50 | 82 | foreach my $x ($tree->find_by_tag_name(ref $o ? @$o : $o)) { | |||||
| 1280 | 0 | 0 | $x->delete; | ||||||
| 1281 | } | ||||||||
| 1282 | } | ||||||||
| 1283 | 1 | 102 | return; | ||||||
| 1284 | } | ||||||||
| 1285 | |||||||||
| 1286 | sub splice_by_tag_name { | ||||||||
| 1287 | 1 | 1 | 0 | 3 | my($tree, @o) = @_; | ||||
| 1288 | 1 | 3 | foreach my $o (@o) { | ||||||
| 1289 | 1 | 50 | 7 | foreach my $x ($tree->find_by_tag_name(ref $o ? @$o : $o)) { | |||||
| 1290 | 0 | 0 | $x->replace_with_content; | ||||||
| 1291 | } | ||||||||
| 1292 | } | ||||||||
| 1293 | 1 | 144 | return; | ||||||
| 1294 | } | ||||||||
| 1295 | |||||||||
| 1296 | #-------------------------------------------------------------------------- | ||||||||
| 1297 | sub tree_as_pod { | ||||||||
| 1298 | 1 | 1 | 0 | 2 | my $tree = $_[0]; | ||||
| 1299 | |||||||||
| 1300 | 1 | 2 | my @lines; | ||||||
| 1301 | 1 | 3 | my $comments = $tree->attr('_pod_comments'); | ||||||
| 1302 | |||||||||
| 1303 | 1 | 11 | my $bender; | ||||||
| 1304 | |||||||||
| 1305 | $bender = sub { | ||||||||
| 1306 | 3 | 3 | 6 | my(@post, $node); | |||||
| 1307 | 3 | 9 | my $tag = ($node = $_[0])->tag; | ||||||
| 1308 | |||||||||
| 1309 | 3 | 100 | 35 | if($tag eq 'body') { | |||||
| 50 | |||||||||
| 50 | |||||||||
| 50 | |||||||||
| 100 | |||||||||
| 50 | |||||||||
| 50 | |||||||||
| 1310 | # no-op | ||||||||
| 1311 | } elsif($tag eq 'pre') { | ||||||||
| 1312 | 0 | 0 | push @lines, ${$node->attr('~pre_content_r')}; | ||||||
| 0 | 0 | ||||||||
| 1313 | 0 | 0 | 0 | $lines[-1] =~ s/^/ /gm if $lines[-1] =~ m/^\S/m; | |||||
| 1314 | # bump everything over if there's any lines that start with | ||||||||
| 1315 | # anything non-spaceys | ||||||||
| 1316 | 0 | 0 | while($lines[-1] =~ s/\n\n/\n \n/) { } | ||||||
| 1317 | # have there be no zero-length lines. | ||||||||
| 1318 | } elsif($tag eq 'over') { | ||||||||
| 1319 | 0 | 0 | push @lines, "=over"; | ||||||
| 1320 | 0 | 0 | push @post, "=back"; | ||||||
| 1321 | } elsif($tag eq 'item') { | ||||||||
| 1322 | 0 | 0 | push @lines, "=item"; | ||||||
| 1323 | } elsif($tag eq 'h1') { | ||||||||
| 1324 | 1 | 13 | push @lines, "=head1"; | ||||||
| 1325 | } elsif($tag eq 'h2') { | ||||||||
| 1326 | 0 | 0 | push @lines, "=head2"; | ||||||
| 1327 | } elsif($tag eq '~texticle') { | ||||||||
| 1328 | 1 | 4 | my $text = render_texticle($tree,$node); | ||||||
| 1329 | 1 | 4 | $text =~ s/^\s+//s; | ||||||
| 1330 | 1 | 3 | $text =~ s/\s+$//s; | ||||||
| 1331 | 1 | 2 | $text =~ s/^=/E<61>/s; | ||||||
| 1332 | # So that this can't be mistaken for a directive -- on the | ||||||||
| 1333 | # off chance that text content starts with a '=' | ||||||||
| 1334 | |||||||||
| 1335 | #$text = "{$text}"; | ||||||||
| 1336 | |||||||||
| 1337 | 1 | 50 | 33 | 25 | if( | ||||
| 33 | |||||||||
| 33 | |||||||||
| 1338 | @lines and | ||||||||
| 1339 | $lines[-1] =~ m/^=(\w{1,10})$/s and | ||||||||
| 1340 | ( $1 eq 'item' or $1 eq 'head1' or $1 eq 'head2' ) | ||||||||
| 1341 | ) { | ||||||||
| 1342 | # Merge this text with the directive: | ||||||||
| 1343 | 1 | 3 | $text = pop(@lines) . ' ' . $text; | ||||||
| 1344 | } | ||||||||
| 1345 | |||||||||
| 1346 | 1 | 5 | push @lines, wrap72_dammit($text); | ||||||
| 1347 | 1 | 4 | $lines[-1] =~ s/\s+$//s; # Make REALLY sure there's no tailing WS | ||||||
| 1348 | 1 | 50 | 4 | pop @lines unless length $lines[-1]; # Sanity check. | |||||
| 1349 | |||||||||
| 1350 | 1 | 4 | return; | ||||||
| 1351 | # Don't recurse under texticles (because nothing should be there!) | ||||||||
| 1352 | } else { | ||||||||
| 1353 | 0 | 0 | 0 | print "unrenderable element \"$tag\" in phrasal-pass\n" if $Debug; | |||||
| 1354 | } | ||||||||
| 1355 | |||||||||
| 1356 | 2 | 6 | foreach my $c ($node->content_list) { | ||||||
| 1357 | 2 | 23 | $bender->($c); | ||||||
| 1358 | } | ||||||||
| 1359 | |||||||||
| 1360 | 2 | 4 | push @lines, @post; | ||||||
| 1361 | 2 | 5 | return; | ||||||
| 1362 | 1 | 7 | }; | ||||||
| 1363 | 1 | 3 | $bender->($tree); | ||||||
| 1364 | 1 | 1 | undef $bender; | ||||||
| 1365 | |||||||||
| 1366 | 1 | 50 | 33 | 24 | unshift @lines, "=pod" unless @lines and $lines[0] =~ m<^=>s; | ||||
| 1367 | |||||||||
| 1368 | 1 | 3 | push @lines, "=cut\n\n"; # get extra double-newline at end | ||||||
| 1369 | |||||||||
| 1370 | 1 | 4 | my $pod = join "\n\n", @lines; | ||||||
| 1371 | |||||||||
| 1372 | 1 | 50 | 33 | 11 | if($comments and @$comments) { | ||||
| 1373 | 1 | 2 | foreach my $c (@$comments) { | ||||||
| 1374 | 5 | 11 | $c =~ tr<\cm\cj>< >s; | ||||||
| 1375 | 5 | 100 | 22 | $c = "#" . $c unless $c =~ m<^\s*#>s; | |||||
| 1376 | } | ||||||||
| 1377 | 1 | 5 | $pod .= join "\n", @$comments, ''; | ||||||
| 1378 | } | ||||||||
| 1379 | |||||||||
| 1380 | 1 | 50 | 20 | sleep(0), print("#Start pod\n\n$pod\n"), sleep(0) if $Debug > 1; | |||||
| 1381 | 1 | 5 | return \$pod; | ||||||
| 1382 | } | ||||||||
| 1383 | |||||||||
| 1384 | #-------------------------------------------------------------------------- | ||||||||
| 1385 | sub render_texticle { | ||||||||
| 1386 | 1 | 1 | 0 | 3 | my($tree, $t) = @_; | ||||
| 1387 | 1 | 2 | my $text = ''; | ||||||
| 1388 | 1 | 2 | my $bender; | ||||||
| 1389 | |||||||||
| 1390 | 1 | 3 | my $a_name = $tree->attr('_a_name'); | ||||||
| 1391 | 1 | 17 | my $a_href = $tree->attr('_a_href'); | ||||||
| 1392 | |||||||||
| 1393 | 1 | 9 | my $under_l_count = 0; | ||||||
| 1394 | $bender = sub { | ||||||||
| 1395 | 2 | 2 | 7 | my $tag = (my $node = $_[0])->tag; | |||||
| 1396 | 2 | 11 | my $post = '>'; | ||||||
| 1397 | 2 | 3 | my $decr_under_l_count_post = 0; | ||||||
| 1398 | 2 | 100 | 7 | if($tag eq '~texticle') { | |||||
| 50 | |||||||||
| 0 | |||||||||
| 0 | |||||||||
| 0 | |||||||||
| 0 | |||||||||
| 1399 | # no-op -- just a container | ||||||||
| 1400 | 1 | 2 | $post = ''; | ||||||
| 1401 | } elsif($tag eq '~literal') { | ||||||||
| 1402 | 1 | 4 | my $content = $node->attr('text'); | ||||||
| 1403 | #print "Text from ~literal : ", $node->attr('text'), "\n"; | ||||||||
| 1404 | 1 | 50 | 45 | $content =~ s/\Q$nbsp/ /og if defined $nbsp; | |||||
| 1405 | # Kill nbsps. Why? | ||||||||
| 1406 | # First off, most of them are lame editor artifacts. | ||||||||
| 1407 | # Second off, actually treating them correctly (with S<...>) | ||||||||
| 1408 | # would be a real pain. | ||||||||
| 1409 | |||||||||
| 1410 | 1 | 50 | 4 | if($under_l_count) { | |||||
| 1411 | 0 | 0 | encode_entities_harder($content); | ||||||
| 1412 | } else { | ||||||||
| 1413 | 1 | 3 | encode_entities($content); | ||||||
| 1414 | } | ||||||||
| 1415 | #if(defined $E_slash) { | ||||||||
| 1416 | # # Delete at least most of the optional E |
||||||||
| 1417 | # while( $content =~ s{^([^<>]*)\Q$E_slash\E}{$1/}so ) {} | ||||||||
| 1418 | # while( $content =~ s{\Q$E_slash\E([^<>]*)$}{/$1}so ) {} | ||||||||
| 1419 | #} | ||||||||
| 1420 | #if(defined $E_vbar) { | ||||||||
| 1421 | # # Delete at least most of the optional E |
||||||||
| 1422 | # while( $content =~ s{^([^<>]*)\Q$E_vbar\E}{$1|}so ) {} | ||||||||
| 1423 | # while( $content =~ s{\Q$E_vbar\E([^<>]*)$}{|$1}so ) {} | ||||||||
| 1424 | #} | ||||||||
| 1425 | 1 | 50 | 4 | print "\$text is undef?" unless defined $content; | |||||
| 1426 | 1 | 2 | $text .= $content; | ||||||
| 1427 | 1 | 2 | $post = ''; | ||||||
| 1428 | } elsif($tag eq 'code') { | ||||||||
| 1429 | 0 | 0 | $text .= 'C<'; | ||||||
| 1430 | } elsif($tag eq 'i') { | ||||||||
| 1431 | 0 | 0 | $text .= 'I<'; | ||||||
| 1432 | } elsif($tag eq 'b') { | ||||||||
| 1433 | 0 | 0 | $text .= 'B<'; | ||||||
| 1434 | } elsif($tag eq 'a') { | ||||||||
| 1435 | 0 | 0 | my($name, $href); | ||||||
| 1436 | 0 | 0 | 0 | $name = $a_name ? $node->attr('name') : undef; | |||||
| 1437 | 0 | 0 | 0 | $href = $a_href ? $node->attr('href') : undef; | |||||
| 1438 | 0 | 0 | $post = ''; | ||||||
| 1439 | |||||||||
| 1440 | 0 | 0 | 0 | 0 | if(defined $name and length $name) { | ||||
| 1441 | 0 | 0 | $text .= 'X<' . $name . '>'; | ||||||
| 1442 | } | ||||||||
| 1443 | |||||||||
| 1444 | 0 | 0 | 0 | 0 | if(defined $href and length $href) { | ||||
| 1445 | 0 | 0 | encode_entities($href); | ||||||
| 1446 | #print "{Link text:{$href}}\n"; | ||||||||
| 1447 | 0 | 0 | 0 | if($href =~ s/^#//s) { | |||||
| 0 | |||||||||
| 0 | |||||||||
| 1448 | # internal relative href | ||||||||
| 1449 | 0 | 0 | $text .= 'L<'; | ||||||
| 1450 | 0 | 0 | $post .= "|/$href>"; | ||||||
| 1451 | 0 | 0 | ++$under_l_count; | ||||||
| 1452 | 0 | 0 | $decr_under_l_count_post = 1; | ||||||
| 1453 | } elsif($href =~ s/^pod://s) { | ||||||||
| 1454 | # Pass that thru. | ||||||||
| 1455 | # A back door for making straightforward pod links. | ||||||||
| 1456 | 0 | 0 | $text .= 'L<'; | ||||||
| 1457 | 0 | 0 | $post .= "|$href>"; | ||||||
| 1458 | 0 | 0 | ++$under_l_count; | ||||||
| 1459 | 0 | 0 | $decr_under_l_count_post = 1; | ||||||
| 1460 | } elsif($href =~ m<^[-+.a-z0-9A-Z]+\:[^:]>s) { | ||||||||
| 1461 | # It matches RFC 1738's idea of an absolute URL. | ||||||||
| 1462 | # Pass it thru: the podulator should detect that it's a URL | ||||||||
| 1463 | # and handle appropriately. | ||||||||
| 1464 | 0 | 0 | $post .= " ($href)"; | ||||||
| 1465 | } else { | ||||||||
| 1466 | # a relative link?? | ||||||||
| 1467 | 0 | 0 | $href = $href; | ||||||
| 1468 | 0 | 0 | commentate($t->root, "# Untranslatable link: \"$href\""); | ||||||
| 1469 | } | ||||||||
| 1470 | } | ||||||||
| 1471 | } else { | ||||||||
| 1472 | 0 | 0 | print "Unrenderable sub-phrasal element $tag: ignoring\n"; | ||||||
| 1473 | 0 | 0 | $post = ''; | ||||||
| 1474 | } | ||||||||
| 1475 | |||||||||
| 1476 | # Recurse! | ||||||||
| 1477 | 2 | 8 | foreach my $c ($node->content_list) { | ||||||
| 1478 | 1 | 24 | $bender->($c); | ||||||
| 1479 | } | ||||||||
| 1480 | |||||||||
| 1481 | # Now, post-order things: | ||||||||
| 1482 | |||||||||
| 1483 | 2 | 9 | $text .= $post; | ||||||
| 1484 | 2 | 50 | 6 | $under_l_count-- if $decr_under_l_count_post; | |||||
| 1485 | 2 | 5 | return; | ||||||
| 1486 | 1 | 8 | }; | ||||||
| 1487 | 1 | 3 | $bender->($t); | ||||||
| 1488 | 1 | 2 | undef $bender; | ||||||
| 1489 | |||||||||
| 1490 | 1 | 21 | $text =~ s/\s+/ /g; | ||||||
| 1491 | |||||||||
| 1492 | # A weensy bit of cleanup: | ||||||||
| 1493 | 1 | 3 | $text =~ s/ ?> ?$/>/s; | ||||||
| 1494 | 1 | 3 | $text =~ s/^((?:\w<)+) ([^>])/$1$2/; | ||||||
| 1495 | |||||||||
| 1496 | #print "{$text}\n"; | ||||||||
| 1497 | |||||||||
| 1498 | 1 | 3 | return $text; | ||||||
| 1499 | } | ||||||||
| 1500 | |||||||||
| 1501 | #-------------------------------------------------------------------------- | ||||||||
| 1502 | sub COLMAX () {72} | ||||||||
| 1503 | |||||||||
| 1504 | sub wrap72_dammit { | ||||||||
| 1505 | # All because Text::Wrap::wrap DIES when it hits an unwrappably | ||||||||
| 1506 | # large text chunk, DAMMIT. | ||||||||
| 1507 | |||||||||
| 1508 | # So this is a stupid wrapper: knows nothing about tabs or anything. | ||||||||
| 1509 | 1 | 1 | 0 | 3 | my $text = ''; | ||||
| 1510 | 1 | 2 | my $col = 0; | ||||||
| 1511 | 1 | 12 | foreach my $w (split /\s+/, $_[0]) { | ||||||
| 1512 | 2 | 50 | 7 | next unless length $w; | |||||
| 1513 | 2 | 50 | 8 | if(length($w) >= COLMAX) { | |||||
| 50 | |||||||||
| 1514 | # Unwrappably large chunk. | ||||||||
| 1515 | 0 | 0 | 0 | if($col) { | |||||
| 1516 | 0 | 0 | $text .= "\n$w\n"; | ||||||
| 1517 | } else { | ||||||||
| 1518 | 0 | 0 | $text .= "$w\n"; | ||||||
| 1519 | } | ||||||||
| 1520 | 0 | 0 | $col = 0; | ||||||
| 1521 | } elsif ((1 + $col + length $w) < COLMAX) { | ||||||||
| 1522 | # The word will fit on /this/ line | ||||||||
| 1523 | 2 | 100 | 5 | if($col) { | |||||
| 1524 | 1 | 4 | $text .= " $w"; | ||||||
| 1525 | 1 | 3 | $col += 1 + length $w; | ||||||
| 1526 | } else { | ||||||||
| 1527 | 1 | 3 | $text .= $w ; | ||||||
| 1528 | 1 | 2 | $col += length $w; | ||||||
| 1529 | } | ||||||||
| 1530 | } else { | ||||||||
| 1531 | # Start a new line | ||||||||
| 1532 | 0 | 0 | 0 | if($col) { | |||||
| 1533 | 0 | 0 | $text .= "\n$w"; | ||||||
| 1534 | } else { | ||||||||
| 1535 | 0 | 0 | $text .= $w; # never applies? | ||||||
| 1536 | } | ||||||||
| 1537 | 0 | 0 | $col = length $w; | ||||||
| 1538 | } | ||||||||
| 1539 | } | ||||||||
| 1540 | 1 | 4 | $text =~ s/\n+$//s; # nix and trailing newlines | ||||||
| 1541 | |||||||||
| 1542 | 1 | 3 | return $text; | ||||||
| 1543 | } | ||||||||
| 1544 | |||||||||
| 1545 | |||||||||
| 1546 | #========================================================================== | ||||||||
| 1547 | # Adapted from Gisle Aas's HTML::Entities::encode_entities: | ||||||||
| 1548 | |||||||||
| 1549 | sub encode_entities { | ||||||||
| 1550 | 1 | 1 | 0 | 4 | $_[0] =~ s/([^\n\t !-;=?-~])/$Char2ent{$1}/g; | ||||
| 1551 | # Encode control chars, high bit chars and '<' and '>' | ||||||||
| 1552 | 1 | 3 | return; | ||||||
| 1553 | } | ||||||||
| 1554 | |||||||||
| 1555 | sub encode_entities_harder { | ||||||||
| 1556 | 4 | 4 | 0 | 31 | $_[0] =~ s/([^\n\t !\#\$%\'-.0-=?-{}~])/$Char2ent{$1}/g; | ||||
| 1557 | # Encode control chars, high bit chars and '<', '&', '>', '"', | ||||||||
| 1558 | # '|', '/' | ||||||||
| 1559 | 4 | 8 | return; | ||||||
| 1560 | } | ||||||||
| 1561 | |||||||||
| 1562 | #-------------------------------------------------------------------------- | ||||||||
| 1563 | |||||||||
| 1564 | __END__ |