| blib/lib/HTML/ToDocBook.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % | 
| statement | 16 | 18 | 88.8 | 
| branch | n/a | ||
| condition | n/a | ||
| subroutine | 6 | 6 | 100.0 | 
| pod | n/a | ||
| total | 22 | 24 | 91.6 | 
| line | stmt | bran | cond | sub | pod | time | code | 
|---|---|---|---|---|---|---|---|
| 1 | package HTML::ToDocBook; | ||||||
| 2 | 2 | 2 | 42414 | use strict; | |||
| 2 | 4 | ||||||
| 2 | 88 | ||||||
| 3 | 2 | 2 | 13 | use warnings; | |||
| 2 | 3 | ||||||
| 2 | 98 | ||||||
| 4 | |||||||
| 5 | =head1 NAME | ||||||
| 6 | |||||||
| 7 | HTML::ToDocBook - Converts an XHTML file into DocBook. | ||||||
| 8 | |||||||
| 9 | =head1 VERSION | ||||||
| 10 | |||||||
| 11 | This describes version B<0.03> of HTML::ToDocBook. | ||||||
| 12 | |||||||
| 13 | =cut | ||||||
| 14 | |||||||
| 15 | our $VERSION = '0.0301'; | ||||||
| 16 | |||||||
| 17 | =head1 SYNOPSIS | ||||||
| 18 | |||||||
| 19 | use HTML::ToDocBook; | ||||||
| 20 | |||||||
| 21 | my $obj = HTML::ToDocBook->new(%args); | ||||||
| 22 | |||||||
| 23 | $obj->convert(infile=>$filename); | ||||||
| 24 | |||||||
| 25 | # convert HTML file | ||||||
| 26 | $obj->convert(infile=>$filename, html=>1); | ||||||
| 27 | |||||||
| 28 | =head1 DESCRIPTION | ||||||
| 29 | |||||||
| 30 | This module converts an XHTML file into DocBook format using both | ||||||
| 31 | heuristics and XSLT processing. By default, this expects the input file to | ||||||
| 32 | be correct XHTML -- there are other programs such as html tidy | ||||||
| 33 | (http://tidy.sourceforge.net/) which can correct files for you; this does | ||||||
| 34 | not do that. | ||||||
| 35 | |||||||
| 36 | Note also this is very simple; it doesn't deal with things like | ||||||
| 37 |    or  which it has no way of guessing the meaning of.   | 
||||||
| 38 | (For some, however, if they have class names which match DocBook tags, | ||||||
| 39 | they will be turned into those tags) | ||||||
| 40 | This does not merge multiple XHTML files into a single document, | ||||||
| 41 |  so this converts each XHTML file into a  | 
||||||
| 42 |  header being a section (sect1 to sect5).  The  | 
||||||
| 43 | the chapter title. | ||||||
| 44 | |||||||
| 45 | There will likely to be validity errors, depending on how good the original | ||||||
| 46 |  HTML was.  There may be broken links,  | 
||||||
| 47 |  and overuse of  | 
||||||
| 48 | |||||||
| 49 | =cut | ||||||
| 50 | |||||||
| 51 | 2 | 2 | 11 | use Cwd 'abs_path'; | |||
| 2 | 2 | ||||||
| 2 | 114 | ||||||
| 52 | 2 | 2 | 9 | use File::Basename; | |||
| 2 | 4 | ||||||
| 2 | 198 | ||||||
| 53 | 2 | 2 | 11 | use File::Spec; | |||
| 2 | 4 | ||||||
| 2 | 54 | ||||||
| 54 | 2 | 2 | 1730 | use XML::LibXSLT; | |||
| 0 | |||||||
| 0 | |||||||
| 55 | use XML::LibXML; | ||||||
| 56 | use HTML::SimpleParse; | ||||||
| 57 | |||||||
| 58 | =head1 METHODS | ||||||
| 59 | |||||||
| 60 | =head2 new | ||||||
| 61 | |||||||
| 62 | my $conv = HTML::ToDocBook->new(); | ||||||
| 63 | |||||||
| 64 | my $conv = HTML::ToDocBook->new(stylesheet=>$stylesheet); | ||||||
| 65 | |||||||
| 66 | Arguments: | ||||||
| 67 | |||||||
| 68 | =over | ||||||
| 69 | |||||||
| 70 | =item stylesheet | ||||||
| 71 | |||||||
| 72 | A replacement XSLT stylesheet to use for conversions instead of the | ||||||
| 73 | built-in one. This can either be a file name or a string containing | ||||||
| 74 | the entire stylesheet. | ||||||
| 75 | |||||||
| 76 | =back | ||||||
| 77 | |||||||
| 78 | =cut | ||||||
| 79 | |||||||
| 80 | sub new { | ||||||
| 81 | my $class = shift; | ||||||
| 82 | my %parameters = @_; | ||||||
| 83 | my $self = bless ({%parameters}, ref ($class) || $class); | ||||||
| 84 | |||||||
| 85 | my $parser = XML::LibXML->new(); | ||||||
| 86 | my $xslt = XML::LibXSLT->new(); | ||||||
| 87 | |||||||
| 88 | $self->{_parser} = $parser; | ||||||
| 89 | $self->{_xslt} = $xslt; | ||||||
| 90 | |||||||
| 91 | if ($self->{stylesheet} | ||||||
| 92 | and -f $self->{stylesheet}) | ||||||
| 93 | { | ||||||
| 94 | my $fn = abs_path($self->{stylesheet}); | ||||||
| 95 | my $style_doc = $parser->parse_file($fn) | ||||||
| 96 | or die "Could not parse $fn XSLT file"; | ||||||
| 97 | my $stylesheet = $xslt->parse_stylesheet($style_doc) | ||||||
| 98 | or die "Could not parse $fn stylesheet"; | ||||||
| 99 | $self->{_xslt_sheet} = $stylesheet; | ||||||
| 100 | } | ||||||
| 101 | elsif ($self->{stylesheet}) | ||||||
| 102 | { | ||||||
| 103 | my $style_doc = $parser->parse_string($self->{stylesheet}) | ||||||
| 104 | or die "Could not parse string XSLT"; | ||||||
| 105 | my $stylesheet = $xslt->parse_stylesheet($style_doc) | ||||||
| 106 | or die "Could not parse stylesheet"; | ||||||
| 107 | $self->{_xslt_sheet} = $stylesheet; | ||||||
| 108 | } | ||||||
| 109 | else | ||||||
| 110 | { | ||||||
| 111 | |||||||
| 112 | # build the parsed stylesheet from the DATA | ||||||
| 113 | |||||||
| 114 | # This is stored in the DATA handle, after the __DATA__ at | ||||||
| 115 | # the end of this file; but because the scripts may not just | ||||||
| 116 | # create one instance of this object, | ||||||
| 117 | # we have to remember the position of the DATA handle | ||||||
| 118 | # and reset it after we've read from it, just in case | ||||||
| 119 | # we have to read from it again. | ||||||
| 120 | # This also means that we don't close it, either. Hope that doesn't | ||||||
| 121 | # cause a problem... | ||||||
| 122 | |||||||
| 123 | my $curpos = tell(DATA); # remember the __DATA__ position | ||||||
| 124 | my $style_doc = $parser->parse_fh(\*DATA); | ||||||
| 125 | # reset the data handle to the start, just in case | ||||||
| 126 | seek(DATA, $curpos, 0); | ||||||
| 127 | |||||||
| 128 | my $stylesheet = $xslt->parse_stylesheet($style_doc); | ||||||
| 129 | $self->{_xslt_sheet} = $stylesheet; | ||||||
| 130 | } | ||||||
| 131 | |||||||
| 132 | return ($self); | ||||||
| 133 | } # new | ||||||
| 134 | |||||||
| 135 | =head2 convert | ||||||
| 136 | |||||||
| 137 | $obj->convert(infile=>$filename, | ||||||
| 138 | html=>1); | ||||||
| 139 | |||||||
| 140 | Arguments: | ||||||
| 141 | |||||||
| 142 | =over | ||||||
| 143 | |||||||
| 144 | =item infile | ||||||
| 145 | |||||||
| 146 | The name of the file to convert. | ||||||
| 147 | |||||||
| 148 | =item html | ||||||
| 149 | |||||||
| 150 | Parse the input as HTML rather than XML. | ||||||
| 151 | |||||||
| 152 | =back | ||||||
| 153 | |||||||
| 154 | =cut | ||||||
| 155 | |||||||
| 156 | sub convert { | ||||||
| 157 | my $self = shift; | ||||||
| 158 | my %args = ( | ||||||
| 159 | html=>0, | ||||||
| 160 | @_ | ||||||
| 161 | ); | ||||||
| 162 | my $filename = $args{infile}; | ||||||
| 163 | |||||||
| 164 | my ($basename,$path,$suffix) = fileparse($filename,qr{\.html?}i); | ||||||
| 165 | my $outfile = File::Spec->catfile($path, "${basename}.xml"); | ||||||
| 166 | $outfile = '-' if ($filename eq ''); | ||||||
| 167 | |||||||
| 168 | # We need to read in the file first because we need to | ||||||
| 169 | # pre-process it | ||||||
| 170 | my $file_str; | ||||||
| 171 | if ($filename eq '-') # read from STDIN | ||||||
| 172 | { | ||||||
| 173 | local $/; | ||||||
| 174 |  	$file_str =  | 
||||||
| 175 | } | ||||||
| 176 | else | ||||||
| 177 | { | ||||||
| 178 | local $/; | ||||||
| 179 | my $fh; | ||||||
| 180 | open ($fh, "<", $filename) or die "could not open $filename"; | ||||||
| 181 | $file_str = <$fh>; | ||||||
| 182 | close $fh; | ||||||
| 183 | } | ||||||
| 184 | $file_str = $self->insert_sections($file_str); | ||||||
| 185 | |||||||
| 186 | my $first_ss = $self->{_xslt_sheet}; | ||||||
| 187 | |||||||
| 188 | my $source = undef; | ||||||
| 189 | my $result_str = ''; | ||||||
| 190 | if ($args{html}) | ||||||
| 191 | { | ||||||
| 192 | $source = $self->{_parser}->parse_html_string($file_str); | ||||||
| 193 | } | ||||||
| 194 | else | ||||||
| 195 | { | ||||||
| 196 | $source = $self->{_parser}->parse_string($file_str); | ||||||
| 197 | } | ||||||
| 198 | undef $file_str; | ||||||
| 199 | |||||||
| 200 | my %all_params = (); | ||||||
| 201 | my $results = $first_ss->transform($source, %all_params); | ||||||
| 202 | $result_str = $first_ss->output_string($results); | ||||||
| 203 | |||||||
| 204 | # print the result | ||||||
| 205 | my $outfh = undef; | ||||||
| 206 | if ($outfile eq '-' or $outfile eq '') | ||||||
| 207 | { | ||||||
| 208 | $outfh = \*STDOUT; | ||||||
| 209 | } | ||||||
| 210 | else | ||||||
| 211 | { | ||||||
| 212 | open(OUT, ">", $outfile) | ||||||
| 213 | || die "Can't open $outfile for writing!"; | ||||||
| 214 | $outfh = \*OUT; | ||||||
| 215 | } | ||||||
| 216 | print $outfh $result_str; | ||||||
| 217 | if ($outfile ne '-' and $outfile ne '') | ||||||
| 218 | { | ||||||
| 219 | close($outfh); | ||||||
| 220 | } | ||||||
| 221 | return $result_str; | ||||||
| 222 | } # convert | ||||||
| 223 | |||||||
| 224 | =head1 Private Methods | ||||||
| 225 | |||||||
| 226 | These are not guaranteed to be stable. | ||||||
| 227 | |||||||
| 228 | =head2 insert_sections | ||||||
| 229 | |||||||
| 230 | $my str = $obj->insert_sections($string); | ||||||
| 231 | |||||||
| 232 |  This inserts   tags to enclose all levels   | 
||||||
| 233 | of header. These will then be picked up by the XSLT stylesheet | ||||||
| 234 | and converted into section tags. | ||||||
| 235 | |||||||
| 236 | =cut | ||||||
| 237 | |||||||
| 238 | sub insert_sections { | ||||||
| 239 | my $self = shift; | ||||||
| 240 | my $string = shift; | ||||||
| 241 | my %args = ( | ||||||
| 242 | parse_type=>'xml', | ||||||
| 243 | @_ | ||||||
| 244 | ); | ||||||
| 245 | |||||||
| 246 | my $hp = new HTML::SimpleParse(); | ||||||
| 247 | $hp->text($string); | ||||||
| 248 | $hp->parse(); | ||||||
| 249 | |||||||
| 250 | my @newhtml = (); | ||||||
| 251 | my @levels = (); | ||||||
| 252 | my $tok; | ||||||
| 253 | my @tree = $hp->tree(); | ||||||
| 254 | while (@tree) | ||||||
| 255 | { | ||||||
| 256 | $tok = shift @tree; | ||||||
| 257 | if ($tok->{type} eq 'starttag' | ||||||
| 258 | and $tok->{content} =~ /^h(\d)/i) | ||||||
| 259 | { | ||||||
| 260 | # we have a header | ||||||
| 261 | my $header_level = $1; | ||||||
| 262 | # if we had a previous header, then close its div | ||||||
| 263 | # if it is the same or higher | ||||||
| 264 | if (@levels) | ||||||
| 265 | { | ||||||
| 266 | my $prev_level = $levels[$#levels]; | ||||||
| 267 | while ($prev_level > $header_level) | ||||||
| 268 | { | ||||||
| 269 | pop @levels; | ||||||
| 270 | push @newhtml, "\n"; | ||||||
| 271 | $prev_level = $levels[$#levels]; | ||||||
| 272 | } | ||||||
| 273 | if ($prev_level == $header_level) | ||||||
| 274 | { | ||||||
| 275 | pop @levels; | ||||||
| 276 | push @newhtml, "\n"; | ||||||
| 277 | } | ||||||
| 278 | } | ||||||
| 279 | # start a new div for the new header | ||||||
| 280 |  	    push @newhtml, sprintf("\n \n", $header_level);   | 
||||||
| 281 | push @levels, $header_level; | ||||||
| 282 | } | ||||||
| 283 | elsif ($tok->{type} eq 'endtag' | ||||||
| 284 | and $tok->{content} =~ /^\/body/i) | ||||||
| 285 | { | ||||||
| 286 | # we need to close any remaining open section divs | ||||||
| 287 | while (@levels) | ||||||
| 288 | { | ||||||
| 289 | my $prev_level = pop @levels; | ||||||
| 290 | push @newhtml, "\n"; | ||||||
| 291 | } | ||||||
| 292 | } | ||||||
| 293 | push @newhtml, $hp->execute($tok); | ||||||
| 294 | } # go through all the tags | ||||||
| 295 | |||||||
| 296 | return join('', @newhtml); | ||||||
| 297 | } # insert_sections | ||||||
| 298 | |||||||
| 299 | =head1 REQUIRES | ||||||
| 300 | |||||||
| 301 | Cwd | ||||||
| 302 | File::Basename | ||||||
| 303 | File::Spec | ||||||
| 304 | XML::LibXML | ||||||
| 305 | XML::LibXSLT | ||||||
| 306 | HTML::SimpleParse | ||||||
| 307 | Test::More | ||||||
| 308 | |||||||
| 309 | =head1 INSTALLATION | ||||||
| 310 | |||||||
| 311 | To install this module, run the following commands: | ||||||
| 312 | |||||||
| 313 | perl Build.PL | ||||||
| 314 | ./Build | ||||||
| 315 | ./Build test | ||||||
| 316 | ./Build install | ||||||
| 317 | |||||||
| 318 | Or, if you're on a platform (like DOS or Windows) that doesn't like the | ||||||
| 319 | "./" notation, you can do this: | ||||||
| 320 | |||||||
| 321 | perl Build.PL | ||||||
| 322 | perl Build | ||||||
| 323 | perl Build test | ||||||
| 324 | perl Build install | ||||||
| 325 | |||||||
| 326 | In order to install somewhere other than the default, such as | ||||||
| 327 | in a directory under your home directory, like "/home/fred/perl" | ||||||
| 328 | go | ||||||
| 329 | |||||||
| 330 | perl Build.PL --install_base /home/fred/perl | ||||||
| 331 | |||||||
| 332 | as the first step instead. | ||||||
| 333 | |||||||
| 334 | This will install the files underneath /home/fred/perl. | ||||||
| 335 | |||||||
| 336 | You will then need to make sure that you alter the PERL5LIB variable to | ||||||
| 337 | find the modules, and the PATH variable to find the script. | ||||||
| 338 | |||||||
| 339 | Therefore you will need to change: | ||||||
| 340 | your path, to include /home/fred/perl/script (where the script will be) | ||||||
| 341 | |||||||
| 342 | PATH=/home/fred/perl/script:${PATH} | ||||||
| 343 | |||||||
| 344 | the PERL5LIB variable to add /home/fred/perl/lib | ||||||
| 345 | |||||||
| 346 | PERL5LIB=/home/fred/perl/lib:${PERL5LIB} | ||||||
| 347 | |||||||
| 348 | |||||||
| 349 | =head1 SEE ALSO | ||||||
| 350 | |||||||
| 351 | perl(1). | ||||||
| 352 | |||||||
| 353 | =head1 BUGS | ||||||
| 354 | |||||||
| 355 | Please report any bugs or feature requests to the author. | ||||||
| 356 | |||||||
| 357 | =head1 AUTHOR | ||||||
| 358 | |||||||
| 359 | Kathryn Andersen (RUBYKAT) | ||||||
| 360 | perlkat AT katspace dot com | ||||||
| 361 | http://www.katspace.org/tools | ||||||
| 362 | |||||||
| 363 | =head1 COPYRIGHT AND LICENCE | ||||||
| 364 | |||||||
| 365 | XSLT stylesheet based on the one at http://wiki.docbook.org/topic/Html2DocBook | ||||||
| 366 | by Jeff Beal | ||||||
| 367 | |||||||
| 368 | Copyright (c) 2006 by Kathryn Andersen | ||||||
| 369 | |||||||
| 370 | This program is free software; you can redistribute it and/or modify it | ||||||
| 371 | under the same terms as Perl itself. | ||||||
| 372 | |||||||
| 373 | |||||||
| 374 | =cut | ||||||
| 375 | |||||||
| 376 | 1; # End of HTML::ToDocBook | ||||||
| 377 | #------------------------------------------------------------------------ | ||||||
| 378 | # The XSLT stylesheet! | ||||||
| 379 | # The original stylesheet came from | ||||||
| 380 | # http://wiki.docbook.org/topic/Html2DocBook | ||||||
| 381 | # | ||||||
| 382 | __DATA__ |