blib/lib/HTML/ToDocBook.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 16 | 18 | 88.8 |
branch | n/a | ||
condition | n/a | ||
subroutine | 6 | 6 | 100.0 |
pod | n/a | ||
total | 22 | 24 | 91.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::ToDocBook; | ||||||
2 | 2 | 2 | 53148 | use strict; | |||
2 | 6 | ||||||
2 | 148 | ||||||
3 | 2 | 2 | 11 | use warnings; | |||
2 | 5 | ||||||
2 | 116 | ||||||
4 | |||||||
5 | =head1 NAME | ||||||
6 | |||||||
7 | HTML::ToDocBook - Converts an XHTML file into DocBook. | ||||||
8 | |||||||
9 | =head1 VERSION | ||||||
10 | |||||||
11 | This describes version B<0.03> of HTML::ToDocBook. | ||||||
12 | |||||||
13 | =cut | ||||||
14 | |||||||
15 | our $VERSION = '0.03'; | ||||||
16 | |||||||
17 | =head1 SYNOPSIS | ||||||
18 | |||||||
19 | use HTML::ToDocBook; | ||||||
20 | |||||||
21 | my $obj = HTML::ToDocBook->new(%args); | ||||||
22 | |||||||
23 | $obj->convert(infile=>$filename); | ||||||
24 | |||||||
25 | # convert HTML file | ||||||
26 | $obj->convert(infile=>$filename, html=>1); | ||||||
27 | |||||||
28 | =head1 DESCRIPTION | ||||||
29 | |||||||
30 | This module converts an XHTML file into DocBook format using both | ||||||
31 | heuristics and XSLT processing. By default, this expects the input file to | ||||||
32 | be correct XHTML -- there are other programs such as html tidy | ||||||
33 | (http://tidy.sourceforge.net/) which can correct files for you; this does | ||||||
34 | not do that. | ||||||
35 | |||||||
36 | Note also this is very simple; it doesn't deal with things like | ||||||
37 | or which it has no way of guessing the meaning of. |
||||||
38 | (For some, however, if they have class names which match DocBook tags, | ||||||
39 | they will be turned into those tags) | ||||||
40 | This does not merge multiple XHTML files into a single document, | ||||||
41 | so this converts each XHTML file into a |
||||||
42 | header being a section (sect1 to sect5). The |
||||||
43 | the chapter title. | ||||||
44 | |||||||
45 | There will likely to be validity errors, depending on how good the original | ||||||
46 | HTML was. There may be broken links, |
||||||
47 | and overuse of |
||||||
48 | |||||||
49 | =cut | ||||||
50 | |||||||
51 | 2 | 2 | 9 | use Cwd 'abs_path'; | |||
2 | 3 | ||||||
2 | 119 | ||||||
52 | 2 | 2 | 13 | use File::Basename; | |||
2 | 9 | ||||||
2 | 233 | ||||||
53 | 2 | 2 | 11 | use File::Spec; | |||
2 | 4 | ||||||
2 | 47 | ||||||
54 | 2 | 2 | 2319 | use XML::LibXSLT; | |||
0 | |||||||
0 | |||||||
55 | use XML::LibXML; | ||||||
56 | use HTML::SimpleParse; | ||||||
57 | |||||||
58 | =head1 METHODS | ||||||
59 | |||||||
60 | =head2 new | ||||||
61 | |||||||
62 | my $conv = HTML::ToDocBook->new(); | ||||||
63 | |||||||
64 | my $conv = HTML::ToDocBook->new(stylesheet=>$stylesheet); | ||||||
65 | |||||||
66 | Arguments: | ||||||
67 | |||||||
68 | =over | ||||||
69 | |||||||
70 | =item stylesheet | ||||||
71 | |||||||
72 | A replacement XSLT stylesheet to use for conversions instead of the | ||||||
73 | built-in one. This can either be a file name or a string containing | ||||||
74 | the entire stylesheet. | ||||||
75 | |||||||
76 | =back | ||||||
77 | |||||||
78 | =cut | ||||||
79 | |||||||
80 | sub new { | ||||||
81 | my $class = shift; | ||||||
82 | my %parameters = @_; | ||||||
83 | my $self = bless ({%parameters}, ref ($class) || $class); | ||||||
84 | |||||||
85 | my $parser = XML::LibXML->new(); | ||||||
86 | my $xslt = XML::LibXSLT->new(); | ||||||
87 | |||||||
88 | $self->{_parser} = $parser; | ||||||
89 | $self->{_xslt} = $xslt; | ||||||
90 | |||||||
91 | if ($self->{stylesheet} | ||||||
92 | and -f $self->{stylesheet}) | ||||||
93 | { | ||||||
94 | my $fn = abs_path($self->{stylesheet}); | ||||||
95 | my $style_doc = $parser->parse_file($fn) | ||||||
96 | or die "Could not parse $fn XSLT file"; | ||||||
97 | my $stylesheet = $xslt->parse_stylesheet($style_doc) | ||||||
98 | or die "Could not parse $fn stylesheet"; | ||||||
99 | $self->{_xslt_sheet} = $stylesheet; | ||||||
100 | } | ||||||
101 | elsif ($self->{stylesheet}) | ||||||
102 | { | ||||||
103 | my $style_doc = $parser->parse_string($self->{stylesheet}) | ||||||
104 | or die "Could not parse string XSLT"; | ||||||
105 | my $stylesheet = $xslt->parse_stylesheet($style_doc) | ||||||
106 | or die "Could not parse stylesheet"; | ||||||
107 | $self->{_xslt_sheet} = $stylesheet; | ||||||
108 | } | ||||||
109 | else | ||||||
110 | { | ||||||
111 | |||||||
112 | # build the parsed stylesheet from the DATA | ||||||
113 | |||||||
114 | # This is stored in the DATA handle, after the __DATA__ at | ||||||
115 | # the end of this file; but because the scripts may not just | ||||||
116 | # create one instance of this object, | ||||||
117 | # we have to remember the position of the DATA handle | ||||||
118 | # and reset it after we've read from it, just in case | ||||||
119 | # we have to read from it again. | ||||||
120 | # This also means that we don't close it, either. Hope that doesn't | ||||||
121 | # cause a problem... | ||||||
122 | |||||||
123 | my $curpos = tell(DATA); # remember the __DATA__ position | ||||||
124 | my $style_doc = $parser->parse_fh(\*DATA); | ||||||
125 | # reset the data handle to the start, just in case | ||||||
126 | seek(DATA, $curpos, 0); | ||||||
127 | |||||||
128 | my $stylesheet = $xslt->parse_stylesheet($style_doc); | ||||||
129 | $self->{_xslt_sheet} = $stylesheet; | ||||||
130 | } | ||||||
131 | |||||||
132 | return ($self); | ||||||
133 | } # new | ||||||
134 | |||||||
135 | =head2 convert | ||||||
136 | |||||||
137 | $obj->convert(infile=>$filename, | ||||||
138 | html=>1); | ||||||
139 | |||||||
140 | Arguments: | ||||||
141 | |||||||
142 | =over | ||||||
143 | |||||||
144 | =item infile | ||||||
145 | |||||||
146 | The name of the file to convert. | ||||||
147 | |||||||
148 | =item html | ||||||
149 | |||||||
150 | Parse the input as HTML rather than XML. | ||||||
151 | |||||||
152 | =back | ||||||
153 | |||||||
154 | =cut | ||||||
155 | |||||||
156 | sub convert { | ||||||
157 | my $self = shift; | ||||||
158 | my %args = ( | ||||||
159 | html=>0, | ||||||
160 | @_ | ||||||
161 | ); | ||||||
162 | my $filename = $args{infile}; | ||||||
163 | |||||||
164 | my ($basename,$path,$suffix) = fileparse($filename,qr{\.html?}i); | ||||||
165 | my $outfile = File::Spec->catfile($path, "${basename}.xml"); | ||||||
166 | $outfile = '-' if ($filename eq ''); | ||||||
167 | |||||||
168 | # We need to read in the file first because we need to | ||||||
169 | # pre-process it | ||||||
170 | my $file_str; | ||||||
171 | if ($filename eq '-') # read from STDIN | ||||||
172 | { | ||||||
173 | local $/; | ||||||
174 | $file_str = |
||||||
175 | } | ||||||
176 | else | ||||||
177 | { | ||||||
178 | local $/; | ||||||
179 | my $fh; | ||||||
180 | open ($fh, "<", $filename) or die "could not open $filename"; | ||||||
181 | $file_str = <$fh>; | ||||||
182 | close $fh; | ||||||
183 | } | ||||||
184 | $file_str = $self->insert_sections($file_str); | ||||||
185 | |||||||
186 | my $first_ss = $self->{_xslt_sheet}; | ||||||
187 | |||||||
188 | my $source = undef; | ||||||
189 | my $result_str = ''; | ||||||
190 | if ($args{html}) | ||||||
191 | { | ||||||
192 | $source = $self->{_parser}->parse_html_string($file_str); | ||||||
193 | } | ||||||
194 | else | ||||||
195 | { | ||||||
196 | $source = $self->{_parser}->parse_string($file_str); | ||||||
197 | } | ||||||
198 | undef $file_str; | ||||||
199 | |||||||
200 | my %all_params = (); | ||||||
201 | my $results = $first_ss->transform($source, %all_params); | ||||||
202 | $result_str = $first_ss->output_string($results); | ||||||
203 | |||||||
204 | # print the result | ||||||
205 | my $outfh = undef; | ||||||
206 | if ($outfile eq '-' or $outfile eq '') | ||||||
207 | { | ||||||
208 | $outfh = \*STDOUT; | ||||||
209 | } | ||||||
210 | else | ||||||
211 | { | ||||||
212 | open(OUT, ">", $outfile) | ||||||
213 | || die "Can't open $outfile for writing!"; | ||||||
214 | $outfh = \*OUT; | ||||||
215 | } | ||||||
216 | print $outfh $result_str; | ||||||
217 | if ($outfile ne '-' and $outfile ne '') | ||||||
218 | { | ||||||
219 | close($outfh); | ||||||
220 | } | ||||||
221 | return $result_str; | ||||||
222 | } # convert | ||||||
223 | |||||||
224 | =head1 Private Methods | ||||||
225 | |||||||
226 | These are not guaranteed to be stable. | ||||||
227 | |||||||
228 | =head2 insert_sections | ||||||
229 | |||||||
230 | $my str = $obj->insert_sections($string); | ||||||
231 | |||||||
232 | This inserts tags to enclose all levels |
||||||
233 | of header. These will then be picked up by the XSLT stylesheet | ||||||
234 | and converted into section tags. | ||||||
235 | |||||||
236 | =cut | ||||||
237 | |||||||
238 | sub insert_sections { | ||||||
239 | my $self = shift; | ||||||
240 | my $string = shift; | ||||||
241 | my %args = ( | ||||||
242 | parse_type=>'xml', | ||||||
243 | @_ | ||||||
244 | ); | ||||||
245 | |||||||
246 | my $hp = new HTML::SimpleParse(); | ||||||
247 | $hp->text($string); | ||||||
248 | $hp->parse(); | ||||||
249 | |||||||
250 | my @newhtml = (); | ||||||
251 | my @levels = (); | ||||||
252 | my $tok; | ||||||
253 | my @tree = $hp->tree(); | ||||||
254 | while (@tree) | ||||||
255 | { | ||||||
256 | $tok = shift @tree; | ||||||
257 | if ($tok->{type} eq 'starttag' | ||||||
258 | and $tok->{content} =~ /^h(\d)/i) | ||||||
259 | { | ||||||
260 | # we have a header | ||||||
261 | my $header_level = $1; | ||||||
262 | # if we had a previous header, then close its div | ||||||
263 | # if it is the same or higher | ||||||
264 | if (@levels) | ||||||
265 | { | ||||||
266 | my $prev_level = $levels[$#levels]; | ||||||
267 | while ($prev_level > $header_level) | ||||||
268 | { | ||||||
269 | pop @levels; | ||||||
270 | push @newhtml, "\n"; | ||||||
271 | $prev_level = $levels[$#levels]; | ||||||
272 | } | ||||||
273 | if ($prev_level == $header_level) | ||||||
274 | { | ||||||
275 | pop @levels; | ||||||
276 | push @newhtml, "\n"; | ||||||
277 | } | ||||||
278 | } | ||||||
279 | # start a new div for the new header | ||||||
280 | push @newhtml, sprintf("\n \n", $header_level); |
||||||
281 | push @levels, $header_level; | ||||||
282 | } | ||||||
283 | elsif ($tok->{type} eq 'endtag' | ||||||
284 | and $tok->{content} =~ /^\/body/i) | ||||||
285 | { | ||||||
286 | # we need to close any remaining open section divs | ||||||
287 | while (@levels) | ||||||
288 | { | ||||||
289 | my $prev_level = pop @levels; | ||||||
290 | push @newhtml, "\n"; | ||||||
291 | } | ||||||
292 | } | ||||||
293 | push @newhtml, $hp->execute($tok); | ||||||
294 | } # go through all the tags | ||||||
295 | |||||||
296 | return join('', @newhtml); | ||||||
297 | } # insert_sections | ||||||
298 | |||||||
299 | =head1 REQUIRES | ||||||
300 | |||||||
301 | Cwd | ||||||
302 | File::Basename | ||||||
303 | File::Spec | ||||||
304 | XML::LibXML | ||||||
305 | XML::LibXSLT | ||||||
306 | HTML::SimpleParse | ||||||
307 | Test::More | ||||||
308 | |||||||
309 | =head1 INSTALLATION | ||||||
310 | |||||||
311 | To install this module, run the following commands: | ||||||
312 | |||||||
313 | perl Build.PL | ||||||
314 | ./Build | ||||||
315 | ./Build test | ||||||
316 | ./Build install | ||||||
317 | |||||||
318 | Or, if you're on a platform (like DOS or Windows) that doesn't like the | ||||||
319 | "./" notation, you can do this: | ||||||
320 | |||||||
321 | perl Build.PL | ||||||
322 | perl Build | ||||||
323 | perl Build test | ||||||
324 | perl Build install | ||||||
325 | |||||||
326 | In order to install somewhere other than the default, such as | ||||||
327 | in a directory under your home directory, like "/home/fred/perl" | ||||||
328 | go | ||||||
329 | |||||||
330 | perl Build.PL --install_base /home/fred/perl | ||||||
331 | |||||||
332 | as the first step instead. | ||||||
333 | |||||||
334 | This will install the files underneath /home/fred/perl. | ||||||
335 | |||||||
336 | You will then need to make sure that you alter the PERL5LIB variable to | ||||||
337 | find the modules, and the PATH variable to find the script. | ||||||
338 | |||||||
339 | Therefore you will need to change: | ||||||
340 | your path, to include /home/fred/perl/script (where the script will be) | ||||||
341 | |||||||
342 | PATH=/home/fred/perl/script:${PATH} | ||||||
343 | |||||||
344 | the PERL5LIB variable to add /home/fred/perl/lib | ||||||
345 | |||||||
346 | PERL5LIB=/home/fred/perl/lib:${PERL5LIB} | ||||||
347 | |||||||
348 | |||||||
349 | =head1 SEE ALSO | ||||||
350 | |||||||
351 | perl(1). | ||||||
352 | |||||||
353 | =head1 BUGS | ||||||
354 | |||||||
355 | Please report any bugs or feature requests to the author. | ||||||
356 | |||||||
357 | =head1 AUTHOR | ||||||
358 | |||||||
359 | Kathryn Andersen (RUBYKAT) | ||||||
360 | perlkat AT katspace dot com | ||||||
361 | http://www.katspace.org/tools | ||||||
362 | |||||||
363 | =head1 COPYRIGHT AND LICENCE | ||||||
364 | |||||||
365 | XSLT stylesheet based on the one at http://wiki.docbook.org/topic/Html2DocBook | ||||||
366 | by Jeff Beal | ||||||
367 | |||||||
368 | Copyright (c) 2006 by Kathryn Andersen | ||||||
369 | |||||||
370 | This program is free software; you can redistribute it and/or modify it | ||||||
371 | under the same terms as Perl itself. | ||||||
372 | |||||||
373 | |||||||
374 | =cut | ||||||
375 | |||||||
376 | 1; # End of HTML::ToDocBook | ||||||
377 | #------------------------------------------------------------------------ | ||||||
378 | # The XSLT stylesheet! | ||||||
379 | # The original stylesheet came from | ||||||
380 | # http://wiki.docbook.org/topic/Html2DocBook | ||||||
381 | # | ||||||
382 | __DATA__ |