File Coverage

blib/lib/EBook/MOBI/MobiPerl/Palm/Doc.pm

Criterion	Covered	Total	%
statement	91	162	56.1
branch	19	60	31.6
condition	19	30	63.3
subroutine	12	16	75.0
pod	4	5	80.0
total	145	273	53.1

line	stmt	bran	cond	sub	pod	time	code
1							# Palm::Doc.pm
2							#
3							# Palm::PDB helper for handling Palm Doc databases
4							#
5							# Copyright (C) 2004 Christophe Beauregard
6							#
7							# $Id: Doc.pm,v 1.19 2005/05/12 01:36:49 cpb Exp $
8
9	9			9		54	use strict;
	9					16
	9					537
10
11							package EBook::MOBI::MobiPerl::Palm::Doc;
12
13	9			9		54	use EBook::MOBI::MobiPerl::Palm::PDB;
	9					18
	9					244
14	9			9		5562	use EBook::MOBI::MobiPerl::Palm::Raw();
	9					21
	9					209
15	9			9		52	use vars qw( $VERSION @ISA );
	9					19
	9					944
16
17							$VERSION = do { my @r = (q$Revision: 1.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
18
19							@ISA = qw( EBook::MOBI::MobiPerl::Palm::Raw );
20
21	9			9		52	use constant DOC_UNCOMPRESSED => scalar 1;
	9					16
	9					661
22	9			9		48	use constant DOC_COMPRESSED => scalar 2;
	9					23
	9					454
23	9			9		43	use constant DOC_RECSIZE => scalar 4096;
	9					30
	9					17447
24
25							=head1 NAME
26
27							Palm::Doc - Handler for Palm Doc books
28
29							=head1 SYNOPSIS
30
31							use Palm::Doc;
32
33							=head1 DESCRIPTION
34
35							Helper for reading and writing Palm Doc books. The interface is based on
36							L since it just makes sense. However, because of the nature
37							of these databases, record-level processing is just a Bad Idea. Use
38							the C and C calls rather than do direct access of the
39							C<@records> array.
40
41							=head1 EXAMPLES
42
43							Convert a text file to a .pdb:
44
45							use Palm::Doc;
46							my $doc = new Palm::Doc;
47							$doc->textfile( $ARGV[0] );
48							$doc->Write( $ARGV[0] . ".pdb" );
49
50							Convert an HTML file to a .prc:
51
52							use HTML::TreeBuilder;
53							use HTML::FormatText;
54							use Palm::Doc;
55
56							my $tree = HTML::TreeBuilder->new_from_file( $ARGV[0] );
57							my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 80 );
58							my $doc = new Palm::Doc;
59							$doc->{attributes}{resource} = 1;
60							$doc->text( $formatter->format( $tree ) );
61							$doc->Write( $ARGV[0] . ".prc" );
62
63							=cut
64							#'
65
66							sub import
67							{
68	9			9		60	&EBook::MOBI::MobiPerl::Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
69	9					73	&EBook::MOBI::MobiPerl::Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "REAd", "TEXt" ], );
70	9					40	&EBook::MOBI::MobiPerl::Palm::PDB::RegisterPDBHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
71	9					41	&EBook::MOBI::MobiPerl::Palm::PDB::RegisterPRCHandlers( __PACKAGE__, [ "MOBI", "BOOK" ], );
72							}
73
74							=head2 new
75
76							$doc = new Palm::Doc;
77
78							Create a new Doc object. By default, it's not a resource database. Setting
79							C<$self->{attributes}{resource}> to C<1> before any manipulations will
80							cause it to become a resource database.
81
82							=cut
83
84							sub new
85							{
86	3			3	1	7	my $class = shift;
87	3					38	my $self = $class->SUPER::new(@_);
88
89	3					23	$self->{'creator'} = 'REAd';
90	3					8	$self->{'type'} = 'TEXt';
91
92	3					9	$self->{attributes}{resource} = 0;
93
94	3					9	$self->{appinfo} = undef;
95	3					7	$self->{sort} = undef;
96	3					8	$self->{records} = [];
97
98	3					18	return $self;
99							}
100
101							# determine if the given (raw) record is a Doc header record and fill in the
102							# record with appropriate fields if it is.
103							sub _parse_headerrec($) {
104	3			3		6	my $record = shift;
105	3	50				11	return undef unless exists $record->{'data'};
106
107							# Doc header is minimum of 16 bytes
108	3	50				12	return undef if length $record->{'data'} < 16;
109
110	3					24	my ($version,$spare,$ulen, $records, $recsize, $position)
111							= unpack( 'n n N n n N', $record->{'data'} );
112
113							# BEGIN: BORIS DAEPPEN JULY 2012
114							# ***********************
115							# This code just prints stuff... which is what I don't like
116							# One should not print to stderr just for fun!
117							# so this is why I put this out of work...
118
119							#my $h = sprintf ("%x", $version);
120							#print STDERR "Version: $version - $h - ";
121							#if ($version == DOC_COMPRESSED) {
122							#print STDERR " DOC_COMPRESSED\n";
123							#}
124							#if ($version == DOC_UNCOMPRESSED) {
125							#print STDERR " DOC_UNCOMPRESSED\n";
126							#}
127							#if ($version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED) {
128							#print STDERR " probably HUFFDIC_COMPRESSED - CANNOT BE DECOMPRESSED!!!\n";
129							#}
130							# ***********************
131							# END: BORIS DAEPPEN JULY 2012
132
133							# the header is followed by a list of record sizes. We don't use
134							# this since we can guess the sizes pretty easily by looking at
135							# the actual records.
136
137							# According to the spec, $version is either 1 (uncompressed)
138							# or 2 (compress), while spare is always zero. AportisDoc supposedly sets
139							# spare to something else, so screw AportisDoc.
140	3	50	33			23	return undef if $version != DOC_UNCOMPRESSED and $version != DOC_COMPRESSED;
141	3	50				10	return undef if $spare != 0;
142
143	3					7	$record->{'version'} = $version;
144	3					6	$record->{'length'} = $ulen;
145	3					6	$record->{'records'} = $records;
146	3					6	$record->{'recsize'} = $recsize;
147	3					7	$record->{'position'} = $position;
148
149	3					17	return $record;
150							}
151
152							sub _compress_record($$) {
153	3			3		16	my ($version,$in) = @_;
154	3	50				10	return $in if $version == DOC_UNCOMPRESSED;
155
156	3					6	my $out = '';
157
158	3					7	my $lin = length $in;
159	3					4	my $i = 0;
160	3					12	while( $i < $lin ) {
161							# See http://patb.dyndns.org/Programming/PilotDoc.htm for the code type
162							# taxonomy.
163
164							# Try type B compression first.
165							# If the next 3 to 10 bytes are already in the compressed buffer, we can
166							# encode them into a 2 byte sequence. Don't bother too close to the ends,
167							# however... Makes the boundary conditions simpler.
168	171	100	100			634	if( $i > 10 and $lin - $i > 10 ) {
169	116					141	my $chunk = '';
170	116					134	my $match = -1;
171
172							# the preamble is what'll be in the decoders output buffer.
173	116					186	my $preamble = substr( $in, 0, $i );
174	116					254	for( my $j = 10; $j >= 3; $j -- ) {
175	905					1191	$chunk = substr( $in, $i, $j ); # grab next $j characters
176	905					1004	$match = rindex( $preamble, $chunk ); # in the output?
177
178							# type B code has a 2047 byte sliding window, so matches have to be
179							# within that range to be useful
180	905	100	66			1797	last if $match >= 0 and ($i - $match) <= 2047;
181	893					1943	$match = -1;
182							}
183
184	116					151	my $n = length $chunk;
185	116	50	66			322	if( $match >= 0 and $n <= 10 and $n >= 3 ) {
			66
186	12					17	my $m = $i - $match;
187
188							# first 2 bits are 10, next 11 are offset, next 3 are length-3
189	12					42	$out .= pack( "n", 0x8000 + (($m<<3)&0x3ff8) + ($n-3) );
190
191	12					19	$i += $n;
192
193	12					29	next;
194							}
195							}
196
197	159					255	my $ch = substr( $in, $i ++, 1 );
198	159					192	my $och = ord($ch);
199
200							# Try type C compression.
201	159	100	100			647	if( $i+1 < $lin and $ch eq ' ' ) {
202	13					24	my $nch = substr( $in, $i, 1 );
203	13					18	my $onch = ord($nch);
204
205	13	100	100			65	if( $onch >= 0x40 and $onch < 0x80 ) {
206							# space plus ASCII character compression
207	10					24	$out .= chr($onch ^ 0x80);
208	10					14	$i ++;
209
210	10					28	next;
211							}
212							}
213
214	149	100	66			765	if( $och == 0 or ($och >= 9 and $och < 0x80) ) {
			33
215							# pass through
216	147					364	$out .= $ch;
217							} else {
218							# type A code. This is essentially an 'escape' like '\\' in strings.
219							# For efficiency, it's best to encode as long a sequence as
220							# possible with one copy. This might seem like it would cause us to miss
221							# out on a type B sequence, but in actuality keeping long binary strings
222							# together improves the likelyhood of a later type B sequence than
223							# interspersing them with x01's.
224
225	2					5	my $next = substr($in,$i - 1);
226	2	50				23	if( $next =~ /([\x01-\x08\x80-\xff]{1,8})/o ) {
227	2					7	my $binseq = $1;
228	2					5	$out .= chr(length $binseq);
229	2					4	$out .= $binseq;
230	2					8	$i += length( $binseq ) - 1; # first char, $ch, is already counted
231							}
232							}
233							}
234
235	3					20	return $out;
236							}
237
238							# algorithm taken from makedoc7.cpp with reference to
239							# http://patb.dyndns.org/Programming/PilotDoc.htm and
240							# http://www.pyrite.org/doc_format.html
241							sub _decompress_record($$) {
242	0			0		0	my ($version,$in) = @_;
243	0	0				0	return $in if $version == DOC_UNCOMPRESSED;
244
245	0					0	my $out = '';
246
247	0					0	my $lin = length $in;
248	0					0	my $i = 0;
249	0					0	while( $i < $lin ) {
250	0					0	my $ch = substr( $in, $i ++, 1 );
251	0					0	my $och = ord($ch);
252
253	0	0	0			0	if( $och >= 1 and $och <= 8 ) {
		0
		0
254							# copy this many bytes... basically a way to 'escape' data
255	0					0	$out .= substr( $in, $i, $och );
256	0					0	$i += $och;
257							} elsif( $och < 0x80 ) {
258							# pass through 0, 9-0x7f
259	0					0	$out .= $ch;
260							} elsif( $och >= 0xc0 ) {
261							# 0xc0-0xff are 'space' plus ASCII char
262	0					0	$out .= ' ';
263	0					0	$out .= chr($och ^ 0x80);
264							} else {
265							# 0x80-0xbf is sequence from already decompressed buffer
266	0					0	my $nch = substr( $in, $i ++, 1 );
267	0					0	$och = ($och << 8) + ord($nch);
268	0					0	my $m = ($och & 0x3fff) >> 3;
269	0					0	my $n = ($och & 0x7) + 3;
270
271							# This isn't very perl-like, but a simple
272							# substr($out,$lo-$m,$n) doesn't work.
273	0					0	my $lo = length $out;
274	0					0	for( my $j = 0; $j < $n; $j ++, $lo ++ ) {
275	0	0				0	die "bad Doc compression" unless ($lo-$m) >= 0;
276	0					0	$out .= substr( $out, $lo-$m, 1 );
277							}
278							}
279							}
280
281	0					0	return $out;
282							}
283
284							sub Write {
285	3			3	1	7	my $self = shift;
286
287	3					10	my $prc = $self->{attributes}{resource};
288	3	50				22	my $recs = $prc ? $self->{'resources'} : $self->{'records'};
289	3					7	my $header = $recs->[0];
290	3	50				34	unless( defined _parse_headerrec($header) ) {
291	0					0	die "@_: Doesn't appear to be a correct book...";
292							}
293
294	3					32	$self->SUPER::Write(@_);
295							}
296
297							=head2 text
298
299							$text = $doc->text;
300
301							Return the contents of the Doc database.
302
303							$text = $doc->text( @text );
304
305							Set the contents of the Doc book to the specified arguments. All the list arguments
306							will simply be concatenated together.
307
308							=cut
309
310							sub text {
311	0			0	1		my $self = shift;
312
313	0						my $body = '';
314	0						my $prc = $self->{attributes}{resource};
315
316	0	0					if( @_ > 0 ) {
		0
317	0						$body = join( '', @_ );
318
319	0						my $version = DOC_COMPRESSED;
320
321	0						$self->{'records'} = [];
322	0						$self->{'resources'} = [];
323
324							# first record is the header
325	0	0					my $header = $prc ? $self->append_Resource() : $self->append_Record();
326	0						$header->{'version'} = $version;
327	0						$header->{'length'} = 0;
328	0						$header->{'records'} = 0;
329	0						$header->{'recsize'} = DOC_RECSIZE;
330
331							# break the document into record-sized chunks
332	0						for( my $i = 0; $i < length($body); $i += DOC_RECSIZE ) {
333	0	0					my $record = $prc ? $self->append_Resource : $self->append_Record;
334	0						my $chunk = substr($body,$i,DOC_RECSIZE);
335	0						$record->{'data'} = _compress_record( $version, $chunk );
336
337	0						$header->{'records'} ++;
338	0						$header->{'length'} += length $body;
339							}
340
341	0	0					$header->{'recsize'} = $header->{'length'}
342							if $header->{'length'} < DOC_RECSIZE;
343
344							# pack up the header
345	0						$header->{'data'} = pack( 'n xx N n n N',
346							$header->{'version'}, $header->{'length'},
347							$header->{'records'}, $header->{'recsize'}, 0 );
348
349							} elsif( defined wantarray ) {
350
351	0	0					my $recs = $prc ? $self->{'resources'} : $self->{'records'};
352
353	0						my $header = $recs->[0];
354	0	0					if( defined _parse_headerrec($header) ) {
355							# a proper Doc file should be fine, but if it's not Doc
356							# compression like some Mobi docs seem to be we want to
357							# bail early. Otherwise we end up with a huge stream of
358							# substr() errors and we _still_ don't get any content.
359	0						eval {
360	0	0		0	0		sub min { return ($_[0]<$_[1]) ? $_[0] : $_[1] }
361	0						my $maxi = min($#$recs, $header->{'records'});
362	0						for( my $i = 1; $i <= $maxi; $i ++ ) {
363	0						my $data = $recs->[$i]->{'data'};
364	0						my $len = length($data);
365	0						my $overlap = "";
366	0	0					if ($self->{multibyteoverlap}) {
367	0						my $c = chop $data;
368	0						print STDERR "I:$i - $len - ", int($c), "\n";
369	0						my $n = $c & 7;
370	0						foreach (0..$n-1) {
371	0						$overlap = (chop $data) . $overlap;
372							}
373							}
374
375	0						$body .= _decompress_record( $header->{'version'},
376							$data );
377	0						$body .= $overlap;
378							}
379							};
380	0	0					return undef if $@;
381							}
382							}
383
384	0						return $body;
385							}
386
387							=head2 textfile
388
389							$doc->textfile( "README.txt" );
390
391							Set the contents of the Doc to the contents of the file and sets the name of the PDB to
392							the specified filename.
393
394							=cut
395
396							sub textfile($$) {
397	0			0	1		my ($self, $filename) = @_;
398
399	0	0					open IN, "< $filename" or return undef;
400	0						binmode IN;
401	0						$self->text( '', );
402	0						close IN;
403
404	0						$self->{'name'} = $filename;
405							}
406
407							1;
408							__END__