File Coverage

blib/lib/EBook/Ishmael/EBook/Mobi.pm

Criterion	Covered	Total	%
statement	385	545	70.6
branch	70	152	46.0
condition	16	41	39.0
subroutine	37	47	78.7
pod	0	10	0.0
total	508	795	63.9

line	stmt	bran	cond	sub	pod	time	code
1							package EBook::Ishmael::EBook::Mobi;
2	17			17		10664	use 5.016;
	17					63
3							our $VERSION = '2.03';
4	17			17		120	use strict;
	17					35
	17					486
5	17			17		84	use warnings;
	17					33
	17					1011
6
7	17			17		94	use Encode qw(from_to);
	17					27
	17					957
8
9	17			17		147	use XML::LibXML;
	17					37
	17					133
10
11	17			17		10830	use EBook::Ishmael::Decode qw(palmdoc_decode);
	17					59
	17					1419
12	17			17		149	use EBook::Ishmael::ImageID qw(image_id);
	17					41
	17					850
13	17			17		8291	use EBook::Ishmael::PDB;
	17					53
	17					771
14	17			17		140	use EBook::Ishmael::Time qw(guess_time);
	17					47
	17					125005
15
16							# Many thanks to Tommy Persson, the original author of mobi2html, a script
17							# which much of this code is based off of.
18
19							# TODO: Implement AZW4 support
20							# TODO: Add support for UTF16 MOBIs (65002)
21
22							my $TYPE = 'BOOK';
23							my $CREATOR = 'MOBI';
24
25							my $RECSIZE = 4096;
26
27							my $NULL_INDEX = 0xffffffff;
28
29							my $UNPACK_Q = !! eval { unpack "Q>", 1 };
30
31							sub heuristic {
32
33	70			70	0	166	my $class = shift;
34	70					139	my $file = shift;
35	70					152	my $fh = shift;
36
37	70	50				1134	return 0 unless -s $file >= 68;
38
39	70					396	seek $fh, 32, 0;
40	70					546	read $fh, my ($null), 1;
41
42	70	100				271	unless ($null eq "\0") {
43	35					160	return 0;
44							}
45
46	35					237	seek $fh, 60, 0;
47	35					222	read $fh, my ($type), 4;
48	35					120	read $fh, my ($creator), 4;
49
50	35	100	66			237	return 0 unless $type eq $TYPE && $creator eq $CREATOR;
51
52	11					76	seek $fh, 78, 0;
53	11					63	read $fh, my ($off), 4;
54	11					49	$off = unpack "N", $off;
55	11					80	seek $fh, $off + 36, 0;
56	11					70	read $fh, my ($ver), 4;
57	11					51	$ver = unpack "N", $ver;
58
59	11					69	return $ver != 8;
60
61							}
62
63
64							# Many thanks to Calibre, much of the code in this module was based on their
65							# huffman decoder.
66
67							package EBook::Ishmael::EBook::Mobi::MobiHuff {
68
69							my $HUFF_HDR = pack "A4 N", 'HUFF', 24;
70							my $CDIC_HDR = pack "A4 N", 'CDIC', 16;
71
72							sub _load_huff {
73
74	0			0		0	my $self = shift;
75	0					0	my $huff = shift;
76
77	0	0				0	unless (substr($huff, 0, 8) eq $HUFF_HDR) {
78	0					0	die "Invalid MOBI HUFF header\n";
79							}
80
81	0					0	my @off = unpack "N N", substr $huff, 8, 8;
82
83	0					0	@{ $self->{dict1} } = map {
84
85	0					0	my $len = $_ & 0x1f;
	0					0
86	0					0	my $term = $_ & 0x80;
87	0					0	my $max = $_ >> 8;
88
89	0	0				0	if ($len == 0) {
90	0					0	die "Invalid MOBI HUFF dictionary\n";
91							}
92
93	0	0	0			0	if ($len <= 8 and !$term) {
94	0					0	die "Invalid MOBI HUFF dictionary\n";
95							}
96
97	0					0	$max = (($max + 1) << (32 - $len)) - 1;
98
99	0					0	[ $len, $term, $max ];
100
101							} unpack "N256", substr $huff, $off[0], 4 * 256;
102
103	0					0	my @dict2 = unpack "N64", substr $huff, $off[1], 4 * 64;
104
105	0					0	my @mins = (0, map { $dict2[$_] } grep { $_ % 2 == 0 } (0 .. $#dict2));
	0					0
	0					0
106	0					0	my @maxs = (0, map { $dict2[$_] } grep { $_ % 2 != 0 } (0 .. $#dict2));
	0					0
	0					0
107
108	0					0	$self->{mincode} = [ map { $mins[$_] << (32 - $_) } (0 .. $#mins) ];
	0					0
109	0					0	$self->{maxcode} = [ map { (($maxs[$_] + 1) << (32 - $_)) - 1 } (0 .. $#maxs) ];
	0					0
110
111	0					0	return 1;
112
113							}
114
115							sub _load_cdic {
116
117	0			0		0	my $self = shift;
118	0					0	my $cdic = shift;
119
120	0	0				0	unless (substr($cdic, 0, 8) eq $CDIC_HDR) {
121	0					0	die "Invalid MOBI CDIC header\n";
122							}
123
124	0					0	my ($phrases, $bits) = unpack "N N", substr $cdic, 8, 8;
125
126	0					0	my $n = min(1 << $bits, $phrases - @{ $self->{dictionary} });
	0					0
127
128	0					0	push @{ $self->{dictionary} }, map {
129
130	0					0	my $blen = unpack "n", substr $cdic, 16 + $_;
	0					0
131
132							[
133	0					0	substr($cdic, 18 + $_, $blen & 0x7fff),
134							$blen & 0x8000,
135							];
136
137							} unpack "n$n", substr $cdic, 16;
138
139	0					0	return 1;
140
141							}
142
143							sub new {
144
145	0			0		0	my $class = shift;
146	0					0	my $huff = shift;
147	0					0	my @cdic = @_;
148
149	0					0	my $self = {
150							dict1 => [],
151							dictionary => [],
152							mincode => [],
153							maxcode => [],
154							};
155
156	0					0	bless $self, $class;
157
158	0					0	$self->_load_huff($huff);
159
160	0					0	for my $c (@cdic) {
161	0					0	$self->_load_cdic($c);
162							}
163
164	0					0	return $self;
165
166							}
167
168							sub decode {
169
170	0			0		0	my $self = shift;
171	0					0	my $data = shift;
172
173	0					0	my $left = length($data) * 8;
174	0					0	$data .= "\x00" x 8;
175	0					0	my $pos = 0;
176	0					0	my $x = unpack "Q>", $data;
177	0					0	my $n = 32;
178
179	0					0	my $s = '';
180
181	0					0	while (1) {
182
183	0	0				0	if ($n <= 0) {
184	0					0	$pos += 4;
185	0					0	$x = unpack "Q>", substr $data, $pos, 8;
186	0					0	$n += 32;
187							}
188	0					0	my $code = ($x >> $n) & ((1 << 32) - 1);
189
190	0					0	my ($len, $term, $max) = @{ $self->{dict1}[$code >> 24] };
	0					0
191	0	0				0	unless ($term) {
192	0					0	$len += 1 while $code < $self->{mincode}[$len];
193	0					0	$max = $self->{maxcode}[$len];
194							}
195
196	0					0	$n -= $len;
197	0					0	$left -= $len;
198	0	0				0	last if $left < 0;
199
200	0					0	my $r = ($max - $code) >> (32 - $len);
201
202	0					0	my ($slice, $flag) = @{ $self->{dictionary}[$r] };
	0					0
203
204	0	0				0	unless ($flag) {
205	0					0	$self->{dictionary}[$r] = [];
206	0					0	$slice = $self->decode($slice);
207	0					0	$self->{dictionary}[$r] = [ $slice, 1 ];
208							}
209
210	0					0	$s .= $slice;
211
212							}
213
214	0					0	return $s;
215
216							}
217
218							}
219
220							sub _clean_html {
221
222	5			5		11	my $html = shift;
223
224	5					275	$$html =~ s/
225	5					242	$$html =~ s/
226	5					252	$$html =~ s/<\/mbp:pagebreak>//g;
227	5					94	$$html =~ s/.*?<\/guide>//g;
228	5					77	$$html =~ s/<\/?mbp:nu>//g;
229	5					242	$$html =~ s/<\/?mbp:section//g;
230	5					70	$$html =~ s/<\/?mbp:frameset>//g;
231	5					70	$$html =~ s/<\/?mbp:slave-frame>//g;
232
233	5					13	return 1;
234
235							}
236
237							sub _trailing_entry_size {
238
239	0			0		0	my $data = shift;
240
241	0					0	my $res = 0;
242
243	0					0	my $trail = substr $data, -4;
244
245	0					0	for my $c (unpack "C4", $trail) {
246	0	0				0	if ($c & 0x80) {
247	0					0	$res = 0;
248							}
249	0					0	$res = ($res << 7) \| ($c & 0x7f);
250							}
251
252	0					0	return $res;
253
254							}
255
256							sub _trailing_entries_size {
257
258	138			138		223	my $self = shift;
259	138					192	my $data = shift;
260
261	138					210	my $res = 0;
262
263	138					349	for my $i (0 .. $self->{_trailers} - 1) {
264	0					0	my $n = _trailing_entry_size($data);
265	0					0	$res += $n;
266	0					0	substr $data, -$n, $n, '';
267							}
268
269	138	50				373	if ($self->{_extra_data} & 1) {
270	138					310	$res += (ord(substr $data, -1) & 3) + 1;
271							}
272
273	138					224	return $res;
274
275							}
276
277							# Index processing code was adapted from KindleUnpack
278
279							sub _get_index_data {
280
281	20			20		40	my $self = shift;
282	20					36	my $idx = shift;
283
284	20	50				54	return {} if $idx == $NULL_INDEX;
285
286	20					43	my $outtbl = [];
287	20					35	my $ctoc = {};
288
289	20					33	my $data;
290	20					97	$$data = $self->{_pdb}->record($idx)->data;
291
292	20					142	my ($idxhdr, $hordt1, $hordt2) = $self->_parse_indx_header($data);
293	20					48	my $icount = $idxhdr->{count};
294	20					38	my $roff = 0;
295	20					51	my $off = $idx + $icount + 1;
296
297	20					66	for my $i (0 .. $idxhdr->{nctoc} - 1) {
298	10					50	my $cdata = $self->{_pdb}->record($off + $i)->data;
299	10					44	my $ctocdict = $self->_read_ctoc(\$cdata);
300	10					45	for my $j (sort keys %$ctocdict) {
301	0					0	$ctoc->{ $j + $roff } = $ctocdict->{ $j };
302							}
303	10					32	$roff += 0x10000;
304							}
305
306	20					50	my $tagstart = $idxhdr->{len};
307	20					90	my ($ctrlcount, $tagtbl) = _read_tag_section($tagstart, $data);
308
309	20					76	for my $i ($idx + 1 .. $idx + 1 + $icount - 1) {
310	20					96	my $d = $self->{_pdb}->record($i)->data;
311	20					70	my ($hdrinfo, $ordt1, $ordt2) = $self->_parse_indx_header(\$d);
312	20					49	my $idxtpos = $hdrinfo->{start};
313	20					39	my $ecount = $hdrinfo->{count};
314	20					39	my $idxposits = [];
315	20					77	for my $j (0 .. $ecount - 1) {
316	100					219	my $pos = unpack "n", substr $d, $idxtpos + 4 + (2 * $j), 2;
317	100					212	push @$idxposits, $pos;
318							}
319	20					49	for my $j (0 .. $ecount - 1) {
320	100					208	my $spos = $idxposits->[$j];
321	100					198	my $epos = $idxposits->[$j + 1];
322	100					218	my $txtlen = ord(substr $d, $spos, 1);
323	100					265	my $txt = substr $d, $spos + 1, $txtlen;
324	100	50				255	if (@$hordt2) {
325							$txt = join '',
326	0					0	map { chr $hordt2->[ ord $_ ] }
	0					0
327							split //, $txt;
328							}
329	100					390	my $tagmap = _get_tagmap(
330							$ctrlcount,
331							$tagtbl,
332							\$d,
333							$spos + 1 + $txtlen,
334							$epos
335							);
336	100					493	push @$outtbl, [ $txt, $tagmap ];
337							}
338							}
339
340	20					185	return ( $outtbl, $ctoc );
341
342							}
343
344							sub _parse_indx_header {
345
346	40			40		105	my $self = shift;
347	40					97	my $data = shift;
348
349	40	50				152	unless (substr($$data, 0, 4) eq 'INDX') {
350	0					0	die "Index section is not INDX\n";
351							}
352
353	40					172	my @words = qw(
354							len nul1 type gen start count code lng total ordt ligt nligt nctoc
355							);
356	40					109	my $num = scalar @words;
357	40					176	my @values = unpack "N$num", substr $$data, 4, 4 * $num;
358	40					87	my $header = {};
359
360	40					121	for my $i (0 .. $#words) {
361	520					1456	$header->{ $words[$i] } = $values[$i];
362							}
363
364	40					78	my $ordt1 = [];
365	40					69	my $ordt2 = [];
366
367							my (
368	40					185	$ocnt,
369							$oentries,
370							$op1,
371							$op2,
372							$otagx
373							) = unpack "N N N N N", substr $$data, 0xa4, 4 * 5;
374
375	40	50	33			348	if ($header->{code} == 0xfdea or $ocnt != 0 or $oentries > 0) {
			33
376
377	0	0				0	unless ($ocnt == 1) {
378	0					0	die "Corrupted INDX record\n";
379							}
380	0	0				0	unless (substr($$data, $op1, 4) eq 'ORDT') {
381	0					0	die "Corrupted INDX record\n";
382							}
383	0	0				0	unless (substr($$data, $op2, 4) eq 'ORDT') {
384	0					0	die "Corrupted INDX record\n";
385							}
386
387							$ordt1 = [
388	0					0	unpack("C$oentries", substr $$data, $op1 + 4, $oentries)
389							];
390	0					0	$ordt2 = [
391							unpack("n$oentries", substr $$data, $op2 + 4, $oentries * 2)
392							];
393
394							}
395
396	40					187	return ( $header, $ordt1, $ordt2 );
397
398							}
399
400							sub _read_ctoc {
401
402	10			10		24	my $self = shift;
403	10					24	my $data = shift;
404
405	10					25	my $ctoc = {};
406
407	10					20	my $off = 0;
408	10					22	my $len = length $$data;
409
410	10					36	while ($off < $len) {
411	40	100				112	if (substr($$data, $off, 1) eq "\0") {
412	10					21	last;
413							}
414
415	30					52	my $idxoff = $off;
416
417	30					63	my ($pos, $ilen) = _vwv($data, $off);
418	30					52	$off += $pos;
419
420	30					63	my $name = substr $$data, $off, $ilen;
421	30					80	$off += $ilen;
422
423	30					171	my $ctoc->{ $idxoff } = $name;
424
425							}
426
427	10					27	return $ctoc;
428
429							}
430
431							sub _vwv {
432
433	560			560		913	my $data = shift;
434	560					858	my $off = shift;
435
436	560					919	my $value = 0;
437	560					798	my $consume = 0;
438	560					959	my $fin = 0;
439
440	560					1229	while (!$fin) {
441	810					1569	my $v = substr $$data, $off + $consume, 1;
442	810					1251	$consume++;
443	810	100				1804	if (ord($v) & 0x80) {
444	560					932	$fin = 1;
445							}
446	810					2119	$value = ($value << 7) \| (ord($v) & 0x7f);
447							}
448
449	560					1444	return ( $consume, $value );
450
451							}
452
453							sub _read_tag_section {
454
455	20			20		41	my $start = shift;
456	20					45	my $data = shift;
457
458	20					31	my $ctrlcount = 0;
459
460	20					38	my $tags = [];
461
462	20	50				96	if (substr($$data, $start, 4) eq 'TAGX') {
463	20					64	my $foff = unpack "N", substr $$data, $start + 4, 4;
464	20					80	$ctrlcount = unpack "N", substr $$data, $start + 8, 4;
465	20					153	for (my $i = 12; $i < $foff; $i += 4) {
466	80					129	my $pos = $start + $i;
467	80					327	push @$tags, [ unpack "C4", substr $$data, $pos, 4 ];
468							}
469							}
470
471	20					57	return ( $ctrlcount, $tags );
472
473							}
474
475							sub _count_setbits {
476
477	280			280		454	my $val = shift;
478	280		50			956	my $bits = shift // 8;
479
480	280					442	my $count = 0;
481	280					582	for my $i (0 .. $bits - 1) {
482	2240	100				4716	if (($val & 0x01) == 0x01) {
483	280					440	$count++;
484							}
485	2240					3703	$val >>= 1;
486							}
487
488	280					716	return $count;
489
490							}
491
492							sub _get_tagmap {
493
494	100			100		171	my $ctrlcount = shift;
495	100					160	my $tagtbl = shift;
496	100					150	my $entry = shift;
497	100					169	my $spos = shift;
498	100					147	my $epos = shift;
499
500	100					190	my $tags = [];
501	100					170	my $tagmap = {};
502	100					178	my $ctrli = 0;
503	100					5317	my $start = $spos + $ctrlcount;
504
505	100					235	for my $t (@$tagtbl) {
506	440					975	my ($tag, $values, $mask, $endflag) = @$t;
507	440	100				985	if ($endflag == 1) {
508	100					158	$ctrli++;
509	100					286	next;
510							}
511	340					682	my $cbyte = ord(substr $$entry, $spos + $ctrli, 1);
512	340					652	my $val = $cbyte & $mask;
513	340	50				740	if ($val != 0) {
514	340	100				691	if ($val == $mask) {
515	280	50				589	if (_count_setbits($mask) > 1) {
516	0					0	my ($consume, $val) = _vwv($entry, $start);
517	0					0	$start += $consume;
518	0					0	push @$tags, [ $tag, undef, $val, $values ];
519							} else {
520	280					1016	push @$tags, [ $tag, 1, undef, $values ];
521							}
522							} else {
523	60					182	while (($mask & 0x01) == 0) {
524	60					97	$mask >>= 1;
525	60					152	$val >>= 1;
526							}
527	60					193	push @$tags, [ $tag, $val, undef, $values ];
528							}
529							}
530							}
531
532	100					186	for my $t (@$tags) {
533	340					704	my ($tag, $count, $bytes, $per_entry) = @$t;
534	340					632	my $values = [];
535	340	50				784	if (defined $count) {
536	340					695	for my $i (1 .. $count) {
537	400					743	for my $j (1 .. $per_entry) {
538	530					1285	my ($consume, $data) = _vwv($entry, $start);
539	530					851	$start += $consume;
540	530					1437	push @$values, $data;
541							}
542							}
543							} else {
544	0					0	my $constotal = 0;
545	0					0	while ($constotal < $bytes) {
546	0					0	my ($consume, $data) = _vwv($entry, $start);
547	0					0	$start += $consume;
548	0					0	push @$values, $data;
549							}
550							# Should we warn if $constotal does not match $bytes?
551							}
552	340					1025	$tagmap->{ $tag } = $values;
553							}
554
555	100					417	return $tagmap;
556
557							}
558
559							sub _kf8_init {
560
561	10			10		25	my $self = shift;
562
563	10	50				51	if ($self->{_fdst} != $NULL_INDEX) {
564	10					80	my $hdr = $self->{_pdb}->record($self->{_fdst})->data;
565	10	50				152	unless (substr($hdr, 0, 4) eq 'FDST') {
566	0					0	die "KF8 Mobi missing FDST info\n";
567							}
568	10					37	my $secnum = unpack "N", substr $hdr, 0x08, 4;
569	10					552	my $sc2 = $secnum * 2;
570	10					60	my @secs = unpack "N$sc2", substr $hdr, 12, 4 * $sc2;
571							$self->{_fdsttbl} = [
572	10					44	map({ $secs[$_] } grep { $_ % 2 == 0 } 0 .. $#secs)
	30					86
	60					150
573							];
574	10					30	push @{ $self->{_fdsttbl} }, $self->{_textlen};
	10					67
575							}
576
577	10	50				189	if ($self->{_skelidx} != $NULL_INDEX) {
578	10					61	my ($outtbl, $ctoc) = $self->_get_index_data($self->{_skelidx});
579	10					25	my $fptr = 0;
580	10					30	for my $o (@$outtbl) {
581	30					67	my ($txt, $tagmap) = @$o;
582	30					117	push @{ $self->{_skeltbl} }, [
583	30					49	$fptr, $txt, $tagmap->{1}[0], $tagmap->{6}[0], $tagmap->{6}[1]
584							];
585	30					95	$fptr++;
586							}
587							}
588
589							# TODO: The $cdat is usually undef. Not too important as we don't use it
590							# for anything at the moment.
591	10	50				43	if ($self->{_fragidx} != $NULL_INDEX) {
592	10					44	my ($outtbl, $ctoc) = $self->_get_index_data($self->{_fragidx});
593	10					29	for my $o (@$outtbl) {
594	70					175	my ($txt, $tagmap) = @$o;
595	70					203	my $coff = $tagmap->{2}[0];
596	70					115	my $cdat = $ctoc->{ $coff };
597	70					421	push @{ $self->{_fragtbl} }, [
598							int($txt), $cdat, $tagmap->{3}[0], $tagmap->{4}[0],
599	70					165	$tagmap->{6}[0], $tagmap->{6}[1]
600							];
601							}
602							}
603
604	10	50				46	if ($self->{_guideidx} != $NULL_INDEX) {
605	0					0	my ($outtbl, $ctoc) = $self->_get_index_data($self->{_guideidx});
606	0					0	for my $o (@$outtbl) {
607	0					0	my ($txt, $tagmap) = @$o;
608	0					0	my $coff = $tagmap->{1}[0];
609	0					0	my $rtitle = $ctoc->{ $coff };
610	0					0	my $rtype = $txt;
611	0					0	my $fno;
612	0	0				0	if (exists $tagmap->{3}) {
613	0					0	$fno = $tagmap->{3}[0];
614							}
615	0	0				0	if (exists $tagmap->{6}) {
616	0					0	$fno = $tagmap->{6}[0];
617							}
618	0					0	push @{ $self->{_guidetbl} }, [ $rtype, $rtitle, $fno ];
	0					0
619							}
620							}
621
622	10					29	return 1;
623
624							}
625
626							sub _kf8_xhtml {
627
628	5			5		11	my $self = shift;
629
630	5					11	my @parts;
631
632	5					24	my $rawml = $self->rawml;
633
634							# xhtml is the first flow piece
635							my $source = substr(
636							$rawml,
637							$self->{_fdsttbl}[0],
638	5					209	$self->{_fdsttbl}[1] - $self->{_fdsttbl}[0]
639							);
640
641	5					30	my $fragptr = 0;
642	5					56	my $baseptr = 0;
643
644	5					14	for my $s (@{ $self->{_skeltbl} }) {
	5					24
645							my (
646	15					49	$skelnum,
647							$skelnam,
648							$fragcnt,
649							$skelpos,
650							$skellen
651							) = @$s;
652	15					31	my $baseptr = $skelpos + $skellen;
653	15					40	my $skeleton = substr $source, $skelpos, $skellen;
654	15					43	for my $i (0 .. $fragcnt - 1) {
655							my (
656							$inpos,
657							$idtxt,
658							$fnum,
659							$seqnum,
660							$spos,
661							$len
662	35					50	) = @{ $self->{_fragtbl}[$fragptr] };
	35					94
663	35					125	my $slice = substr $source, $baseptr, $len;
664	35					57	$inpos -= $skelpos;
665	35					237	my $head = substr $skeleton, 0, $inpos;
666	35					66	my $tail = substr $skeleton, $inpos;
667	35					265	$skeleton = $head . $slice . $tail;
668	35					52	$baseptr += $len;
669	35					72	$fragptr++;
670							}
671	15					119	push @parts, $skeleton;
672							}
673
674	5					25	return @parts;
675
676							}
677
678							sub _decode_record {
679
680	138			138		34658	my $self = shift;
681	138					224	my $rec = shift;
682
683	138					186	$rec++;
684
685	138					362	my $encode = $self->{_pdb}->record($rec)->data;
686	138					355	my $trail = $self->_trailing_entries_size($encode);
687	138					449	substr $encode, -$trail, $trail, '';
688
689	138	50				401	if ($self->{_compression} == 1) {
		50
		0
690	0					0	return $encode;
691							} elsif ($self->{_compression} == 2) {
692	138					346	return palmdoc_decode($encode);
693							} elsif ($self->{_compression} == 17480) {
694	0					0	return $self->{_huff}->decode($encode);
695							}
696
697							}
698
699							# TODO: Could probably optimize this.
700							sub _read_exth {
701
702	20			20		42	my $self = shift;
703	20					227	my $exth = shift;
704
705							my %exth_records = (
706	20			20		111	100 => sub { $self->{Metadata}->add_author(shift) },
707	0			0		0	101 => sub { $self->{Metadata}->add_contributor(shift) },
708	0			0		0	103 => sub { $self->{Metadata}->set_description(shift) },
709	0			0		0	104 => sub { $self->{Metadata}->set_id(shift) },
710	0			0		0	105 => sub { $self->{Metadata}->add_genre(shift) },
711	20			20		54	106 => sub { $self->{Metadata}->set_created(eval { guess_time(shift) }) },
	20					119
712	20			20		101	108 => sub { $self->{Metadata}->add_contributor(shift) },
713	0			0		0	114 => sub { $self->{Metadata}->set_format('MOBI ' . shift) },
714							201 => sub {
715	20	50		20		90	if (defined $self->{_imgrec}) {
716	20					106	$self->{_coverrec} = $self->{_imgrec} + unpack "N", shift;
717							}
718							},
719	20			20		136	524 => sub { $self->{Metadata}->add_language(shift) },
720	20					591	);
721
722	20					128	my ($doctype, $len, $items) = unpack "a4 N N", $exth;
723
724	20					84	my $pos = 12;
725
726	20					62	for my $i (1 .. $items) {
727
728	390					1325	my (undef, $size) = unpack "N N", substr $exth, $pos;
729	390					934	my $contlen = $size - 8;
730	390					1398	my ($id, undef, $content) = unpack "N N a$contlen", substr $exth, $pos;
731
732	390	100				971	if (exists $exth_records{ $id }) {
733	100					247	$exth_records{ $id }->($content);
734							}
735
736	390					792	$pos += $size;
737
738							}
739
740	20					449	return 1;
741
742							}
743
744							sub new {
745
746	20			20	0	49	my $class = shift;
747	20					44	my $file = shift;
748	20					40	my $enc = shift;
749	20		50			81	my $net = shift // 1;
750
751	20					261	my $self = {
752							Source => undef,
753							Metadata => EBook::Ishmael::EBook::Metadata->new,
754							Network => $net,
755							_pdb => undef,
756							_compression => undef,
757							_textlen => undef,
758							_recnum => undef,
759							_recsize => undef,
760							_encryption => undef,
761							_doctype => undef,
762							_length => undef,
763							_type => undef,
764							_codepage => undef,
765							_uid => undef,
766							_version => undef,
767							_exth_flag => undef,
768							_extra_data => undef,
769							_trailers => 0,
770							_huff => undef,
771							_imgrec => undef,
772							_coverrec => undef,
773							_lastcont => undef,
774							_images => [],
775							# kf8 stuff
776							_skelidx => undef,
777							_skeltbl => [],
778							_fragidx => undef,
779							_fragtbl => [],
780							_guideidx => undef,
781							_guidetbl => [],
782							_fdst => undef,
783							_fdsttbl => [ 0, $NULL_INDEX ],
784							};
785
786	20					64	bless $self, $class;
787
788	20					907	$self->{Source} = File::Spec->rel2abs($file);
789
790	20					207	$self->{_pdb} = EBook::Ishmael::PDB->new($file);
791
792	20					117	my $hdr = $self->{_pdb}->record(0)->data;
793
794							(
795							$self->{_compression},
796							undef,
797							$self->{_textlen},
798							$self->{_recnum},
799							$self->{_recsize},
800							$self->{_encryption},
801							undef,
802	20					194	) = unpack "n n N n n n n", $hdr;
803
804	20	50	33			173	unless (
			33
805							$self->{_compression} == 1 or
806							$self->{_compression} == 2 or
807							$self->{_compression} == 17480
808							) {
809	0					0	die "Mobi $self->{Source} uses an unsupported compression level\n";
810							}
811
812	20	50				69	if ($self->{_recsize} != 4096) {
813	0					0	die "$self->{Source} is not a Mobi file\n";
814							}
815
816	20	50				76	unless ($self->{_encryption} == 0) {
817	0					0	die "Cannot read encrypted Mobi $self->{Source}\n";
818							}
819
820							(
821							$self->{_doctype},
822							$self->{_length},
823							$self->{_type},
824							$self->{_codepage},
825							$self->{_uid},
826							$self->{_version},
827	20					152	) = unpack "a4 N N N N N", substr $hdr, 16, 4 * 6;
828
829	20	50	33			122	unless ($self->{_codepage} == 1252 or $self->{_codepage} == 65001) {
830	0					0	die "Mobi $self->{Source} uses an unsupported text encoding\n";
831							}
832
833							# Read some parts of the Mobi header that we care about.
834	20					76	my ($toff, $tlen) = unpack "N N", substr $hdr, 0x54, 8;
835	20					62	$self->{_imgrec} = unpack "N", substr $hdr, 0x6c, 4;
836	20					67	my ($hoff, $hcount) = unpack "N N", substr $hdr, 0x70, 8;
837	20					75	$self->{_exth_flag} = unpack "N", substr $hdr, 0x80, 4;
838	20					61	$self->{_lastcont} = unpack "n", substr $hdr, 0xc2, 2;
839	20					67	$self->{_extra_data} = unpack "n", substr $hdr, 0xf2, 2;
840
841	20	50				77	if ($self->{_compression} == 17480) {
842
843	0	0				0	unless ($UNPACK_Q) {
844	0					0	die "Cannot read AZW $self->{Source}; perl does not support " .
845							"unpacking 64-bit integars\n";
846							}
847
848	0					0	my @huffs = map { $self->{_pdb}->record($_)->data } ($hoff .. $hoff + $hcount - 1);
	0					0
849	0					0	$self->{_huff} = EBook::Ishmael::EBook::Mobi::MobiHuff->new(@huffs);
850							}
851
852	20	50	33			142	if ($self->{_length} >= 0xe3 and $self->{_version} >= 5) {
853	20					44	my $flags = $self->{_extra_data};
854	20					94	while ($flags > 1) {
855	0	0				0	$self->{_trailers}++ if $flags & 2;
856	0					0	$flags >>= 1;
857							}
858							}
859
860	20	100				234	if ($self->{_version} == 8) {
861	10					38	$self->{_fdst} = unpack "N", substr $hdr, 0xc0, 4;
862	10					38	$self->{_fragidx} = unpack "N", substr $hdr, 0xf8, 4;
863	10					26	$self->{_skelidx} = unpack "N", substr $hdr, 0xfc, 4;
864	10					36	$self->{_guideidx} = unpack "N", substr $hdr, 0x104, 4;
865	10					95	$self->_kf8_init;
866							}
867
868	20	50				120	if ($self->{_lastcont} > $self->{_pdb}->recnum - 1) {
869	0					0	$self->{_lastcont} = $self->{_pdb}->recnum - 1;
870							}
871
872	20	50				70	if ($self->{_imgrec} >= $self->{_lastcont}) {
873	0					0	undef $self->{_imgrec};
874							}
875
876	20	50				57	if (defined $self->{_imgrec}) {
877	20					133	for my $i ($self->{_imgrec} .. $self->{_lastcont}) {
878	50					158	my $img = $self->{_pdb}->record($i)->data;
879	50					229	my $format = image_id($img);
880	50	100				140	next if not defined $format;
881	40					63	push @{ $self->{_images} }, [ $i, $format ];
	40					153
882							}
883							}
884
885	20	50				68	if ($self->{_exth_flag}) {
886	20					176	$self->_read_exth(substr $hdr, $self->{_length} + 16);
887							}
888
889	20	50	33			129	if (
890							defined $self->{_coverrec} and
891	40					190	not grep { $self->{_coverrec} == $_->[0] } @{ $self->{_images} }
	20					90
892							) {
893	0					0	undef $self->{_coverrec};
894							}
895
896	20					165	$self->{Metadata}->set_title(substr $hdr, $toff, $tlen);
897
898	20	50	33			93	if (
899							not defined $self->{Metadata}->created or
900							# If the PDB's created date is greater than the MOBI's EXTH one,
901							# probably means a corrupted EXTH date.
902							$self->{_pdb}->cdate > $self->{Metadata}->created
903							) {
904	20					107	$self->{Metadata}->set_created($self->{_pdb}->cdate);
905							}
906
907	20	50				70	if ($self->{_pdb}->mdate) {
908	20					79	$self->{Metadata}->set_modified($self->{_pdb}->mdate);
909							}
910
911	20	100				100	if ($self->{_version} == 8) {
		50
912	10					70	$self->{Metadata}->set_format('KF8');
913							} elsif (not defined $self->{Metadata}->format) {
914	10					40	$self->{Metadata}->set_format('MOBI');
915							}
916
917	20					107	return $self;
918
919							}
920
921							sub rawml {
922
923	10			10	0	28	my $self = shift;
924	10					38	my %param = @_;
925
926	10		50			60	my $decode = $param{decode} // 0;
927	10		100			43	my $clean = $param{clean} // 0;
928
929							my $cont =
930							join '',
931	115					315	map { $self->_decode_record($_) }
932	10					52	0 .. $self->{_recnum} - 1;
933
934	10	100				82	_clean_html(\$cont) if $clean;
935
936	10	50	33			52	if ($decode and $self->{_codepage} == 1252) {
937	0	0				0	from_to($cont, "cp1252", "utf-8")
938							or die "Failed to encode Mobi $self->{Source} text as utf-8\n";
939							}
940
941	10					296	return $cont;
942
943							}
944
945							sub html {
946
947	6			6	0	16	my $self = shift;
948	6					13	my $out = shift;
949
950	6					13	my $html;
951
952	6	100				23	if ($self->{_version} == 8) {
953
954	3					22	for my $part ($self->_kf8_xhtml) {
955
956							my $dom = XML::LibXML->load_html(
957							string => $part,
958							no_network => !$self->{Network},
959	9					6741	recover => 2,
960							);
961
962	9	50				7763	my ($body) = $dom->findnodes('/html/body') or next;
963
964	9					397	$html .= join '', map { $_->toString } $body->childNodes;
	1821					10555
965
966							}
967
968							} else {
969
970	3					13	my $rawml = $self->rawml(clean => 1);
971	3	50				17	my $enc = $self->{_codepage} == 1252 ? "cp1252" : "utf-8";
972							my $dom = XML::LibXML->load_html(
973							string => $rawml,
974							no_network => !$self->{Network},
975	3					39	encoding => $enc,
976							recover => 2
977							);
978	3					20195	$html = $dom->documentElement->toString;
979							}
980
981	6	50				959	if (defined $out) {
982	0	0				0	open my $fh, '>', $out
983							or die "Failed to open $out for writing: $!\n";
984	0					0	binmode $fh, ':utf8';
985	0					0	print { $fh } $html;
	0					0
986	0					0	close $fh;
987	0					0	return $out;
988							} else {
989	6					548	return $html;
990							}
991
992							}
993
994							sub raw {
995
996	4			4	0	12	my $self = shift;
997	4					13	my $out = shift;
998
999	4					10	my $raw;
1000
1001	4	100				24	if ($self->{_version} == 8) {
1002
1003	2					12	for my $part ($self->_kf8_xhtml) {
1004							my $dom = XML::LibXML->load_html(
1005							string => $part,
1006							no_network => !$self->{Network},
1007	6					489	recover => 2,
1008							);
1009	6	50				6276	my ($body) = $dom->findnodes('/html/body') or next;
1010	6					693	$raw .= $body->textContent;
1011							}
1012
1013							} else {
1014
1015	2					9	my $rawml = $self->rawml(clean => 1);
1016	2	50				14	my $enc = $self->{_codepage} == 1252 ? "cp1252" : "utf-8";
1017							my $dom = XML::LibXML->load_html(
1018							string => $rawml,
1019							no_network => !$self->{Network},
1020	2					29	encoding => $enc,
1021							recover => 2,
1022							);
1023
1024	2					18648	$raw = $dom->documentElement->textContent;
1025
1026							}
1027
1028	4	50				139	if (defined $out) {
1029	0	0				0	open my $fh, '>', $out
1030							or die "Failed to open $out for writing: $!\n";
1031	0					0	binmode $fh, ':utf8';
1032	0					0	print { $fh } $raw;
	0					0
1033	0					0	close $fh;
1034	0					0	return $out;
1035							} else {
1036	4					409	return $raw;
1037							}
1038
1039							}
1040
1041							sub metadata {
1042
1043	8			8	0	22	my $self = shift;
1044
1045	8					52	return $self->{Metadata};
1046
1047							}
1048
1049							sub has_cover {
1050
1051	8			8	0	1844	my $self = shift;
1052
1053	8					53	return defined $self->{_coverrec};
1054
1055							}
1056
1057							sub cover {
1058
1059	4			4	0	9	my $self = shift;
1060	4					9	my $out = shift;
1061
1062	4	50				13	return (undef, undef) unless $self->has_cover;
1063
1064	4					23	my $bin = $self->{_pdb}->record($self->{_coverrec})->data;
1065	4					21	my $format = image_id($bin);
1066	4	50				16	return (undef, undef) if not defined $format;
1067
1068	4					18	return ($bin, $format);
1069
1070							}
1071
1072							sub image_num {
1073
1074	12			12	0	1848	my $self = shift;
1075
1076	12					22	return scalar @{ $self->{_images} };
	12					60
1077
1078							}
1079
1080							sub image {
1081
1082	8			8	0	1774	my $self = shift;
1083	8					19	my $n = shift;
1084
1085	8	50				29	if ($n >= $self->image_num) {
1086	0					0	return (undef, undef);
1087							}
1088
1089	8					47	my $img = $self->{_pdb}->record($self->{_images}->[$n][0])->data;
1090
1091	8					40	return ($img, $self->{_images}[$n][1]);
1092
1093							}
1094
1095							1;