File Coverage

blib/lib/Aozora2Epub/XHTML.pm

Criterion	Covered	Total	%
statement	223	244	91.3
branch	79	98	80.6
condition	53	73	72.6
subroutine	32	38	84.2
pod	1	8	12.5
total	388	461	84.1

line	stmt	bran	cond	sub	pod	time	code
1							package Aozora2Epub::XHTML;
2	6			6		264473	use strict;
	6					14
	6					251
3	6			6		29	use warnings;
	6					44
	6					375
4	6			6		40	use utf8;
	6					11
	6					45
5	6			6		1059	use Aozora2Epub::CachedGet qw/http_get/;
	6					17
	6					411
6	6			6		2779	use Aozora2Epub::XHTML::Tree;
	6					55
	6					266
7	6			6		912	use Aozora2Epub::Gensym;
	6					15
	6					338
8	6			6		2528	use Aozora2Epub::File;
	6					18
	6					33
9	6			6		235	use HTML::Element;
	6					9
	6					27
10	6			6		3132	use Encode::JISX0213;
	6					159793
	6					423
11	6			6		54	use Encode qw/decode/;
	6					11
	6					465
12	6			6		34	use base qw(Class::Accessor);
	6					10
	6					2805
13							__PACKAGE__->mk_accessors(qw/title subtitle author
14							contents
15							bib_info notation_notes gaiji fig/);
16
17							our $VERSION = "0.05";
18
19							sub jisx0213_to_utf8 {
20	9			9	0	23	my ($men, $ku, $ten) = @_;
21	9					14	$ku += 0xa0;
22	9					17	$ten += 0xa0;
23	9	100				42	my $euc = join('', ($men == 2 ? chr(0x8f) : ()),
24							chr($ku), chr($ten));
25	9					84	my $utf8 = decode('euc-jp-2004', $euc);
26	9					5580	return $utf8;
27							}
28
29							sub kindle_jis2chr {
30	12			12	0	34	my ($men, $ku, $ten) = @_;
31
32							# 半濁点付きカタカナフ kindleだと2文字に見えるのなんとかならんか？
33	12	100	100			68	return if $men == 1 && $ku == 6 && $ten == 88;
			66
34
35							# kindle font of these characters are broken.
36	11	100	100			91	return if $men == 1 && $ku == 90 && $ten == 61;
			66
37	10	100	100			49	return if $men == 2 && $ku == 15 && $ten == 73;
			66
38	9					30	return jisx0213_to_utf8($men, $ku, $ten);
39							}
40
41							# kindle font of these characters are broken.
42							our %kindle_broken_font_unicode = map { $_ => 1 } (
43							0x2152,
44							0x2189,
45							0x26bd,
46							0x26be,
47							0x3244,
48							);
49
50							our %kindle_ok_font_over0xffff = map { $_ => 1 } (
51							0x20d58, 0x20e97, 0x20ed7, 0x210e4, 0x2124f, 0x2296b,
52							0x22d07, 0x22e42, 0x22feb, 0x233fe, 0x23cbe, 0x249ad,
53							0x24e04, 0x24ff2, 0x2546e, 0x2567f, 0x259cc, 0x2688a,
54							0x279b4, 0x280e9, 0x28e17, 0x29170, 0x2a2b2,
55							);
56
57							sub kindle_unicode_hex2chr {
58	15			15	0	49	my $unicode_hex = shift;
59	15					56	my $unicode = hex($unicode_hex);
60
61	15	100				89	return if $kindle_broken_font_unicode{$unicode};
62
63							# kindle font is almost not avaliable in this range.
64	10	100	100			56	return if $unicode > 0xffff && !$kindle_ok_font_over0xffff{$unicode};
65
66	8					35	return chr($unicode);
67							}
68
69							sub _conv_gaiji_title_author {
70	7			7		38	my ($unicode, $men, $ku, $ten) = @_;
71	7	100				31	if ($unicode) {
72	2					8	my $ch = kindle_unicode_hex2chr($unicode);
73	2	100				7	return $ch if $ch;
74	1					4	return;
75							}
76	5					16	my $ch = kindle_jis2chr(0+$men, 0+$ku, 0+$ten);
77	5	50				16	return $ch if $ch;
78	0					0	return;
79							}
80
81							sub conv_gaiji_title_author {
82	128			128	0	220123	my $s = shift;
83	128	100				803	return $s unless $s;
84	8					68	$s =~ s{(.［＃[^、］]、(U\+([A-Fa-f0-9]+)\|.?(\d)-(\d+)-(\d+)).*?］)}
85							{
86	7					40	my $all = $1;
87	7					25	my $ch = _conv_gaiji_title_author($3, $4, $5, $6);
88	7	100				30	$ch ? $ch : $all;
89							}esg;
90	8					22	return $s
91							}
92
93							sub new {
94	0			0	1	0	my ($class, $url) = @_;
95	0					0	my $base = $url;
96	0					0	$base =~ s{[^/]+\.html$}{}s;
97	0					0	return $class->new_from_string(http_get($url), $base);
98							}
99
100							sub new_from_string {
101	40			40	0	146	my ($class, $html) = @_;
102	40					182	my $self = bless { raw_content => $html }, $class;
103	40					176	$self->process_doc();
104	40					576	return $self;
105							}
106
107							sub _process_header {
108	25			25		67	my $h = shift;
109
110							# ttt to ttt
111							# where hx is h1 h2 h3, h4, h5, etc
112	25					110	my $anchor = $h->find_by_tag_name('a');
113	25	100				719	if ($anchor) {
114	14					40	my $id = $anchor->attr('id');
115	14					146	$h->attr('id', $id);
116	14					159	$anchor->replace_with($anchor->content_list);
117							}
118	25	50				334	$h->attr('id') or $h->attr('id', gensym);
119							# to
120							# where hx is h3, h4, h5, etc
121	25					268	my $parent = $h->parent;
122	25	100	33			226	if ($parent && $parent->isa('HTML::Element')
			33
			66
			66
123							&& $parent->tag('div')
124							&& $parent->attr('class')
125							&& $parent->attr('class') =~ m{jisage_\d+}) {
126	10					337	my $indent = $parent->attr('style');
127	10					120	$indent =~ s{margin-left:}{text-indent:};
128	10	50				22	$indent .= " " . $h->attr('style') if $h->attr('style');
129	10					82	$h->attr('style', $indent);
130	10					102	$parent->replace_with($h);
131							}
132							}
133
134							sub _process_img {
135	5			5		13	my $img = shift;
136
137	5					29	my $src = $img->attr('src');
138	5	100				167	if ($src =~ m{/(gaiji/\d-\d+/(\d)-(\d\d)-(\d\d)\.png)$}) {
139	4					44	my $ch = kindle_jis2chr(0+$2, 0+$3, 0+$4);
140	4	100				16	if ($ch) {
141	1					8	$img->replace_with($ch);
142	1					34	return;
143							}
144	3					24	$img->attr('src', "../$1");
145	3					59	return $src;
146							}
147							# normal image
148	1					7	$img->attr('src', "../images/$src");
149							# find caption
150	1					20	my $br = $img->right;
151	1	50	33			37	return $src unless $br && $br->isa('HTML::Element') && $br->tag eq 'br';
			33
152	1					13	my $caption = $br->right;
153	1	50				22	return $src unless $caption;
154	1	50				6	return $src unless $caption->isa('HTML::Element');
155	1	50	33			4	return $src unless $caption->tag eq 'span' && $caption->attr('class') =~ /caption/;
156	1					35	$br->detach;
157	1					28	$caption->detach;
158	1					19	$caption->tag('figcaption');
159	1					17	$img->replace_with(['figure', $img, $caption]);
160	1					198	return $src;
161							}
162
163							sub _is_empty {
164	80			80		147	my $elem = shift;
165	80	100				512	unless ($elem->isa('HTML::Element')) {
166	32					282	return $elem =~ /^\s+$/s;
167							}
168	48					96	return $elem->tag eq 'br';
169							}
170
171							sub _list_as_html {
172	3			3		10	my @c = @_;
173
174	3	50				8	return '' unless @c;
175	3					6	my $res = '';
176	3					11	for my $c (@c) {
177	17	100				65	if ($c->isa('HTML::Element')) {
178	7					24	$res .= $c->as_HTML('<>&', undef, {});
179	7					776	next;
180							}
181	10					56	$c =~ s/^ //;
182	10					27	$c =~ s/ $//;
183	10					18	$res .= $c;
184							}
185	3					27	return $res;
186							}
187
188							sub _process_bibinfo {
189	3			3		28	my $div = shift;
190
191	3					18	my @hr = $div->find_by_tag_name('hr');
192	3					227	$_->detach for @hr;
193	3					80	my @c = $div->content_list;
194	3		66			32	while (@c && _is_empty($c[0])) { shift @c }
	3					25
195	3		66			14	while (@c && _is_empty($c[-1])) { pop @c }
	7					58
196	3					11	return _list_as_html(@c);
197							}
198
199							sub process_doc {
200	40			40	0	72	my $self = shift;
201
202	40					79	my ($title, $subtitle, $author,
203							$bib_info, $notation_notes, @images);
204							my @contents = Aozora2Epub::XHTML::Tree->new($self->{raw_content})
205							->process('h1.title', sub {
206	0			0		0	$title = shift->as_text;
207							})
208							->process('h2.subtitle', sub {
209	0			0		0	$subtitle = shift->as_text;
210							})
211							->process('h2.author', sub {
212	0			0		0	$author = shift->as_text;
213							})
214							->process('div.bibliographical_information', sub {
215	3			3		14	$bib_info = _process_bibinfo(shift)
216							})
217							->process('body > div.notation_notes', sub {
218	0			0		0	my $nn = shift;
219	0					0	$notation_notes = $nn->as_HTML('<>&', undef, {});
220							})
221							->select('div.main_text')
222							->children
223							->process('img', sub {
224	5			5		12	my $img = shift;
225	5					23	my $orig_src = _process_img($img);
226	5	100				50	$orig_src and push @images, $orig_src;
227							})
228							->process('//div[contains(@style, "width")]', => sub {
229	1			1		4	my $div = shift;
230	1					6	my $style = $div->attr('style');
231	1					29	$style =~ s/(?
232	1					4	$div->attr('style', $style);
233							})
234							->process('h1', \&_process_header)
235							->process('h2', \&_process_header)
236							->process('h3', \&_process_header)
237							->process('h4', \&_process_header)
238							->process('h5', \&_process_header)
239							->process('//div[contains(@style, "margin")]', => sub {
240	3			3		7	my $div = shift;
241	3					13	my $style = $div->attr('style');
242	3					56	$style =~ s/margin-left/margin-top/sg;
243	3					14	$style =~ s/margin-right/margin-bottom/sg;
244	3					10	$div->attr('style', $style);
245							})
246							->process('span.notes', sub {
247	20			20		43	my $span = shift;
248	20					123	my $note = $span->as_text;
249	20	100				913	return unless $note =~ m{［＃[^\］]+?、([^\］]+)］};
250	16					66	my $desc = $1;
251	16					30	my $ch = do {
252	16	100				164	if ($desc =~ /U\+([A-fa-f0-9]+)/) {
		50
253	13					63	kindle_unicode_hex2chr($1);
254							} elsif ($desc =~ /第\d水準(\d)-(\d+)-(\d+)/) {
255	3					45	kindle_jis2chr(0+$1, 0+$2, 0+$3);
256							}
257							};
258	16	100				95	return unless $ch;
259
260							# find nearest ※ and replace it to $ch
261	10					61	my $left = $span->left;
262	10	100				355	unless ($left->isa('HTML::Element')) {
263	5	100				37	if ($left =~ s/※$/$ch/) {
264	4					22	$span->parent->splice_content($span->pindex - 1, 2, $left);
265							}
266	5					248	return;
267							}
268	5	50				23	if ($left->tag eq 'ruby') {
269	5					64	my $rb = $left->find_by_tag_name('rb');
270	5					242	my $s = $rb->as_text;
271	5	100				164	if ($s =~ s/※/$ch/) {
272	4					30	$rb->replace_with(HTML::Element->new_from_lol([rb => $s]));
273	4					896	$span->delete;
274							}
275	5					241	return;
276							}
277							})
278	40					446	->as_list;
279
280							# 先頭のの連続は削除
281	40		100			2917	while ($contents[0] && _is_empty($contents[0])) { shift @contents; };
	4					42
282
283	40					249	my (@gaiji, @fig);
284	40					109	for my $path (@images) {
285	4	100				45	if ($path =~ m{gaiji/(.+\.png)$}) {
286	3					16	push @gaiji, $1;
287							} else {
288	1					5	push @fig, $path;
289							}
290							}
291	40					179	$self->title(conv_gaiji_title_author($title));
292	40					857	$self->subtitle(conv_gaiji_title_author($subtitle));
293	40					505	$self->author(conv_gaiji_title_author($author));
294	40					519	$self->contents(\@contents);
295	40		100			556	$self->bib_info($bib_info \|\| '');
296	40		50			596	$self->notation_notes($notation_notes \|\| '');
297	40					532	$self->gaiji(\@gaiji);
298	40					469	$self->fig(\@fig);
299							}
300
301							sub _is_chuuki {
302	80			80		170	my $elem = shift;
303	80		66			335	return $elem->isa('HTML::Element')
304							&& $elem->tag eq 'span'
305							&& $elem->attr('class') && $elem->attr('class') =~ /notes/;
306							}
307
308							sub _is_pagebreak {
309	66			66		96	my $elem = shift;
310	66		100			138	return _is_chuuki($elem) && $elem->as_text =~ /＃改丁\|＃改ページ/;
311							}
312
313							sub _is_center_chuuki {
314	14			14		33	my $elem = shift;
315	14		66			22	return _is_chuuki($elem) && $elem->as_text =~ /＃ページの左右中央/;
316							}
317
318							sub split {
319	37			37	0	71	my $self = shift;
320
321							# ファイルを分割
322							# ** / [#改ページ] / [#改丁]
323	37					86	my @cur;
324							my @files;
325	37					67	my @contents = @{$self->contents};
	37					108
326	37					614	while (my $c = shift @contents) {
327	90	100				506	unless ($c->isa('HTML::Element')) {
328	35					80	push @cur, $c;
329	35					136	next;
330							}
331	55	100				132	if (_is_pagebreak($c)) {
332	1	50				54	push @files, [@cur] if @cur;
333	1					2	@cur = ();
334	1					3	next;
335							}
336	54	100				1271	if ($c->tag =~ m{h[123]}) { # ファイルを区切る
337							# 直前のあるいは空白文字は新しいファイルにいれる
338	12					105	my @newcur;
339	12					21	my $last_elem = pop @cur;
340	12		100			62	while ($last_elem
			100
341							&& (_is_empty($last_elem)
342							\|\| _is_center_chuuki($last_elem))) {
343	7	100				68	push @newcur, $last_elem unless _is_center_chuuki($last_elem);
344	7					72	$last_elem = pop @cur;
345							}
346
347	12	100				66	push @cur, $last_elem if $last_elem; # popしすぎた分は戻す
348	12	100				33	push @files, [@cur] if @cur; # @curが空なら改ページ直後なので何もしない
349	12					24	push @newcur, $c;
350							# 連続する
351	12					29	while (my $c1 = shift @contents) {
352	18	100				60	unless ($c1->isa('HTML::Element')) {
353	7					8	push @newcur, $c1;
354	7					11	last;
355							}
356
357	11	100				22	if (_is_pagebreak($c1)) {
358	1	50				35	push @files, [@newcur] if @newcur;
359	1					2	@newcur = ();
360	1					2	last;
361							}
362
363	10	100	100			83	unless (_is_empty($c1)
364							\|\| $c1->tag =~ m{h[123]}) {
365	2					38	push @newcur, $c1;
366	2					5	last;
367							}
368
369	8					87	push @newcur, $c1;
370							}
371
372	12					25	@cur = @newcur;
373	12					33	next;
374							}
375	42					448	push @cur, $c;
376							}
377	37	50				154	push @files, [@cur] if @cur;
378	37					87	return map { Aozora2Epub::File->new($_) } @files;
	45					394
379							}
380
381							sub _dump_elem {
382	0			0			my ($e, $no_nl) = @_;
383
384	0	0					if (ref $e eq 'ARRAY') {
385	0						for my $x (@$e) {
386	0						_dump_elem($x, 1);
387							}
388	0						print STDERR "\n";
389	0						return;
390							}
391
392	0						my $str;
393	0	0					unless ($e->isa('HTML::Element')) {
394	0						$str = $e;
395							} else {
396	0						$str = $e->as_HTML('<>&', undef, {});
397							}
398	0	0					print STDERR "!E!$str", $no_nl ? " " : "\n";
399							}
400
401							1;
402
403							__END__