File Coverage

blib/lib/WWW/FetchStory/Fetcher.pm

Criterion	Covered	Total	%
statement	48	687	6.9
branch	0	250	0.0
condition	0	63	0.0
subroutine	16	53	30.1
pod	35	35	100.0
total	99	1088	9.1

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::FetchStory::Fetcher;
2							$WWW::FetchStory::Fetcher::VERSION = '0.2602';
3	24			24		214842	use strict;
	24					65
	24					1085
4	24			24		152	use warnings;
	24					41
	24					1756
5							=head1 NAME
6
7							WWW::FetchStory::Fetcher - fetching module for WWW::FetchStory
8
9							=head1 VERSION
10
11							version 0.2602
12
13							=head1 DESCRIPTION
14
15							This is the base class for story-fetching plugins for WWW::FetchStory.
16
17							=cut
18
19							require File::Temp;
20	24			24		13065	use Date::Format;
	24					223011
	24					1824
21	24			24		14972	use Encode::ZapCP1252;
	24					39727
	24					1844
22	24			24		10687	use HTML::Entities;
	24					117115
	24					2168
23	24			24		16546	use HTML::Strip;
	24					34799
	24					977
24	24			24		17515	use XML::LibXML;
	24					1080004
	24					172
25	24			24		19713	use HTML::Tidy::libXML;
	24					33943
	24					1065
26	24			24		14544	use EBook::EPUB;
	24					34617394
	24					2047
27	24			24		256	use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
	24					58
	24					6854
28	24			24		14147	use YAML::Any;
	24					39029
	24					233
29	24			24		255483	use WWW::Mechanize::Sleepy;
	24					3865819
	24					1558
30	24			24		381	use Encode qw( encode );
	24					72
	24					1878
31	24			24		17166	use HTTP::Cookies;
	24					236238
	24					1145
32	24			24		12943	use HTTP::Cookies::Wget;
	24					94
	24					1076
33	24			24		15568	use HTTP::Cookies::Mozilla;
	24					64783
	24					315527
34
35							=head1 METHODS
36
37							=head2 new
38
39							$obj->WWW::FetchStory::Fetcher->new();
40
41							=cut
42
43							sub new {
44	0			0	1		my $class = shift;
45	0						my %parameters = @_;
46	0		0				my $self = bless ({%parameters}, ref ($class) \|\| $class);
47
48	0						return ($self);
49							} # new
50
51							=head2 init
52
53							Initialize the object.
54
55							$obj->init(%args)
56
57							=cut
58
59							sub init {
60	0			0	1		my $self = shift;
61	0						my %parameters = @_;
62
63	0						foreach my $key (keys %parameters)
64							{
65	0						$self->{$key} = $parameters{$key};
66							}
67
68	0	0					if ($self->{use_wget})
69							{
70	0						$self->{wget_cmd} = 'wget';
71	0	0	0				if ($self->{wget_cookies} and -f $self->{wget_cookies})
72							{
73	0						$self->{wget_cmd} .= " --load-cookies " . $self->{wget_cookies};
74							}
75	0	0					if ($self->{debug})
76							{
77	0						$self->{wget_cmd} .= " --debug";
78							}
79	0	0					if ($self->{wget_options})
80							{
81	0						$self->{wget_cmd} .= ' ' . $self->{wget_options};
82							}
83							}
84							else
85							{
86	0						$self->{user_agent} = WWW::Mechanize::Sleepy->new(
87							keep_alive => 1,
88							env_proxy => 1,
89							sleep => '1..10',
90							agent => ref $self,
91							);
92	0						$self->{user_agent}->show_progress($self->{verbose} > 0);
93	0	0	0				if ($self->{firefox_cookies} and -f $self->{firefox_cookies})
		0	0
94							{
95							my $cookies = HTTP::Cookies::Mozilla->new(
96							'file' => $self->{firefox_cookies},
97	0						hide_cookie2 => 1,
98							ignore_discard => 1,
99							);
100	0	0	0				print "\n--------------\n", $cookies->as_string, "\n------------\n" if ($self->{debug} && $self->{debug} > 2);
101	0						$self->{user_agent}->cookie_jar( $cookies );
102							}
103							elsif ($self->{wget_cookies} and -f $self->{wget_cookies})
104							{
105							my $cookies = HTTP::Cookies::Wget->new(
106							'file' => $self->{wget_cookies},
107	0						hide_cookie2 => 1,
108							ignore_discard => 1,
109							);
110	0	0	0				print "\n--------------\n", $cookies->as_string, "\n------------\n" if ($self->{debug} && $self->{debug} > 2);
111	0						$self->{user_agent}->cookie_jar( $cookies );
112							}
113	0	0	0				if ($self->{debug} && $self->{debug} > 1)
114							{
115	0			0			$self->{user_agent}->add_handler("request_send", sub { shift->dump; return });
	0
	0
116	0			0			$self->{user_agent}->add_handler("response_done", sub { shift->dump; return });
	0
	0
117							}
118							}
119
120	0						$self->{stripper} = HTML::Strip->new();
121	0						$self->{stripper}->add_striptag("head");
122
123	0						return ($self);
124							} # init
125
126							=head2 name
127
128							The name of the fetcher; this is basically the last component
129							of the module name. This works as either a class function or a method.
130
131							$name = $self->name();
132
133							$name = WWW::FetchStory::Fetcher::name($class);
134
135							=cut
136
137							sub name {
138	0			0	1		my $class = shift;
139
140	0	0					my $fullname = (ref ($class) ? ref ($class) : $class);
141
142	0						my @bits = split('::', $fullname);
143	0						return pop @bits;
144							} # name
145
146							=head2 info
147
148							Information about the fetcher.
149							By default this just returns the formatted name.
150
151							$info = $self->info();
152
153							=cut
154
155							sub info {
156	0			0	1		my $self = shift;
157
158	0						my $name = $self->name();
159
160							# split the name into words
161	0						my $info = $name;
162	0						$info =~ s/([A-Z])/ $1/g;
163	0						$info =~ s/^\s+//;
164
165	0						return $info;
166							} # info
167
168							=head2 priority
169
170							The priority of this fetcher. Fetchers with higher priority
171							get tried first. This is useful where there may be a generic
172							fetcher for a particular site, and then a more specialized fetcher
173							for particular sections of a site. For example, there may be a
174							generic LiveJournal fetcher, and then refinements for particular
175							LiveJournal community, such as the sshg_exchange community.
176							This works as either a class function or a method.
177
178							This must be overridden by the specific fetcher class.
179
180							$priority = $self->priority();
181
182							$priority = WWW::FetchStory::Fetcher::priority($class);
183
184							=cut
185
186							sub priority {
187	0			0	1		my $class = shift;
188	0						return 0;
189							} # priority
190
191							=head2 allow
192
193							If this fetcher can be used for the given URL, then this returns
194							true.
195							This must be overridden by the specific fetcher class.
196
197							if ($obj->allow($url))
198							{
199							....
200							}
201
202							=cut
203
204							sub allow {
205	0			0	1		my $self = shift;
206	0						my $url = shift;
207
208	0						return 0;
209							} # allow
210
211							=head2 fetch
212
213							Fetch the story, with the given options.
214
215							%story_info = $obj->fetch(
216							urls=>\@urls,
217							basename=>$basename,
218							toc=>0,
219							yaml=>0);
220
221							=over
222
223							=item basename
224
225							Optional basename used to construct the filenames.
226							If this is not given, the basename is derived from the title of the story.
227
228							=item epub
229
230							Create an EPUB file, deleting the HTML files which have been downloaded.
231
232							=item toc
233
234							Build a table-of-contents file if this is true.
235
236							=item yaml
237
238							Build a YAML file with meta-data about this story if this is true.
239
240							=item meta_only
241
242							Don't download the story, just parse the meta-data from the web page.
243							This is useful if you've had to download the story separately due
244							to security restrictions.
245
246							=item use_file I<filename>
247
248							Use the given file to parse the meta-data from rather than from
249							the web page. (This is usually a pre-downloaded EPUB file)
250							Implies meta_only.
251
252							=item urls
253
254							The URLs of the story.
255							The first page is scraped for meta-information about the story,
256							including the title and author. Site-specific Fetcher plugins can find additional
257							information, including the URLs of all the chapters in a multi-chapter story.
258
259							=back
260
261							=cut
262
263							sub fetch {
264	0			0	1		my $self = shift;
265	0						my %args = (
266							urls=>undef,
267							basename=>'',
268							@_
269							);
270
271	0						$self->{verbose} = $args{verbose};
272
273	0						my $first_url = $args{urls}[0];
274	0						my $toc_content = $self->get_toc(%args, first_url=>$first_url);
275	0						my %story_info = $self->parse_toc(%args, content=>$toc_content,
276							url=>$first_url);
277
278							my $basename = ($args{basename}
279							? $args{basename}
280	0	0					: $self->get_story_basename($story_info{title}));
281	0						$story_info{basename} = $basename;
282	0						my @storyfiles = ();
283
284	0	0					$args{meta_only} = 1 if $args{use_file};
285	0	0					if ($args{meta_only})
286							{
287	0						$self->derive_values(info=>\%story_info);
288	0	0					warn Dump(\%story_info) if ($self->{verbose} > 1);
289							}
290							else
291							{
292	0	0	0				if ($args{epub} and exists $story_info{epub_url} and $story_info{epub_url})
			0
293							{
294							my %epub_info = $self->get_epub(base=>$basename,
295							url=>$story_info{epub_url},
296	0						meta=>\%story_info);
297	0						$story_info{storyfiles} = [$epub_info{filename}];
298
299	0						$self->derive_values(info=>\%story_info);
300	0	0					warn Dump(\%story_info) if ($self->{verbose} > 1);
301							}
302							else
303							{
304	0						my @ch_urls = @{$story_info{chapters}};
	0
305	0						my $one_chapter = (@ch_urls == 1);
306							my $first_chapter_is_toc =
307	0		0				$story_info{toc_first} \|\| $self->{first_is_toc};
308	0						delete $story_info{toc_first};
309	0						my @ch_titles = ();
310	0						my @ch_wc = ();
311	0	0	0				my $count = (($one_chapter or $first_chapter_is_toc) ? 0 : 1);
312	0						foreach (my $i = 0; $i < @ch_urls; $i++)
313							{
314	0						my $ch_title = sprintf("%s (%d)", $story_info{title}, $i+1);
315	0						my %ch_info = $self->get_chapter(base=>$basename,
316							count=>$count,
317							url=>$ch_urls[$i],
318							title=>$ch_title);
319	0						push @storyfiles, $ch_info{filename};
320	0						push @ch_titles, $ch_info{title};
321	0						push @ch_wc, $ch_info{wordcount};
322	0						$story_info{wordcount} += $ch_info{wordcount};
323	0						$count++;
324	0						sleep 1; # try not to overload the archive
325							}
326	0						$self->derive_values(info=>\%story_info);
327
328	0	0					warn Dump(\%story_info) if ($self->{verbose} > 1);
329
330	0						$story_info{storyfiles} = \@storyfiles;
331	0						$story_info{chapter_titles} = \@ch_titles;
332	0						$story_info{chapter_wc} = \@ch_wc;
333	0	0	0				if ($args{toc} and !$args{epub}) # build a table-of-contents
334							{
335	0						my $toc = $self->build_toc(info=>\%story_info);
336	0						unshift @{$story_info{storyfiles}}, $toc;
	0
337	0						unshift @{$story_info{chapter_titles}}, "Table of Contents";
	0
338							}
339	0	0					if ($args{epub})
340							{
341	0						my $epub_file = $self->build_epub(info=>\%story_info);
342							# if we have built an EPUB file, then the storyfiles
343							# are now just one EPUB file.
344	0						$story_info{storyfiles} = [$epub_file];
345							}
346							}
347							}
348	0	0					if ($args{yaml})
349							{
350	0						my $filename = sprintf("%s.yml", $story_info{basename});
351	0						my $ofh;
352	0	0					open($ofh, ">", $filename) \|\| die "Can't write to $filename";
353	0						print $ofh Dump(\%story_info);
354	0						close($ofh);
355							}
356
357	0						return %story_info;
358							} # fetch
359
360							=head1 Private Methods
361
362							=head2 get_story_basename
363
364							Figure out the file basename for a story by using its title.
365
366							$basename = $self->get_story_basename($title);
367
368							=cut
369							sub get_story_basename {
370	0			0	1		my $self = shift;
371	0						my $title = shift;
372
373							# make a word with only letters and numbers
374							# and remove HTML entities and UTF-8
375							# and with everything lowercase
376							# and the spaces replaced with underscores
377	0						my $base = $title;
378	0						$base =~ s/^The\s+//; # get rid of leading "The "
379	0						$base =~ s/^A\s+//; # get rid of leading "A "
380	0						$base =~ s/^An\s+//; # get rid of leading "An "
381	0						$base =~ s/-/ /g; # replace dashes with spaces
382	0						$base = decode_entities($base); # replace entities with UTF-8
383	0						$base =~ s/[^[:ascii:]]//g; # remove UTF-8
384	0						$base =~ s/[^\w\s]//g; # remove non-word characters
385	0						$base = lc($base);
386
387	0						my @words = split(' ', $base);
388	0						my $max_words = 3;
389	0						my @first_words = ();
390							# if there are three words or less, use all of them
391	0	0					if (@words <= $max_words)
392							{
393	0						@first_words = @words;
394							}
395							else
396							{
397	0	0					$max_words++ if (@words > 3); # four
398	0	0					$max_words++ if (@words > 5); # five if a lot
399	0		0				for (my $i = 0; $i < @words and @first_words < $max_words; $i++)
400							{
401							# skip little words
402	0	0	0				if ($words[$i] =~ /^(the\|a\|an\|and)$/)
		0
403							{
404							}
405							elsif (@words > 4 and $words[$i] =~ /^(of\|to\|in\|or\|on\|by\|i\|is\|isnt\|its)$/)
406							{
407							# if there are a lot of words, skip these little words too
408							}
409							else
410							{
411	0						push @first_words, $words[$i];
412							}
413							}
414							}
415
416	0						return join('_', @first_words);
417
418							} # get_story_basename
419
420							=head2 extract_story
421
422							Extract the story-content from the fetched content.
423
424							my ($story, $title) = $self->extract_story(content=>$content,
425							title=>$title);
426
427							=cut
428
429							sub extract_story {
430	0			0	1		my $self = shift;
431	0						my %args = (
432							content=>'',
433							title=>'',
434							@_
435							);
436
437	0						my $story = '';
438	0						my $title = '';
439	0	0					if ($args{content} =~ m#<title>([^<]+)</title>#is)
440							{
441	0						$title = $1;
442							}
443							else
444							{
445	0						$title = $args{title};
446							}
447
448							# some badly formed pages have multiple BODY tags
449	0	0					if ($args{content} =~ m#<body[^>]>.?<body[^>]>(.?)</body>#is)
		0
		0
450							{
451	0						$story = $1;
452							}
453							elsif ($args{content} =~ m#<body[^>]>(.)</body>#is)
454							{
455	0						$story = $1;
456							}
457							elsif ($args{content} =~ m#</head>(.*)#is)
458							{
459	0						$story = $1;
460							}
461
462	0	0					if ($story)
463							{
464	0						$story = $self->tidy_chars($story);
465							}
466							else
467							{
468	0						$story = $args{content};
469							}
470
471	0						return ($story, $title);
472
473							} # extract_story
474
475							=head2 make_css
476
477							Create site-specific CSS styling.
478
479							$css = $self->make_css();
480
481							=cut
482
483							sub make_css {
484	0			0	1		my $self = shift;
485
486	0						return '';
487							} # make_css
488
489							=head2 tidy
490
491							Make a tidy, compliant XHTML page from the given story-content.
492
493							$content = $self->tidy(story=>$story,
494							title=>$title);
495
496							=cut
497
498							sub tidy {
499	0			0	1		my $self = shift;
500	0						my %args = (
501							story=>'',
502							title=>'',
503							@_
504							);
505
506	0						my $story = $args{story};
507	0						$story = $self->tidy_chars($story);
508	0						my $title = $args{title};
509	0						my $css = $self->make_css(%args);
510
511	0						my $html = '';
512	0						$html .= "<html>\n";
513	0						$html .= "<head>\n";
514	0						$html .= "<title>$title</title>\n";
515	0	0					$html .= $css if $css;
516	0						$html .= "</head>\n";
517	0						$html .= "<body>\n";
518	0						$html .= "$story\n";
519	0						$html .= "</body>\n";
520	0						$html .= "</html>\n";
521
522	0						my $tidy = HTML::Tidy::libXML->new();
523	0						$html = encode("UTF-8", $html);
524	0						my $xhtml = $tidy->clean($html, 'UTF-8', 1);
525
526							# fixing some errors
527	0						$xhtml =~ s!xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml"!xmlns="http://www.w3.org/1999/xhtml"!;
528	0						$xhtml =~ s!<i/>!!g;
529	0						$xhtml =~ s!<b/>!!g;
530
531	0						return $xhtml;
532							} # tidy
533
534							=head2 get_toc
535
536							Get a table-of-contents page.
537
538							=cut
539							sub get_toc {
540	0			0	1		my $self = shift;
541	0						my %args = @_;
542	0						my $url = $args{first_url};
543
544	0						return $self->get_page($url);
545							} # get_toc
546
547							=head2 get_page
548
549							Get the contents of a URL.
550
551							=cut
552
553							sub get_page {
554	0			0	1		my $self = shift;
555	0						my $url = shift;
556
557	0	0					warn "getting $url\n" if $self->{verbose};
558	0						my $content = '';
559
560							# The "url" might be a file instead
561	0	0	0				if ($url !~ /http/ and -f $url)
		0
562							{
563	0						my $ifh;
564	0	0					open($ifh, $url) or die "FAILED to read ${url}: $!";
565	0						while(<$ifh>)
566							{
567	0						$content .= $_;
568							}
569	0						close($ifh);
570							}
571							elsif ($self->{use_wget})
572							{
573	0						my $cmd = sprintf("%s -O %s '%s'", $self->{wget_cmd}, '-', $url);
574	0	0					warn "$cmd\n" if ($self->{verbose} > 1);
575	0						my $ifh;
576	0	0					open($ifh, "${cmd}\|") or die "FAILED $cmd: $!";
577	0						while(<$ifh>)
578							{
579	0						$content .= $_;
580							}
581	0						close($ifh);
582							}
583							else
584							{
585	0						my $can_accept = HTTP::Message::decodable;
586	0						my $res = $self->{user_agent}->get($url,
587							'Accept-Encoding' => $can_accept,
588							'Keep-Alive' => "300",
589							'Connection' => 'keep-alive',
590							);
591
592							# Check the outcome of the response
593	0	0					if ($res->is_success) {
594	0	0					print $res->status_line, "\n" if $self->{debug};
595							}
596							else {
597	0						die "FAILED fetching $url ", $res->status_line;
598							}
599	0		0				$content = $res->decoded_content \|\| $res->content;
600							}
601
602	0	0	0				if (!$content and $self->{verbose})
603							{
604	0						warn "No content from $url";
605	0	0					if ($self->{debug})
606							{
607							# there's a problem, we want to debug it
608	0						exit;
609							}
610							}
611
612	0						return $content;
613							} # get_page
614
615							=head2 parse_toc
616
617							Parse the table-of-contents file.
618
619							This must be overridden by the specific fetcher class.
620
621							%info = $self->parse_toc(content=>$content,
622							url=>$url,
623							urls=>\@urls);
624
625							This should return a hash containing:
626
627							=over
628
629							=item chapters
630
631							An array of URLs for the chapters of the story. In the case where the
632							story only takes one page, that will be the chapter.
633							In the case where multiple URLs have been passed in, it will be those URLs.
634
635							=item title
636
637							The title of the story.
638
639							=back
640
641							It may also return additional information, such as Summary.
642
643							=cut
644
645							sub parse_toc {
646	0			0	1		my $self = shift;
647	0						my %args = (
648							url=>'',
649							content=>'',
650							@_
651							);
652
653	0						my %info = ();
654	0						$info{url} = $args{url};
655	0						$info{title} = $self->parse_title(%args);
656	0						$info{author} = $self->parse_author(%args);
657	0						$info{summary} = $self->parse_summary(%args);
658	0						$info{characters} = $self->parse_characters(%args);
659	0						$info{universe} = $self->parse_universe(%args);
660	0						$info{category} = $self->parse_category(%args);
661	0						$info{rating} = $self->parse_rating(%args);
662	0						$info{chapters} = $self->parse_chapter_urls(%args);
663	0						$info{epub_url} = $self->parse_epub_url(%args);
664
665	0						return %info;
666							} # parse_toc
667
668							=head2 parse_chapter_urls
669
670							Figure out the URLs for the chapters of this story.
671
672							=cut
673							sub parse_chapter_urls {
674	0			0	1		my $self = shift;
675	0						my %args = (
676							url=>'',
677							content=>'',
678							@_
679							);
680
681	0						my @chapters = ();
682	0	0					if (defined $args{urls})
683							{
684	0						@chapters = @{$args{urls}};
	0
685							}
686							else
687							{
688	0						@chapters = ($args{url});
689							}
690
691	0						return \@chapters;
692							} # parse_chapter_urls
693
694							=head2 parse_epub_url
695
696							Figure out the URL for the EPUB version of this story, if there is one.
697
698							=cut
699							sub parse_epub_url {
700	0			0	1		my $self = shift;
701	0						my %args = (
702							url=>'',
703							content=>'',
704							@_
705							);
706
707	0						return undef;
708							} # parse_epub_url
709
710							=head2 parse_title
711
712							Get the title from the content
713
714							=cut
715							sub parse_title {
716	0			0	1		my $self = shift;
717	0						my %args = (
718							content=>'',
719							@_
720							);
721
722	0						my $content = $args{content};
723	0						my $title = '';
724	0	0					if ($content =~ /<(?:b\|strong)>Title:?\s<\/(?:b\|strong)>:?\s"?(.?)"?\s<(?:br\|p\|\/p\|div\|\/div)/si)
		0
		0
		0
		0
		0
		0
		0
725							{
726	0						$title = $1;
727							}
728							elsif ($content =~ /\bTitle:\s"?(.?)"?\s*<br/s)
729							{
730	0						$title = $1;
731							}
732							elsif ($content =~ m#<h1>([^<]+)</h1>#is)
733							{
734	0						$title = $1;
735							}
736							elsif ($content =~ m#<p class=MsoTitle>([^<]+)</p>#is)
737							{
738	0						$title = $1;
739							}
740							elsif ($content =~ m#<h2>([^<]+)</h2>#is)
741							{
742	0						$title = $1;
743							}
744							elsif ($content =~ m#<h3>([^<]+)</h3>#is)
745							{
746	0						$title = $1;
747							}
748							elsif ($content =~ m#<h4>([^<]+)</h4>#is)
749							{
750	0						$title = $1;
751							}
752							elsif ($content =~ m#<title>([^<]+)</title>#is)
753							{
754	0						$title = $1;
755							}
756	0						$title =~ s/<u>//ig;
757	0						$title =~ s/<\/u>//ig;
758	0						return $title;
759							} # parse_title
760
761							=head2 parse_ch_title
762
763							Get the chapter title from the content
764
765							=cut
766							sub parse_ch_title {
767	0			0	1		my $self = shift;
768	0						my %args = (
769							content=>'',
770							@_
771							);
772
773	0						my $content = $args{content};
774	0						my $title = '';
775	0	0					if ($content =~ /Chapter \d+[:.]?\s*([^<]+)/si)
776							{
777	0						$title = $1;
778							}
779							else
780							{
781	0						$title = $self->parse_title(%args);
782							}
783	0						$title =~ s/<u>//ig;
784	0						$title =~ s/<\/u>//ig;
785	0						return $title;
786							} # parse_ch_title
787
788							=head2 parse_author
789
790							Get the author from the content
791
792							=cut
793							sub parse_author {
794	0			0	1		my $self = shift;
795	0						my %args = (
796							content=>'',
797							@_
798							);
799
800	0						my $content = $args{content};
801	0						my $author = '';
802	0	0					if ($content =~ /<(?:b\|strong)>Author:?\s<\/(?:b\|strong)>:?\s"?(.?)"?\s<(?:br\|p\|\/p\|div\|\/div)/si)
		0
		0
		0
803							{
804	0						$author = $1;
805							}
806							elsif ($content =~ /\bAuthor:\s"?(.?)"?\s*<br/si)
807							{
808	0						$author = $1;
809							}
810							elsif ($content =~ /<meta name="author" content="(.*?)"/si)
811							{
812	0						$author = $1;
813							}
814							elsif ($content =~ /<p>by (.*?)<br/si)
815							{
816	0						$author = $1;
817							}
818	0						return $author;
819							} # parse_author
820
821							=head2 parse_summary
822
823							Get the summary from the content
824
825							=cut
826							sub parse_summary {
827	0			0	1		my $self = shift;
828	0						my %args = (
829							content=>'',
830							@_
831							);
832
833	0						my $content = $args{content};
834	0						my $summary = '';
835	0	0					if ($content =~ /<(?:b\|strong)>Summary:?\s<\/(?:b\|strong)>:?\s"?(.?)"?\s<(?:br\|p\|\/p\|div\|\/div)/si)
		0
		0
		0
		0
		0
		0
		0
		0
836							{
837	0						$summary = $1;
838							}
839							elsif ($content =~ m#<i>Summary:</i>\s([^<]+)\s<br>#s)
840							{
841	0						$summary = $1;
842							}
843							elsif ($content =~ m#>Summary:\s</span>\s([^<]+)\s*<br#s)
844							{
845	0						$summary = $1;
846							}
847							elsif ($content =~ /<i>Summary:<\/i>\s(.?)\s*$/m)
848							{
849	0						$summary = $1;
850							}
851							elsif ($content =~ m#<tr><(?:th\|td)>Summary</(?:th\|td)><td>(.*?)</td></tr>#s)
852							{
853	0						$summary = $1;
854	0						$summary =~ s/<br>/ /g;
855							}
856							elsif ($content =~ /\bSummary:\s"?(.?)"?\s*<(?:br\|p\|\/p\|div\|\/div)/si)
857							{
858	0						$summary = $1;
859							}
860							elsif ($content =~ m#(?:Prompt\|Summary):</b>([^<]+)#is)
861							{
862	0						$summary = $1;
863							}
864							elsif ($content =~ m#(?:Prompt\|Summary):</strong>([^<]+)#is)
865							{
866	0						$summary = $1;
867							}
868							elsif ($content =~ m#(?:Prompt\|Summary):</u>([^<]+)#is)
869							{
870	0						$summary = $1;
871							}
872	0						return $summary;
873							} # parse_summary
874
875							=head2 parse_characters
876
877							Get the characters from the content
878
879							=cut
880							sub parse_characters {
881	0			0	1		my $self = shift;
882	0						my %args = (
883							content=>'',
884							@_
885							);
886
887	0						my $content = $args{content};
888	0						my $characters = '';
889	0	0					if ($content =~ />Characters:?\s<\/(?:b\|strong)>:?\s"?(.?)"?\s<(?:br\|p\|\/p\|div\|\/div)/si)
		0
		0
		0
		0
890							{
891	0						$characters = $1;
892							}
893							elsif ($content =~ /\bCharacters:\s"?(.?)"?\s*<br/si)
894							{
895	0						$characters = $1;
896							}
897							elsif ($content =~ m#<i>Characters:</i>\s([^<]+)\s<br>#s)
898							{
899	0						$characters = $1;
900							}
901							elsif ($content =~ m#(?:Pairings?\|Characters):</(?:b\|strong\|u)>\s*([^<]+)#is)
902							{
903	0						$characters = $1;
904							}
905							elsif ($content =~ m#<tr><(?:th\|td)>(?:Pairings?\|Characters)</(?:th\|td)><td>(.*?)</td></tr>#s)
906							{
907	0						$characters = $1;
908	0						$characters =~ s/<br>/, /g;
909							}
910	0						return $characters;
911							} # parse_characters
912
913							=head2 parse_universe
914
915							Get the universe/fandom from the content
916
917							=cut
918							sub parse_universe {
919	0			0	1		my $self = shift;
920	0						my %args = (
921							content=>'',
922							@_
923							);
924
925	0						my $content = $args{content};
926	0						my $universe = '';
927	0	0					if ($content =~ m#(?:Universe\|Fandom):</(?:b\|strong\|u)>([^<]+)#is)
928							{
929	0						$universe = $1;
930							}
931	0						return $universe;
932							} # parse_universe
933
934							=head2 parse_recipient
935
936							Get the recipient from the content
937
938							=cut
939							sub parse_recipient {
940	0			0	1		my $self = shift;
941	0						my %args = (
942							content=>'',
943							@_
944							);
945
946	0						my $content = $args{content};
947	0						my $recipient = '';
948	0	0					if ($content =~ m#(?:Recipient\|Prompter): (\w+)#is)
		0
949							{
950	0						$recipient = $1;
951							}
952							elsif ($content =~ m#Recipient:</(?:b\|strong\|u)>([^<]+)#is)
953							{
954	0						$recipient = $1;
955							}
956	0						return $recipient;
957							} # parse_recipient
958
959							=head2 parse_category
960
961							Get the categories from the content
962
963							=cut
964							sub parse_category {
965	0			0	1		my $self = shift;
966	0						my %args = (
967							content=>'',
968							@_
969							);
970
971	0						my $content = $args{content};
972	0						my $category = '';
973	0	0					if ($content =~ m#(?:Category\|Tags):</(?:b\|strong\|u)>([^<]+)#is)
		0
974							{
975	0						$category = $1;
976							}
977							elsif ($content =~ m#<tr><(?:th\|td)>Categories</(?:th\|td)><td>(.*?)</td></tr>#s)
978							{
979	0						$category = $1;
980	0						$category =~ s/<br>/, /g;
981							}
982	0						return $category;
983							} # parse_category
984
985							=head2 parse_rating
986
987							Get the rating from the content
988
989							=cut
990							sub parse_rating {
991	0			0	1		my $self = shift;
992	0						my %args = (
993							content=>'',
994							@_
995							);
996
997	0						my $content = $args{content};
998	0						my $rating = '';
999	0	0					if ($content =~ m!^Rating:\s(.*?)$!m)
		0
1000							{
1001	0						$rating = $1;
1002							}
1003							elsif ($content =~ m#Rating:</(?:b\|strong\|u)>\s*([^<]+)#is)
1004							{
1005	0						$rating = $1;
1006							}
1007	0						return $rating;
1008							} # parse_rating
1009
1010							=head2 derive_values
1011
1012							Calculate additional Meta values, such as current date.
1013
1014							=cut
1015							sub derive_values {
1016	0			0	1		my $self = shift;
1017	0						my %args = @_;
1018
1019	0						my $today = time2str('%Y-%m-%d', time);
1020	0						$args{info}->{fetch_date} = $today;
1021
1022	0						my $words = $args{info}->{wordcount};
1023	0	0					if ($words)
1024							{
1025	0						my $len = '';
1026	0	0					if ($words == 100)
		0
		0
		0
		0
		0
		0
		0
		0
1027							{
1028	0						$len = 'Drabble';
1029							} elsif ($words == 200)
1030							{
1031	0						$len = 'Double Drabble';
1032							} elsif ($words >= 75000)
1033							{
1034	0						$len = 'Long Novel';
1035							} elsif ($words >= 50000)
1036							{
1037	0						$len = 'Novel';
1038							} elsif ($words >= 25000)
1039							{
1040	0						$len = 'Novella';
1041							} elsif ($words >= 7500)
1042							{
1043	0						$len = 'Novelette';
1044							} elsif ($words >= 2000)
1045							{
1046	0						$len = 'Short Story';
1047							} elsif ($words > 500)
1048							{
1049	0						$len = 'Short Short';
1050							} elsif ($words <= 500)
1051							{
1052	0						$len = 'Flash';
1053							}
1054	0	0					$args{info}->{story_length} = $len if $len;
1055							}
1056	0						for my $field (qw{characters universe category})
1057							{
1058	0	0	0				if (exists $args{info}->{$field}
			0
1059							and defined $args{info}->{$field}
1060							and $args{info}->{$field} =~ /,/s)
1061							{
1062	0						my @chars = split(/,\s*/s, $args{info}->{$field});
1063	0						$args{info}->{$field} = \@chars;
1064							}
1065							}
1066							} # derive_values
1067
1068							=head2 get_chapter
1069
1070							Get an individual chapter of the story, tidy it,
1071							and save it to a file.
1072
1073							$filename = $obj->get_chapter(base=>$basename,
1074							count=>$count,
1075							url=>$url,
1076							title=>$title);
1077
1078							=cut
1079
1080							sub get_chapter {
1081	0			0	1		my $self = shift;
1082	0						my %args = (
1083							base=>'',
1084							count=>0,
1085							url=>'',
1086							title=>'',
1087							@_
1088							);
1089
1090	0						my $content = $self->get_page($args{url});
1091
1092	0						my ($story, $title) = $self->extract_story(%args, content=>$content);
1093
1094	0						my $chapter_title = $self->parse_ch_title(content=>$content);
1095	0	0					$chapter_title = $title if !$chapter_title;
1096
1097	0						my $html = $self->tidy(story=>$story, title=>$chapter_title);
1098
1099	0						my %wc = $self->wordcount(content=>$html);
1100
1101							#
1102							# Write the file
1103							#
1104							my $filename = ($args{count}
1105							? sprintf("%s%02d.html", $args{base}, $args{count})
1106	0	0					: sprintf("%s.html", $args{base}));
1107	0						my $ofh;
1108	0	0					open($ofh, ">", $filename) \|\| die "Can't write to $filename";
1109	0						print $ofh $html;
1110	0						close($ofh);
1111
1112							return (
1113							filename=>$filename,
1114							title=>$chapter_title,
1115							wordcount=>$wc{words},
1116							charcount=>$wc{chars},
1117	0						);
1118							} # get_chapter
1119
1120							=head2 get_epub
1121
1122							Get the EPUB version of the story, tidy it,
1123							and save it to a file.
1124
1125							$filename = $obj->get_epub(base=>$basename,
1126							url=>$url);
1127
1128							=cut
1129
1130							sub get_epub {
1131	0			0	1		my $self = shift;
1132	0						my %args = (
1133							base=>'',
1134							url=>'',
1135							meta=>undef,
1136							@_
1137							);
1138
1139	0						my %meta = %{$args{meta}};
	0
1140	0						my $content = $self->get_page($args{url});
1141	0						my %epub_info = ();
1142
1143							#
1144							# Write the file
1145							#
1146	0						my $filename = $args{base} . '.epub';
1147	0						my $ofh;
1148	0	0					open($ofh, ">", $filename) \|\| die "Can't write to $filename";
1149	0						print $ofh $content;
1150	0						close($ofh);
1151
1152	0						$epub_info{filename} = $filename;
1153
1154							#
1155							# Update the file metadata
1156							#
1157	0						my $zip = Archive::Zip->new();
1158	0						my $status = $zip->read( $filename );
1159	0	0					if ($status != AZ_OK)
1160							{
1161	0						return %epub_info;
1162							}
1163	0						my @members = $zip->membersMatching('.*\.opf');
1164	0	0	0				if (@members && $members[0])
1165							{
1166	0						my %values = ();
1167	0						my $opf = $zip->contents($members[0]);
1168	0						my $dom = XML::LibXML->load_xml(string => $opf,
1169							load_ext_dtd => 0,
1170							no_network => 1);
1171	0						my @metanodes = $dom->getElementsByLocalName('metadata');
1172	0						foreach my $metanode (@metanodes)
1173							{
1174	0	0					if ($metanode->hasChildNodes)
1175							{
1176	0						my @children = $metanode->childNodes();
1177	0						foreach my $node (@children)
1178							{
1179	0						$self->epub_parse_one_node(%args,
1180							node=>$node,
1181							values=>\%values);
1182							}
1183							}
1184							}
1185	0	0					print STDERR "get_epub: about to replace description\n" if $self->{debug};
1186	0						$self->epub_replace_description(description=>$meta{summary}, xml=>$dom);
1187
1188							# remove meta info we don't want to be added to this
1189	0						delete $meta{description};
1190	0						delete $meta{summary};
1191	0						delete $meta{title};
1192	0						delete $meta{chapters};
1193	0						delete $meta{epub_url};
1194	0						delete $meta{basename};
1195	0						delete $meta{toc_first};
1196	0	0					warn "EPUB meta: ", Dump(\%meta) if ($self->{verbose} > 1);
1197	0						$self->epub_add_meta(meta=>\%meta, xml=>$dom);
1198
1199	0						my $str = $dom->toString;
1200	0						$zip->contents($members[0], $str);
1201	0						$zip->overwrite();
1202							}
1203
1204	0						return %epub_info;
1205							} # get_epub
1206
1207							=head2 epub_replace_description
1208
1209							Replace or add the description to an EPUB file.
1210
1211							=cut
1212							sub epub_replace_description {
1213	0			0	1		my $self = shift;
1214	0						my %args = @_;
1215
1216	0						my $dom = $args{xml};
1217	0						my $desc = $args{description};
1218							# need to clean up the description removing things not okay to put in a meta tag
1219	0						$desc =~ s!<[^>]+>!!g;
1220	0						$desc =~ s!</[^>]+>!!g;
1221	0						$desc =~ s!"!''!g;
1222	0	0					print STDERR "epub_replace_description: description=$desc\n" if $self->{debug};
1223	0						my @metanodes = $dom->getElementsByLocalName('metadata');
1224	0	0					return unless @metanodes;
1225	0						my $metanode = $metanodes[0];
1226	0						my @dnodes = $metanode->getElementsByLocalName('description');
1227	0	0					if ($dnodes[0])
1228							{
1229	0						$metanode->removeChild($dnodes[0]);
1230							}
1231	0						$metanode->appendTextChild('dc:description', $desc);
1232							} # epub_replace_description
1233
1234							=head2 epub_add_meta
1235
1236							Add the given meta-data to an EPUB file.
1237
1238							=cut
1239							sub epub_add_meta {
1240	0			0	1		my $self = shift;
1241	0						my %args = @_;
1242
1243	0						my $dom = $args{xml};
1244	0						my @metanodes = $dom->getElementsByLocalName('metadata');
1245	0	0					return unless @metanodes;
1246	0						my $metanode = $metanodes[0];
1247
1248	0						my %meta = %{$args{meta}};
	0
1249	0						foreach my $key (sort keys %meta)
1250							{
1251	0						my $chunk=<<EOT;
1252							<meta name="$key" content="$meta{$key}"/>
1253							EOT
1254	0						$metanode->appendWellBalancedChunk( $chunk );
1255							}
1256
1257							} # epub_add_meta
1258
1259							=head2 epub_parse_one_node
1260
1261							Parse a node of meta-information from an EPUB file.
1262
1263							=cut
1264							sub epub_parse_one_node {
1265	0			0	1		my $self = shift;
1266	0						my %params = @_;
1267
1268	0						my $node = $params{node};
1269	0						my $oldvals = $params{values};
1270
1271	0						my %newvals = ();
1272	0						my $name = $node->localname;
1273	0	0					return undef unless $name;
1274
1275	0						my $value = $node->textContent;
1276	0						$value =~ s/^\s+//s;
1277	0						$value =~ s/\s+$//s;
1278	0						$value =~ s/\s\s+/ /gs;
1279	0	0	0				if ($name eq 'meta' and $node->hasAttributes)
		0
1280							{
1281	0						my $metaname = '';
1282	0						my $metacontent = '';
1283	0						my @atts = $node->attributes();
1284	0						foreach my $att (@atts)
1285							{
1286	0						my $n = $att->localname;
1287	0						my $v = $att->textContent;
1288	0						$v =~ s/^\s+//s;
1289	0						$v =~ s/\s+$//s;
1290	0	0					if ($n eq 'name')
1291							{
1292	0						$metaname = $v;
1293							}
1294							else
1295							{
1296	0						$metacontent = $v;
1297							}
1298							}
1299	0						$newvals{$metaname} = $metacontent;
1300							}
1301							elsif ($node->hasAttributes)
1302							{
1303	0	0					$newvals{$name}->{text} = $value unless !$value;
1304	0						my @atts = $node->attributes();
1305	0						foreach my $att (@atts)
1306							{
1307	0						my $n = $att->localname;
1308	0						my $v = $att->textContent;
1309	0						$v =~ s/^\s+//s;
1310	0						$v =~ s/\s+$//s;
1311	0						$newvals{$name}->{$n} = $v;
1312							}
1313							}
1314							else
1315							{
1316	0						$newvals{$name} = $value;
1317							}
1318
1319							# Don't want to overwrite existing values
1320	0						foreach my $newname (sort keys %newvals)
1321							{
1322	0						my $newval = $newvals{$newname};
1323	0	0					if (!ref $newval)
1324							{
1325	0	0					if (!exists $oldvals->{$newname})
		0
		0
1326							{
1327	0						$oldvals->{$newname} = $newval;
1328							}
1329							elsif (!ref $oldvals->{$newname})
1330							{
1331	0						my $v = $oldvals->{$newname};
1332	0						$oldvals->{$newname} = [$v, $newval];
1333							}
1334							elsif (ref $oldvals->{$newname} eq 'ARRAY')
1335							{
1336	0						push @{$oldvals->{$newname}}, $newval;
	0
1337							}
1338							else
1339							{
1340	0						$oldvals->{$newname}->{$newval} = $newval;
1341							}
1342							}
1343							else
1344							{
1345	0	0					if (!exists $oldvals->{$newname})
		0
1346							{
1347	0						$oldvals->{$newname} = $newval;
1348							}
1349							elsif (ref $oldvals->{$newname} eq 'ARRAY')
1350							{
1351	0						push @{$oldvals->{$newname}}, $newval;
	0
1352							}
1353							else
1354							{
1355	0						my $v = $oldvals->{$newname};
1356	0						$oldvals->{$newname} = [$v, $newval];
1357							}
1358							}
1359							}
1360							} # epub_parse_one_node
1361
1362							=head2 wordcount
1363
1364							Figure out the word-count.
1365
1366							=cut
1367							sub wordcount {
1368	0			0	1		my $self = shift;
1369	0						my %args = (
1370							@_
1371							);
1372
1373							#
1374							# Count the words
1375							#
1376	0						my $stripped = $self->{stripper}->parse($args{content});
1377	0						$self->{stripper}->eof;
1378	0						$stripped =~ s/[\n\r]/ /sg; # remove line splits
1379	0						$stripped =~ s/^\s+//;
1380	0						$stripped =~ s/\s+$//;
1381	0						$stripped =~ s/\s+/ /g; # remove excess whitespace
1382	0						my @words = split(' ', $stripped);
1383	0						my $wordcount = @words;
1384	0						my $chars = length($stripped);
1385	0	0					if ($self->{debug})
1386							{
1387	0						my $orig_length = length($args{content});
1388	0						print "orig_length=$orig_length, words=$wordcount, chars=$chars\n";
1389	0	0					if ($wordcount < 200) # too short!
1390							{
1391	0						print "====== stripped ======\n$stripped\n======\n";
1392							}
1393							}
1394							return (
1395	0						words=>$wordcount,
1396							chars=>$chars,
1397							);
1398							} # wordcount
1399
1400							=head2 build_toc
1401
1402							Build a local table-of-contents file from the meta-info about the story.
1403
1404							$self->build_toc(info=>\%info);
1405
1406							=cut
1407							sub build_toc {
1408	0			0	1		my $self = shift;
1409	0						my %args = (
1410							@_
1411							);
1412	0						my $info = $args{info};
1413
1414	0						my $filename = sprintf("%s00.html", $info->{basename});
1415
1416	0						my $html;
1417							my $characters = (ref $info->{characters}
1418	0						? join( ', ', @{$info->{characters}} )
1419	0	0					: $info->{characters});
1420							my $universe = (ref $info->{universe}
1421	0						? join( ', ', @{$info->{universe}} )
1422	0	0					: $info->{universe});
1423	0						$html = <<EOT;
1424							<html>
1425							<head><title>$info->{title}</title></head>
1426							<body>
1427							<h1>$info->{title}</h1>
1428							<p>by $info->{author}</p>
1429							<p>Fetched from <a href="$info->{url}">$info->{url}</a></p>
1430							<p><b>Summary:</b>
1431							$info->{summary}
1432							</p>
1433							<p><b>Words:</b> $info->{wordcount}<br/>
1434							<b>Universe:</b> $universe</p>
1435							<b>Characters:</b> $characters</p>
1436							<ol>
1437							EOT
1438
1439	0						my @storyfiles = @{$info->{storyfiles}};
	0
1440	0						my @ch_titles = @{$info->{chapter_titles}};
	0
1441	0						my @ch_wc = @{$info->{chapter_wc}};
	0
1442	0						for (my $i=0; $i < @storyfiles; $i++)
1443							{
1444	0						$html .= sprintf("<li><a href=\"%s\">%s</a> (%d)</li>",
1445							$storyfiles[$i],
1446							$ch_titles[$i],
1447							$ch_wc[$i]);
1448							}
1449	0						$html .= "\n</ol>\n</body></html>\n";
1450	0						my $ofh;
1451	0	0					open($ofh, ">", $filename) \|\| die "Can't write to $filename";
1452	0						print $ofh $html;
1453	0						close($ofh);
1454
1455	0						return $filename;
1456							} # build_toc
1457
1458							=head2 build_epub
1459
1460							Create an EPUB file from the story files and meta information.
1461
1462							$self->build_epub()
1463
1464							=cut
1465							sub build_epub {
1466	0			0	1		my $self = shift;
1467	0						my %args = (
1468							@_
1469							);
1470	0						my $info = $args{info};
1471
1472	0						my $epub = EBook::EPUB->new;
1473	0						$epub->add_title($info->{title});
1474	0						$epub->add_author($info->{author});
1475	0						$epub->add_description($info->{summary});
1476	0						$epub->add_language('en');
1477	0						$epub->add_source($info->{url}, 'URL');
1478	0						$epub->add_date($info->{fetch_date}, 'fetched');
1479
1480							# Add Subjects and additional Meta
1481							# Also build up the title-page
1482	0						my $info_str =<<EOT;
1483							<h1>$info->{title}</h1>
1484							<p>by $info->{author}</p>
1485							<p><b>Fetched from:</b> $info->{url}</p>
1486							<p><b>Summary:</b> $info->{summary}</p>
1487							<p>
1488							EOT
1489	0						my %know = %{$info};
	0
1490	0						delete $know{title};
1491	0						delete $know{author};
1492	0						delete $know{summary};
1493	0						delete $know{url};
1494	0						delete $know{fetch_date};
1495	0						delete $know{basename};
1496	0						delete $know{chapter_titles};
1497	0						delete $know{chapter_wc};
1498	0						delete $know{chapters};
1499	0						delete $know{storyfiles};
1500	0						foreach my $key (sort keys %know)
1501							{
1502	0	0					if (!$know{$key})
1503							{
1504	0						next;
1505							}
1506	0	0					if (!ref $know{$key})
1507							{
1508	0						$info_str .= sprintf("<b>%s:</b> %s<br/>\n", $key, $know{$key});
1509	0	0					if ($know{$key} =~ /,\s*/)
1510							{
1511	0						my @array = split(/,\s*/, $know{$key});
1512	0						foreach my $v (@array)
1513							{
1514	0	0					if ($key =~ /^(?:category\|story_length)$/)
1515							{
1516	0						$epub->add_subject($v);
1517							}
1518							else
1519							{
1520	0						$epub->add_meta_item($key, $v);
1521							}
1522							}
1523							}
1524							else
1525							{
1526	0	0					if ($key =~ /^(?:category\|story_length)$/)
1527							{
1528	0						$epub->add_subject($know{$key});
1529							}
1530							else
1531							{
1532	0						$epub->add_meta_item($key, $know{$key});
1533							}
1534							}
1535							}
1536							else
1537							{
1538	0						$info_str .= sprintf("<b>%s:</b> %s<br/>\n", $key, join(', ', @{$know{$key}}));
	0
1539	0						foreach my $cat (@{$know{$key}})
	0
1540							{
1541	0	0					if ($key =~ /^(?:category\|story_length)$/)
1542							{
1543	0						$epub->add_subject($cat);
1544							}
1545							else
1546							{
1547	0						$epub->add_meta_item($key, $cat);
1548							}
1549							}
1550							}
1551							}
1552
1553	0						$info_str .= "</p>\n";
1554
1555	0						my $titlepage = $self->tidy(story=>$info_str, title=>$info->{title});
1556	0						my $play_order = 1;
1557	0						my $id;
1558	0						$id = $epub->add_xhtml("title.html", $titlepage);
1559
1560							# Add top-level nav-point
1561	0						my $navpoint = $epub->add_navpoint(
1562							label => "ToC",
1563							id => $id,
1564							content => "title.html",
1565							play_order => $play_order # should always start with 1
1566							);
1567
1568	0						my @storyfiles = @{$info->{storyfiles}};
	0
1569	0						my @ch_titles = @{$info->{chapter_titles}};
	0
1570	0						for (my $i=0; $i < @storyfiles; $i++)
1571							{
1572	0						$play_order++;
1573	0						$id = $epub->copy_xhtml($storyfiles[$i], $storyfiles[$i]);
1574	0						my $navpoint = $epub->add_navpoint(
1575							label => $ch_titles[$i],
1576							id => $id,
1577							content => $storyfiles[$i],
1578							play_order => $play_order,
1579							);
1580							}
1581
1582	0						my $epub_file = $info->{basename} . '.epub';
1583	0						$epub->pack_zip($epub_file);
1584
1585							# now unlink the storyfiles
1586	0						for (my $i=0; $i < @storyfiles; $i++)
1587							{
1588	0						unlink $storyfiles[$i];
1589							}
1590
1591	0						return $epub_file;
1592							} # build_epub
1593
1594							=head2 tidy_chars
1595
1596							Remove nasty encodings.
1597
1598							$content = $self->tidy_chars($content);
1599
1600							=cut
1601							sub tidy_chars {
1602	0			0	1		my $self = shift;
1603	0						my $string = shift;
1604
1605							# numeric entities
1606	0						$string =~ s/ //sg;
1607	0						$string =~ s/'/'/sg;
1608	0						$string =~ s/"/"/sg;
1609	0						$string =~ s/-/-/sg;
1610	0						$string =~ s/ / /sg;
1611
1612							#-------------------------------------------------------
1613							# from Catalyst::Plugin::Params::Demoronize
1614	0						zap_cp1252($string);
1615
1616	0						my %replace_map = (
1617							'\302' => '',
1618							'\240' => ' ',
1619							);
1620
1621	0						foreach my $replace (keys(%{replace_map})) {
1622	0						my $rr = $replace_map{$replace};
1623	0						$string =~ s/$replace/$rr/g;
1624							}
1625
1626							#-------------------------------------------------------
1627							# from demoronizser
1628							# http://www.fourmilab.ch/webtools/demoroniser/
1629							#-------------------------------------------------------
1630
1631							# Supply missing semicolon at end of numeric entity if
1632							# Billy's bozos left it out.
1633
1634	0						$string =~ s/(&#[0-2]\d\d)\s/$1; /g;
1635
1636							# Fix dimbulb obscure numeric rendering of < > &
1637
1638	0						$string =~ s/\&\#038;/&/g;
1639	0						$string =~ s/\&\#39;/‘/g;
1640	0						$string =~ s/\&\#060;/</g;
1641	0						$string =~ s/\&\#062;/>/g;
1642
1643							# Translate Unicode numeric punctuation characters
1644							# into ISO equivalents
1645
1646	0						$string =~ s/‐/-/sg; # 0x2010 Hyphen
1647	0						$string =~ s/‑/-/sg; # 0x2011 Non-breaking hyphen
1648	0						$string =~ s/–/-/sg; # 0x2013 En dash
1649	0						$string =~ s/—/--/sg; # 0x2014 Em dash
1650	0						$string =~ s/―/--/sg; # 0x2015 Horizontal bar/quotation dash
1651	0						$string =~ s/‖/\|\|/sg; # 0x2016 Double vertical line
1652	0						$string =~ s-‗-_-sg; # 0x2017 Double low line
1653	0						$string =~ s/‘/`/sg; # 0x2018 Left single quotation mark
1654	0						$string =~ s/’/'/sg; # 0x2019 Right single quotation mark
1655	0						$string =~ s/‚/,/sg; # 0x201A Single low-9 quotation mark
1656	0						$string =~ s/‛/`/sg; # 0x201B Single high-reversed-9 quotation mark
1657	0						$string =~ s/“/"/sg; # 0x201C Left double quotation mark
1658	0						$string =~ s/”/"/sg; # 0x201D Right double quotation mark
1659	0						$string =~ s/„/,,/sg; # 0x201E Double low-9 quotation mark
1660	0						$string =~ s/‟/"/sg; # 0x201F Double high-reversed-9 quotation mark
1661	0						$string =~ s/•/*/sg; # 0x2022 Bullet
1662	0						$string =~ s/‣/*/sg; # 0x2023 Triangular bullet
1663	0						$string =~ s/․/./sg; # 0x2024 One dot leader
1664	0						$string =~ s/‥/../sg; # 0x2026 Two dot leader
1665	0						$string =~ s/…/.../sg; # 0x2026 Horizontal ellipsis
1666	0						$string =~ s/‧/·/sg; # 0x2027 Hyphenation point
1667							#-------------------------------------------------------
1668
1669							# and somehow some of the entities go funny
1670	0						$string =~ s/\&\#133;/.../g;
1671	0						$string =~ s/\ / /g;
1672	0						$string =~ s/\‘/'/g;
1673	0						$string =~ s/\’/'/g;
1674	0						$string =~ s/\“/"/g;
1675	0						$string =~ s/\”/"/g;
1676	0						$string =~ s/\"/"/g;
1677	0						$string =~ s/\–/-/g;
1678	0						$string =~ s/\…/.../g;
1679
1680							# replace double-breaks with <p>
1681	0						$string =~ s#<br\s\/?>\s<br\s*\/?>#\n<p>#sg;
1682
1683							# remove other cruft
1684	0						$string =~ s#<wbr>##sg;
1685	0						$string =~ s#</wbr>##sg;
1686	0						$string =~ s#<wbr/>##sg;
1687	0						$string =~ s#<nobr>##sg;
1688
1689							# Clean unwanted MS-Word HTML
1690	0						$string =~ s#<!--\[if gte mso \d\]>.?<!\[endif\]-->##sg;
1691	0						$string =~ s#<!--\[if !mso\]>.*?<!\[endif\]-->##sg;
1692	0						$string =~ s!<[/]?(font\|span\|xml\|del\|ins\|[ovwxp]:\w+\|st\d:\w+)[^>]*?>!!igs;
1693	0						$string =~ s!<([^>])(?:lang\|style\|size\|face\|[ovwxp]:\w+)=(?:'[^']'\|""[^""]""\|[^\s>]+)([^>])>!<$1$2>!isg;
1694	0						$string =~ s/\s*class="Banner[0-9]+"//g;
1695	0						$string =~ s/\s*class="Textbody"//g;
1696	0						$string =~ s/\s*class="MsoNormal"//g;
1697	0						$string =~ s/\s*class="MsoBodyText"//g;
1698
1699	0						return $string;
1700							} # tidy_chars
1701
1702							1; # End of WWW::FetchStory::Fetcher
1703							__END__