| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::FetchStory::Fetcher; |
|
2
|
|
|
|
|
|
|
$WWW::FetchStory::Fetcher::VERSION = '0.2602'; |
|
3
|
24
|
|
|
24
|
|
214842
|
use strict; |
|
|
24
|
|
|
|
|
65
|
|
|
|
24
|
|
|
|
|
1085
|
|
|
4
|
24
|
|
|
24
|
|
152
|
use warnings; |
|
|
24
|
|
|
|
|
41
|
|
|
|
24
|
|
|
|
|
1756
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
WWW::FetchStory::Fetcher - fetching module for WWW::FetchStory |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 VERSION |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
version 0.2602 |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
This is the base class for story-fetching plugins for WWW::FetchStory. |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=cut |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
require File::Temp; |
|
20
|
24
|
|
|
24
|
|
13065
|
use Date::Format; |
|
|
24
|
|
|
|
|
223011
|
|
|
|
24
|
|
|
|
|
1824
|
|
|
21
|
24
|
|
|
24
|
|
14972
|
use Encode::ZapCP1252; |
|
|
24
|
|
|
|
|
39727
|
|
|
|
24
|
|
|
|
|
1844
|
|
|
22
|
24
|
|
|
24
|
|
10687
|
use HTML::Entities; |
|
|
24
|
|
|
|
|
117115
|
|
|
|
24
|
|
|
|
|
2168
|
|
|
23
|
24
|
|
|
24
|
|
16546
|
use HTML::Strip; |
|
|
24
|
|
|
|
|
34799
|
|
|
|
24
|
|
|
|
|
977
|
|
|
24
|
24
|
|
|
24
|
|
17515
|
use XML::LibXML; |
|
|
24
|
|
|
|
|
1080004
|
|
|
|
24
|
|
|
|
|
172
|
|
|
25
|
24
|
|
|
24
|
|
19713
|
use HTML::Tidy::libXML; |
|
|
24
|
|
|
|
|
33943
|
|
|
|
24
|
|
|
|
|
1065
|
|
|
26
|
24
|
|
|
24
|
|
14544
|
use EBook::EPUB; |
|
|
24
|
|
|
|
|
34617394
|
|
|
|
24
|
|
|
|
|
2047
|
|
|
27
|
24
|
|
|
24
|
|
256
|
use Archive::Zip qw( :ERROR_CODES :CONSTANTS ); |
|
|
24
|
|
|
|
|
58
|
|
|
|
24
|
|
|
|
|
6854
|
|
|
28
|
24
|
|
|
24
|
|
14147
|
use YAML::Any; |
|
|
24
|
|
|
|
|
39029
|
|
|
|
24
|
|
|
|
|
233
|
|
|
29
|
24
|
|
|
24
|
|
255483
|
use WWW::Mechanize::Sleepy; |
|
|
24
|
|
|
|
|
3865819
|
|
|
|
24
|
|
|
|
|
1558
|
|
|
30
|
24
|
|
|
24
|
|
381
|
use Encode qw( encode ); |
|
|
24
|
|
|
|
|
72
|
|
|
|
24
|
|
|
|
|
1878
|
|
|
31
|
24
|
|
|
24
|
|
17166
|
use HTTP::Cookies; |
|
|
24
|
|
|
|
|
236238
|
|
|
|
24
|
|
|
|
|
1145
|
|
|
32
|
24
|
|
|
24
|
|
12943
|
use HTTP::Cookies::Wget; |
|
|
24
|
|
|
|
|
94
|
|
|
|
24
|
|
|
|
|
1076
|
|
|
33
|
24
|
|
|
24
|
|
15568
|
use HTTP::Cookies::Mozilla; |
|
|
24
|
|
|
|
|
64783
|
|
|
|
24
|
|
|
|
|
315527
|
|
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 METHODS |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head2 new |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$obj = WWW::FetchStory::Fetcher->new(); |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=cut |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Constructor.  The key/value pairs passed in become the attributes of the
# new object.  Works both as a class method and as an object method; in the
# latter case the new object is blessed into the class of the invocant.
sub new {
    my ($class, %parameters) = @_;

    my $package = ref($class) || $class;
    my $self = bless { %parameters }, $package;

    return ($self);
} # new
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head2 init |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Initialize the object. |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
$obj->init(%args) |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=cut |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Initialize the object and set up whatever is used to download pages.
#
# Every key/value pair passed in is copied onto the object.  After that:
#   * if $self->{use_wget} is true, $self->{wget_cmd} is built: "wget" plus
#     --load-cookies (when the wget_cookies file exists), --debug (when
#     debug is set) and the free-form wget_options string;
#   * otherwise $self->{user_agent} is created as a WWW::Mechanize::Sleepy
#     object, given a cookie jar from firefox_cookies or (failing that)
#     wget_cookies when such a file exists, and given dump handlers when
#     debug > 1.
# In both cases $self->{stripper} becomes an HTML::Strip object with "head"
# added as a strip-tag.
#
# Returns the object itself.
sub init {
    my $self = shift;
    my %parameters = @_;

    foreach my $key (keys %parameters)
    {
        $self->{$key} = $parameters{$key};
    }

    if ($self->{use_wget})
    {
        $self->{wget_cmd} = 'wget';
        if ($self->{wget_cookies} and -f $self->{wget_cookies})
        {
            # wget_cmd ends up inside a shell command line, so the file
            # name has to be quoted: a path containing spaces or shell
            # metacharacters would otherwise break (or subvert) the command.
            (my $cookie_file = $self->{wget_cookies}) =~ s/'/'\\''/g;
            $self->{wget_cmd} .= " --load-cookies '" . $cookie_file . "'";
        }
        if ($self->{debug})
        {
            $self->{wget_cmd} .= " --debug";
        }
        if ($self->{wget_options})
        {
            # Deliberately not quoted: this is a string of several options.
            $self->{wget_cmd} .= ' ' . $self->{wget_options};
        }
    }
    else
    {
        $self->{user_agent} = WWW::Mechanize::Sleepy->new(
            keep_alive => 1,
            env_proxy => 1,
            sleep => '1..10',
            agent => ref $self,
        );
        # "verbose" may not have been set; treat that as "quiet" instead of
        # comparing an undefined value.
        $self->{user_agent}->show_progress(($self->{verbose} || 0) > 0);

        # Candidate cookie sources in order of preference; only the first
        # one whose file exists is used.
        my @cookie_sources = (
            [ 'HTTP::Cookies::Mozilla', $self->{firefox_cookies} ],
            [ 'HTTP::Cookies::Wget',    $self->{wget_cookies} ],
        );
        foreach my $source (@cookie_sources)
        {
            my ($jar_class, $cookie_file) = @{$source};
            next unless ($cookie_file and -f $cookie_file);

            my $cookies = $jar_class->new(
                'file' => $cookie_file,
                hide_cookie2 => 1,
                ignore_discard => 1,
            );
            print "\n--------------\n", $cookies->as_string, "\n------------\n"
                if ($self->{debug} && $self->{debug} > 2);
            $self->{user_agent}->cookie_jar( $cookies );
            last;
        }

        if ($self->{debug} && $self->{debug} > 1)
        {
            # Dump every request as it is sent and every response received.
            $self->{user_agent}->add_handler("request_send", sub { shift->dump; return });
            $self->{user_agent}->add_handler("response_done", sub { shift->dump; return });
        }
    }

    $self->{stripper} = HTML::Strip->new();
    $self->{stripper}->add_striptag("head");

    return ($self);
} # init
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head2 name |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
The name of the fetcher; this is basically the last component |
|
129
|
|
|
|
|
|
|
of the module name. This works as either a class function or a method. |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
$name = $self->name(); |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
$name = WWW::FetchStory::Fetcher::name($class); |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=cut |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Return the last "::"-separated component of the invocant's package name.
# The invocant may be an object or a plain class-name string.
sub name {
    my ($class) = @_;

    my $fullname = ref($class) || $class;
    my @components = split(/::/, $fullname);

    return $components[-1];
} # name
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head2 info |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Information about the fetcher. |
|
149
|
|
|
|
|
|
|
By default this just returns the formatted name. |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
$info = $self->info(); |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Return a human-readable description of the fetcher: its name() with a
# space inserted in front of every capital letter (e.g. "LiveJournal"
# becomes "Live Journal").
sub info {
    my ($self) = @_;

    # split the name into words
    (my $info = $self->name()) =~ s/([A-Z])/ $1/g;

    # the first capital produced a leading space; drop it
    $info =~ s/^\s+//;

    return $info;
} # info
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head2 priority |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
The priority of this fetcher. Fetchers with higher priority |
|
171
|
|
|
|
|
|
|
get tried first. This is useful where there may be a generic |
|
172
|
|
|
|
|
|
|
fetcher for a particular site, and then a more specialized fetcher |
|
173
|
|
|
|
|
|
|
for particular sections of a site. For example, there may be a |
|
174
|
|
|
|
|
|
|
generic LiveJournal fetcher, and then refinements for particular |
|
175
|
|
|
|
|
|
|
LiveJournal communities, such as the sshg_exchange community. |
|
176
|
|
|
|
|
|
|
This works as either a class function or a method. |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
This must be overridden by the specific fetcher class. |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
$priority = $self->priority(); |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
$priority = WWW::FetchStory::Fetcher::priority($class); |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=cut |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# Base-class default priority.  Always 0 here, whatever the invocant.
sub priority {
    my ($class) = @_;

    return 0;
} # priority
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=head2 allow |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
If this fetcher can be used for the given URL, then this returns |
|
194
|
|
|
|
|
|
|
true. |
|
195
|
|
|
|
|
|
|
This must be overridden by the specific fetcher class. |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
if ($obj->allow($url)) |
|
198
|
|
|
|
|
|
|
{ |
|
199
|
|
|
|
|
|
|
.... |
|
200
|
|
|
|
|
|
|
} |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=cut |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# Base-class default: this fetcher does not claim any URL, so the result is
# always 0 regardless of the URL given.
sub allow {
    my ($self, $url) = @_;

    return 0;
} # allow
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head2 fetch |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Fetch the story, with the given options. |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
%story_info = $obj->fetch( |
|
216
|
|
|
|
|
|
|
urls=>\@urls, |
|
217
|
|
|
|
|
|
|
basename=>$basename, |
|
218
|
|
|
|
|
|
|
toc=>0, |
|
219
|
|
|
|
|
|
|
yaml=>0); |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=over |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
=item basename |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Optional basename used to construct the filenames. |
|
226
|
|
|
|
|
|
|
If this is not given, the basename is derived from the title of the story. |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=item epub |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Create an EPUB file, deleting the HTML files which have been downloaded. |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=item toc |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
Build a table-of-contents file if this is true. |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=item yaml |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Build a YAML file with meta-data about this story if this is true. |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=item meta_only |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
Don't download the story, just parse the meta-data from the web page. |
|
243
|
|
|
|
|
|
|
This is useful if you've had to download the story separately due |
|
244
|
|
|
|
|
|
|
to security restrictions. |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item use_file I<filename> |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Use the given file to parse the meta-data from rather than from |
|
249
|
|
|
|
|
|
|
the web page. (This is usually a pre-downloaded EPUB file) |
|
250
|
|
|
|
|
|
|
Implies meta_only. |
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item urls |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
The URLs of the story. |
|
255
|
|
|
|
|
|
|
The first page is scraped for meta-information about the story, |
|
256
|
|
|
|
|
|
|
including the title and author. Site-specific Fetcher plugins can find additional |
|
257
|
|
|
|
|
|
|
information, including the URLs of all the chapters in a multi-chapter story. |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=back |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Fetch a story and return a hash of information about it.
#
# Arguments (named): urls (array ref), basename, epub, toc, yaml,
# meta_only, use_file, verbose.
#
# Outline:
#   1. get the table-of-contents page for the first URL and parse it into
#      %story_info;
#   2. work out the basename (given, or derived from the title);
#   3. then one of
#        - meta_only / use_file: no download, just derive the values;
#        - epub requested and the site offers one: download that EPUB;
#        - otherwise: download every chapter, optionally build a
#          table-of-contents file or an EPUB from them;
#   4. optionally write "<basename>.yml" with the collected meta-data.
#
# Dies if the YAML file cannot be written.
sub fetch {
    my $self = shift;
    my %args = (
        urls=>undef,
        basename=>'',
        @_
    );

    $self->{verbose} = $args{verbose};

    my $first_url = $args{urls}[0];
    my $toc_content = $self->get_toc(%args, first_url=>$first_url);
    my %story_info = $self->parse_toc(%args, content=>$toc_content,
                                      url=>$first_url);

    my $basename = ($args{basename}
                    ? $args{basename}
                    : $self->get_story_basename($story_info{title}));
    $story_info{basename} = $basename;

    # using a local file implies that nothing is to be downloaded
    $args{meta_only} = 1 if $args{use_file};

    if ($args{meta_only})
    {
        $self->derive_values(info=>\%story_info);
        # "verbose" is optional; compare it as 0 when it was not given
        warn Dump(\%story_info) if (($self->{verbose} || 0) > 1);
    }
    elsif ($args{epub} and exists $story_info{epub_url} and $story_info{epub_url})
    {
        # the site provides a ready-made EPUB: use that
        my %epub_info = $self->get_epub(base=>$basename,
                                        url=>$story_info{epub_url},
                                        meta=>\%story_info);
        $story_info{storyfiles} = [$epub_info{filename}];

        $self->derive_values(info=>\%story_info);
        warn Dump(\%story_info) if (($self->{verbose} || 0) > 1);
    }
    else
    {
        # a parse_toc that supplied no chapters means "nothing to download"
        my @ch_urls = @{ $story_info{chapters} || [] };
        my $one_chapter = (@ch_urls == 1);
        my $first_chapter_is_toc =
            $story_info{toc_first} || $self->{first_is_toc};
        delete $story_info{toc_first};

        my @storyfiles = ();
        my @ch_titles = ();
        my @ch_wc = ();

        # chapter files are numbered from 1, unless there is only one
        # chapter or the first "chapter" is really the table of contents
        my $count = (($one_chapter or $first_chapter_is_toc) ? 0 : 1);
        foreach my $i (0 .. $#ch_urls)
        {
            my $ch_title = sprintf("%s (%d)", $story_info{title}, $i+1);
            my %ch_info = $self->get_chapter(base=>$basename,
                                             count=>$count,
                                             url=>$ch_urls[$i],
                                             title=>$ch_title);
            push @storyfiles, $ch_info{filename};
            push @ch_titles, $ch_info{title};
            push @ch_wc, $ch_info{wordcount};
            $story_info{wordcount} += $ch_info{wordcount};
            $count++;
            sleep 1; # try not to overload the archive
        }
        $self->derive_values(info=>\%story_info);

        warn Dump(\%story_info) if (($self->{verbose} || 0) > 1);

        $story_info{storyfiles} = \@storyfiles;
        $story_info{chapter_titles} = \@ch_titles;
        $story_info{chapter_wc} = \@ch_wc;
        if ($args{toc} and !$args{epub}) # build a table-of-contents
        {
            my $toc = $self->build_toc(info=>\%story_info);
            unshift @{$story_info{storyfiles}}, $toc;
            unshift @{$story_info{chapter_titles}}, "Table of Contents";
        }
        if ($args{epub})
        {
            my $epub_file = $self->build_epub(info=>\%story_info);
            # if we have built an EPUB file, then the storyfiles
            # are now just one EPUB file.
            $story_info{storyfiles} = [$epub_file];
        }
    }

    if ($args{yaml})
    {
        my $filename = sprintf("%s.yml", $story_info{basename});
        open(my $ofh, ">", $filename)
            or die "Can't write to $filename: $!";
        print {$ofh} Dump(\%story_info);
        # buffered write errors only show up when the handle is closed
        close($ofh)
            or die "Can't finish writing $filename: $!";
    }

    return %story_info;
} # fetch
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=head1 Private Methods |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=head2 get_story_basename |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
Figure out the file basename for a story by using its title. |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
$basename = $self->get_story_basename($title); |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=cut |
|
369
|
|
|
|
|
|
|
# Derive a file basename from a story title: a handful of lower-case ASCII
# words joined with underscores.
sub get_story_basename {
    my ($self, $title) = @_;

    # make a word with only letters and numbers
    # and remove HTML entities and UTF-8
    # and with everything lowercase
    # and the spaces replaced with underscores
    my $base = $title;
    for ($base)
    {
        s/^The\s+//;    # get rid of leading "The "
        s/^A\s+//;      # get rid of leading "A "
        s/^An\s+//;     # get rid of leading "An "
        s/-/ /g;        # replace dashes with spaces
    }
    $base = decode_entities($base); # replace entities with UTF-8
    $base =~ s/[^[:ascii:]]//g;     # remove UTF-8
    $base =~ s/[^\w\s]//g;          # remove non-word characters
    $base = lc($base);

    my @words = split(' ', $base);

    # if there are three words or less, use all of them
    return join('_', @words) if (@words <= 3);

    # otherwise keep four significant words, or five if there are a lot
    my $max_words = (@words > 5 ? 5 : 4);
    my $many_words = (@words > 4);
    my @first_words = ();
    foreach my $word (@words)
    {
        last if (@first_words >= $max_words);

        # skip little words
        next if ($word =~ /^(the|a|an|and)$/);

        # if there are a lot of words, skip these little words too
        next if ($many_words
                 and $word =~ /^(of|to|in|or|on|by|i|is|isnt|its)$/);

        push @first_words, $word;
    }

    return join('_', @first_words);

} # get_story_basename
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 extract_story |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Extract the story-content from the fetched content. |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
my ($story, $title) = $self->extract_story(content=>$content, |
|
425
|
|
|
|
|
|
|
title=>$title); |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=cut |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
# Pull the story text and the title out of a fetched HTML page.
#
# Arguments (named): content (the HTML), title (fallback title).
# Returns the list ($story, $title):
#   * $title is the contents of <title> if present, else the given title;
#   * $story is the body of the page passed through tidy_chars(), or the
#     untouched content when no body could be isolated.
sub extract_story {
    my $self = shift;
    my %args = (
        content=>'',
        title=>'',
        @_
    );
    my $content = $args{content};

    my $title = ($content =~ m#<title>([^<]+)</title>#is
                 ? $1
                 : $args{title});

    # Ways of isolating the body, tried in order; the first that matches
    # decides the outcome, even if what it captures is empty.
    my @body_patterns = (
        # some badly formed pages have multiple BODY tags
        qr#<body[^>]*>.*?<body[^>]*>(.*?)</body>#is,
        qr#<body[^>]*>(.*)</body>#is,
        qr#</head>(.*)#is,
    );
    my $story = '';
    foreach my $pattern (@body_patterns)
    {
        if ($content =~ $pattern)
        {
            $story = $1;
            last;
        }
    }

    $story = ($story ? $self->tidy_chars($story) : $content);

    return ($story, $title);

} # extract_story
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head2 make_css |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
Create site-specific CSS styling. |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
$css = $self->make_css(); |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=cut |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# Base-class default: no extra styling, so the result is the empty string.
sub make_css {
    my ($self) = @_;

    return '';
} # make_css
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=head2 tidy |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
Make a tidy, compliant XHTML page from the given story-content. |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
$content = $self->tidy(story=>$story, |
|
494
|
|
|
|
|
|
|
title=>$title); |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=cut |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# Wrap the given story in a minimal HTML page and run it through
# HTML::Tidy::libXML.
#
# Arguments (named): story (HTML fragment), title.
# The story goes through tidy_chars() first; whatever make_css() returns is
# placed in the <head>.  Returns the cleaned-up page.
sub tidy {
    my $self = shift;
    my %args = (
        story=>'',
        title=>'',
        @_
    );

    my $story = $self->tidy_chars($args{story});
    my $title = $args{title};
    my $css = $self->make_css(%args);

    my @page = (
        "<html>\n",
        "<head>\n",
        "<title>$title</title>\n",
        ($css ? $css : ()),
        "</head>\n",
        "<body>\n",
        "$story\n",
        "</body>\n",
        "</html>\n",
    );
    my $html = encode("UTF-8", join('', @page));

    my $tidy = HTML::Tidy::libXML->new();
    my $xhtml = $tidy->clean($html, 'UTF-8', 1);

    # fixing some errors
    $xhtml =~ s!xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml"!xmlns="http://www.w3.org/1999/xhtml"!;
    $xhtml =~ s!<i/>!!g;
    $xhtml =~ s!<b/>!!g;

    return $xhtml;
} # tidy
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=head2 get_toc |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
Get a table-of-contents page. |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=cut |
|
539
|
|
|
|
|
|
|
# Fetch the table-of-contents page: simply the page at the "first_url"
# argument, retrieved with get_page().
sub get_toc {
    my ($self, %args) = @_;

    return $self->get_page($args{first_url});
} # get_toc
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=head2 get_page |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
Get the contents of a URL. |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=cut |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# Get the contents of a URL, or of a local file.
#
#   $content = $self->get_page($url);
#
# $url is read as a file when it does not start with "http://" or
# "https://" and names an existing file.  Otherwise it is downloaded, with
# the wget command line from $self->{wget_cmd} when $self->{use_wget} is
# set, else with $self->{user_agent}.
#
# Dies when the file cannot be opened, the wget pipe cannot be started, or
# the user agent reports a failed request.
# NOTE: when nothing was retrieved and both "verbose" and "debug" are set,
# this calls exit() so that the problem can be investigated.
sub get_page {
    my $self = shift;
    my $url = shift;

    # "verbose" is optional; use 0 for numeric comparisons when it is unset
    my $verbose = $self->{verbose} || 0;

    warn "getting $url\n" if $self->{verbose};
    my $content = '';

    # The "url" might be a file instead.  Only a leading scheme marks a
    # URL; a local path that merely contains "http" is still a file.
    if ($url !~ m{^https?://}i and -f $url)
    {
        # three-argument open: the name is never interpreted as a mode
        open(my $ifh, '<', $url) or die "FAILED to read ${url}: $!";
        local $/;   # slurp the whole file in one read
        my $data = <$ifh>;
        $content = $data if defined $data;
        close($ifh);
    }
    elsif ($self->{use_wget})
    {
        # The command goes through the shell with the URL in single quotes,
        # so any single quote inside the URL has to be escaped.
        (my $quoted_url = $url) =~ s/'/'\\''/g;
        my $cmd = sprintf("%s -O %s '%s'", $self->{wget_cmd}, '-', $quoted_url);
        warn "$cmd\n" if ($verbose > 1);
        open(my $ifh, '-|', $cmd) or die "FAILED $cmd: $!";
        while (my $line = <$ifh>)
        {
            $content .= $line;
        }
        close($ifh);
    }
    else
    {
        my $can_accept = HTTP::Message::decodable();
        my $res = $self->{user_agent}->get($url,
            'Accept-Encoding' => $can_accept,
            'Keep-Alive' => "300",
            'Connection' => 'keep-alive',
        );

        # Check the outcome of the response
        if ($res->is_success) {
            print $res->status_line, "\n" if $self->{debug};
        }
        else {
            die "FAILED fetching $url ", $res->status_line;
        }
        $content = $res->decoded_content || $res->content;
    }

    if (!$content and $self->{verbose})
    {
        warn "No content from $url";
        if ($self->{debug})
        {
            # there's a problem, we want to debug it
            exit;
        }
    }

    return $content;
} # get_page
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=head2 parse_toc |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
Parse the table-of-contents file. |
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
This must be overridden by the specific fetcher class. |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
%info = $self->parse_toc(content=>$content, |
|
622
|
|
|
|
|
|
|
url=>$url, |
|
623
|
|
|
|
|
|
|
urls=>\@urls); |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
This should return a hash containing: |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=over |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
=item chapters |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
An array of URLs for the chapters of the story. In the case where the |
|
632
|
|
|
|
|
|
|
story only takes one page, that will be the chapter. |
|
633
|
|
|
|
|
|
|
In the case where multiple URLs have been passed in, it will be those URLs. |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
=item title |
|
636
|
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
The title of the story. |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=back |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
It may also return additional information, such as Summary. |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
=cut |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
# Parse the table-of-contents page into a hash of story information.
#
# Arguments (named): url, content, plus anything else the individual
# parse_* methods want.  Each field of the result is filled in by the
# parse_* method of the same purpose, called with all of the arguments.
# Returns the hash (as a list).
sub parse_toc {
    my $self = shift;
    my %args = (
        url=>'',
        content=>'',
        @_
    );

    my %info = (url => $args{url});

    # field => method that supplies it, in the order they are to be called
    my @parsers = (
        [ title      => 'parse_title' ],
        [ author     => 'parse_author' ],
        [ summary    => 'parse_summary' ],
        [ characters => 'parse_characters' ],
        [ universe   => 'parse_universe' ],
        [ category   => 'parse_category' ],
        [ rating     => 'parse_rating' ],
        [ chapters   => 'parse_chapter_urls' ],
        [ epub_url   => 'parse_epub_url' ],
    );
    foreach my $parser (@parsers)
    {
        my ($field, $method) = @{$parser};
        $info{$field} = $self->$method(%args);
    }

    return %info;
} # parse_toc
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=head2 parse_chapter_urls |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
Figure out the URLs for the chapters of this story. |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=cut |
|
673
|
|
|
|
|
|
|
# Work out the chapter URLs: a copy of the "urls" list when one was given,
# otherwise just the single "url".  Returns an array reference.
sub parse_chapter_urls {
    my $self = shift;
    my %args = (
        url=>'',
        content=>'',
        @_
    );

    my @chapters = (defined $args{urls}
                    ? @{$args{urls}}
                    : ($args{url}));

    return \@chapters;
} # parse_chapter_urls
|
693
|
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
=head2 parse_epub_url |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
Figure out the URL for the EPUB version of this story, if there is one. |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
=cut |
|
699
|
|
|
|
|
|
|
# Base-class default: no EPUB version is known, so the result is undef
# whatever the arguments (url, content, ...) are.
sub parse_epub_url {
    my ($self, @options) = @_;
    my %args = (url => '', content => '', @options);

    return undef;
} # parse_epub_url
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head2 parse_title |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
Get the title from the content |
|
713
|
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
=cut |
|
715
|
|
|
|
|
|
|
# Get the title from the "content" argument.
#
# A series of patterns is tried in a fixed order and the first one that
# matches supplies the title; <u> and </u> tags are then removed from it.
# Returns the empty string when nothing matches.
sub parse_title {
    my $self = shift;
    my %args = (
        content=>'',
        @_
    );
    my $content = $args{content};

    my @title_patterns = (
        # a bold "Title" label followed by the title itself
        qr/<(?:b|strong)>Title:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si,
        # a plain "Title:" label, up to the next line break
        qr/\bTitle:\s*"?(.*?)"?\s*<br/s,
        # headings, and finally the page title
        qr#<h1>([^<]+)</h1>#is,
        qr#<p class=MsoTitle>([^<]+)</p>#is,
        qr#<h2>([^<]+)</h2>#is,
        qr#<h3>([^<]+)</h3>#is,
        qr#<h4>([^<]+)</h4>#is,
        qr#<title>([^<]+)</title>#is,
    );

    my $title = '';
    foreach my $pattern (@title_patterns)
    {
        if ($content =~ $pattern)
        {
            $title = $1;
            last;
        }
    }

    # remove underlining
    $title =~ s/<u>//ig;
    $title =~ s/<\/u>//ig;

    return $title;
} # parse_title
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=head2 parse_ch_title

Get the chapter title from the content

    my $ch_title = $obj->parse_ch_title(content=>$content);

If the content contains "Chapter N" (optionally followed by a colon or
full stop), the text after it up to the next tag is the chapter title.
Otherwise this falls back to L</parse_title>.  Any C<< <u> >> and
C<< </u> >> tags are removed from the result.

=cut

sub parse_ch_title {
    my ($self, @given) = @_;
    my %args = (content => '', @given);

    my $title = ($args{content} =~ /Chapter \d+[:.]?\s*([^<]+)/si)
        ? $1
        : $self->parse_title(%args);

    $title =~ s/<u>//ig;
    $title =~ s/<\/u>//ig;
    return $title;
} # parse_ch_title
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
=head2 parse_author

Get the author from the content

    my $author = $obj->parse_author(content=>$content);

Tries, in order: a bold "Author:" label, a plain "Author:" label followed
by a line break, a C<< <meta name="author"> >> element, and a paragraph
starting with "by ".  Returns the empty string when none of them match.

=cut

sub parse_author {
    my ($self, @given) = @_;
    my %args = (content => '', @given);
    my $content = $args{content};

    # The first pattern that matches wins.
    return $1
        if $content =~ /<(?:b|strong)>Author:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si;
    return $1
        if $content =~ /\bAuthor:\s*"?(.*?)"?\s*<br/si;
    return $1
        if $content =~ /<meta name="author" content="(.*?)"/si;
    return $1
        if $content =~ /<p>by (.*?)<br/si;

    return '';
} # parse_author
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=head2 parse_summary

Get the summary from the content

    my $summary = $obj->parse_summary(content=>$content);

A list of patterns covering the various ways a "Summary:" (or "Prompt:")
label is marked up is tried in order; the first match supplies the
summary.  For the table-row form, C<< <br> >> tags inside the cell are
replaced by spaces.  Returns the empty string when nothing matches.

=cut

sub parse_summary {
    my ($self, @given) = @_;
    my %args = (content => '', @given);
    my $content = $args{content};

    # Each rule is [ pattern, is-table-cell flag ]; order is significant.
    my @rules = (
        [ qr/<(?:b|strong)>Summary:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si ],
        [ qr{<i>Summary:</i>\s*([^<]+)\s*<br>}s ],
        [ qr{>Summary:\s*</span>\s*([^<]+)\s*<br}s ],
        [ qr/<i>Summary:<\/i>\s*(.*?)\s*$/m ],
        [ qr{<tr><(?:th|td)>Summary</(?:th|td)><td>(.*?)</td></tr>}s, 1 ],
        [ qr/\bSummary:\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si ],
        [ qr{(?:Prompt|Summary):</b>([^<]+)}is ],
        [ qr{(?:Prompt|Summary):</strong>([^<]+)}is ],
        [ qr{(?:Prompt|Summary):</u>([^<]+)}is ],
    );

    foreach my $rule (@rules)
    {
        my ($pattern, $is_table_cell) = @{$rule};
        next unless $content =~ $pattern;

        my $summary = $1;
        # Line breaks inside a table cell become plain spaces.
        $summary =~ s/<br>/ /g if $is_table_cell;
        return $summary;
    }

    return '';
} # parse_summary
|
874
|
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
=head2 parse_characters

Get the characters from the content

    my $characters = $obj->parse_characters(content=>$content);

Looks for a "Characters:" (or, in some forms, "Pairing(s):") label in
several markup variants, in a fixed order, and returns the text of the
first one found.  For the table-row form, C<< <br> >> tags inside the
cell are replaced by ", ".  Returns the empty string when nothing matches.

=cut

sub parse_characters {
    my ($self, @given) = @_;
    my %args = (content => '', @given);
    my $content = $args{content};

    # The first pattern that matches wins.
    return $1
        if $content =~ />Characters:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si;
    return $1
        if $content =~ /\bCharacters:\s*"?(.*?)"?\s*<br/si;
    return $1
        if $content =~ m#<i>Characters:</i>\s*([^<]+)\s*<br>#s;
    return $1
        if $content =~ m#(?:Pairings?|Characters):</(?:b|strong|u)>\s*([^<]+)#is;

    if ($content =~ m#<tr><(?:th|td)>(?:Pairings?|Characters)</(?:th|td)><td>(.*?)</td></tr>#s)
    {
        # One entry per line in the table cell; turn it into a comma list.
        my $cell = $1;
        $cell =~ s/<br>/, /g;
        return $cell;
    }

    return '';
} # parse_characters
|
912
|
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
=head2 parse_universe

Get the universe/fandom from the content

    my $universe = $obj->parse_universe(content=>$content);

Returns the text that follows a "Universe:" or "Fandom:" label closed by
a C<b>, C<strong> or C<u> tag, up to the next tag.  Leading whitespace is
not trimmed.  Returns the empty string when there is no such label.

=cut

sub parse_universe {
    my ($self, @given) = @_;
    my %args = (content => '', @given);

    return $1
        if $args{content} =~ m#(?:Universe|Fandom):</(?:b|strong|u)>([^<]+)#is;

    return '';
} # parse_universe
|
933
|
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=head2 parse_recipient

Get the recipient from the content

    my $recipient = $obj->parse_recipient(content=>$content);

Returns the single word following "Recipient: " or "Prompter: ", or
failing that the text following a "Recipient:" label closed by a C<b>,
C<strong> or C<u> tag.  Returns the empty string when neither is found.

=cut

sub parse_recipient {
    my ($self, @given) = @_;
    my %args = (content => '', @given);
    my $content = $args{content};

    # Plain-text form first: only a single word is captured.
    return $1
        if $content =~ m#(?:Recipient|Prompter): (\w+)#is;

    # Labelled form: everything up to the next tag.
    return $1
        if $content =~ m#Recipient:</(?:b|strong|u)>([^<]+)#is;

    return '';
} # parse_recipient
|
958
|
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
=head2 parse_category

Get the categories from the content

    my $category = $obj->parse_category(content=>$content);

Returns the text following a "Category:" or "Tags:" label closed by a
C<b>, C<strong> or C<u> tag, or failing that the contents of a
"Categories" table row with C<< <br> >> tags replaced by ", ".  Returns
the empty string when neither is found.

=cut

sub parse_category {
    my ($self, @given) = @_;
    my %args = (content => '', @given);
    my $content = $args{content};

    return $1
        if $content =~ m#(?:Category|Tags):</(?:b|strong|u)>([^<]+)#is;

    if ($content =~ m#<tr><(?:th|td)>Categories</(?:th|td)><td>(.*?)</td></tr>#s)
    {
        # One category per line in the table cell; make it a comma list.
        my $cell = $1;
        $cell =~ s/<br>/, /g;
        return $cell;
    }

    return '';
} # parse_category
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=head2 parse_rating

Get the rating from the content

    my $rating = $obj->parse_rating(content=>$content);

Returns the rest of a line that starts with "Rating:", or failing that
the text following a "Rating:" label closed by a C<b>, C<strong> or C<u>
tag.  Returns the empty string when neither is found.

=cut

sub parse_rating {
    my ($self, @given) = @_;
    my %args = (content => '', @given);
    my $content = $args{content};

    # Plain-text form: "Rating:" must be at the start of a line.
    return $1
        if $content =~ m!^Rating:\s(.*?)$!m;

    # Labelled form: everything up to the next tag.
    return $1
        if $content =~ m#Rating:</(?:b|strong|u)>\s*([^<]+)#is;

    return '';
} # parse_rating
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
=head2 derive_values

Calculate additional Meta values, such as current date.

    $obj->derive_values(info=>\%info);

Updates the C<info> hash in place:

=over

=item *

C<fetch_date> is set to today's date (YYYY-MM-DD).

=item *

If C<wordcount> is true, C<story_length> is set to a length category
(Drabble, Double Drabble, Flash, Short Short, Short Story, Novelette,
Novella, Novel or Long Novel).

=item *

C<characters>, C<universe> and C<category> values that contain a comma
are split into array references.

=back

=cut

sub derive_values {
    my ($self, %args) = @_;

    $args{info}->{fetch_date} = time2str('%Y-%m-%d', time);

    my $words = $args{info}->{wordcount};
    if ($words)
    {
        # Checked top to bottom: the exact drabble lengths come first,
        # then the thresholds from longest to shortest.
        my @length_rules = (
            [ sub { $_[0] == 100 },   'Drabble' ],
            [ sub { $_[0] == 200 },   'Double Drabble' ],
            [ sub { $_[0] >= 75000 }, 'Long Novel' ],
            [ sub { $_[0] >= 50000 }, 'Novel' ],
            [ sub { $_[0] >= 25000 }, 'Novella' ],
            [ sub { $_[0] >= 7500 },  'Novelette' ],
            [ sub { $_[0] >= 2000 },  'Short Story' ],
            [ sub { $_[0] > 500 },    'Short Short' ],
            [ sub { $_[0] <= 500 },   'Flash' ],
        );

        my $len = '';
        foreach my $rule (@length_rules)
        {
            my ($applies, $label) = @{$rule};
            next unless $applies->($words);
            $len = $label;
            last;
        }
        $args{info}->{story_length} = $len if $len;
    }

    # Comma-separated lists become real lists.
    foreach my $field (qw{characters universe category})
    {
        my $value = $args{info}->{$field};
        next unless defined $value and $value =~ /,/s;

        $args{info}->{$field} = [ split(/,\s*/s, $value) ];
    }
} # derive_values
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=head2 get_chapter

Get an individual chapter of the story, tidy it,
and save it to a file.

    %chapter = $obj->get_chapter(base=>$basename,
                                 count=>$count,
                                 url=>$url,
                                 title=>$title);

The file is named C<< <base><count>.html >> with a two-digit count, or
C<< <base>.html >> when the count is zero.  Returns a list of key/value
pairs: C<filename>, C<title>, C<wordcount> and C<charcount>.

=cut

sub get_chapter {
    my ($self, @given) = @_;
    my %args = (
        base  => '',
        count => 0,
        url   => '',
        title => '',
        @given
    );

    my $page = $self->get_page($args{url});

    my ($story, $story_title) = $self->extract_story(%args, content=>$page);

    # Prefer a title parsed from the page; otherwise use the one that
    # came back with the extracted story.
    my $chapter_title = $self->parse_ch_title(content=>$page) || $story_title;

    my $html = $self->tidy(story=>$story, title=>$chapter_title);

    my %counts = $self->wordcount(content=>$html);

    #
    # Write the file
    #
    my $filename;
    if ($args{count})
    {
        $filename = sprintf("%s%02d.html", $args{base}, $args{count});
    }
    else
    {
        $filename = sprintf("%s.html", $args{base});
    }
    open(my $out, ">", $filename) || die "Can't write to $filename";
    print $out $html;
    close($out);

    return (
        filename  => $filename,
        title     => $chapter_title,
        wordcount => $counts{words},
        charcount => $counts{chars},
    );
} # get_chapter
|
1119
|
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
=head2 get_epub

Get the EPUB version of the story, save it to a file,
and update the metadata stored inside it.

    %epub_info = $obj->get_epub(base=>$basename,
                                url=>$url,
                                meta=>\%meta);

The file is written to C<< <base>.epub >>.  If it can be read as a zip
archive and contains an C<.opf> member, that member's description is
replaced by the C<summary> from C<meta>, and the remaining C<meta>
entries (other than description, summary, title, chapters, epub_url,
basename and toc_first) are added as C<< <meta> >> elements.  C<meta> is
optional.

Returns a list of key/value pairs; currently just C<filename>.

=cut

sub get_epub {
    my $self = shift;
    my %args = (
        base=>'',
        url=>'',
        meta=>undef,
        @_
    );

    # meta is optional (the default is undef), so do not dereference it blindly.
    my %meta = %{ $args{meta} || {} };
    my $content = $self->get_page($args{url});
    my %epub_info = ();

    #
    # Write the file
    #
    my $filename = $args{base} . '.epub';
    my $ofh;
    open($ofh, ">", $filename) || die "Can't write to $filename: $!";
    # An EPUB is a zip archive: write it as raw bytes on every platform.
    binmode($ofh);
    print $ofh $content;
    # Buffered write errors only surface when the handle is closed.
    close($ofh) || die "Can't write to $filename: $!";

    $epub_info{filename} = $filename;

    #
    # Update the file metadata
    #
    my $zip = Archive::Zip->new();
    my $status = $zip->read( $filename );
    if ($status != AZ_OK)
    {
        # Not a readable archive; leave the downloaded file as it is.
        return %epub_info;
    }
    my @members = $zip->membersMatching('.*\.opf');
    if (@members && $members[0])
    {
        my %values = ();
        my $opf = $zip->contents($members[0]);
        my $dom = XML::LibXML->load_xml(string => $opf,
            load_ext_dtd => 0,
            no_network => 1);
        # NOTE(review): %values is filled here but not read again in this sub.
        my @metanodes = $dom->getElementsByLocalName('metadata');
        foreach my $metanode (@metanodes)
        {
            if ($metanode->hasChildNodes)
            {
                my @children = $metanode->childNodes();
                foreach my $node (@children)
                {
                    $self->epub_parse_one_node(%args,
                        node=>$node,
                        values=>\%values);
                }
            }
        }
        print STDERR "get_epub: about to replace description\n" if $self->{debug};
        $self->epub_replace_description(description=>$meta{summary}, xml=>$dom);

        # remove meta info we don't want to be added to this
        delete $meta{description};
        delete $meta{summary};
        delete $meta{title};
        delete $meta{chapters};
        delete $meta{epub_url};
        delete $meta{basename};
        delete $meta{toc_first};
        warn "EPUB meta: ", Dump(\%meta)
            if ($self->{verbose} && $self->{verbose} > 1);
        $self->epub_add_meta(meta=>\%meta, xml=>$dom);

        my $str = $dom->toString;
        $zip->contents($members[0], $str);
        $zip->overwrite();
    }

    return %epub_info;
} # get_epub
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=head2 epub_replace_description

Replace or add the description to an EPUB file.

    $obj->epub_replace_description(description=>$summary, xml=>$dom);

C<xml> is the parsed OPF document.  Markup is stripped from the
description and double quotes are replaced by two single quotes.  The
first existing C<description> element inside the first C<metadata>
element is removed, and a new C<dc:description> element holding the
cleaned text is appended.  An undefined description is treated as the
empty string.  Does nothing if the document has no C<metadata> element.

=cut

sub epub_replace_description {
    my $self = shift;
    my %args = @_;

    my $dom = $args{xml};
    # The caller may have no summary at all; treat that as empty text
    # rather than running substitutions on undef.
    my $desc = defined $args{description} ? $args{description} : '';

    # need to clean up the description removing things not okay to put in a meta tag
    # (this one pattern removes closing tags as well as opening ones)
    $desc =~ s!<[^>]+>!!g;
    $desc =~ s!"!''!g;
    print STDERR "epub_replace_description: description=$desc\n" if $self->{debug};

    my @metanodes = $dom->getElementsByLocalName('metadata');
    return unless @metanodes;
    my $metanode = $metanodes[0];

    my @dnodes = $metanode->getElementsByLocalName('description');
    if ($dnodes[0])
    {
        $metanode->removeChild($dnodes[0]);
    }
    $metanode->appendTextChild('dc:description', $desc);
} # epub_replace_description
|
1233
|
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
=head2 epub_add_meta

Add the given meta-data to an EPUB file.

    $obj->epub_add_meta(meta=>\%meta, xml=>$dom);

C<xml> is the parsed OPF document.  For every key of C<meta>, in sorted
order, a C<< <meta name="KEY" content="VALUE"/> >> element is appended to
the first C<metadata> element.  Array-reference values are joined with
", " and undefined values become the empty string.  Characters that are
special in XML are escaped; entity references that XML already
understands (C<&amp;>, C<&lt;>, C<&gt;>, C<&quot;>, C<&apos;> and numeric
references) are left alone.  Does nothing if the document has no
C<metadata> element.

=cut

sub epub_add_meta {
    my $self = shift;
    my %args = @_;

    my $dom = $args{xml};
    my @metanodes = $dom->getElementsByLocalName('metadata');
    return unless @metanodes;
    my $metanode = $metanodes[0];

    my %meta = %{ $args{meta} || {} };
    foreach my $key (sort keys %meta)
    {
        my $name = $key;
        my $content = $meta{$key};
        # Multi-valued fields are array refs; store them as one comma list
        # rather than as a stringified reference.
        $content = join(', ', @{$content}) if ref $content eq 'ARRAY';
        $content = '' unless defined $content;

        # The chunk below is parsed as XML, so anything that would make it
        # malformed has to be escaped first.
        foreach my $text ($name, $content)
        {
            $text =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
            $text =~ s/</&lt;/g;
            $text =~ s/>/&gt;/g;
            $text =~ s/"/&quot;/g;
        }

        my $chunk = qq{<meta name="$name" content="$content"/>\n};
        $metanode->appendWellBalancedChunk( $chunk );
    }

} # epub_add_meta
|
1258
|
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
=head2 epub_parse_one_node

Parse a node of meta-information from an EPUB file.

    $obj->epub_parse_one_node(node=>$node, values=>\%values);

The information found in C<node> is merged into the C<values> hash:

=over

=item *

A C<meta> element with attributes is stored under the value of its
C<name> attribute; the stored value is that of its other attribute (the
last one, if there are several).

=item *

Any other element with attributes is stored under its local name as a
hash of its attributes, plus a C<text> entry if it has text content.

=item *

An element without attributes is stored under its local name as its
whitespace-normalised text content.

=back

Existing values are never overwritten; repeated names are collected
into arrays.  Returns C<undef> for a node that has no local name.

=cut

sub epub_parse_one_node {
    my ($self, %params) = @_;

    my $node   = $params{node};
    my $merged = $params{values};

    my $name = $node->localname;
    return undef unless $name;

    my $text = $node->textContent;
    $text =~ s/^\s+//s;
    $text =~ s/\s+$//s;
    $text =~ s/\s\s+/ /gs;

    # Collect the attributes once, as [ name, trimmed value ] pairs.
    my $has_attributes = $node->hasAttributes;
    my @pairs;
    if ($has_attributes)
    {
        foreach my $att ($node->attributes())
        {
            my $att_name  = $att->localname;
            my $att_value = $att->textContent;
            $att_value =~ s/^\s+//s;
            $att_value =~ s/\s+$//s;
            push @pairs, [ $att_name, $att_value ];
        }
    }

    my %found;
    if ($has_attributes and $name eq 'meta')
    {
        my ($meta_name, $meta_content) = ('', '');
        foreach my $pair (@pairs)
        {
            if ($pair->[0] eq 'name')
            {
                $meta_name = $pair->[1];
            }
            else
            {
                $meta_content = $pair->[1];
            }
        }
        $found{$meta_name} = $meta_content;
    }
    elsif ($has_attributes)
    {
        $found{$name}->{text} = $text if $text;
        $found{$name}->{ $_->[0] } = $_->[1] foreach @pairs;
    }
    else
    {
        $found{$name} = $text;
    }

    # Don't want to overwrite existing values
    foreach my $key (sort keys %found)
    {
        my $new = $found{$key};

        if (!exists $merged->{$key})
        {
            $merged->{$key} = $new;
            next;
        }

        my $old = $merged->{$key};
        if (ref $old eq 'ARRAY')
        {
            # Already a list of values: just extend it.
            push @{$old}, $new;
        }
        elsif (ref $new or !ref $old)
        {
            # Second value for this name: start a list.
            $merged->{$key} = [ $old, $new ];
        }
        else
        {
            # Plain value arriving where a hash is stored: record it in the hash.
            $old->{$new} = $new;
        }
    }
} # epub_parse_one_node
|
1361
|
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
=head2 wordcount

Figure out the word-count.

    my %wc = $obj->wordcount(content=>$html);

The content is passed through the object's C<stripper>, line breaks are
turned into spaces, and whitespace is trimmed and collapsed.  Returns a
list of key/value pairs: C<words> (the number of whitespace-separated
words) and C<chars> (the length of the cleaned-up text).  When C<debug>
is set, the counts are printed, along with the stripped text if there
are fewer than 200 words.

=cut

sub wordcount {
    my ($self, %args) = @_;

    #
    # Count the words
    #
    my $stripper = $self->{stripper};
    my $text = $stripper->parse($args{content});
    $stripper->eof;

    $text =~ s/[\n\r]/ /sg; # remove line splits
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g; # remove excess whitespace

    my @tokens = split(' ', $text);
    my $word_total = scalar @tokens;
    my $char_total = length($text);

    if ($self->{debug})
    {
        my $orig_length = length($args{content});
        print "orig_length=$orig_length, words=$word_total, chars=$char_total\n";
        # too short!
        print "====== stripped ======\n$text\n======\n" if $word_total < 200;
    }

    return (
        words => $word_total,
        chars => $char_total,
    );
} # wordcount
|
1399
|
|
|
|
|
|
|
|
|
1400
|
|
|
|
|
|
|
=head2 build_toc

Build a local table-of-contents file from the meta-info about the story.

    my $filename = $self->build_toc(info=>\%info);

The file is written to C<< <basename>00.html >> and lists the title,
author, source URL, summary, word count, universe and characters,
followed by a link to each chapter file with its title and word count.
C<universe> and C<characters> may be plain strings or array references.
Returns the name of the file written.

=cut

sub build_toc {
    my $self = shift;
    my %args = (
        @_
    );
    my $info = $args{info};

    my $filename = sprintf("%s00.html", $info->{basename});

    # These two may be lists or plain strings, and may be missing entirely.
    my %shown;
    foreach my $field (qw(characters universe))
    {
        my $value = $info->{$field};
        $value = join( ', ', @{$value} ) if ref $value;
        $shown{$field} = defined $value ? $value : '';
    }
    my $characters = $shown{characters};
    my $universe = $shown{universe};

    # Words, Universe and Characters share one paragraph.
    my $html = <<EOT;
<html>
<head><title>$info->{title}</title></head>
<body>
<h1>$info->{title}</h1>
<p>by $info->{author}</p>
<p>Fetched from <a href="$info->{url}">$info->{url}</a></p>
<p><b>Summary:</b>
$info->{summary}
</p>
<p><b>Words:</b> $info->{wordcount}<br/>
<b>Universe:</b> $universe<br/>
<b>Characters:</b> $characters</p>
<ol>
EOT

    # Tolerate missing lists: a story with no chapter data gets an empty list.
    my @storyfiles = @{ $info->{storyfiles} || [] };
    my @ch_titles = @{ $info->{chapter_titles} || [] };
    my @ch_wc = @{ $info->{chapter_wc} || [] };
    foreach my $i (0 .. $#storyfiles)
    {
        $html .= sprintf("<li><a href=\"%s\">%s</a> (%d)</li>",
            $storyfiles[$i],
            $ch_titles[$i],
            $ch_wc[$i]);
    }
    $html .= "\n</ol>\n</body></html>\n";

    my $ofh;
    open($ofh, ">", $filename) || die "Can't write to $filename: $!";
    print $ofh $html;
    # Buffered write errors only surface when the handle is closed.
    close($ofh) || die "Can't write to $filename: $!";

    return $filename;
} # build_toc
|
1457
|
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
=head2 build_epub |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
Create an EPUB file from the story files and meta information. |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
$self->build_epub() |
|
1463
|
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
=cut |
|
1465
|
|
|
|
|
|
|
# Build an EPUB file from the fetched chapter files and the story's
# meta information.
#
# Named arguments:
#   info => hashref of story information.  The keys read here are
#           title, author, summary, url, fetch_date, basename,
#           storyfiles (arrayref of chapter file names) and
#           chapter_titles (arrayref, parallel to storyfiles).
#           Every other key is treated as extra metadata.
#
# Returns the name of the EPUB file written ("<basename>.epub").
# Dies if the EPUB file is missing or empty after packing; in that case
# the chapter files are left in place.
sub build_epub {
    my $self = shift;
    my %args = (
        @_
    );
    my $info = $args{info};

    my $epub = EBook::EPUB->new;
    $epub->add_title($info->{title});
    $epub->add_author($info->{author});
    $epub->add_description($info->{summary});
    $epub->add_language('en');
    $epub->add_source($info->{url}, 'URL');
    $epub->add_date($info->{fetch_date}, 'fetched');

    # Add Subjects and additional Meta
    # Also build up the title-page
    # NOTE(review): the values are interpolated into the markup unescaped;
    # presumably they are already HTML-safe -- confirm against the callers.
    my $info_str =<<EOT;
<h1>$info->{title}</h1>
<p>by $info->{author}</p>
<p><b>Fetched from:</b> $info->{url}</p>
<p><b>Summary:</b> $info->{summary}</p>
<p>
EOT

    # Keys that are handled explicitly and must not be repeated as
    # generic metadata.
    my %handled = map { $_ => 1 } qw(
        title author summary url fetch_date basename
        chapter_titles chapter_wc chapters storyfiles
    );

    # category and story_length become subjects; everything else becomes
    # a generic meta item.
    my $add_meta = sub {
        my ($key, $value) = @_;
        if ($key =~ /^(?:category|story_length)$/)
        {
            $epub->add_subject($value);
        }
        else
        {
            $epub->add_meta_item($key, $value);
        }
    };

    foreach my $key (sort keys %{$info})
    {
        next if $handled{$key};

        my $value = $info->{$key};
        # false values (undef, '', 0) are not worth recording
        next if !$value;

        my @values;
        if (!ref $value)
        {
            $info_str .= sprintf("<b>%s:</b> %s<br/>\n", $key, $value);
            # a comma-separated scalar is recorded as separate items
            @values = split(/,\s*/, $value);
        }
        elsif (ref $value eq 'ARRAY')
        {
            $info_str .= sprintf("<b>%s:</b> %s<br/>\n",
                $key, join(', ', @{$value}));
            @values = @{$value};
        }
        else
        {
            # only plain strings and array refs can be turned into
            # metadata; anything else is skipped rather than dying
            next;
        }

        foreach my $v (@values)
        {
            $add_meta->($key, $v);
        }
    }

    $info_str .= "</p>\n";

    my $titlepage = $self->tidy(story=>$info_str, title=>$info->{title});
    my $play_order = 1;
    my $id = $epub->add_xhtml("title.html", $titlepage);

    # Add top-level nav-point
    $epub->add_navpoint(
        label => "ToC",
        id => $id,
        content => "title.html",
        play_order => $play_order # should always start with 1
    );

    # a missing list is treated as empty rather than a fatal error
    my @storyfiles = @{$info->{storyfiles} || []};
    my @ch_titles = @{$info->{chapter_titles} || []};
    for my $i (0 .. $#storyfiles)
    {
        $play_order++;
        $id = $epub->copy_xhtml($storyfiles[$i], $storyfiles[$i]);
        $epub->add_navpoint(
            # fall back to a generated label if a title is missing
            label => (defined $ch_titles[$i]
                ? $ch_titles[$i]
                : 'Chapter ' . ($i + 1)),
            id => $id,
            content => $storyfiles[$i],
            play_order => $play_order,
        );
    }

    my $epub_file = $info->{basename} . '.epub';
    $epub->pack_zip($epub_file);

    # The chapter files are the only copy of the story, so make sure the
    # EPUB really exists before deleting them.
    if (!-s $epub_file)
    {
        die "Failed to write $epub_file";
    }

    # now unlink the storyfiles
    foreach my $file (@storyfiles)
    {
        unlink $file or warn "Can't remove $file: $!";
    }

    return $epub_file;
} # build_epub
|
1593
|
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
=head2 tidy_chars |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
Remove nasty encodings. |
|
1597
|
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
$content = $self->tidy_chars($content); |
|
1599
|
|
|
|
|
|
|
|
|
1600
|
|
|
|
|
|
|
=cut |
|
1601
|
|
|
|
|
|
|
# Replace troublesome entities and characters with plain equivalents and
# strip MS-Word markup cruft from a chunk of HTML.
#
#   $content = $self->tidy_chars($content);
#
# Takes the HTML as a string and returns the cleaned-up string.
# An undefined argument is returned unchanged.
sub tidy_chars {
    my $self = shift;
    my $string = shift;

    return $string if !defined $string;

    # numeric entities
    $string =~ s/&#13;//sg;     # carriage return
    $string =~ s/&#39;/'/sg;
    $string =~ s/&#34;/"/sg;
    $string =~ s/&#45;/-/sg;
    $string =~ s/&#160;/ /sg;   # non-breaking space

    #-------------------------------------------------------
    # from Catalyst::Plugin::Params::Demoronize
    zap_cp1252($string);

    # \302 and \240 are regex octal escapes (0xC2 and 0xA0).
    # NOTE(review): presumably these are the two bytes of a UTF-8
    # non-breaking space -- confirm.
    $string =~ s/\302//g;
    $string =~ s/\240/ /g;

    #-------------------------------------------------------
    # from demoroniser
    # http://www.fourmilab.ch/webtools/demoroniser/
    #-------------------------------------------------------

    # Supply missing semicolon at end of numeric entity if
    # Billy's bozos left it out.

    $string =~ s/(&#[0-2]\d\d)\s/$1; /g;

    # Fix dimbulb obscure numeric rendering of &lt; &gt; &amp;
    # (&#39; needs no entry here: it was already replaced above.)

    $string =~ s/\&\#038;/&amp;/g;
    $string =~ s/\&\#060;/&lt;/g;
    $string =~ s/\&\#062;/&gt;/g;

    # Translate Unicode numeric punctuation characters
    # into ISO equivalents

    my @punctuation = (
        [8208, '-'],        # 0x2010 Hyphen
        [8209, '-'],        # 0x2011 Non-breaking hyphen
        [8211, '-'],        # 0x2013 En dash
        [8212, '--'],       # 0x2014 Em dash
        [8213, '--'],       # 0x2015 Horizontal bar/quotation dash
        [8214, '||'],       # 0x2016 Double vertical line
        [8215, '_'],        # 0x2017 Double low line
        [8216, '`'],        # 0x2018 Left single quotation mark
        [8217, "'"],        # 0x2019 Right single quotation mark
        [8218, ','],        # 0x201A Single low-9 quotation mark
        [8219, '`'],        # 0x201B Single high-reversed-9 quotation mark
        [8220, '"'],        # 0x201C Left double quotation mark
        [8221, '"'],        # 0x201D Right double quotation mark
        [8222, ',,'],       # 0x201E Double low-9 quotation mark
        [8223, '"'],        # 0x201F Double high-reversed-9 quotation mark
        [8226, '*'],        # 0x2022 Bullet
        [8227, '*'],        # 0x2023 Triangular bullet
        [8228, '.'],        # 0x2024 One dot leader
        [8229, '..'],       # 0x2025 Two dot leader
        [8230, '...'],      # 0x2026 Horizontal ellipsis
        [8231, '&#183;'],   # 0x2027 Hyphenation point
    );
    foreach my $pair (@punctuation)
    {
        my ($code, $plain) = @{$pair};
        $string =~ s/&#$code;/$plain/sg;
    }
    #-------------------------------------------------------

    # and somehow some of the entities go funny
    $string =~ s/\&\#133;/.../g;
    $string =~ s/\&nbsp;/ /g;
    $string =~ s/\&lsquo;/'/g;
    $string =~ s/\&rsquo;/'/g;
    $string =~ s/\&ldquo;/"/g;
    $string =~ s/\&rdquo;/"/g;
    $string =~ s/\&quot;/"/g;
    $string =~ s/\&ndash;/-/g;
    $string =~ s/\&hellip;/.../g;

    # replace double-breaks with <p>
    $string =~ s#<br\s*\/?>\s*<br\s*\/?>#\n<p>#sg;

    # remove other cruft (opening and closing tags alike, so that no
    # unbalanced closing tag is left behind)
    $string =~ s#</?(?:wbr|nobr)>##sg;
    $string =~ s#<wbr/>##sg;

    # Clean unwanted MS-Word HTML
    $string =~ s#<!--\[if gte mso \d*\]>.*?<!\[endif\]-->##sg;
    $string =~ s#<!--\[if !mso\]>.*?<!\[endif\]-->##sg;
    $string =~ s!<[/]?(font|span|xml|del|ins|[ovwxp]:\w+|st\d:\w+)[^>]*?>!!igs;
    # Strip presentational attributes.  Each pass removes at most one
    # attribute per tag, so repeat until nothing is left to remove.
    # The \s in front of the name stops this from matching inside
    # longer attribute names such as hreflang= or xml:lang=.
    1 while $string =~ s!<([^>]*\s)(?:lang|style|size|face|[ovwxp]:\w+)=(?:'[^']*'|"[^"]*"|[^\s>]+)([^>]*)>!<$1$2>!isg;
    $string =~ s/\s*class="Banner[0-9]+"//g;
    $string =~ s/\s*class="Textbody"//g;
    $string =~ s/\s*class="MsoNormal"//g;
    $string =~ s/\s*class="MsoBodyText"//g;

    return $string;
} # tidy_chars
|
1701
|
|
|
|
|
|
|
|
|
1702
|
|
|
|
|
|
|
1; # End of WWW::FetchStory::Fetcher |
|
1703
|
|
|
|
|
|
|
__END__ |