File Coverage

blib/lib/WWW/FetchStory/Fetcher/Gutenberg.pm

Criterion	Covered	Total	%
statement	9	111	8.1
branch	0	34	0.0
condition			n/a
subroutine	3	13	23.0
pod	10	10	100.0
total	22	168	13.1

line	stmt	bran	sub	pod	time	code
1						package WWW::FetchStory::Fetcher::Gutenberg;
2						$WWW::FetchStory::Fetcher::Gutenberg::VERSION = '0.2602';
3	1		1		182950	use strict;
	1				2
	1				34
4	1		1		3	use warnings;
	1				1
	1				99
5						=head1 NAME
6
7						WWW::FetchStory::Fetcher::Gutenberg - fetching module for WWW::FetchStory
8
9						=head1 VERSION
10
11						version 0.2602
12
13						=head1 DESCRIPTION
14
15						This is the Gutenberg story-fetching plugin for WWW::FetchStory.
16
17						=cut
18
19	1		1		7	use parent qw(WWW::FetchStory::Fetcher);
	1				2
	1				9
20
21						=head1 METHODS
22
23						=head2 info
24
25						Information about the fetcher.
26
27						$info = $self->info();
28
29						=cut
30
31						sub info {
32	0		0	1		my $self = shift;
33
34	0					my $info = "(http://www.gutenberg.org) Project Gutenberg; public-domain works";
35
36	0					return $info;
37						} # info
38
39						=head2 priority
40
41						The priority of this fetcher. Fetchers with higher priority
42						get tried first. This is useful where there may be a generic
43						fetcher for a particular site, and then a more specialized fetcher
44						for particular sections of a site. For example, there may be a
45						generic Gutenberg fetcher, and then refinements for particular
46						Gutenberg community, such as the sshg_exchange community.
47						This works as either a class function or a method.
48
49						This must be overridden by the specific fetcher class.
50
51						$priority = $self->priority();
52
53						$priority = WWW::FetchStory::Fetcher::priority($class);
54
55						=cut
56
57						sub priority {
58	0		0	1		my $class = shift;
59
60	0					return 1;
61						} # priority
62
63						=head2 allow
64
65						If this fetcher can be used for the given URL, then this returns
66						true.
67						This must be overridden by the specific fetcher class.
68
69						if ($obj->allow($url))
70						{
71						....
72						}
73
74						=cut
75
76						sub allow {
77	0		0	1		my $self = shift;
78	0					my $url = shift;
79
80	0					return ($url =~ /gutenberg\.org/);
81						} # allow
82
83						=head1 Private Methods
84
85						=head2 parse_toc
86
87						Parse the table-of-contents file.
88
89						%info = $self->parse_toc(content=>$content,
90						url=>$url,
91						urls=>\@urls);
92
93						This should return a hash containing:
94
95						=over
96
97						=item chapters
98
99						An array of URLs for the chapters of the story. In the case where the
100						story only takes one page, that will be the chapter.
101						In the case where multiple URLs have been passed in, it will be those URLs.
102
103						=item title
104
105						The title of the story.
106
107						=back
108
109						It may also return additional information, such as Summary.
110
111						=cut
112
113						sub parse_toc {
114	0		0	1		my $self = shift;
115	0					my %args = (
116						url=>'',
117						content=>'',
118						@_
119						);
120
121	0					my $content = $args{content};
122	0					my %info = ();
123	0					$info{url} = $args{url};
124	0					$info{title} = $self->parse_title(%args);
125	0					$info{author} = $self->parse_author(%args);
126	0					$info{summary} = $self->parse_summary(%args);
127	0					$info{characters} = $self->parse_characters(%args);
128	0					$info{category} = $self->parse_category(%args);
129	0					$info{chapters} = $self->parse_chapter_urls(%args);
130
131	0					return %info;
132						} # parse_toc
133
134						=head2 parse_chapter_urls
135
136						Figure out the URLs for the chapters of this story.
137
138						=cut
139						sub parse_chapter_urls {
140	0		0	1		my $self = shift;
141	0					my %args = (
142						url=>'',
143						content=>'',
144						@_
145						);
146	0					my $content = $args{content};
147	0					my @chapters = ();
148	0	0				if (defined $args{urls})
149						{
150	0					@chapters = @{$args{urls}};
	0
151						}
152	0	0				if (@chapters == 1)
153						{
154	0	0				if ($args{url} =~ m{http://www.gutenberg.org/ebooks/(\d+)})
155						{
156	0					my $sid = $1;
157	0					@chapters = ("http://www.gutenberg.org/files/${sid}/${sid}-h/${sid}-h.htm");
158						}
159						}
160
161	0					return \@chapters;
162						} # parse_chapter_urls
163
164						=head2 parse_title
165
166						Get the title from the content
167
168						=cut
169						sub parse_title {
170	0		0	1		my $self = shift;
171	0					my %args = (
172						url=>'',
173						content=>'',
174						@_
175						);
176
177	0					my $content = $args{content};
178	0					my $title = '';
179	0	0				if ($content =~ m#<th>Title</th>\s<td>\s([^<]+)\s*</td>#s)
		0
180						{
181	0					$title = $1;
182						}
183						elsif ($content =~ m#<h1 class="icon_title">\s(.?)\sby\s[^<]+\s</h1>#s)
184						{
185	0					$title = $1;
186						}
187						else
188						{
189	0					$title = $self->SUPER::parse_title(%args);
190						}
191	0					$title =~ s/\s+$//;
192	0					return $title;
193						} # parse_title
194
195						=head2 parse_ch_title
196
197						Get the chapter title from the content
198
199						=cut
200						sub parse_ch_title {
201	0		0	1		my $self = shift;
202	0					my %args = (
203						url=>'',
204						content=>'',
205						@_
206						);
207
208	0					my $content = $args{content};
209	0					my $title = '';
210	0	0				if ($content =~ m#Title: (.*)$#m)
211						{
212	0					$title = $1;
213						}
214						else
215						{
216	0					$title = $self->SUPER::parse_title(%args);
217						}
218	0					$title =~ s/\s+$//;
219	0					return $title;
220						} # parse_ch_title
221
222						=head2 parse_author
223
224						Get the author from the content
225
226						=cut
227						sub parse_author {
228	0		0	1		my $self = shift;
229	0					my %args = (
230						url=>'',
231						content=>'',
232						@_
233						);
234
235	0					my $content = $args{content};
236	0					my $author = '';
237	0	0				if ($content =~ m#<h1 class="icon_title">\s[^<]+\sby\s+([^<]+)\s*</h1>#s)
238						{
239	0					$author = $1;
240						}
241						else
242						{
243	0					$author = $self->SUPER::parse_author(%args);
244						}
245	0					$author =~ s/\s+$//;
246	0					return $author;
247						} # parse_author
248
249						=head2 parse_category
250
251						Get the category from the content
252
253						=cut
254						sub parse_category {
255	0		0	1		my $self = shift;
256	0					my %args = (
257						url=>'',
258						content=>'',
259						@_
260						);
261
262	0					my $content = $args{content};
263	0					my $category = '';
264	0					my %cats = ();
265	0					while ($content =~ m#<th>Subject</th>\s<td[^>]>\s<a [^>]+>\s+([^<]+)\s+</a>\s</td>#sg)
266						{
267	0					my $subjects = $1;
268	0	0				if ($subjects !~ /Fictitious character/)
269						{
270	0					my @subs = split(/ -- /, $subjects);
271	0	0				if (@subs)
272						{
273	0					foreach my $sub (@subs)
274						{
275						# some subjects need even more parsing
276	0	0				if ($sub =~ m#(.?) $(.)$#)
277						{
278	0					my $sub1 = $1;
279	0					my $sub2 = $2;
280	0					$cats{$sub2} = 1;
281	0					$sub = $sub1;
282						}
283	0	0				if ($sub =~ m#(.?), (.)#)
284						{
285	0					$sub = "$2 $1";
286						}
287	0					$cats{$sub} = 1;
288						}
289						}
290						else
291						{
292	0					$cats{$subjects} = 1;
293						}
294						}
295						}
296	0	0				if (%cats)
297						{
298	0					$category = join(', ', sort keys %cats);
299						}
300	0					return $category;
301						} # parse_category
302
303						=head2 parse_characters
304
305						Get the characters from the content
306
307						=cut
308						sub parse_characters {
309	0		0	1		my $self = shift;
310	0					my %args = (
311						url=>'',
312						content=>'',
313						@_
314						);
315
316	0					my $content = $args{content};
317	0					my $characters = '';
318	0					my %chars = ();
319	0					while ($content =~ m#<th>Subject</th>\s<td[^>]>\s<a [^>]+>\s+([^<]+)\s+</a>\s</td>#sg)
320						{
321	0					my $subjects = $1;
322	0	0				if ($subjects =~ /Fictitious character/)
323						{
324	0					my @subs = split(/ -- /, $subjects);
325	0	0				if (@subs)
326						{
327	0					foreach my $sub (@subs)
328						{
329						# only look at the characters
330	0	0				if ($sub =~ m#(.*?) $Fictitious character$#)
331						{
332	0					my $sub1 = $1;
333	0	0				if ($sub1 =~ m#(.?), (.)#)
334						{
335	0					$sub1 = "$2 $1";
336						}
337	0					$chars{$sub1} = 1;
338						}
339						}
340						}
341						}
342						}
343	0	0				if (%chars)
344						{
345	0					$characters = join(', ', sort keys %chars);
346						}
347	0					return $characters;
348						} # parse_characters
349
350						1; # End of WWW::FetchStory::Fetcher::Gutenberg
351						__END__