File Coverage

blib/lib/WWW/FetchStory/Fetcher/AO3.pm

Criterion	Covered	Total	%
statement	18	265	6.7
branch	0	98	0.0
condition	0	21	0.0
subroutine	6	22	27.2
pod	16	16	100.0
total	40	422	9.4

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::FetchStory::Fetcher::AO3;
2							$WWW::FetchStory::Fetcher::AO3::VERSION = '0.2602';
3	1			1		160977	use strict;
	1					2
	1					36
4	1			1		5	use warnings;
	1					1
	1					65
5							=head1 NAME
6
7							WWW::FetchStory::Fetcher::AO3 - fetching module for WWW::FetchStory
8
9							=head1 VERSION
10
11							version 0.2602
12
13							=head1 DESCRIPTION
14
15							This is the AO3 story-fetching plugin for WWW::FetchStory.
16
17							=cut
18
19	1			1		5	use parent qw(WWW::FetchStory::Fetcher);
	1					2
	1					12
20	1			1		96	use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
	1					2
	1					298
21	1			1		11	use YAML::Any;
	1					3
	1					17
22	1			1		2584	use Path::Tiny;
	1					16813
	1					5582
23
24							=head1 METHODS
25
26							=head2 info
27
28							Information about the fetcher.
29
30							$info = $self->info();
31
32							=cut
33
34							sub info {
35	0			0	1		my $self = shift;
36
37	0						my $info = "http://www.archiveofourown.org AO3 General fanfic archive";
38
39	0						return $info;
40							} # info
41
42							=head2 priority
43
44							The priority of this fetcher. Fetchers with higher priority
45							get tried first. This is useful where there may be a generic
46							fetcher for a particular site, and then a more specialized fetcher
47							for particular sections of a site. For example, there may be a
48							generic Livejournal fetcher, and then refinements for particular
49							Livejournal community, such as the sshg_exchange community.
50							This works as either a class function or a method.
51
52							$priority = $self->priority();
53
54							$priority = WWW::FetchStory::Fetcher::priority($class);
55
56							=cut
57
58							sub priority {
59	0			0	1		my $class = shift;
60
61	0						return 1;
62							} # priority
63
64							=head2 allow
65
66							If this fetcher can be used for the given URL, then this returns
67							true.
68							This must be overridden by the specific fetcher class.
69
70							if ($obj->allow($url))
71							{
72							....
73							}
74
75							=cut
76
77							sub allow {
78	0			0	1		my $self = shift;
79	0						my $url = shift;
80
81	0	0	0				if (-f $url and $url =~ /\.html$/) # is a HTML file, not a URL
82							{
83							# We can figure out if this is from AO3 by looking inside it
84	0						my $path = path($url);
85	0						my $contents = $path->slurp;
86	0						return ($contents =~ m!<meta name="author" content="Organization for Transformative Works" />!);
87							}
88							else
89							{
90	0		0				return ($url =~ /archiveofourown\.org/ \|\| $url =~ /ao3\.org/);
91							}
92							} # allow
93
94							=head1 Private Methods
95
96							=head2 get_toc
97
98							Get a table-of-contents page.
99							If we already have a downloaded EPUB file
100							then we extract the needed pages from that instead.
101
102							=cut
103							sub get_toc {
104	0			0	1		my $self = shift;
105	0						my %args = @_;
106
107	0	0	0				if ($args{use_file} and $args{use_file} =~ /epub$/)
108							{
109	0						my $file = $args{use_file};
110	0						my $zip = Archive::Zip->new();
111	0						my $status = $zip->read( $file );
112	0	0					if ($status != AZ_OK)
113							{
114	0						die "Failed to read $file";
115							}
116							# The AO3 EPUB files are generated by Calibre
117							# and have predictable names.
118							# The meta-data for the story is contained in
119							# (name)_split_000.xhtml and
120							# (name)_split_001.xhtml
121	0						my @html_members = $zip->membersMatching('.*_split_00[0-1]\.xhtml');
122	0						my $toc_content = '';
123	0						my $first_title = '';
124	0						for (my $i = 0; $i < @html_members; $i++)
125							{
126	0						my $html = $zip->contents($html_members[$i]);
127	0						my ($body,$title) = $self->extract_story(content=>$html);
128	0	0					$first_title = $title if !$first_title;
129	0						$toc_content .= $body;
130							}
131	0						my $toc_page = "<html>
132							<head><title>$first_title</title></head>
133							<body>
134							$toc_content
135							</body></html>
136							";
137	0						return $toc_page;
138							}
139							else
140							{
141	0						return $self->SUPER::get_toc(%args);
142							}
143							} # get_toc
144
145							=head2 parse_toc
146
147							Parse the table-of-contents file.
148
149							%info = $self->parse_toc(content=>$content,
150							url=>$url,
151							urls=>\@urls);
152
153							This should return a hash containing:
154
155							=over
156
157							=item chapters
158
159							An array of URLs for the chapters of the story. In the case where the
160							story only takes one page, that will be the chapter.
161							In the case where multiple URLs have been passed in, it will be those URLs.
162
163							=item title
164
165							The title of the story.
166
167							=back
168
169							It may also return additional information, such as Summary.
170
171							=cut
172
173							sub parse_toc {
174	0			0	1		my $self = shift;
175	0						my %args = (
176							url=>'',
177							content=>'',
178							@_
179							);
180
181	0	0					say STDERR whoami() if $args{verbose} > 1;
182	0	0					say STDERR Dump(\%args) if $args{verbose} > 2;
183
184	0						my $content = $args{content};
185	0						my %info = ();
186	0						$info{url} = $args{url};
187	0	0					if ($args{url} =~ /epub$/)
188							{
189							# We're using an already-downloaded EPUB file
190							# And the URL should be in the contents of the TOC
191	0	0					if ($content =~ m!Posted originally on the <a href="https?://archiveofourown.org/">Archive of Our Own</a> at <a href="(https?://archiveofourown.org/works/\d+)"!)
192							{
193	0						$info{url} = $1;
194							}
195							}
196
197							# sid: story-id, work-id
198	0						my $sid='';
199	0	0	0				if ($info{url} =~ m#archiveofourown.org/works/(\d+)#)
		0
		0
200							{
201	0						$sid = $1;
202							}
203							elsif ($args{rurl} and $args{rurl} =~ m#archiveofourown.org/works/(\d+)#)
204							{
205	0						$sid = $1;
206							}
207							elsif (!$args{rurl}) # derive the sid from the contents of the toc
208							{
209	0	0					if ($content =~ m!work_id=(\d+)!)
		0
210							{
211	0						$sid = $1;
212	0						$info{url} = "https://archiveofourown.org/works/${sid}";
213							}
214							elsif ($content =~ m!/works/(\d+)/!)
215							{
216	0						$sid = $1;
217	0						$info{url} = "https://archiveofourown.org/works/${sid}";
218							}
219							}
220	0	0					if (!$sid)
221							{
222	0						print STDERR "did not find SID for $args{url} $info{url}";
223	0						return $self->SUPER::parse_toc(%args);
224							}
225
226	0						$info{title} = $self->parse_title(%args);
227	0						$info{author} = $self->parse_author(%args);
228	0						$info{summary} = $self->parse_summary(%args);
229	0						$info{characters} = $self->parse_characters(%args);
230	0						$info{universe} = $self->parse_universe(%args);
231	0						$info{category} = $self->parse_category(%args);
232	0						$info{rating} = $self->parse_rating(%args);
233	0						$info{chapters} = $self->parse_chapter_urls(%args, sid=>$sid);
234	0						my $epub_url = $self->parse_epub_url(%args, sid=>$sid);
235	0	0					if ($epub_url)
236							{
237	0						$info{epub_url} = $epub_url;
238							}
239	0	0	0				if ($args{epub} or $args{url} =~ /epub$/) # need to parse the wordcount
240							{
241	0						$info{wordcount} = $self->parse_wordcount(%args);
242							}
243
244	0						return %info;
245							} # parse_toc
246
247							=head2 parse_chapter_urls
248
249							Figure out the URLs for the chapters of this story.
250
251							=cut
252							sub parse_chapter_urls {
253	0			0	1		my $self = shift;
254	0						my %args = (
255							urls=>undef,
256							content=>'',
257							@_
258							);
259	0						my $content = $args{content};
260	0						my $sid = $args{sid};
261	0						my @chapters = ();
262	0	0					if (defined $args{urls})
263							{
264	0						@chapters = @{$args{urls}};
	0
265							}
266	0	0	0				if (@chapters == 1
267							and $content =~ m!href="(/downloads/$sid/[^.]+\.html)!)
268							{
269	0						@chapters = ("http://archiveofourown.org$1");
270							}
271
272	0						return \@chapters;
273							} # parse_chapter_urls
274
275							=head2 parse_epub_url
276
277							Figure out the URL for the EPUB version of this story.
278
279							=cut
280							sub parse_epub_url {
281	0			0	1		my $self = shift;
282	0						my %args = (
283							url=>'',
284							content=>'',
285							@_
286							);
287	0						my $content = $args{content};
288	0						my $sid = $args{sid};
289	0						my $epub_url = '';
290	0	0					if ($content =~ m!href="(/downloads/$sid/[^.]+\.epub\?updated_at=\d+)!)
		0
291							{
292	0						$epub_url = "https://archiveofourown.org$1";
293							}
294							elsif ($content =~ m!href="(/downloads/$sid/[^.]+\.epub)!)
295							{
296	0						$epub_url = ("http://archiveofourown.org$1");
297							}
298
299	0						return $epub_url;
300							} # parse_epub_url
301
302							=head2 parse_title
303
304							Get the title.
305
306							=cut
307							sub parse_title {
308	0			0	1		my $self = shift;
309	0						my %args = @_;
310
311	0						my $content = $args{content};
312
313	0						my $title = '';
314	0	0					if ($content =~ m!<h2 class="title heading">\s([^<])</h2>!s)
		0
315							{
316	0						$title = $1;
317	0						$title =~ s/\s*$//s; # remove any trailing whitespace
318							}
319							elsif ($content =~ m!<h1[^>]>([^<])</h1>!)
320							{
321	0						$title = $1;
322							}
323							else
324							{
325	0						$title = $self->SUPER::parse_title(%args);
326							}
327	0						return $title;
328							} # parse_title
329
330							=head2 parse_author
331
332							Get the author.
333
334							=cut
335							sub parse_author {
336	0			0	1		my $self = shift;
337	0						my %args = @_;
338
339	0						my $content = $args{content};
340
341	0						my $author = '';
342	0	0					if ($content =~ m! href="/users/\w+/pseuds/\w+">([^<]+)</a>!)
		0
		0
343							{
344	0						$author = $1;
345							}
346							elsif ($content =~ m! href="/users/\w+/pseuds/[^"]+">([^<]+)</a>!)
347							{ # There might be odd characters in there
348	0						$author = $1;
349							}
350							# <div class="byline">by <a href="http://archiveofourown.org/users/SailorChibi/pseuds/SailorChibi" rel="author">SailorChibi</a></div>
351							elsif ($content =~ m! href="https?://archiveofourown\.org/users/\w+/pseuds/[^"]+" rel="author">([^<]+)</a>!)
352							{
353	0						$author = $1;
354							}
355							else
356							{
357	0						$author = $self->SUPER::parse_author(%args);
358							}
359	0						$author =~ s/_/ /g;
360	0						return $author;
361							} # parse_author
362
363							=head2 parse_summary
364
365							Get the summary.
366
367							=cut
368							sub parse_summary {
369	0			0	1		my $self = shift;
370	0						my %args = @_;
371
372	0						my $content = $args{content};
373
374	0						my $summary = '';
375	0	0					if ($content =~ m!<h3[^>]>Summary:</h3>\s<blockquote class="userstuff"><p>([^<]+)</p></blockquote>!s)
		0
		0
376							{
377							# This is a single-paragraph summary.
378	0						$summary = $1;
379							}
380							elsif ($content =~ m!<h3[^>]>Summary:</h3>\s<blockquote class="userstuff">(.*?)</blockquote>!s)
381							{
382							# This is a multi-paragraph summary, it needs to be tidied up.
383	0						$summary = $1;
384	0						$summary =~ s!<p>!!g;
385	0						$summary =~ s!</p>!!g;
386	0						$summary =~ s/^\s*//;
387	0						$summary =~ s/\s*$//;
388							}
389							elsif ($content =~ m!<p[^>]>Summary</p>\s<blockquote class="userstuff">(.*?)</blockquote>!s)
390							{
391							# This is a multi-paragraph summary, it needs to be tidied up.
392	0						$summary = $1;
393	0						$summary =~ s!<p[^>]*>! !g;
394	0						$summary =~ s!</p>!!g;
395	0						$summary =~ s/^\s*//;
396	0						$summary =~ s/\s*$//;
397							}
398							else
399							{
400	0						$summary = $self->SUPER::parse_summary(%args);
401							}
402							# AO3 tends to have messy HTML stuff stuck in the summary
403	0						$summary =~ s!<[a-zA-Z]>!!g;
404	0						$summary =~ s!</[a-zA-Z]>!!g;
405	0						$summary =~ s!&!and!g;
406	0						$summary =~ s!<[^>]+>!!g;
407	0						$summary =~ s!</[^>]+>!!g;
408	0						$summary =~ s!'!'!g;
409	0						$summary =~ s!'!'!g;
410	0						return $summary;
411							} # parse_summary
412
413							=head2 parse_wordcount
414
415							Get the wordcount.
416
417							=cut
418							sub parse_wordcount {
419	0			0	1		my $self = shift;
420	0						my %args = @_;
421
422	0						my $content = $args{content};
423
424	0						my $words = '';
425	0	0					if ($content =~ m!$(\d+) words$!m)
		0
		0
426							{
427	0						$words = $1;
428							}
429							elsif ($content =~ m!<dt class="words">Words:</dt>\s*<dd class="words">([0-9][0-9,]+)</dd>!)
430							{
431	0						$words = $1;
432	0						$words =~ s/,//;
433							}
434							elsif ($content =~ m!Words:\s*([0-9][0-9,]+)!)
435							{
436	0						$words = $1;
437	0						$words =~ s/,//;
438							}
439	0						return $words;
440							} # parse_wordcount
441
442							=head2 parse_characters
443
444							Get the characters.
445
446							=cut
447							sub parse_characters {
448	0			0	1		my $self = shift;
449	0						my %args = @_;
450
451	0						my $content = $args{content};
452
453	0						my $characters = '';
454	0	0					if ($content =~ m!<dd class="character tags">(.*?)</dd>!s)
		0
		0
455							{
456							# multiple characters inside links
457	0						my $str = $1;
458	0						my @chars = ();
459	0						while ($str =~ m!([^><]+)</a>!g)
460							{
461	0						push @chars, $1;
462							}
463	0						$characters = join(', ', @chars);
464							}
465							elsif ($content =~ m!<dt[^>]>Characters:</dt>\s<dd[^>]>(.?)</dd>!s)
466							{
467							# multiple characters inside links
468	0						my $str = $1;
469	0						my @chars = ();
470	0						while ($str =~ m!([^><]+)</a>!g)
471							{
472	0						push @chars, $1;
473							}
474	0						$characters = join(', ', @chars);
475							}
476							elsif ($content =~ m!^Characters: (.*?)$!m)
477							{
478	0						$characters = $1;
479							}
480							else
481							{
482	0						$characters = $self->SUPER::parse_characters(%args);
483							}
484							# Remove the (Universe) part of the characters
485	0						$characters =~ s!\s*$[^)]+$!!g;
486
487							# Specific character things to change
488	0						$characters =~ s!James "Bucky" Barnes!James Barnes!g;
489	0						$characters =~ s!James Bucky Barnes!James Barnes!g;
490	0						$characters =~ s!James "Rhodey" Rhodes!James Rhodes!g;
491	0						$characters =~ s!James Rhodey Rhodes!James Rhodes!g;
492	0						$characters =~ s!You!U!g;
493	0						$characters =~ s!Dummy!Dum-E!g;
494
495	0						return $characters;
496							} # parse_characters
497
498							=head2 parse_universe
499
500							Get the universe.
501
502							=cut
503							sub parse_universe {
504	0			0	1		my $self = shift;
505	0						my %args = @_;
506
507	0						my $content = $args{content};
508
509	0						my $universe = '';
510	0	0					if ($content =~ m!<dd class="fandom tags">(.*?)</dd>!s)
		0
511							{
512							# multiple fandoms inside links
513	0						my $str = $1;
514	0						my @univ = ();
515	0						while ($str =~ m!([^><]+)</a>!g)
516							{
517	0						push @univ, $1;
518							}
519	0						$universe = join(', ', @univ);
520							}
521							elsif ($content =~ m!<dt[^>]>Fandoms:</dt>\s<dd[^>]>(.?)</dd>!s)
522							{
523							# multiple fandoms inside links
524	0						my $str = $1;
525	0						my @univ = ();
526	0						while ($str =~ m!([^><]+)</a>!g)
527							{
528	0						push @univ, $1;
529							}
530	0						$universe = join(', ', @univ);
531							}
532							else
533							{
534	0						$universe = $self->SUPER::parse_universe(%args);
535							}
536
537							# Minor adjustments to AO3 tags
538	0	0					if ($universe =~ m!Harry Potter - J\. K\. Rowling!)
		0
		0
		0
539							{
540	0						$universe =~ s/\s-\sJ\. K\. Rowling//;
541							}
542							elsif ($universe =~ m!(Doctor Who)!)
543							{
544	0						$universe = $1;
545							}
546							elsif ($universe =~ m!Blake&#39;s 7!)
547							{
548	0						$universe = 'Blakes 7';
549							}
550							elsif ($universe =~ m!(Marvel Cinematic Universe\|Avengers\|Iron Man\|Captain America)!)
551							{
552	0						$universe = 'MCU';
553							}
554	0						return $universe;
555							} # parse_universe
556
557							=head2 parse_category
558
559							Get the category.
560
561							=cut
562							sub parse_category {
563	0			0	1		my $self = shift;
564	0						my %args = @_;
565
566	0						my $content = $args{content};
567
568	0						my $category = '';
569	0	0					if ($content =~ m!<dd class="freeform tags">(.*?)</dd>!s)
		0
		0
570							{
571							# multiple categories inside links
572	0						my $str = $1;
573	0						my @cats = ();
574	0						while ($str =~ m!([^><]+)</a>!g)
575							{
576	0						push @cats, $1;
577							}
578	0						$category = join(', ', @cats);
579							}
580							elsif ($content =~ m!<dt[^>]>Additional Tags:</dt>\s<dd[^>]>(.?)</dd>!s)
581							{
582							# multiple categories inside links
583	0						my $str = $1;
584	0						my @cats = ();
585	0						while ($str =~ m!([^><]+)</a>!g)
586							{
587	0						push @cats, $1;
588							}
589	0						$category = join(', ', @cats);
590							}
591							elsif ($content =~ m!Additional Tags:\s</dt>\s<dd class="freeform tags">\s<ul[^>]>\s(.?)\s*</ul>!s)
592							{
593	0						my $categories = $1;
594	0						my @cats = split(/<li>/, $categories);
595	0						my @categories = ();
596	0						foreach my $cat (@cats)
597							{
598	0	0					if ($cat =~ m!class="tag">([^<]+)</a>!)
599							{
600	0						push @categories, $1;
601							}
602							}
603	0						$category = join(', ', @categories);
604							}
605							else
606							{
607	0						$category = $self->SUPER::parse_category(%args);
608							}
609
610							# Also add the "relationship tags", if any, to the categories
611	0	0	0				if ($content =~ m!<dd class="relationship tags">(.*?)</dd>!s
612							or $content =~ m!<dt[^>]>Relationships?:</dt>\s<dd[^>]>(.?)</dd>!s)
613							{
614	0						my $str = $1;
615	0						my @cats = ($category);
616	0						while ($str =~ m!([^><]+)</a>!g)
617							{
618	0						my $rawrel = $1;
619	0						my $rel = $rawrel;
620	0	0					if ($rawrel =~ m!/!)
		0
621							{
622	0						$rawrel =~ s!/!-!g;
623	0						$rel = "${rawrel} Romance";
624							}
625							elsif ($rawrel =~ m!\&!)
626							{
627	0						$rawrel =~ s!\s\&\s!-!g;
628	0						$rel = "${rawrel} Friendship";
629							}
630	0						$rel =~ s!\s*$[^)]+$!!g; # remove universe if there is one there
631	0						$rel =~ s!James "Bucky" Barnes!James Barnes!g;
632	0						$rel =~ s!James "Rhodey" Rhodes!James Rhodes!g;
633	0						$rel =~ s!You!U!g;
634	0						$rel =~ s!Dummy!Dum-E!g;
635	0						push @cats, $rel;
636							}
637	0						$category = join(', ', @cats);
638							}
639
640	0						return $category;
641							} # parse_category
642
643							=head2 parse_rating
644
645							Get the rating from the content
646
647							=cut
648							sub parse_rating {
649	0			0	1		my $self = shift;
650	0						my %args = (
651							content=>'',
652							@_
653							);
654
655	0						my $content = $args{content};
656	0						my $rating = '';
657	0	0					if ($content =~ m!<dd class="rating tags">(.*?)</dd>!s)
		0
658							{
659							# rating inside links
660	0						my $str = $1;
661	0						while ($str =~ m!([^><]+)</a>!g)
662							{
663	0						$rating = $1;
664							}
665							}
666							elsif ($content =~ m!<dt[^>]>Rating:</dt>\s<dd[^>]>(.?)</dd>!s)
667							{
668							# rating inside links
669	0						my $str = $1;
670	0						while ($str =~ m!([^><]+)</a>!g)
671							{
672	0						$rating = $1;
673							}
674							}
675							else
676							{
677	0						$rating = $self->SUPER::parse_rating(%args);
678							}
679	0						return $rating;
680							} # parse_rating
681
682							=head2 whoami
683
684							For debugging purposes
685
686							=cut
687	0			0	1		sub whoami { ( caller(1) )[3] }
688
689							1; # End of WWW::FetchStory::Fetcher::AO3
690							__END__