File Coverage

script/jupiter
Criterion Covered Total %
statement 327 348 93.9
branch 55 80 68.7
condition 35 44 79.5
subroutine 49 52 94.2
pod n/a
total 466 524 88.9


line stmt bran cond sub pod time code
1             #! /usr/bin/env perl
2             # Planet Jupiter is a feed aggregator that creates a single HTML file
3             # Copyright (C) 2020–2021 Alex Schroeder <alex@gnu.org>
4              
5             # This program is free software: you can redistribute it and/or modify
6             # it under the terms of the GNU Affero General Public License as published by
7             # the Free Software Foundation, either version 3 of the License, or
8             # (at your option) any later version.
9             #
10             # This program is distributed in the hope that it will be useful,
11             # but WITHOUT ANY WARRANTY; without even the implied warranty of
12             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13             # GNU Affero General Public License for more details.
14             #
15             # You should have received a copy of the GNU Affero General Public License
16             # along with this program. If not, see <https://www.gnu.org/licenses/>.
17              
18             package Jupiter;
19              
20 14     14   5103869 use Encode::Locale;
  14         61314  
  14         677  
21 14     14   103 use Encode;
  14         24  
  14         1047  
22 14     14   691 use utf8;
  14         40  
  14         92  
23              
24             binmode(STDOUT, ":utf8");
25             binmode(STDERR, ":utf8");
26              
27             =encoding UTF8
28              
29             =head1 NAME
30              
31             jupiter - turn a list of feeds into an HTML page, a river of news
32              
33             =head1 SYNOPSIS
34              
35             To update the feeds from one or more OPML files:
36              
37             B<jupiter update> I<feed.opml> … [I</regex/> …]
38              
39             To generate F<index.html>:
40              
41             B<jupiter html> I<feed.opml>
42              
43             =head1 DESCRIPTION
44              
45             Planet Jupiter is used to pull together the latest updates from a bunch of other
46             sites and display them on a single web page, the "river of news". The sites we
47             get our updates from are defined in an OPML file.
48              
49             A river of news, according to Dave Winer, is a feed aggregator. New items appear
50             at the top and old items disappear at the bottom. When it's gone, it's gone.
51             There is no count of unread items. The goal is to fight the I<fear of missing
52             out> (FOMO).
53              
54             Each item looks similar to every other: headline, link, an extract, maybe a date
55             and an author. Extracts contain only the beginning of the article's text; all
56             markup is removed; there are no images. The goal is to make the page easy to skim.
57             Scroll down until you find something interesting and follow the link to the
58             original article if you want to read it.
59              
60             =head2 The OPML file
61              
62             You B<need> an OPML file. It's an XML file linking to I<feeds>. Here's an
63             example listing just one feed. In order to add more, add more C<outline>
64             elements with the C<xmlUrl> attribute. The exact order and nesting do not
65             matter. People can I<import> these OPML files into their own feed readers, so
66             it may make sense to spend a bit more effort making the file presentable.
67              
68             <opml version="2.0">
69             <body>
70             <outline title="Alex Schroeder"
71             xmlUrl="https://alexschroeder.ch/wiki?action=rss"/>
72             </body>
73             </opml>
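
              Here is a slightly larger example with two feeds and an optional C<htmlUrl>
              attribute pointing at the human-readable site (the second feed and its URLs
              are hypothetical):

              <opml version="2.0">
              <body>
              <outline title="Alex Schroeder"
              xmlUrl="https://alexschroeder.ch/wiki?action=rss"/>
              <outline title="Example Blog"
              xmlUrl="https://example.org/feed.rss"
              htmlUrl="https://example.org/"/>
              </body>
              </opml>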
74              
75             =head2 Update the feeds in your cache
76              
77             This is how you update the feeds in a file called C<feed.opml>. It downloads all
78             the feeds linked to in the OPML file and stores them in the cache directory.
79              
80             jupiter update feed.opml
81              
82             The directory used to keep a copy of all the feeds in the OPML file has the same
83             name as the OPML file but without the .opml extension. In other words, if your
84             OPML file is called C<feed.opml> then the cache directory is called C<feed>.
85              
86             This operation takes a long time because it requests an update from all the sites
87             listed in your OPML file. Don't run it too often or you'll annoy the site
88             owners.
89              
90             The OPML file must use the .opml extension. You can update the feeds for
91             multiple OPML files in one go.
92              
93             =head2 Adding just one feed
94              
95             After a while, the list of feeds in your OPML starts getting unwieldy. When you
96             add a new feed, you might not want to fetch all of them. In this case, provide a
97             regular expression surrounded by slashes to the C<update> command:
98              
99             jupiter update feed.opml /example/
100              
101             Only the feeds in your OPML file whose URL or title matches the regular
102             expression are updated.
103              
104             There is no need to escape slashes in the regular expression: C<//rss/> works
105             just fine. Beware shell escaping, however. Most likely, you need to surround the
106             regular expression with single quotes if it contains spaces:
107              
108             jupiter update feed.opml '/Halberds & Helmets/'
109              
110             Notice how we assume that named entities such as C<&amp;> have already been
111             parsed into the appropriate strings.
112              
113             =head2 Generate the HTML
114              
115             This is how you generate the C<index.html> file based on the feeds of your
116             C<feed.opml>. It assumes that you have already updated all the feeds (see
117             above).
118              
119             jupiter html feed.opml
120              
121             See L</OPTIONS> for ways to change how the HTML is generated.
122              
123             =head2 Generate the RSS feed
124              
125             This happens at the same time as when you generate the HTML. It takes all the
126             entries that are being added to the HTML and puts them into a feed.
127              
128             See L</OPTIONS> for ways to change how the feed is generated.
129              
130             =head2 Why separate the two steps?
131              
132             The first reason is that tinkering with the templates involves running the
133             program again and again, and you don't want to contact all the sites whenever
134             you update your templates.
135              
136             The other reason is that it allows you to create subsets. For example, you can
137             fetch the feeds for three different OPML files:
138              
139             jupiter update osr.opml indie.opml other.opml
140              
141             And then you can create three different HTML files:
142              
143             jupiter html osr.html osr.opml
144             jupiter html indie.html indie.opml
145             jupiter html rpg.html osr.opml indie.opml other.opml
146              
147             For an example of how it might look, check out the setup for the planets I run.
148             L<https://alexschroeder.ch/cgit/planet/about/>
149              
150             =head2 What about the JSON file?
151              
152             There's a JSON file that gets generated and updated as you run Planet Jupiter.
153             Its name depends on the OPML files used. It records metadata for every feed in
154             the OPML file that isn't stored in the feeds themselves.
155              
156             C<message> is the HTTP status message, or a similar message such as "No entry
157             newer than 90 days." This is set when you update the feeds in your cache.
158              
159             C<code> is the HTTP status code; this code could be the real status code from
160             the server (such as 404 for a "not found" status) or one generated by Jupiter
161             such that it matches the status message (such as 206 for a "partial content"
162             status when there aren't any recent entries in the feed). This is set when you
163             update the feeds in your cache.
164              
165             C<title> is the site's title. When you update the feeds in your cache, it is
166             taken from the OPML file. That's how the feed can have a title even if the
167             download failed. When you generate the HTML, the feeds in the cache are parsed
168             and if a title is provided, it is stored in the JSON file and overrides the
169             title in the OPML file.
170              
171             C<link> is the site's link for humans. When you generate the HTML, the feeds in
172             the cache are parsed and if a link is provided, it is stored in the JSON file.
173             If the OPML element contains a C<htmlUrl> attribute, however, that takes
174             precedence. The reasoning: if a podcast is hosted on a platform that generates
175             a link you don't like, and you know the link to the human-readable blog
176             elsewhere, use the C<htmlUrl> attribute in the OPML file to override it.
177              
178             C<last_modified> and C<etag> are two caching headers taken from the HTTP
179             response; they cannot be changed by data in the feed.
180              
181             If we run into problems downloading a feed, this setup allows us to still link
182             to the feeds that aren't working, using their correct names, and describing the
183             error we encountered.
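
              As an illustration, here is what such a JSON file might contain for a single
              feed (the URL and all values are hypothetical; entries are keyed by feed URL):

              {
                "https://example.org/feed.rss": {
                  "title": "Example Blog",
                  "link": "https://example.org/",
                  "message": "OK",
                  "code": 200,
                  "last_modified": "Sat, 01 Jan 2022 00:00:00 GMT",
                  "etag": "\"abc123\""
                }
              }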
184              
185             =head2 Logging
186              
187             Use the C<--log=LEVEL> option to set the log level. Valid values for LEVEL are debug,
188             info, warn, error, and fatal.
189              
190             =head1 LICENSE
191              
192             GNU Affero General Public License
193              
194             =head1 INSTALLATION
195              
196             Using C<cpan>:
197              
198             cpan App::jupiter
199              
200             Manual install:
201              
202             perl Makefile.PL
203             make
204             make install
205              
206             =head2 Dependencies
207              
208             To run Jupiter on Debian we need:
209              
210             C<libmodern-perl-perl> for L<Modern::Perl>
211              
212             C<libmojolicious-perl> for L<Mojo::Template>, L<Mojo::UserAgent>, L<Mojo::Log>,
213             L<Mojo::JSON>, and L<Mojo::Util>
214              
215             C<libxml-libxml-perl> for L<XML::LibXML>
216              
217             C<libfile-slurper-perl> for L<File::Slurper>
218              
219             C<libdatetime-perl> for L<DateTime>
220              
221             C<libdatetime-format-mail-perl> for L<DateTime::Format::Mail>
222              
223             C<libdatetime-format-iso8601-perl> for L<DateTime::Format::ISO8601>
224              
225             Unfortunately, L<Mojo::UserAgent::Role::Queued> isn't packaged for Debian.
226             Therefore, let's build it and install it as a Debian package.
227              
228             sudo apt-get install libmodule-build-tiny-perl
229             sudo apt-get install dh-make-perl
230             sudo dh-make-perl --build --cpan Mojo::UserAgent::Role::Queued
231             dpkg --install libmojo-useragent-role-queued-perl_1.15-1_all.deb
232              
233             To generate the C<README.md> from the source file, you need F<pod2markdown>
234             which you get in C<libpod-markdown-perl>.
235              
236             =head1 FILES
237              
238             There are a number of files in the F<share> directory which you can use as
239             starting points.
240              
241             F<template.html> is the HTML template.
242              
243             F<default.css> is a small CSS file used by F<template.html>.
244              
245             F<personalize.js> is a small JavaScript file used by F<template.html> to allow
246             visitors to jump from one article to the next using C<J> and C<K>.
247              
248             F<jupiter.png> is used by F<template.html> as the icon.
249              
250             F<jupiter.svg> is used by F<template.html> as the logo.
251              
252             F<feed.png> is used by F<template.html> as the icon for the feeds in the
253             sidebar.
254              
255             F<feed.rss> is the feed template.
256              
257             =head1 OPTIONS
258              
259             HTML generation uses a template, C<template.html>. It is written for
260             C<Mojo::Template> and you can find it in the F<share> directory of your
261             distribution. The default templates use other files, such as the logo, the feed
262             icon, a CSS file, and a small JavaScript snippet to enable navigation using the
263             C<J> and C<K> keys (see above).
264              
265             You can specify a different HTML file to generate:
266              
267             B<jupiter html> I<your.html feed.opml>
268              
269             If you specify two HTML files, the first is the HTML file to generate and the
270             second is the template to use. Both must use the C<.html> extension.
271              
272             B<jupiter html> I<your.html your-template.html feed.opml>
273              
274             Feed generation uses a template, C<feed.rss>. It writes all the entries into a
275             file called C<feed.xml>. Again, the template is written for C<Mojo::Template>.
276              
277             You can specify up to two XML, RSS, or Atom files. They must use one of these
278             three extensions: C<.xml>, C<.rss>, or C<.atom>. The first is the name of the
279             feed to generate, the second is the template to use:
280              
281             B<jupiter html> I<atom.xml template.xml planet.html template.html feed.opml>
282              
283             In the above case, Planet Jupiter will write a feed called F<atom.xml> based on
284             F<template.xml> and an HTML file called F<planet.html> based on F<template.html>,
285             using the cached entries matching the feeds in F<feed.opml>.
286              
287             =cut
288              
289 14     14   12637 use DateTime;
  14         5780885  
  14         760  
290 14     14   9842 use DateTime::Format::Mail;
  14         81529  
  14         594  
291 14     14   9431 use DateTime::Format::ISO8601;
  14         1736565  
  14         1178  
292 14     14   211 use File::Basename;
  14         35  
  14         1424  
293 14     14   782 use File::Slurper qw(read_binary write_binary read_text write_text);
  14         3316  
  14         907  
294 14     14   87 use List::Util qw(uniq min shuffle);
  14         24  
  14         973  
295 14     14   101 use Modern::Perl;
  14         25  
  14         153  
296 14     14   12902 use Mojo::Log;
  14         363759  
  14         216  
297 14     14   1210 use Mojo::JSON qw(decode_json encode_json);
  14         19092  
  14         840  
298 14     14   9084 use Mojo::Template;
  14         62865  
  14         172  
299 14     14   9570 use Mojo::UserAgent;
  14         449184  
  14         164  
300 14     14   9918 use Pod::Simple::Text;
  14         79017  
  14         621  
301 14     14   9836 use XML::LibXML;
  14         433036  
  14         135  
302 14     14   2517 use Mojo::Util qw(slugify trim xml_escape html_unescape);
  14         35  
  14         971  
303 14     14   91 use File::ShareDir 'dist_file';
  14         29  
  14         670  
304              
305 14     14   87 use vars qw($log);
  14         29  
  14         77052  
306             our $log = Mojo::Log->new;
307              
308             my $xpc = XML::LibXML::XPathContext->new;
309             $xpc->registerNs('atom', 'http://www.w3.org/2005/Atom');
310             $xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
311             $xpc->registerNs('dc', 'http://purl.org/dc/elements/1.1/');
312             $xpc->registerNs('itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd');
313              
314             my $undefined_date = DateTime->from_epoch( epoch => 0 );
315              
316             my (%wday, %month, $wday_re, $month_re);
317             %wday = qw (lun. Mon mar. Tue mer. Wed jeu. Thu ven. Fri sam. Sat dim. Sun);
318             %month = qw (janv. Jan févr. Feb mars Mar avr. Apr mai May juin Jun
319             juil. Jul août Aug sept. Sep oct. Oct nov. Nov déc. Dec);
320             $wday_re = join('|', map { quotemeta } keys %wday) unless $wday_re;
321             $month_re = join('|', map { quotemeta } keys %month) unless $month_re;
322              
323             # Our tests don't want to call main
324             __PACKAGE__->main unless caller;
325              
326             sub main {
327 0     0   0 my ($log_level) = grep /^--log=/, @ARGV;
328 0 0       0 $log->level(substr($log_level, 6)) if $log_level;
329 0         0 my ($command) = grep /^[a-z]+$/, @ARGV;
330 0   0     0 $command ||= 'help';
331 0 0       0 if ($command eq 'update') {
    0          
332 0         0 update_cache(@ARGV);
333             } elsif ($command eq 'html') {
334 0         0 make_html(@ARGV);
335             } else {
336 0         0 my $parser = Pod::Simple::Text->new();
337 0         0 $parser->parse_file($0);
338             }
339             }
340              
341             sub update_cache {
342 17     17   1159484 my ($feeds, $files) = read_opml(@_);
343 17         94 make_directories($feeds);
344 17         133 load_feed_metadata($feeds, $files);
345 17         263 my $ua = Mojo::UserAgent->new->with_roles('+Queued')
346             ->max_redirects(3)
347             ->max_active(5);
348 17         81205 make_promises($ua, $feeds);
349 17         19431 fetch_feeds($feeds);
350 17         12879 save_feed_metadata($feeds, $files);
351 17         5651 cleanup_cache($feeds);
352             }
353              
354             sub make_promises {
355 17     17   44 my $ua = shift;
356 17         34 my $feeds = shift;
357 17         51 for my $feed (@$feeds) {
358 17         103 my $url = html_unescape $feed->{url}; # undo xml_escape for the request
359             $ua->on(start => sub {
360 17     17   16220 my ($ua, $tx) = @_;
361 17 50       94 $tx->req->headers->if_none_match($feed->{etag}) if ($feed->{etag});
362 17 50       100 $tx->req->headers->if_modified_since($feed->{last_modified}) if ($feed->{last_modified});
363 17         503 });
364             $feed->{promise} = $ua->get_p($url)
365             ->catch(sub {
366 0     0   0 $feed->{message} = "@_";
367 0         0 $feed->{code} = 521;
368             # returning 0 in the case of an error is important
369 0         0 0; })
370             # sleeping to stop blogger.com from blocking us
371 17     17   241 ->finally(sub { $log->debug($url); sleep 2; });
  17         256230  
  17         34004009  
372             }
373             }
374              
375             sub fetch_feeds {
376 17     17   56 my $feeds = shift;
377 17         562 $log->info("Fetching feeds...");
378 17         103 Mojo::Promise->all(map { $_->{promise} } @$feeds)->then(sub {
379             # all returns the values in the same order!
380 17     17   14740 for (my $i = 0; $i < @_; $i++) {
381 17         81 my $feed = $feeds->[$i];
382 17         49 my $value = $_[$i];
383 17         48 my $tx = $value->[0];
384             # relies on catch returning 0 above
385 17 50       181 next unless $tx;
386 17         243 $feed->{message} = $tx->result->message;
387 17         1053 $feed->{code} = $tx->result->code;
388 17         471 $feed->{last_modified} = $tx->result->headers->last_modified;
389 17         956 $feed->{etag} = $tx->result->headers->etag;
390             # save raw bytes if this is a success
391 17 50       627 eval { write_binary($feed->{cache_file}, $tx->result->body) } if $tx->result->is_success;
  17         1133  
392 17 50       8362 warn "Unable to write $feed->{cache_file}: $@\n" if $@;
393             }
394             })->catch(sub {
395 0     0   0 warn "Something went wrong: @_";
396 17         219 })->wait;
397             }
398              
399             sub load_feed_metadata {
400 35     35   83 my $feeds = shift;
401 35         65 my $files = shift;
402 35         103 for my $file (@$files) {
403 34         143 my $filename = "$file->{path}/$file->{name}";
404 34 100       821 next unless -r "$filename.json";
405 21         230 my $data = decode_json read_binary("$filename.json");
406 21         8206 for my $feed (@$feeds) {
407 21         65 my $url = $feed->{url};
408             # don't overwrite title and htmlUrl from OPML file
409 21 50       131 $feed->{title} = $data->{$url}->{title} if $data->{$url}->{title};
410 21   100     174 $feed->{link} ||= $data->{$url}->{link};
411             # all the other metadata is loaded from the JSON file
412 21         62 $feed->{message} = $data->{$url}->{message};
413 21         79 $feed->{code} = $data->{$url}->{code};
414 21         91 $feed->{last_modified} = $data->{$url}->{last_modified};
415 21         70 $feed->{etag} = $data->{$url}->{etag};
416 21         61 } grep { $_->{opml_file} eq $file->{file} } @$feeds;
  21         127  
417             }
418             }
419              
420             sub save_feed_metadata {
421 35     35   530 my $feeds = shift;
422 35         79 my $files = shift;
423 35         115 for my $file (@$files) {
424 34         111 my $name = $file->{name};
425             my %messages = map {
426 34         70 my $feed = $_;
427 34         107 $feed->{url} => { map { $_ => $feed->{$_} } grep { $feed->{$_} } qw(title link message code last_modified etag) };
  124         498  
  204         404  
428 34         97 } grep { $_->{opml_file} eq $file->{file} } @$feeds;
  34         200  
429 34         374 write_binary("$file->{path}/$file->{name}.json", encode_json \%messages);
430             }
431             }
432              
433             sub cleanup_cache {
434 17     17   54 my $feeds = shift;
435 17         44 my %good = map { $_ => 1 } @{cache_files($feeds)};
  17         91  
  17         100  
436 17         50 my @unused = grep { not $good{$_} } @{existing_files($feeds)};
  17         90  
  17         66  
437 17 50       460 if (@unused) {
438 0         0 $log->info("Removing unused files from the cache...");
439 0         0 foreach (@unused) { $log->info($_) }
  0         0  
440 0         0 unlink @unused;
441             }
442             }
443              
444             sub existing_files {
445 17     17   44 my $feeds = shift;
446 17         38 my @files;
447 17         49 for my $dir (uniq map { $_->{cache_dir} } @$feeds) {
  17         126  
448 17         1808 push(@files, <"$dir/*">);
449             }
450 17         115 return \@files;
451             }
452              
453             sub cache_files {
454 17     17   38 my $feeds = shift;
455 17         54 my @files = map { $_->{cache_file} } @$feeds;
  17         92  
456 17         71 return \@files;
457             }
458              
459             sub make_directories {
460 17     17   36 my $feeds = shift;
461 17         48 for my $dir (uniq map { $_->{cache_dir} } @$feeds) {
  17         111  
462 17 100       620 if (not -d $dir) {
463 13         1545 mkdir $dir;
464             }
465             }
466             }
467              
468             sub make_html {
469 18     18   30384 my ($feeds, $files) = read_opml(@_);
470 18         135 load_feed_metadata($feeds, $files); # load messages and codes for feeds
471 18         81 my $globals = globals($files);
472 18         91 my $entries = entries($feeds, 4); # set data for feeds, too
473 18         95 add_data($feeds, $entries); # extract data from the xml
474 18         277 save_feed_metadata($feeds, $files); # save title and link for feeds
475 18         7709 $entries = limit($entries, 100);
476 18         95 write_text(html_file(@_), apply_template(read_text(html_template_file(@_)), $globals, $feeds, $entries));
477 18         204442 write_text(feed_file(@_), apply_template(read_text(feed_template_file(@_)), $globals, $feeds, $entries));
478             }
479              
480             sub html_file {
481 18     18   230 my ($html) = grep /\.html$/, @_;
482 18   50     171 return $html || 'index.html';
483             }
484              
485             sub html_template_file {
486 18     18   106 my ($html, $template) = grep /\.html$/, @_;
487 18   33     184 $template ||= dist_file('App-jupiter', 'template.html');
488 18 50       4834 die "HTML template $template not found\n" unless -r $template;
489 18         145 return $template;
490             }
491              
492             sub feed_file {
493 18     18   263 my ($feed) = grep /\.(xml|rss|atom)$/, @_;
494 18 50       155 return $feed if $feed;
495 0         0 return 'feed.xml';
496             }
497              
498             sub feed_template_file {
499 18     18   210 my ($feed, $template) = grep /\.(xml|rss|atom)$/, @_;
500 18 50       88 return $template if $template;
501 18         139 return dist_file('App-jupiter', 'feed.rss');
502             }
503              
504             sub apply_template {
505 36     36   9252 my $mnt = Mojo::Template->new;
506 36         517 return $mnt->render(@_);
507             }
508              
509             =head1 TEMPLATES
510              
511             The page template is called with three hash references: C<globals>, C<feeds>,
512             and C<entries>. The keys of these three hash references are documented below.
513             The values of these hashes are all I<escaped HTML> except where noted (dates and
514             file names, for example).
515              
516             The technical details of how to write the templates are documented in the man
517             page for L<Mojo::Template>.
518              
519             =head2 Globals
520              
521             There are not many global keys.
522              
523             B<date> is the publication date of the HTML page, in ISO date format:
524             YYYY-MM-DD.
525              
526             B<files> is the list of OPML files used.
527              
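              In a C<Mojo::Template> template, the globals might be used as follows (a
              minimal sketch; it assumes the template unpacks its arguments from C<@_>,
              which is how C<Mojo::Template> passes render arguments by default):

              % my ($globals, $feeds, $entries) = @_;
              <p>Generated on <%= $globals->{date} %>
              from <%= scalar @{$globals->{files}} %> OPML file(s).</p>
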
528             =cut
529              
530             sub globals {
531 18     18   44 my $files = shift;
532 18         254 my @time = gmtime;
533 18         229 my $today = DateTime->now->ymd;
534 18         8686 return {date => $today, files => $files};
535             }
536              
537             =head2 Writing templates for feeds
538              
539             Feeds have the following keys available:
540              
541             B<title> is the title of the feed.
542              
543             B<url> is the URL of the feed (RSS or Atom). This is not the link to the site!
544              
545             B<link> is the URL of the web page (HTML). This is the link to the site.
546              
547             B<opml_file> is the file name where this feed is listed.
548              
549             B<cache_dir> is the directory where this feed is cached.
550              
551             B<message> is the HTTP status message or other warning or error that we got
552             while fetching the feed.
553              
554             B<code> is the HTTP status code we got while fetching the feed.
555              
556             B<doc> is the L<XML::LibXML::Document>. Could be either Atom or RSS!
557              
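              A feed listing for a sidebar might be rendered like this (a sketch under the
              same assumption that the template unpacks C<@_>; remember that the values
              are already escaped HTML, and C<code> and C<message> are useful for flagging
              broken feeds):

              % my ($globals, $feeds, $entries) = @_;
              % for my $feed (@$feeds) {
              <a href="<%= $feed->{link} %>"><%= $feed->{title} %></a>
              (<%= $feed->{code} %> <%= $feed->{message} %>)
              % }
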
558             =cut
559              
560             # Creates list of feeds. Each feed is a hash with keys title, url, opml_file,
561             # cache_dir and cache_file.
562             sub read_opml {
563 35     35   110 my (@feeds, @files);
564 35         272 my @filters = map { decode(locale => substr($_, 1, -1)) } grep /^\/.*\/$/, @_;
  2         18  
565 35         510 for my $file (grep /\.opml$/, @_) {
566 35         435 my $doc = XML::LibXML->load_xml(location => $file); # this better have no errors!
567 35         18355 my @nodes = $doc->findnodes('//outline[./@xmlUrl]');
568 35         5452 my ($name, $path) = fileparse($file, '.opml', '.xml');
569             push @feeds, map {
570 35         152 my $title = xml_escape $_->getAttribute('title');
  35         227  
571 35         1118 my $url = xml_escape $_->getAttribute('xmlUrl');
572 35 100 100     660 next if @filters > 0 and not grep { $url =~ /$_/ or $title =~ /$_/ } @filters;
  2 100       62  
573 34         114 my $link = xml_escape $_->getAttribute('htmlUrl');
574             {
575 34         686 title => $title, # title in the OPML file
576             url => $url, # feed URL in the OPML file
577             link => $link, # web URL in the OPML file
578             opml_file => $file,
579             cache_dir => "$path/$name",
580             cache_file => "$path/$name/" . slugify($url),
581             }
582             } @nodes;
583 34 50       1878 warn "No feeds found in the OPML file $file\n" unless @nodes;
584 34         345 push @files, { file => $file, path => $path, name => $name };
585             }
586 35         1806 @feeds = shuffle @feeds;
587 35         136 return \@feeds, \@files;
588             }
589              
590             sub entries {
591 18     18   38 my $feeds = shift;
592 18         35 my $limit = shift;
593 18         77 my $date = DateTime->now(time_zone => 'UTC')->subtract( days => 90 ); # compute once
594 18         31634 my @entries;
595 18         62 for my $feed (@$feeds) {
596 17 50       666 next unless -r $feed->{cache_file};
597 17         60 my $doc = eval { XML::LibXML->load_xml(recover => 2, location => $feed->{cache_file} )};
  17         198  
598 17 50       8524 if (not $doc) {
599 0         0 $feed->{message} = xml_escape "Parsing error: $@";
600 0         0 $feed->{code} = 422; # unprocessable
601 0         0 next;
602             }
603 17         167 $feed->{doc} = $doc;
604 17         139 my @nodes = $xpc->findnodes("/rss/channel/item | /atom:feed/atom:entry", $doc);
605 17 100       1315 if (not @nodes) {
606 2         7 $feed->{message} = "Empty feed";
607 2         5 $feed->{code} = 204; # no content
608 2         6 next;
609             }
610             # if this is an Atom feed, we need to sort the entries ourselves (older entries at the end)
611             my @candidates = map {
612 15         45 my $entry = {};
  34         70  
613 34         86 $entry->{element} = $_;
614 34         105 $entry->{id} = id($_);
615 34   66     114 $entry->{date} = updated($_) || $undefined_date;
616 34         851 $entry;
617             } @nodes;
618 15         113 @candidates = unique(sort { DateTime->compare( $b->{date}, $a->{date} ) } @candidates);
  30         1522  
619 15         203 @candidates = @candidates[0 .. min($#candidates, $limit - 1)];
620             # now that we have limited the candidates, let's add more metadata from the feed
621 15         46 for my $entry (@candidates) {
622 29         54 $entry->{feed} = $feed;
623             # these two are already escaped
624 29         67 $entry->{blog_title} = $feed->{title};
625 29         65 $entry->{blog_url} = $feed->{url};
626             }
627 15         76 add_age_warning($feed, \@candidates, $date);
628 15         441 push @entries, @candidates;
629             }
630 18         242 return \@entries;
631             }
632              
633             sub add_age_warning {
634 15     15   37 my $feed = shift;
635 15         23 my $entries = shift;
636 15         34 my $date = shift;
637             # case 1: the feed's modification date is earlier than the date given
638 15         76 my ($node) = $xpc->findnodes("/rss/channel | /atom:feed", $feed->{doc});
639 15         758 my $feed_date = updated($node);
640 15 100 100     314 if ($feed_date and DateTime->compare($feed_date, $date) == -1) {
641 10         945 $feed->{message} = "No feed updates in 90 days";
642 10         25 $feed->{code} = 206; # partial content
643 10         106 return;
644             } else {
645             # case 2: no entry has a modification date equal to or later than the date given
646 5         319 for my $entry (@$entries) {
647 11 100       368 return if DateTime->compare($entry->{date}, $date) >= 0;
648             }
649 4         327 $feed->{message} = "No entry newer than 90 days";
650 4         19 $feed->{code} = 206; # partial content
651             }
652             }
653              
654             sub updated {
655 106     106   27275 my $node = shift;
656 106 50       448 return unless $node;
657 106 100       857 my @nodes = $xpc->findnodes('pubDate | atom:updated', $node) or return;
658 99         5948 my $date = $nodes[0]->textContent;
659             my $dt = eval { DateTime::Format::Mail->parse_datetime($date) }
660             || eval { DateTime::Format::ISO8601->parse_datetime($date) }
661 99   100     205 || eval { DateTime::Format::Mail->parse_datetime(french($date)) };
662 99         73299 return $dt;
663             }
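The chained C<eval { … } || eval { … }> in C<updated> tries one date format after another and keeps the first parser that succeeds. A self-contained sketch of the same fallback pattern, using only the core Time::Piece module so it runs without the DateTime::Format::Mail and DateTime::Format::ISO8601 dependencies of the real code:

```perl
#!/usr/bin/env perl
use strict;
use warnings;
use Time::Piece;

# Try an RFC 2822 style date first, then ISO 8601, as updated() does above;
# strptime croaks on a mismatch, so the first eval that survives wins.
sub parse_date {
  my $date = shift;
  return eval { Time::Piece->strptime($date, '%a, %d %b %Y %H:%M:%S %z') }
      || eval { Time::Piece->strptime($date, '%Y-%m-%dT%H:%M:%S') };
}

print parse_date('2021-06-01T12:00:00')->ymd, "\n";  # 2021-06-01
```

The real code adds a third fallback that translates French day and month names before retrying the RFC 2822 parser.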
664              
665             sub french {
666 60     60   124 my $date = shift;
667 60         849 $date =~ s/^($wday_re)/$wday{$1}/;
668 60         733 $date =~ s/\b($month_re)/$month{$1}/;
669 60         332 return $date;
670             }
671              
672             sub id {
673 34     34   59 my $node = shift;
674 34 50       81 return unless $node;
675 34         267 my $id = $xpc->findvalue('guid | atom:id', $node); # id is mandatory for Atom
676 34   100     3430 $id ||= $node->findvalue('link'); # one of the following three is mandatory for RSS
677 34   100     1861 $id ||= $node->findvalue('title');
678 34   100     1380 $id ||= $node->findvalue('description');
679 34         481 return $id;
680             }
681              
682             sub unique {
683 33     33   584 my %seen;
684             my @unique;
685 33         94 for my $node (@_) {
686 63 100       182 next if $seen{$node->{id}};
687 59         150 $seen{$node->{id}} = 1;
688 59         117 push(@unique, $node);
689             }
690 33         157 return @unique;
691             }
692              
693             sub limit {
694 18     18   50 my $entries = shift;
695 18         42 my $limit = shift;
696             # we want the most recent entries overall
697 18         75 @$entries = sort { DateTime->compare( $b->{date}, $a->{date} ) } unique(@$entries);
  18         795  
698 18         617 return [@$entries[0 .. min($#$entries, $limit - 1)]];
699             }
700              
701             =head2 Writing templates for entries
702              
703             Entries have the following keys available:
704              
705             B<title> is the title of the post.
706              
707             B<link> is the URL to the post on the web (probably an HTML page).
708              
709             B<blog_title> is the title of the site.
710              
711             B<blog_link> is the URL for the site on the web (probably an HTML page).
712              
713             B<blog_url> is the URL for the site's feed (RSS or Atom).
714              
715             B<authors> are the authors (or the Dublin Core contributor), a list of strings.
716              
717             B<date> is the publication date, as a DateTime object.
718              
719             B<day> is the publication date in ISO format (YYYY-MM-DD), in the UTC
720             timezone. UTC is used so that the day doesn't jump back and forth between
721             timezones when entries are sorted by date.
722              
723             B<content> is the full post content, as plain text or encoded HTML.
724              
725             B<excerpt> is the post content, limited to 500 characters, with paragraph
726             separators instead of HTML elements, as HTML. It is not encoded because it is
727             only ever added to the HTML page, never to a feed, and the HTML it contains
728             is tightly controlled: just a pilcrow sign inside a span to indicate
729             paragraph breaks.
730              
731             B<categories> are the categories, a list of strings.
732              
733             B<element> is for internal use only. It contains the L<XML::LibXML::Element>
734             object. This could be RSS or Atom!
735              
736             B<feed> is for internal use only. It's a reference to the feed this entry
737             belongs to.
738              
739             =cut
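For illustration, an entry-template fragment using these keys. The syntax shown is Mojo::Template's (C<< <%= … %> >> for escaped output, C<< <%== … %> >> for raw output); the template engine itself is not visible in this excerpt, so treat the exact syntax as an assumption. Raw output is used for values that, per the list above, arrive already escaped:

```
<div class="post">
  <h3><a href="<%== $entry->{link} %>"><%== $entry->{title} %></a></h3>
  <p class="excerpt"><%== $entry->{excerpt} %></p>
  <p class="meta">
    <a href="<%== $entry->{blog_link} %>"><%== $entry->{blog_title} %></a>,
    <%== $entry->{day} %>
  </p>
</div>
```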
740              
741             sub add_data {
742 18     18   40 my $feeds = shift;
743 18         41 my $entries = shift;
744             # A note on the use of xml_escape: whenever we get data from the feed itself,
745             # it needs to be escaped if it gets printed into the HTML. For example: the
746             # feed contains a feed title of "Foo &amp; Bar". findvalue returns "Foo &
747             # Bar". When the template inserts the title, however, we want "Foo &amp; Bar",
748             # not "Foo & Bar". Thus: any text we get from the feed needs to be escaped
749             # if there's a chance we're going to print it again.
750 18         70 for my $feed (@$feeds) {
751 17 50       69 next unless $feed->{doc};
752             # title and url in the feed override title and xmlUrl set in the OPML (XML already escaped)
753 17   0     181 $feed->{title} = xml_escape($xpc->findvalue('/rss/channel/title | /atom:feed/atom:title', $feed->{doc})) || $feed->{title} || "";
754 17   50     1942 $feed->{url} = xml_escape($xpc->findvalue('/atom:feed/atom:link[@rel="self"]/@href', $feed->{doc})) || $feed->{url} || "";
755             # link in the feed does not override htmlUrl in the OPML (XML already escaped)
756 17   100     1584 $feed->{link} = $feed->{link} || xml_escape($xpc->findvalue('/rss/channel/link | /atom:feed/atom:link[@rel="alternate"][@type="text/html"]/@href', $feed->{doc})) || "";
757             # if they just pasted a domain "foo" then "//foo" is a valid URL
758 17 100       1045 $feed->{link} = "//" . $feed->{link} unless $feed->{link} =~ /\/\//;
759             }
760 18         252 for my $entry (@$entries) {
761             # copy from the feed (XML is already escaped)
762 29         214 $entry->{blog_link} = $entry->{feed}->{link};
763 29         73 $entry->{blog_title} = $entry->{feed}->{title};
764 29         56 $entry->{blog_url} = $entry->{feed}->{url};
765             # parse the elements
766 29         56 my $element = $entry->{element};
767 29   100     98 $entry->{title} = xml_escape $xpc->findvalue('title | atom:title', $element) || "Untitled";
768 29         2422 my @links = map { xml_escape $_->to_literal } map { $xpc->findnodes($_, $element) }
  16         504  
  87         2649  
769             # sorted by preferences!
770             qw(link atom:link[@rel="alternate"][@type="text/html"]/@href atom:link/@href);
771 29   100     726 $entry->{link} = shift(@links) || "";
772 29         218 my @authors = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes(
  7         255  
773             'author | atom:author/atom:name | atom:contributor/atom:name | dc:creator | dc:contributor', $element);
774 29 100       1207 @authors = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes(
  10         512  
775             '/atom:feed/atom:author/atom:name | '
776             . '/atom:feed/atom:contributor/atom:name | '
777             . '/rss/channel/dc:creator | '
778             . '/rss/channel/dc:contributor | '
779             . '/rss/channel/webMaster ', $element) unless @authors;
780 29 100       837 $entry->{authors} = @authors ? \@authors : undef; # key must exist in the hash
781 29 100       272 if (DateTime->compare($entry->{date}, $undefined_date) == 0) {
782 5         404 $entry->{day} = "(no date found)";
783             } else {
784 24         2146 $entry->{day} = $entry->{date}->clone->set_time_zone('UTC')->ymd; # operate on a clone
785             }
786 29         4430 my @categories = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes('category | atom:category/@term', $element);
  6         278  
787 29 100       1069 $entry->{categories} = @categories ? \@categories : undef; # key must exist in the hash
788 29         150 $entry->{excerpt} = '';
789 29         73 $entry->{content} = '';
790 29         86 my @nodes = $xpc->findnodes('description[text()!=""] | atom:content[text()!=""]', $element);
791 29 100       1235 @nodes = $xpc->findnodes('summary[text()!=""] | atom:summary[text()!=""] | itunes:summary[text()!=""]', $element) unless @nodes;
792 29         378 my $content = shift(@nodes);
793             # The default is that the content is either plain text or escaped HTML,
794             # which is what we want for RSS. For Atom feeds, type="xhtml" means that the
795             # content is XHTML, so it needs to be escaped.
796 29 100       96 if ($content) {
797 23   100     315 my $is_xhtml = $content->hasAttribute("type") && $content->getAttribute("type") eq "xhtml";
798 23         267 $entry->{excerpt} = excerpt($content->to_literal);
799 23         157 for my $child ($content->childNodes()) {
800 27         745 my $c = $child->toString();
801 27 100       106 $c = xml_escape $c if $is_xhtml;
802 27         276 $entry->{content} .= $c;
803             }
804             }
805             }
806             }
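The escaping rule described in the comment at the top of C<add_data> is worth seeing concretely: C<findvalue> returns decoded text ("Foo & Bar"), which must be re-escaped before it is printed into the HTML page. C<xml_escape_demo> below is a hypothetical stand-in for the real C<xml_escape> (which is defined elsewhere in the program), showing just the round trip:

```perl
#!/usr/bin/env perl
use strict;
use warnings;

# Hypothetical stand-in for xml_escape: re-escape the XML special
# characters in decoded feed text before printing it into HTML.
sub xml_escape_demo {
  my $str = shift;
  $str =~ s/&/&amp;/g;   # must come first, or we escape our own escapes
  $str =~ s/</&lt;/g;
  $str =~ s/>/&gt;/g;
  return $str;
}

# The feed said "Foo &amp; Bar"; findvalue decoded it to "Foo & Bar";
# printing it into HTML requires escaping it again.
print xml_escape_demo('Foo & Bar'), "\n";  # Foo &amp; Bar
```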
807              
808             sub excerpt {
809 23     23   62 my $content = shift;
810 23 50       69 return '(no excerpt)' unless $content;
811 23         36 my $doc = eval { XML::LibXML->load_html(recover => 2, string => $content) };
  23         119  
812 23         6956 my $separator = "¶";
813 23         109 for my $node ($doc->findnodes('//style')) {
814 1         60 $node->parentNode->removeChild($node);
815             }
816 23         937 for my $node ($doc->findnodes('//p | //br | //blockquote | //li | //td | //th | //div')) {
817 23         957 $node->appendTextNode($separator);
818             }
819 23         81 my $text = strip_html($doc->textContent());
820 23         403 $text =~ s/( +|----+)/ /g;
821             # collapse whitespace and trim
822 23         290 $text =~ s/\s+/ /g;
823 23         153 $text = trim $text;
824             # replace paragraph repeats with their surrounding spaces
825 23         502 $text =~ s/ ?¶( ?¶)* ?/¶/g;
826 23         69 $text =~ s/^¶//;
827 23         106 $text =~ s/¶$//;
828 23         71 my $len = length($text);
829 23         85 $text = substr($text, 0, 500);
830 23 50       80 $text .= "…" if $len > 500;
831 23         78 $text = xml_escape $text;
832 23         279 $text =~ s/¶/<span class="paragraph">¶ <\/span>/g;
833 23         142 return $text;
834             }
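The substitutions in C<excerpt> form a small text pipeline: collapse whitespace, merge runs of pilcrow markers, strip markers at the edges, then truncate to 500 characters with an ellipsis. A self-contained sketch of just that pipeline (no XML parsing, hypothetical input string):

```perl
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;

sub tidy_excerpt {
  my $text = shift;
  $text =~ s/\s+/ /g;                # collapse whitespace
  $text =~ s/^\s+|\s+$//g;           # trim
  $text =~ s/ ?¶( ?¶)* ?/¶/g;        # merge repeated paragraph marks
  $text =~ s/^¶//;                   # no mark at the start…
  $text =~ s/¶$//;                   # …or at the end
  my $len = length $text;
  $text = substr($text, 0, 500);
  $text .= "…" if $len > 500;        # ellipsis only if we actually cut
  return $text;
}

binmode(STDOUT, ':utf8');
print tidy_excerpt("¶ First paragraph. ¶ ¶ Second   paragraph. ¶"), "\n";
# First paragraph.¶Second paragraph.
```

The real function then XML-escapes the result and replaces each remaining pilcrow with a styled C<span>.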
835              
836             # When a value that's supposed to be plain text isn't, we can try to parse
837             # it as HTML and take the text content from that. This is an ugly hack and
838             # I wish it weren't necessary.
839             sub strip_html {
840 46     46   582 my $str = shift;
841 46 50       118 return '' unless $str;
842 46         66 my $doc = eval { XML::LibXML->load_html(string => $str) };
  46         162  
843 46 100       12223 return $str unless $doc;
844 44         728 return $doc->textContent();
845             }
846              
847             =head1 SEE ALSO
848              
849             OPML 2.0, L<http://dev.opml.org/spec2.html>
850              
851             RSS 2.0, L<https://cyber.harvard.edu/rss/rss.html>
852              
853             Atom Syndication, L<https://tools.ietf.org/html/rfc4287>
854              
855             River of News,
856             L<http://scripting.com/2014/06/02/whatIsARiverOfNewsAggregator.html>
857              
858             =cut
859              
860             1;