File Coverage

script/jupiter

Criterion	Covered	Total	%
statement	330	351	94.0
branch	55	80	68.7
condition	35	44	79.5
subroutine	49	52	94.2
pod			n/a
total	469	527	88.9

line	stmt	bran	cond	sub	time	code
1						#! /usr/bin/env perl
2						# Planet Jupiter is a feed aggregator that creates a single HTML file
3						# Copyright (C) 2020–2021 Alex Schroeder <alex@gnu.org>
4
5						# This program is free software: you can redistribute it and/or modify
6						# it under the terms of the GNU Affero General Public License as published by
7						# the Free Software Foundation, either version 3 of the License, or
8						# (at your option) any later version.
9						#
10						# This program is distributed in the hope that it will be useful,
11						# but WITHOUT ANY WARRANTY; without even the implied warranty of
12						# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13						# GNU Affero General Public License for more details.
14						#
15						# You should have received a copy of the GNU Affero General Public License
16						# along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18						package Jupiter;
19
20	15			15	5546999	use Encode::Locale;
	15				65577
	15				766
21	15			15	139	use Encode;
	15				43
	15				1132
22	15			15	702	use utf8;
	15				45
	15				94
23
24						binmode(STDOUT, ":utf8");
25						binmode(STDERR, ":utf8");
26
27						=encoding UTF8
28
29						=head1 NAME
30
31						jupiter - turn a list of feeds into a HTML page, a river of news
32
33						=head1 SYNOPSIS
34
35						To update the feeds from one or more OPML files:
36
37						B<jupiter update> I<feed.opml> … [I</regex/> …]
38
39						To generate F<index.html>:
40
41						B<jupiter html> I<feed.opml>
42
43						=head1 DESCRIPTION
44
45						Planet Jupiter is used to pull together the latest updates from a bunch of other
46						sites and display them on a single web page, the "river of news". The sites we
47						get our updates from are defined in an OPML file.
48
49						A river of news, according to Dave Winer, is a feed aggregator. New items appear
50						at the top and old items disappear at the bottom. When it's gone, it's gone.
51						There is no count of unread items. The goal is to fight the I<fear of missing
52						out> (FOMO).
53
54						Each item looks similar to every other: headline, link, an extract, maybe a date
55						and an author. Extracts contain but the beginning of the article's text; all
56						markup is removed; no images. The goal is to make it the page easy to skim.
57						Scroll down until you find something interesting and follow the link to the
58						original article if you want to read it.
59
60						=head2 The OPML file
61
62						You B<need> an OPML file. It's an XML file linking to I<feeds>. Here's an
63						example listing just one feed. In order to add more, add more C<outline>
64						elements with the C<xmlUrl> attribute. The exact order and nesting does not
65						matter. People can I<import> these OPML files into their own feed readers and
66						thus it may make sense to spend a bit more effort in making it presentable.
67
68						<opml version="2.0">
69						<body>
70						<outline title="Alex Schroeder"
71						xmlUrl="https://alexschroeder.ch/wiki?action=rss"/>
72						</body>
73						</opml>
74
75						=head2 Update the feeds in your cache
76
77						This is how you update the feeds in a file called C<feed.opml>. It downloads all
78						the feeds linked to in the OPML file and stores them in the cache directory.
79
80						jupiter update feed.opml
81
82						The directory used to keep a copy of all the feeds in the OPML file has the same
83						name as the OPML file but without the .opml extension. In other words, if your
84						OPML file is called C<feed.opml> then the cache directory is called C<feed>.
85
86						This operation takes long because it requests an update from all the sites
87						listed in your OPML file. Don't run it too often or you'll annoy the site
88						owners.
89
90						The OPML file must use the .opml extension. You can update the feeds for
91						multiple OPML files in one go.
92
93						=head2 Adding just one feed
94
95						After a while, the list of feeds in your OPML starts getting unwieldy. When you
96						add a new feed, you might not want to fetch all of them. In this case, provide a
97						regular expression surrounded by slashes to the C<update> command:
98
99						jupiter update feed.opml /example/
100
101						Assuming a feed with a URL or title that matches the regular expression is
102						listed in your OPML file, only that feed is going to get updated.
103
104						There is no need to escape slashes in the regular expression: C<//rss/> works
105						just fine. Beware shell escaping, however. Most likely, you need to surround the
106						regular expression with single quotes if it contains spaces:
107
108						jupiter update feed.opml '/Halberds & Helmets/'
109
110						Notice how we assume that named entities such as C<&> have already been
111						parsed into the appropriate strings.
112
113						=head2 Generate the HTML
114
115						This is how you generate the C<index.html> file based on the feeds of your
116						C<feed.opml>. It assumes that you have already updated all the feeds (see
117						above).
118
119						jupiter html feed.opml
120
121						See L</OPTIONS> for ways to change how the HTML is generated.
122
123						=head2 Generate the RSS feed
124
125						This happens at the same time as when you generate the HTML. It takes all the
126						entries that are being added to the HTML and puts the into a feed.
127
128						See L</OPTIONS> for ways to change how the HTML is generated.
129
130						=head2 Why separate the two steps?
131
132						The first reason is that tinkering with the templates involves running the
133						program again and again, and you don't want to contact all the sites whenever
134						you update your templates.
135
136						The other reason is that it allows you to create subsets. For example, you can
137						fetch the feeds for three different OPML files:
138
139						jupiter update osr.opml indie.opml other.opml
140
141						And then you can create three different HTML files:
142
143						jupiter html osr.html osr.opml
144						jupiter html indie.html indie.opml
145						jupiter html rpg.html osr.opml indie.opml other.opml
146
147						For an example of how it might look, check out the setup for the planets I run.
148						L<https://alexschroeder.ch/cgit/planet/about/>
149
150						=head2 What about the JSON file?
151
152						There's a JSON file that gets generated and updated as you run Planet Jupiter.
153						It's name depends on the OPML files used. It records metadata for every feed in
154						the OPML file that isn't stored in the feeds themselves.
155
156						C<message> is the HTTP status message, or a similar message such as "No entry
157						newer than 90 days." This is set when update the feeds in your cache.
158
159						C<message> is the HTTP status code; this code could be the real status code from
160						the server (such as 404 for a "not found" status) or one generated by Jupiter
161						such that it matches the status message (such as 206 for a "partial content"
162						status when there aren't any recent entries in the feed). This is set when
163						update the feeds in your cache.
164
165						C<title> is the site's title. When you update the feeds in your cache, it is
166						taken from the OPML file. That's how the feed can have a title even if the
167						download failed. When you generate the HTML, the feeds in the cache are parsed
168						and if a title is provided, it is stored in the JSON file and overrides the
169						title in the OPML file.
170
171						C<link> is the site's link for humans. When you generate the HTML, the feeds in
172						the cache are parsed and if a link is provided, it is stored in the JSON file.
173						If the OPML element contained a C<htmlURL> attribute, however, that takes
174						precedence. The reasoning is that when a podcast is hosted on a platform which
175						generates a link that you don't like and you know the link to the human-readable
176						blog elsehwere, use the C<htmlURL> attribute in the OPML file to override this.
177
178						C<last_modified> and C<etag> are two headers used for caching from the HTTP
179						response that cannot be changed by data in the feed.
180
181						If we run into problems downloading a feed, this setup allows us to still link
182						to the feeds that aren't working, using their correct names, and describing the
183						error we encountered.
184
185						=head2 Logging
186
187						Use the C<--log=LEVEL> to set the log level. Valid values for LEVEL are debug,
188						info, warn, error, and fatal.
189
190						=head1 LICENSE
191
192						GNU Affero General Public License
193
194						=head1 INSTALLATION
195
196						Using C<cpan>:
197
198						cpan App::jupiter
199
200						Manual install:
201
202						perl Makefile.PL
203						make
204						make install
205
206						=head2 Dependencies
207
208						To run Jupiter on Debian we need:
209
210						C<libmodern-perl-perl> for L<Modern::Perl>
211
212						C<libmojolicious-perl> for L<Mojo::Template>, L<Mojo::UserAgent>, L<Mojo::Log>,
213						L<Mojo::JSON>, and L<Mojo::Util>
214
215						C<libxml-libxml-perl> for L<XML::LibXML>
216
217						C<libfile-slurper-perl> for L<File::Slurper>
218
219						C<libdatetime-perl> for L<DateTime>
220
221						C<libdatetime-format-mail-perl> for L<DateTime::Format::Mail>
222
223						C<libdatetime-format-iso8601-perl> for L<DateTime::Format::ISO8601>
224
225						Unfortunately, L<Mojo::UserAgent::Role::Queued> isn't packaged for Debian.
226						Therefore, let's build it and install it as a Debian package.
227
228						sudo apt-get install libmodule-build-tiny-perl
229						sudo apt-get install dh-make-perl
230						sudo dh-make-perl --build --cpan Mojo::UserAgent::Role::Queued
231						dpkg --install libmojo-useragent-role-queued-perl_1.15-1_all.deb
232
233						To generate the C<README.md> from the source file, you need F<pod2markdown>
234						which you get in C<libpod-markdown-perl>.
235
236						=head1 FILES
237
238						There are a number of files in the F<share> directory which you can use as
239						starting points.
240
241						F<template.html> is the HTML template.
242
243						F<default.css> is a small CSS file used by F<template.html>.
244
245						F<personalize.js> is a small Javascript file used by F<template.html> used to
246						allow visitors to jump from one article to the next using C<J> and C<K>.
247
248						F<jupiter.png> is used by F<template.html> as the icon.
249
250						F<jupiter.svg> is used by F<template.html> as the logo.
251
252						F<feed.png> is used by F<template.html> as the icon for the feeds in the
253						sidebar.
254
255						F<feed.rss> is the feed template.
256
257						=head1 OPTIONS
258
259						HTML generation uses a template, C<template.html>. It is written for
260						C<Mojo::Template> and you can find it in the F<share> directory of your
261						distribution. The default templates use other files, such as the logo, the feed
262						icon, a CSS file, and a small Javascript snippet to enable navigation using the
263						C<J> and C<K> keys (see above).
264
265						You can specify a different HTML file to generate:
266
267						B<jupiter html> I<your.html feed.opml>
268
269						If you specify two HTML files, the first is the HTML file to generate and the
270						second is the template to use. Both must use the C<.html> extension.
271
272						B<jupiter html> I<your.html your-template.html feed.opml>
273
274						Feed generation uses a template, C<feed.rss>. It writes all the entries into a
275						file called C<feed.xml>. Again, the template is written for C<Mojo::Template>.
276
277						You can specify up to two XML, RSS or ATOM files. They must uses one of these
278						three extensions: C<.xml>, C<.rss>, or C<.atom>. The first is the name of the
279						feed to generate, the second is the template to use:
280
281						B<jupiter html> I<atom.xml template.xml planet.html template.html feed.opml>
282
283						In the above case, Planet Jupiter will write a feed called F<atom.xml> based on
284						F<template.xml> and a HTML file called F<planet.html> based on F<template.html>,
285						using the cached entries matching the feeds in F<feed.opml>.
286
287						=cut
288
289	15			15	14317	use DateTime;
	15				7003415
	15				819
290	15			15	8894	use DateTime::Format::Mail;
	15				90165
	15				633
291	15			15	9719	use DateTime::Format::ISO8601;
	15				6676034
	15				929
292	15			15	164	use File::Basename;
	15				41
	15				1397
293	15			15	781	use File::Slurper qw(read_binary write_binary read_text write_text);
	15				3328
	15				996
294	15			15	97	use List::Util qw(uniq min shuffle);
	15				52
	15				1149
295	15			15	129	use Modern::Perl;
	15				43
	15				193
296	15			15	14289	use Mojo::Log;
	15				403439
	15				186
297	15			15	1271	use Mojo::JSON qw(decode_json encode_json);
	15				18832
	15				1065
298	15			15	8549	use Mojo::Template;
	15				67050
	15				110
299	15			15	9631	use Mojo::UserAgent;
	15				448406
	15				147
300	15			15	9701	use Pod::Simple::Text;
	15				85683
	15				637
301	15			15	9879	use XML::LibXML;
	15				476143
	15				131
302	15			15	2601	use Mojo::Util qw(slugify trim xml_escape html_unescape);
	15				36
	15				1101
303	15			15	147	use File::ShareDir 'dist_file';
	15				48
	15				804
304
305	15			15	108	use vars qw($log);
	15				37
	15				87176
306						our $log = Mojo::Log->new;
307
308						my $xpc = XML::LibXML::XPathContext->new;
309						$xpc->registerNs('atom', 'http://www.w3.org/2005/Atom');
310						$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
311						$xpc->registerNs('dc', 'http://purl.org/dc/elements/1.1/');
312						$xpc->registerNs('itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd');
313
314						my $undefined_date = DateTime->from_epoch( epoch => 0 );
315
316						my (%wday, %month, $wday_re, $month_re);
317						%wday = qw (lun. Mon mar. Tue mer. Wed jeu. Thu ven. Fri sam. Sat dim. Sun);
318						%month = qw (janv. Jan févr. Feb mars Mar avr. Apr mai May juin Jun
319						juil. Jul août Aug sept. Sep oct. Oct nov. Nov déc. Dec);
320						$wday_re = join('\|', map { quotemeta } keys %wday) unless $wday_re;
321						$month_re = join('\|', map { quotemeta } keys %month) unless $month_re;
322
323						# Our tests don't want to call main
324						__PACKAGE__->main unless caller;
325
326						sub main {
327	0			0	0	my ($log_level) = grep /^--log=/, @ARGV;
328	0	0			0	$log->level(substr($log_level, 6)) if $log_level;
329	0				0	my ($command) = grep /^[a-z]+$/, @ARGV;
330	0		0		0	$command \|\|= 'help';
331	0	0			0	if ($command eq 'update') {
		0
332	0				0	update_cache(@ARGV);
333						} elsif ($command eq 'html') {
334	0				0	make_html(@ARGV);
335						} else {
336	0				0	my $parser = Pod::Simple::Text->new();
337	0				0	$parser->parse_file($0);
338						}
339						}
340
341						sub update_cache {
342	18			18	1369876	my ($feeds, $files) = read_opml(@_);
343	18				95	make_directories($feeds);
344	18				166	load_feed_metadata($feeds, $files);
345	18				299	my $ua = Mojo::UserAgent->new->with_roles('+Queued')
346						->max_redirects(3)
347						->max_active(5);
348	18				90960	make_promises($ua, $feeds);
349	18				22315	fetch_feeds($feeds);
350	18				17756	save_feed_metadata($feeds, $files);
351	18				7644	cleanup_cache($feeds);
352						}
353
354						sub make_promises {
355	18			18	53	my $ua = shift;
356	18				58	my $feeds = shift;
357	18				63	for my $feed (@$feeds) {
358	18				115	my $url = html_unescape $feed->{url}; # undo xml_escape for the request
359						$ua->on(start => sub {
360	18			18	18154	my ($ua, $tx) = @_;
361	18	50			119	$tx->req->headers->if_none_match($feed->{etag}) if ($feed->{etag});
362	18	50			111	$tx->req->headers->if_modified_since($feed->{last_modified}) if ($feed->{last_modified});
363	18				544	});
364						$feed->{promise} = $ua->get_p($url)
365						->catch(sub {
366	0			0	0	$feed->{message} = "@_";
367	0				0	$feed->{code} = 521;
368						# returning 0 in the case of an error is important
369	0				0	0; })
370						# sleeping to stop blogger.com from blocking us
371	18			18	326	->finally(sub { $log->debug($url); sleep 2; });
	18				290133
	18				36004299
372						}
373						}
374
375						sub fetch_feeds {
376	18			18	49	my $feeds = shift;
377	18				592	$log->info("Fetching feeds...");
378	18				141	Mojo::Promise->all(map { $_->{promise} } @$feeds)->then(sub {
379						# all returns the values in the same order!
380	18			18	20460	for (my $i = 0; $i < @_; $i++) {
381	18				96	my $feed = $feeds->[$i];
382	18				63	my $value = $_[$i];
383	18				71	my $tx = $value->[0];
384						# relies on catch returning 0 above
385	18	50			162	next unless $tx;
386	18				314	$feed->{message} = $tx->result->message;
387	18				1342	$feed->{code} = $tx->result->code;
388	18				670	$feed->{last_modified} = $tx->result->headers->last_modified;
389	18				1247	$feed->{etag} = $tx->result->headers->etag;
390						# save raw bytes if this is a success
391	18	50			881	eval { write_binary($feed->{cache_file}, $tx->result->body) } if $tx->result->is_success;
	18				1312
392	18	50			9202	warn "Unable to write $feed->{cache_file}: $@\n" if $@;
393						}
394						})->catch(sub {
395	0			0	0	warn "Something went wrong: @_";
396	18				275	})->wait;
397						}
398
399						sub load_feed_metadata {
400	37			37	92	my $feeds = shift;
401	37				71	my $files = shift;
402	37				119	for my $file (@$files) {
403	36				179	my $filename = "$file->{path}/$file->{name}";
404	36	100			883	next unless -r "$filename.json";
405	22				309	my $data = decode_json read_binary("$filename.json");
406	22				9971	for my $feed (@$feeds) {
407	22				71	my $url = $feed->{url};
408						# don't overwrite title and htmlUrl from OPML file
409	22	50			135	$feed->{title} = $data->{$url}->{title} if $data->{$url}->{title};
410	22		100		190	$feed->{link} \|\|= $data->{$url}->{link};
411						# all the other metadata is loaded from the JSON file
412	22				74	$feed->{message} = $data->{$url}->{message};
413	22				108	$feed->{code} = $data->{$url}->{code};
414	22				91	$feed->{last_modified} = $data->{$url}->{last_modified};
415	22				80	$feed->{etag} = $data->{$url}->{etag};
416	22				64	} grep { $_->{opml_file} eq $file->{file} } @$feeds;
	22				175
417						}
418						}
419
420						sub save_feed_metadata {
421	37			37	704	my $feeds = shift;
422	37				86	my $files = shift;
423	37				119	for my $file (@$files) {
424	36				139	my $name = $file->{name};
425						my %messages = map {
426	36				88	my $feed = $_;
427	36				116	$feed->{url} => { map { $_ => $feed->{$_} } grep { $feed->{$_} } qw(title link message code last_modified etag) };
	131				614
	216				498
428	36				108	} grep { $_->{opml_file} eq $file->{file} } @$feeds;
	36				181
429	36				393	write_binary("$file->{path}/$file->{name}.json", encode_json \%messages);
430						}
431						}
432
433						sub cleanup_cache {
434	18			18	80	my $feeds = shift;
435	18				60	my %good = map { $_ => 1 } @{cache_files($feeds)};
	18				140
	18				106
436	18				80	my @unused = grep { not $good{$_} } @{existing_files($feeds)};
	18				144
	18				113
437	18	50			621	if (@unused) {
438	0				0	$log->info("Removing unused files from the cache...");
439	0				0	foreach (@unused) { $log->info($_) }
	0				0
440	0				0	unlink @unused;
441						}
442						}
443
444						sub existing_files {
445	18			18	53	my $feeds = shift;
446	18				54	my @files;
447	18				64	for my $dir (uniq map { $_->{cache_dir} } @$feeds) {
	18				155
448	18				2492	push(@files, <"$dir/*">);
449						}
450	18				221	return \@files;
451						}
452
453						sub cache_files {
454	18			18	60	my $feeds = shift;
455	18				92	my @files = map { $_->{cache_file} } @$feeds;
	18				129
456	18				96	return \@files;
457						}
458
459						sub make_directories {
460	18			18	50	my $feeds = shift;
461	18				52	for my $dir (uniq map { $_->{cache_dir} } @$feeds) {
	18				122
462	18	100			601	if (not -d $dir) {
463	14				2800	mkdir $dir;
464						}
465						}
466						}
467
468						sub make_html {
469	19			19	37051	my ($feeds, $files) = read_opml(@_);
470	19				132	load_feed_metadata($feeds, $files); # load messages and codes for feeds
471	19				99	my $globals = globals($files);
472	19				119	my $entries = entries($feeds, 4); # set data for feeds, too
473	19				107	add_data($feeds, $entries); # extract data from the xml
474	19				283	save_feed_metadata($feeds, $files); # save title and link for feeds
475	19				6930	$entries = limit($entries, 100);
476	19				95	write_text(html_file(@_), apply_template(read_text(html_template_file(@_)), $globals, $feeds, $entries));
477	19				217271	write_text(feed_file(@_), apply_template(read_text(feed_template_file(@_)), $globals, $feeds, $entries));
478						}
479
480						sub html_file {
481	19			19	169	my ($html) = grep /\.html$/, @_;
482	19		50		143	return $html \|\| 'index.html';
483						}
484
485						sub html_template_file {
486	19			19	105	my ($html, $template) = grep /\.html$/, @_;
487	19		33		188	$template \|\|= dist_file('App-jupiter', 'template.html');
488	19	50			4831	die "HTML template $template not found\n" unless -r $template;
489	19				165	return $template;
490						}
491
492						sub feed_file {
493	19			19	270	my ($feed) = grep /\.(xml\|rss\|atom)$/, @_;
494	19	50			168	return $feed if $feed;
495	0				0	return 'feed.xml';
496						}
497
498						sub feed_template_file {
499	19			19	151	my ($feed, $template) = grep /\.(xml\|rss\|atom)$/, @_;
500	19	50			84	return $template if $template;
501	19				102	return dist_file('App-jupiter', 'feed.rss');
502						}
503
504						sub apply_template {
505	38			38	8650	my $mnt = Mojo::Template->new;
506	38				474	return $mnt->render(@_);
507						}
508
509						=head1 TEMPLATES
510
511						The page template is called with three hash references: C<globals>, C<feeds>,
512						and C<entries>. The keys of these three hash references are documented below.
513						The values of these hashes are all I<escaped HTML> except where noted (dates and
514						file names, for example).
515
516						The technical details of how to write the templates are documented in the man
517						page for L<Mojo::Template>.
518
519						=head2 Globals
520
521						There are not many global keys.
522
523						B<date> is the the publication date of the HTML page, in ISO date format:
524						YYYY-MM-DD.
525
526						B<files> is the list of OPML files used.
527
528						=cut
529
530						sub globals {
531	19			19	52	my $files = shift;
532	19				172	my @time = gmtime;
533	19				293	my $today = DateTime->now->ymd;
534	19				10580	return {date => $today, files => $files};
535						}
536
537						=head2 Writing templates for feeds
538
539						Feeds have the following keys available:
540
541						B<title> is the title of the feed.
542
543						B<url> is the URL of the feed (RSS or Atom). This is not the link to the site!
544
545						B<link> is the URL of the web page (HTML). This is the link to the site.
546
547						B<opml_file> is the file name where this feed is listed.
548
549						B<cache_dir> is the directory where this feed is cached.
550
551						B<message> is the HTTP status message or other warning or error that we got
552						while fetching the feed.
553
554						B<code> is the HTTP status code we got while fetching the feed.
555
556						B<doc> is the L<XML::LibXML::Document>. Could be either Atom or RSS!
557
558						=cut
559
560						# Creates list of feeds. Each feed is a hash with keys title, url, opml_file,
561						# cache_dir and cache_file.
562						sub read_opml {
563	37			37	119	my (@feeds, @files);
564	37				338	my @filters = map { decode(locale => substr($_, 1, -1)) } grep /^\/.*\/$/, @_;
	2				17
565	37				533	for my $file (grep /\.opml$/, @_) {
566	37				499	my $doc = XML::LibXML->load_xml(location => $file); # this better have no errors!
567	37				22432	my @nodes = $doc->findnodes('//outline[./@xmlUrl]');
568	37				6490	my ($name, $path) = fileparse($file, '.opml', '.xml');
569						push @feeds, map {
570	37				236	my $title = xml_escape $_->getAttribute('title');
	37				269
571	37				1334	my $url = xml_escape $_->getAttribute('xmlUrl');
572	37	100	100		846	next if @filters > 0 and not grep { $url =~ /$_/ or $title =~ /$_/ } @filters;
	2	100			64
573	36				202	my $link = xml_escape $_->getAttribute('htmlUrl');
574						{
575	36				939	title => $title, # title in the OPML file
576						url => $url, # feed URL in the OPML file
577						link => $link, # web URL in the OPML file
578						opml_file => $file,
579						cache_dir => "$path/$name",
580						cache_file => "$path/$name/" . slugify($url),
581						}
582						} @nodes;
583	36	50			2316	warn "No feeds found in the OPML file $file\n" unless @nodes;
584	36				413	push @files, { file => $file, path => $path, name => $name };
585						}
586	37				2194	@feeds = shuffle @feeds;
587	37				162	return \@feeds, \@files;
588						}
589
590						sub entries {
591	19			19	48	my $feeds = shift;
592	19				47	my $limit = shift;
593	19				87	my $date = DateTime->now(time_zone => 'UTC')->subtract( days => 90 ); # compute once
594	19				37424	my $now = DateTime->now(time_zone => 'UTC');
595	19				6330	my @entries;
596	19				73	for my $feed (@$feeds) {
597	18	50			454	next unless -r $feed->{cache_file};
598	18				74	my $doc = eval { XML::LibXML->load_xml(recover => 2, location => $feed->{cache_file} )};
	18				203
599	18	50			8251	if (not $doc) {
600	0				0	$feed->{message} = xml_escape "Parsing error: $@";
601	0				0	$feed->{code} = 422; # unprocessable
602	0				0	next;
603						}
604	18				193	$feed->{doc} = $doc;
605	18				161	my @nodes = $xpc->findnodes("/rss/channel/item \| /atom:feed/atom:entry", $doc);
606	18	100			1411	if (not @nodes) {
607	2				8	$feed->{message} = "Empty feed";
608	2				6	$feed->{code} = 204; # no content
609	2				80	next;
610						}
611						# if this is an Atom feed, we need to sort the entries ourselves (older entries at the end)
612						my @candidates = map {
613	16				55	my $entry = {};
	40				86
614	40				112	$entry->{element} = $_;
615	40				121	$entry->{id} = id($_);
616	40		66		165	$entry->{date} = updated($_) \|\| $undefined_date;
617	40				1069	$entry;
618						} @nodes;
619	16				60	@candidates = grep { DateTime->compare($_->{date}, $now) <= 0 } @candidates;
	40				1966
620	16				1825	@candidates = unique(sort { DateTime->compare( $b->{date}, $a->{date} ) } @candidates);
	38				2345
621	16				198	@candidates = @candidates[0 .. min($#candidates, $limit - 1)];
622						# now that we have limited the candidates, let's add more metadata from the feed
623	16				55	for my $entry (@candidates) {
624	30				63	$entry->{feed} = $feed;
625						# these two are already escaped
626	30				72	$entry->{blog_title} = $feed->{title};
627	30				69	$entry->{blog_url} = $feed->{url};
628						}
629	16				96	add_age_warning($feed, \@candidates, $date);
630	16				467	push @entries, @candidates;
631						}
632	19				254	return \@entries;
633						}
634
635						sub add_age_warning {
636	16			16	43	my $feed = shift;
637	16				29	my $entries = shift;
638	16				27	my $date = shift;
639						# feed modification date is smaller than the date given
640	16				90	my ($node) = $xpc->findnodes("/rss/channel \| /atom:feed", $feed->{doc});
641	16				832	my $feed_date = updated($node);
642	16	100	100		365	if ($feed_date and DateTime->compare($feed_date, $date) == -1) {
643	11				928	$feed->{message} = "No feed updates in 90 days";
644	11				35	$feed->{code} = 206; # partial content
645	11				100	return;
646						} else {
647						# or no entry found with a modification date equal or bigger than the date given
648	5				231	for my $entry (@$entries) {
649	11	100			415	return if DateTime->compare($entry->{date}, $date) >= 0;
650						}
651	4				285	$feed->{message} = "No entry newer than 90 days";
652	4				23	$feed->{code} = 206; # partial content
653						}
654						}
655
656						sub updated {
657	113			113	25416	my $node = shift;
658	113	50			411	return unless $node;
659	113	100			846	my @nodes = $xpc->findnodes('pubDate \| atom:published \| atom:updated', $node) or return;
660	106				6169	my $date = $nodes[0]->textContent;
661						my $dt = eval { DateTime::Format::Mail->parse_datetime($date) }
662						\|\| eval { DateTime::Format::ISO8601->parse_datetime($date) }
663	106		100		200	\|\| eval { DateTime::Format::Mail->parse_datetime(french($date)) };
664	106				75805	return $dt;
665						}
666
667						sub french {
668	60			60	111	my $date = shift;
669	60				670	$date =~ s/^($wday_re)/$wday{$1}/;
670	60				649	$date =~ s/\b($month_re)/$month{$1}/;
671	60				328	return $date;
672						}
673
674						sub id {
675	40			40	66	my $node = shift;
676	40	50			103	return unless $node;
677	40				721	my $id = $xpc->findvalue('guid \| atom:id', $node); # id is mandatory for Atom
678	40		100		4856	$id \|\|= $node->findvalue('link'); # one of the following three is mandatory for RSS
679	40		100		2473	$id \|\|= $node->findvalue('title');
680	40		100		1682	$id \|\|= $node->findvalue('description');
681	40				790	return $id;
682						}
683
684						sub unique {
685	35			35	522	my %seen;
686						my @unique;
687	35				85	for my $node (@_) {
688	69	100			206	next if $seen{$node->{id}};
689	61				159	$seen{$node->{id}} = 1;
690	61				115	push(@unique, $node);
691						}
692	35				179	return @unique;
693						}
694
695						sub limit {
696	19			19	59	my $entries = shift;
697	19				102	my $limit = shift;
698						# we want the most recent entries overall
699	19				87	@$entries = sort { DateTime->compare( $b->{date}, $a->{date} ) } unique(@$entries);
	18				841
700	19				643	return [@$entries[0 .. min($#$entries, $limit - 1)]];
701						}
702
703						=head2 Writing templates for entries
704
705						Entries have the following keys available:
706
707						B<title> is the title of the post.
708
709						B<link> is the URL to the post on the web (probably a HTML page).
710
711						B<blog_title> is the title of the site.
712
713						B<blog_link> is the URL for the site on the web (probably a HTML page).
714
715						B<blog_url> is the URL for the site's feed (RSS or Atom).
716
717						B<authors> are the authors (or the Dublin Core contributor), a list of strings.
718
719						B<date> is the publication date, as a DateTime object.
720
721						B<day> is the publication date, in ISO date format: YYYY-MM-DD, for the UTC
722						timezone. The UTC timezone is picked so that the day doesn't jump back and forth
723						when sorting entries by date.
724
725						B<content> is the full post content, as string or encoded HTML.
726
727						B<excerpt> is the post content, limited to 500 characters, with paragraph
728						separators instead of HTML elements, as HTML. It is not encoded because the idea
729						is that it only gets added to the HTML and not to the feed, and the HTML it
730						contains is very controlled (only the pilcrow sign inside a span to indicate
731						paragraph breaks).
732
733						B<categories> are the categories, a list of strings.
734
735						B<element> is for internal use only. It contains the L<XML::LibXML::Element>
736						object. This could be RSS or Atom!
737
738						B<feed> is for internal use only. It's a reference to the feed this entry
739						belongs to.
740
741						=cut
742
743						sub add_data {
744	19			19	52	my $feeds = shift;
745	19				45	my $entries = shift;
746						# A note on the use of xml_escape: whenever we get data from the feed itself,
747						# it needs to be escaped if it gets printed into the HTML. For example: the
748						# feed contains a feed title of "Foo & Bar". findvalue returns "Foo &
749						# Bar". When the template inserts the title, however, we want "Foo & Bar",
750						# not "Foo & Bar". Thus: any text we get from the feed needs to be escaped
751						# if there's a chance we're going to print it again.
752	19				63	for my $feed (@$feeds) {
753	18	50			76	next unless $feed->{doc};
754						# title and url in the feed overrides title and xmlUrl set in the OPML (XML already escaped)
755	18		0		209	$feed->{title} = xml_escape($xpc->findvalue('/rss/channel/title \| /atom:feed/atom:title', $feed->{doc})) \|\| $feed->{title} \|\| "";
756	18		50		1952	$feed->{url} = xml_escape($xpc->findvalue('/atom:feed/atom:link[@rel="self"]/@href', $feed->{doc})) \|\| $feed->{url} \|\| "";
757						# link in the feed does not override htmlUrl in the OPML (XML already escaped)
758	18		100		1842	$feed->{link} = $feed->{link} \|\| xml_escape($xpc->findvalue('/rss/channel/link \| /atom:feed/atom:link[@rel="alternate"][@type="text/html"]/@href', $feed->{doc})) \|\| "";
759						# if they just pasted a domain "foo" then "//foo" is a valid URL
760	18	100			1115	$feed->{link} = "//" . $feed->{link} unless $feed->{link} =~ /\/\//;
761						}
762	19				219	for my $entry (@$entries) {
763						# copy from the feed (XML is already escaped)
764	30				212	$entry->{blog_link} = $entry->{feed}->{link};
765	30				71	$entry->{blog_title} = $entry->{feed}->{title};
766	30				68	$entry->{blog_url} = $entry->{feed}->{url};
767						# parse the elements
768	30				55	my $element = $entry->{element};
769	30		100		98	$entry->{title} = xml_escape $xpc->findvalue('title \| atom:title', $element) \|\| "Untitled";
770	30				2587	my @links = map { xml_escape $_->to_literal } map { $xpc->findnodes($_, $element) }
	16				540
	90				2683
771						# sorted by preferences!
772						qw(link atom:link[@rel="alternate"][@type="text/html"]/@href atom:link/@href);
773	30		100		831	$entry->{link} = shift(@links) \|\| "";
774	30				248	my @authors = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes(
	8				326
775						'author \| atom:author/atom:name \| atom:contributor/atom:name \| dc:creator \| dc:contributor', $element);
776	30	100			1255	@authors = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes(
	10				541
777						'/atom:feed/atom:author/atom:name \| '
778						. '/atom:feed/atom:contributor/atom:name \| '
779						. '/rss/channel/dc:creator \| '
780						. '/rss/channel/dc:contributor \| '
781						. '/rss/channel/webMaster ', $element) unless @authors;
782	30	100			979	$entry->{authors} = @authors ? \@authors : undef; # key must exist in the hash
783	30	100			280	if (DateTime->compare($entry->{date}, $undefined_date) == 0) {
784	5				386	$entry->{day} = "(no date found)";
785						} else {
786	25				2185	$entry->{day} = $entry->{date}->clone->set_time_zone('UTC')->ymd; # operate on a clone
787						}
788	30				4458	my @categories = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes('category \| atom:category/@term', $element);
	6				275
789	30	100			1156	$entry->{categories} = @categories ? \@categories : undef; # key must exist in the hash
790	30				158	$entry->{excerpt} = '';
791	30				77	$entry->{content} = '';
792	30				95	my @nodes = $xpc->findnodes('description[text()!=""] \| atom:content[text()!=""]', $element);
793	30	100			1404	@nodes = $xpc->findnodes('summary[text()!=""] \| atom:summary[text()!=""] \| itunes:summary[text()!=""]', $element) unless @nodes;
794	30				507	my $content = shift(@nodes);
795						# The default is that the content is either plain text or escaped HTML,
796						# which is what we want for RSS. For Atom feeds, type="xhtml" means that the
797						# content is XHTML, so it needs to be escaped.
798	30	100			111	if ($content) {
799	23		100		312	my $is_xhtml = $content->hasAttribute("type") && $content->getAttribute("type") eq "xhtml";
800	23				313	$entry->{excerpt} = excerpt($content->to_literal);
801	23				168	for my $child ($content->childNodes()) {
802	27				769	my $c = $child->toString();
803	27	100			101	$c = xml_escape $c if $is_xhtml;
804	27				294	$entry->{content} .= $c;
805						}
806						}
807						}
808						}
809
810						sub excerpt {
811	23			23	57	my $content = shift;
812	23	50			70	return '(no excerpt)' unless $content;
813	23				47	my $doc = eval { XML::LibXML->load_html(recover => 2, string => $content) };
	23				114
814	23				6762	my $separator = "¶";
815	23				103	for my $node ($doc->findnodes('//style')) {
816	1				53	$node->parentNode->removeChild($node);
817						}
818	23				1278	for my $node ($doc->findnodes('//p \| //br \| //blockquote \| //li \| //td \| //th \| //div')) {
819	23				1109	$node->appendTextNode($separator);
820						}
821	23				79	my $text = strip_html($doc->textContent());
822	23				406	$text =~ s/( +\|----+)/ /g;
823						# collapse whitespace and trim
824	23				292	$text =~ s/\s+/ /g;
825	23				117	$text = trim $text;
826						# replace paragraph repeats with their surrounding spaces
827	23				475	$text =~ s/ ?¶( ?¶)* ?/¶/g;
828	23				82	$text =~ s/^¶//;
829	23				111	$text =~ s/¶$//;
830	23				79	my $len = length($text);
831	23				72	$text = substr($text, 0, 500);
832	23	50			74	$text .= "…" if $len > 500;
833	23				75	$text = xml_escape $text;
834	23				292	$text =~ s/¶/<span class="paragraph">¶ <\/span>/g;
835	23				142	return $text;
836						}
837
838						# When there's a value that's supposed to be text but isn't, then we can try to
839						# turn it to HTML and from there to text... This is an ugly hack and I wish it
840						# wasn't necessary.
841						sub strip_html {
842	47			47	647	my $str = shift;
843	47	50			128	return '' unless $str;
844	47				72	my $doc = eval { XML::LibXML->load_html(string => $str) };
	47				164
845	47	100			12652	return $str unless $doc;
846	45				737	return $doc->textContent();
847						}
848
849						=head1 SEE ALSO
850
851						OPML 2.0, L<http://dev.opml.org/spec2.html>
852
853						RSS 2.0, L<https://cyber.harvard.edu/rss/rss.html>
854
855						Atom Syndication, L<https://tools.ietf.org/html/rfc4287>
856
857						River of News,
858						L<http://scripting.com/2014/06/02/whatIsARiverOfNewsAggregator.html>
859
860						=cut
861
862						1;