line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#! /usr/bin/env perl |
2
|
|
|
|
|
|
|
# Planet Jupiter is a feed aggregator that creates a single HTML file |
3
|
|
|
|
|
|
|
# Copyright (C) 2020–2021 Alex Schroeder <alex@gnu.org> |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
# This program is free software: you can redistribute it and/or modify |
6
|
|
|
|
|
|
|
# it under the terms of the GNU Affero General Public License as published by |
7
|
|
|
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
8
|
|
|
|
|
|
|
# (at your option) any later version. |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
11
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
12
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13
|
|
|
|
|
|
|
# GNU Affero General Public License for more details. |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
16
|
|
|
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>. |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
package Jupiter; |
19
|
|
|
|
|
|
|
|
20
|
15
|
|
|
15
|
|
5546999
|
use Encode::Locale; |
|
15
|
|
|
|
|
65577
|
|
|
15
|
|
|
|
|
766
|
|
21
|
15
|
|
|
15
|
|
139
|
use Encode; |
|
15
|
|
|
|
|
43
|
|
|
15
|
|
|
|
|
1132
|
|
22
|
15
|
|
|
15
|
|
702
|
use utf8; |
|
15
|
|
|
|
|
45
|
|
|
15
|
|
|
|
|
94
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
binmode(STDOUT, ":utf8"); |
25
|
|
|
|
|
|
|
binmode(STDERR, ":utf8"); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=encoding UTF8 |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 NAME |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
jupiter - turn a list of feeds into a HTML page, a river of news |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 SYNOPSIS |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
To update the feeds from one or more OPML files: |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
B<jupiter update> I<feed.opml> … [I</regex/> …] |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
To generate F<index.html>: |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
B<jupiter html> I<feed.opml> |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Planet Jupiter is used to pull together the latest updates from a bunch of other |
46
|
|
|
|
|
|
|
sites and display them on a single web page, the "river of news". The sites we |
47
|
|
|
|
|
|
|
get our updates from are defined in an OPML file. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
A river of news, according to Dave Winer, is a feed aggregator. New items appear |
50
|
|
|
|
|
|
|
at the top and old items disappear at the bottom. When it's gone, it's gone. |
51
|
|
|
|
|
|
|
There is no count of unread items. The goal is to fight the I<fear of missing |
52
|
|
|
|
|
|
|
out> (FOMO). |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Each item looks similar to every other: headline, link, an extract, maybe a date |
55
|
|
|
|
|
|
|
and an author. Extracts contain but the beginning of the article's text; all |
56
|
|
|
|
|
|
|
markup is removed; no images. The goal is to make the page easy to skim.
57
|
|
|
|
|
|
|
Scroll down until you find something interesting and follow the link to the |
58
|
|
|
|
|
|
|
original article if you want to read it. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 The OPML file |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
You B<need> an OPML file. It's an XML file linking to I<feeds>. Here's an |
63
|
|
|
|
|
|
|
example listing just one feed. In order to add more, add more C<outline> |
64
|
|
|
|
|
|
|
elements with the C<xmlUrl> attribute. The exact order and nesting does not |
65
|
|
|
|
|
|
|
matter. People can I<import> these OPML files into their own feed readers and |
66
|
|
|
|
|
|
|
thus it may make sense to spend a bit more effort in making it presentable. |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
<opml version="2.0"> |
69
|
|
|
|
|
|
|
<body> |
70
|
|
|
|
|
|
|
<outline title="Alex Schroeder" |
71
|
|
|
|
|
|
|
xmlUrl="https://alexschroeder.ch/wiki?action=rss"/> |
72
|
|
|
|
|
|
|
</body> |
73
|
|
|
|
|
|
|
</opml> |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=head2 Update the feeds in your cache |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
This is how you update the feeds in a file called C<feed.opml>. It downloads all |
78
|
|
|
|
|
|
|
the feeds linked to in the OPML file and stores them in the cache directory. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
jupiter update feed.opml |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
The directory used to keep a copy of all the feeds in the OPML file has the same |
83
|
|
|
|
|
|
|
name as the OPML file but without the .opml extension. In other words, if your |
84
|
|
|
|
|
|
|
OPML file is called C<feed.opml> then the cache directory is called C<feed>. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
This operation takes long because it requests an update from all the sites |
87
|
|
|
|
|
|
|
listed in your OPML file. Don't run it too often or you'll annoy the site |
88
|
|
|
|
|
|
|
owners. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The OPML file must use the .opml extension. You can update the feeds for |
91
|
|
|
|
|
|
|
multiple OPML files in one go. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 Adding just one feed |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
After a while, the list of feeds in your OPML starts getting unwieldy. When you |
96
|
|
|
|
|
|
|
add a new feed, you might not want to fetch all of them. In this case, provide a |
97
|
|
|
|
|
|
|
regular expression surrounded by slashes to the C<update> command: |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
jupiter update feed.opml /example/ |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Assuming a feed with a URL or title that matches the regular expression is |
102
|
|
|
|
|
|
|
listed in your OPML file, only that feed is going to get updated. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
There is no need to escape slashes in the regular expression: C<//rss/> works |
105
|
|
|
|
|
|
|
just fine. Beware shell escaping, however. Most likely, you need to surround the |
106
|
|
|
|
|
|
|
regular expression with single quotes if it contains spaces: |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
jupiter update feed.opml '/Halberds & Helmets/' |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Notice how we assume that named entities such as C<&> have already been |
111
|
|
|
|
|
|
|
parsed into the appropriate strings. |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 Generate the HTML |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
This is how you generate the C<index.html> file based on the feeds of your |
116
|
|
|
|
|
|
|
C<feed.opml>. It assumes that you have already updated all the feeds (see |
117
|
|
|
|
|
|
|
above). |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
jupiter html feed.opml |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
See L</OPTIONS> for ways to change how the HTML is generated. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 Generate the RSS feed |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
This happens at the same time as when you generate the HTML. It takes all the |
126
|
|
|
|
|
|
|
entries that are being added to the HTML and puts them into a feed.
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
See L</OPTIONS> for ways to change how the HTML is generated. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head2 Why separate the two steps? |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
The first reason is that tinkering with the templates involves running the |
133
|
|
|
|
|
|
|
program again and again, and you don't want to contact all the sites whenever |
134
|
|
|
|
|
|
|
you update your templates. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
The other reason is that it allows you to create subsets. For example, you can |
137
|
|
|
|
|
|
|
fetch the feeds for three different OPML files: |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
jupiter update osr.opml indie.opml other.opml |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
And then you can create three different HTML files: |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
jupiter html osr.html osr.opml |
144
|
|
|
|
|
|
|
jupiter html indie.html indie.opml |
145
|
|
|
|
|
|
|
jupiter html rpg.html osr.opml indie.opml other.opml |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
For an example of how it might look, check out the setup for the planets I run. |
148
|
|
|
|
|
|
|
L<https://alexschroeder.ch/cgit/planet/about/> |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head2 What about the JSON file? |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
There's a JSON file that gets generated and updated as you run Planet Jupiter. |
153
|
|
|
|
|
|
|
Its name depends on the OPML files used. It records metadata for every feed in
154
|
|
|
|
|
|
|
the OPML file that isn't stored in the feeds themselves. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
C<message> is the HTTP status message, or a similar message such as "No entry |
157
|
|
|
|
|
|
|
newer than 90 days." This is set when you update the feeds in your cache.
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
C<code> is the HTTP status code; this code could be the real status code from
160
|
|
|
|
|
|
|
the server (such as 404 for a "not found" status) or one generated by Jupiter |
161
|
|
|
|
|
|
|
such that it matches the status message (such as 206 for a "partial content" |
162
|
|
|
|
|
|
|
status when there aren't any recent entries in the feed). This is set when |
163
|
|
|
|
|
|
|
you update the feeds in your cache.
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
C<title> is the site's title. When you update the feeds in your cache, it is |
166
|
|
|
|
|
|
|
taken from the OPML file. That's how the feed can have a title even if the |
167
|
|
|
|
|
|
|
download failed. When you generate the HTML, the feeds in the cache are parsed |
168
|
|
|
|
|
|
|
and if a title is provided, it is stored in the JSON file and overrides the |
169
|
|
|
|
|
|
|
title in the OPML file. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
C<link> is the site's link for humans. When you generate the HTML, the feeds in |
172
|
|
|
|
|
|
|
the cache are parsed and if a link is provided, it is stored in the JSON file. |
173
|
|
|
|
|
|
|
If the OPML element contained a C<htmlURL> attribute, however, that takes |
174
|
|
|
|
|
|
|
precedence. The reasoning is that when a podcast is hosted on a platform which |
175
|
|
|
|
|
|
|
generates a link that you don't like and you know the link to the human-readable |
176
|
|
|
|
|
|
|
blog elsewhere, use the C<htmlURL> attribute in the OPML file to override this.
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
C<last_modified> and C<etag> are two headers used for caching from the HTTP |
179
|
|
|
|
|
|
|
response that cannot be changed by data in the feed. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
If we run into problems downloading a feed, this setup allows us to still link |
182
|
|
|
|
|
|
|
to the feeds that aren't working, using their correct names, and describing the |
183
|
|
|
|
|
|
|
error we encountered. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head2 Logging |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Use the C<--log=LEVEL> to set the log level. Valid values for LEVEL are debug, |
188
|
|
|
|
|
|
|
info, warn, error, and fatal. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head1 LICENSE |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
GNU Affero General Public License |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head1 INSTALLATION |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Using C<cpan>: |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
cpan App::jupiter |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
Manual install: |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
perl Makefile.PL |
203
|
|
|
|
|
|
|
make |
204
|
|
|
|
|
|
|
make install |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head2 Dependencies |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
To run Jupiter on Debian we need: |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
C<libmodern-perl-perl> for L<Modern::Perl> |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
C<libmojolicious-perl> for L<Mojo::Template>, L<Mojo::UserAgent>, L<Mojo::Log>, |
213
|
|
|
|
|
|
|
L<Mojo::JSON>, and L<Mojo::Util> |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
C<libxml-libxml-perl> for L<XML::LibXML> |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
C<libfile-slurper-perl> for L<File::Slurper> |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
C<libdatetime-perl> for L<DateTime> |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
C<libdatetime-format-mail-perl> for L<DateTime::Format::Mail> |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
C<libdatetime-format-iso8601-perl> for L<DateTime::Format::ISO8601> |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
Unfortunately, L<Mojo::UserAgent::Role::Queued> isn't packaged for Debian. |
226
|
|
|
|
|
|
|
Therefore, let's build it and install it as a Debian package. |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sudo apt-get install libmodule-build-tiny-perl |
229
|
|
|
|
|
|
|
sudo apt-get install dh-make-perl |
230
|
|
|
|
|
|
|
sudo dh-make-perl --build --cpan Mojo::UserAgent::Role::Queued |
231
|
|
|
|
|
|
|
dpkg --install libmojo-useragent-role-queued-perl_1.15-1_all.deb |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
To generate the C<README.md> from the source file, you need F<pod2markdown> |
234
|
|
|
|
|
|
|
which you get in C<libpod-markdown-perl>. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=head1 FILES |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
There are a number of files in the F<share> directory which you can use as |
239
|
|
|
|
|
|
|
starting points. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
F<template.html> is the HTML template. |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
F<default.css> is a small CSS file used by F<template.html>. |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
F<personalize.js> is a small Javascript file used by F<template.html> used to |
246
|
|
|
|
|
|
|
allow visitors to jump from one article to the next using C<J> and C<K>. |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
F<jupiter.png> is used by F<template.html> as the icon. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
F<jupiter.svg> is used by F<template.html> as the logo. |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
F<feed.png> is used by F<template.html> as the icon for the feeds in the |
253
|
|
|
|
|
|
|
sidebar. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
F<feed.rss> is the feed template. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head1 OPTIONS |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
HTML generation uses a template, C<template.html>. It is written for |
260
|
|
|
|
|
|
|
C<Mojo::Template> and you can find it in the F<share> directory of your |
261
|
|
|
|
|
|
|
distribution. The default templates use other files, such as the logo, the feed |
262
|
|
|
|
|
|
|
icon, a CSS file, and a small Javascript snippet to enable navigation using the |
263
|
|
|
|
|
|
|
C<J> and C<K> keys (see above). |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
You can specify a different HTML file to generate: |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
B<jupiter html> I<your.html feed.opml> |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
If you specify two HTML files, the first is the HTML file to generate and the |
270
|
|
|
|
|
|
|
second is the template to use. Both must use the C<.html> extension. |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
B<jupiter html> I<your.html your-template.html feed.opml> |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
Feed generation uses a template, C<feed.rss>. It writes all the entries into a |
275
|
|
|
|
|
|
|
file called C<feed.xml>. Again, the template is written for C<Mojo::Template>. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
You can specify up to two XML, RSS or ATOM files. They must use one of these
278
|
|
|
|
|
|
|
three extensions: C<.xml>, C<.rss>, or C<.atom>. The first is the name of the |
279
|
|
|
|
|
|
|
feed to generate, the second is the template to use: |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
B<jupiter html> I<atom.xml template.xml planet.html template.html feed.opml> |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
In the above case, Planet Jupiter will write a feed called F<atom.xml> based on |
284
|
|
|
|
|
|
|
F<template.xml> and a HTML file called F<planet.html> based on F<template.html>, |
285
|
|
|
|
|
|
|
using the cached entries matching the feeds in F<feed.opml>. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=cut |
288
|
|
|
|
|
|
|
|
289
|
15
|
|
|
15
|
|
14317
|
use DateTime; |
|
15
|
|
|
|
|
7003415
|
|
|
15
|
|
|
|
|
819
|
|
290
|
15
|
|
|
15
|
|
8894
|
use DateTime::Format::Mail; |
|
15
|
|
|
|
|
90165
|
|
|
15
|
|
|
|
|
633
|
|
291
|
15
|
|
|
15
|
|
9719
|
use DateTime::Format::ISO8601; |
|
15
|
|
|
|
|
6676034
|
|
|
15
|
|
|
|
|
929
|
|
292
|
15
|
|
|
15
|
|
164
|
use File::Basename; |
|
15
|
|
|
|
|
41
|
|
|
15
|
|
|
|
|
1397
|
|
293
|
15
|
|
|
15
|
|
781
|
use File::Slurper qw(read_binary write_binary read_text write_text); |
|
15
|
|
|
|
|
3328
|
|
|
15
|
|
|
|
|
996
|
|
294
|
15
|
|
|
15
|
|
97
|
use List::Util qw(uniq min shuffle); |
|
15
|
|
|
|
|
52
|
|
|
15
|
|
|
|
|
1149
|
|
295
|
15
|
|
|
15
|
|
129
|
use Modern::Perl; |
|
15
|
|
|
|
|
43
|
|
|
15
|
|
|
|
|
193
|
|
296
|
15
|
|
|
15
|
|
14289
|
use Mojo::Log; |
|
15
|
|
|
|
|
403439
|
|
|
15
|
|
|
|
|
186
|
|
297
|
15
|
|
|
15
|
|
1271
|
use Mojo::JSON qw(decode_json encode_json); |
|
15
|
|
|
|
|
18832
|
|
|
15
|
|
|
|
|
1065
|
|
298
|
15
|
|
|
15
|
|
8549
|
use Mojo::Template; |
|
15
|
|
|
|
|
67050
|
|
|
15
|
|
|
|
|
110
|
|
299
|
15
|
|
|
15
|
|
9631
|
use Mojo::UserAgent; |
|
15
|
|
|
|
|
448406
|
|
|
15
|
|
|
|
|
147
|
|
300
|
15
|
|
|
15
|
|
9701
|
use Pod::Simple::Text; |
|
15
|
|
|
|
|
85683
|
|
|
15
|
|
|
|
|
637
|
|
301
|
15
|
|
|
15
|
|
9879
|
use XML::LibXML; |
|
15
|
|
|
|
|
476143
|
|
|
15
|
|
|
|
|
131
|
|
302
|
15
|
|
|
15
|
|
2601
|
use Mojo::Util qw(slugify trim xml_escape html_unescape); |
|
15
|
|
|
|
|
36
|
|
|
15
|
|
|
|
|
1101
|
|
303
|
15
|
|
|
15
|
|
147
|
use File::ShareDir 'dist_file'; |
|
15
|
|
|
|
|
48
|
|
|
15
|
|
|
|
|
804
|
|
304
|
|
|
|
|
|
|
|
305
|
15
|
|
|
15
|
|
108
|
use vars qw($log); |
|
15
|
|
|
|
|
37
|
|
|
15
|
|
|
|
|
87176
|
|
306
|
|
|
|
|
|
|
# Shared logger; the log level can be changed with --log=LEVEL (see main).
our $log = Mojo::Log->new;

# XPath context with the namespaces needed to pick data out of the feeds.
my $xpc = XML::LibXML::XPathContext->new;
$xpc->registerNs('atom', 'http://www.w3.org/2005/Atom');
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
$xpc->registerNs('dc', 'http://purl.org/dc/elements/1.1/');
$xpc->registerNs('itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd');

# Fallback for entries without a usable date: the Unix epoch.
my $undefined_date = DateTime->from_epoch( epoch => 0 );

# French day and month abbreviations mapped to their English equivalents,
# with alternation regexes over the French forms — presumably used for
# parsing French-language dates elsewhere in this file (consumers are not
# visible here).
my (%wday, %month, $wday_re, $month_re);
%wday = qw (lun. Mon mar. Tue mer. Wed jeu. Thu ven. Fri sam. Sat dim. Sun);
%month = qw (janv. Jan févr. Feb mars Mar avr. Apr mai May juin Jun
	     juil. Jul août Aug sept. Sep oct. Oct nov. Nov déc. Dec);
$wday_re = join('|', map { quotemeta } keys %wday) unless $wday_re;
$month_re = join('|', map { quotemeta } keys %month) unless $month_re;

# Our tests don't want to call main
__PACKAGE__->main unless caller;
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# Entry point for the command line tool: parse --log=LEVEL, pick the
# subcommand (update or html), and fall back to printing this script's
# POD as help text for anything else.
sub main {
  # Adjust verbosity if the user passed --log=LEVEL.
  my ($level_arg) = grep { /^--log=/ } @ARGV;
  $log->level(substr($level_arg, 6)) if $level_arg;
  # The first all-lowercase word on the command line is the subcommand.
  my ($cmd) = grep { /^[a-z]+$/ } @ARGV;
  $cmd ||= 'help';
  if ($cmd eq 'update') {
    update_cache(@ARGV);
  } elsif ($cmd eq 'html') {
    make_html(@ARGV);
  } else {
    # Unknown command (including "help"): render our own POD as plain text.
    Pod::Simple::Text->new()->parse_file($0);
  }
}
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# Download all the feeds listed in the OPML files given on the command
# line and store the responses in the cache directories, updating the
# per-file JSON metadata along the way.
sub update_cache {
  my ($feeds, $files) = read_opml(@_);
  make_directories($feeds);
  load_feed_metadata($feeds, $files);
  # At most five parallel downloads, each following up to three redirects.
  my $ua = Mojo::UserAgent->new->with_roles('+Queued');
  $ua->max_redirects(3);
  $ua->max_active(5);
  make_promises($ua, $feeds);
  fetch_feeds($feeds);
  save_feed_metadata($feeds, $files);
  cleanup_cache($feeds);
}
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
# Create one download promise per feed and store it in $feed->{promise}.
# Conditional-request headers (If-None-Match / If-Modified-Since) are
# passed per request rather than installed via a ua-wide "start" event
# handler: a handler registered per feed on the shared user agent fires
# for *every* transaction, so each request would otherwise end up with
# the caching headers of whichever feed was registered last.
sub make_promises {
  my ($ua, $feeds) = @_;
  for my $feed (@$feeds) {
    my $url = html_unescape $feed->{url}; # undo xml_escape for the request
    # Only this feed's cached validators go on this feed's request.
    my %headers;
    $headers{'If-None-Match'} = $feed->{etag} if $feed->{etag};
    $headers{'If-Modified-Since'} = $feed->{last_modified} if $feed->{last_modified};
    $feed->{promise} = $ua->get_p($url => \%headers)
	->catch(sub {
	  $feed->{message} = "@_";
	  $feed->{code} = 521; # "web server is down"
	  # returning 0 in the case of an error is important:
	  # fetch_feeds skips results that are exactly 0
	  0; })
	# sleeping to stop blogger.com from blocking us
	->finally(sub { $log->debug($url); sleep 2; });
  }
}
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# Wait for all the download promises created by make_promises and copy
# status, caching headers, and (on success) the raw response body into
# each feed / its cache file.
sub fetch_feeds {
  my $feeds = shift;
  $log->info("Fetching feeds...");
  my @promises = map { $_->{promise} } @$feeds;
  Mojo::Promise->all(@promises)->then(sub {
    my @results = @_;
    # all() resolves in the same order as the promises were given,
    # so result $i belongs to feed $i.
    for my $i (0 .. $#results) {
      my $feed = $feeds->[$i];
      my $tx = $results[$i][0];
      # a failed download resolved to 0 in the catch handler of make_promises
      next unless $tx;
      my $result = $tx->result;
      $feed->{message} = $result->message;
      $feed->{code} = $result->code;
      $feed->{last_modified} = $result->headers->last_modified;
      $feed->{etag} = $result->headers->etag;
      # save raw bytes if this is a success
      eval { write_binary($feed->{cache_file}, $result->body) } if $result->is_success;
      warn "Unable to write $feed->{cache_file}: $@\n" if $@;
    }
  })->catch(sub {
    warn "Something went wrong: @_";
  })->wait;
}
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# Load the per-OPML-file JSON metadata (message, code, caching headers,
# and possibly title/link) back into the feed hashes. Keys in the JSON
# file are feed URLs.
sub load_feed_metadata {
  my $feeds = shift;
  my $files = shift;
  for my $file (@$files) {
    my $filename = "$file->{path}/$file->{name}";
    next unless -r "$filename.json";
    my $data = decode_json read_binary("$filename.json");
    # Only the feeds belonging to this OPML file are updated. (The grep
    # used to sit after the loop as a statement of its own, where it was
    # a void-context no-op; compare save_feed_metadata, which filters
    # the same way.)
    for my $feed (grep { $_->{opml_file} eq $file->{file} } @$feeds) {
      my $url = $feed->{url};
      # don't overwrite title and htmlUrl from OPML file
      $feed->{title} = $data->{$url}->{title} if $data->{$url}->{title};
      $feed->{link} ||= $data->{$url}->{link};
      # all the other metadata is loaded from the JSON file
      $feed->{message} = $data->{$url}->{message};
      $feed->{code} = $data->{$url}->{code};
      $feed->{last_modified} = $data->{$url}->{last_modified};
      $feed->{etag} = $data->{$url}->{etag};
    }
  }
}
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
# Write the metadata of the feeds belonging to each OPML file into that
# file's companion JSON file, keyed by feed URL.
sub save_feed_metadata {
  my ($feeds, $files) = @_;
  for my $file (@$files) {
    my @mine = grep { $_->{opml_file} eq $file->{file} } @$feeds;
    my %data;
    for my $feed (@mine) {
      # keep only the attributes that are actually set (true)
      my %attributes = map  { $_ => $feed->{$_} }
		       grep { $feed->{$_} }
		       qw(title link message code last_modified etag);
      $data{ $feed->{url} } = \%attributes;
    }
    write_binary("$file->{path}/$file->{name}.json", encode_json \%data);
  }
}
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Delete cached files that no current feed would write, i.e. leftovers
# from feeds that were removed from the OPML file.
sub cleanup_cache {
  my $feeds = shift;
  # Every file a current feed maps to is wanted; anything else is stale.
  my %wanted;
  $wanted{$_} = 1 for @{ cache_files($feeds) };
  my @stale = grep { !$wanted{$_} } @{ existing_files($feeds) };
  return unless @stale;
  $log->info("Removing unused files from the cache...");
  $log->info($_) for @stale;
  unlink @stale;
}
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Return a reference to the list of files currently present in the cache
# directories of the given feeds.
sub existing_files {
  my $feeds = shift;
  my @found;
  # Look at every distinct cache directory exactly once.
  my @dirs = uniq map { $_->{cache_dir} } @$feeds;
  for my $dir (@dirs) {
    push @found, glob("$dir/*");
  }
  return \@found;
}
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# Return a reference to the list of cache file names, one per feed,
# in feed order.
sub cache_files {
  my $feeds = shift;
  return [ map { $_->{cache_file} } @$feeds ];
}
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
# Create every distinct cache directory used by the feeds, unless it
# already exists.
sub make_directories {
  my $feeds = shift;
  my @dirs = uniq map { $_->{cache_dir} } @$feeds;
  mkdir $_ for grep { !-d $_ } @dirs;
}
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
# Generate the HTML page and the companion feed from the cached entries
# of the feeds listed in the OPML files given on the command line.
sub make_html {
  my ($feeds, $files) = read_opml(@_);
  load_feed_metadata($feeds, $files); # load messages and codes for feeds
  my $globals = globals($files);
  my $entries = entries($feeds, 4); # set data for feeds, too
  add_data($feeds, $entries); # extract data from the xml
  save_feed_metadata($feeds, $files); # save title and link for feeds
  $entries = limit($entries, 100);
  my $page = apply_template(read_text(html_template_file(@_)), $globals, $feeds, $entries);
  write_text(html_file(@_), $page);
  my $feed = apply_template(read_text(feed_template_file(@_)), $globals, $feeds, $entries);
  write_text(feed_file(@_), $feed);
}
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# The first argument ending in ".html" names the HTML file to generate;
# without one, fall back to "index.html".
sub html_file {
  for my $arg (@_) {
    return $arg if $arg =~ /\.html$/;
  }
  return 'index.html';
}
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# Of the ".html" arguments, the first is the output file and the second,
# if given, is the template; otherwise use the template shipped with the
# distribution. Dies if the chosen template is not readable.
sub html_template_file {
  my (undef, $template) = grep { /\.html$/ } @_;
  $template ||= dist_file('App-jupiter', 'template.html');
  die "HTML template $template not found\n" unless -r $template;
  return $template;
}
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
# The first argument with a feed extension (.xml, .rss, .atom) names the
# feed file to generate; default is "feed.xml".
sub feed_file {
  my ($named) = grep { /\.(?:xml|rss|atom)$/ } @_;
  return $named // 'feed.xml';
}
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# The second argument with a feed extension, if any, is the feed
# template; otherwise use the template shipped with the distribution.
sub feed_template_file {
  my (undef, $template) = grep { /\.(?:xml|rss|atom)$/ } @_;
  return $template if $template;
  return dist_file('App-jupiter', 'feed.rss');
}
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
# Render a Mojo::Template: first argument is the template text, the rest
# are passed through to the template as its arguments.
sub apply_template {
  return Mojo::Template->new->render(@_);
}
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=head1 TEMPLATES |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
The page template is called with three hash references: C<globals>, C<feeds>, |
512
|
|
|
|
|
|
|
and C<entries>. The keys of these three hash references are documented below. |
513
|
|
|
|
|
|
|
The values of these hashes are all I<escaped HTML> except where noted (dates and |
514
|
|
|
|
|
|
|
file names, for example). |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
The technical details of how to write the templates are documented in the man |
517
|
|
|
|
|
|
|
page for L<Mojo::Template>. |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=head2 Globals |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
There are not many global keys. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
B<date> is the publication date of the HTML page, in ISO date format:
524
|
|
|
|
|
|
|
YYYY-MM-DD. |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
B<files> is the list of OPML files used. |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=cut |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# Build the hash of global template values: todays's date in ISO format
# (YYYY-MM-DD) and the list of OPML files used. (A stray, unused
# "my @time = gmtime" was removed.)
sub globals {
  my $files = shift;
  my $today = DateTime->now->ymd;
  return { date => $today, files => $files };
}
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head2 Writing templates for feeds |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Feeds have the following keys available: |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
B<title> is the title of the feed. |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
B<url> is the URL of the feed (RSS or Atom). This is not the link to the site! |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
B<link> is the URL of the web page (HTML). This is the link to the site. |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
B<opml_file> is the file name where this feed is listed. |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
B<cache_dir> is the directory where this feed is cached. |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
B<message> is the HTTP status message or other warning or error that we got |
552
|
|
|
|
|
|
|
while fetching the feed. |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
B<code> is the HTTP status code we got while fetching the feed. |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
B<doc> is the L<XML::LibXML::Document>. Could be either Atom or RSS! |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=cut |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
# Creates list of feeds. Each feed is a hash with keys title, url, opml_file, |
561
|
|
|
|
|
|
|
# cache_dir and cache_file. |
562
|
|
|
|
|
|
|
# Creates list of feeds. Each feed is a hash with keys title, url, link,
# opml_file, cache_dir and cache_file. Arguments are OPML file names and
# optional /regex/ filters limiting the feeds to those whose URL or
# title matches. Returns (\@feeds, \@files), the feeds shuffled.
sub read_opml {
  my (@feeds, @files);
  my @filters = map { decode(locale => substr($_, 1, -1)) } grep { m{^/.*/$} } @_;
  for my $file (grep { /\.opml$/ } @_) {
    my $doc = XML::LibXML->load_xml(location => $file); # this better have no errors!
    my @nodes = $doc->findnodes('//outline[./@xmlUrl]');
    my ($name, $path) = fileparse($file, '.opml', '.xml');
    push @feeds, map {
      my $title = xml_escape $_->getAttribute('title');
      my $url = xml_escape $_->getAttribute('xmlUrl');
      # Drop feeds matching none of the filters by returning the empty
      # list. "next" must not be used to exit a map block (perldoc -f
      # next): it jumped to the enclosing for loop, silently discarding
      # the remaining feeds of this file and its @files entry.
      if (@filters == 0 or grep { $url =~ /$_/ or $title =~ /$_/ } @filters) {
	my $link = xml_escape $_->getAttribute('htmlUrl');
	{
	  title => $title,	# title in the OPML file
	  url => $url,		# feed URL in the OPML file
	  link => $link,	# web URL in the OPML file
	  opml_file => $file,
	  cache_dir => "$path/$name",
	  cache_file => "$path/$name/" . slugify($url),
	};
      } else {
	();
      }
    } @nodes;
    warn "No feeds found in the OPML file $file\n" unless @nodes;
    push @files, { file => $file, path => $path, name => $name };
  }
  # Shuffle so no single site is always the first to be fetched or shown.
  @feeds = shuffle @feeds;
  return \@feeds, \@files;
}
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
# Load the cached documents of all feeds and collect their newest entries.
# Takes a reference to the feed list and a per-feed limit; returns a reference
# to the collected entry hashes. Feeds that cannot be parsed or are empty get
# a message and an HTTP-like status code instead.
sub entries {
  my ($feeds, $limit) = @_;
  my $cutoff = DateTime->now(time_zone => 'UTC')->subtract( days => 90 ); # compute once
  my $now = DateTime->now(time_zone => 'UTC');
  my @entries;
  for my $feed (@$feeds) {
    next unless -r $feed->{cache_file};
    my $doc = eval { XML::LibXML->load_xml(recover => 2, location => $feed->{cache_file} )};
    unless ($doc) {
      $feed->{message} = xml_escape "Parsing error: $@";
      $feed->{code} = 422; # unprocessable
      next;
    }
    $feed->{doc} = $doc;
    my @nodes = $xpc->findnodes("/rss/channel/item | /atom:feed/atom:entry", $doc);
    unless (@nodes) {
      $feed->{message} = "Empty feed";
      $feed->{code} = 204; # no content
      next;
    }
    # if this is an Atom feed, we need to sort the entries ourselves (older entries at the end)
    my @candidates;
    for my $node (@nodes) {
      push @candidates, {
        element => $node,
        id      => id($node),
        date    => updated($node) || $undefined_date,
      };
    }
    # drop entries claiming to be from the future, newest first, deduplicated
    @candidates = grep { DateTime->compare($_->{date}, $now) <= 0 } @candidates;
    @candidates = unique(sort { DateTime->compare( $b->{date}, $a->{date} ) } @candidates);
    @candidates = @candidates[0 .. min($#candidates, $limit - 1)];
    # now that we have limited the candidates, let's add more metadata from the feed
    for my $entry (@candidates) {
      $entry->{feed} = $feed;
      # these two are already escaped
      $entry->{blog_title} = $feed->{title};
      $entry->{blog_url} = $feed->{url};
    }
    add_age_warning($feed, \@candidates, $cutoff);
    push @entries, @candidates;
  }
  return \@entries;
}
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
# Mark a feed as stale (code 206, message) when neither the feed itself nor
# any of its entries has been updated at or after the given cutoff date.
sub add_age_warning {
  my ($feed, $entries, $date) = @_;
  # the feed's own modification date is older than the cutoff
  my ($channel) = $xpc->findnodes("/rss/channel | /atom:feed", $feed->{doc});
  my $feed_date = updated($channel);
  if ($feed_date and DateTime->compare($feed_date, $date) == -1) {
    $feed->{message} = "No feed updates in 90 days";
    $feed->{code} = 206; # partial content
    return;
  }
  # otherwise, any entry dated at or after the cutoff clears the warning
  for my $entry (@$entries) {
    return if DateTime->compare($entry->{date}, $date) >= 0;
  }
  $feed->{message} = "No entry newer than 90 days";
  $feed->{code} = 206; # partial content
}
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# Extract the modification date of an RSS item or Atom entry (or the channel
# or feed element itself) and return it as a DateTime object, or nothing when
# no date element exists or none of the formats parse.
sub updated {
  my $node = shift;
  return unless $node;
  my @dates = $xpc->findnodes('pubDate | atom:published | atom:updated', $node) or return;
  my $raw = $dates[0]->textContent;
  # try RFC 2822 (RSS), then ISO 8601 (Atom), then RFC 2822 with
  # French day and month names translated to English
  return eval { DateTime::Format::Mail->parse_datetime($raw) }
      || eval { DateTime::Format::ISO8601->parse_datetime($raw) }
      || eval { DateTime::Format::Mail->parse_datetime(french($raw)) };
}
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
# Translate French weekday and month names in a date string to English so
# that DateTime::Format::Mail can parse it. Uses the file-level %wday/%month
# maps and their matching $wday_re/$month_re patterns.
sub french {
  my $str = shift;
  $str =~ s/^($wday_re)/$wday{$1}/;   # weekday only at the start
  $str =~ s/\b($month_re)/$month{$1}/; # month anywhere on a word boundary
  return $str;
}
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# Return a stable identifier for an RSS item or Atom entry, used for
# deduplication. Atom mandates atom:id; for RSS we fall back on link,
# title, or description, whichever is first non-empty.
sub id {
  my $node = shift;
  return unless $node;
  my $id = $xpc->findvalue('guid | atom:id', $node); # id is mandatory for Atom
  # one of the following three is mandatory for RSS
  for my $fallback ('link', 'title', 'description') {
    last if $id;
    $id = $node->findvalue($fallback);
  }
  return $id;
}
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
# Filter a list of entry hashes down to the first occurrence of each id,
# preserving the original order.
sub unique {
  my %seen;
  return grep { not $seen{$_->{id}}++ } @_;
}
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
# Reduce the combined entry list to the most recent entries overall.
# Sorts and deduplicates @$entries in place (callers see the sorted list),
# then returns a reference to at most $limit of the newest entries.
sub limit {
  my ($entries, $limit) = @_;
  # we want the most recent entries overall
  my @sorted = sort { DateTime->compare( $b->{date}, $a->{date} ) } unique(@$entries);
  @$entries = @sorted;
  return [ @sorted[0 .. min($#sorted, $limit - 1)] ];
}
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
=head2 Writing templates for entries |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
Entries have the following keys available: |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
B<title> is the title of the post. |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
B<link> is the URL to the post on the web (probably an HTML page). |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
B<blog_title> is the title of the site. |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
B<blog_link> is the URL for the site on the web (probably an HTML page). |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
B<blog_url> is the URL for the site's feed (RSS or Atom). |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
B<authors> are the authors (or the Dublin Core contributor), a list of strings. |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
B<date> is the publication date, as a DateTime object. |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
B<day> is the publication date, in ISO date format: YYYY-MM-DD, for the UTC |
722
|
|
|
|
|
|
|
timezone. The UTC timezone is picked so that the day doesn't jump back and forth |
723
|
|
|
|
|
|
|
when sorting entries by date. |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
B<content> is the full post content, as string or encoded HTML. |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
B<excerpt> is the post content, limited to 500 characters, with paragraph |
728
|
|
|
|
|
|
|
separators instead of HTML elements, as HTML. It is not encoded because the idea |
729
|
|
|
|
|
|
|
is that it only gets added to the HTML and not to the feed, and the HTML it |
730
|
|
|
|
|
|
|
contains is very controlled (only the pilcrow sign inside a span to indicate |
731
|
|
|
|
|
|
|
paragraph breaks). |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
B<categories> are the categories, a list of strings. |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
B<element> is for internal use only. It contains the L<XML::LibXML::Element> |
736
|
|
|
|
|
|
|
object. This could be RSS or Atom! |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
B<feed> is for internal use only. It's a reference to the feed this entry |
739
|
|
|
|
|
|
|
belongs to. |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=cut |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
# Fill in the remaining template keys on feeds and entries from the parsed
# feed documents: titles, links, authors, dates, categories, content and
# excerpt. A note on the use of xml_escape: whenever we get data from the
# feed itself, it needs to be escaped if it gets printed into the HTML.
# For example: a feed title of "Foo &amp; Bar" comes back from findvalue as
# "Foo & Bar", but the template must insert "Foo &amp; Bar" again. Thus: any
# text we get from the feed needs to be escaped if we might print it.
sub add_data {
  my ($feeds, $entries) = @_;
  for my $feed (@$feeds) {
    next unless $feed->{doc};
    # title and url in the feed override title and xmlUrl set in the OPML (XML already escaped)
    $feed->{title} = xml_escape($xpc->findvalue('/rss/channel/title | /atom:feed/atom:title', $feed->{doc})) || $feed->{title} || "";
    $feed->{url} = xml_escape($xpc->findvalue('/atom:feed/atom:link[@rel="self"]/@href', $feed->{doc})) || $feed->{url} || "";
    # link in the feed does not override htmlUrl in the OPML (XML already escaped)
    $feed->{link} = $feed->{link} || xml_escape($xpc->findvalue('/rss/channel/link | /atom:feed/atom:link[@rel="alternate"][@type="text/html"]/@href', $feed->{doc})) || "";
    # if they just pasted a domain "foo" then "//foo" is a valid URL
    $feed->{link} = "//" . $feed->{link} unless $feed->{link} =~ /\/\//;
  }
  for my $entry (@$entries) {
    # copy from the feed (XML is already escaped)
    $entry->{blog_link} = $entry->{feed}->{link};
    $entry->{blog_title} = $entry->{feed}->{title};
    $entry->{blog_url} = $entry->{feed}->{url};
    # parse the elements
    my $element = $entry->{element};
    $entry->{title} = xml_escape $xpc->findvalue('title | atom:title', $element) || "Untitled";
    # candidate links, sorted by preference!
    my @links = map { xml_escape $_->to_literal }
        map { $xpc->findnodes($_, $element) }
        qw(link atom:link[@rel="alternate"][@type="text/html"]/@href atom:link/@href);
    $entry->{link} = shift(@links) || "";
    # entry-level authors first, falling back to feed-level ones
    my @authors = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes(
      'author | atom:author/atom:name | atom:contributor/atom:name | dc:creator | dc:contributor', $element);
    @authors = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes(
      '/atom:feed/atom:author/atom:name | '
      . '/atom:feed/atom:contributor/atom:name | '
      . '/rss/channel/dc:creator | '
      . '/rss/channel/dc:contributor | '
      . '/rss/channel/webMaster ', $element) unless @authors;
    $entry->{authors} = @authors ? \@authors : undef; # key must exist in the hash
    if (DateTime->compare($entry->{date}, $undefined_date) == 0) {
      $entry->{day} = "(no date found)";
    } else {
      $entry->{day} = $entry->{date}->clone->set_time_zone('UTC')->ymd; # operate on a clone
    }
    my @categories = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes('category | atom:category/@term', $element);
    $entry->{categories} = @categories ? \@categories : undef; # key must exist in the hash
    $entry->{excerpt} = '';
    $entry->{content} = '';
    my @nodes = $xpc->findnodes('description[text()!=""] | atom:content[text()!=""]', $element);
    @nodes = $xpc->findnodes('summary[text()!=""] | atom:summary[text()!=""] | itunes:summary[text()!=""]', $element) unless @nodes;
    my $content = shift(@nodes);
    # The default is that the content is either plain text or escaped HTML,
    # which is what we want for RSS. For Atom feeds, type="xhtml" means that
    # the content is XHTML, so it needs to be escaped.
    if ($content) {
      my $is_xhtml = $content->hasAttribute("type") && $content->getAttribute("type") eq "xhtml";
      $entry->{excerpt} = excerpt($content->to_literal);
      for my $child ($content->childNodes()) {
        my $chunk = $child->toString();
        $chunk = xml_escape $chunk if $is_xhtml;
        $entry->{content} .= $chunk;
      }
    }
  }
}
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
# Turn post content (HTML or plain text) into a short excerpt: at most 500
# characters, XML-escaped, with pilcrow spans marking paragraph breaks.
# Returns '(no excerpt)' when there is no content or it cannot be parsed.
sub excerpt {
  my $content = shift;
  return '(no excerpt)' unless $content;
  my $doc = eval { XML::LibXML->load_html(recover => 2, string => $content) };
  # FIX: the eval guarded load_html, but $doc was never checked, so a parse
  # failure crashed on the first method call; fall back to the sentinel.
  return '(no excerpt)' unless $doc;
  my $separator = "¶";
  # drop stylesheet text entirely, it is not content
  for my $node ($doc->findnodes('//style')) {
    $node->parentNode->removeChild($node);
  }
  # mark the end of every block-level element with a pilcrow
  for my $node ($doc->findnodes('//p | //br | //blockquote | //li | //td | //th | //div')) {
    $node->appendTextNode($separator);
  }
  my $text = strip_html($doc->textContent());
  $text =~ s/( +|----+)/ /g;
  # collapse whitespace and trim
  $text =~ s/\s+/ /g;
  $text = trim $text;
  # replace paragraph repeats with their surrounding spaces
  $text =~ s/ ?¶( ?¶)* ?/¶/g;
  $text =~ s/^¶//;
  $text =~ s/¶$//;
  # truncate to 500 characters, adding an ellipsis when content was cut
  my $len = length($text);
  $text = substr($text, 0, 500);
  $text .= "…" if $len > 500;
  $text = xml_escape $text;
  $text =~ s/¶/<span class="paragraph">¶ <\/span>/g;
  return $text;
}
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
# When there's a value that's supposed to be text but isn't, then we can try to |
839
|
|
|
|
|
|
|
# turn it to HTML and from there to text... This is an ugly hack and I wish it |
840
|
|
|
|
|
|
|
# wasn't necessary. |
841
|
|
|
|
|
|
|
# When there's a value that's supposed to be text but isn't, try to parse it
# as HTML and return just the text content. Returns '' for empty input, and
# the input unchanged when it cannot be parsed. This is an ugly hack and I
# wish it wasn't necessary.
sub strip_html {
  my $markup = shift;
  return '' unless $markup;
  my $doc = eval { XML::LibXML->load_html(string => $markup) };
  return $doc ? $doc->textContent() : $markup;
}
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=head1 SEE ALSO |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
OPML 2.0, L<http://dev.opml.org/spec2.html> |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
RSS 2.0, L<https://cyber.harvard.edu/rss/rss.html> |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
Atom Syndication, L<https://tools.ietf.org/html/rfc4287> |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
River of News, |
858
|
|
|
|
|
|
|
L<http://scripting.com/2014/06/02/whatIsARiverOfNewsAggregator.html> |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=cut |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
1; |