File Coverage

blib/lib/Mojolicious/Plugin/FeedReader.pm

Criterion	Covered	Total	%
statement	181	184	98.3
branch	77	92	83.7
condition	60	77	77.9
subroutine	24	24	100.0
pod	3	8	37.5
total	345	385	89.6

line	stmt	bran	cond	sub	pod	time	code
1							package Mojolicious::Plugin::FeedReader;
2	4			4		2247	use Mojo::Base 'Mojolicious::Plugin';
	4					4
	4					21
3
4							our $VERSION = '0.08';
5	4			4		661	use Mojo::Util qw(decode slurp trim);
	4					5
	4					200
6	4			4		19	use Mojo::DOM;
	4					4
	4					68
7	4			4		11	use Mojo::IOLoop;
	4					5
	4					26
8	4			4		921	use HTTP::Date;
	4					2286
	4					198
9	4			4		15	use Carp qw(carp croak);
	4					5
	4					179
10	4			4		14	use Scalar::Util qw(blessed);
	4					4
	4					8791
11
12							our @time_fields
13							= (qw(pubDate published created issued updated modified dc\:date));
14							our %is_time_field = map { $_ => 1 } @time_fields;
15
16							# feed mime-types:
17							our @feed_types = (
18							'application/x.atom+xml', 'application/atom+xml',
19							'application/xml', 'text/xml',
20							'application/rss+xml', 'application/rdf+xml'
21							);
22							our %is_feed = map { $_ => 1 } @feed_types;
23
24							sub register {
25	4			4	1	123	my ($self, $app) = @_;
26	4					7	foreach my $method (
27							qw( find_feeds parse_rss parse_opml ))
28							{
29	12					158	$app->helper($method => \&{$method});
	12					44
30							}
31	4					38	$app->helper(parse_feed => \&parse_rss);
32							}
33
34							sub make_dom {
35	25			25	0	37	my ($xml) = @_;
36	25					24	my $rss;
37	25	100	66			127	if (!ref $xml) { # assume file
		100	33
		100
		50
38	9					20	$rss = slurp $xml;
39	9	50				876	die "Unable to read file $xml: $!" unless ($rss);
40							}
41							elsif (ref $xml eq 'SCALAR') { # assume string
42	11					13	$rss = $$xml;
43							}
44							elsif (blessed $xml && $xml->isa('Mojo::DOM')) { # Mojo::DOM (old style)
45	1					2	return $xml;
46							}
47							elsif (blessed $xml && $xml->can('slurp')) { # Mojo::Asset or similar
48	4					12	$rss = $xml->slurp;
49							}
50							else {
51	0					0	die "don't know how to make a Mojo::DOM from object $xml";
52							}
53							#my $rss_str = decode 'UTF-8', $rss;
54	24					193	my $rss_str = $rss;
55	24	50				49	die "Failed to read asset $xml (as UTF-8): $!" unless ($rss_str);
56	24					104	return Mojo::DOM->new->parse($rss_str);
57							}
58
59							sub parse_rss {
60	25			25	0	57980	my ($c, $xml, $cb) = @_;
61	25	100	100			152	if (blessed $xml && $xml->isa('Mojo::URL')) {
62							# this is the only case where we might go non-blocking:
63	4	100	66			17	if ($cb && ref $cb eq 'CODE') {
64							return
65							$c->ua->get(
66							$xml,
67							sub {
68	1			1		5458	my ($ua, $tx) = @_;
69	1					2	my $feed;
70	1	50				6	if ($tx->success) {
71	1					18	my $body = $tx->res->body;
72	1					17	my $dom = make_dom(\$body);
73	1					2634	eval { $feed = parse_rss_dom($dom); };
	1					2
74							}
75	1					4	$c->$cb($feed);
76							}
77	1					6	);
78							}
79							else {
80	3					24	my $tx = $c->ua->get($xml);
81	3	50				14967	if ($tx->success) {
82	3					50	my $body = $tx->res->body;
83	3					49	$xml = \$body;
84							}
85							else {
86	0	0				0	croak "Error getting feed from url $xml: ", (($tx->error) ? $tx->error->{message} : '');
87							}
88							}
89							}
90	24					57	my $dom = make_dom($xml);
91	24	50				295594	return ($dom) ? parse_rss_dom($dom) : 1;
92							}
93
94							sub parse_rss_dom {
95	25			25	0	126	my ($dom) = @_;
96	25	50				107	die "Argument $dom is not a Mojo::DOM" unless ($dom->isa('Mojo::DOM'));
97	25					46	my $feed = parse_rss_channel($dom); # Feed properties
98	25					87	my $items = $dom->find('item');
99	25					62412	my $entries = $dom->find('entry'); # Atom
100	25					64194	my $res = [];
101	25					81	foreach my $item ($items->each, $entries->each) {
102	130					302	push @$res, parse_rss_item($item);
103							}
104	25	100				92	if (@$res) {
105	20					38	$feed->{'items'} = $res;
106							}
107	25					3891	return $feed;
108							}
109
110							sub parse_rss_channel {
111	25			25	0	27	my ($dom) = @_;
112	25					26	my %info;
113	25					50	foreach my $k (
114							qw{title subtitle description tagline link:not([rel]) link[rel=alternate] dc\:creator author webMaster},
115							@time_fields
116							)
117							{
118	400		100			2338	my $p = $dom->at("channel > $k") \|\| $dom->at("feed > $k"); # direct child
119	400	100				1852034	if ($p) {
120	94		100			372	$info{$k} = $p->text \|\| $p->content \|\| $p->attr('href');
121	94	100	100			7391	if ($k eq 'author' && $p->at('name')) {
122	14		33			1496	$info{$k} = $p->at('name')->text \|\| $p->at('name')->content;
123							}
124	94	100				2206	if ($is_time_field{$k}) {
125	17					63	$info{$k} = str2time($info{$k});
126							}
127							}
128							}
129							my ($htmlUrl)
130	50					87	= grep { defined $_ }
131	25					138	map { delete $info{$_} } ('link:not([rel])', 'link[rel=alternate]');
	50					106
132							my ($description)
133	75					79	= grep { defined $_ }
134	25	100				39	map { exists $info{$_} ? $info{$_} : undef }
	75					143
135							(qw(description tagline subtitle));
136	25	100				74	$info{htmlUrl} = $htmlUrl if ($htmlUrl);
137	25	100				66	$info{description} = $description if ($description);
138
139							# normalize fields:
140	25					99	my @replace = (
141							'pubDate' => 'published',
142							'dc\:date' => 'published',
143							'created' => 'published',
144							'issued' => 'published',
145							'updated' => 'published',
146							'modified' => 'published',
147							'dc\:creator' => 'author',
148							'webMaster' => 'author'
149							);
150	25					73	while (my ($old, $new) = splice(@replace, 0, 2)) {
151	200	100	100			501	if ($info{$old} && !$info{$new}) {
152	19					69	$info{$new} = delete $info{$old};
153							}
154							}
155	25	100				98	return (keys %info) ? \%info : undef;
156							}
157
158							sub parse_rss_item {
159	130			130	0	126	my ($item) = @_;
160	130					122	my %h;
161	130					198	foreach my $k (
162							qw(title id summary guid content description content\:encoded xhtml\:body dc\:creator author),
163							@time_fields
164							)
165							{
166	2210					20201	my $p = $item->at($k);
167	2210	100				591745	if ($p) {
168
169							# skip namespaced items - like itunes:summary - unless explicitly
170							# searched:
171							next
172	779	100	100			2972	if ($p->tag =~ /\:/
			100
			100
			100
173							&& $k ne 'content\:encoded'
174							&& $k ne 'xhtml\:body'
175							&& $k ne 'dc\:date'
176							&& $k ne 'dc\:creator');
177	774		66			7706	$h{$k} = $p->text \|\| $p->content;
178	774	100	100			88278	if ($k eq 'author' && $p->at('name')) {
179	22					2289	$h{$k} = $p->at('name')->text;
180							}
181	774	100				5001	if ($is_time_field{$k}) {
182	269					585	$h{$k} = str2time($h{$k});
183							}
184							}
185							}
186
187							# let's handle links seperately, because ATOM loves these buggers:
188							$item->find('link')->each(
189							sub {
190	137			137		48780	my $l = shift;
191	137	100				265	if ($l->attr('href')) {
192	130	100	100			1558	if (!$l->attr('rel') \|\| $l->attr('rel') eq 'alternate') {
193	122					1544	$h{'link'} = $l->attr('href');
194							}
195							}
196							else {
197	7	50				88	if ($l->text =~ /\w+/) {
198	7					376	$h{'link'} = $l->text; # simple link
199							}
200
201							# else { # we have an empty link element with no 'href'. :-(
202							# $h{'link'} = $1 if ($l->next->text =~ m/^(http\S+)/);
203							# }
204							}
205							}
206	130					484	);
207
208							# find tags:
209	130					2460	my @tags;
210							$item->find('category, dc\:subject')
211	130		66	93		256	->each(sub { push @tags, $_[0]->text \|\| $_[0]->attr('term') });
	93					34014
212	130	100				40730	if (@tags) {
213	76					159	$h{'tags'} = \@tags;
214							}
215							#
216							# normalize fields:
217	130					378	my @replace = (
218							'content\:encoded' => 'content',
219							'xhtml\:body' => 'content',
220							'summary' => 'description',
221							'pubDate' => 'published',
222							'dc\:date' => 'published',
223							'created' => 'published',
224							'issued' => 'published',
225							'updated' => 'published',
226							'modified' => 'published',
227							'dc\:creator' => 'author'
228
229							# 'guid' => 'link'
230							);
231	130					314	while (my ($old, $new) = splice(@replace, 0, 2)) {
232	1300	100	100			3281	if ($h{$old} && !$h{$new}) {
233	126					345	$h{$new} = delete $h{$old};
234							}
235							}
236	130					329	my %copy = ('description' => 'content', link => 'id', guid => 'id');
237	130					276	while (my ($fill, $required) = each %copy) {
238	390	100	100			1231	if ($h{$fill} && !$h{$required}) {
239	8					21	$h{$required} = $h{$fill};
240							}
241							}
242	130					271	$h{"_raw"} = $item->to_string;
243	130					62360	return \%h;
244							}
245
246							# find_feeds - get RSS/Atom feed URL from argument.
247							# Code adapted to use Mojolicious from Feed::Find by Benjamin Trott
248							# Any stupid mistakes are my own
249							sub find_feeds {
250	13			13	1	52820	my $self = shift;
251	13					15	my $url = shift;
252	13	100				30	my $cb = (ref $_[-1] eq 'CODE') ? pop @_ : undef;
253							# $self->ua->max_redirects(5)->connect_timeout(30);
254							my $main = sub {
255	13			13		14	my ($tx) = @_;
256	13					14	my @feeds;
257							# if ($tx->success) { say $tx->res->code } else { say $tx->error };
258	13	100	66			63	return unless ($tx->success && $tx->res->code == 200);
259	12					281	eval { @feeds = _find_feed_links($self, $tx->req->url, $tx->res); };
	12					22
260	12	50				31	if ($@) {
261	0					0	croak "Exception in find_feeds - ", $@;
262							}
263	12					88	return (@feeds);
264	13					47	};
265	13	100				22	if ($cb) { # non-blocking:
266							$self->ua->get(
267							$url,
268							sub {
269	3			3		14144	my ($ua, $tx) = @_;
270	3					6	my (@feeds) = $main->($tx);
271	3					12	$cb->(@feeds);
272							}
273	3					19	);
274							}
275							else {
276	10					43	my $tx = $self->ua->get($url);
277	10					41373	return $main->($tx);
278							}
279							}
280
281							sub _find_feed_links {
282	12			12		73	my ($self, $url, $res) = @_;
283
284	12					16	state $feed_ext = qr/\.(?:rss\|xml\|rdf)$/;
285	12					11	my @feeds;
286
287							# use split to remove charset attribute from content_type
288	12					21	my ($content_type) = split(/[; ]+/, $res->headers->content_type);
289	12	100				164	if ($is_feed{$content_type}) {
290	2					8	push @feeds, Mojo::URL->new($url)->to_abs;
291							}
292							else {
293							# we are in a web page. PHEAR.
294	10		33			29	my $base = Mojo::URL->new(
295							$res->dom->find('head base')->map('attr', 'href')->join('') \|\| $url)->to_abs($url);
296	10		33			172229	my $title
297							= $res->dom->find('head > title')->map('text')->join('') \|\| $url;
298							$res->dom->find('head link')->each(
299							sub {
300	131			131		30195	my $attrs = $_->attr();
301	131	100				1092	return unless ($attrs->{'rel'});
302	129					230	my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'});
	129					237
303	129	100				181	my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : '';
304	129	50	33			366	if ($is_feed{$type} && ($rel{'alternate'} \|\| $rel{'service.feed'})) {
			66
305	8					21	push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base);
306							}
307							}
308	10					25901	);
309							$res->dom->find('a')->grep(
310							sub {
311	374	50		374		77428	$_->attr('href')
312							&& Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io;
313							}
314							)->each(
315							sub {
316	2			2		502	push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base);
317							}
318	10					1565	);
319	10	100				3825	unless (@feeds)
320							{ # call me crazy, but maybe this is just a feed served as HTML?
321	5					20	my $body = $res->body;
322	5	100				117	if ($self->parse_feed(\$body)) {
323	1					6	push @feeds, Mojo::URL->new($url)->to_abs;
324							}
325							}
326							}
327	12					560	return @feeds;
328							}
329
330							sub parse_opml {
331	3			3	1	12627	my ($self, $opml_file) = @_;
332	3	50				15	my $opml_str = decode 'UTF-8',
333							(ref $opml_file) ? $opml_file->slurp : slurp $opml_file;
334	3					3505	my $d = Mojo::DOM->new->parse($opml_str);
335	3					93792	my (%subscriptions, %categories);
336	3					17	for my $item ($d->find(q{outline})->each) {
337	962					112811	my $node = $item->attr;
338	962	100				6951	if (!defined $node->{xmlUrl}) {
339	40		66			92	my $cat = $node->{title} \|\| $node->{text};
340	40					75	$categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl');
341							}
342							else { # file by RSS URL:
343	922					1710	$subscriptions{$node->{xmlUrl}} = $node;
344							}
345							}
346
347
348							# assign categories
349	3					263	for my $cat (keys %categories) {
350	40					78	for my $rss ($categories{$cat}->each) {
351	920	50				1529	next unless ($subscriptions{$rss}); # don't auto-vivify for empty "categories"
352	920		100			2060	$subscriptions{$rss}{'categories'} \|\|= [];
353	920					516	push @{$subscriptions{$rss}{'categories'}}, $cat;
	920					1369
354							}
355							}
356	3					971	return (values %subscriptions);
357							}
358
359
360							1;
361
362							=encoding utf-8
363
364							=head1 NAME
365
366							Mojolicious::Plugin::FeedReader - Mojolicious plugin to find and parse RSS & Atom feeds
367
368							=head1 SYNOPSIS
369
370							# Mojolicious
371							$self->plugin('FeedReader');
372
373							# Mojolicious::Lite
374							plugin 'FeedReader';
375
376							# Blocking:
377							get '/b' => sub {
378							my $self = shift;
379							my ($feed) = $self->find_feeds(q{search.cpan.org});
380							my $out = $self->parse_feed($feed);
381							$self->render(template => 'uploads', items => $out->{items});
382							};
383
384							# Non-blocking:
385							get '/nb' => sub {
386							my $self = shift;
387							$self->render_later;
388							my $delay = Mojo::IOLoop->delay(
389							sub {
390							$self->find_feeds("search.cpan.org", shift->begin(0));
391							},
392							sub {
393							my $feed = pop;
394							$self->parse_feed($feed, shift->begin);
395							},
396							sub {
397							my $data = pop;
398							$self->render(template => 'uploads', items => $data->{items});
399							});
400							$delay->wait unless Mojo::IOLoop->is_running;
401							};
402
403							app->start;
404
405							__DATA__
406
407							@@ uploads.html.ep
408
409							% for my $item (@$items) {
410							<%= link_to $item->{title} => $item->{link} %> - <%= $item->{description} %>
411							% }
412
413
414							=head1 DESCRIPTION
415
416							B implements minimalistic helpers for identifying,
417							fetching and parsing RSS and Atom Feeds. It has minimal dependencies, relying as
418							much as possible on Mojolicious components - Mojo::UserAgent for fetching feeds and
419							checking URLs, Mojo::DOM for XML/HTML parsing.
420							It is therefore rather fragile and naive, and should be considered Experimental/Toy
421							code - B.
422
423
424							=head1 METHODS
425
426							L inherits all methods from
427							L and implements the following new ones.
428
429							=head2 register
430
431							$plugin->register(Mojolicious->new);
432
433							Register plugin in L application. This method will install the helpers
434							listed below in your Mojolicious application.
435
436							=head1 HELPERS
437
438							B implements the following helpers.
439
440							=head2 find_feeds
441
442							# Call blocking
443							my (@feeds) = app->find_feeds('search.cpan.org');
444							# @feeds is a list of Mojo::URL objects
445
446							# Call non-blocking
447							$self->find_feeds('http://example.com', sub {
448							my (@feeds) = @_;
449							unless (@feeds) {
450							$self->render_exception("no feeds found, " . $info->{error});
451							}
452							else {
453							....
454							}
455							});
456
457							A Mojolicious port of L by Benjamin Trott. This helper implements feed auto-discovery for finding syndication feeds, given a URI.
458							If given a callback function as an additional argument, execution will be non-blocking.
459
460							=head2 parse_feed
461
462							# parse an RSS/Atom feed
463							# blocking
464							my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot');
465							my $feed = $self->parse_feed($url);
466							for my $item (@{$feed->{items}}) {
467							say $_ for ($item->{title}, $item->{description}, 'Tags: ' . join q{,}, @{$item->{tags}});
468							}
469
470							# non-blocking
471							$self->parse_feed($url, sub {
472							my ($c, $feed) = @_;
473							$c->render(text => "Feed tagline: " . $feed->{tagline});
474							});
475
476							# parse a file
477							$feed2 = $self->parse_feed('/downloads/foo.rss');
478
479							# parse response
480							$self->ua->get($feed_url, sub {
481							my ($ua, $tx) = @_;
482							my $feed = $self->parse_feed($tx->res);
483							});
484
485							A minimalist liberal RSS/Atom parser, using Mojo::DOM queries.
486
487							Dates are parsed using L.
488
489							If parsing fails (for example, the parser was given an HTML page), the helper will return undef.
490
491							On success, the result returned is a hashref with the following keys:
492
493							=over 4
494
495							=item * title
496
497							=item * description (may be filled from subtitle or tagline if absent)
498
499							=item * htmlUrl - web page URL associated with the feed
500
501							=item * items - array ref of feed news items
502
503							=item * subtitle (optional)
504
505							=item * tagline (optional)
506
507							=item * author (name of author field, or dc:creator or webMaster)
508
509							=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
510
511							=back
512
513							Each item in the items array is a hashref with the following keys:
514
515							=over 4
516
517							=item * title
518
519							=item * link
520
521							=item * content (may be filled with content:encoded, xhtml:body or description fields)
522
523							=item * id (will be equal to link or guid if it is undefined and either of those fields exists)
524
525							=item * description (optional) - usually a shorter form of the content (may be filled with summary if description is missing)
526
527							=item * guid (optional)
528
529							=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
530
531							=item * author (may be filled from author or dc:creator)
532
533							=item * tags (optional) - array ref of tags, categories or dc:subjects.
534
535							=item * _raw - XML serialized text of the item's Mojo::DOM node. Note that this can be different from the original XML text in the feed.
536
537							=back
538
539							=head2 parse_opml
540
541							my @subscriptions = app->parse_opml( 'mysubs.opml' );
542							foreach my $sub (@subscriptions) {
543							say 'RSS URL is: ', $sub->{xmlUrl};
544							say 'Website URL is: ', $sub->{htmlUrl};
545							say 'categories: ', join ',', @{$sub->{categories}};
546							}
547
548							Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs.
549
550							Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in.
551
552							=head1 CREDITS
553
554							Some tests adapted from L and L Feed autodiscovery adapted from L.
555
556							Test data (web pages, feeds and excerpts) included in this package is intended for testing purposes only, and is not meant in any way
557							to infringe on the rights of the respective authors.
558
559							=head1 COPYRIGHT AND LICENSE
560
561							Copyright (C) 2014, Dotan Dimet.
562
563							This program is free software, you can redistribute it and/or modify it
564							under the terms of the Artistic License version 2.0.
565
566							=head1 SEE ALSO
567
568							L, L, L
569
570							L, L, L
571
572							=cut