File Coverage

blib/lib/Mojolicious/Plugin/FeedReader.pm
Criterion Covered Total %
statement 181 184 98.3
branch 77 92 83.7
condition 60 77 77.9
subroutine 24 24 100.0
pod 3 8 37.5
total 345 385 89.6


line stmt bran cond sub pod time code
1             package Mojolicious::Plugin::FeedReader;
2 4     4   2247 use Mojo::Base 'Mojolicious::Plugin';
  4         4  
  4         21  
3              
4             our $VERSION = '0.08';
5 4     4   661 use Mojo::Util qw(decode slurp trim);
  4         5  
  4         200  
6 4     4   19 use Mojo::DOM;
  4         4  
  4         68  
7 4     4   11 use Mojo::IOLoop;
  4         5  
  4         26  
8 4     4   921 use HTTP::Date;
  4         2286  
  4         198  
9 4     4   15 use Carp qw(carp croak);
  4         5  
  4         179  
10 4     4   14 use Scalar::Util qw(blessed);
  4         4  
  4         8791  
11              
12             our @time_fields
13             = (qw(pubDate published created issued updated modified dc\:date));
14             our %is_time_field = map { $_ => 1 } @time_fields;
15              
16             # feed mime-types:
17             our @feed_types = (
18             'application/x.atom+xml', 'application/atom+xml',
19             'application/xml', 'text/xml',
20             'application/rss+xml', 'application/rdf+xml'
21             );
22             our %is_feed = map { $_ => 1 } @feed_types;
23              
24             sub register {
25 4     4 1 123 my ($self, $app) = @_;
26 4         7 foreach my $method (
27             qw( find_feeds parse_rss parse_opml ))
28             {
29 12         158 $app->helper($method => \&{$method});
  12         44  
30             }
31 4         38 $app->helper(parse_feed => \&parse_rss);
32             }
33              
34             sub make_dom {
35 25     25 0 37 my ($xml) = @_;
36 25         24 my $rss;
37 25 100 66     127 if (!ref $xml) { # assume file
    100 33        
    100          
    50          
38 9         20 $rss = slurp $xml;
39 9 50       876 die "Unable to read file $xml: $!" unless ($rss);
40             }
41             elsif (ref $xml eq 'SCALAR') { # assume string
42 11         13 $rss = $$xml;
43             }
44             elsif (blessed $xml && $xml->isa('Mojo::DOM')) { # Mojo::DOM (old style)
45 1         2 return $xml;
46             }
47             elsif (blessed $xml && $xml->can('slurp')) { # Mojo::Asset or similar
48 4         12 $rss = $xml->slurp;
49             }
50             else {
51 0         0 die "don't know how to make a Mojo::DOM from object $xml";
52             }
53             #my $rss_str = decode 'UTF-8', $rss;
54 24         193 my $rss_str = $rss;
55 24 50       49 die "Failed to read asset $xml (as UTF-8): $!" unless ($rss_str);
56 24         104 return Mojo::DOM->new->parse($rss_str);
57             }
58              
59             sub parse_rss {
60 25     25 0 57980 my ($c, $xml, $cb) = @_;
61 25 100 100     152 if (blessed $xml && $xml->isa('Mojo::URL')) {
62             # this is the only case where we might go non-blocking:
63 4 100 66     17 if ($cb && ref $cb eq 'CODE') {
64             return
65             $c->ua->get(
66             $xml,
67             sub {
68 1     1   5458 my ($ua, $tx) = @_;
69 1         2 my $feed;
70 1 50       6 if ($tx->success) {
71 1         18 my $body = $tx->res->body;
72 1         17 my $dom = make_dom(\$body);
73 1         2634 eval { $feed = parse_rss_dom($dom); };
  1         2  
74             }
75 1         4 $c->$cb($feed);
76             }
77 1         6 );
78             }
79             else {
80 3         24 my $tx = $c->ua->get($xml);
81 3 50       14967 if ($tx->success) {
82 3         50 my $body = $tx->res->body;
83 3         49 $xml = \$body;
84             }
85             else {
86 0 0       0 croak "Error getting feed from url $xml: ", (($tx->error) ? $tx->error->{message} : '');
87             }
88             }
89             }
90 24         57 my $dom = make_dom($xml);
91 24 50       295594 return ($dom) ? parse_rss_dom($dom) : 1;
92             }
93              
94             sub parse_rss_dom {
95 25     25 0 126 my ($dom) = @_;
96 25 50       107 die "Argument $dom is not a Mojo::DOM" unless ($dom->isa('Mojo::DOM'));
97 25         46 my $feed = parse_rss_channel($dom); # Feed properties
98 25         87 my $items = $dom->find('item');
99 25         62412 my $entries = $dom->find('entry'); # Atom
100 25         64194 my $res = [];
101 25         81 foreach my $item ($items->each, $entries->each) {
102 130         302 push @$res, parse_rss_item($item);
103             }
104 25 100       92 if (@$res) {
105 20         38 $feed->{'items'} = $res;
106             }
107 25         3891 return $feed;
108             }
109              
110             sub parse_rss_channel {
111 25     25 0 27 my ($dom) = @_;
112 25         26 my %info;
113 25         50 foreach my $k (
114             qw{title subtitle description tagline link:not([rel]) link[rel=alternate] dc\:creator author webMaster},
115             @time_fields
116             )
117             {
118 400   100     2338 my $p = $dom->at("channel > $k") || $dom->at("feed > $k"); # direct child
119 400 100       1852034 if ($p) {
120 94   100     372 $info{$k} = $p->text || $p->content || $p->attr('href');
121 94 100 100     7391 if ($k eq 'author' && $p->at('name')) {
122 14   33     1496 $info{$k} = $p->at('name')->text || $p->at('name')->content;
123             }
124 94 100       2206 if ($is_time_field{$k}) {
125 17         63 $info{$k} = str2time($info{$k});
126             }
127             }
128             }
129             my ($htmlUrl)
130 50         87 = grep { defined $_ }
131 25         138 map { delete $info{$_} } ('link:not([rel])', 'link[rel=alternate]');
  50         106  
132             my ($description)
133 75         79 = grep { defined $_ }
134 25 100       39 map { exists $info{$_} ? $info{$_} : undef }
  75         143  
135             (qw(description tagline subtitle));
136 25 100       74 $info{htmlUrl} = $htmlUrl if ($htmlUrl);
137 25 100       66 $info{description} = $description if ($description);
138              
139             # normalize fields:
140 25         99 my @replace = (
141             'pubDate' => 'published',
142             'dc\:date' => 'published',
143             'created' => 'published',
144             'issued' => 'published',
145             'updated' => 'published',
146             'modified' => 'published',
147             'dc\:creator' => 'author',
148             'webMaster' => 'author'
149             );
150 25         73 while (my ($old, $new) = splice(@replace, 0, 2)) {
151 200 100 100     501 if ($info{$old} && !$info{$new}) {
152 19         69 $info{$new} = delete $info{$old};
153             }
154             }
155 25 100       98 return (keys %info) ? \%info : undef;
156             }
157              
158             sub parse_rss_item {
159 130     130 0 126 my ($item) = @_;
160 130         122 my %h;
161 130         198 foreach my $k (
162             qw(title id summary guid content description content\:encoded xhtml\:body dc\:creator author),
163             @time_fields
164             )
165             {
166 2210         20201 my $p = $item->at($k);
167 2210 100       591745 if ($p) {
168              
169             # skip namespaced items - like itunes:summary - unless explicitly
170             # searched:
171             next
172 779 100 100     2972 if ($p->tag =~ /\:/
      100        
      100        
      100        
173             && $k ne 'content\:encoded'
174             && $k ne 'xhtml\:body'
175             && $k ne 'dc\:date'
176             && $k ne 'dc\:creator');
177 774   66     7706 $h{$k} = $p->text || $p->content;
178 774 100 100     88278 if ($k eq 'author' && $p->at('name')) {
179 22         2289 $h{$k} = $p->at('name')->text;
180             }
181 774 100       5001 if ($is_time_field{$k}) {
182 269         585 $h{$k} = str2time($h{$k});
183             }
184             }
185             }
186              
187             # let's handle links seperately, because ATOM loves these buggers:
188             $item->find('link')->each(
189             sub {
190 137     137   48780 my $l = shift;
191 137 100       265 if ($l->attr('href')) {
192 130 100 100     1558 if (!$l->attr('rel') || $l->attr('rel') eq 'alternate') {
193 122         1544 $h{'link'} = $l->attr('href');
194             }
195             }
196             else {
197 7 50       88 if ($l->text =~ /\w+/) {
198 7         376 $h{'link'} = $l->text; # simple link
199             }
200              
201             # else { # we have an empty link element with no 'href'. :-(
202             # $h{'link'} = $1 if ($l->next->text =~ m/^(http\S+)/);
203             # }
204             }
205             }
206 130         484 );
207              
208             # find tags:
209 130         2460 my @tags;
210             $item->find('category, dc\:subject')
211 130   66 93   256 ->each(sub { push @tags, $_[0]->text || $_[0]->attr('term') });
  93         34014  
212 130 100       40730 if (@tags) {
213 76         159 $h{'tags'} = \@tags;
214             }
215             #
216             # normalize fields:
217 130         378 my @replace = (
218             'content\:encoded' => 'content',
219             'xhtml\:body' => 'content',
220             'summary' => 'description',
221             'pubDate' => 'published',
222             'dc\:date' => 'published',
223             'created' => 'published',
224             'issued' => 'published',
225             'updated' => 'published',
226             'modified' => 'published',
227             'dc\:creator' => 'author'
228              
229             # 'guid' => 'link'
230             );
231 130         314 while (my ($old, $new) = splice(@replace, 0, 2)) {
232 1300 100 100     3281 if ($h{$old} && !$h{$new}) {
233 126         345 $h{$new} = delete $h{$old};
234             }
235             }
236 130         329 my %copy = ('description' => 'content', link => 'id', guid => 'id');
237 130         276 while (my ($fill, $required) = each %copy) {
238 390 100 100     1231 if ($h{$fill} && !$h{$required}) {
239 8         21 $h{$required} = $h{$fill};
240             }
241             }
242 130         271 $h{"_raw"} = $item->to_string;
243 130         62360 return \%h;
244             }
245              
246             # find_feeds - get RSS/Atom feed URL from argument.
247             # Code adapted to use Mojolicious from Feed::Find by Benjamin Trott
248             # Any stupid mistakes are my own
249             sub find_feeds {
250 13     13 1 52820 my $self = shift;
251 13         15 my $url = shift;
252 13 100       30 my $cb = (ref $_[-1] eq 'CODE') ? pop @_ : undef;
253             # $self->ua->max_redirects(5)->connect_timeout(30);
254             my $main = sub {
255 13     13   14 my ($tx) = @_;
256 13         14 my @feeds;
257             # if ($tx->success) { say $tx->res->code } else { say $tx->error };
258 13 100 66     63 return unless ($tx->success && $tx->res->code == 200);
259 12         281 eval { @feeds = _find_feed_links($self, $tx->req->url, $tx->res); };
  12         22  
260 12 50       31 if ($@) {
261 0         0 croak "Exception in find_feeds - ", $@;
262             }
263 12         88 return (@feeds);
264 13         47 };
265 13 100       22 if ($cb) { # non-blocking:
266             $self->ua->get(
267             $url,
268             sub {
269 3     3   14144 my ($ua, $tx) = @_;
270 3         6 my (@feeds) = $main->($tx);
271 3         12 $cb->(@feeds);
272             }
273 3         19 );
274             }
275             else {
276 10         43 my $tx = $self->ua->get($url);
277 10         41373 return $main->($tx);
278             }
279             }
280              
281             sub _find_feed_links {
282 12     12   73 my ($self, $url, $res) = @_;
283              
284 12         16 state $feed_ext = qr/\.(?:rss|xml|rdf)$/;
285 12         11 my @feeds;
286              
287             # use split to remove charset attribute from content_type
288 12         21 my ($content_type) = split(/[; ]+/, $res->headers->content_type);
289 12 100       164 if ($is_feed{$content_type}) {
290 2         8 push @feeds, Mojo::URL->new($url)->to_abs;
291             }
292             else {
293             # we are in a web page. PHEAR.
294 10   33     29 my $base = Mojo::URL->new(
295             $res->dom->find('head base')->map('attr', 'href')->join('') || $url)->to_abs($url);
296 10   33     172229 my $title
297             = $res->dom->find('head > title')->map('text')->join('') || $url;
298             $res->dom->find('head link')->each(
299             sub {
300 131     131   30195 my $attrs = $_->attr();
301 131 100       1092 return unless ($attrs->{'rel'});
302 129         230 my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'});
  129         237  
303 129 100       181 my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : '';
304 129 50 33     366 if ($is_feed{$type} && ($rel{'alternate'} || $rel{'service.feed'})) {
      66        
305 8         21 push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base);
306             }
307             }
308 10         25901 );
309             $res->dom->find('a')->grep(
310             sub {
311 374 50   374   77428 $_->attr('href')
312             && Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io;
313             }
314             )->each(
315             sub {
316 2     2   502 push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base);
317             }
318 10         1565 );
319 10 100       3825 unless (@feeds)
320             { # call me crazy, but maybe this is just a feed served as HTML?
321 5         20 my $body = $res->body;
322 5 100       117 if ($self->parse_feed(\$body)) {
323 1         6 push @feeds, Mojo::URL->new($url)->to_abs;
324             }
325             }
326             }
327 12         560 return @feeds;
328             }
329              
330             sub parse_opml {
331 3     3 1 12627 my ($self, $opml_file) = @_;
332 3 50       15 my $opml_str = decode 'UTF-8',
333             (ref $opml_file) ? $opml_file->slurp : slurp $opml_file;
334 3         3505 my $d = Mojo::DOM->new->parse($opml_str);
335 3         93792 my (%subscriptions, %categories);
336 3         17 for my $item ($d->find(q{outline})->each) {
337 962         112811 my $node = $item->attr;
338 962 100       6951 if (!defined $node->{xmlUrl}) {
339 40   66     92 my $cat = $node->{title} || $node->{text};
340 40         75 $categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl');
341             }
342             else { # file by RSS URL:
343 922         1710 $subscriptions{$node->{xmlUrl}} = $node;
344             }
345             }
346              
347              
348             # assign categories
349 3         263 for my $cat (keys %categories) {
350 40         78 for my $rss ($categories{$cat}->each) {
351 920 50       1529 next unless ($subscriptions{$rss}); # don't auto-vivify for empty "categories"
352 920   100     2060 $subscriptions{$rss}{'categories'} ||= [];
353 920         516 push @{$subscriptions{$rss}{'categories'}}, $cat;
  920         1369  
354             }
355             }
356 3         971 return (values %subscriptions);
357             }
358              
359              
360             1;
361              
362             =encoding utf-8
363              
364             =head1 NAME
365              
366             Mojolicious::Plugin::FeedReader - Mojolicious plugin to find and parse RSS & Atom feeds
367              
368             =head1 SYNOPSIS
369              
370             # Mojolicious
371             $self->plugin('FeedReader');
372              
373             # Mojolicious::Lite
374             plugin 'FeedReader';
375              
376             # Blocking:
377             get '/b' => sub {
378             my $self = shift;
379             my ($feed) = $self->find_feeds(q{search.cpan.org});
380             my $out = $self->parse_feed($feed);
381             $self->render(template => 'uploads', items => $out->{items});
382             };
383              
384             # Non-blocking:
385             get '/nb' => sub {
386             my $self = shift;
387             $self->render_later;
388             my $delay = Mojo::IOLoop->delay(
389             sub {
390             $self->find_feeds("search.cpan.org", shift->begin(0));
391             },
392             sub {
393             my $feed = pop;
394             $self->parse_feed($feed, shift->begin);
395             },
396             sub {
397             my $data = pop;
398             $self->render(template => 'uploads', items => $data->{items});
399             });
400             $delay->wait unless Mojo::IOLoop->is_running;
401             };
402              
403             app->start;
404              
405             __DATA__
406              
407             @@ uploads.html.ep
408            
409             % for my $item (@$items) {
410            
  • <%= link_to $item->{title} => $item->{link} %> - <%= $item->{description} %>
  • 411             % }
    412            
    413              
    414             =head1 DESCRIPTION
    415              
    416             B implements minimalistic helpers for identifying,
    417             fetching and parsing RSS and Atom Feeds. It has minimal dependencies, relying as
    418             much as possible on Mojolicious components - Mojo::UserAgent for fetching feeds and
    419             checking URLs, Mojo::DOM for XML/HTML parsing.
    420             It is therefore rather fragile and naive, and should be considered Experimental/Toy
    421             code - B.
    422              
    423              
    424             =head1 METHODS
    425              
    426             L inherits all methods from
    427             L and implements the following new ones.
    428              
    429             =head2 register
    430              
    431             $plugin->register(Mojolicious->new);
    432              
    433             Register plugin in L application. This method will install the helpers
    434             listed below in your Mojolicious application.
    435              
    436             =head1 HELPERS
    437              
    438             B implements the following helpers.
    439              
    440             =head2 find_feeds
    441              
    442             # Call blocking
    443             my (@feeds) = app->find_feeds('search.cpan.org');
    444             # @feeds is a list of Mojo::URL objects
    445              
    446             # Call non-blocking
    447             $self->find_feeds('http://example.com', sub {
    448             my (@feeds) = @_;
    449             unless (@feeds) {
    450             $self->render_exception("no feeds found, " . $info->{error});
    451             }
    452             else {
    453             ....
    454             }
    455             });
    456              
    457             A Mojolicious port of L by Benjamin Trott. This helper implements feed auto-discovery for finding syndication feeds, given a URI.
    458             If given a callback function as an additional argument, execution will be non-blocking.
    459              
    460             =head2 parse_feed
    461              
    462             # parse an RSS/Atom feed
    463             # blocking
    464             my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot');
    465             my $feed = $self->parse_feed($url);
    466             for my $item (@{$feed->{items}}) {
    467             say $_ for ($item->{title}, $item->{description}, 'Tags: ' . join q{,}, @{$item->{tags}});
    468             }
    469              
    470             # non-blocking
    471             $self->parse_feed($url, sub {
    472             my ($c, $feed) = @_;
    473             $c->render(text => "Feed tagline: " . $feed->{tagline});
    474             });
    475              
    476             # parse a file
    477             $feed2 = $self->parse_feed('/downloads/foo.rss');
    478              
    479             # parse response
    480             $self->ua->get($feed_url, sub {
    481             my ($ua, $tx) = @_;
    482             my $feed = $self->parse_feed($tx->res);
    483             });
    484              
    485             A minimalist liberal RSS/Atom parser, using Mojo::DOM queries.
    486              
    487             Dates are parsed using L.
    488              
    489             If parsing fails (for example, the parser was given an HTML page), the helper will return undef.
    490              
    491             On success, the result returned is a hashref with the following keys:
    492              
    493             =over 4
    494              
    495             =item * title
    496              
    497             =item * description (may be filled from subtitle or tagline if absent)
    498              
    499             =item * htmlUrl - web page URL associated with the feed
    500              
    501             =item * items - array ref of feed news items
    502              
    503             =item * subtitle (optional)
    504              
    505             =item * tagline (optional)
    506              
    507             =item * author (name of author field, or dc:creator or webMaster)
    508              
    509             =item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
    510              
    511             =back
    512              
    513             Each item in the items array is a hashref with the following keys:
    514              
    515             =over 4
    516              
    517             =item * title
    518              
    519             =item * link
    520              
    521             =item * content (may be filled with content:encoded, xhtml:body or description fields)
    522              
    523             =item * id (will be equal to link or guid if it is undefined and either of those fields exists)
    524              
    525             =item * description (optional) - usually a shorter form of the content (may be filled with summary if description is missing)
    526              
    527             =item * guid (optional)
    528              
    529             =item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified)
    530              
    531             =item * author (may be filled from author or dc:creator)
    532              
    533             =item * tags (optional) - array ref of tags, categories or dc:subjects.
    534              
    535             =item * _raw - XML serialized text of the item's Mojo::DOM node. Note that this can be different from the original XML text in the feed.
    536              
    537             =back
    538              
    539             =head2 parse_opml
    540              
    541             my @subscriptions = app->parse_opml( 'mysubs.opml' );
    542             foreach my $sub (@subscriptions) {
    543             say 'RSS URL is: ', $sub->{xmlUrl};
    544             say 'Website URL is: ', $sub->{htmlUrl};
    545             say 'categories: ', join ',', @{$sub->{categories}};
    546             }
    547              
    548             Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs.
    549              
    550             Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in.
    551              
    552             =head1 CREDITS
    553              
    554             Some tests adapted from L and L Feed autodiscovery adapted from L.
    555              
    556             Test data (web pages, feeds and excerpts) included in this package is intended for testing purposes only, and is not meant in any way
    557             to infringe on the rights of the respective authors.
    558              
    559             =head1 COPYRIGHT AND LICENSE
    560              
    561             Copyright (C) 2014, Dotan Dimet.
    562              
    563             This program is free software, you can redistribute it and/or modify it
    564             under the terms of the Artistic License version 2.0.
    565              
    566             =head1 SEE ALSO
    567              
    568             L, L, L
    569              
    570             L, L, L
    571              
    572             =cut