| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Mojolicious::Plugin::FeedReader; |
|
2
|
4
|
|
|
4
|
|
2247
|
use Mojo::Base 'Mojolicious::Plugin'; |
|
|
4
|
|
|
|
|
4
|
|
|
|
4
|
|
|
|
|
21
|
|
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
our $VERSION = '0.08'; |
|
5
|
4
|
|
|
4
|
|
661
|
use Mojo::Util qw(decode slurp trim); |
|
|
4
|
|
|
|
|
5
|
|
|
|
4
|
|
|
|
|
200
|
|
|
6
|
4
|
|
|
4
|
|
19
|
use Mojo::DOM; |
|
|
4
|
|
|
|
|
4
|
|
|
|
4
|
|
|
|
|
68
|
|
|
7
|
4
|
|
|
4
|
|
11
|
use Mojo::IOLoop; |
|
|
4
|
|
|
|
|
5
|
|
|
|
4
|
|
|
|
|
26
|
|
|
8
|
4
|
|
|
4
|
|
921
|
use HTTP::Date; |
|
|
4
|
|
|
|
|
2286
|
|
|
|
4
|
|
|
|
|
198
|
|
|
9
|
4
|
|
|
4
|
|
15
|
use Carp qw(carp croak); |
|
|
4
|
|
|
|
|
5
|
|
|
|
4
|
|
|
|
|
179
|
|
|
10
|
4
|
|
|
4
|
|
14
|
use Scalar::Util qw(blessed); |
|
|
4
|
|
|
|
|
4
|
|
|
|
4
|
|
|
|
|
8791
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our @time_fields |
|
13
|
|
|
|
|
|
|
= (qw(pubDate published created issued updated modified dc\:date)); |
|
14
|
|
|
|
|
|
|
our %is_time_field = map { $_ => 1 } @time_fields; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# feed mime-types: |
|
17
|
|
|
|
|
|
|
our @feed_types = ( |
|
18
|
|
|
|
|
|
|
'application/x.atom+xml', 'application/atom+xml', |
|
19
|
|
|
|
|
|
|
'application/xml', 'text/xml', |
|
20
|
|
|
|
|
|
|
'application/rss+xml', 'application/rdf+xml' |
|
21
|
|
|
|
|
|
|
); |
|
22
|
|
|
|
|
|
|
our %is_feed = map { $_ => 1 } @feed_types; |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
sub register { |
|
25
|
4
|
|
|
4
|
1
|
123
|
my ($self, $app) = @_; |
|
26
|
4
|
|
|
|
|
7
|
foreach my $method ( |
|
27
|
|
|
|
|
|
|
qw( find_feeds parse_rss parse_opml )) |
|
28
|
|
|
|
|
|
|
{ |
|
29
|
12
|
|
|
|
|
158
|
$app->helper($method => \&{$method}); |
|
|
12
|
|
|
|
|
44
|
|
|
30
|
|
|
|
|
|
|
} |
|
31
|
4
|
|
|
|
|
38
|
$app->helper(parse_feed => \&parse_rss); |
|
32
|
|
|
|
|
|
|
} |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub make_dom { |
|
35
|
25
|
|
|
25
|
0
|
37
|
my ($xml) = @_; |
|
36
|
25
|
|
|
|
|
24
|
my $rss; |
|
37
|
25
|
100
|
66
|
|
|
127
|
if (!ref $xml) { # assume file |
|
|
|
100
|
33
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
38
|
9
|
|
|
|
|
20
|
$rss = slurp $xml; |
|
39
|
9
|
50
|
|
|
|
876
|
die "Unable to read file $xml: $!" unless ($rss); |
|
40
|
|
|
|
|
|
|
} |
|
41
|
|
|
|
|
|
|
elsif (ref $xml eq 'SCALAR') { # assume string |
|
42
|
11
|
|
|
|
|
13
|
$rss = $$xml; |
|
43
|
|
|
|
|
|
|
} |
|
44
|
|
|
|
|
|
|
elsif (blessed $xml && $xml->isa('Mojo::DOM')) { # Mojo::DOM (old style) |
|
45
|
1
|
|
|
|
|
2
|
return $xml; |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
elsif (blessed $xml && $xml->can('slurp')) { # Mojo::Asset or similar |
|
48
|
4
|
|
|
|
|
12
|
$rss = $xml->slurp; |
|
49
|
|
|
|
|
|
|
} |
|
50
|
|
|
|
|
|
|
else { |
|
51
|
0
|
|
|
|
|
0
|
die "don't know how to make a Mojo::DOM from object $xml"; |
|
52
|
|
|
|
|
|
|
} |
|
53
|
|
|
|
|
|
|
#my $rss_str = decode 'UTF-8', $rss; |
|
54
|
24
|
|
|
|
|
193
|
my $rss_str = $rss; |
|
55
|
24
|
50
|
|
|
|
49
|
die "Failed to read asset $xml (as UTF-8): $!" unless ($rss_str); |
|
56
|
24
|
|
|
|
|
104
|
return Mojo::DOM->new->parse($rss_str); |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub parse_rss { |
|
60
|
25
|
|
|
25
|
0
|
57980
|
my ($c, $xml, $cb) = @_; |
|
61
|
25
|
100
|
100
|
|
|
152
|
if (blessed $xml && $xml->isa('Mojo::URL')) { |
|
62
|
|
|
|
|
|
|
# this is the only case where we might go non-blocking: |
|
63
|
4
|
100
|
66
|
|
|
17
|
if ($cb && ref $cb eq 'CODE') { |
|
64
|
|
|
|
|
|
|
return |
|
65
|
|
|
|
|
|
|
$c->ua->get( |
|
66
|
|
|
|
|
|
|
$xml, |
|
67
|
|
|
|
|
|
|
sub { |
|
68
|
1
|
|
|
1
|
|
5458
|
my ($ua, $tx) = @_; |
|
69
|
1
|
|
|
|
|
2
|
my $feed; |
|
70
|
1
|
50
|
|
|
|
6
|
if ($tx->success) { |
|
71
|
1
|
|
|
|
|
18
|
my $body = $tx->res->body; |
|
72
|
1
|
|
|
|
|
17
|
my $dom = make_dom(\$body); |
|
73
|
1
|
|
|
|
|
2634
|
eval { $feed = parse_rss_dom($dom); }; |
|
|
1
|
|
|
|
|
2
|
|
|
74
|
|
|
|
|
|
|
} |
|
75
|
1
|
|
|
|
|
4
|
$c->$cb($feed); |
|
76
|
|
|
|
|
|
|
} |
|
77
|
1
|
|
|
|
|
6
|
); |
|
78
|
|
|
|
|
|
|
} |
|
79
|
|
|
|
|
|
|
else { |
|
80
|
3
|
|
|
|
|
24
|
my $tx = $c->ua->get($xml); |
|
81
|
3
|
50
|
|
|
|
14967
|
if ($tx->success) { |
|
82
|
3
|
|
|
|
|
50
|
my $body = $tx->res->body; |
|
83
|
3
|
|
|
|
|
49
|
$xml = \$body; |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
else { |
|
86
|
0
|
0
|
|
|
|
0
|
croak "Error getting feed from url $xml: ", (($tx->error) ? $tx->error->{message} : ''); |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
} |
|
90
|
24
|
|
|
|
|
57
|
my $dom = make_dom($xml); |
|
91
|
24
|
50
|
|
|
|
295594
|
return ($dom) ? parse_rss_dom($dom) : 1; |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
sub parse_rss_dom { |
|
95
|
25
|
|
|
25
|
0
|
126
|
my ($dom) = @_; |
|
96
|
25
|
50
|
|
|
|
107
|
die "Argument $dom is not a Mojo::DOM" unless ($dom->isa('Mojo::DOM')); |
|
97
|
25
|
|
|
|
|
46
|
my $feed = parse_rss_channel($dom); # Feed properties |
|
98
|
25
|
|
|
|
|
87
|
my $items = $dom->find('item'); |
|
99
|
25
|
|
|
|
|
62412
|
my $entries = $dom->find('entry'); # Atom |
|
100
|
25
|
|
|
|
|
64194
|
my $res = []; |
|
101
|
25
|
|
|
|
|
81
|
foreach my $item ($items->each, $entries->each) { |
|
102
|
130
|
|
|
|
|
302
|
push @$res, parse_rss_item($item); |
|
103
|
|
|
|
|
|
|
} |
|
104
|
25
|
100
|
|
|
|
92
|
if (@$res) { |
|
105
|
20
|
|
|
|
|
38
|
$feed->{'items'} = $res; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
25
|
|
|
|
|
3891
|
return $feed; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub parse_rss_channel { |
|
111
|
25
|
|
|
25
|
0
|
27
|
my ($dom) = @_; |
|
112
|
25
|
|
|
|
|
26
|
my %info; |
|
113
|
25
|
|
|
|
|
50
|
foreach my $k ( |
|
114
|
|
|
|
|
|
|
qw{title subtitle description tagline link:not([rel]) link[rel=alternate] dc\:creator author webMaster}, |
|
115
|
|
|
|
|
|
|
@time_fields |
|
116
|
|
|
|
|
|
|
) |
|
117
|
|
|
|
|
|
|
{ |
|
118
|
400
|
|
100
|
|
|
2338
|
my $p = $dom->at("channel > $k") || $dom->at("feed > $k"); # direct child |
|
119
|
400
|
100
|
|
|
|
1852034
|
if ($p) { |
|
120
|
94
|
|
100
|
|
|
372
|
$info{$k} = $p->text || $p->content || $p->attr('href'); |
|
121
|
94
|
100
|
100
|
|
|
7391
|
if ($k eq 'author' && $p->at('name')) { |
|
122
|
14
|
|
33
|
|
|
1496
|
$info{$k} = $p->at('name')->text || $p->at('name')->content; |
|
123
|
|
|
|
|
|
|
} |
|
124
|
94
|
100
|
|
|
|
2206
|
if ($is_time_field{$k}) { |
|
125
|
17
|
|
|
|
|
63
|
$info{$k} = str2time($info{$k}); |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
my ($htmlUrl) |
|
130
|
50
|
|
|
|
|
87
|
= grep { defined $_ } |
|
131
|
25
|
|
|
|
|
138
|
map { delete $info{$_} } ('link:not([rel])', 'link[rel=alternate]'); |
|
|
50
|
|
|
|
|
106
|
|
|
132
|
|
|
|
|
|
|
my ($description) |
|
133
|
75
|
|
|
|
|
79
|
= grep { defined $_ } |
|
134
|
25
|
100
|
|
|
|
39
|
map { exists $info{$_} ? $info{$_} : undef } |
|
|
75
|
|
|
|
|
143
|
|
|
135
|
|
|
|
|
|
|
(qw(description tagline subtitle)); |
|
136
|
25
|
100
|
|
|
|
74
|
$info{htmlUrl} = $htmlUrl if ($htmlUrl); |
|
137
|
25
|
100
|
|
|
|
66
|
$info{description} = $description if ($description); |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# normalize fields: |
|
140
|
25
|
|
|
|
|
99
|
my @replace = ( |
|
141
|
|
|
|
|
|
|
'pubDate' => 'published', |
|
142
|
|
|
|
|
|
|
'dc\:date' => 'published', |
|
143
|
|
|
|
|
|
|
'created' => 'published', |
|
144
|
|
|
|
|
|
|
'issued' => 'published', |
|
145
|
|
|
|
|
|
|
'updated' => 'published', |
|
146
|
|
|
|
|
|
|
'modified' => 'published', |
|
147
|
|
|
|
|
|
|
'dc\:creator' => 'author', |
|
148
|
|
|
|
|
|
|
'webMaster' => 'author' |
|
149
|
|
|
|
|
|
|
); |
|
150
|
25
|
|
|
|
|
73
|
while (my ($old, $new) = splice(@replace, 0, 2)) { |
|
151
|
200
|
100
|
100
|
|
|
501
|
if ($info{$old} && !$info{$new}) { |
|
152
|
19
|
|
|
|
|
69
|
$info{$new} = delete $info{$old}; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
} |
|
155
|
25
|
100
|
|
|
|
98
|
return (keys %info) ? \%info : undef; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub parse_rss_item { |
|
159
|
130
|
|
|
130
|
0
|
126
|
my ($item) = @_; |
|
160
|
130
|
|
|
|
|
122
|
my %h; |
|
161
|
130
|
|
|
|
|
198
|
foreach my $k ( |
|
162
|
|
|
|
|
|
|
qw(title id summary guid content description content\:encoded xhtml\:body dc\:creator author), |
|
163
|
|
|
|
|
|
|
@time_fields |
|
164
|
|
|
|
|
|
|
) |
|
165
|
|
|
|
|
|
|
{ |
|
166
|
2210
|
|
|
|
|
20201
|
my $p = $item->at($k); |
|
167
|
2210
|
100
|
|
|
|
591745
|
if ($p) { |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# skip namespaced items - like itunes:summary - unless explicitly |
|
170
|
|
|
|
|
|
|
# searched: |
|
171
|
|
|
|
|
|
|
next |
|
172
|
779
|
100
|
100
|
|
|
2972
|
if ($p->tag =~ /\:/ |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
173
|
|
|
|
|
|
|
&& $k ne 'content\:encoded' |
|
174
|
|
|
|
|
|
|
&& $k ne 'xhtml\:body' |
|
175
|
|
|
|
|
|
|
&& $k ne 'dc\:date' |
|
176
|
|
|
|
|
|
|
&& $k ne 'dc\:creator'); |
|
177
|
774
|
|
66
|
|
|
7706
|
$h{$k} = $p->text || $p->content; |
|
178
|
774
|
100
|
100
|
|
|
88278
|
if ($k eq 'author' && $p->at('name')) { |
|
179
|
22
|
|
|
|
|
2289
|
$h{$k} = $p->at('name')->text; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
774
|
100
|
|
|
|
5001
|
if ($is_time_field{$k}) { |
|
182
|
269
|
|
|
|
|
585
|
$h{$k} = str2time($h{$k}); |
|
183
|
|
|
|
|
|
|
} |
|
184
|
|
|
|
|
|
|
} |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# let's handle links seperately, because ATOM loves these buggers: |
|
188
|
|
|
|
|
|
|
$item->find('link')->each( |
|
189
|
|
|
|
|
|
|
sub { |
|
190
|
137
|
|
|
137
|
|
48780
|
my $l = shift; |
|
191
|
137
|
100
|
|
|
|
265
|
if ($l->attr('href')) { |
|
192
|
130
|
100
|
100
|
|
|
1558
|
if (!$l->attr('rel') || $l->attr('rel') eq 'alternate') { |
|
193
|
122
|
|
|
|
|
1544
|
$h{'link'} = $l->attr('href'); |
|
194
|
|
|
|
|
|
|
} |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
else { |
|
197
|
7
|
50
|
|
|
|
88
|
if ($l->text =~ /\w+/) { |
|
198
|
7
|
|
|
|
|
376
|
$h{'link'} = $l->text; # simple link |
|
199
|
|
|
|
|
|
|
} |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# else { # we have an empty link element with no 'href'. :-( |
|
202
|
|
|
|
|
|
|
# $h{'link'} = $1 if ($l->next->text =~ m/^(http\S+)/); |
|
203
|
|
|
|
|
|
|
# } |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
} |
|
206
|
130
|
|
|
|
|
484
|
); |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# find tags: |
|
209
|
130
|
|
|
|
|
2460
|
my @tags; |
|
210
|
|
|
|
|
|
|
$item->find('category, dc\:subject') |
|
211
|
130
|
|
66
|
93
|
|
256
|
->each(sub { push @tags, $_[0]->text || $_[0]->attr('term') }); |
|
|
93
|
|
|
|
|
34014
|
|
|
212
|
130
|
100
|
|
|
|
40730
|
if (@tags) { |
|
213
|
76
|
|
|
|
|
159
|
$h{'tags'} = \@tags; |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
# |
|
216
|
|
|
|
|
|
|
# normalize fields: |
|
217
|
130
|
|
|
|
|
378
|
my @replace = ( |
|
218
|
|
|
|
|
|
|
'content\:encoded' => 'content', |
|
219
|
|
|
|
|
|
|
'xhtml\:body' => 'content', |
|
220
|
|
|
|
|
|
|
'summary' => 'description', |
|
221
|
|
|
|
|
|
|
'pubDate' => 'published', |
|
222
|
|
|
|
|
|
|
'dc\:date' => 'published', |
|
223
|
|
|
|
|
|
|
'created' => 'published', |
|
224
|
|
|
|
|
|
|
'issued' => 'published', |
|
225
|
|
|
|
|
|
|
'updated' => 'published', |
|
226
|
|
|
|
|
|
|
'modified' => 'published', |
|
227
|
|
|
|
|
|
|
'dc\:creator' => 'author' |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# 'guid' => 'link' |
|
230
|
|
|
|
|
|
|
); |
|
231
|
130
|
|
|
|
|
314
|
while (my ($old, $new) = splice(@replace, 0, 2)) { |
|
232
|
1300
|
100
|
100
|
|
|
3281
|
if ($h{$old} && !$h{$new}) { |
|
233
|
126
|
|
|
|
|
345
|
$h{$new} = delete $h{$old}; |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
} |
|
236
|
130
|
|
|
|
|
329
|
my %copy = ('description' => 'content', link => 'id', guid => 'id'); |
|
237
|
130
|
|
|
|
|
276
|
while (my ($fill, $required) = each %copy) { |
|
238
|
390
|
100
|
100
|
|
|
1231
|
if ($h{$fill} && !$h{$required}) { |
|
239
|
8
|
|
|
|
|
21
|
$h{$required} = $h{$fill}; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
} |
|
242
|
130
|
|
|
|
|
271
|
$h{"_raw"} = $item->to_string; |
|
243
|
130
|
|
|
|
|
62360
|
return \%h; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# find_feeds - get RSS/Atom feed URL from argument. |
|
247
|
|
|
|
|
|
|
# Code adapted to use Mojolicious from Feed::Find by Benjamin Trott |
|
248
|
|
|
|
|
|
|
# Any stupid mistakes are my own |
|
249
|
|
|
|
|
|
|
sub find_feeds { |
|
250
|
13
|
|
|
13
|
1
|
52820
|
my $self = shift; |
|
251
|
13
|
|
|
|
|
15
|
my $url = shift; |
|
252
|
13
|
100
|
|
|
|
30
|
my $cb = (ref $_[-1] eq 'CODE') ? pop @_ : undef; |
|
253
|
|
|
|
|
|
|
# $self->ua->max_redirects(5)->connect_timeout(30); |
|
254
|
|
|
|
|
|
|
my $main = sub { |
|
255
|
13
|
|
|
13
|
|
14
|
my ($tx) = @_; |
|
256
|
13
|
|
|
|
|
14
|
my @feeds; |
|
257
|
|
|
|
|
|
|
# if ($tx->success) { say $tx->res->code } else { say $tx->error }; |
|
258
|
13
|
100
|
66
|
|
|
63
|
return unless ($tx->success && $tx->res->code == 200); |
|
259
|
12
|
|
|
|
|
281
|
eval { @feeds = _find_feed_links($self, $tx->req->url, $tx->res); }; |
|
|
12
|
|
|
|
|
22
|
|
|
260
|
12
|
50
|
|
|
|
31
|
if ($@) { |
|
261
|
0
|
|
|
|
|
0
|
croak "Exception in find_feeds - ", $@; |
|
262
|
|
|
|
|
|
|
} |
|
263
|
12
|
|
|
|
|
88
|
return (@feeds); |
|
264
|
13
|
|
|
|
|
47
|
}; |
|
265
|
13
|
100
|
|
|
|
22
|
if ($cb) { # non-blocking: |
|
266
|
|
|
|
|
|
|
$self->ua->get( |
|
267
|
|
|
|
|
|
|
$url, |
|
268
|
|
|
|
|
|
|
sub { |
|
269
|
3
|
|
|
3
|
|
14144
|
my ($ua, $tx) = @_; |
|
270
|
3
|
|
|
|
|
6
|
my (@feeds) = $main->($tx); |
|
271
|
3
|
|
|
|
|
12
|
$cb->(@feeds); |
|
272
|
|
|
|
|
|
|
} |
|
273
|
3
|
|
|
|
|
19
|
); |
|
274
|
|
|
|
|
|
|
} |
|
275
|
|
|
|
|
|
|
else { |
|
276
|
10
|
|
|
|
|
43
|
my $tx = $self->ua->get($url); |
|
277
|
10
|
|
|
|
|
41373
|
return $main->($tx); |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
} |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
sub _find_feed_links { |
|
282
|
12
|
|
|
12
|
|
73
|
my ($self, $url, $res) = @_; |
|
283
|
|
|
|
|
|
|
|
|
284
|
12
|
|
|
|
|
16
|
state $feed_ext = qr/\.(?:rss|xml|rdf)$/; |
|
285
|
12
|
|
|
|
|
11
|
my @feeds; |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# use split to remove charset attribute from content_type |
|
288
|
12
|
|
|
|
|
21
|
my ($content_type) = split(/[; ]+/, $res->headers->content_type); |
|
289
|
12
|
100
|
|
|
|
164
|
if ($is_feed{$content_type}) { |
|
290
|
2
|
|
|
|
|
8
|
push @feeds, Mojo::URL->new($url)->to_abs; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
|
|
|
|
|
|
else { |
|
293
|
|
|
|
|
|
|
# we are in a web page. PHEAR. |
|
294
|
10
|
|
33
|
|
|
29
|
my $base = Mojo::URL->new( |
|
295
|
|
|
|
|
|
|
$res->dom->find('head base')->map('attr', 'href')->join('') || $url)->to_abs($url); |
|
296
|
10
|
|
33
|
|
|
172229
|
my $title |
|
297
|
|
|
|
|
|
|
= $res->dom->find('head > title')->map('text')->join('') || $url; |
|
298
|
|
|
|
|
|
|
$res->dom->find('head link')->each( |
|
299
|
|
|
|
|
|
|
sub { |
|
300
|
131
|
|
|
131
|
|
30195
|
my $attrs = $_->attr(); |
|
301
|
131
|
100
|
|
|
|
1092
|
return unless ($attrs->{'rel'}); |
|
302
|
129
|
|
|
|
|
230
|
my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'}); |
|
|
129
|
|
|
|
|
237
|
|
|
303
|
129
|
100
|
|
|
|
181
|
my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : ''; |
|
304
|
129
|
50
|
33
|
|
|
366
|
if ($is_feed{$type} && ($rel{'alternate'} || $rel{'service.feed'})) { |
|
|
|
|
66
|
|
|
|
|
|
305
|
8
|
|
|
|
|
21
|
push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base); |
|
306
|
|
|
|
|
|
|
} |
|
307
|
|
|
|
|
|
|
} |
|
308
|
10
|
|
|
|
|
25901
|
); |
|
309
|
|
|
|
|
|
|
$res->dom->find('a')->grep( |
|
310
|
|
|
|
|
|
|
sub { |
|
311
|
374
|
50
|
|
374
|
|
77428
|
$_->attr('href') |
|
312
|
|
|
|
|
|
|
&& Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io; |
|
313
|
|
|
|
|
|
|
} |
|
314
|
|
|
|
|
|
|
)->each( |
|
315
|
|
|
|
|
|
|
sub { |
|
316
|
2
|
|
|
2
|
|
502
|
push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base); |
|
317
|
|
|
|
|
|
|
} |
|
318
|
10
|
|
|
|
|
1565
|
); |
|
319
|
10
|
100
|
|
|
|
3825
|
unless (@feeds) |
|
320
|
|
|
|
|
|
|
{ # call me crazy, but maybe this is just a feed served as HTML? |
|
321
|
5
|
|
|
|
|
20
|
my $body = $res->body; |
|
322
|
5
|
100
|
|
|
|
117
|
if ($self->parse_feed(\$body)) { |
|
323
|
1
|
|
|
|
|
6
|
push @feeds, Mojo::URL->new($url)->to_abs; |
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
} |
|
326
|
|
|
|
|
|
|
} |
|
327
|
12
|
|
|
|
|
560
|
return @feeds; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
sub parse_opml { |
|
331
|
3
|
|
|
3
|
1
|
12627
|
my ($self, $opml_file) = @_; |
|
332
|
3
|
50
|
|
|
|
15
|
my $opml_str = decode 'UTF-8', |
|
333
|
|
|
|
|
|
|
(ref $opml_file) ? $opml_file->slurp : slurp $opml_file; |
|
334
|
3
|
|
|
|
|
3505
|
my $d = Mojo::DOM->new->parse($opml_str); |
|
335
|
3
|
|
|
|
|
93792
|
my (%subscriptions, %categories); |
|
336
|
3
|
|
|
|
|
17
|
for my $item ($d->find(q{outline})->each) { |
|
337
|
962
|
|
|
|
|
112811
|
my $node = $item->attr; |
|
338
|
962
|
100
|
|
|
|
6951
|
if (!defined $node->{xmlUrl}) { |
|
339
|
40
|
|
66
|
|
|
92
|
my $cat = $node->{title} || $node->{text}; |
|
340
|
40
|
|
|
|
|
75
|
$categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl'); |
|
341
|
|
|
|
|
|
|
} |
|
342
|
|
|
|
|
|
|
else { # file by RSS URL: |
|
343
|
922
|
|
|
|
|
1710
|
$subscriptions{$node->{xmlUrl}} = $node; |
|
344
|
|
|
|
|
|
|
} |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
# assign categories |
|
349
|
3
|
|
|
|
|
263
|
for my $cat (keys %categories) { |
|
350
|
40
|
|
|
|
|
78
|
for my $rss ($categories{$cat}->each) { |
|
351
|
920
|
50
|
|
|
|
1529
|
next unless ($subscriptions{$rss}); # don't auto-vivify for empty "categories" |
|
352
|
920
|
|
100
|
|
|
2060
|
$subscriptions{$rss}{'categories'} ||= []; |
|
353
|
920
|
|
|
|
|
516
|
push @{$subscriptions{$rss}{'categories'}}, $cat; |
|
|
920
|
|
|
|
|
1369
|
|
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
} |
|
356
|
3
|
|
|
|
|
971
|
return (values %subscriptions); |
|
357
|
|
|
|
|
|
|
} |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
1; |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=encoding utf-8 |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head1 NAME |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Mojolicious::Plugin::FeedReader - Mojolicious plugin to find and parse RSS & Atom feeds |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Mojolicious |
|
371
|
|
|
|
|
|
|
$self->plugin('FeedReader'); |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# Mojolicious::Lite |
|
374
|
|
|
|
|
|
|
plugin 'FeedReader'; |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# Blocking: |
|
377
|
|
|
|
|
|
|
get '/b' => sub { |
|
378
|
|
|
|
|
|
|
my $self = shift; |
|
379
|
|
|
|
|
|
|
my ($feed) = $self->find_feeds(q{search.cpan.org}); |
|
380
|
|
|
|
|
|
|
my $out = $self->parse_feed($feed); |
|
381
|
|
|
|
|
|
|
$self->render(template => 'uploads', items => $out->{items}); |
|
382
|
|
|
|
|
|
|
}; |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Non-blocking: |
|
385
|
|
|
|
|
|
|
get '/nb' => sub { |
|
386
|
|
|
|
|
|
|
my $self = shift; |
|
387
|
|
|
|
|
|
|
$self->render_later; |
|
388
|
|
|
|
|
|
|
my $delay = Mojo::IOLoop->delay( |
|
389
|
|
|
|
|
|
|
sub { |
|
390
|
|
|
|
|
|
|
$self->find_feeds("search.cpan.org", shift->begin(0)); |
|
391
|
|
|
|
|
|
|
}, |
|
392
|
|
|
|
|
|
|
sub { |
|
393
|
|
|
|
|
|
|
my $feed = pop; |
|
394
|
|
|
|
|
|
|
$self->parse_feed($feed, shift->begin); |
|
395
|
|
|
|
|
|
|
}, |
|
396
|
|
|
|
|
|
|
sub { |
|
397
|
|
|
|
|
|
|
my $data = pop; |
|
398
|
|
|
|
|
|
|
$self->render(template => 'uploads', items => $data->{items}); |
|
399
|
|
|
|
|
|
|
}); |
|
400
|
|
|
|
|
|
|
$delay->wait unless Mojo::IOLoop->is_running; |
|
401
|
|
|
|
|
|
|
}; |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
app->start; |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
__DATA__ |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
@@ uploads.html.ep |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
% for my $item (@$items) { |
|
410
|
|
|
|
|
|
|
<%= link_to $item->{title} => $item->{link} %> - <%= $item->{description} %> |
|
411
|
|
|
|
|
|
|
% } |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
B implements minimalistic helpers for identifying, |
|
417
|
|
|
|
|
|
|
fetching and parsing RSS and Atom Feeds. It has minimal dependencies, relying as |
|
418
|
|
|
|
|
|
|
much as possible on Mojolicious components - Mojo::UserAgent for fetching feeds and |
|
419
|
|
|
|
|
|
|
checking URLs, Mojo::DOM for XML/HTML parsing. |
|
420
|
|
|
|
|
|
|
It is therefore rather fragile and naive, and should be considered Experimental/Toy |
|
421
|
|
|
|
|
|
|
code - B |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=head1 METHODS |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
L inherits all methods from |
|
427
|
|
|
|
|
|
|
L and implements the following new ones. |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=head2 register |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
$plugin->register(Mojolicious->new); |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
Register plugin in L application. This method will install the helpers |
|
434
|
|
|
|
|
|
|
listed below in your Mojolicious application. |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=head1 HELPERS |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
B implements the following helpers. |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=head2 find_feeds |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# Call blocking |
|
443
|
|
|
|
|
|
|
my (@feeds) = app->find_feeds('search.cpan.org'); |
|
444
|
|
|
|
|
|
|
# @feeds is a list of Mojo::URL objects |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
# Call non-blocking |
|
447
|
|
|
|
|
|
|
$self->find_feeds('http://example.com', sub { |
|
448
|
|
|
|
|
|
|
my (@feeds) = @_; |
|
449
|
|
|
|
|
|
|
unless (@feeds) { |
|
450
|
|
|
|
|
|
|
$self->render_exception("no feeds found, " . $info->{error}); |
|
451
|
|
|
|
|
|
|
} |
|
452
|
|
|
|
|
|
|
else { |
|
453
|
|
|
|
|
|
|
.... |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
}); |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
A Mojolicious port of L by Benjamin Trott. This helper implements feed auto-discovery for finding syndication feeds, given a URI. |
|
458
|
|
|
|
|
|
|
If given a callback function as an additional argument, execution will be non-blocking. |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=head2 parse_feed |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# parse an RSS/Atom feed |
|
463
|
|
|
|
|
|
|
# blocking |
|
464
|
|
|
|
|
|
|
my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot'); |
|
465
|
|
|
|
|
|
|
my $feed = $self->parse_feed($url); |
|
466
|
|
|
|
|
|
|
for my $item (@{$feed->{items}}) { |
|
467
|
|
|
|
|
|
|
say $_ for ($item->{title}, $item->{description}, 'Tags: ' . join q{,}, @{$item->{tags}}); |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
# non-blocking |
|
471
|
|
|
|
|
|
|
$self->parse_feed($url, sub { |
|
472
|
|
|
|
|
|
|
my ($c, $feed) = @_; |
|
473
|
|
|
|
|
|
|
$c->render(text => "Feed tagline: " . $feed->{tagline}); |
|
474
|
|
|
|
|
|
|
}); |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
# parse a file |
|
477
|
|
|
|
|
|
|
$feed2 = $self->parse_feed('/downloads/foo.rss'); |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
# parse response |
|
480
|
|
|
|
|
|
|
$self->ua->get($feed_url, sub { |
|
481
|
|
|
|
|
|
|
my ($ua, $tx) = @_; |
|
482
|
|
|
|
|
|
|
my $feed = $self->parse_feed($tx->res); |
|
483
|
|
|
|
|
|
|
}); |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
A minimalist liberal RSS/Atom parser, using Mojo::DOM queries. |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
Dates are parsed using L. |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
If parsing fails (for example, the parser was given an HTML page), the helper will return undef. |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
On success, the result returned is a hashref with the following keys: |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=over 4 |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=item * title |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=item * description (may be filled from subtitle or tagline if absent) |
|
498
|
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=item * htmlUrl - web page URL associated with the feed |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item * items - array ref of feed news items |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=item * subtitle (optional) |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=item * tagline (optional) |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=item * author (name of author field, or dc:creator or webMaster) |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified) |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=back |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
Each item in the items array is a hashref with the following keys: |
|
514
|
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=over 4 |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=item * title |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=item * link |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
=item * content (may be filled with content:encoded, xhtml:body or description fields) |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item * id (will be equal to link or guid if it is undefined and either of those fields exists) |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=item * description (optional) - usually a shorter form of the content (may be filled with summary if description is missing) |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
=item * guid (optional) |
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=item * published - time in epoch seconds (may be filled with pubDate, dc:date, created, issued, updated or modified) |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=item * author (may be filled from author or dc:creator) |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=item * tags (optional) - array ref of tags, categories or dc:subjects. |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=item * _raw - XML serialized text of the item's Mojo::DOM node. Note that this can be different from the original XML text in the feed. |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=back |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=head2 parse_opml |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
my @subscriptions = app->parse_opml( 'mysubs.opml' ); |
|
542
|
|
|
|
|
|
|
foreach my $sub (@subscriptions) { |
|
543
|
|
|
|
|
|
|
say 'RSS URL is: ', $sub->{xmlUrl}; |
|
544
|
|
|
|
|
|
|
say 'Website URL is: ', $sub->{htmlUrl}; |
|
545
|
|
|
|
|
|
|
say 'categories: ', join ',', @{$sub->{categories}}; |
|
546
|
|
|
|
|
|
|
} |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs. |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in. |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=head1 CREDITS |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
Some tests adapted from L and L Feed autodiscovery adapted from L. |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
Test data (web pages, feeds and excerpts) included in this package is intended for testing purposes only, and is not meant in any way |
|
557
|
|
|
|
|
|
|
to infringe on the rights of the respective authors. |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
Copyright (C) 2014, Dotan Dimet. |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
This program is free software, you can redistribute it and/or modify it |
|
564
|
|
|
|
|
|
|
under the terms of the Artistic License version 2.0. |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
L, L, L |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
L, L, L |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=cut |