File Coverage

blib/lib/BusyBird/Input/Feed.pm
Criterion Covered Total %
statement 109 134 81.3
branch 34 60 56.6
condition 10 21 47.6
subroutine 22 25 88.0
pod 4 4 100.0
total 179 244 73.3


line stmt bran cond sub pod time code
1             package BusyBird::Input::Feed;
2 5     5   251890 use strict;
  5         14  
  5         113  
3 5     5   24 use warnings;
  5         9  
  5         99  
4 5     5   2854 use XML::FeedPP;
  5         97778  
  5         146  
5 5     5   2589 use DateTime::Format::ISO8601;
  5         2470460  
  5         227  
6 5     5   2642 use BusyBird::DateTime::Format;
  5         17652  
  5         167  
7 5     5   35 use DateTime;
  5         14  
  5         91  
8 5     5   23 use Try::Tiny;
  5         12  
  5         223  
9 5     5   28 use Carp;
  5         11  
  5         201  
10 5     5   1669 use WWW::Favicon ();
  5         277900  
  5         120  
11 5     5   63 use LWP::UserAgent;
  5         13  
  5         79  
12 5     5   22 use URI;
  5         9  
  5         5338  
13              
14             our $VERSION = "0.07";
15              
16             our @CARP_NOT = qw(Try::Tiny XML::FeedPP);
17              
18             sub new {
19 5     5 1 1911 my ($class, %args) = @_;
20             my $self = bless {
21             use_favicon => defined($args{use_favicon}) ? $args{use_favicon} : 1,
22             favicon_detector => WWW::Favicon->new,
23             user_agent => defined($args{user_agent}) ? $args{user_agent} : do {
24 5         52279 my $ua = LWP::UserAgent->new;
25 5         1127 $ua->env_proxy;
26 5         572 $ua->timeout(30);
27 5         72 $ua->agent("BusyBird::Inpu::Feed-$VERSION"); ## some Web sites ban LWP::UserAgent's default UserAgent...
28 5         316 $ua;
29             },
30 5 50       63 image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3,
    50          
    100          
31             }, $class;
32              
33             ## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001)
34 5         41 $self->{favicon_detector}->ua($self->{user_agent});
35            
36 5         128 return $self;
37             }
38              
39             sub _get_url_head_and_dir {
40 522     522   9415 my ($url_raw) = @_;
41 522 100       1213 return (undef, undef) if not defined $url_raw;
42 521         1901 my $url = URI->new($url_raw);
43 521         55976 my $scheme = $url->scheme;
44 521         7992 my $authority = $url->authority;
45 521 50 33     6520 return (undef, undef) if !$scheme || !$authority;
46 521         1149 my $url_head = "$scheme://$authority";
47 521         879 my $url_dir;
48 521         1280 my $path = $url->path;
49 521 50       6086 if($path =~ m{^(.*/)}i) {
50 521         1082 $url_dir = $1;
51             }else {
52 0         0 $url_dir = "/";
53             }
54 521         1657 return ($url_head, $url_dir);
55             }
56              
57             sub _extract_image_urls {
58 547     547   1152 my ($self, $feed_item) = @_;
59 547 100       1483 return () if $self->{image_max_num} == 0;
60 522         1296 my $content = $feed_item->description;
61 522 50       33519 return () if !defined($content);
62 522         1283 my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link);
63 522         1130 my @urls = ();
64 522   100     8351 while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num})
      100        
65             && $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) {
66 558         9789 my $url = URI->new($2);
67 558 100       35368 if(!$url->scheme) {
68             ## Only "path" segment is in the src attribute.
69 27 50 33     407 next if !defined($url_head) || !defined($url_dir);
70 27 100       88 if(substr("$url", 0, 1) eq "/") {
71 21         187 $url = "$url_head$url";
72             }else {
73 6         49 $url = "$url_head$url_dir$url";
74             }
75             }
76 558         8155 push @urls, "$url";
77             }
78 522         2992 return @urls;
79             }
80              
81             sub _get_home_url {
82 0     0   0 my ($self, $feed, $statuses) = @_;
83 0         0 my $home_url = $feed->link;
84 0 0 0     0 if(defined($home_url) && $home_url =~ m{^https?://}i) {
85 0         0 return $home_url;
86             }
87            
88 0         0 foreach my $status (@$statuses) {
89 0 0       0 $home_url = $status->{busybird}{status_permalink} if defined($status->{busybird});
90 0 0       0 return $home_url if defined $home_url;
91             }
92 0         0 return undef;
93             }
94              
95             sub _get_favicon_url {
96 0     0   0 my ($self, $feed, $statuses) = @_;
97             return try {
98 0     0   0 my $home_url = $self->_get_home_url($feed, $statuses);
99 0 0       0 return undef if not defined $home_url;
100 0         0 my $favicon_url = $self->{favicon_detector}->detect($home_url);
101 0 0       0 return undef if not defined $favicon_url;
102 0         0 my $res = $self->{user_agent}->get($favicon_url);
103 0 0       0 return undef if !$res->is_success;
104 0         0 my $type = $res->header('Content-Type');
105 0 0 0     0 return undef if defined($type) && $type !~ /^image/i;
106 0         0 return $favicon_url;
107 0         0 };
108             }
109              
110             sub _make_timestamp_datetime {
111 547     547   19534 my ($self, $timestamp_str) = @_;
112 547 100       1456 return undef if not defined $timestamp_str;
113 502 50       1895 if($timestamp_str =~ /^\d+$/) {
114 0         0 return DateTime->from_epoch(epoch => $timestamp_str, time_zone => '+0000');
115             }
116 502     502   2773 my $datetime = try { DateTime::Format::ISO8601->parse_datetime($timestamp_str) };
  502         13762  
117 502 100       335521 return $datetime if defined $datetime;
118 45         192 return BusyBird::DateTime::Format->parse_datetime($timestamp_str);
119             }
120              
121             sub _make_status_from_item {
122 547     547   1244 my ($self, $feed_title, $feed_item) = @_;
123 547         1607 my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate);
124 547         41209 my $text = $feed_item->title;
125 547 50       16126 $text = "" if !defined($text);
126 547         1418 my $permalink = $feed_item->link;
127 547 100       12179 my $status = {
    100          
128             text => $text,
129             busybird => { defined($permalink) ? (status_permalink => $permalink) : () },
130             created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ),
131             user => { screen_name => $feed_title },
132             };
133 547         167717 my $guid = $feed_item->guid;
134 547         10290 my $item_id;
135 547 100       1206 if(defined $guid) {
136 418         756 $item_id = $guid;
137 418         1161 $status->{busybird}{original}{id} = $guid;
138             }else {
139 129         284 $item_id = $feed_item->link;
140             }
141 547 100 66     3791 if(defined($created_at_dt) && defined($item_id)) {
    50          
142 502         1574 $status->{id} = $created_at_dt->epoch . '|' . $item_id;
143             }elsif(defined($item_id)) {
144 45         87 $status->{id} = $item_id;
145             }
146 547         5430 my @image_urls = $self->_extract_image_urls($feed_item);
147 547 100       1326 if(@image_urls) {
148 210         418 $status->{extended_entities}{media} = [map { +{ media_url => $_, indices => [0,0] } } @image_urls];
  558         1965  
149             }
150 547         3409 return $status;
151             }
152              
153             sub _make_statuses_from_feed {
154 31     31   1219666 my ($self, $feed) = @_;
155 31         206 my $feed_title = $feed->title;
156 31         1188 my $statuses = [ map { $self->_make_status_from_item($feed_title, $_) } $feed->get_item ];
  547         2336  
157 31 50       4444 return $statuses if !$self->{use_favicon};
158 0         0 my $favicon_url = $self->_get_favicon_url($feed, $statuses);
159 0 0       0 return $statuses if not defined $favicon_url;
160 0         0 $_->{user}{profile_image_url} = $favicon_url foreach @$statuses;
161 0         0 return $statuses;
162             }
163              
164             sub _parse_with_feedpp {
165 46     46   154 my ($self, $feed_source, $feed_type) = @_;
166             return $self->_make_statuses_from_feed(XML::FeedPP->new(
167             $feed_source, -type => $feed_type,
168             utf8_flag => 1, xml_deref => 1, lwp_useragent => $self->{user_agent},
169              
170             ## FeedPP and TreePP mess up with User-Agent. It's pretty annoying.
171 46         374 user_agent => scalar($self->{user_agent}->agent),
172             ));
173             }
174              
175             sub parse_string {
176 24     24 1 7144 my ($self, $string) = @_;
177 24         88 return $self->_parse_with_feedpp($string, "string");
178             }
179              
180             *parse = *parse_string;
181              
182             sub parse_file {
183 16     16 1 8972 my ($self, $filename) = @_;
184 16         147 return $self->_parse_with_feedpp($filename, "file");
185             }
186              
187             sub parse_url {
188 6     6 1 11784 my ($self, $url) = @_;
189 6         14 return $self->_parse_with_feedpp($url, "url");
190             }
191              
192             *parse_uri = *parse_url;
193              
194             1;
195             __END__
196              
197             =pod
198              
199             =head1 NAME
200              
201             BusyBird::Input::Feed - input BusyBird statuses from RSS/Atom feed
202              
203             =head1 SYNOPSIS
204              
205             use BusyBird;
206             use BusyBird::Input::Feed;
207            
208             my $input = BusyBird::Input::Feed->new;
209            
210             my $statuses = $input->parse($feed_xml);
211             timeline("feed")->add($statuses);
212            
213             $statuses = $input->parse_file("feed.atom");
214             timeline("feed")->add($statuses);
215            
216             $statuses = $input->parse_url('https://metacpan.org/feed/recent?f=');
217             timeline("feed")->add($statuses);
218              
219             =head1 DESCRIPTION
220              
221             L<BusyBird::Input::Feed> converts RSS and Atom feeds into L<BusyBird> status objects.
222              
223             For convenience, an executable script L<busybird_input_feed> is bundled in this distribution.
224              
225             =head1 CLASS METHODS
226              
227             =head2 $input = BusyBird::Input::Feed->new(%args)
228              
229             The constructor.
230              
231             Fields in C<%args> are:
232              
233             =over
234              
235             =item C<use_favicon> => BOOL (optional, default: true)
236              
237             If true (or omitted or C<undef>), it tries to use the favicon of the Web site providing the feed
238             as the statuses' icons.
239              
240             If it's defined and false, it won't use favicon.
241              
242             =item C<user_agent> => L<LWP::UserAgent> object (optional)
243              
244             L<LWP::UserAgent> object for fetching documents.
245              
246             =item C<image_max_num> => INT (optional, default: 3)
247              
248             The maximum number of image URLs extracted from the feed item.
249              
250             If set to 0, it extracts no images. If set to a negative value, it extracts all image URLs from the feed item.
251              
252             The extracted image URLs are stored as Twitter Entities in the status's C<extended_entities> field,
253             so that L<BusyBird> will render them.
254             See L<BusyBird::Manual::Status/extended_entities.media> for detail.
255              
256             =back
257              
258             =head1 OBJECT METHODS
259              
260             =head2 $statuses = $input->parse($feed_xml_string)
261              
262             =head2 $statuses = $input->parse_string($feed_xml_string)
263              
264             Convert the given C<$feed_xml_string> into L<BusyBird> C<$statuses>.
265             C<parse()> method is an alias for C<parse_string()>.
266              
267             C<$feed_xml_string> is the XML data to be parsed.
268             It must be a string encoded in UTF-8.
269              
270             Return value C<$statuses> is an array-ref of L<BusyBird> status objects.
271              
272             If C<$feed_xml_string> is invalid, it croaks.
273              
274             =head2 $statuses = $input->parse_file($feed_xml_filename)
275              
276             Same as C<parse_string()> except C<parse_file()> reads the file named C<$feed_xml_filename> and converts its content.
277              
278             =head2 $statuses = $input->parse_url($feed_xml_url)
279              
280             =head2 $statuses = $input->parse_uri($feed_xml_url)
281              
282             Same as C<parse_string()> except C<parse_url()> downloads the feed XML from C<$feed_xml_url> and converts its content.
283              
284             C<parse_uri()> method is an alias for C<parse_url()>.
285              
286             =head1 EXAMPLE
287              
288             The example below uses L<Parallel::ForkManager> to parallelize C<parse_url()> method of L<BusyBird::Input::Feed>.
289             It greatly reduces the total time to download a lot of RSS/Atom feeds.
290              
291             use strict;
292             use warnings;
293             use Parallel::ForkManager;
294             use BusyBird::Input::Feed;
295             use open qw(:std :encoding(utf8));
296            
297             my @feeds = (
298             'https://metacpan.org/feed/recent?f=',
299             'http://www.perl.com/pub/atom.xml',
300             'https://github.com/perl-users-jp/perl-users.jp-htdocs/commits/master.atom',
301             );
302             my $MAX_PROCESSES = 10;
303             my $pm = Parallel::ForkManager->new($MAX_PROCESSES);
304             my $input = BusyBird::Input::Feed->new;
305            
306             my @statuses = ();
307            
308             $pm->run_on_finish(sub {
309             my ($pid, $exitcode, $id, $signal, $coredump, $statuses) = @_;
310             push @statuses, @$statuses;
311             });
312            
313             foreach my $feed (@feeds) {
314             $pm->start and next;
315             warn "Start loading $feed\n";
316             my $statuses = $input->parse_url($feed);
317             warn "End loading $feed\n";
318             $pm->finish(0, $statuses);
319             }
320             $pm->wait_all_children;
321            
322             foreach my $status (@statuses) {
323             print "$status->{user}{screen_name}: $status->{text}\n";
324             }
325              
326              
327             =head1 SEE ALSO
328              
329             =over
330              
331             =item *
332              
333             L<BusyBird>
334              
335             =item *
336              
337             L<BusyBird::Manual::Status>
338              
339             =back
340              
341             =head1 REPOSITORY
342              
343             L<https://github.com/debug-ito/BusyBird-Input-Feed>
344              
345             =head1 BUGS AND FEATURE REQUESTS
346              
347             Please report bugs and feature requests to my Github issues
348             L<https://github.com/debug-ito/BusyBird-Input-Feed/issues>.
349              
350             Although I prefer Github, non-Github users can use CPAN RT
351             L<https://rt.cpan.org/Public/Dist/Display.html?Name=BusyBird-Input-Feed>.
352             Please send email to C<bug-BusyBird-Input-Feed at rt.cpan.org> to report bugs
353             if you do not have CPAN RT account.
354              
355              
356             =head1 AUTHOR
357            
358             Toshio Ito, C<< <toshioito at cpan.org> >>
359              
360              
361             =head1 LICENSE AND COPYRIGHT
362              
363             Copyright 2014 Toshio Ito.
364              
365             This program is free software; you can redistribute it and/or modify it
366             under the terms of either: the GNU General Public License as published
367             by the Free Software Foundation; or the Artistic License.
368              
369             See L<http://dev.perl.org/licenses/> for more information.
370              
371              
372             =cut
373