File Coverage

blib/lib/BusyBird/Input/Feed.pm
Criterion Covered Total %
statement 106 130 81.5
branch 30 56 53.5
condition 10 18 55.5
subroutine 22 25 88.0
pod 4 4 100.0
total 172 233 73.8


line stmt bran cond sub pod time code
1             package BusyBird::Input::Feed;
2 4     4   107348 use strict;
  4         10  
  4         144  
3 4     4   20 use warnings;
  4         8  
  4         113  
4 4     4   6519 use XML::FeedPP;
  4         137467  
  4         133  
5 4     4   4978 use DateTime::Format::ISO8601;
  4         1060545  
  4         334  
6 4     4   6482 use BusyBird::DateTime::Format;
  4         9767  
  4         152  
7 4     4   30 use DateTime;
  4         9  
  4         86  
8 4     4   20 use Try::Tiny;
  4         8  
  4         223  
9 4     4   24 use Carp;
  4         6  
  4         241  
10 4     4   3748 use WWW::Favicon ();
  4         466583  
  4         116  
11 4     4   94 use LWP::UserAgent;
  4         10  
  4         88  
12 4     4   23 use URI;
  4         8  
  4         6684  
13              
14             our $VERSION = "0.05";
15              
16             our @CARP_NOT = qw(Try::Tiny XML::FeedPP);
17              
18             sub new {
19 4     4 1 1978 my ($class, %args) = @_;
20             my $self = bless {
21             use_favicon => defined($args{use_favicon}) ? $args{use_favicon} : 1,
22             favicon_detector => WWW::Favicon->new,
23 4 50       59 user_agent => defined($args{user_agent}) ? $args{user_agent} : do {
    50          
    100          
24 4         63233 my $ua = LWP::UserAgent->new;
25 4         944 $ua->env_proxy;
26 4         271 $ua->timeout(30);
27 4         62 $ua->agent("BusyBird::Inpu::Feed-$VERSION"); ## some Web sites ban LWP::UserAgent's default UserAgent...
28 4         226 $ua;
29             },
30             image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3,
31             }, $class;
32              
33             ## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001)
34 4         41 $self->{favicon_detector}->ua($self->{user_agent});
35            
36 4         139 return $self;
37             }
38              
39             sub _get_url_head_and_dir {
40 520     520   10000 my ($url_raw) = @_;
41 520 50       1089 return (undef, undef) if not defined $url_raw;
42 520         2109 my $url = URI->new($url_raw);
43 520         55121 my $scheme = $url->scheme;
44 520         8060 my $authority = $url->authority;
45 520 50 33     6898 return (undef, undef) if !$scheme || !$authority;
46 520         1037 my $url_head = "$scheme://$authority";
47 520         542 my $url_dir;
48 520         1573 my $path = $url->path;
49 520 50       7048 if($path =~ m{^(.*/)}i) {
50 520         1091 $url_dir = $1;
51             }else {
52 0         0 $url_dir = "/";
53             }
54 520         1750 return ($url_head, $url_dir);
55             }
56              
57             sub _extract_image_urls {
58 545     545   894 my ($self, $feed_item) = @_;
59 545 100       1686 return () if $self->{image_max_num} == 0;
60 520         1539 my $content = $feed_item->description;
61 520 50       33966 return () if !defined($content);
62 520         1500 my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link);
63 520         1102 my @urls = ();
64 520   100     12296 while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num})
      100        
65             && $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) {
66 558         14153 my $url = URI->new($2);
67 558 100       34407 if(!$url->scheme) {
68             ## Only "path" segment is in the src attribute.
69 27 50 33     427 next if !defined($url_head) || !defined($url_dir);
70 27 100       98 if(substr("$url", 0, 1) eq "/") {
71 21         147 $url = "$url_head$url";
72             }else {
73 6         46 $url = "$url_head$url_dir$url";
74             }
75             }
76 558         8351 push @urls, "$url";
77             }
78 520         3505 return @urls;
79             }
80              
81             sub _get_home_url {
82 0     0   0 my ($self, $feed, $statuses) = @_;
83 0         0 my $home_url = $feed->link;
84 0 0       0 return $home_url if defined $home_url;
85            
86 0         0 foreach my $status (@$statuses) {
87 0 0       0 $home_url = $status->{busybird}{status_permalink} if defined($status->{busybird});
88 0 0       0 return $home_url if defined $home_url;
89             }
90 0         0 return undef;
91             }
92              
93             sub _get_favicon_url {
94 0     0   0 my ($self, $feed, $statuses) = @_;
95             return try {
96 0     0   0 my $home_url = $self->_get_home_url($feed, $statuses);
97 0 0       0 return undef if not defined $home_url;
98 0         0 my $favicon_url = $self->{favicon_detector}->detect($home_url);
99 0 0       0 return undef if not defined $favicon_url;
100 0         0 my $res = $self->{user_agent}->get($favicon_url);
101 0 0       0 return undef if !$res->is_success;
102 0         0 my $type = $res->header('Content-Type');
103 0 0 0     0 return undef if defined($type) && $type !~ /^image/i;
104 0         0 return $favicon_url;
105 0         0 };
106             }
107              
108             sub _make_timestamp_datetime {
109 545     545   18911 my ($self, $timestamp_str) = @_;
110 545 100       1484 return undef if not defined $timestamp_str;
111 500 50       1989 if($timestamp_str =~ /^\d+$/) {
112 0         0 return DateTime->from_epoch(epoch => $timestamp_str, time_zone => '+0000');
113             }
114 500     500   3026 my $datetime = try { DateTime::Format::ISO8601->parse_datetime($timestamp_str) };
  500         10393  
115 500 100       312857 return $datetime if defined $datetime;
116 45         237 return BusyBird::DateTime::Format->parse_datetime($timestamp_str);
117             }
118              
119             sub _make_status_from_item {
120 545     545   967 my ($self, $feed_title, $feed_item) = @_;
121 545         1833 my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate);
122 545 100       73400 my $status = {
123             text => $feed_item->title,
124             busybird => { status_permalink => $feed_item->link },
125             created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ),
126             user => { screen_name => $feed_title },
127             };
128 545         163420 my $guid = $feed_item->guid;
129 545         9302 my $item_id;
130 545 100       1140 if(defined $guid) {
131 416         607 $item_id = $guid;
132 416         1383 $status->{busybird}{original}{id} = $guid;
133             }else {
134 129         323 $item_id = $feed_item->link;
135             }
136 545 100 66     3700 if(defined($created_at_dt) && defined($item_id)) {
    50          
137 500         1624 $status->{id} = $created_at_dt->epoch . '|' . $item_id;
138             }elsif(defined($item_id)) {
139 45         109 $status->{id} = $item_id;
140             }
141 545         6199 my @image_urls = $self->_extract_image_urls($feed_item);
142 545 100       1193 if(@image_urls) {
143 210         346 $status->{extended_entities}{media} = [map { +{ media_url => $_, indices => [0,0] } } @image_urls];
  558         2621  
144             }
145 545         4137 return $status;
146             }
147              
148             sub _make_statuses_from_feed {
149 30     30   1297803 my ($self, $feed) = @_;
150 30         169 my $feed_title = $feed->title;
151 30         1045 my $statuses = [ map { $self->_make_status_from_item($feed_title, $_) } $feed->get_item ];
  545         1909  
152 30 50       479 return $statuses if !$self->{use_favicon};
153 0         0 my $favicon_url = $self->_get_favicon_url($feed, $statuses);
154 0 0       0 return $statuses if not defined $favicon_url;
155 0         0 $_->{user}{profile_image_url} = $favicon_url foreach @$statuses;
156 0         0 return $statuses;
157             }
158              
159             sub _parse_with_feedpp {
160 45     45   97 my ($self, $feed_source, $feed_type) = @_;
161 45         485 return $self->_make_statuses_from_feed(XML::FeedPP->new(
162             $feed_source, -type => $feed_type,
163             utf8_flag => 1, xml_deref => 1, lwp_useragent => $self->{user_agent},
164              
165             ## FeedPP and TreePP mess up with User-Agent. It's pretty annoying.
166             user_agent => scalar($self->{user_agent}->agent),
167             ));
168             }
169              
170             sub parse_string {
171 24     24 1 7463 my ($self, $string) = @_;
172 24         98 return $self->_parse_with_feedpp($string, "string");
173             }
174              
175             *parse = *parse_string;
176              
177             sub parse_file {
178 15     15 1 11206 my ($self, $filename) = @_;
179 15         60 return $self->_parse_with_feedpp($filename, "file");
180             }
181              
182             sub parse_url {
183 6     6 1 12942 my ($self, $url) = @_;
184 6         18 return $self->_parse_with_feedpp($url, "url");
185             }
186              
187             *parse_uri = *parse_url;
188              
189             1;
190             __END__