line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package BusyBird::Input::Feed; |
2
|
4
|
|
|
4
|
|
107348
|
use strict; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
144
|
|
3
|
4
|
|
|
4
|
|
20
|
use warnings; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
113
|
|
4
|
4
|
|
|
4
|
|
6519
|
use XML::FeedPP; |
|
4
|
|
|
|
|
137467
|
|
|
4
|
|
|
|
|
133
|
|
5
|
4
|
|
|
4
|
|
4978
|
use DateTime::Format::ISO8601; |
|
4
|
|
|
|
|
1060545
|
|
|
4
|
|
|
|
|
334
|
|
6
|
4
|
|
|
4
|
|
6482
|
use BusyBird::DateTime::Format; |
|
4
|
|
|
|
|
9767
|
|
|
4
|
|
|
|
|
152
|
|
7
|
4
|
|
|
4
|
|
30
|
use DateTime; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
86
|
|
8
|
4
|
|
|
4
|
|
20
|
use Try::Tiny; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
223
|
|
9
|
4
|
|
|
4
|
|
24
|
use Carp; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
241
|
|
10
|
4
|
|
|
4
|
|
3748
|
use WWW::Favicon (); |
|
4
|
|
|
|
|
466583
|
|
|
4
|
|
|
|
|
116
|
|
11
|
4
|
|
|
4
|
|
94
|
use LWP::UserAgent; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
88
|
|
12
|
4
|
|
|
4
|
|
23
|
use URI; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
6684
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = "0.05"; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @CARP_NOT = qw(Try::Tiny XML::FeedPP); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
sub new { |
19
|
4
|
|
|
4
|
1
|
1978
|
my ($class, %args) = @_; |
20
|
|
|
|
|
|
|
my $self = bless { |
21
|
|
|
|
|
|
|
use_favicon => defined($args{use_favicon}) ? $args{use_favicon} : 1, |
22
|
|
|
|
|
|
|
favicon_detector => WWW::Favicon->new, |
23
|
4
|
50
|
|
|
|
59
|
user_agent => defined($args{user_agent}) ? $args{user_agent} : do { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
24
|
4
|
|
|
|
|
63233
|
my $ua = LWP::UserAgent->new; |
25
|
4
|
|
|
|
|
944
|
$ua->env_proxy; |
26
|
4
|
|
|
|
|
271
|
$ua->timeout(30); |
27
|
4
|
|
|
|
|
62
|
$ua->agent("BusyBird::Inpu::Feed-$VERSION"); ## some Web sites ban LWP::UserAgent's default UserAgent... |
28
|
4
|
|
|
|
|
226
|
$ua; |
29
|
|
|
|
|
|
|
}, |
30
|
|
|
|
|
|
|
image_max_num => defined($args{image_max_num}) ? $args{image_max_num} : 3, |
31
|
|
|
|
|
|
|
}, $class; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
## Note that WWW::Favicon#ua accessor method is not documented (as of version 0.03001) |
34
|
4
|
|
|
|
|
41
|
$self->{favicon_detector}->ua($self->{user_agent}); |
35
|
|
|
|
|
|
|
|
36
|
4
|
|
|
|
|
139
|
return $self; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub _get_url_head_and_dir { |
40
|
520
|
|
|
520
|
|
10000
|
my ($url_raw) = @_; |
41
|
520
|
50
|
|
|
|
1089
|
return (undef, undef) if not defined $url_raw; |
42
|
520
|
|
|
|
|
2109
|
my $url = URI->new($url_raw); |
43
|
520
|
|
|
|
|
55121
|
my $scheme = $url->scheme; |
44
|
520
|
|
|
|
|
8060
|
my $authority = $url->authority; |
45
|
520
|
50
|
33
|
|
|
6898
|
return (undef, undef) if !$scheme || !$authority; |
46
|
520
|
|
|
|
|
1037
|
my $url_head = "$scheme://$authority"; |
47
|
520
|
|
|
|
|
542
|
my $url_dir; |
48
|
520
|
|
|
|
|
1573
|
my $path = $url->path; |
49
|
520
|
50
|
|
|
|
7048
|
if($path =~ m{^(.*/)}i) { |
50
|
520
|
|
|
|
|
1091
|
$url_dir = $1; |
51
|
|
|
|
|
|
|
}else { |
52
|
0
|
|
|
|
|
0
|
$url_dir = "/"; |
53
|
|
|
|
|
|
|
} |
54
|
520
|
|
|
|
|
1750
|
return ($url_head, $url_dir); |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub _extract_image_urls { |
58
|
545
|
|
|
545
|
|
894
|
my ($self, $feed_item) = @_; |
59
|
545
|
100
|
|
|
|
1686
|
return () if $self->{image_max_num} == 0; |
60
|
520
|
|
|
|
|
1539
|
my $content = $feed_item->description; |
61
|
520
|
50
|
|
|
|
33966
|
return () if !defined($content); |
62
|
520
|
|
|
|
|
1500
|
my ($url_head, $url_dir) = _get_url_head_and_dir($feed_item->link); |
63
|
520
|
|
|
|
|
1102
|
my @urls = (); |
64
|
520
|
|
100
|
|
|
12296
|
while(($self->{image_max_num} < 0 || @urls < $self->{image_max_num}) |
|
|
|
100
|
|
|
|
|
65
|
|
|
|
|
|
|
&& $content =~ m{<\s*img\s+[^>]*src\s*=\s*(['"])([^>]+?)\1[^>]*>}ig) { |
66
|
558
|
|
|
|
|
14153
|
my $url = URI->new($2); |
67
|
558
|
100
|
|
|
|
34407
|
if(!$url->scheme) { |
68
|
|
|
|
|
|
|
## Only "path" segment is in the src attribute. |
69
|
27
|
50
|
33
|
|
|
427
|
next if !defined($url_head) || !defined($url_dir); |
70
|
27
|
100
|
|
|
|
98
|
if(substr("$url", 0, 1) eq "/") { |
71
|
21
|
|
|
|
|
147
|
$url = "$url_head$url"; |
72
|
|
|
|
|
|
|
}else { |
73
|
6
|
|
|
|
|
46
|
$url = "$url_head$url_dir$url"; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
} |
76
|
558
|
|
|
|
|
8351
|
push @urls, "$url"; |
77
|
|
|
|
|
|
|
} |
78
|
520
|
|
|
|
|
3505
|
return @urls; |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub _get_home_url { |
82
|
0
|
|
|
0
|
|
0
|
my ($self, $feed, $statuses) = @_; |
83
|
0
|
|
|
|
|
0
|
my $home_url = $feed->link; |
84
|
0
|
0
|
|
|
|
0
|
return $home_url if defined $home_url; |
85
|
|
|
|
|
|
|
|
86
|
0
|
|
|
|
|
0
|
foreach my $status (@$statuses) { |
87
|
0
|
0
|
|
|
|
0
|
$home_url = $status->{busybird}{status_permalink} if defined($status->{busybird}); |
88
|
0
|
0
|
|
|
|
0
|
return $home_url if defined $home_url; |
89
|
|
|
|
|
|
|
} |
90
|
0
|
|
|
|
|
0
|
return undef; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub _get_favicon_url { |
94
|
0
|
|
|
0
|
|
0
|
my ($self, $feed, $statuses) = @_; |
95
|
|
|
|
|
|
|
return try { |
96
|
0
|
|
|
0
|
|
0
|
my $home_url = $self->_get_home_url($feed, $statuses); |
97
|
0
|
0
|
|
|
|
0
|
return undef if not defined $home_url; |
98
|
0
|
|
|
|
|
0
|
my $favicon_url = $self->{favicon_detector}->detect($home_url); |
99
|
0
|
0
|
|
|
|
0
|
return undef if not defined $favicon_url; |
100
|
0
|
|
|
|
|
0
|
my $res = $self->{user_agent}->get($favicon_url); |
101
|
0
|
0
|
|
|
|
0
|
return undef if !$res->is_success; |
102
|
0
|
|
|
|
|
0
|
my $type = $res->header('Content-Type'); |
103
|
0
|
0
|
0
|
|
|
0
|
return undef if defined($type) && $type !~ /^image/i; |
104
|
0
|
|
|
|
|
0
|
return $favicon_url; |
105
|
0
|
|
|
|
|
0
|
}; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
sub _make_timestamp_datetime { |
109
|
545
|
|
|
545
|
|
18911
|
my ($self, $timestamp_str) = @_; |
110
|
545
|
100
|
|
|
|
1484
|
return undef if not defined $timestamp_str; |
111
|
500
|
50
|
|
|
|
1989
|
if($timestamp_str =~ /^\d+$/) { |
112
|
0
|
|
|
|
|
0
|
return DateTime->from_epoch(epoch => $timestamp_str, time_zone => '+0000'); |
113
|
|
|
|
|
|
|
} |
114
|
500
|
|
|
500
|
|
3026
|
my $datetime = try { DateTime::Format::ISO8601->parse_datetime($timestamp_str) }; |
|
500
|
|
|
|
|
10393
|
|
115
|
500
|
100
|
|
|
|
312857
|
return $datetime if defined $datetime; |
116
|
45
|
|
|
|
|
237
|
return BusyBird::DateTime::Format->parse_datetime($timestamp_str); |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub _make_status_from_item { |
120
|
545
|
|
|
545
|
|
967
|
my ($self, $feed_title, $feed_item) = @_; |
121
|
545
|
|
|
|
|
1833
|
my $created_at_dt = $self->_make_timestamp_datetime($feed_item->pubDate); |
122
|
545
|
100
|
|
|
|
73400
|
my $status = { |
123
|
|
|
|
|
|
|
text => $feed_item->title, |
124
|
|
|
|
|
|
|
busybird => { status_permalink => $feed_item->link }, |
125
|
|
|
|
|
|
|
created_at => ($created_at_dt ? BusyBird::DateTime::Format->format_datetime($created_at_dt) : undef ), |
126
|
|
|
|
|
|
|
user => { screen_name => $feed_title }, |
127
|
|
|
|
|
|
|
}; |
128
|
545
|
|
|
|
|
163420
|
my $guid = $feed_item->guid; |
129
|
545
|
|
|
|
|
9302
|
my $item_id; |
130
|
545
|
100
|
|
|
|
1140
|
if(defined $guid) { |
131
|
416
|
|
|
|
|
607
|
$item_id = $guid; |
132
|
416
|
|
|
|
|
1383
|
$status->{busybird}{original}{id} = $guid; |
133
|
|
|
|
|
|
|
}else { |
134
|
129
|
|
|
|
|
323
|
$item_id = $feed_item->link; |
135
|
|
|
|
|
|
|
} |
136
|
545
|
100
|
66
|
|
|
3700
|
if(defined($created_at_dt) && defined($item_id)) { |
|
|
50
|
|
|
|
|
|
137
|
500
|
|
|
|
|
1624
|
$status->{id} = $created_at_dt->epoch . '|' . $item_id; |
138
|
|
|
|
|
|
|
}elsif(defined($item_id)) { |
139
|
45
|
|
|
|
|
109
|
$status->{id} = $item_id; |
140
|
|
|
|
|
|
|
} |
141
|
545
|
|
|
|
|
6199
|
my @image_urls = $self->_extract_image_urls($feed_item); |
142
|
545
|
100
|
|
|
|
1193
|
if(@image_urls) { |
143
|
210
|
|
|
|
|
346
|
$status->{extended_entities}{media} = [map { +{ media_url => $_, indices => [0,0] } } @image_urls]; |
|
558
|
|
|
|
|
2621
|
|
144
|
|
|
|
|
|
|
} |
145
|
545
|
|
|
|
|
4137
|
return $status; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub _make_statuses_from_feed { |
149
|
30
|
|
|
30
|
|
1297803
|
my ($self, $feed) = @_; |
150
|
30
|
|
|
|
|
169
|
my $feed_title = $feed->title; |
151
|
30
|
|
|
|
|
1045
|
my $statuses = [ map { $self->_make_status_from_item($feed_title, $_) } $feed->get_item ]; |
|
545
|
|
|
|
|
1909
|
|
152
|
30
|
50
|
|
|
|
479
|
return $statuses if !$self->{use_favicon}; |
153
|
0
|
|
|
|
|
0
|
my $favicon_url = $self->_get_favicon_url($feed, $statuses); |
154
|
0
|
0
|
|
|
|
0
|
return $statuses if not defined $favicon_url; |
155
|
0
|
|
|
|
|
0
|
$_->{user}{profile_image_url} = $favicon_url foreach @$statuses; |
156
|
0
|
|
|
|
|
0
|
return $statuses; |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
sub _parse_with_feedpp { |
160
|
45
|
|
|
45
|
|
97
|
my ($self, $feed_source, $feed_type) = @_; |
161
|
45
|
|
|
|
|
485
|
return $self->_make_statuses_from_feed(XML::FeedPP->new( |
162
|
|
|
|
|
|
|
$feed_source, -type => $feed_type, |
163
|
|
|
|
|
|
|
utf8_flag => 1, xml_deref => 1, lwp_useragent => $self->{user_agent}, |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
## FeedPP and TreePP mess up with User-Agent. It's pretty annoying. |
166
|
|
|
|
|
|
|
user_agent => scalar($self->{user_agent}->agent), |
167
|
|
|
|
|
|
|
)); |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub parse_string { |
171
|
24
|
|
|
24
|
1
|
7463
|
my ($self, $string) = @_; |
172
|
24
|
|
|
|
|
98
|
return $self->_parse_with_feedpp($string, "string"); |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
*parse = *parse_string; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub parse_file { |
178
|
15
|
|
|
15
|
1
|
11206
|
my ($self, $filename) = @_; |
179
|
15
|
|
|
|
|
60
|
return $self->_parse_with_feedpp($filename, "file"); |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub parse_url { |
183
|
6
|
|
|
6
|
1
|
12942
|
my ($self, $url) = @_; |
184
|
6
|
|
|
|
|
18
|
return $self->_parse_with_feedpp($url, "url"); |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
*parse_uri = *parse_url; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
1; |
190
|
|
|
|
|
|
|
__END__ |