line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package XML::RSS::Feed; |
2
|
18
|
|
|
18
|
|
620033
|
use strict; |
|
18
|
|
|
|
|
50
|
|
|
18
|
|
|
|
|
1434
|
|
3
|
18
|
|
|
18
|
|
196
|
use warnings; |
|
18
|
|
|
|
|
117
|
|
|
18
|
|
|
|
|
920
|
|
4
|
18
|
|
|
18
|
|
9499
|
use XML::RSS; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
use XML::RSS::Headline; |
6
|
|
|
|
|
|
|
use Time::HiRes; |
7
|
|
|
|
|
|
|
use Storable qw(store retrieve); |
8
|
|
|
|
|
|
|
use Carp qw(carp); |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use constant DEFAULT_DELAY => 3600; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = 2.4; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
sub new { |
15
|
|
|
|
|
|
|
my ( $class, %args ) = @_; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $self = bless { |
18
|
|
|
|
|
|
|
process_count => 0, |
19
|
|
|
|
|
|
|
rss_headlines => [], |
20
|
|
|
|
|
|
|
rss_headline_ids => {}, |
21
|
|
|
|
|
|
|
max_headlines => 0, |
22
|
|
|
|
|
|
|
init_headlines_seen => 0, |
23
|
|
|
|
|
|
|
}, $class; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
foreach my $method ( keys %args ) { |
26
|
|
|
|
|
|
|
if ( $self->can($method) ) { |
27
|
|
|
|
|
|
|
$self->$method( $args{$method} ); |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
else { |
30
|
|
|
|
|
|
|
carp "Invalid argument '$method'"; |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
$self->_load_cached_headlines if $self->{tmpdir}; |
34
|
|
|
|
|
|
|
$self->delay(DEFAULT_DELAY) unless $self->delay; |
35
|
|
|
|
|
|
|
return $self; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub _load_cached_headlines { |
39
|
|
|
|
|
|
|
my ($self) = @_; |
40
|
|
|
|
|
|
|
my $filename_sto = $self->{tmpdir} . '/' . $self->name . '.sto'; |
41
|
|
|
|
|
|
|
my $filename_xml = $self->{tmpdir} . '/' . $self->name; |
42
|
|
|
|
|
|
|
if ( -s $filename_sto ) { |
43
|
|
|
|
|
|
|
my $cached = retrieve($filename_sto); |
44
|
|
|
|
|
|
|
my $title = $self->title || $cached->{title} || ''; |
45
|
|
|
|
|
|
|
$self->set_last_updated( $cached->{last_updated} ); |
46
|
|
|
|
|
|
|
$self->{process_count}++; |
47
|
|
|
|
|
|
|
$self->process( $cached->{items}, $title, $cached->{link} ); |
48
|
|
|
|
|
|
|
warn "[$self->{name}] Loaded Cached RSS Storable\n" if $self->{debug}; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
elsif ( -T $filename_xml ) { # legacy XML caching |
51
|
|
|
|
|
|
|
if ( open( my $fh, '<', $filename_xml ) ) { |
52
|
|
|
|
|
|
|
my $xml = do { local $/ = undef, <$fh> }; |
53
|
|
|
|
|
|
|
close $fh; |
54
|
|
|
|
|
|
|
warn "[$self->{name}] Loaded Cached RSS XML\n" if $self->{debug}; |
55
|
|
|
|
|
|
|
$self->{process_count}++; |
56
|
|
|
|
|
|
|
$self->parse($xml); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
else { |
59
|
|
|
|
|
|
|
carp "[$self->{name}] Failed to load XML cache $filename_xml\n"; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
else { |
63
|
|
|
|
|
|
|
warn "[$self->{name}] No Cache File Found\n" if $self->{debug}; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
return; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub _strip_whitespace { |
69
|
|
|
|
|
|
|
my ($string) = @_; |
70
|
|
|
|
|
|
|
$string =~ s/^\s+//; |
71
|
|
|
|
|
|
|
$string =~ s/\s+$//; |
72
|
|
|
|
|
|
|
return $string; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub _mark_all_headlines_seen { |
76
|
|
|
|
|
|
|
my ($self) = @_; |
77
|
|
|
|
|
|
|
return unless $self->{process_count} || $self->{init_headlines_seen}; |
78
|
|
|
|
|
|
|
$self->{rss_headline_ids}{ $_->id } = 1 for $self->late_breaking_news; |
79
|
|
|
|
|
|
|
return; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub parse { |
83
|
|
|
|
|
|
|
my ( $self, $xml ) = @_; |
84
|
|
|
|
|
|
|
my $rss = XML::RSS->new(); |
85
|
|
|
|
|
|
|
eval { $rss->parse($xml) }; |
86
|
|
|
|
|
|
|
unless ($@) { |
87
|
|
|
|
|
|
|
warn "[$self->{name}] Parsed RSS XML\n" if $self->{debug}; |
88
|
|
|
|
|
|
|
my $items = [ map { { item => $_ } } @{ $rss->{items} } ]; |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
$self->process( |
91
|
|
|
|
|
|
|
$items, |
92
|
|
|
|
|
|
|
( $self->title || $rss->{channel}{title} ), |
93
|
|
|
|
|
|
|
$rss->{channel}{link} |
94
|
|
|
|
|
|
|
); |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
return 1; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
carp "[$self->{name}] [!!] Failed to parse RSS XML: $@\n"; |
99
|
|
|
|
|
|
|
return; |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub process { |
103
|
|
|
|
|
|
|
my ( $self, $items, $title, $link ) = @_; |
104
|
|
|
|
|
|
|
return unless $items; |
105
|
|
|
|
|
|
|
$self->pre_process; |
106
|
|
|
|
|
|
|
$self->process_items($items); |
107
|
|
|
|
|
|
|
$self->title($title) if $title; |
108
|
|
|
|
|
|
|
$self->link($link) if $link; |
109
|
|
|
|
|
|
|
$self->post_process; |
110
|
|
|
|
|
|
|
return 1; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub pre_process { |
114
|
|
|
|
|
|
|
my ($self) = @_; |
115
|
|
|
|
|
|
|
$self->_mark_all_headlines_seen; |
116
|
|
|
|
|
|
|
return; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub process_items { |
120
|
|
|
|
|
|
|
my ( $self, $items ) = @_; |
121
|
|
|
|
|
|
|
return unless $items; |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# used 'reverse' so order seen is preserved |
124
|
|
|
|
|
|
|
for my $item ( reverse @$items ) { |
125
|
|
|
|
|
|
|
$self->create_headline(%$item); |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
return 1; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub post_process { |
131
|
|
|
|
|
|
|
my ($self) = @_; |
132
|
|
|
|
|
|
|
if ( $self->init ) { |
133
|
|
|
|
|
|
|
warn "[$self->{name}] " |
134
|
|
|
|
|
|
|
. $self->late_breaking_news |
135
|
|
|
|
|
|
|
. " New Headlines Found\n" |
136
|
|
|
|
|
|
|
if $self->{debug}; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
else { |
139
|
|
|
|
|
|
|
$self->_mark_all_headlines_seen; |
140
|
|
|
|
|
|
|
$self->init(1); |
141
|
|
|
|
|
|
|
warn "[$self->{name}] " |
142
|
|
|
|
|
|
|
. $self->num_headlines |
143
|
|
|
|
|
|
|
. " Headlines Initialized\n" |
144
|
|
|
|
|
|
|
if $self->{debug}; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
$self->{process_count}++; |
147
|
|
|
|
|
|
|
$self->cache; |
148
|
|
|
|
|
|
|
$self->set_last_updated; |
149
|
|
|
|
|
|
|
return; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub create_headline { |
153
|
|
|
|
|
|
|
my ( $self, %args ) = @_; |
154
|
|
|
|
|
|
|
my $hlobj = $self->{hlobj} || 'XML::RSS::Headline'; |
155
|
|
|
|
|
|
|
$args{headline_as_id} = $self->{headline_as_id}; |
156
|
|
|
|
|
|
|
my $headline = $hlobj->new(%args); |
157
|
|
|
|
|
|
|
return unless $headline; |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
unshift( @{ $self->{rss_headlines} }, $headline ) |
160
|
|
|
|
|
|
|
unless $self->seen_headline( $headline->id ); |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# remove the oldest if the new headline put us over the max_headlines limit |
163
|
|
|
|
|
|
|
if ( $self->max_headlines ) { |
164
|
|
|
|
|
|
|
while ( $self->num_headlines > $self->max_headlines ) { |
165
|
|
|
|
|
|
|
my $garbage = pop @{ $self->{rss_headlines} }; |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# just in case max_headlines < number of headlines in the feed |
168
|
|
|
|
|
|
|
$self->{rss_headline_ids}{ $garbage->id } = 1; |
169
|
|
|
|
|
|
|
warn "[$self->{name}] Exceeded maximum headlines, removing " |
170
|
|
|
|
|
|
|
. "oldest headline\n" |
171
|
|
|
|
|
|
|
if $self->{debug}; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
return; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub num_headlines { |
178
|
|
|
|
|
|
|
my ($self) = @_; |
179
|
|
|
|
|
|
|
return scalar @{ $self->{rss_headlines} }; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub seen_headline { |
183
|
|
|
|
|
|
|
my ( $self, $id ) = @_; |
184
|
|
|
|
|
|
|
return 1 if exists $self->{rss_headline_ids}{$id}; |
185
|
|
|
|
|
|
|
return; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub headlines { |
189
|
|
|
|
|
|
|
my ($self) = @_; |
190
|
|
|
|
|
|
|
return wantarray ? @{ $self->{rss_headlines} } : $self->{rss_headlines}; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub late_breaking_news { |
194
|
|
|
|
|
|
|
my ($self) = @_; |
195
|
|
|
|
|
|
|
my @list = grep { !$self->seen_headline( $_->id ); } |
196
|
|
|
|
|
|
|
@{ $self->{rss_headlines} }; |
197
|
|
|
|
|
|
|
return wantarray ? @list : scalar @list; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
sub cache { |
201
|
|
|
|
|
|
|
my ($self) = @_; |
202
|
|
|
|
|
|
|
return unless $self->tmpdir; |
203
|
|
|
|
|
|
|
if ( -d $self->tmpdir && $self->num_headlines ) { |
204
|
|
|
|
|
|
|
my $tmp_filename = $self->tmpdir . '/' . $self->{name} . '.sto'; |
205
|
|
|
|
|
|
|
eval { store( $self->_build_dump_structure, $tmp_filename ) }; |
206
|
|
|
|
|
|
|
if ($@) { |
207
|
|
|
|
|
|
|
carp "[$self->{name}] Could not cache RSS XML to $tmp_filename\n"; |
208
|
|
|
|
|
|
|
return; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
else { |
211
|
|
|
|
|
|
|
warn "[$self->{name}] Cached RSS Storable to $tmp_filename\n" if $self->{debug}; |
212
|
|
|
|
|
|
|
return 1; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
return; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub _build_dump_structure { |
219
|
|
|
|
|
|
|
my ($self) = @_; |
220
|
|
|
|
|
|
|
my $cached = {}; |
221
|
|
|
|
|
|
|
$cached->{title} = $self->title; |
222
|
|
|
|
|
|
|
$cached->{link} = $self->link; |
223
|
|
|
|
|
|
|
$cached->{last_updated} = $self->{timestamp_hires}; |
224
|
|
|
|
|
|
|
$cached->{items} = []; |
225
|
|
|
|
|
|
|
for my $headline ( $self->headlines ) { |
226
|
|
|
|
|
|
|
push @{ $cached->{items} }, { |
227
|
|
|
|
|
|
|
headline => $headline->headline, |
228
|
|
|
|
|
|
|
url => $headline->url, |
229
|
|
|
|
|
|
|
description => $headline->description, |
230
|
|
|
|
|
|
|
first_seen => $headline->first_seen_hires, |
231
|
|
|
|
|
|
|
guid => $headline->guid, |
232
|
|
|
|
|
|
|
}; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
return $cached; |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub set_last_updated { |
238
|
|
|
|
|
|
|
my ( $self, $hires_time ) = @_; |
239
|
|
|
|
|
|
|
$self->{hires_timestamp} = $hires_time if $hires_time; |
240
|
|
|
|
|
|
|
$self->{hires_timestamp} = Time::HiRes::time() |
241
|
|
|
|
|
|
|
unless $self->{hires_timestamp}; |
242
|
|
|
|
|
|
|
return; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
sub last_updated { |
246
|
|
|
|
|
|
|
my ($self) = @_; |
247
|
|
|
|
|
|
|
return int $self->{hires_timestamp}; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
sub last_updated_hires { |
251
|
|
|
|
|
|
|
my ($self) = @_; |
252
|
|
|
|
|
|
|
return $self->{hires_timestamp}; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
sub title { |
256
|
|
|
|
|
|
|
my ( $self, $title ) = @_; |
257
|
|
|
|
|
|
|
if ($title) { |
258
|
|
|
|
|
|
|
$title = _strip_whitespace($title); |
259
|
|
|
|
|
|
|
$self->{title} = $title if $title; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
return $self->{title}; |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
sub debug { |
265
|
|
|
|
|
|
|
my $self = shift @_; |
266
|
|
|
|
|
|
|
$self->{debug} = shift if @_; |
267
|
|
|
|
|
|
|
return $self->{debug}; |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
sub init { |
271
|
|
|
|
|
|
|
my $self = shift @_; |
272
|
|
|
|
|
|
|
$self->{init} = shift if @_; |
273
|
|
|
|
|
|
|
return $self->{init}; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
sub name { |
277
|
|
|
|
|
|
|
my $self = shift; |
278
|
|
|
|
|
|
|
$self->{name} = shift if @_; |
279
|
|
|
|
|
|
|
return $self->{name}; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
sub delay { |
283
|
|
|
|
|
|
|
my $self = shift @_; |
284
|
|
|
|
|
|
|
$self->{delay} = shift if @_; |
285
|
|
|
|
|
|
|
return $self->{delay}; |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
sub link { |
289
|
|
|
|
|
|
|
my $self = shift @_; |
290
|
|
|
|
|
|
|
$self->{link} = shift if @_; |
291
|
|
|
|
|
|
|
return $self->{link}; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
sub url { |
295
|
|
|
|
|
|
|
my $self = shift @_; |
296
|
|
|
|
|
|
|
$self->{url} = shift if @_; |
297
|
|
|
|
|
|
|
return $self->{url}; |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
sub headline_as_id { |
301
|
|
|
|
|
|
|
my ( $self, $bool ) = @_; |
302
|
|
|
|
|
|
|
if ( defined $bool ) { |
303
|
|
|
|
|
|
|
$self->{headline_as_id} = $bool; |
304
|
|
|
|
|
|
|
$_->headline_as_id($bool) for $self->headlines; |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
return $self->{headline_as_id}; |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
sub hlobj { |
310
|
|
|
|
|
|
|
my ( $self, $hlobj ) = @_; |
311
|
|
|
|
|
|
|
$self->{hlobj} = $hlobj if defined $hlobj; |
312
|
|
|
|
|
|
|
return $self->{hlobj}; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
sub tmpdir { |
316
|
|
|
|
|
|
|
my $self = shift @_; |
317
|
|
|
|
|
|
|
$self->{tmpdir} = shift if @_; |
318
|
|
|
|
|
|
|
return $self->{tmpdir}; |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
sub init_headlines_seen { |
322
|
|
|
|
|
|
|
my $self = shift @_; |
323
|
|
|
|
|
|
|
$self->{init_headlines_seen} = shift if @_; |
324
|
|
|
|
|
|
|
return $self->{init_headlines_seen}; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
sub max_headlines { |
328
|
|
|
|
|
|
|
my $self = shift @_; |
329
|
|
|
|
|
|
|
$self->{max_headlines} = shift if @_; |
330
|
|
|
|
|
|
|
return $self->{max_headlines}; |
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub failed_to_fetch { |
334
|
|
|
|
|
|
|
carp __PACKAGE__ . '::failed_to_fetch has been deprecated'; |
335
|
|
|
|
|
|
|
return; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
sub failed_to_parse { |
339
|
|
|
|
|
|
|
carp __PACKAGE__ . '::failed_to_parse has been deprecated'; |
340
|
|
|
|
|
|
|
return; |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
1; |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
__END__ |