line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
2
|
|
|
2
|
|
3675
|
use strict; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
121
|
|
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
package HTML::RSSAutodiscovery; |
4
|
2
|
|
|
2
|
|
14
|
use base qw (HTML::Parser); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
2682
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# $Id: RSSAutodiscovery.pm,v 1.5 2004/10/17 04:13:06 asc Exp $ |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
HTML::RSSAutodiscovery - methods for retrieving RSS-ish information from an HTML document. |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use HTML::RSSAutodiscovery; |
15
|
|
|
|
|
|
|
use Data::Dumper; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $url = "http://www.diveintomark.org/"; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my $html = HTML::RSSAutodiscovery->new(); |
20
|
|
|
|
|
|
|
print &Dumper($html->parse($url)); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Mark's gone a bit nuts with this and |
23
|
|
|
|
|
|
|
# the list is too long to include here... |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# see the POD for the 'parse' method for |
26
|
|
|
|
|
|
|
# details of what it returns. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 DESCRIPTION |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Methods for retrieving RSS-ish information from an HTML document. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=cut |
33
|
|
|
|
|
|
|
|
34
|
2
|
|
|
2
|
|
20619
|
use LWP::UserAgent; |
|
2
|
|
|
|
|
165752
|
|
|
2
|
|
|
|
|
74
|
|
35
|
2
|
|
|
2
|
|
21
|
use HTTP::Request; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
50
|
|
36
|
2
|
|
|
2
|
|
10
|
use Carp; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
192
|
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
$HTML::RSSAutodiscovery::VERSION = '1.21'; |
39
|
|
|
|
|
|
|
|
40
|
2
|
|
|
2
|
|
11
|
# Fixed strings used throughout the module:
#   SYNDIC8_PROXY - XML-RPC endpoint handed to _xmlrpc() by _check_syndic8()
#   SYNDIC8_CLASS - prefix shared by the remote method names below
#   MIMETYPE_RSS  - value of the 'type' key on links this module builds itself
use constant {
    SYNDIC8_PROXY => "http://www.syndic8.com/xmlrpc.php",
    SYNDIC8_CLASS => "syndic8",
    MIMETYPE_RSS  => "application/rss+xml",
};

# Remote method names, derived from SYNDIC8_CLASS. They need their own
# 'use constant' because SYNDIC8_CLASS must already exist when they are built.
use constant {
    SYNDIC8_FINDSITES => SYNDIC8_CLASS . "." . "FindSites",
    SYNDIC8_FEEDINFO  => SYNDIC8_CLASS . "." . "GetFeedInfo",
};
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
4233
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 PACKAGE METHODS |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head2 __PACKAGE__->new() |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Object constructor. Returns an object. Woot! |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=cut |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Constructor. Blesses an empty hash into the invoking class and hands any
# remaining arguments to init(). Returns the new object, or undef when
# init() reports failure by returning a false value.
sub new {
    my ($class, @init_args) = @_;

    my $object = bless {}, $class;

    return $object->init(@init_args) ? $object : undef;
}
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Initialise the parent parser so that every start tag is routed to
# _start() with the parser object, the tag name and the attribute hash.
# Any arguments passed in by new() are ignored. Always returns 1.
sub init {
    my ($self) = @_;

    my @start_handler = (\&_start, "self,tagname,attr");
    $self->SUPER::init(start_h => \@start_handler);

    return 1;
}
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head1 OBJECT METHODS |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head2 $obj->parse($arg) |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Parse an HTML document and return RSS-ish <link> information. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
I<$arg> may be either: |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=over 4 |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item * |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
An HTML string, passed as a scalar reference. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item * |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
A URI. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=back |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Returns an array reference of hash references whose keys are : |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=over 4 |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=item * |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
I<rel> |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item * |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
I<href> |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=item * |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
I<title> |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=item * |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
I<type> |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=back |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=cut |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# Parse one HTML document and return the array reference of collected links.
#
# $arg is either a reference to a scalar holding the HTML, or anything else,
# in which case it is treated as a URI and downloaded with _fetch().
# Returns undef when the download fails.
#
# The '__embedded' and '__links' lists are only created when missing, so
# calling parse() repeatedly on one object appends to the earlier results
# (locate() empties both lists before it calls parse()).
sub parse {
    my ($self, $arg) = @_;

    my $html_ref = $arg;

    unless (ref($html_ref) eq "SCALAR") {
        $html_ref = $self->_fetch($arg);
        return undef unless $html_ref;
    }

    foreach my $slot ('__embedded', '__links') {
        $self->{$slot} ||= [];
    }

    # The parent parser invokes _start() for each start tag it sees.
    $self->SUPER::parse(${$html_ref});

    return $self->{'__links'};
}
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head2 $obj->locate($uri,\%args) |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Like the I<parse> method, but will perform additional lookups, if necessary or specified. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Valid arguments are |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=over 4 |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item * |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
B<$uri> |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
String. A live, breathing URI to slurp and parse. |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
I |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=item * |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Hash ref whose keys may be |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=over 4 |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item * |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
B<noparse> |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Boolean. Don't bother parsing the document, this will also prevent you |
164
|
|
|
|
|
|
|
from checking for embedded links. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
I don't know why you want to do this, but you can. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
False, by default. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=item * |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
B<embedded> |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Boolean. Check all embedded links ending in '.xml', '.rss' or '.rdf' |
175
|
|
|
|
|
|
|
(and then 'xml', 'rss' or 'rdf') for RSS-ness. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
False, by default, unless the initial parsing of the URI returns no |
178
|
|
|
|
|
|
|
RSS links. |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=item * |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
B<embedded_and_remote> |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Boolean. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Boolean. Check all embedded links whose root is not the same as I<$uri> |
187
|
|
|
|
|
|
|
for RSS-ness. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
False, by default. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=item * |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
B<syndic8> |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Boolean. Check the syndic8 servers for sites matching I<$uri> |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
False, by default, unless the initial parsing of the URI and any embedded links |
198
|
|
|
|
|
|
|
returns no RSS links. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=back |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=back |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Returns an array reference of hash references whose keys are : |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=over 4 |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=item * |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
I<rel> |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item * |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
I<href> |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item * |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
I<title> |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item * |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
I<type> |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=back |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=cut |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# Locate RSS feeds for $uri, escalating through up to three strategies:
#
#   1. parse() the document and collect its <link> elements;
#   2. probe anchors collected during the parse (_check_embedded), when
#      'embedded' was requested or step 1 found nothing;
#   3. ask Syndic8 (_check_syndic8), when 'syndic8' was requested or
#      nothing has been found so far.
#
# $args is an optional hash reference; recognised keys are 'noparse',
# 'embedded', 'embedded_and_remote' and 'syndic8' (all booleans).
# Anything that is not a hash reference is ignored.
#
# Returns the array reference of links found (possibly empty).
sub locate {
    my ($self, $uri, $args) = @_;

    # Every call starts from a clean slate. The '__linked' cache and the
    # '__check_embedded' flag used to survive from one call to the next,
    # so results of an earlier lookup could leak into this one.
    $self->{'__embedded'} = [];
    $self->{'__links'}    = [];
    $self->{'__linked'}   = {};
    delete $self->{'__check_embedded'};

    my %opt = (ref($args) eq "HASH") ? %{$args} : ();

    my $parse    = $opt{noparse}  ? 0 : 1;
    my $embedded = $opt{embedded} ? 1 : 0;
    my $syndic8  = $opt{syndic8}  ? 1 : 0;

    if ($parse) {

        # _start() only records <a href="..."> candidates while this flag
        # is set, so it has to be on for every parse: otherwise the
        # "nothing found, try the embedded links" fallback below would
        # always be looking at an empty list. 1 restricts the later probe
        # to links under $uri, 2 also allows remote ones (and, as before,
        # is only honoured together with 'embedded').
        $self->{'__check_embedded'} =
            ($embedded && $opt{embedded_and_remote}) ? 2 : 1;

        $self->parse($uri);
    }

    if ($parse && ($embedded || ! @{$self->{'__links'}})) {
        $self->_check_embedded($uri);

        # Still nothing: retry with the looser filename pattern.
        if (! @{$self->{'__links'}}) {
            $self->_check_embedded($uri, {liberal => 1});
        }
    }

    if ($syndic8 || ! @{$self->{'__links'}}) {
        $self->_check_syndic8($uri);
    }

    return $self->{'__links'};
}
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# Download $uri with an HTTP GET, creating the LWP::UserAgent on first use
# and keeping it in $self->{'__ua'} for later calls.
# Returns a reference to the response body as handed back by content(),
# or undef when the response is not a success.
sub _fetch {
    my ($self, $uri) = @_;

    my $agent    = ($self->{'__ua'} ||= LWP::UserAgent->new());
    my $request  = HTTP::Request->new(GET => $uri);
    my $response = $agent->request($request);

    return undef unless $response->is_success();

    my $body = $response->content();
    return \$body;
}
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# Probe the anchors collected in $self->{'__embedded'} and record the ones
# that turn out to be RSS documents.
#
# $uri  - the page the anchors came from; used to resolve relative links
#         and to decide whether an absolute link is "local".
# $args - optional hash reference; a true 'liberal' drops the requirement
#         that the r[dfs]+ suffix be preceded by a dot.
#
# Returns 0 when no RSS parser is available, 1 otherwise. Matches are
# appended to $self->{'__links'}.
sub _check_embedded {
    my ($self, $uri, $args) = @_;

    my $rss = $self->_rss()
        || return 0;

    # How anal...I mean, liberal do I need to be about this?
    my $pattern = $args->{'liberal'} ? "r([dfs]+)" : "\\.r([dfs]+)";
    my @links   = grep { $_ =~ /(?:$pattern)$/ } @{ $self->{'__embedded'} || [] };

    return 1 unless @links;

    # We just get this out of the way now in case $link is a relative URL.
    $uri .= "/" unless $uri =~ /\/$/;

    # '__check_embedded' is 2 when remote links may be followed as well.
    my $remote_ok = (($self->{'__check_embedded'} || 0) >= 2) ? 1 : 0;

    foreach my $link (@links) {

        if ($link =~ /^http/) {
            if (! $remote_ok) {
                # Local links only. $uri is data, not a pattern, so it is
                # quoted: an unescaped '.' or '?' in it used to be treated
                # as a regex metacharacter.
                next unless $link =~ /^\Q$uri\E/;
            }
            else {
                # Never follow loopback addresses.
                next if $link =~ m!127\.0\.0!;
            }
        }
        else {
            # Relative link: plain concatenation onto the page URI.
            $link = $uri . $link;
        }

        next if $self->_linked($link);

        my $data = $self->_fetch($link);

        if (! $data) {
            # Report the link that failed, not the page it was found on.
            carp "Failed to fetch '$link', skipping.\n";
            next;
        }

        # NOTE(review): the same parser object is reused for every link;
        # confirm that XML::RSS->parse() discards the previous document's
        # state, otherwise the version check below can see stale data.
        eval { $rss->parse($$data); 1 }
            or next;    # not parseable as RSS

        next unless defined($rss->{'_internal'}{'version'});

        # href must be the feed itself. It used to be $uri (the page),
        # which returned the wrong address to the caller and also stopped
        # _linked() from ever recognising an already-recorded feed.
        push @{ $self->{'__links'} }, {
            rel   => "alternate",
            href  => $link,
            title => $rss->{"channel"}{"description"},
            type  => MIMETYPE_RSS,
        };
    }

    return 1;
}
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# Ask the Syndic8 XML-RPC service for feeds associated with the host part
# of $uri and append every fetchable, syndicated feed that is not already
# known to $self->{'__links'}.
#
# Returns 0 when no XML-RPC client is available or $uri cannot be parsed,
# 1 otherwise (including when the service returns nothing).
sub _check_syndic8 {
    my ($self, $uri) = @_;

    my $rpc = $self->_xmlrpc({proxy => SYNDIC8_PROXY})
        || return 0;

    # Take the host from the match itself rather than from $1: after a
    # failed match $1 still holds whatever an earlier, unrelated match left
    # there. 'https://' is accepted too; it used to produce the host
    # 'https:'. As before, an optional leading 'www' is stripped.
    my ($host) = $uri =~ m!^(?:https?://)?(?:www)?([^/]+)(?:/.*)?$!;

    if (! $host) {
        carp "Failed to parse URI '$uri', skipping lookup.\n";
        return 0;
    }

    my $ids = $rpc->call(SYNDIC8_FINDSITES, $host)->result()
        || return 1;

    my $info = $rpc->call(SYNDIC8_FEEDINFO, $ids)->result()
        || return 1;

    foreach my $site (@$info) {
        next unless $site->{"fetchable"};
        next unless $site->{status} eq "Syndicated";

        next if $self->_linked($site->{"dataurl"});

        push @{ $self->{'__links'} }, {
            rel   => "alternate",
            href  => $site->{"dataurl"},
            title => $site->{"description"},
            type  => MIMETYPE_RSS,
        };
    }

    return 1;
}
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# Return the shared XML::RSS parser, loading the module on first use.
#
# $self->{'__rss'} holds one of:
#   - nothing yet            : XML::RSS has not been requested so far;
#   - an array ref [$error]  : loading failed earlier, do not retry;
#   - the parser object.
#
# Returns the parser object, or undef when XML::RSS cannot be loaded.
sub _rss {
    my $self = shift;

    # A previous attempt already failed.
    return undef if ref($self->{'__rss'}) eq "ARRAY";

    if (! $self->{'__rss'}) {

        if (! eval { require XML::RSS; 1 }) {

            # Keep the error before carp() gets a chance to disturb $@.
            my $error = $@;
            carp "Unable to load RSS parser.\n";

            # The failure belongs in '__rss'. It used to be written to
            # '__xmlrpc', which made _xmlrpc() refuse to build a Syndic8
            # client and left this method retrying on every call.
            $self->{'__rss'} = [$error];
            return undef;
        }

        $self->{'__rss'} = XML::RSS->new();
    }

    return $self->{'__rss'};
}
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
# Return the shared XMLRPC::Lite client, loading the module on demand.
#
# $args is a hash reference; its 'proxy' entry is the endpoint to talk to.
# A new client is built when none exists yet, or when a proxy is requested
# and the existing client's proxy() does not compare equal to it.
#
# $self->{'__xmlrpc'} holding an array reference marks a load failure:
# the method then returns undef without trying again.
sub _xmlrpc {
    my ($self, $args) = @_;

    return undef if ref($self->{'__xmlrpc'}) eq "ARRAY";

    # Decide whether the current client (if any) can be reused.
    my $rebuild = 1;
    if ($self->{'__xmlrpc'}) {
        # NOTE(review): this relies on proxy() without arguments giving
        # back something comparable to the URL string - confirm against
        # the XMLRPC::Lite documentation.
        $rebuild = (($args->{'proxy'})
                    && ($self->{'__xmlrpc'}->proxy() ne $args->{'proxy'}));
    }

    return $self->{'__xmlrpc'} unless $rebuild;

    eval "require XMLRPC::Lite";

    if ($@) {
        carp "Unable to load XMLRPC class. Syndic8 lookup disabled.\n";

        $self->{'__xmlrpc'} = [$@];
        return undef;
    }

    my $client = XMLRPC::Lite->new();
    $self->{'__xmlrpc'} = $client;
    $client->proxy($args->{'proxy'});
    # $client->on_debug(sub{print@_});

    return $self->{'__xmlrpc'};
}
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
# Report whether $uri is already the 'href' of an entry in
# $self->{'__links'}. Returns 1 when it is, 0 otherwise.
#
# The answer is computed from the current list every time. The former
# per-URI cache in $self->{'__linked'} also remembered negative answers
# and was never invalidated, so a URI checked before being recorded kept
# reporting 0 (letting duplicates through), and a positive answer
# outlived a reset of '__links'.
sub _linked {
    my ($self, $uri) = @_;

    return 0 unless defined $uri;

    foreach my $known (@{ $self->{'__links'} || [] }) {
        # <link> elements recorded by _start() may have no href at all.
        next unless defined $known->{href};
        return 1 if $known->{href} eq $uri;
    }

    return 0;
}
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
# Start-tag handler registered in init(); called with the parser object,
# the tag name and a hash reference of the tag's attributes.
#
#   <a>    - only while $self->{'__check_embedded'} is set (see locate()):
#            an href ending in 'rdf' or 'rss' (case-insensitive) is
#            remembered in $self->{'__embedded'} for _check_embedded().
#   <link> - the attribute hash is appended to $self->{'__links'} unless
#            one of the 'name' filters below rejects it.
#
# Every other tag is ignored.
# NOTE(review): the POD for locate() also mentions '.xml' links, but the
# anchor pattern here only accepts 'rdf' and 'rss' - confirm which is meant.
sub _start {
    my ($self, $tag, $attrs) = @_;

    # Anything to check?
    return unless $tag =~ /^(link|a)$/;

    # Check anchors.
    # See the note about __check_embedded in locate().
    if ($tag eq "a") {

        # An anchor is only ever a candidate for later probing. It used to
        # fall through to the <link> handling below whenever
        # '__check_embedded' was unset, so plain hyperlinks ended up in the
        # result list.
        return unless $self->{'__check_embedded'};

        my $href = $attrs->{'href'};

        if (defined($href) && $href =~ /(?:\.)?r(?:df|ss)$/i) {
            push @{ $self->{'__embedded'} }, $href;
        }

        return;
    }

    # Check links.
    # NOTE(review): both filters only apply to links carrying a 'name'
    # attribute; a <link> without one is accepted whatever its type.
    # Kept as found - confirm whether the type test should apply to all.
    if (defined($attrs->{'name'})) {

        return if $attrs->{'name'} =~ /^(XML|RSS)$/;

        my $type = defined($attrs->{'type'}) ? $attrs->{'type'} : "";

        return if ($type ne "application/rss+xml")
               && ($type ne "text/xml");
    }

    # Drop the pseudo-attribute left behind by a self-closing '<link ... />'.
    delete $attrs->{"/"};
    push @{ $self->{'__links'} }, $attrs;

    return;
}
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=head1 VERSION |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
1.21 |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=head1 DATE |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
$Date: 2004/10/17 04:13:06 $ |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=head1 AUTHOR |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
Aaron Straup Cope |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head1 SEE ALSO |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Because you shouldn't need all that white space to do cool stuff ;-) |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
http://diveintomark.org/archives/2002/05/30.html#rss_autodiscovery |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
http://diveintomark.org/archives/2002/08/15.html |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
http://diveintomark.org/projects/misc/rssfinder.py.txt |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head1 REQUIREMENTS |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=head2 BASIC |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
These packages are required to actually parse an HTML document or URI. |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=over 4 |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=item * |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
B<HTML::Parser> |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=item * |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
B<LWP::UserAgent> |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=item * |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
B<HTTP::Request> |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=back |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=head2 EMBEDDED |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
These packages are required to check the embedded links in a URI for RSS files. |
558
|
|
|
|
|
|
|
They are not loaded until run-time so they are not required for doing basic parsing |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=over 4 |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=item * |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
B<XML::RSS> |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=back |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=head2 SYNDIC8 |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
These packages are required to query the syndic8 servers for RSS files associated with a URI. |
571
|
|
|
|
|
|
|
They are not loaded until run-time so they are not required for doing basic parsing |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=over 4 |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=item * |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
B<XMLRPC::Lite> |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=back |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head1 LICENSE |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
Copyright (c) 2002-2004, Aaron Straup Cope. All Rights Reserved. |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
This is free software, you may use it and distribute it under the same terms as Perl itself. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=cut |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
return 1; |
590
|
|
|
|
|
|
|
|