| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
2
|
|
|
2
|
|
3675
|
use strict; |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
121
|
|
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
package HTML::RSSAutodiscovery; |
|
4
|
2
|
|
|
2
|
|
14
|
use base qw (HTML::Parser); |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
2682
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# $Id: RSSAutodiscovery.pm,v 1.5 2004/10/17 04:13:06 asc Exp $ |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
HTML::RSSAutodiscovery - methods for retreiving RSS-ish information from an HTML document. |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use HTML::RSSAutodiscovery; |
|
15
|
|
|
|
|
|
|
use Data::Dumper; |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $url = "http://www.diveintomark.org/"; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my $html = HTML::RSSAutodiscovery->new(); |
|
20
|
|
|
|
|
|
|
print &Dumper($html->parse($url)); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Mark's gone a bit nuts with this and |
|
23
|
|
|
|
|
|
|
# the list is too long to include here... |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# see the POD for the 'parse' method for |
|
26
|
|
|
|
|
|
|
# details of what it returns. |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Methods for retreiving RSS-ish information from an HTML document. |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=cut |
|
33
|
|
|
|
|
|
|
|
|
34
|
2
|
|
|
2
|
|
20619
|
use LWP::UserAgent; |
|
|
2
|
|
|
|
|
165752
|
|
|
|
2
|
|
|
|
|
74
|
|
|
35
|
2
|
|
|
2
|
|
21
|
use HTTP::Request; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
50
|
|
|
36
|
2
|
|
|
2
|
|
10
|
use Carp; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
192
|
|
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
$HTML::RSSAutodiscovery::VERSION = '1.21'; |
|
39
|
|
|
|
|
|
|
|
|
40
|
2
|
|
|
2
|
|
11
|
use constant SYNDIC8_PROXY => "http://www.syndic8.com/xmlrpc.php"; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
133
|
|
|
41
|
2
|
|
|
2
|
|
11
|
use constant SYNDIC8_CLASS => "syndic8"; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
115
|
|
|
42
|
2
|
|
|
2
|
|
11
|
use constant SYNDIC8_FINDSITES => join(".",SYNDIC8_CLASS,"FindSites"); |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
120
|
|
|
43
|
2
|
|
|
2
|
|
10
|
use constant SYNDIC8_FEEDINFO => join(".",SYNDIC8_CLASS,"GetFeedInfo"); |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
85
|
|
|
44
|
|
|
|
|
|
|
|
|
45
|
2
|
|
|
2
|
|
11
|
use constant MIMETYPE_RSS => "application/rss+xml"; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
4233
|
|
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 PACKAGE METHODS |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head2 __PACKAGE__->new() |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Object constructor. Returns an object. Woot! |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=cut |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub new { |
|
56
|
2
|
|
|
2
|
1
|
1647
|
my $pkg = shift; |
|
57
|
|
|
|
|
|
|
|
|
58
|
2
|
|
|
|
|
5
|
my $self = {}; |
|
59
|
2
|
|
|
|
|
6
|
bless $self,$pkg; |
|
60
|
|
|
|
|
|
|
|
|
61
|
2
|
50
|
|
|
|
7
|
if (! $self->init(@_)) { |
|
62
|
0
|
|
|
|
|
0
|
return undef; |
|
63
|
|
|
|
|
|
|
} |
|
64
|
|
|
|
|
|
|
|
|
65
|
2
|
|
|
|
|
6
|
return $self; |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub init { |
|
69
|
2
|
|
|
2
|
0
|
4
|
my $self = shift; |
|
70
|
2
|
|
|
|
|
20
|
$self->SUPER::init(start_h=> [\&_start,"self,tagname,attr"]); |
|
71
|
2
|
|
|
|
|
120
|
return 1; |
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head1 OBJECT METHODS |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head2 $obj->parse($arg) |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Parse an HTML document and return RSS-ish <link> information. |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
I<$arg> may be either: |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=over 4 |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item * |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
An HTML string, passed as a scalar reference. |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item * |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
A URI. |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=back |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Returns an array reference of hash references whose keys are : |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=over 4 |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=item * |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
I |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item * |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
I |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=item * |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
I |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=item * |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
I |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=back |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=cut |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub parse { |
|
121
|
2
|
|
|
2
|
1
|
504
|
my $self = shift; |
|
122
|
2
|
|
|
|
|
5
|
my $uri = shift; |
|
123
|
|
|
|
|
|
|
|
|
124
|
2
|
|
|
|
|
4
|
my $data = $uri; |
|
125
|
|
|
|
|
|
|
|
|
126
|
2
|
50
|
|
|
|
9
|
if (ref($data) ne "SCALAR") { |
|
127
|
2
|
|
50
|
|
|
7
|
$data = $self->_fetch($uri) || return undef; |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
2
|
|
100
|
|
|
85
|
$self->{'__embedded'} ||= []; |
|
131
|
2
|
|
100
|
|
|
13
|
$self->{'__links'} ||= []; |
|
132
|
|
|
|
|
|
|
|
|
133
|
2
|
|
|
|
|
38
|
$self->SUPER::parse($$data); |
|
134
|
2
|
|
|
|
|
9
|
return $self->{'__links'}; |
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head2 $obj->locate($uri,\%args) |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Like the I method, but will perform additional lookups, if necessary or specified. |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Valid arguments are |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=over 4 |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item * |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
B |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
String. A live, breathing URI to slurp and parse. |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
I |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=item * |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Hash ref whose keys may be |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=over 4 |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item * |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
B |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Boolean. Don't bother parsing the document, this will also prevent you |
|
164
|
|
|
|
|
|
|
from checking for embedded links. |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
I don't know why you want to do this, but you can. |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
False, by default. |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=item * |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
B |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Boolean. Check all embedded links ending in '.xml', '.rss' or '.rdf' |
|
175
|
|
|
|
|
|
|
(and then 'xml', 'rss' or 'rdf') for RSS-ness. |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
False, by default, unless the initial parsing of the URI returns no |
|
178
|
|
|
|
|
|
|
RSS links. |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=item * |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
B |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Boolean. |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Boolean. Check all embedded links whose root is not the same as I<$uri> |
|
187
|
|
|
|
|
|
|
for RSS-ness. |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
False, by default. |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=item * |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
B |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Boolean. Check the syndic8 servers for sites matching I<$uri> |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
False, by default, unless the initial parsing of the URI and any embedded links |
|
198
|
|
|
|
|
|
|
returns no RSS links. |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=back |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=back |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Returns an array reference of hash references whose keys are : |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=over 4 |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=item * |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
I |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item * |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
I |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item * |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
I |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item * |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
I |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=back |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=cut |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub locate { |
|
229
|
1
|
|
|
1
|
1
|
478
|
my $self = shift; |
|
230
|
1
|
|
|
|
|
3
|
my $uri = shift; |
|
231
|
1
|
|
|
|
|
2
|
my $args = shift; |
|
232
|
|
|
|
|
|
|
|
|
233
|
1
|
|
|
|
|
7
|
$self->{'__embedded'} = []; |
|
234
|
1
|
|
|
|
|
3
|
$self->{'__links'} = []; |
|
235
|
|
|
|
|
|
|
|
|
236
|
1
|
|
|
|
|
2
|
my $parse = 1; |
|
237
|
1
|
|
|
|
|
2
|
my $embedded = 0; |
|
238
|
1
|
|
|
|
|
2
|
my $syndic8 = 0; |
|
239
|
|
|
|
|
|
|
|
|
240
|
1
|
50
|
|
|
|
4
|
if (ref($args) eq "HASH") { |
|
241
|
0
|
0
|
0
|
|
|
0
|
$parse = ((defined($args->{noparse})) && ($args->{noparse})) ? 0 : 1; |
|
242
|
0
|
0
|
0
|
|
|
0
|
$embedded = ((defined($args->{embedded})) && ($args->{embedded})) ? 1 : 0; |
|
243
|
0
|
0
|
0
|
|
|
0
|
$syndic8 = ((defined($args->{syndic8})) && ($args->{syndic8})) ? 1 : 0; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
|
|
|
|
|
|
|
|
246
|
1
|
50
|
|
|
|
7
|
if ($parse) { |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# This is a hack. Do as I say, not as I do |
|
249
|
1
|
50
|
|
|
|
4
|
if ($embedded) { |
|
250
|
0
|
0
|
|
|
|
0
|
$self->{'__check_embedded'} = ($args->{embedded_and_remote}) ? 2 : 1; |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
|
|
253
|
1
|
|
|
|
|
3
|
$self->parse($uri); |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
|
|
256
|
1
|
50
|
33
|
|
|
10
|
if (($parse) && (($embedded) || (scalar(@{$self->{'__links'}}) < 1))) { |
|
|
|
|
33
|
|
|
|
|
|
257
|
1
|
|
|
|
|
6
|
$self->_check_embedded($uri); |
|
258
|
|
|
|
|
|
|
|
|
259
|
1
|
50
|
|
|
|
1
|
if (scalar(@{$self->{'__links'}}) < 1) { |
|
|
1
|
|
|
|
|
9
|
|
|
260
|
1
|
|
|
|
|
14
|
$self->_check_embedded($uri,{liberal=>1}); |
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
} |
|
263
|
|
|
|
|
|
|
|
|
264
|
1
|
50
|
33
|
|
|
6
|
if (($syndic8) || (scalar(@{$self->{'__links'}}) < 1)) { |
|
|
1
|
|
|
|
|
6
|
|
|
265
|
1
|
|
|
|
|
4
|
$self->_check_syndic8($uri); |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
|
|
268
|
1
|
|
|
|
|
5
|
return $self->{'__links'}; |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub _fetch { |
|
272
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
|
273
|
2
|
|
|
|
|
20
|
my $uri = shift; |
|
274
|
|
|
|
|
|
|
|
|
275
|
2
|
|
33
|
|
|
24
|
$self->{'__ua'} ||= LWP::UserAgent->new(); |
|
276
|
|
|
|
|
|
|
|
|
277
|
2
|
|
|
|
|
7635
|
my $res = $self->{'__ua'}->request(HTTP::Request->new(GET=>$uri)); |
|
278
|
|
|
|
|
|
|
|
|
279
|
2
|
50
|
|
|
|
909097
|
if (! $res->is_success()) { |
|
280
|
0
|
|
|
|
|
0
|
return undef; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
2
|
|
|
|
|
46
|
return \$res->content(); |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
sub _check_embedded { |
|
287
|
2
|
|
|
2
|
|
5
|
my $self = shift; |
|
288
|
2
|
|
|
|
|
3
|
my $uri = shift; |
|
289
|
2
|
|
|
|
|
2
|
my $args = shift; |
|
290
|
|
|
|
|
|
|
|
|
291
|
2
|
|
50
|
|
|
8
|
my $rss = $self->_rss() |
|
292
|
|
|
|
|
|
|
|| return 0; |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# How anal...I mean, liberal do I need to be about this? |
|
295
|
|
|
|
|
|
|
|
|
296
|
0
|
0
|
|
|
|
0
|
my $pattern = $args->{'liberal'} ? "r([dfs]+)" : "\\.r([dfs]+)"; |
|
297
|
0
|
|
|
|
|
0
|
my @links = grep { $_ =~ /(?:$pattern)$/ } @{$self->{'__embedded'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
298
|
|
|
|
|
|
|
|
|
299
|
0
|
0
|
|
|
|
0
|
if (! @links) { |
|
300
|
0
|
|
|
|
|
0
|
return 1; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# We just get this out of the way |
|
304
|
|
|
|
|
|
|
# now in case $link is a relative |
|
305
|
|
|
|
|
|
|
# URL |
|
306
|
|
|
|
|
|
|
|
|
307
|
0
|
0
|
|
|
|
0
|
unless ($uri =~ /\/$/) { |
|
308
|
0
|
|
|
|
|
0
|
$uri .= "/"; |
|
309
|
|
|
|
|
|
|
} |
|
310
|
|
|
|
|
|
|
|
|
311
|
0
|
|
|
|
|
0
|
foreach my $link (@links) { |
|
312
|
|
|
|
|
|
|
|
|
313
|
0
|
0
|
0
|
|
|
0
|
if (($link =~ /^http/) && ($self->{'__check_embedded'} < 2)) { |
|
|
|
0
|
|
|
|
|
|
|
314
|
0
|
0
|
|
|
|
0
|
next unless $link =~ /^$uri/; |
|
315
|
|
|
|
|
|
|
} |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
elsif ($link =~ /^http/) { |
|
318
|
0
|
0
|
|
|
|
0
|
next if $link =~ m!127.0.0! |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
else { |
|
322
|
0
|
|
|
|
|
0
|
$link = $uri.$link; |
|
323
|
|
|
|
|
|
|
} |
|
324
|
|
|
|
|
|
|
|
|
325
|
0
|
0
|
|
|
|
0
|
next if ($self->_linked($link)); |
|
326
|
|
|
|
|
|
|
|
|
327
|
0
|
|
|
|
|
0
|
my $data = $self->_fetch($link); |
|
328
|
|
|
|
|
|
|
|
|
329
|
0
|
0
|
|
|
|
0
|
if (! $data) { |
|
330
|
0
|
|
|
|
|
0
|
carp "Failed to fetch '$uri', skipping.\n"; |
|
331
|
0
|
|
|
|
|
0
|
next; |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
|
|
334
|
0
|
|
|
|
|
0
|
eval { $rss->parse($$data); }; |
|
|
0
|
|
|
|
|
0
|
|
|
335
|
|
|
|
|
|
|
|
|
336
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
337
|
|
|
|
|
|
|
# carp "Not RSS, $@\n"; |
|
338
|
0
|
|
|
|
|
0
|
next; |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
|
|
341
|
0
|
0
|
|
|
|
0
|
next unless (defined($rss->{'_internal'}{'version'})); |
|
342
|
|
|
|
|
|
|
|
|
343
|
0
|
|
|
|
|
0
|
push @{$self->{'__links'}} ,{ |
|
|
0
|
|
|
|
|
0
|
|
|
344
|
|
|
|
|
|
|
rel => "alternate", |
|
345
|
|
|
|
|
|
|
href => $uri, |
|
346
|
|
|
|
|
|
|
title => $rss->{"channel"}{"description"}, |
|
347
|
|
|
|
|
|
|
type => MIMETYPE_RSS, |
|
348
|
|
|
|
|
|
|
}; |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
} |
|
351
|
|
|
|
|
|
|
|
|
352
|
0
|
|
|
|
|
0
|
return 1; |
|
353
|
|
|
|
|
|
|
} |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
sub _check_syndic8 { |
|
356
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
357
|
1
|
|
|
|
|
2
|
my $uri = shift; |
|
358
|
|
|
|
|
|
|
|
|
359
|
1
|
|
50
|
|
|
6
|
my $rpc = $self->_xmlrpc({proxy=>SYNDIC8_PROXY}) |
|
360
|
|
|
|
|
|
|
|| return 0; |
|
361
|
|
|
|
|
|
|
|
|
362
|
0
|
|
|
|
|
0
|
$uri =~ m!^(?:http://)?(?:www)?([^/]+)(?:/.*)?$!; |
|
363
|
|
|
|
|
|
|
|
|
364
|
0
|
0
|
|
|
|
0
|
if (! $1) { |
|
365
|
0
|
|
|
|
|
0
|
carp "Failed to parse URI '$uri', skipping lookup.\n"; |
|
366
|
0
|
|
|
|
|
0
|
return 0; |
|
367
|
|
|
|
|
|
|
} |
|
368
|
|
|
|
|
|
|
|
|
369
|
0
|
|
0
|
|
|
0
|
my $ids = $rpc->call(SYNDIC8_FINDSITES,$1)->result() |
|
370
|
|
|
|
|
|
|
|| return 1; |
|
371
|
|
|
|
|
|
|
|
|
372
|
0
|
|
0
|
|
|
0
|
my $info = $rpc->call(SYNDIC8_FEEDINFO,$ids)->result() |
|
373
|
|
|
|
|
|
|
|| return 1; |
|
374
|
|
|
|
|
|
|
|
|
375
|
0
|
|
|
|
|
0
|
foreach my $site (@$info) { |
|
376
|
0
|
0
|
|
|
|
0
|
next unless ($site->{"fetchable"}); |
|
377
|
0
|
0
|
|
|
|
0
|
next unless ($site->{status} eq "Syndicated"); |
|
378
|
|
|
|
|
|
|
|
|
379
|
0
|
0
|
|
|
|
0
|
next if ($self->_linked($site->{"dataurl"})); |
|
380
|
|
|
|
|
|
|
|
|
381
|
0
|
|
|
|
|
0
|
push @{$self->{'__links'}} ,{ |
|
|
0
|
|
|
|
|
0
|
|
|
382
|
|
|
|
|
|
|
rel => "alternate", |
|
383
|
|
|
|
|
|
|
href => $site->{"dataurl"}, |
|
384
|
|
|
|
|
|
|
title => $site->{"description"}, |
|
385
|
|
|
|
|
|
|
type => MIMETYPE_RSS, |
|
386
|
|
|
|
|
|
|
}; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
0
|
|
|
|
|
0
|
return 1; |
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
sub _rss { |
|
393
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
|
394
|
|
|
|
|
|
|
|
|
395
|
2
|
50
|
|
|
|
8
|
if (ref($self->{'__rss'}) eq "ARRAY") { |
|
396
|
0
|
|
|
|
|
0
|
return undef; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# |
|
400
|
|
|
|
|
|
|
|
|
401
|
2
|
50
|
|
|
|
6
|
if (! $self->{'__rss'}) { |
|
402
|
|
|
|
|
|
|
|
|
403
|
2
|
|
|
|
|
146
|
eval "require XML::RSS"; |
|
404
|
|
|
|
|
|
|
|
|
405
|
2
|
50
|
|
|
|
11
|
if ($@) { |
|
406
|
2
|
|
|
|
|
366
|
carp "Unable to load RSS parser.\n"; |
|
407
|
|
|
|
|
|
|
|
|
408
|
2
|
|
|
|
|
304
|
$self->{'__xmlrpc'} = [$@]; |
|
409
|
2
|
|
|
|
|
12
|
return undef; |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
|
|
412
|
0
|
|
|
|
|
0
|
$self->{'__rss'} = XML::RSS->new(); |
|
413
|
|
|
|
|
|
|
} |
|
414
|
|
|
|
|
|
|
|
|
415
|
0
|
|
|
|
|
0
|
return $self->{'__rss'}; |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
sub _xmlrpc { |
|
419
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
420
|
1
|
|
|
|
|
2
|
my $args = shift; |
|
421
|
|
|
|
|
|
|
|
|
422
|
1
|
50
|
|
|
|
5
|
if (ref($self->{'__xmlrpc'}) eq "ARRAY") { |
|
423
|
1
|
|
|
|
|
3
|
return undef; |
|
424
|
|
|
|
|
|
|
} |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
# |
|
427
|
|
|
|
|
|
|
|
|
428
|
0
|
0
|
0
|
|
|
0
|
if ((! $self->{'__xmlrpc'}) || |
|
|
|
|
0
|
|
|
|
|
|
429
|
|
|
|
|
|
|
(($args->{'proxy'}) && ($self->{'__xmlrpc'}->proxy() ne $args->{'proxy'}))) { |
|
430
|
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
0
|
eval "require XMLRPC::Lite"; |
|
432
|
|
|
|
|
|
|
|
|
433
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
434
|
0
|
|
|
|
|
0
|
carp "Unable to load XMLRPC class. Syndic8 lookup disabled.\n"; |
|
435
|
|
|
|
|
|
|
|
|
436
|
0
|
|
|
|
|
0
|
$self->{'__xmlrpc'} = [$@]; |
|
437
|
0
|
|
|
|
|
0
|
return undef; |
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
|
|
440
|
0
|
|
|
|
|
0
|
$self->{'__xmlrpc'} = XMLRPC::Lite->new(); |
|
441
|
0
|
|
|
|
|
0
|
$self->{'__xmlrpc'}->proxy($args->{'proxy'}); |
|
442
|
|
|
|
|
|
|
# $self->{'__xmlrpc'}->on_debug(sub{print@_}); |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
|
|
445
|
0
|
|
|
|
|
0
|
return $self->{'__xmlrpc'}; |
|
446
|
|
|
|
|
|
|
} |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub _linked { |
|
449
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
|
450
|
0
|
|
|
|
|
0
|
my $uri = shift; |
|
451
|
|
|
|
|
|
|
|
|
452
|
0
|
0
|
|
|
|
0
|
if (defined($self->{'__linked'}{$uri})) { |
|
453
|
0
|
|
|
|
|
0
|
return $self->{'__linked'}{$uri}; |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
|
|
456
|
0
|
|
|
|
|
0
|
foreach (@{$self->{'__links'}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
457
|
0
|
0
|
|
|
|
0
|
if ($_->{href} eq $uri) { |
|
458
|
0
|
|
|
|
|
0
|
$self->{'__linked'}{$uri} = 1; |
|
459
|
0
|
|
|
|
|
0
|
return 1; |
|
460
|
|
|
|
|
|
|
} |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
|
|
463
|
0
|
|
|
|
|
0
|
$self->{'__linked'}{$uri} = 0; |
|
464
|
0
|
|
|
|
|
0
|
return 0; |
|
465
|
|
|
|
|
|
|
} |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
sub _start { |
|
468
|
4
|
|
|
4
|
|
8
|
my $self = shift; |
|
469
|
4
|
|
|
|
|
7
|
my $tag = shift; |
|
470
|
4
|
|
|
|
|
7
|
my $attrs = shift; |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
# Anything to check? |
|
473
|
|
|
|
|
|
|
# We may not actually need to check anchors |
|
474
|
|
|
|
|
|
|
# but in the interests of keeping things |
|
475
|
|
|
|
|
|
|
# simple (read-ability) we defer that check |
|
476
|
|
|
|
|
|
|
# for later... |
|
477
|
|
|
|
|
|
|
|
|
478
|
4
|
50
|
|
|
|
23
|
unless ($tag =~ /^(link|a)$/) { |
|
479
|
4
|
|
|
|
|
65
|
return; |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# Check anchors |
|
483
|
|
|
|
|
|
|
# See note re: __check_emebedded in &locate() |
|
484
|
|
|
|
|
|
|
|
|
485
|
0
|
0
|
0
|
|
|
|
if (($self->{'__check_embedded'}) && ($tag eq "a")) { |
|
486
|
0
|
0
|
|
|
|
|
if ($attrs->{'href'} =~ /(?:\.)?r(?:df|ss)$/i) { |
|
487
|
0
|
|
|
|
|
|
push @{$self->{'__embedded'}} , $attrs->{'href'}; |
|
|
0
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
|
|
490
|
0
|
|
|
|
|
|
return; |
|
491
|
|
|
|
|
|
|
} |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Check links |
|
494
|
|
|
|
|
|
|
|
|
495
|
0
|
0
|
0
|
|
|
|
if ((defined($attrs->{'name'})) && |
|
496
|
|
|
|
|
|
|
($attrs->{'name'} =~ /^(XML|RSS)$/)) { |
|
497
|
0
|
|
|
|
|
|
return; |
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
|
|
500
|
0
|
0
|
0
|
|
|
|
if ((defined($attrs->{'name'})) && |
|
|
|
|
0
|
|
|
|
|
|
501
|
|
|
|
|
|
|
($attrs->{'type'} ne "application/rss+xml") && |
|
502
|
|
|
|
|
|
|
($attrs->{'type'} ne "text/xml")) { |
|
503
|
|
|
|
|
|
|
|
|
504
|
0
|
|
|
|
|
|
return; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
|
|
|
|
|
|
|
|
507
|
0
|
|
|
|
|
|
delete $attrs->{"/"}; |
|
508
|
0
|
|
|
|
|
|
push @{$self->{'__links'}},$attrs; |
|
|
0
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
} |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=head1 VERSION |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
1.21 |
|
514
|
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=head1 DATE |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
$Date: 2004/10/17 04:13:06 $ |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=head1 AUTHOR |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
Aaron Straup Cope |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Because you shouldn't need all that white space to do cool stuff ;-) |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
http://diveintomark.org/archives/2002/05/30.html#rss_autodiscovery |
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
http://diveintomark.org/archives/2002/08/15.html |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
http://diveintomark.org/projects/misc/rssfinder.py.txt |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head1 REQUIREMENTS |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=head2 BASIC |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
These packages are required to actually parse an HTML document or URI. |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=over 4 |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=item * |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
B |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=item * |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
B |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=item * |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
B |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=back |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=head2 EMBEDDED |
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
These packages are required to check the embedded links in a URI for RSS files. |
|
558
|
|
|
|
|
|
|
They are not loaded until run-time so they are not required for doing basic parsing |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=over 4 |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=item * |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
B |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=back |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=head2 SYNDIC8 |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
These packages are required to query the syndic8 servers for RSS files associated with a URI. |
|
571
|
|
|
|
|
|
|
They are not loaded until run-time so they are not required for doing basic parsing |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=over 4 |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=item * |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
B |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=back |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head1 LICENSE |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
Copyright (c) 2002-2004, Aaron Straup Cope. All Rights Reserved. |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
This is free software, you may use it and distribute it under the same terms as Perl itself. |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=cut |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
return 1; |
|
590
|
|
|
|
|
|
|
|