line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Feed::Find; |
2
|
3
|
|
|
3
|
|
203878
|
use strict; |
|
3
|
|
|
|
|
49
|
|
|
3
|
|
|
|
|
95
|
|
3
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
91
|
|
4
|
3
|
|
|
3
|
|
58
|
use 5.008_001; |
|
3
|
|
|
|
|
9
|
|
5
|
|
|
|
|
|
|
|
6
|
3
|
|
|
3
|
|
15
|
use base qw( Class::ErrorHandler ); |
|
3
|
|
|
|
|
12
|
|
|
3
|
|
|
|
|
1659
|
|
7
|
3
|
|
|
3
|
|
2211
|
use LWP::UserAgent; |
|
3
|
|
|
|
|
94376
|
|
|
3
|
|
|
|
|
120
|
|
8
|
3
|
|
|
3
|
|
2074
|
use HTML::Parser; |
|
3
|
|
|
|
|
16978
|
|
|
3
|
|
|
|
|
195
|
|
9
|
3
|
|
|
3
|
|
34
|
use URI; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
96
|
|
10
|
|
|
|
|
|
|
|
11
|
3
|
|
|
3
|
|
17
|
use vars qw( $VERSION $ua ); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
224
|
|
12
|
|
|
|
|
|
|
$VERSION = '0.11'; |
13
|
|
|
|
|
|
|
|
14
|
3
|
|
|
|
|
2663
|
use constant FEED_MIME_TYPES => [ |
15
|
|
|
|
|
|
|
'application/x.atom+xml', |
16
|
|
|
|
|
|
|
'application/atom+xml', |
17
|
|
|
|
|
|
|
'application/xml', |
18
|
|
|
|
|
|
|
'text/xml', |
19
|
|
|
|
|
|
|
'application/rss+xml', |
20
|
|
|
|
|
|
|
'application/rdf+xml', |
21
|
3
|
|
|
3
|
|
17
|
]; |
|
3
|
|
|
|
|
5
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
our $FEED_EXT = qr/\.(?:rss|xml|rdf|atom)$/; |
24
|
|
|
|
|
|
|
our %IsFeed = map { $_ => 1 } @{ FEED_MIME_TYPES() }; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub find { |
27
|
4
|
|
|
4
|
1
|
6179
|
my $class = shift; |
28
|
4
|
|
|
|
|
21
|
my($uri) = @_; |
29
|
4
|
100
|
|
|
|
22
|
$ua = LWP::UserAgent->new unless defined $ua; |
30
|
4
|
|
|
|
|
2777
|
$ua->env_proxy; |
31
|
4
|
|
|
|
|
34198
|
$ua->agent(join '/', $class, $class->VERSION); |
32
|
4
|
|
|
|
|
356
|
$ua->parse_head(0); ## We're already basically doing this ourselves. |
33
|
4
|
|
|
|
|
313
|
my $req = HTTP::Request->new(GET => $uri); |
34
|
4
|
|
|
|
|
12927
|
my $p = HTML::Parser->new(api_version => 3, |
35
|
|
|
|
|
|
|
start_h => [ \&_find_links, 'self,tagname,attr' ]); |
36
|
4
|
|
|
|
|
218
|
$p->{base_uri} = $uri; |
37
|
4
|
|
|
|
|
11
|
$p->{feeds} = []; |
38
|
|
|
|
|
|
|
my $res = $ua->request($req, sub { |
39
|
7
|
|
|
7
|
|
23983
|
my($chunk, $res, $proto) = @_; |
40
|
7
|
50
|
|
|
|
44
|
if ($IsFeed{$res->content_type}) { |
41
|
0
|
|
|
|
|
0
|
push @{ $p->{feeds} }, $uri; |
|
0
|
|
|
|
|
0
|
|
42
|
0
|
|
|
|
|
0
|
die "Done parsing"; |
43
|
|
|
|
|
|
|
} |
44
|
7
|
100
|
|
|
|
363
|
$p->parse($chunk) or die "Done parsing"; |
45
|
4
|
|
|
|
|
49
|
}); |
46
|
4
|
50
|
|
|
|
1241
|
return $class->error($res->status_line) unless $res->is_success; |
47
|
4
|
|
|
|
|
46
|
@{ $p->{feeds} }; |
|
4
|
|
|
|
|
86
|
|
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub find_in_html { |
51
|
2
|
|
|
2
|
1
|
2471
|
my $class = shift; |
52
|
2
|
|
|
|
|
5
|
my($html, $base_uri) = @_; |
53
|
2
|
|
|
|
|
20
|
my $p = HTML::Parser->new(api_version => 3, |
54
|
|
|
|
|
|
|
start_h => [ \&_find_links, 'self,tagname,attr' ]); |
55
|
2
|
|
|
|
|
98
|
$p->{base_uri} = $base_uri; |
56
|
2
|
|
|
|
|
5
|
$p->{feeds} = []; |
57
|
2
|
|
|
|
|
22
|
$p->parse($$html); |
58
|
2
|
|
|
|
|
4
|
@{ $p->{feeds} }; |
|
2
|
|
|
|
|
12
|
|
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub _find_links { |
62
|
294
|
|
|
294
|
|
4615
|
my($p, $tag, $attr) = @_; |
63
|
294
|
|
|
|
|
335
|
my $base_uri = $p->{base_uri}; |
64
|
294
|
100
|
|
|
|
1189
|
if ($tag eq 'link') { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
65
|
114
|
50
|
|
|
|
171
|
return unless $attr->{rel}; |
66
|
114
|
|
|
|
|
232
|
my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel}); |
|
114
|
|
|
|
|
270
|
|
67
|
114
|
|
|
|
|
216
|
my $type = ''; |
68
|
114
|
100
|
|
|
|
173
|
if ($attr->{type}) { |
69
|
51
|
|
|
|
|
151
|
($type = lc $attr->{type}) =~ s/^\s*//; |
70
|
51
|
|
|
|
|
168
|
$type =~ s/\s*$//; |
71
|
|
|
|
|
|
|
} |
72
|
3
|
|
|
|
|
29
|
push @{ $p->{feeds} }, URI->new_abs($attr->{href}, $base_uri)->as_string |
73
|
|
|
|
|
|
|
if $IsFeed{$type} && |
74
|
114
|
50
|
33
|
|
|
702
|
($rel{alternate} || $rel{'service.feed'}); |
|
|
|
66
|
|
|
|
|
75
|
|
|
|
|
|
|
} elsif ($tag eq 'base') { |
76
|
0
|
0
|
|
|
|
0
|
$p->{base_uri} = $attr->{href} if $attr->{href}; |
77
|
|
|
|
|
|
|
} elsif ($tag =~ /^(?:meta|isindex|title|script|style|head|html)$/) { |
78
|
|
|
|
|
|
|
## Ignore other valid tags inside of . |
79
|
|
|
|
|
|
|
} elsif ($tag eq 'a') { |
80
|
12
|
50
|
|
|
|
25
|
my $href = $attr->{href} or return; |
81
|
12
|
|
|
|
|
34
|
my $uri = URI->new($href); |
82
|
12
|
100
|
|
|
|
2571
|
push @{ $p->{feeds} }, URI->new_abs($href, $base_uri)->as_string |
|
3
|
|
|
|
|
43
|
|
83
|
|
|
|
|
|
|
if $uri->path =~ /$FEED_EXT/io; |
84
|
|
|
|
|
|
|
} else { |
85
|
|
|
|
|
|
|
## Anything else indicates the start of the , |
86
|
|
|
|
|
|
|
## so we stop parsing. |
87
|
45
|
100
|
|
|
|
48
|
$p->eof if @{ $p->{feeds} }; |
|
45
|
|
|
|
|
274
|
|
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
1; |
92
|
|
|
|
|
|
|
__END__ |