line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Feed::Find; |
2
|
3
|
|
|
3
|
|
232679
|
use strict; |
|
3
|
|
|
|
|
28
|
|
|
3
|
|
|
|
|
93
|
|
3
|
3
|
|
|
3
|
|
14
|
use warnings; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
76
|
|
4
|
3
|
|
|
3
|
|
72
|
use 5.008_001; |
|
3
|
|
|
|
|
11
|
|
5
|
|
|
|
|
|
|
|
6
|
3
|
|
|
3
|
|
19
|
use base qw( Class::ErrorHandler ); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
1543
|
|
7
|
3
|
|
|
3
|
|
2247
|
use LWP::UserAgent; |
|
3
|
|
|
|
|
96226
|
|
|
3
|
|
|
|
|
108
|
|
8
|
3
|
|
|
3
|
|
2512
|
use HTML::Parser; |
|
3
|
|
|
|
|
17902
|
|
|
3
|
|
|
|
|
132
|
|
9
|
3
|
|
|
3
|
|
26
|
use URI; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
73
|
|
10
|
3
|
|
|
3
|
|
13
|
use Carp; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
203
|
|
11
|
|
|
|
|
|
|
|
12
|
3
|
|
|
3
|
|
19
|
use vars qw( $VERSION $ua ); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
193
|
|
13
|
|
|
|
|
|
|
$VERSION = '0.12'; |
14
|
|
|
|
|
|
|
|
15
|
3
|
|
|
|
|
2662
|
## MIME types that mark a response, or a <link> element, as a feed.
use constant FEED_MIME_TYPES => [ qw(
    application/x.atom+xml
    application/atom+xml
    application/xml
    text/xml
    application/rss+xml
    application/rdf+xml
) ];

## File extensions that make an <a href> look like a feed.
my $FEED_EXT = qr/\.(?:rss|xml|rdf|atom)$/;

## Lookup table: true for every entry of FEED_MIME_TYPES.
my %IsFeed;
@IsFeed{ @{ FEED_MIME_TYPES() } } = (1) x @{ FEED_MIME_TYPES() };
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
## Class method: fetch $uri and return the list of feed URIs found there.
##
## If the response itself has a feed MIME type, $uri is the only result.
## Otherwise the body is streamed through HTML::Parser and _find_links
## collects candidate links. On an unsuccessful HTTP response the status
## line is passed to $class->error and that call's result is returned.
sub find {
    my($class, $uri) = @_;

    ## The user agent lives in the package variable $ua and is only
    ## created here if nothing has set it already.
    defined $ua or $ua = LWP::UserAgent->new;
    $ua->env_proxy;
    $ua->agent(join '/', $class, $class->VERSION);
    $ua->parse_head(0);    ## We're already basically doing this ourselves.

    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [ \&_find_links, 'self,tagname,attr' ],
    );
    ## Scratch state read and written by _find_links.
    @{$parser}{ qw( base_uri feeds ) } = ($uri, []);

    ## Content callback, run by LWP for each chunk of the body. Croaking
    ## is how the download is cut short once we have what we need.
    my $on_chunk = sub {
        my($chunk, $response) = @_;
        if ($IsFeed{ $response->content_type }) {
            push @{ $parser->{feeds} }, $uri;
            croak 'Done parsing';
        }
        $parser->parse($chunk) or croak 'Done parsing';
    };

    my $res = $ua->request(HTTP::Request->new(GET => $uri), $on_chunk);
    $res->is_success
        or return $class->error($res->status_line);
    return @{ $parser->{feeds} };
}
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
## Class method: scan an HTML document that is already in memory and
## return the list of feed URIs that _find_links collects from it.
##
##   $html      the document, as a reference to a scalar or as a plain
##              string; undef is treated as an empty document
##   $base_uri  base against which relative hrefs are resolved
sub find_in_html {
    my $class = shift;
    my($html, $base_uri) = @_;

    ## Dereferencing a plain string would die under "strict refs", so
    ## only references are dereferenced; anything else is used as-is.
    my $content = ref $html ? $$html : $html;
    $content = '' unless defined $content;

    my $p = HTML::Parser->new(api_version => 3,
                              start_h => [ \&_find_links, 'self,tagname,attr' ]);
    $p->{base_uri} = $base_uri;
    $p->{feeds} = [];
    $p->parse($content);
    return @{ $p->{feeds} };
}
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
## Tags that are valid inside <head> but carry no feed information.
## Built once here rather than on every start-tag callback.
my %HEAD_TAG = map { $_ => 1 }
    qw[ meta isindex title script style head html ];

## HTML::Parser start-tag handler (argspec 'self,tagname,attr').
##
##   $p     the parser; $p->{base_uri} and $p->{feeds} hold our state
##   $tag   name of the tag that was just opened
##   $attr  hash reference of that tag's attributes
##
## Feed URIs are appended to $p->{feeds}. Always returns nothing.
sub _find_links {
    my($p, $tag, $attr) = @_;

    my $base_uri = $p->{base_uri};
    if ($tag eq 'link') {
        return unless $attr->{rel};
        ## A <link> without an href names no feed. Resolving a missing
        ## href against the base would report the page itself as a feed.
        return unless defined $attr->{href} && length $attr->{href};
        my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel});
        my $type = '';
        if ($attr->{type}) {
            ## Lower-case the type and trim surrounding whitespace.
            ($type = lc $attr->{type}) =~ s/^\s*//;
            $type =~ s/\s*$//;
        }
        push @{ $p->{feeds} }, URI->new_abs($attr->{href}, $base_uri)->as_string
            if $IsFeed{$type} &&
               ($rel{alternate} || $rel{'service.feed'});
    } elsif ($tag eq 'base') {
        ## Later relative links resolve against the document's own base.
        $p->{base_uri} = $attr->{href} if $attr->{href};
    } elsif ($HEAD_TAG{$tag}) {
        ## Ignore other valid tags inside of <head>.
    } elsif ($tag eq 'a') {
        my $href = $attr->{href} or return;
        my $uri = URI->new($href);
        ## An anchor counts as a feed when its path ends in a feed
        ## extension.
        push @{ $p->{feeds} }, URI->new_abs($href, $base_uri)->as_string
            if $uri->path =~ /$FEED_EXT/io;
    } else {
        ## Anything else indicates the start of the <body>,
        ## so we stop parsing, but only once at least one feed has
        ## been found.
        $p->eof if @{ $p->{feeds} };
    }

    return;
}
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
1; |
99
|
|
|
|
|
|
|
__END__ |