line | stmt | bran | cond | sub | pod | time   | code
1    |      |      |      |     |     |        | package WWW::Crawl;
2    |      |      |      |     |     |        |
3    | 3    |      |      | 3   |     | 228922 | use strict;
     | 3    |      |      |     |     |     24 |
     | 3    |      |      |     |     |     88 |
4    | 3    |      |      | 3   |     |     15 | use warnings;
     | 3    |      |      |     |     |      5 |
     | 3    |      |      |     |     |     98 |
5    |      |      |      |     |     |        |
6    | 3    |      |      | 3   |     |   2401 | use HTTP::Tiny 0.083;
     | 3    |      |      |     |     | 155342 |
     | 3    |      |      |     |     |    106 |
7    | 3    |      |      | 3   |     |   1860 | use URI;
     | 3    |      |      |     |     |  20029 |
     | 3    |      |      |     |     |     92 |
8    | 3    |      |      | 3   |     |   1541 | use JSON::PP;
     | 3    |      |      |     |     |  28420 |
     | 3    |      |      |     |     |    241 |
9    | 3    |      |      | 3   |     |     24 | use Carp qw(croak);
     | 3    |      |      |     |     |      8 |
     | 3    |      |      |     |     |   3367 |
10   |      |      |      |     |     |        |
11   |      |      |      |     |     |        | our $VERSION = '0.1';
12   |      |      |      |     |     |        | $VERSION = eval $VERSION;
13   |      |      |      |     |     |        |
14   |      |      |      |     |     |        | # TODO:
15   |      |      |      |     |     |        | # 1 - Use HTML Parser instead of regexps
16   |      |      |      |     |     |        | #     - we don't do this as it doesn't parse JS scripts and files
17   |      |      |      |     |     |        | #
18   |      |      |      |     |     |        |
19   |      |      |      |     |     |        | sub new {
20   | 0    |      |      | 0   | 1   |        |     my $class = shift;
21   | 0    |      |      |     |     |        |     my %attrs = @_;
22   |      |      |      |     |     |        |
23   | 0    |      | 0    |     |     |        |     $attrs{'agent'} //= "Perl-WWW-Crawl-$VERSION";
24   |      |      |      |     |     |        |
25   |      |      |      |     |     |        |     $attrs{'http'} = HTTP::Tiny->new(
26   | 0    |      |      |     |     |        |         'agent' => $attrs{'agent'},
27   |      |      |      |     |     |        |     );
28   |      |      |      |     |     |        |
29   | 0    |      |      |     |     |        |     return bless \%attrs, $class;
30   |      |      |      |     |     |        | }
31   |      |      |      |     |     |        |
32   |      |      |      |     |     |        | sub crawl {
33   | 0    |      |      | 0   | 1   |        |     my ($self, $url, $callback) = @_;
34   |      |      |      |     |     |        |
35   | 0    | 0    |      |     |     |        |     $url = "https://$url" if $url =~ /^www/;
36   | 0    |      |      |     |     |        |     my $uri = URI->new($url);
37   | 0    | 0    |      |     |     |        |     croak "WWW::Crawl: No valid URI" unless $uri;
38   |      |      |      |     |     |        |
39   | 0    |      |      |     |     |        |     my (%links, %parsed);
40   | 0    |      |      |     |     |        |     $links{$url} = 1;
41   |      |      |      |     |     |        |
42   | 0    |      |      |     |     |        |     my $page;
43   | 0    |      |      |     |     |        |     my $flag = 1;
44   | 0    |      | 0    |     |     |        |     while (scalar keys %links and $flag) {
45   | 0    |      |      |     |     |        |         my $url = (keys(%links))[0];
46   | 0    |      |      |     |     |        |         delete $links{$url};
47   |      |      |      |     |     |        |
48   | 0    | 0    |      |     |     |        |         next if $parsed{$url};
49   | 0    |      |      |     |     |        |         $parsed{$url}++;
50   |      |      |      |     |     |        |
51   | 0    |      |      |     |     |        |         my $resp = $self->{'http'}->request('GET', $url);
52   | 0    | 0    |      |     |     |        |         next if $resp->{'status'} == 404;
53   | 0    | 0    |      |     |     |        |         if (!$resp->{'success'}) {
54   | 0    |      |      |     |     |        |             croak "WWW::Crawl: HTTP Response " . $resp->{'status'} . " - " . $resp->{'reason'} . "\n" . $resp->{'content'};
55   |      |      |      |     |     |        |         }
56   |      |      |      |     |     |        |
57   | 0    |      |      |     |     |        |         $page = $resp->{'content'};
58   |      |      |      |     |     |        |
59   | 0    |      |      |     |     |        |         while ($page =~ /href *?= *?("|')(.*?)('|")/gc) {
60   | 0    |      |      |     |     |        |             my $link = URI->new($2)->abs($uri)->canonical;
61   | 0    | 0    | 0    |     |     |        |             if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
62   | 0    |      |      |     |     |        |                 my $address = $link->as_string;
63   | 0    |      |      |     |     |        |                 while ($address =~ s/(\/|#)$//) {}
64   | 0    | 0    | 0    |     |     |        |                 $links{$address}++ unless $link->path =~ /\.(pdf|css|png|jpg|svg|webmanifest)/ or $address =~ /#/;
65   |      |      |      |     |     |        |             }
66   |      |      |      |     |     |        |         }
67   |      |      |      |     |     |        |         # Find forms
68   | 0    |      |      |     |     |        |         pos($page) = 0;
69   | 0    |      |      |     |     |        |         while ($page =~ /
70   | 0    |      |      |     |     |        |             my $link = URI->new($2)->abs($uri)->canonical;
71   | 0    | 0    | 0    |     |     |        |             if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
72   | 0    |      |      |     |     |        |                 my $address = $link->as_string;
73   | 0    |      |      |     |     |        |                 $links{$address}++;
74   |      |      |      |     |     |        |             }
75   |      |      |      |     |     |        |         }
76   |      |      |      |     |     |        |         # Find external JS files
77   | 0    |      |      |     |     |        |         pos($page) = 0;
78   | 0    |      |      |     |     |        |         while ($page =~ /
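
For reference, the interface covered above can be exercised as follows. This is a minimal sketch, not part of the report: the 'agent' attribute and the crawl($url, $callback) signature are taken from the code shown, but what crawl() passes to the callback is an assumption, since the report is truncated before the callback is used.

    use strict;
    use warnings;
    use WWW::Crawl;

    # 'agent' is optional; it defaults to "Perl-WWW-Crawl-$VERSION" (line 23).
    my $crawler = WWW::Crawl->new('agent' => 'MyCrawler/1.0');

    # Bare "www..." hosts are upgraded to "https://..." (line 35).
    # Assumption: the callback receives each crawled URL; the report is
    # truncated before the callback's arguments are visible.
    $crawler->crawl('www.example.com', sub {
        my ($url) = @_;
        print "$url\n";
    });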