| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Crawl;
|
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
454829
|
use strict;
|
|
|
3
|
|
|
|
|
7
|
|
|
|
3
|
|
|
|
|
122
|
|
|
4
|
3
|
|
|
3
|
|
18
|
use warnings;
|
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
167
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
3
|
|
|
3
|
|
2264
|
use HTTP::Tiny;
|
|
|
3
|
|
|
|
|
218649
|
|
|
|
3
|
|
|
|
|
162
|
|
|
7
|
3
|
|
|
3
|
|
2147
|
use URI;
|
|
|
3
|
|
|
|
|
26465
|
|
|
|
3
|
|
|
|
|
165
|
|
|
8
|
3
|
|
|
3
|
|
4742
|
use JSON::PP;
|
|
|
3
|
|
|
|
|
41595
|
|
|
|
3
|
|
|
|
|
371
|
|
|
9
|
3
|
|
|
3
|
|
59
|
use Carp qw(croak);
|
|
|
3
|
|
|
|
|
8
|
|
|
|
3
|
|
|
|
|
232
|
|
|
10
|
3
|
|
|
3
|
|
2377
|
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
|
|
|
3
|
|
|
|
|
229021
|
|
|
|
3
|
|
|
|
|
6076
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.5'; |
|
13
|
|
|
|
|
|
|
# $VERSION = eval $VERSION;
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# TODO:
|
|
16
|
|
|
|
|
|
|
# 1 - Use HTML Parser instead of regexps
|
|
17
|
|
|
|
|
|
|
# - we don't do this as it doesn't parse JS scripts and files
|
|
18
|
|
|
|
|
|
|
#
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
sub new {
    # Constructor: WWW::Crawl->new(%attrs).
    # Accepts an optional 'agent' attribute (User-Agent string); any other
    # key/value pairs supplied by the caller are stored on the object as-is.
    my ($class, %attrs) = @_;

    # Default User-Agent identifies this crawler and its version.
    $attrs{'agent'} //= "Perl-WWW-Crawl-$VERSION";

    # A single HTTP::Tiny client, reused for every request this object makes.
    $attrs{'http'} = HTTP::Tiny->new('agent' => $attrs{'agent'});

    my $self = bless \%attrs, $class;
    return $self;
}
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub crawl {
|
|
34
|
0
|
|
|
0
|
1
|
|
my ($self, $url, $callback) = @_;
|
|
35
|
|
|
|
|
|
|
|
|
36
|
0
|
0
|
|
|
|
|
$url = "https://$url" if $url =~ /^www/;
|
|
37
|
0
|
|
|
|
|
|
my $uri = URI->new($url);
|
|
38
|
0
|
0
|
|
|
|
|
croak "WWW::Crawl: No valid URI" unless $uri;
|
|
39
|
|
|
|
|
|
|
|
|
40
|
0
|
|
|
|
|
|
my (%links, %parsed);
|
|
41
|
0
|
|
|
|
|
|
$links{$url} = 1;
|
|
42
|
|
|
|
|
|
|
|
|
43
|
0
|
|
|
|
|
|
my $page;
|
|
44
|
0
|
|
|
|
|
|
my $flag = 1;
|
|
45
|
0
|
|
0
|
|
|
|
while (scalar keys %links and $flag) {
|
|
46
|
0
|
|
|
|
|
|
my $url = (keys(%links))[0];
|
|
47
|
0
|
|
|
|
|
|
delete $links{$url};
|
|
48
|
|
|
|
|
|
|
|
|
49
|
0
|
0
|
|
|
|
|
next if $parsed{$url};
|
|
50
|
0
|
|
|
|
|
|
$parsed{$url}++;
|
|
51
|
|
|
|
|
|
|
|
|
52
|
0
|
|
|
|
|
|
my $resp = $self->_fetch_page($url);
|
|
53
|
0
|
0
|
|
|
|
|
next if $resp->{'status'} == 404;
|
|
54
|
0
|
0
|
|
|
|
|
if (!$resp->{'success'}) {
|
|
55
|
0
|
|
|
|
|
|
warn "WWW::Crawl: HTTP Response " . $resp->{'status'} . " - " . $resp->{'reason'} . "\n";
|
|
56
|
0
|
|
|
|
|
|
next;
|
|
57
|
|
|
|
|
|
|
}
|
|
58
|
|
|
|
|
|
|
|
|
59
|
0
|
|
|
|
|
|
$page = $resp->{'content'};
|
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
#print "\nContent: $page\n\n";
|
|
62
|
|
|
|
|
|
|
|
|
63
|
0
|
|
|
|
|
|
while ($page =~ /href *?= *?("|')(.*?)('|")/gc) {
|
|
64
|
0
|
|
|
|
|
|
my $link = URI->new($2)->abs($uri)->canonical;
|
|
65
|
0
|
0
|
0
|
|
|
|
if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
|
|
66
|
0
|
|
|
|
|
|
my $address = $link->as_string;
|
|
67
|
0
|
|
|
|
|
|
while ($address =~ s/(\/|#)$//) {}
|
|
68
|
0
|
0
|
0
|
|
|
|
$links{$address}++ unless $link->path =~ /\.(pdf|css|png|jpg|svg|webmanifest)/ or $address =~ /#/;
|
|
69
|
|
|
|
|
|
|
}
|
|
70
|
|
|
|
|
|
|
}
|
|
71
|
|
|
|
|
|
|
# Find forms
|
|
72
|
0
|
|
|
|
|
|
pos($page) = 0;
|
|
73
|
0
|
|
|
|
|
|
while ($page =~ / |
|
74
|
0
|
|
|
|
|
|
my $link = URI->new($2)->abs($uri)->canonical;
|
|
75
|
0
|
0
|
0
|
|
|
|
if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
|
|
76
|
0
|
|
|
|
|
|
my $address = $link->as_string;
|
|
77
|
0
|
|
|
|
|
|
$links{$address}++ ;
|
|
78
|
|
|
|
|
|
|
}
|
|
79
|
|
|
|
|
|
|
}
|
|
80
|
|
|
|
|
|
|
# Find external JS files
|
|
81
|
0
|
|
|
|
|
|
pos($page) = 0;
|
|
82
|
0
|
|
|
|
|
|
while ($page =~ / |