line | stmt | bran | cond | sub | pod | time   | code
1    |      |      |      |     |     |        | package WWW::Crawl;
2    |      |      |      |     |     |        |
3    | 3    |      |      | 3   |     | 228922 | use strict;
     | 3    |      |      |     |     |     24 |
     | 3    |      |      |     |     |     88 |
4    | 3    |      |      | 3   |     |     15 | use warnings;
     | 3    |      |      |     |     |      5 |
     | 3    |      |      |     |     |     98 |
5    |      |      |      |     |     |        |
6    | 3    |      |      | 3   |     |   2401 | use HTTP::Tiny 0.083;
     | 3    |      |      |     |     | 155342 |
     | 3    |      |      |     |     |    106 |
7    | 3    |      |      | 3   |     |   1860 | use URI;
     | 3    |      |      |     |     |  20029 |
     | 3    |      |      |     |     |     92 |
8    | 3    |      |      | 3   |     |   1541 | use JSON::PP;
     | 3    |      |      |     |     |  28420 |
     | 3    |      |      |     |     |    241 |
9    | 3    |      |      | 3   |     |     24 | use Carp qw(croak);
     | 3    |      |      |     |     |      8 |
     | 3    |      |      |     |     |   3367 |
10   |      |      |      |     |     |        |
11   |      |      |      |     |     |        | our $VERSION = '0.1';
12   |      |      |      |     |     |        | $VERSION = eval $VERSION;
13   |      |      |      |     |     |        |
14   |      |      |      |     |     |        | # TODO:
15   |      |      |      |     |     |        | # 1 - Use HTML Parser instead of regexps
16   |      |      |      |     |     |        | #     - we don't do this as it doesn't parse JS scripts and files
17   |      |      |      |     |     |        | #
18   |      |      |      |     |     |        |
19   |      |      |      |     |     |        | sub new {
20   | 0    |      |      | 0   | 1   |        |     my $class = shift;
21   | 0    |      |      |     |     |        |     my %attrs = @_;
22   |      |      |      |     |     |        |
23   | 0    |      | 0    |     |     |        |     $attrs{'agent'} //= "Perl-WWW-Crawl-$VERSION";
24   |      |      |      |     |     |        |
25   |      |      |      |     |     |        |     $attrs{'http'} = HTTP::Tiny->new(
26   | 0    |      |      |     |     |        |         'agent' => $attrs{'agent'},
27   |      |      |      |     |     |        |     );
28   |      |      |      |     |     |        |
29   | 0    |      |      |     |     |        |     return bless \%attrs, $class;
30   |      |      |      |     |     |        | }
31   |      |      |      |     |     |        |
32   |      |      |      |     |     |        | sub crawl {
33   | 0    |      |      | 0   | 1   |        |     my ($self, $url, $callback) = @_;
34   |      |      |      |     |     |        |
35   | 0    | 0    |      |     |     |        |     $url = "https://$url" if $url =~ /^www/;
36   | 0    |      |      |     |     |        |     my $uri = URI->new($url);
37   | 0    | 0    |      |     |     |        |     croak "WWW::Crawl: No valid URI" unless $uri;
38   |      |      |      |     |     |        |
39   | 0    |      |      |     |     |        |     my (%links, %parsed);
40   | 0    |      |      |     |     |        |     $links{$url} = 1;
41   |      |      |      |     |     |        |
42   | 0    |      |      |     |     |        |     my $page;
43   | 0    |      |      |     |     |        |     my $flag = 1;
44   | 0    |      | 0    |     |     |        |     while (scalar keys %links and $flag) {
45   | 0    |      |      |     |     |        |         my $url = (keys(%links))[0];
46   | 0    |      |      |     |     |        |         delete $links{$url};
47   |      |      |      |     |     |        |
48   | 0    | 0    |      |     |     |        |         next if $parsed{$url};
49   | 0    |      |      |     |     |        |         $parsed{$url}++;
50   |      |      |      |     |     |        |
51   | 0    |      |      |     |     |        |         my $resp = $self->{'http'}->request('GET', $url);
52   | 0    | 0    |      |     |     |        |         next if $resp->{'status'} == 404;
53   | 0    | 0    |      |     |     |        |         if (!$resp->{'success'}) {
54   | 0    |      |      |     |     |        |             croak "WWW::Crawl: HTTP Response " . $resp->{'status'} . " - " . $resp->{'reason'} . "\n" . $resp->{'content'};
55   |      |      |      |     |     |        |         }
56   |      |      |      |     |     |        |
57   | 0    |      |      |     |     |        |         $page = $resp->{'content'};
58   |      |      |      |     |     |        |
59   | 0    |      |      |     |     |        |         while ($page =~ /href *?= *?("|')(.*?)('|")/gc) {
60   | 0    |      |      |     |     |        |             my $link = URI->new($2)->abs($uri)->canonical;
61   | 0    | 0    | 0    |     |     |        |             if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
62   | 0    |      |      |     |     |        |                 my $address = $link->as_string;
63   | 0    |      |      |     |     |        |                 while ($address =~ s/(\/|#)$//) {}
64   | 0    | 0    | 0    |     |     |        |                 $links{$address}++ unless $link->path =~ /\.(pdf|css|png|jpg|svg|webmanifest)/ or $address =~ /#/;
65   |      |      |      |     |     |        |             }
66   |      |      |      |     |     |        |         }
67   |      |      |      |     |     |        |         # Find forms
68   | 0    |      |      |     |     |        |         pos($page) = 0;
69   | 0    |      |      |     |     |        |         while ($page =~ /
70   | 0    |      |      |     |     |        |             my $link = URI->new($2)->abs($uri)->canonical;
71   | 0    | 0    | 0    |     |     |        |             if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
72   | 0    |      |      |     |     |        |                 my $address = $link->as_string;
73   | 0    |      |      |     |     |        |                 $links{$address}++;
74   |      |      |      |     |     |        |             }
75   |      |      |      |     |     |        |         }
76   |      |      |      |     |     |        |         # Find external JS files
77   | 0    |      |      |     |     |        |         pos($page) = 0;
78   | 0    |      |      |     |     |        |         while ($page =~ /
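
For reference, the interface covered above can be exercised as follows. This is a minimal sketch, not part of the report: the 'agent' attribute and the crawl($url, $callback) signature are taken from the code shown, but what crawl() passes to the callback is an assumption, since the report is truncated before the callback is used.

    use strict;
    use warnings;
    use WWW::Crawl;

    # 'agent' is optional; it defaults to "Perl-WWW-Crawl-$VERSION" (line 23).
    my $crawler = WWW::Crawl->new('agent' => 'MyCrawler/1.0');

    # Bare "www..." hosts are upgraded to "https://..." (line 35).
    # Assumption: the callback receives each crawled URL; the report is
    # truncated before the callback's arguments are visible.
    $crawler->crawl('www.example.com', sub {
        my ($url) = @_;
        print "$url\n";
    });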