File Coverage

blib/lib/WWW/Crawl.pm

Criterion	Covered	Total	%
statement	21	86	24.4
branch	0	34	0.0
condition	0	27	0.0
subroutine	7	10	70.0
pod	2	2	100.0
total	30	159	18.8

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::Crawl;
2
3	3			3		454829	use strict;
	3					7
	3					122
4	3			3		18	use warnings;
	3					5
	3					167
5
6	3			3		2264	use HTTP::Tiny;
	3					218649
	3					162
7	3			3		2147	use URI;
	3					26465
	3					165
8	3			3		4742	use JSON::PP;
	3					41595
	3					371
9	3			3		59	use Carp qw(croak);
	3					8
	3					232
10	3			3		2377	use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
	3					229021
	3					6076
11
12							our $VERSION = '0.5';
13							# $VERSION = eval $VERSION;
14
15							# TODO:
16							# 1 - Use HTML Parser instead of regexps
17							# - we don't do this as it doesn't parse JS scripts and files
18							#
19
20							sub new {
21	0			0	1		my $class = shift;
22	0						my %attrs = @_;
23
24	0		0				$attrs{'agent'} //= "Perl-WWW-Crawl-$VERSION";
25
26							$attrs{'http'} = HTTP::Tiny->new(
27	0						'agent' => $attrs{'agent'},
28							);
29
30	0						return bless \%attrs, $class;
31							}
32
33							sub crawl {
34	0			0	1		my ($self, $url, $callback) = @_;
35
36	0	0					$url = "https://$url" if $url =~ /^www/;
37	0						my $uri = URI->new($url);
38	0	0					croak "WWW::Crawl: No valid URI" unless $uri;
39
40	0						my (%links, %parsed);
41	0						$links{$url} = 1;
42
43	0						my $page;
44	0						my $flag = 1;
45	0		0				while (scalar keys %links and $flag) {
46	0						my $url = (keys(%links))[0];
47	0						delete $links{$url};
48
49	0	0					next if $parsed{$url};
50	0						$parsed{$url}++;
51
52	0						my $resp = $self->_fetch_page($url);
53	0	0					next if $resp->{'status'} == 404;
54	0	0					if (!$resp->{'success'}) {
55	0						warn "WWW::Crawl: HTTP Response " . $resp->{'status'} . " - " . $resp->{'reason'} . "\n";
56	0						next;
57							}
58
59	0						$page = $resp->{'content'};
60
61							#print "\nContent: $page\n\n";
62
63	0						while ($page =~ /href ?= ?("\|')(.*?)('\|")/gc) {
64	0						my $link = URI->new($2)->abs($uri)->canonical;
65	0	0	0				if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
66	0						my $address = $link->as_string;
67	0						while ($address =~ s/(\/\|#)$//) {}
68	0	0	0				$links{$address}++ unless $link->path =~ /\.(pdf\|css\|png\|jpg\|svg\|webmanifest)/ or $address =~ /#/;
69							}
70							}
71							# Find forms
72	0						pos($page) = 0;
73	0						while ($page =~ /
74	0						my $link = URI->new($2)->abs($uri)->canonical;
75	0	0	0				if ($link->scheme =~ /^http/ and $link->authority eq $uri->authority) {
76	0						my $address = $link->as_string;
77	0						$links{$address}++ ;
78							}
79							}
80							# Find external JS files
81	0						pos($page) = 0;
82	0						while ($page =~ /