use strict;
use warnings;
use feature qw( state );

our $VERSION = '0.000101';

use Moo;
use MooX::HandlesVia;
use MooX::StrictConstructor;
use Type::Params qw( compile );
use Types::Standard qw( CodeRef HashRef InstanceOf );
use Types::URI qw( Uri );
use URI ();
use WWW::Mechanize ();

has is_url_allowed => (
    is          => 'ro',
    isa         => CodeRef,
    handles_via => 'Code',
    handles     => { _should_follow_link => 'execute' },
    required    => 1,
);

has report_for_url => (
    is          => 'ro',
    isa         => CodeRef,
    handles_via => 'Code',
    handles     => { _log_response => 'execute' },
    default     => sub {
        sub {
            my $response      = shift;    # HTTP::Response object
            my $referring_url = shift;    # URI object
            return {
                $response->redirects
                ? (
                    redirects => [
                        map {
                            {
                                status => $_->code,
                                uri    => $_->request->uri,
                            }
                        } $response->redirects
                    ],
                    target_url => $response->request->uri,
                    )
                : (),
                referrer => $referring_url,
                status   => $response->code,
            };
        };
    },
);

has ua => (
    is      => 'ro',
    isa     => InstanceOf ['WWW::Mechanize'],
    default => sub {
        WWW::Mechanize->new( autocheck => 0 );
    },
);

has _history => (
    is          => 'ro',
    isa         => HashRef,
    handles_via => 'Hash',
    handles     => {
        _add_url_to_history => 'set',
        _has_processed_url  => 'exists',
    },
    init_arg => undef,
    default  => sub { +{} },
);

# Fetch one URL, log a report for it and then recurse into any links on the
# page which the is_url_allowed callback approves of.
sub _get {
    my $self          = shift;
    my $url           = shift;
    my $referring_url = shift;

    my $response = $self->ua->get($url);
    my $report   = $self->_log_response( $response, $referring_url );
    $self->_add_url_to_history( $url, $report );

    my @links = $self->ua->find_all_links;

    foreach my $link (@links) {

        # this just points back at the same url
        next if substr( $link->url, 0, 1 ) eq '#';

        my $uri = URI->new( $link->url_abs );
        $uri->fragment(undef);    # fragments result in duplicate urls

        next if $self->_has_processed_url($uri);
        next unless $uri->can('host');    # no mailto: links
        next unless $self->_should_follow_link( $link, $url );

        $self->_get( $uri, $url );
    }
}

sub crawl {
    my $self = shift;

    state $check = compile(Uri);
    my ($url) = $check->(@_);

    $self->_get($url);
}

sub get_report {
    my $self = shift;
    return %{ $self->_history };
}

1;

=pod

=encoding UTF-8

=head1 NAME

WWW::RoboCop - Police your URLs!

=head1 VERSION

version 0.000101

=head1 SYNOPSIS

    use feature qw( state );

    use WWW::RoboCop ();

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub {
            state $count = 0;
            return $count++ < 5;    # just crawl 5 URLs
        },
    );

    $robocop->crawl( 'https://example.com' );

    my %history = $robocop->get_report;

    # %history = (
    #     'http://myhost.com/one' => { status => 200, ... },
    #     'http://myhost.com/two' => { status => 404, ... },
    #     ...
    # )

See C<examples/crawl-host.pl>, which is included with this distribution, to
get a quick start.

=head1 DESCRIPTION

C<WWW::RoboCop> is a simple, somewhat opinionated robot. Given a starting
page, this module will crawl only URLs which have been allowed by the
C<is_url_allowed> callback. It then creates a report of all visited pages,
keyed on URL. You are encouraged to provide your own report creation callback
so that you can collect all of the information which you require for each URL.

=head1 CONSTRUCTOR AND STARTUP

=head2 new()

Creates and returns a new C<WWW::RoboCop> object.

Below are the arguments which you may pass to C<new> when creating an object.

=head3 is_url_allowed

This argument is required. You must provide an anonymous subroutine which will
return true or false based on some arbitrary criteria which you provide. The
two arguments to this anonymous subroutine will be a L<WWW::Mechanize::Link>
object as well as the referring URL, in the form of a L<URI> object.

Your sub might look something like this:

    use feature qw( state );

    use URI ();
    use WWW::RoboCop ();

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub {
            my $link          = shift;
            my $referring_url = shift;

            my $upper_limit = 100;
            my $host        = 'some.host.com';

            state $limit = 0;

            return 0 if $limit > $upper_limit;
            my $uri = URI->new( $link->url_abs );

            # if the referring_url matches the host then this is a 1st degree
            # outbound web link

            if ( $uri->host eq $host || $referring_url->host eq $host ) {
                ++$limit;
                return 1;
            }
            return 0;
        }
    );

=head3 report_for_url

This argument is not required, but is highly recommended. The arguments to this
anonymous subroutine will be an L<HTTP::Response> object as well as the
referring URL in the form of a L<URI> object. Your sub might look something
like this:

    my $reporter = sub {
        my $response      = shift;    # HTTP::Response object
        my $referring_url = shift;    # URI object
        return {
            redirects => [
                map { +{ status => $_->code, uri => $_->base->as_string } }
                    $response->redirects
            ],
            referrer => $referring_url,
            status   => $response->code,
        };
    };

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        report_for_url => $reporter,
    );

That would give you a HashRef with the status code for each link visited (200,
404, 500, etc) as well as the referring URL (the page on which the link was
found) and a list of any redirects which were followed in order to get to this
URL.

The default C<report_for_url> sub already provides something like the above,
but you should treat it only as a stub while you get up and running. Since it
is only meant to be an example, the format of the default report could change
at some future date without notice. You should not rely on it or expect it to
remain consistent in future. If you are going to rely on this module, you
should provide your own reporting logic.
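
If you only need a few fields, a leaner callback keeps the report small. The
following is just a sketch: C<content_type> is a standard L<HTTP::Response>
accessor, and the hash keys here are arbitrary examples rather than anything
this module requires.

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        report_for_url => sub {
            my $response      = shift;    # HTTP::Response object
            my $referring_url = shift;    # URI object (undef for the starting URL)

            return {
                status       => $response->code,
                content_type => $response->content_type,
                referrer     => $referring_url,
            };
        },
    );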

=head3 ua( WWW::Mechanize )

You can provide your own UserAgent object to this class. It should be of the
L<WWW::Mechanize> family. If you're looking for a significant speed boost
while under development, consider providing a L<WWW::Mechanize::Cached> object.
This can give you enough of a speedup to save you from getting distracted
and going off to read Hacker News while you wait.

    use CHI ();
    use WWW::Mechanize::Cached ();
    use WWW::RoboCop ();

    my $cache = CHI->new(
        driver   => 'File',
        root_dir => '/tmp/cache-example',
    );

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        ua             => WWW::Mechanize::Cached->new( cache => $cache ),
    );

If you're not using a Cached agent, be sure to disable autocheck.

    my $robocop = WWW::RoboCop->new(
        is_url_allowed => sub { ... },
        ua             => WWW::Mechanize->new( autocheck => 0 ),
    );

=head2 crawl( $url )

This method sets the C<WWW::RoboCop> in motion. The robot will only come to a
halt once it has exhausted all of the allowed URLs it can find.

=head2 get_report

This method returns a Hash of crawling results, keyed on the URLs visited.
By default, it returns a very simple Hash, containing only the status code
of the visited URL. You are encouraged to provide your own callback so that
you can get a detailed report returned to you. You can do this by providing a
C<report_for_url> callback when instantiating the object.

The default report looks something like this:

    # %history = (
    #     'http://myhost.com/one' => { status => 200, ... },
    #     'http://myhost.com/two' => { status => 404, ... },
    # )

See C<examples/crawl-host.pl>, which is included with this distribution, to get
a dump of the default report.
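
Because the report is an ordinary Hash, it is easy to post-process after the
crawl. The sketch below (which assumes each report entry contains a C<status>
key, as the default report shown above does) prints every URL which came back
with an error status:

    my %history = $robocop->get_report;

    for my $url ( sort keys %history ) {
        my $status = $history{$url}{status};

        # flag anything which did not return a success (4xx or 5xx)
        print "$status $url\n" if $status >= 400;
    }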

=head1 AUTHOR

Olaf Alders <olaf@wundercounter.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2015 by MaxMind, Inc.

This is free software, licensed under:

  The Artistic License 2.0 (GPL Compatible)

=cut

# ABSTRACT: Police your URLs!