File Coverage

blib/lib/CrawlerCommons/RobotRulesParser.pm

Criterion	Covered	Total	%
statement	187	192	97.4
branch	77	82	93.9
condition	31	49	63.2
subroutine	28	31	90.3
pod	0	1	0.0
total	323	355	90.9

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							CrawlerCommons::RobotRulesParser - parser for robots.txt files
4
5							=head1 SYNOPSIS
6
7							use CrawlerCommons::RobotRulesParser;
8
9							my $rules_parser = CrawlerCommons::RobotRulesParser->new;
10
11							my $content = "User-agent: \r\nDisallow: images";
12							my $content_type = "text/plain";
13							my $robot_names = "any-old-robot";
14							my $url = "http://domain.com/";
15
16							my $robot_rules =
17							$rules_parser->parse_content($url, $content, $content_type, $robot_names);
18
19							say "We're allowed to crawl the index :)"
20							if $robot_rules->is_allowed( "https://www.domain.com/index.html");
21
22							say "Not allowed to crawl: $_" unless $robot_rules->is_allowed( $_ )
23							for ("http://www.domain.com/images/some_file.png",
24							"http://www.domain.com/images/another_file.png");
25
26							=head1 DESCRIPTION
27
28							This module is a fairly close reproduction of the Crawler-Commons
29							L<SimpleRobotRulesParser\|http://crawler-commons.github.io/crawler-commons/0.7/crawlercommons/robots/SimpleRobotRulesParser.html>
30
31							From BaseRobotsParser javadoc:
32
33							Parse the robots.txt file in <i>content</i>, and return rules appropriate
34							for processing paths by <i>userAgent</i>. Note that multiple agent names
35							may be provided as comma-separated values; the order of these shouldn't
36							matter, as the file is parsed in order, and each agent name found in the
37							file will be compared to every agent name found in robotNames.
38							Also note that names are lower-cased before comparison, and that any
39							robot name you pass shouldn't contain commas or spaces; if the name has
40							spaces, it will be split into multiple names, each of which will be
41							compared against agent names in the robots.txt file. An agent name is
42							considered a match if it's a prefix match on the provided robot name. For
43							example, if you pass in "Mozilla Crawlerbot-super 1.0", this would match
44							"crawlerbot" as the agent name, because of splitting on spaces,
45							lower-casing, and the prefix match rule.
46
47							The method failedFetch is not implemented.
48
49							=cut
50
51							###############################################################################
52							package CrawlerCommons::RobotRulesParser;
53
54
55							# MODULE IMPORTS
56							########################################
57							# Pragmas
58							#------------------#
59	2			2		185873	use 5.10.1;
	2					7
60	2			2		9	use strict;
	2					3
	2					45
61	2			2		325	use utf8;
	2					17
	2					12
62	2			2		42	use warnings;
	2					3
	2					45
63
64							# CPAN/Core
65							#------------------#
66	2			2		234	use Const::Fast;
	2					1657
	2					12
67	2			2		756	use Encode qw(decode encode);
	2					14573
	2					117
68	2			2		267	use Try::Tiny;
	2					1379
	2					86
69	2			2		481	use URI::Escape;
	2					2391
	2					123
70
71							# Moose Setup
72							#------------------#
73	2			2		666	use Moose;
	2					791673
	2					14
74	2			2		14198	use namespace::autoclean;
	2					12225
	2					7
75
76							# Moose Pragmas
77							#------------------#
78							with 'MooseX::Log::Log4perl';
79
80							# Custom Modules
81							#------------------#
82	2			2		750	use CrawlerCommons::RobotDirective;
	2					11
	2					153
83	2			2		1088	use CrawlerCommons::ParseState;
	2					11
	2					96
84	2			2		21	use CrawlerCommons::RobotRules;
	2					7
	2					56
85	2			2		939	use CrawlerCommons::RobotToken;
	2					10
	2					563
86
87							# VARIABLES/CONSTANTS
88							########################################
89							# Constants
90							#------------------#
91							const my $DEBUG => $ENV{DEBUG} // 0;
92							const my $TEST => $ENV{TEST} // 0;
93
94							const my $BLANK_DIRECTIVE_PATTERN=> qr![ \t]+(.*)!o;
95							const my $COLON_DIRECTIVE_PATTERN=> qr![ \t]:[ \t](.*)!o;
96
97							const my $MAX_CRAWL_DELAY => 300000;
98							const my $MAX_WARNINGS => 5;
99							const my $SIMPLE_HTML_PATTERN => qr!<(?:html\|head\|body)\s*>!is;
100							const my $USER_AGENT_PATTERN => qr!user-agent:!i;
101
102							# Variables
103							#------------------#
104
105							# setup
106							BEGIN {
107	2			2		19	require Log::Log4perl;
108	2	50				3879	Log::Log4perl->easy_init($Log::Log4perl::ERROR)
109							unless $Log::Log4perl::Logger::INITIALIZED;
110							}
111
112							=head1 VERSION
113
114							Version 0.03
115
116							=cut
117
118							our $VERSION = '0.03';
119
120
121
122							# MOOSE ATTRIBUTES
123							########################################
124							# Class
125							#-----------------------------------------------------------------------------#
126							#-----------------------------------------------------------------------------#
127
128							# Instance
129							#-----------------------------------------------------------------------------#
130							has 'num_warnings' => (
131							default => 0,
132							handles => {
133							increment_warnings => 'inc',
134							},
135							is => 'ro',
136							isa => 'Int',
137							traits => ['Counter']
138							);
139							#-----------------------------------------------------------------------------#
140
141
142							=head1 METHODS
143
144							=cut
145
146							# METHODS
147							########################################
148							# Construction
149							#------------------#
150							#-----------------------------------------------------------------------------#
151							#-----------------------------------------------------------------------------#
152
153							# Class Methods
154							#------------------#
155							#-----------------------------------------------------------------------------#
156							#-----------------------------------------------------------------------------#
157
158							# Instance Methods
159							#------------------#
160							#-----------------------------------------------------------------------------#
161							=head2 C<< my $robot_rules = $rules_parser->parse_content($url, $content, $content_type, $robot_name) >>
162
163							Parsers robots.txt data in C<$content> for the User-agent(s) specified in
164							C<$robot_name> returning a C<CrawlerCommons::RobotRules> object corresponding
165							to the rules defined for C<$robot_name>.
166
167							=over
168
169							=item * C<$url>
170
171							URL string that's parsed in a URI object to provide scheme, authority, and path
172							for sitemap directive values. If the directive's value begins with a '/', it
173							overrides the path value provided by this URL context string.
174
175							=item * C<$content>
176
177							The text content of the robots.txt file to be parsed.
178
179							=item * C<$content_type>
180
181							The content-type of the robots.txt content to be parsed. Assumes text/plain by
182							default. If type is text/html, the parser will attempt to strip-out html tags
183							from the content.
184
185							=item * C<$robot_name>
186
187							A string signifying for which user-agent(s) the rules should be extracted.
188
189							=back
190
191							=cut
192							sub parse_content {
193	73			73	0	1196	my ($self, $url, $content, $content_type, $robot_name) = @_;
194
195	73	100	50			516	return CrawlerCommons::RobotRules->new(
196							_mode => $CrawlerCommons::RobotRules::ALLOW_ALL)
197							if ( ($content // '') eq '' );
198
199	71					447	my $content_len = length( $content );
200	71					191	my $offset = 0;
201
202							# handle UTF-8, UTF-16LE, UTF-16BE content
203	71	100	66			1653	if ( ($content_len >= 3) && (substr($content, 0, 1) eq "\xEF") &&
		100	66
		100	66
			66
			66
			66
			66
204							(substr($content, 1, 1) eq "\xBB") &&
205							(substr($content, 2, 1) eq "\xBF") ) {
206	1					2	$offset = 3;
207	1					3	$content_len -= 3;
208	1					3	$content = substr( $content, 3);
209	1					7	$content = decode('UTF-8', $content);
210							}
211							elsif ( ($content_len >= 2) && (substr($content, 0, 1) eq "\xFF") &&
212							(substr($content, 1, 1) eq "\xFE") ) {
213	1					2	$offset = 2;
214	1					3	$content_len -= 2;
215	1					4	$content = substr( $content, 2);
216	1					8	$content = decode('UTF-16LE', $content);
217							}
218							elsif ( ($content_len >= 2) && (substr($content, 0, 1) eq "\xFE") &&
219							(substr($content, 1, 1) eq "\xFF") ) {
220	1					3	$offset = 2;
221	1					3	$content_len -= 2;
222	1					5	$content = substr( $content, 2);
223	1					7	$content = decode('UTF-16BE', $content);
224							}
225
226							# set flags that trigger the stripping of '<' and '>' from content
227	71	50	33			4115	my $is_html_type = ($content_type // '') ne '' &&
228							lc( $content_type // '') =~ m!^text/html! ? 1 : 0;
229
230	71					207	my $has_html = 0;
231	71	100	50			1090	if ( $is_html_type \|\| ($content // '') =~ $SIMPLE_HTML_PATTERN ) {
			66
232	3	100	50			27	if ( ($content // '') !~ $USER_AGENT_PATTERN ) {
233	1					10	$self->log->warn( "Found non-robots.txt HTML file: $url");
234
235	1					126	return CrawlerCommons::RobotRules->new(
236							_mode => $CrawlerCommons::RobotRules::ALLOW_ALL);
237							}
238
239							else {
240	2	50				8	if ( $is_html_type ) {
241	0					0	$self->log->info(
242							"HTML content type returned for robots.txt file: $url");
243							}
244							else {
245	2					11	$self->log->warn("Found HTML in robots.txt file: $url");
246							}
247
248	2					186	$has_html = 1;
249							}
250							}
251
252	70					2941	my $parse_state =
253							CrawlerCommons::ParseState->new(
254							url => $url, target_name => lc($robot_name) );
255
256							# DEBUG
257	70					483	$self->log->trace(Data::Dumper->Dump([$parse_state],['parse_state1']));
258
259	70					19495	for my $line ( split( m!(?:\n\|\r\|\r\n\|\x0085\|\x2028\|\x2029)!, $content) ) {
260	1835					6038	$self->log->trace("Input Line: [$line]\n");
261
262							# strip html tags
263	1835	100				40097	$line =~ s!<[^>]+>!!g if $has_html;
264
265							# trim comments
266	1835	100				5515	if (my $hash_idx = index( $line, '#') ) {
267	1799	100				3955	$line = substr($line, 0, $hash_idx ) if $hash_idx >= 0;
268							}
269
270							# trim whitespace
271	1835					10071	$line =~ s!^\s+\|\s+$!!;
272	1835	100				5313	next if length( $line ) == 0;
273
274	1561					3869	my $robot_token = $self->_tokenize( $line );
275
276	1561	100				50018	do {
277	923					3163	$self->_handle_user_agent( $parse_state, $robot_token );
278	923					28695	next;
279							} if $robot_token->directive->is_user_agent;
280
281	638	100				18557	do {
282	494					1833	$self->_handle_disallow( $parse_state, $robot_token );
283	494					20396	next;
284							} if $robot_token->directive->is_disallow;
285
286	144	100				4385	do {
287	57					291	$self->_handle_allow( $parse_state, $robot_token );
288	57					2216	next;
289							} if $robot_token->directive->is_allow;
290
291	87	100				2731	do {
292	17					76	$self->_handle_crawl_delay( $parse_state, $robot_token );
293	17					720	next;
294							} if $robot_token->directive->is_crawl_delay;
295
296	70	100				2123	do {
297	19					104	$self->_handle_sitemap( $parse_state, $robot_token );
298	19					1094	next;
299							} if $robot_token->directive->is_sitemap;
300
301	51	100				1554	do {
302	1					7	$self->_handle_http( $parse_state, $robot_token );
303	1					34	next;
304							} if $robot_token->directive->is_http;
305
306	50	100				1491	do {
307	13					166	$self->_report_warning(
308							sprintf(
309							"Unknown line in robots.txt file (size %d): %s",
310							length( $content ),
311							$line
312							),
313							$url
314							);
315	13					784	$parse_state->is_finished_agent_fields( 1 );
316	13					442	next;
317							} if $robot_token->directive->is_missing;
318
319	37	100				1058	do {
320	26					241	$self->_report_warning(
321							sprintf(
322							"Unknown directive in robots.txt file: %s",
323							$line
324							),
325							$url
326							);
327	26					1302	$parse_state->is_finished_agent_fields( 1 );
328	26					911	next;
329							} if $robot_token->directive->is_unknown;
330							}
331
332	70					820	$self->log->trace(Data::Dumper->Dump([$parse_state],['parse_state2']));
333
334	70					14965	my $robot_rules = $parse_state->current_rules();
335	70	100				2474	if ( $robot_rules->crawl_delay > $MAX_CRAWL_DELAY ) {
336	1					28	return CrawlerCommons::RobotRules->new(
337							_mode => $CrawlerCommons::RobotRules::ALLOW_NONE );
338							}
339							else {
340	69					390	$robot_rules->sort_rules;
341	69					2513	return $robot_rules;
342							}
343							}
344							#-----------------------------------------------------------------------------#
345
346							# Private Methods
347							#------------------#
348							#-----------------------------------------------------------------------------#
349							sub _handle_allow_or_disallow {
350	551			551		1425	my ($self, $state, $token, $allow_or_disallow ) = @_;
351
352	551					1634	$self->log->trace(Data::Dumper->Dump([\@_],['_handle_allow_or_disallow']));
353
354	551	100				253741	return if $state->is_skip_agents;
355
356	413					13669	$state->is_finished_agent_fields( 1 );
357
358	413	100				12683	return unless $state->is_adding_rules;
359
360	265		50			7765	my $path = $token->data // '';
361							try {
362	265			265		21846	$path = uri_unescape( $path );
363	265					3003	utf8::encode( $path );
364	265	100				793	if ( length( $path ) == 0 ) {
365	11					66	$state->clear_rules;
366							}
367							else {
368	254					1003	$state->add_rule( $path, $allow_or_disallow );
369							}
370							}
371							catch {
372	0			0		0	$self->_report_warning(
373							"Error parsing robot rules - can't decode path: $path\n$_",
374							$state->url
375							);
376	265					2838	};
377							}
378							#-----------------------------------------------------------------------------#
379	57			57		218	sub _handle_allow { shift->_handle_allow_or_disallow( @_, 1 ); }
380							#-----------------------------------------------------------------------------#
381							sub _handle_crawl_delay {
382	17			17		48	my ($self, $state, $token) = @_;
383
384	17					69	$self->log->trace(Data::Dumper->Dump([$state, $token],['state','token']));
385
386	17	100				3341	return if $state->is_skip_agents;
387
388	12					432	$state->is_finished_agent_fields( 1 );
389
390	12	100				398	return unless $state->is_adding_rules;
391
392	9					286	my $delay = $token->data;
393							try {
394	9			9		782	my $delay_ms = $delay * 1000;
395	9					61	$state->set_crawl_delay( $delay_ms );
396							}
397							catch {
398	0			0		0	$self->_report_warning(
399							"Error parsing robot rules - can't decode crawl delay: $delay",
400							$state->url
401							);
402	9					110	};
403							}
404							#-----------------------------------------------------------------------------#
405	494			494		1457	sub _handle_disallow { shift->_handle_allow_or_disallow( @_, 0 ); }
406							#-----------------------------------------------------------------------------#
407							sub _handle_http {
408	1			1		5	my ($self, $state, $token) = @_;
409	1					26	my $url_fragment = $token->data;
410	1	50				7	if ( index( $url_fragment, 'sitemap' ) ) {
411	1					37	my $fixed_token = CrawlerCommons::RobotToken->new(
412							data => 'http:' . $url_fragment,
413							directive =>
414							CrawlerCommons::RobotDirective
415							->get_directive('sitemap'),
416							);
417	1					10	$self->_handle_sitemap( $state, $fixed_token );
418							}
419							else {
420	0					0	$self->_report_warning(
421							"Fournd raw non-sitemap URL: http:$url_fragment", $state->url);
422							}
423							}
424							#-----------------------------------------------------------------------------#
425							sub _handle_sitemap {
426	20			20		66	my ($self, $state, $token) = @_;
427	20					634	my $sitemap = $token->data;
428							try {
429	20			20		2622	my $sitemap_url = URI->new_abs( $sitemap, URI->new( $state->url ) );
430	20		50			5867	my $host = $sitemap_url->host() // '';
431
432	20					899	$self->log->trace(<<"DUMP");
433							# _handle_sitemap
434							###################
435							sitemap $sitemap
436							sitemap_url $sitemap_url
437							host $host
438	20					1203	url ${\$state->url}
439							DUMP
440
441	20	50				296	$state->add_sitemap( $sitemap_url->as_string ) if ( $host ne '' );
442							}
443							catch {
444	0			0		0	$self->_report_warning( "Invalid URL with sitemap directive: $sitemap",
445							$state->url );
446	20					274	};
447							}
448							#-----------------------------------------------------------------------------#
449							sub _handle_user_agent {
450	923			923		1993	my ($self, $state, $token) = @_;
451	923	100				28740	if ( $state->is_matched_real_name ) {
452	121	100				3758	$state->is_skip_agents( 1 ) if $state->is_finished_agent_fields;
453	121					279	return;
454							}
455
456	802	100				24149	if ( $state->is_finished_agent_fields ) {
457	62					1984	$state->is_finished_agent_fields( 0 );
458	62					1991	$state->is_adding_rules( 0 );
459							}
460
461	802					23535	for my $target_name ( split(/,/, lc( $state->target_name ) ) ) {
462	806					23122	for my $agent_name ( split( m! \|\t\|,!, $token->data ) ) {
463	1362		50			5679	( $agent_name = lc( $agent_name // '' ) ) =~ s!^\s+\|\s+$!!g;
464
465	1362	100	100			5734	if ( $agent_name eq '*' && !$state->is_matched_wildcard ) {
		100
466	39					1221	$state->is_matched_wildcard( 1 );
467	39					1273	$state->is_adding_rules( 1 );
468							}
469							elsif ($agent_name ne '') {
470	1315					3027	for my $target_name_split ( split(/ /, $target_name) ) {
471	1340	100				4697	if (index( $target_name_split, $agent_name ) == 0 ) {
472	33					1164	$state->is_matched_real_name( 1 );
473	33					1025	$state->is_adding_rules( 1 );
474	33					196	$state->clear_rules;
475	33					139	last;
476							}
477							}
478							}
479							}
480							}
481							}
482							#-----------------------------------------------------------------------------#
483							sub _report_warning {
484	39			39		160	my ($self, $msg, $url) = @_;
485	39					1971	$self->increment_warnings;
486
487	39					1266	my $warning_count = $self->num_warnings;
488	39	100				187	$self->log->warn("Problem processing robots.txt for $url")
489							if $warning_count == 1;
490
491	39	100				670	$self->log->warn( $msg ) if $warning_count < $MAX_WARNINGS;
492							}
493							#-----------------------------------------------------------------------------#
494							sub _tokenize {
495	1561			1561		3466	my ($self, $line) = @_;
496
497	1561					3961	$self->log->trace("Parsing line: [$line]");
498
499	1561					29918	my $lower_line = lc( $line );
500	1561					6144	my ($directive) = ($lower_line =~ m!^([^:\s]+)!);
501	1561		50			3625	$directive //= '';
502
503	1561	100	100			66170	if ( $directive =~ m!^acap\-! \|\|
504							CrawlerCommons::RobotDirective->directive_exists( $directive ) ){
505
506	1522					5307	my $data_portion = substr($line, length( $directive ));
507	1522					9689	( my $data ) = ( $data_portion =~ m!$COLON_DIRECTIVE_PATTERN! );
508	1522	100				4370	( $data ) = ( $data_portion =~ m!$BLANK_DIRECTIVE_PATTERN! )
509							unless defined $data;
510	1522		50			3007	$data //= '';
511	1522					5836	$data =~ s!^\s+\|\s+$!!;
512
513	1522					4839	$self->log->trace(<<"DUMP");
514							# _tokenize dump
515							#################
516							line [$line]
517							directive [$directive]
518							data_portion [$data_portion]
519							data [$data]
520							DUMP
521
522	1522	100				91990	my $robot_directive =
523							CrawlerCommons::RobotDirective->get_directive(
524							$directive =~ m!^acap-!i ? 'acap-' : $directive );
525
526	1522					45419	return CrawlerCommons::RobotToken->new(
527							data => $data, directive => $robot_directive
528							);
529							}
530							else {
531	39	100				1864	my $robot_directive =
532							CrawlerCommons::RobotDirective->get_directive(
533							$lower_line =~ m![ \t]:[ \t](.*)! ? 'unknown' : 'missing' );
534
535	39					1339	return CrawlerCommons::RobotToken->new(
536							data => $line, directive => $robot_directive
537							);
538							}
539							}
540							#-----------------------------------------------------------------------------#
541
542							###############################################################################
543
544							__PACKAGE__->meta->make_immutable;
545
546							###############################################################################
547
548							=head1 AUTHOR
549
550							Adam K Robinson <akrobinson74@gmail.com>
551
552							=head1 COPYRIGHT AND LICENSE
553
554							This software is copyright (c) 2017 by Adam K Robinson.
555
556							This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself.
557
558							=cut
559
560							1;
561
562							__END__