File Coverage

blib/lib/WWW/Spyder.pm

Criterion	Covered	Total	%
statement	75	507	14.7
branch	0	260	0.0
condition	0	128	0.0
subroutine	25	72	34.7
pod	13	13	100.0
total	113	980	11.5

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::Spyder;
2
3	1			1		24344	use strict;
	1					3
	1					65
4	1			1		7	use warnings;
	1					2
	1					211
5
6	1			1		1400	use HTML::Parser 3;
	1					12055
	1					36
7
8	1			1		16172	use LWP::UserAgent;
	1					65533
	1					27
9	1			1		859	use HTTP::Cookies;
	1					7079
	1					24
10	1			1		738	use URI::URL;
	1					3938
	1					50
11	1			1		6	use HTML::Entities;
	1					2
	1					51
12
13	1			1		4	use Digest::MD5 "md5_base64"; # For making seen content key/index.
	1					1
	1					42
14
15	1			1		4	use Carp;
	1					1
	1					233
16							our $VERSION = '0.24';
17							our $VERBOSITY \|\|= 0;
18
19							# Methods
20							#--------------------------
21							{ # make it all a bit more private
22							my %_methods = (# these are methods & roots of our attribute names
23							UA => undef,
24							bell => undef,
25							html_parser => undef,
26							sleep_base => undef,
27							cookie_file => undef,
28							_exit_epoch => undef,
29							_term_count => undef,
30							);
31							# Those may all get hardcoded eventually, but they're handy for now.
32							#--------------------------
33							sub new {
34	0			0	1		my ( $caller ) = shift;
35	0		0				my $class = ref($caller) \|\| $caller;
36	0						my $self = bless {}, $class;
37
38	0						my ( $seed, %arg );
39	0	0					if ( @_ == 1 ) {
40	0						( $seed ) = @_;
41							}
42	0						%arg = ( broken_links => [],
43							exit_on => undef,
44							image_checking => 0,
45							report_broken_links => 0,
46							seed => undef,
47							sleep => undef,
48							sleep_base => 5,
49							UA => undef
50							);
51	0	0					%arg = ( %arg, @_ ) unless @_ % 2;
52
53							# Set our UA object if it was passed on to our constructor.
54	0	0					$self->{UA} = $arg{UA} if $arg{UA};
55
56							# Turn on image checking if requested. img src tags will be checked if
57							# image_checking is set to 1 in the constructor.
58	0	0					$self->{image_checking} = $arg{image_checking} if $arg{image_checking};
59
60							# Turn on broken link checking if requested. Broken link URIs can be
61							# obtained via get_broken_links().
62	0	0					$self->{report_broken_links} = $arg{report_broken_links}
63							if $arg{report_broken_links};
64
65							# Install all our methods, either set once then get only or push/shift
66							# array refs.
67	0						for my $method ( %_methods ) {
68	1			1		4	no strict "refs";
	1					1
	1					27
69	1			1		3	no warnings;
	1					2
	1					1621
70	0						my $attribute = '_' . $method;
71
72	0	0					if ( ref $_methods{$method} eq 'ARRAY' ) {
73	0						*{"$class::$method"} = sub {
74	0			0			my($self,@args) = @_;
75	0	0					return shift(@{$self->{$attribute}}) unless @args;
	0
76	0						push(@{$self->{$attribute}}, @args);
	0
77	0						};
78							} else {
79	0						*{"$class::$method"} = sub {
80	0			0			my($self,$arg) = @_;
81	0	0	0				carp "You cannot reset $method!"
82							if $arg and exists $self->{$attribute};
83	0	0					return $self->{$attribute} #get if already set
84							if exists $self->{$attribute};
85	0						$self->{$attribute} = $arg; #only set one time!
86	0						};
87							}
88							}
89
90	0		0				$seed \|\|= $arg{seed};
91	0	0					$self->seed($seed) if $seed;
92	0						$self->sleep_base($arg{sleep_base});
93	0	0					$self->_install_exit_check(\%arg) unless $self->can('_exit_check');
94	0						$self->_install_html_parser;
95	0						$self->_install_web_agent;
96
97	0						return $self;
98							}
99							#--------------------------
100							sub terms {
101	0			0	1		my ($self,@terms) = @_;
102	0	0	0				if ( @terms and not exists $self->{_terms} ) {
103	0						$self->_term_count(scalar @terms); # makes this set once op
104	0						my %terms;
105	0						$terms{$_} = qr/$_/ for @terms;
106	0						$self->{_terms} = \%terms;
107							} else {
108	0						return $self->{_terms}
109							}
110							}
111							#--------------------------
112							sub show_attributes {
113	0			0	1		my ($self) = @_;
114	0						return map {/^_(.+)$/} keys %{$self};
	0
	0
115							}
116							#--------------------------
117							sub slept {
118	0			0	1		my ($self, $time) = @_;
119	0	0					$self->{_Slept} += $time if $time;
120	0	0					return $self->{_Slept} unless $time;
121							}
122							#--------------------------
123							sub seed {
124	0			0	1		my ($self, $url) = @_;
125	0	0					$url or croak "Must provide URL to seed().";
126	0	0					croak "You have passed something besides a plain URL to seed()!"
127							if ref $url;
128	0						$self->_stack_urls($url);
129	0						return 1; # to the top of the stacks
130							}
131
132							#--------------------------
133							sub get_broken_links {
134	0			0	1		my $self = shift;
135
136	0						return $self->{broken_links};
137							}
138
139							#--------------------------
140							sub crawl {
141	0			0	1		my $self = shift;
142	0		0				my $opts = shift \|\| undef;
143	0						my $excludes = [];
144
145							# Exclude list option.
146	0	0					if ( ref($opts->{exclude}) eq 'ARRAY' ) {
147	0						$excludes = $opts->{exclude};
148							}
149
150	0						while ('I have pages to get...') {
151
152	0	0					$self->_exit_check and return;
153
154	0						my $skip_url = 0;
155	0						my $enQ = undef;
156
157							# Report a page with a 404 error in the title if report_broken_links is
158							# enabled. Also keep processing if we're looking for img src tags.
159	0	0	0				if ($self->{report_broken_links} \|\| $self->{image_checking}) {
160	0		0				$enQ = $self->_choose_courteously \|\|
161							$self->_just_choose;
162							} else {
163	0		0				$enQ = $self->_choose_courteously \|\|
164							$self->_just_choose \|\|
165							return;
166							}
167
168	0						my $url = $enQ->url;
169
170							# Skip this URL if it's in our excluded list.
171	0	0					for (@$excludes) { $skip_url = 1 if $url =~ m/$_/; }
	0
172	0	0					next if $skip_url;
173
174	0						$self->url($url);
175	0						$self->_current_enQ($enQ);
176
177	0	0					print "GET'ing: $url\n" if $VERBOSITY;
178
179	0						my $response = $self->UA->request # no redirects &c is simple_
180							( HTTP::Request->new( GET => "$url" ) );
181
182	0	0					print STDERR "\a" if $self->bell;
183
184	0	0	0				$response or
185							carp "$url failed GET!" and next;
186
187	0						push @{$self->{_courtesy_Queue}}, $enQ->domain;
	0
188	0						shift @{$self->{_courtesy_Queue}}
	0
189							if $self->{_courtesy_Queue}
190	0	0	0				and @{$self->{_courtesy_Queue}} > 100;
191
192	0						my $head = $response->headers_as_string;
193	0	0	0				$head or
194							carp "$url has no HEAD!" and
195							next; # no headless webpages
196
197	0	0					length($head) > 1_024 and $head = substr($head,0,1_024);
198
199	0	0					print $head, "\n" if $VERBOSITY > 2;
200
201	0						my $base;
202	0						eval { $base = $response->base };
	0
203	0	0	0				$base or
204							carp "$url has no discernible BASE!" and
205							next; # no baseless webpages
206
207							# WE SHOULD also look for because some servers that we might want
208							# to look at don't properly report the content-type
209
210							# start over unless this is something we can read
211	0						my $title = '';
212	0						my $description = '';
213	0						my $is_image = 0;
214
215							# Make an exception for images.
216	0	0					if ($self->{image_checking}) {
217	0	0					if ($head =~ /Content\-Type:\s*image/i) {
218	0						my ($img_size) = $head =~ /Content\-Length:\s*(\d+)/i;
219
220	0	0					if ($img_size <= 0) {
221	0						$title = $description = '404 Not Found';
222	0						next;
223							} else {
224	0						$is_image = 1;
225							}
226							}
227							} else {
228	0	0	0				lc($head) =~ /content-type:\s?(?:text\|html)/ or
229							carp "$url doesn't look like TEXT or HTML!" and
230							next; # no weird media, movies, flash, etc
231							}
232
233	0	0					( $title ) = $head =~ m,[Tt]itle:\s*(.+)\n,
234							unless $title;
235
236	0	0					( $description ) = $head =~
237							/[^:]?DESCRIPTION:\s((?:[^\n]+(?:\n )?)+)/i
238							unless $description;
239
240							# Add this link to our dead links list if the title matches
241							# a standard "404 Not Found" error.
242	0	0	0				if ($title && $self->{report_broken_links}) {
243	0	0					push(@{ $self->{broken_links} }, $url)
	0
244							if $title =~ /^\s404\s+Not\s+Found\s$/;
245							}
246
247	0	0					$description = $self->_snip($description) if $description;
248
249	0	0	0				my $page = $response->content or
250							carp "Failed to fetch $url." and
251							next; # no empty pages, start over with next url
252
253	0						$self->{_current_Bytes} = length($page);
254	0						$self->spyder_data($self->{_current_Bytes});
255
256							# we are going to use a digest to prevent parsing the identical
257							# content received via a different url
258	0						my $digest = md5_base64($page); # unique microtag of the page
259							# so if we've seen it before, start over with the next URL
260	0	0	0				$self->{_page_Memory}{$digest}++ and
261							carp "Seen this page's content before: $url"
262							and next;
263
264	0						$self->{_page_content} = $page;
265	0	0					print "PARSING: $url\n" if $VERBOSITY > 1;
266	0						$self->{_spydered}{$url}++;
267	0						$self->html_parser->parse($page);
268	0						$self->html_parser->eof;
269
270	0	0					$self->{_adjustment} = $self->_parse_for_terms if $self->terms;
271
272							# make links absolute and fix bad spacing in link names, then turn
273							# them into an Enqueue object
274	0						for my $pair ( @{$self->{_enqueue_Objects}} ) {
	0
275	0						my $url;
276	0						eval {
277	0						$url = URI::URL::url($pair->[0], $base)->abs;
278							};
279	0						my $name = _snip($pair->[1]);
280	0						my $item = WWW::Spyder::Enqueue->new("$url",$name);
281	0						$pair = $item;
282							}
283							# put links into the queue(s)
284	0	0					$self->_stack_urls() if $self->_links;
285
286							# clean up text a bit. should this be here...?
287	0	0	0				if ( $self->{_text} and ${$self->{_text}} ) {
	0
288	0						${$self->{_text}} =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
	0
289							}
290
291							# in the future Page object should be installed like parsers as a
292							# reusable container
293							# return
294	0		0				my $Page =
295							WWW::Spyder::Page->new(
296							title => $title,
297							text => $self->{_text},
298							raw => \$page,
299							url => $enQ->url,
300							domain => $enQ->domain,
301							link_name => undef,
302							link => undef,
303							description => $description \|\| '',
304							pages_enQs => $self->_enqueue,
305							);
306	0						$self->_reset; #<<--clear out things that might remain
307	0						return $Page;
308							}
309							}
310							#--------------------------
311							sub _stack_urls { # should eventually be broken into stack and sift?
312
313							# dual purpose, w/ terms it filters as long as there are no urls
314							# passed, otherwise it's setting them to the top of the queues
315	0			0			my ($self, @urls) = @_;
316
317	0	0	0				print "Stacking " . join(', ', @urls) . "\n"
318							if @urls and $VERBOSITY > 5;
319
320	0	0	0				if ( $self->terms and not @urls ) {
		0
321	1			1		6	no warnings;
	1					2
	1					1351
322	0						my @Qs = $self->_queues;
323	0						for my $enQ ( @{$self->_enqueue} ) {
	0
324	0						my ( $url, $name ) = ( $enQ->url, $enQ->name );
325
326	0	0					next if $self->_seen($url);
327
328	0						my $match = 0;
329	0						while ( my ($term,$rx) = each %{$self->terms} ) {
	0
330	0						$match++ for $name =~ /$rx/g;
331							}
332	0						my $baseQ = 10;
333	0						my $adjustment = $self->{_adjustment};
334	0						$baseQ -= $adjustment; # 4 to 0
335
336	0	0	0				push @{$self->{$baseQ}}, $enQ
	0
337							and next unless $match;
338
339	0	0					if ( $VERBOSITY > 1 ) {
340	0						print "NAME: $name\n";
341	0						printf " RATIO -->> %d\n", $match;
342							}
343	0						my $queue_index = sprintf "%d",
344							$self->_term_count / $match;
345
346	0						$queue_index -= $adjustment;
347	0	0					$queue_index = 4 if $queue_index > 4;
348	0	0					$queue_index = 0 if $queue_index < 0;
349	0						my $queue = $Qs[$queue_index];
350	0	0					if ($VERBOSITY > 2) {
351	0						print "Q:$queue [$queue_index] match: $match terms:",
352							$self->_term_count, " Adjust: $adjustment\n\n";
353							}
354	0						push @{$self->{$queue}}, $enQ;
	0
355							}
356							} elsif ( @urls > 0 ) {
357	0						for my $url ( @urls ) {
358	0	0					next if $self->_seen($url);
359	0						my $queue = $self->_queues;
360	0	0					carp "Placing $url in '$queue'\n" if $VERBOSITY > 2;
361
362							# unshift because seeding is priority
363	0						unshift @{$self->{$queue}},
	0
364							WWW::Spyder::Enqueue->new($url,undef);
365							}
366							} else {
367	0						for my $enQ ( @{$self->_enqueue} ) {
	0
368	0						my ( $url, $name ) = ( $enQ->url, $enQ->name );
369	0	0					next if $self->_seen($url);
370	0						my $queue = $self->_queues;
371	0						push @{$self->{$queue}}, $enQ;
	0
372							}
373							}
374							}
375							#--------------------------
376							sub queue_count {
377	0			0	1		my ($self) = @_;
378	0						my $count = 0;
379	0						for my $Q ( $self->_queues ) {
380	0	0					next unless ref($self->{$Q}) eq 'ARRAY';
381	0						$count += scalar @{$self->{$Q}};
	0
382							}
383	0						return $count;
384							}
385							#--------------------------
386							sub spyder_time {
387	0			0	1		my ($self,$raw) = @_;
388
389	0						my $time = time() - $^T;
390	0	0					return $time if $raw;
391
392	0						my $day = int( $time / 86400 );
393	0						my $hour = int( $time / 3600 ) % 24;
394	0						my $min = int( $time / 60 ) % 60;
395	0						my $sec = $time % 60;
396
397							# also collect slept time!
398	0	0					return sprintf "%d day%s %02d:%02d:%02d",
399							$day, $day == 1?'':'s', $hour, $min, $sec;
400							}
401							#--------------------------
402							sub spyder_data {
403	0			0	1		my ($self, $bytes) = @_;
404	0	0	0				$self->{_bytes_GOT} += $bytes and return $bytes if $bytes;
405
406	0	0					return 0 unless $self->{_bytes_GOT};
407
408	0						my $for_commas = int($self->{_bytes_GOT} / 1_024);
409
410	0						for ( $for_commas ) {
411	0						1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/;
412							}
413	0						return $for_commas;
414							}
415							#--------------------------
416							sub spydered {
417	0			0	1		my ($self) = @_;
418							return wantarray ?
419	0						keys %{ $self->{_spydered} } :
	0
420	0	0					scalar keys %{ $self->{_spydered} };
421							}
422							#--------------------------
423							#sub exclude { # what about FILES TYPES!?
424							# return undef; # not working yet!
425							# my ($self,$thing) = @_;
426							# if ( $thing =~ m<^[^:]{3,5}://> )
427							# {
428							# return $self->{_Xklood}{_domain}{$thing}++;
429							# }
430							# elsif ( $thing )
431							# {
432							# return $self->{_Xklood}{_name}{$thing}++;
433							# }
434							#}
435							#--------------------------
436							#sub excluded_domains {
437							# return undef; # not working yet!
438							# my ($self) = @_;
439							# return wantarray ?
440							# keys %{$self->{_Xklood}{_domain}} :
441							# [ keys %{$self->{_Xklood}{_domain}} ];
442							#}
443							#--------------------------
444							#sub excluded_names {
445							# return undef; # not working yet!
446							# my ($self) = @_;
447							# return wantarray ?
448							# keys %{$self->{_Xklood}{_name}} :
449							# [ keys %{$self->{_Xklood}{_name}} ];
450							#}
451							#--------------------------
452							sub go_to_seed {
453	0			0	1		my ( $self, $engine, $query ) = @_;
454	0						carp "go_to_seed() is not functional yet!\n";
455	0						return; # NOT FUNCTIONAL
456	0						my $seed = WWW::Spyder::Seed::get_seed($engine, $query);
457	0						$self->seed($seed);
458							}
459							#--------------------------
460							sub verbosity {
461	0			0	1		my ( $self, $verbosity ) = @_;
462	0	0	0				carp "Not setting verbosity! Must be integer b/t 1 & 6!\n"
463							and return
464							unless $verbosity;
465	0						$VERBOSITY = $verbosity;
466							}
467							#--------------------------
468
469							#--------------------------
470							# PRIVATE Spyder Methods
471							#--------------------------
472							sub _reset {
473							# RESET MORE THAN THIS!?! make sure all the memory space is clean that
474							# needs be for clean iteration???
475	0			0			my ($self) = @_;
476	0						$self->{$_} = undef for qw( _linkText _linkSwitch _href _src
477							_current_enQ _page_content
478							_current_Bytes _alt _enqueue_Objects
479							_text );
480							}
481							#--------------------------
482							sub _current_enQ {
483	0			0			my ($self, $enQ) = @_;
484	0						my $last_enQ = $self->{_current_enQ};
485	0	0					$self->{_current_enQ} = $enQ if $enQ;
486	0						return $last_enQ; #<<-so we can get last while setting a new one
487							}
488							#--------------------------
489							sub _enqueue {
490	0			0			my ($self,$enQ) = @_;
491	0	0					push @{$self->{_enqueue_Objects}}, $enQ if $enQ;
	0
492	0						return $self->{_enqueue_Objects};
493							}
494							#--------------------------
495							sub _links {
496	0			0			my ($self) = @_;
497	0						return [ map { $_->url } @{$self->_enqueue} ];
	0
	0
498							}
499							#--------------------------
500							sub _seen {
501	0			0			my ($self,$url) = @_;
502	0						return $self->{_seenURLs}{$url}++;
503							}
504							#--------------------------
505							sub _parse_for_terms {
506	0			0			my ($self) = @_;
507	0						$self->{_page_terms_matches} = 0;
508
509	0	0					return 0 unless $self->{_text};
510
511	0						while ( my ($term,$rx) = each %{$self->terms} ) {
	0
512	0						$self->{_page_terms_matches}++ for
513							$self->{_page_content} =~ /$rx/g;
514							}
515
516	0						my $index = int( ( $self->{_page_terms_matches} /
517							length($self->{_text}) ) * 1_000 );
518							# the algorithm might look it but isn't entirely arbitrary
519
520	0	0					print " PARSE TERMS : $self->{_page_terms_matches} " .
521							"/ $self->{_current_Bytes}\n" if $VERBOSITY > 1;
522
523	0	0					return 7 if $index > 25;
524	0	0					return 6 if $index > 18;
525	0	0					return 5 if $index > 14;
526	0	0					return 4 if $index > 11;
527	0	0					return 3 if $index > 7;
528	0	0					return 2 if $index > 3;
529	0	0					return 1 if $index > 0;
530	0						return 0;
531							}
532							#--------------------------
533							sub _install_html_parser {
534	0			0			my ($self) = @_;
535
536							my $Parser = HTML::Parser->new
537							(
538							start_h =>
539							[sub {
540	1			1		7	no warnings;
	1					2
	1					508
541	0			0			my ( $tag, $attr ) = @_;
542
543							# Check for broken image links if requested.
544	0	0	0				return if $tag !~ /^(?:a\|img)$/ && ! $self->{image_checking};
545
546							# need to deal with AREA tags from maps /^(?:a(?:rea)?\|img)$/;
547	0						$attr->{href} =~ s,#[^/]*$,,;
548	0	0					$attr->{src} =~ s,#[^/]*$,, if $self->{image_checking};
549	0	0					return if lc($attr->{href}) =~ m,^\s*mailto:,;
550	0	0					return if lc($attr->{href}) =~ m,^\s*file:,;
551	0	0					return if lc($attr->{href}) =~ m,javascript:,;
552
553	0	0	0				$self->{_src} \|\|= $attr->{src} if $self->{image_checking};
554	0		0				$self->{_href} \|\|= $attr->{href};
555	0		0				$self->{_alt} \|\|= $attr->{alt};
556	0						$self->{_linkSwitch} = 1;
557
558							# Don't wait for the end handler if we have an image, as an image
559							# src tag doesn't have an end.
560	0	0	0				if ($attr->{src} && $self->{image_checking} && ! $attr->{href}) {
			0
561	0		0				$self->{_linkText} \|\|= $self->{_alt} \|\| '+';
			0
562	0						decode_entities($self->{_linkText});
563
564	0						push @{$self->{_enqueue_Objects}},
	0
565							[ $self->{_href}, $self->{_linkText} ];
566
567	0	0	0				push @{$self->{_enqueue_Objects}},
	0
568							[ $self->{_src}, $self->{_linkText} ]
569							if $self->{_src} and $self->{image_checking};
570
571							# reset all our caching variables
572	0						$self->{_linkSwitch} = $self->{_href} = $self->{_alt} =
573							$self->{_src} = $self->{_linkText} = undef;
574
575	0						return;
576							}
577							}, 'tagname, attr'],
578							text_h =>
579							[sub {
580
581	0	0		0			return unless(my $it = shift);
582	0	0					return if $it =~
583							m/(?:\Q\E)/;
584	0						${$self->{_text}} .= $it;
	0
585	0	0					$self->{_linkText} .= $it
586							if $self->{_linkSwitch};
587							}, 'dtext'],
588							end_h =>
589							[sub {
590	0			0			my ( $tag ) = @_;
591	1			1		13	no warnings; # only problem: Links
	1					2
	1					435
592
593	0	0					if ($self->{image_checking}) {
594	0	0	0				return unless $tag eq 'a' or $self->{_linkSwitch} or
			0
595							$tag eq 'img';
596							} else {
597	0	0	0				return unless $tag eq 'a' or $self->{_linkSwitch};
598							}
599
600	0		0				$self->{_linkText} \|\|= $self->{_alt} \|\| '+';
			0
601	0						decode_entities($self->{_linkText});
602
603	0						push @{$self->{_enqueue_Objects}},
	0
604							[ $self->{_href}, $self->{_linkText} ];
605
606	0	0	0				push @{$self->{_enqueue_Objects}},
	0
607							[ $self->{_src}, $self->{_linkText} ]
608							if $self->{_src} and $self->{image_checking};
609
610							# reset all our caching variables
611	0						$self->{_linkSwitch} = $self->{_href} = $self->{_alt} = $self->{_src} =
612							$self->{_linkText} = undef;
613	0						}, 'tagname'],
614							default_h => [""],
615							);
616	0						$Parser->ignore_elements(qw(script style));
617	0						$Parser->unbroken_text(1);
618	0						$self->html_parser($Parser);
619							}
620							#--------------------------
621							sub _install_web_agent {
622	0			0			my $self = shift;
623	0						my $jar_jar = undef;
624
625							# If a LWP::UserAgent object was passed in to our constructor, use
626							# it.
627	0	0					if ($self->{UA}) {
628	0						$self->UA( $self->{UA} );
629
630							# Otherwise, create a new one.
631							} else {
632	0						$self->UA( LWP::UserAgent->new );
633							}
634
635	0						$self->UA->agent('Mozilla/5.0');
636	0						$self->UA->timeout(30);
637	0						$self->UA->max_size(250_000);
638
639							# Get our cookie from our the jar passed in.
640	0	0					if ($self->{UA}) {
641	0						$jar_jar = $self->{UA}->cookie_jar();
642
643							# Or else create a new cookie.
644							} else {
645	0		0				$jar_jar = HTTP::Cookies->new
646							(file => $self->cookie_file \|\| "$ENV{HOME}/spyderCookies",
647							autosave => 1,
648							max_cookie_size => 4096,
649							max_cookies_per_domain => 5, );
650							}
651
652	0						$self->UA->cookie_jar($jar_jar);
653							}
654							#--------------------------
655							sub _install_exit_check {
656	0			0			my ($self, $arg) = @_;
657	0						my $class = ref $self;
658
659	0	0	0				unless ( ref($arg) and ref($arg->{exit_on}) eq 'HASH' ) {
660	1			1		6	no strict "refs";
	1					1
	1					310
661	0						*{$class."::_exit_check"} =
662	0	0		0			sub { return 1 unless $self->queue_count;
663	0						return 0;
664	0						};
665	0						return;
666							}
667
668							# checks can be: links => #, success => ratio, time => 10min...
669							# a piece of code we're going to build up to eval into method-hood
670	0						my $SUB = 'sub { my $self = shift; ' .
671							'return 1 unless $self->queue_count; ';
672							#------------------------------------------------------------
673	0	0					if ( $arg->{exit_on}{pages} ) {
674	0	0					print "Installing EXIT on links: $arg->{exit_on}{pages}\n"
675							if $VERBOSITY > 1;
676	0						$SUB .= ' return 1 if ' .
677							'$self->spydered >= ' .$arg->{exit_on}{pages} .';';
678							}
679							#------------------------------------------------------------
680	0	0					if ( $arg->{exit_on}{success} ) {
681							#set necessary obj value and add to sub code
682							}
683							#------------------------------------------------------------
684	0	0					if ( $arg->{exit_on}{time} ) {
685	0	0					print "Installing EXIT on time: $arg->{exit_on}{time}\n"
686							if $VERBOSITY > 1;
687
688	0						my ($amount,$unit) =
689							$arg->{exit_on}{time} =~ /^(\d+)\W*(\w+?)s?$/;
690							# skip final "s" in case of hours, secs, mins
691
692	0						my %times = ( hour => 3600,
693							min => 60,
694							sec => 1 );
695
696	0						my $time_factor = 0;
697	0						for ( keys %times ) {
698	0	0					next unless exists $times{$unit};
699	0						$time_factor = $amount * $times{$unit};
700							}
701	0						$self->_exit_epoch($time_factor + $^T);
702
703	0						$SUB .= q{
704							return 1 if $self->_exit_epoch < time();
705							};
706							}
707							#------------------------------------------------------------
708	0						$SUB .= '}';
709
710	1			1		4	no strict "refs";
	1					2
	1					748
711	0						*{$class."::_exit_check"} = eval $SUB;
	0
712							}
713							#--------------------------
714							sub _choose_courteously {
715	0			0			my $self = shift;
716
717							# w/o the switch and $i-- it acts a bit more depth first. w/ it, it's
718							# basically hard head down breadth first
719	0	0					print "CHOOSING courteously!\n" if $VERBOSITY > 1;
720	0						for my $Q ( $self->_queues ) {
721	0	0					print "Looking for URL in $Q\n" if $VERBOSITY > 2;
722	0	0	0				next unless $self->{$Q} and @{$self->{$Q}} > 0;
	0
723	0						my %seen;
724	0						my $total = scalar @{$self->{$Q}};
	0
725	0						my $switch;
726	0						for ( my $i = 0; $i < @{$self->{$Q}}; $i++ ) {
	0
727	0						my $enQ = $self->{$Q}[$i];
728	0						my ($url,$name) = ( $enQ->url, $enQ->name );
729
730							# if we see one again, we've reshuffled as much as is useful
731	0	0					$seen{$url}++
732							and $switch = 1; # progress through to next Q
733
734	0	0					return splice(@{$self->{$Q}},$i,1)
	0
735							unless $self->_courtesy_call($enQ);
736
737	0						my $fair_bump = int( log( $total - $i ) / log(1.5) );
738
739	0						my $move_me_back = splice(@{$self->{$Q}},$i,1);
	0
740	0						splice(@{$self->{$Q}},($i+$fair_bump),0,$move_me_back);
	0
741	0	0					$i-- unless $switch;
742							}
743							}
744							# we couldn't pick one courteously
745							} # end of _choose_courteously()
746							#--------------------------
747							sub _just_choose {
748	0			0			my $self = shift;
749	0	0					print "CHOOSING first up!\n" if $VERBOSITY > 1;
750
751	0						my $enQ;
752	0						for my $Q ( $self->_queues ) {
753	0	0					next unless ref($self->{$Q}) eq 'ARRAY';
754	0						$enQ = shift @{$self->{$Q}};
	0
755	0						last;
756							}
757	0						my $tax = $self->_courtesy_call($enQ);
758	0	0					if ( $VERBOSITY > 4 ) {
759	0						print ' QUEUE: ';
760	0	0					print join("-:-", @{$self->{_courtesy_Queue}}), "\n"
	0
761							if $self->{_courtesy_Queue};
762							}
763	0						my $sleep = int(rand($self->sleep_base)) + $tax;
764
765	0	0					if ( $VERBOSITY ) {
766	0	0					printf "COURTESY NAP %d second%s ",
767							$sleep, $sleep == 1 ?'':'s';
768	0	0					printf "(Domain recently seen: %d time%s)\n",
769							$tax, $tax == 1 ?'':'s';
770							}
771	0						sleep $sleep; # courtesy to websites but human-ish w/ random
772	0						$self->slept($sleep);
773	0						return $enQ;
774							}
775							#--------------------------
776							sub _courtesy_call {
777	0			0			my ($self,$enQ) = @_;
778	0	0					return 0 unless $enQ;
779	0						my $domain = $enQ->domain;
780
781	0	0					print 'COURTESY check: ', $domain, "\n" if $VERBOSITY > 5;
782
783							# yes, we have seen it in the last whatever GETs
784	0						my $seen = 0;
785	0						$seen = scalar grep { $_ eq $domain }
	0
786	0						@{$self->{_courtesy_Queue}};
787	0	0					$seen = 10 if $seen > 10;
788	0						return $seen;
789							}
790							#--------------------------
791							sub _queues { # Q9 is purely for trash so it's not returned here
792							return wantarray ?
793	0	0		0			( 0 .. 9 ) :
794							'0';
795							}
796							#--------------------------
797							sub _snip {
798	0	0		0			my $self = shift if ref($_[0]);
799	0						my ( @text ) = @_;
800	0						s/^\s+//, s/\s+$//, s/\s+/ /g for @text;
801	0	0					return wantarray ? @text : shift @text;
802							}
803							#--------------------------
804							# Spyder ENDS
805							#--------------------------
806							}# WWW::Spyder privacy ends
807
808
809							#--------------------------
810							package WWW::Spyder::Enqueue;
811							#--------------------------
812							{
813	1			1		5	use Carp;
	1					2
	1					116
814							#---------------------------------------------------------------------
815	1					6	use overload( q{""} => '_stringify',
816	1			1		6	fallback => 1 );
	1					2
817							#---------------------------------------------------------------------
818							# 0 -->> URL
819							# 1 -->> name, if any, of link URL was got from
820							# 2 -->> domain
821							#--------------------------
822							sub new {
823	0			0			my ( $caller, $url, $name ) = @_;
824	0		0				my $class = ref($caller) \|\| $caller;
825	0	0					croak "Here I am. " if ref $url;
826	0	0					return undef unless $url;
827	0	0					if ( length($url) > 512 ) { # that's toooo long, don't you think?
828	0						$url = substr($url,0,512);
829							}
830	0	0	0				if ( $name and length($name) > 512 ) {
831	0						$name = substr($url,0,509) . '...';
832							}
833	0	0					$name = '-' unless $name; # need this to find a bug later
834	0						my ( $domain ) = $url =~ m,^[^:]+:/+([^/]+),;
835	0						bless [ $url, $name, lc($domain) ], $class;
836							}
837							#--------------------------
838							sub url {
839	0			0			return $_[0]->[0];
840							}
841							#--------------------------
842							sub name {
843	0			0			return $_[0]->[1];
844							}
845							#--------------------------
846							sub domain {
847	0			0			return $_[0]->[2];
848							}
849							#--------------------------
850							sub _stringify {
851	0			0			return $_[0]->[0];
852							}
853							#--------------------------
854							}#privacy for WWW::Spyder::Enqueue ends
855
856
857							#--------------------------
858							package WWW::Spyder::Page;
859							#--------------------------
860	1			1		279	use strict;
	1					1
	1					27
861	1			1		3	use warnings;
	1					2
	1					22
862	1			1		15	use Carp;
	1					1
	1					95
863							{
864							sub new {
865	0			0			my ( $caller, %arg ) = @_;
866	0		0				my $class = ref($caller) \|\| $caller;
867	0						my $self = bless {}, $class;
868
869	0						while ( my ( $method, $val ) = each %arg ) {
870
871	1			1		4	no strict "refs";
	1					1
	1					22
872	1			1		4	no warnings;
	1					1
	1					410
873	0						my $attribute = '_' . $method;
874
875	0	0					if ( ref $val eq 'ARRAY' ) {
876	0						*{"$class::$method"} = sub {
877	0			0			my($self,$arg) = @_;
878	0	0					return @{$self->{$attribute}} unless $arg;
	0
879	0						push(@{$self->{$attribute}}, @{$arg});
	0
	0
880	0						};
881							} else {
882	0						*{"$class::$method"} = sub {
883	0			0			my($self,$arg) = @_;
884							# get if already set and deref if needed
885	0	0	0				if ( not $arg and exists $self->{$attribute} ) {
886	0						return ref($self->{$attribute}) eq 'SCALAR' ?
887	0	0					${$self->{$attribute}} : $self->{$attribute};
888							}
889	0	0					$self->{$attribute} = $arg if $arg; #only set one time!
890	0						};
891							}
892	0						$self->$method($val);
893							}
894	0						return $self;
895							}
896							#--------------------------
897							sub links {
898	0			0			my ( $self ) = @_;
899	0						return map {$_->url} @{$self->{_pages_enQs}};
	0
	0
900							}
901							#--------------------------
902							sub next_link {
903	0			0			my ( $self ) = @_;
904	0						shift @{$self->{_pages_enQs}};
	0
905							}
906							#--------------------------
907							}#privacy for ::Page ends
908
909
910							#--------------------------
911							package WWW::Spyder::Exclusions;
912							#--------------------------
913							{
914							# THIS PACKAGE IS NOT BEING USED
915							my %_domains = qw(
916							ad.doubleclick.net 1
917							ads.clickagents.com 1
918							);
919							my %_names = qw(
920
921							);
922							#--------------------------
923							sub exclude_domain {
924	0			0			$_domains{shift}++;
925							}
926							#--------------------------
927							sub excluded {
928	0			0			my $what = shift;
929	0	0					exists $_domains{$what} \|\| $_names{$what};
930							}
931							#--------------------------
932							}#privacy ends
933
934
935							#--------------------------
936							package WWW::Spyder::Seed;
937							#--------------------------
938							{
939	1			1		11	use URI::Escape;
	1					2
	1					48
940	1			1		4	use Carp;
	1					1
	1					270
941
942							my %engine_url =
943							(
944							google => 'http://www.google.com/search?q=',
945							yahoo => 1
946							);
947
948							# should we exclude the search domain at this point? i think so because
949							# otherwise we've introduced dozens of erroneous links and the engine
950							# is gonna get hammered over time for it
951							#--------------------------
952							sub get_seed {
953
954	0		0	0			my $engine = shift \|\| croak "Must provide search engine! " .
955							join(', ', sort keys %engine_url) . "\n";
956
957	0		0				my $query = shift \|\| croak "Must provide query terms!\n";
958	0						$query = uri_escape($query);
959
960	0	0					croak "$engine is not a valid choice!\n"
961							unless exists $engine_url{lc$engine};
962
963	0						return $engine_url{lc$engine} . $query;
964							}
965
966							} # Privacy for WWW::Spyder::Seed ends
967
968							1;
969
970							# Plain Old D'errrrr
971
972							=pod
973
974							=head1 NAME
975
976							WWW::Spyder - a simple non-persistent web crawler.
977
978							=head1 VERSION
979
980							0.24
981
982							=head1 SYNOPSIS
983
984							A web spider that returns plain text, HTML, and other information per
985							page crawled and can determine what pages to get and parse based on
986							supplied terms compared to the text in links as well as page content.
987
988							use WWW::Spyder;
989							# Supply your own LWP::UserAgent-compatible agent.
990							use WWW::Mechanize;
991
992							my $start_url = "http://my-great-domain.com/";
993							my $mech = WWW::Mechanize->new(agent => "PreferredAgent/0.01")
994
995							my $spyder = WWW::Spyder->new(
996							report_broken_links => 1,
997							seed => $start_url,
998							sleep_base => 5,
999							UA => $mech
1000							);
1001							while ( my $page = $spyder->crawl ) {
1002							# do something with the page...
1003							}
1004
1005							=head1 METHODS
1006
1007							=over 2
1008
1009							=item * $spyder->new()
1010
1011							Construct a new spyder object. Without at least the seed() set, or
1012							go_to_seed() turned on, the spyder isn't ready to crawl.
1013
1014							$spyder = WWW::Spyder->new(shift\|\|die"Gimme a URL!\n");
1015							# ...or...
1016							$spyder = WWW::Spyder->new( %options );
1017
1018							Options include: sleep_base (in seconds), exit_on (hash of methods and
1019							settings), report_broken_links, image_checking (verifies the images pointed to
1020							by tags, disable_cnap (disables the courtesy nap when verbose
1021							output is enabled), and UA (you can pass in an instantiated LWP::UserAgent
1022							object via UA, i.e. UA => $ua_obj). Examples below.
1023
1024							=item * $spyder->seed($url)
1025
1026							Adds a URL (or URLs) to the top of the queues for crawling. If the
1027							spyder is constructed with a single scalar argument, that is considered
1028							the seed.
1029
1030							=item * $spyder->bell([bool])
1031
1032							This will print a bell ("\a") to STDERR on every successfully crawled
1033							page. It might seem annoying but it is an excellent way to know your
1034							spyder is behaving and working. True value turns it on. Right now it
1035							can't be turned off.
1036
1037							=item * $spyder->spyder_time([bool])
1038
1039							Returns raw seconds since I was created if given a
1040							boolean value, otherwise returns "D day(s) HH::MM:SS."
1041
1042							=item * $spyder->terms([list of terms to match])
1043
1044							The more terms, the more the spyder is going to grasp at. If you give
1045							a straight list of strings, they will be turned into very open
1046							regexes. E.g.: "king" would match "sulking" and "kinglet" but not
1047							"King." It is case sensitive right now. If you want more specific
1048							matching or different behavior, pass your own regexes instead of
1049							strings.
1050
1051							$spyder->terms( qr/\bkings?\b/i, qr/\bqueens?\b/i );
1052
1053							terms() is only settable once right now, then it's a done deal.
1054
1055							=item * $spyder->spyder_data()
1056
1057							A comma formatted number of kilobytes retrieved so far. B give
1058							it an argument. It's a set/get routine.
1059
1060							=item * $spyder->slept()
1061
1062							Returns the total number of seconds the spyder has slept while
1063							running. Useful for getting accurate page/time counts (spyder
1064							performance) discounting the added courtesy naps.
1065
1066							=item * $spyder->UA->...
1067
1068							The user agent. It should be an L or a well-behaved
1069							subclass like L. Here are the initialized values you
1070							might want to tweak-
1071
1072							$spyder->UA->timeout(30);
1073							$spyder->UA->max_size(250_000);
1074							$spyder->UA->agent('Mozilla/5.0');
1075
1076							Changing the agent name can hurt your spyder because some servers won't
1077							return content unless it's requested by a "browser" they recognize.
1078
1079							You should probably add your email with from() as well.
1080
1081							$spyder->UA->from('bluefintuna@fish.net');
1082
1083							=item * $spyder->cookie_file([local_file])
1084
1085							They live in $ENV{HOME}/spyderCookie by default but you can set your
1086							own file if you prefer or want to save different cookie files for
1087							different spyders.
1088
1089							=item * $spyder->get_broken_links
1090
1091							Returns a reference to a list of broken link URLs if report_broken_links was
1092							was enabled in the constructor.
1093
1094							=item * $spyder->go_to_seed
1095
1096							=item * $spyder->queue_count
1097
1098							=item * $spyder->show_attributes
1099
1100							=item * $spyder->spydered
1101
1102							=item * $spyder->crawl
1103
1104							Returns (and removes) a Spyder page object from the queue of spydered pages.
1105
1106							=back
1107
1108							=head2 Sypder::Page methods
1109
1110							=over 6
1111
1112							=item * $page->title
1113
1114							=item * $page->text
1115
1116							=item * $page->raw
1117
1118							=item * $page->url
1119
1120							=item * $page->domain
1121
1122							=item * $page->link_name
1123
1124							=item * $page->link
1125
1126							=item * $page->description
1127
1128							=item * $page->pages_enQs
1129
1130							=back
1131
1132							=head2 Weird courteous behavior
1133
1134							Courtesy didn't used to be weird, but that's another story. You will
1135							probably notice that the courtesy routines force a sleep when a
1136							recently seen domain is the only choice for a new link. The sleep is
1137							partially randomized. This is to prevent the spyder from being
1138							recognized in weblogs as a robot.
1139
1140							=head2 The web and courtesy
1141
1142							B, I beg of thee, exercise the most courtesy you can. Don't
1143							let impatience get in the way. Bandwidth and server traffic are
1144							C<$MONEY> for real. The web is an extremely disorganized and corrupted
1145							database at the root but companies and individuals pay to keep it
1146							available. The less pain you cause by banging away on a webserver with
1147							a web agent, the more welcome the next web agent will be.
1148
1149							B: Google seems to be excluding generic LWP agents now. See, I
1150							told you so. A single parallel robot can really hammer a major server,
1151							even someone with as big a farm and as much bandwidth as Google.
1152
1153							=head2 VERBOSITY
1154
1155							=over 2
1156
1157							=item * $spyder->verbosity([1-6]) -OR-
1158
1159							=item * $WWW::Spyder::VERBOSITY = ...
1160
1161							Set it from 1 to 6 right now to get varying amounts of extra info to
1162							STDOUT. It's an uneven scale and will be straightened out pretty soon.
1163							If kids have a preference for sending the info to STDERR, I'll do
1164							that. I might anyway.
1165
1166							=back
1167
1168							=head1 SAMPLE USAGE
1169
1170							=head2 See "spyder-mini-bio" in this distribution
1171
1172							It's an extremely simple, but fairly cool pseudo bio-researcher.
1173
1174							=head2 Simple continually crawling spyder:
1175
1176							In the following code snippet:
1177
1178							use WWW::Spyder;
1179
1180							my $spyder = WWW::Spyder->new( shift \|\| die"Give me a URL!\n" );
1181
1182							while ( my $page = $spyder->crawl ) {
1183
1184							print '-'x70,"\n";
1185							print "Spydering: ", $page->title, "\n";
1186							print " URL: ", $page->url, "\n";
1187							print " Desc: ", $page->description \|\| 'n/a', "\n";
1188							print '-'x70,"\n";
1189							while ( my $link = $page->next_link ) {
1190							printf "%22s ->> %s\n",
1191							length($link->name) > 22 ?
1192							substr($link->name,0,19).'...' : $link->name,
1193							length($link) > 43 ?
1194							substr($link,0,40).'...' : $link;
1195							}
1196							}
1197
1198							as long as unique URLs are being found in the pages crawled, the
1199							spyder will never stop.
1200
1201							Each "crawl" returns a page object which gives the following methods
1202							to get information about the page.
1203
1204							=over 2
1205
1206							=item * $page->links
1207
1208							URLs found on the page.
1209
1210							=item * $page->title
1211
1212							Page's Title if there is one.
1213
1214							=item * $page->text
1215
1216							The parsed plain text out of the page. Uses HTML::Parser and tries to
1217							ignore non-readable stuff like comments and scripts.
1218
1219							=item * $page->url
1220
1221							=item * $page->domain
1222
1223							=item * $page->raw
1224
1225							The content returned by the server. Should be HTML.
1226
1227							=item * $page->description
1228
1229							The META description of the page if there is one.
1230
1231							=item * $page->links
1232
1233							Returns a list of the URLs in the page. Note: next_link() will shift
1234							the available list of links() each time it's called.
1235
1236							=item * $link = $page->next_link
1237
1238							next_link() destructively returns the next URI-ish object in the page.
1239							They are objects with three accessors.
1240
1241							=back
1242
1243							=over 6
1244
1245							=item * $link->url
1246
1247							This is also overloaded so that interpolating "$link" will get the
1248							URL just as the method does.
1249
1250							=item * $link->name
1251
1252							=item * $link->domain
1253
1254							=back
1255
1256							=head2 Spyder that will give up the ghost...
1257
1258							The following spyder is initialized to stop crawling when I of
1259							its conditions are met: 10mins pass or 300 pages are crawled.
1260
1261							use WWW::Spyder;
1262
1263							my $url = shift \|\| die "Please give me a URL to start!\n";
1264
1265							my $spyder = WWW::Spyder->new
1266							(seed => $url,
1267							sleep_base => 10,
1268							exit_on => { pages => 300,
1269							time => '10min', },);
1270
1271							while ( my $page = $spyder->crawl ) {
1272
1273							print '-'x70,"\n";
1274							print "Spydering: ", $page->title, "\n";
1275							print " URL: ", $page->url, "\n";
1276							print " Desc: ", $page->description \|\| '', "\n";
1277							print '-'x70,"\n";
1278							while ( my $link = $page->next_link ) {
1279							printf "%22s ->> %s\n",
1280							length($link->name) > 22 ?
1281							substr($link->name,0,19).'...' : $link->name,
1282							length($link) > 43 ?
1283							substr($link,0,40).'...' : $link;
1284							}
1285							}
1286
1287							=head2 Primitive page reader
1288
1289							use WWW::Spyder;
1290							use Text::Wrap;
1291
1292							my $url = shift \|\| die "Please give me a URL to start!\n";
1293							@ARGV or die "Please also give me a search term.\n";
1294							my $spyder = WWW::Spyder->new;
1295							$spyder->seed($url);
1296							$spyder->terms(@ARGV);
1297
1298							while ( my $page = $spyder->crawl ) {
1299							print '-'x70,"\n * ";
1300							print $page->title, "\n";
1301							print '-'x70,"\n";
1302							print wrap('','', $page->text);
1303							sleep 60;
1304							}
1305
1306							=head1 TIPS
1307
1308							If you are going to do anything important with it, implement some
1309							signal blocking to prevent accidental problems and tie your gathered
1310							information to a DB_File or some such.
1311
1312							You might want to load C. It should top the nice off
1313							at your system's max and prevent your spyder from interfering with
1314							your system.
1315
1316							You might want to to set $\| = 1.
1317
1318							=head1 PRIVATE METHODS
1319
1320							=head2 are private but hack away if you're inclined
1321
1322							=head1 TO DO
1323
1324							I is conceived to live in a future namespace as a servant class
1325							for a complex web research agent with simple interfaces to
1326							pre-designed grammars for research reports; or self-designed
1327							grammars/reports (might be implemented via Parse::FastDescent if that
1328							lazy-bones Conway would just find another 5 hours in the paltry 32
1329							hour day he's presently working).
1330
1331							I'd like the thing to be able to parse RTF, PDF, and perhaps even
1332							resource sections of image files but that isn't on the radar right
1333							now.
1334
1335							The tests should work differently. Currently they ask for outside
1336							resources without checking if there is either an open way to do it or
1337							if the user approves of it. Bad form all around.
1338
1339							=head1 TO DOABLE BY 1.0
1340
1341							Add 2-4 sample scripts that are a bit more useful.
1342
1343							There are many functions that should be under the programmer's control
1344							and not buried in the spyder. They will emerge soon. I'd like to put
1345							in hooks to allow the user to keep(), toss(), or exclude(), urls, link
1346							names, and domains, while crawling.
1347
1348							Clean up some redundant, sloppy, and weird code. Probably change or
1349							remove the AUTOLOAD.
1350
1351							Put in a go_to_seed() method and a subclass, ::Seed, with rules to
1352							construct query URLs by search engine. It would be the autostart or the
1353							fallback for perpetual spyders that run out of links. It would hit a
1354							given or default search engine with the I's terms as the query.
1355							Obviously this would only work with terms() defined.
1356
1357							Implement auto-exclusion for failure vs. success rates on names as well
1358							as domains (maybe URI suffixes too).
1359
1360							Turn length of courtesy queue into the breadth/depth setting? make it
1361							automatically adjusting...?
1362
1363							Consistently found link names are excluded from term strength sorting?
1364							Eg: "privacy policy," "read more," "copyright..."
1365
1366							Fix some image tag parsing problems and add area tag parsing.
1367
1368							Configuration for user:password by domain.
1369
1370							::Page objects become reusable so that a spyder only needs one.
1371
1372							::Enqueue objects become indexed so they are nixable from anywhere.
1373
1374							Expand exit_on routines to size, slept time, dwindling success ratio,
1375							and maybe more.
1376
1377							Make methods to set "skepticism" and "effort" which will influence the
1378							way the terms are used to keep, order, and toss URLs.
1379
1380							=head1 BE WARNED
1381
1382							This module already does some extremely useful things but it's in its
1383							infancy and it is conceived to live in a different namespace and
1384							perhaps become more private as a subservient part of a parent class.
1385							This may never happen but it's the idea. So don't put this into
1386							production code yet. I am endeavoring to keep its interface constant
1387							either way. That said, it could change completely.
1388
1389							=head2 Also!
1390
1391							This module saves cookies to the user's home. There will be more
1392							control over cookies in the future, but that's how it is right now.
1393							They live in $ENV{HOME}/spyderCookie.
1394
1395							=head2 Anche!
1396
1397							Robot Rules aren't respected. I endeavors to be polite as far
1398							as server hits are concerned, but doesn't take "no" for answer right
1399							now. I want to add this, and not just by domain, but by page settings.
1400
1401							=head1 UNDOCUMENTED FEATURES
1402
1403							A.k.a. Bugs. Don't be ridiculous! Bugs in B?!
1404
1405							There is a bug that is causing retrieval of image src tags, I think
1406							but haven't tracked it down yet, as links. I also think the plain text
1407							parsing has some problems which will be remedied shortly.
1408
1409							If you are building more than one spyder in the same script they are
1410							going to share the same exit_on parameters because it's a
1411							self-installing method. This will not always be so.
1412
1413							See B file for more open and past issues.
1414
1415							Let me know if you find any others. If you find one that is platform
1416							specific, please send patch code/suggestion because I might not have
1417							any idea how to fix it.
1418
1419							=head1 WHY C
1420
1421							I didn't want to use the more appropriate I because I think
1422							there is a better one out there somewhere in the zeitgeist and the
1423							namespace future of I is uncertain. It may end up a
1424							semi-private part of a bigger family. And I may be King of Kenya
1425							someday. One's got to dream.
1426
1427							If you like I, have feedback, wishlist usage, better
1428							algorithms/implementations for any part of it, please let me know!
1429
1430							=head1 THANKS TO
1431
1432							Most all y'all. Especially Lincoln Stein, Gisle Aas, The Conway,
1433							Raphael Manfredi, Gurusamy Sarathy, and plenty of others.
1434
1435							=head1 COMPARE WITH (PROBABLY PREFER)
1436
1437							L, L, L, L,
1438							L, and other kith and kin.
1439
1440							=head1 LICENCE AND COPYRIGHT
1441
1442							Copyright (c) 2001-2008, Ashley Pond V C<< >>. All
1443							rights reserved.
1444
1445							This module is free software; you can redistribute it and/or
1446							modify it under the same terms as Perl itself. See L.
1447
1448							=head1 DISCLAIMER OF WARRANTY
1449
1450							BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
1451							FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
1452							OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
1453							PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
1454							EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1455							WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
1456							ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
1457							YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
1458							NECESSARY SERVICING, REPAIR, OR CORRECTION.
1459
1460							IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
1461							WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
1462							REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
1463							LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
1464							OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
1465							THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
1466							RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
1467							FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
1468							SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
1469							SUCH DAMAGES.
1470
1471							=cut