line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::Spyder; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
24344
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
65
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
211
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1400
|
use HTML::Parser 3; |
|
1
|
|
|
|
|
12055
|
|
|
1
|
|
|
|
|
36
|
|
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
16172
|
use LWP::UserAgent; |
|
1
|
|
|
|
|
65533
|
|
|
1
|
|
|
|
|
27
|
|
9
|
1
|
|
|
1
|
|
859
|
use HTTP::Cookies; |
|
1
|
|
|
|
|
7079
|
|
|
1
|
|
|
|
|
24
|
|
10
|
1
|
|
|
1
|
|
738
|
use URI::URL; |
|
1
|
|
|
|
|
3938
|
|
|
1
|
|
|
|
|
50
|
|
11
|
1
|
|
|
1
|
|
6
|
use HTML::Entities; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
51
|
|
12
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
4
|
use Digest::MD5 "md5_base64"; # For making seen content key/index. |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
42
|
|
14
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
4
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
233
|
|
16
|
|
|
|
|
|
|
our $VERSION = '0.24'; |
17
|
|
|
|
|
|
|
our $VERBOSITY ||= 0; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Methods |
20
|
|
|
|
|
|
|
#-------------------------- |
21
|
|
|
|
|
|
|
{ # make it all a bit more private |
22
|
|
|
|
|
|
|
my %_methods = (# these are methods & roots of our attribute names |
23
|
|
|
|
|
|
|
UA => undef, |
24
|
|
|
|
|
|
|
bell => undef, |
25
|
|
|
|
|
|
|
html_parser => undef, |
26
|
|
|
|
|
|
|
sleep_base => undef, |
27
|
|
|
|
|
|
|
cookie_file => undef, |
28
|
|
|
|
|
|
|
_exit_epoch => undef, |
29
|
|
|
|
|
|
|
_term_count => undef, |
30
|
|
|
|
|
|
|
); |
31
|
|
|
|
|
|
|
# Those may all get hardcoded eventually, but they're handy for now. |
32
|
|
|
|
|
|
|
#-------------------------- |
33
|
|
|
|
|
|
|
sub new { |
34
|
0
|
|
|
0
|
1
|
|
my ( $caller ) = shift; |
35
|
0
|
|
0
|
|
|
|
my $class = ref($caller) || $caller; |
36
|
0
|
|
|
|
|
|
my $self = bless {}, $class; |
37
|
|
|
|
|
|
|
|
38
|
0
|
|
|
|
|
|
my ( $seed, %arg ); |
39
|
0
|
0
|
|
|
|
|
if ( @_ == 1 ) { |
40
|
0
|
|
|
|
|
|
( $seed ) = @_; |
41
|
|
|
|
|
|
|
} |
42
|
0
|
|
|
|
|
|
%arg = ( broken_links => [], |
43
|
|
|
|
|
|
|
exit_on => undef, |
44
|
|
|
|
|
|
|
image_checking => 0, |
45
|
|
|
|
|
|
|
report_broken_links => 0, |
46
|
|
|
|
|
|
|
seed => undef, |
47
|
|
|
|
|
|
|
sleep => undef, |
48
|
|
|
|
|
|
|
sleep_base => 5, |
49
|
|
|
|
|
|
|
UA => undef |
50
|
|
|
|
|
|
|
); |
51
|
0
|
0
|
|
|
|
|
%arg = ( %arg, @_ ) unless @_ % 2; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Set our UA object if it was passed on to our constructor. |
54
|
0
|
0
|
|
|
|
|
$self->{UA} = $arg{UA} if $arg{UA}; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Turn on image checking if requested. img src tags will be checked if |
57
|
|
|
|
|
|
|
# image_checking is set to 1 in the constructor. |
58
|
0
|
0
|
|
|
|
|
$self->{image_checking} = $arg{image_checking} if $arg{image_checking}; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Turn on broken link checking if requested. Broken link URIs can be |
61
|
|
|
|
|
|
|
# obtained via get_broken_links(). |
62
|
0
|
0
|
|
|
|
|
$self->{report_broken_links} = $arg{report_broken_links} |
63
|
|
|
|
|
|
|
if $arg{report_broken_links}; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Install all our methods, either set once then get only or push/shift |
66
|
|
|
|
|
|
|
# array refs. |
67
|
0
|
|
|
|
|
|
for my $method ( %_methods ) { |
68
|
1
|
|
|
1
|
|
4
|
no strict "refs"; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
27
|
|
69
|
1
|
|
|
1
|
|
3
|
no warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1621
|
|
70
|
0
|
|
|
|
|
|
my $attribute = '_' . $method; |
71
|
|
|
|
|
|
|
|
72
|
0
|
0
|
|
|
|
|
if ( ref $_methods{$method} eq 'ARRAY' ) { |
73
|
0
|
|
|
|
|
|
*{"$class::$method"} = sub { |
74
|
0
|
|
|
0
|
|
|
my($self,@args) = @_; |
75
|
0
|
0
|
|
|
|
|
return shift(@{$self->{$attribute}}) unless @args; |
|
0
|
|
|
|
|
|
|
76
|
0
|
|
|
|
|
|
push(@{$self->{$attribute}}, @args); |
|
0
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
|
}; |
78
|
|
|
|
|
|
|
} else { |
79
|
0
|
|
|
|
|
|
*{"$class::$method"} = sub { |
80
|
0
|
|
|
0
|
|
|
my($self,$arg) = @_; |
81
|
0
|
0
|
0
|
|
|
|
carp "You cannot reset $method!" |
82
|
|
|
|
|
|
|
if $arg and exists $self->{$attribute}; |
83
|
0
|
0
|
|
|
|
|
return $self->{$attribute} #get if already set |
84
|
|
|
|
|
|
|
if exists $self->{$attribute}; |
85
|
0
|
|
|
|
|
|
$self->{$attribute} = $arg; #only set one time! |
86
|
0
|
|
|
|
|
|
}; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
0
|
|
0
|
|
|
|
$seed ||= $arg{seed}; |
91
|
0
|
0
|
|
|
|
|
$self->seed($seed) if $seed; |
92
|
0
|
|
|
|
|
|
$self->sleep_base($arg{sleep_base}); |
93
|
0
|
0
|
|
|
|
|
$self->_install_exit_check(\%arg) unless $self->can('_exit_check'); |
94
|
0
|
|
|
|
|
|
$self->_install_html_parser; |
95
|
0
|
|
|
|
|
|
$self->_install_web_agent; |
96
|
|
|
|
|
|
|
|
97
|
0
|
|
|
|
|
|
return $self; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
#-------------------------- |
100
|
|
|
|
|
|
|
sub terms { |
101
|
0
|
|
|
0
|
1
|
|
my ($self,@terms) = @_; |
102
|
0
|
0
|
0
|
|
|
|
if ( @terms and not exists $self->{_terms} ) { |
103
|
0
|
|
|
|
|
|
$self->_term_count(scalar @terms); # makes this set once op |
104
|
0
|
|
|
|
|
|
my %terms; |
105
|
0
|
|
|
|
|
|
$terms{$_} = qr/$_/ for @terms; |
106
|
0
|
|
|
|
|
|
$self->{_terms} = \%terms; |
107
|
|
|
|
|
|
|
} else { |
108
|
0
|
|
|
|
|
|
return $self->{_terms} |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
#-------------------------- |
112
|
|
|
|
|
|
|
sub show_attributes { |
113
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
114
|
0
|
|
|
|
|
|
return map {/^_(.+)$/} keys %{$self}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
#-------------------------- |
117
|
|
|
|
|
|
|
sub slept { |
118
|
0
|
|
|
0
|
1
|
|
my ($self, $time) = @_; |
119
|
0
|
0
|
|
|
|
|
$self->{_Slept} += $time if $time; |
120
|
0
|
0
|
|
|
|
|
return $self->{_Slept} unless $time; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
#-------------------------- |
123
|
|
|
|
|
|
|
sub seed { |
124
|
0
|
|
|
0
|
1
|
|
my ($self, $url) = @_; |
125
|
0
|
0
|
|
|
|
|
$url or croak "Must provide URL to seed()."; |
126
|
0
|
0
|
|
|
|
|
croak "You have passed something besides a plain URL to seed()!" |
127
|
|
|
|
|
|
|
if ref $url; |
128
|
0
|
|
|
|
|
|
$self->_stack_urls($url); |
129
|
0
|
|
|
|
|
|
return 1; # to the top of the stacks |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
#-------------------------- |
133
|
|
|
|
|
|
|
sub get_broken_links { |
134
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
|
return $self->{broken_links}; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
#-------------------------- |
140
|
|
|
|
|
|
|
sub crawl { |
141
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
142
|
0
|
|
0
|
|
|
|
my $opts = shift || undef; |
143
|
0
|
|
|
|
|
|
my $excludes = []; |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Exclude list option. |
146
|
0
|
0
|
|
|
|
|
if ( ref($opts->{exclude}) eq 'ARRAY' ) { |
147
|
0
|
|
|
|
|
|
$excludes = $opts->{exclude}; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
0
|
|
|
|
|
|
while ('I have pages to get...') { |
151
|
|
|
|
|
|
|
|
152
|
0
|
0
|
|
|
|
|
$self->_exit_check and return; |
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
|
my $skip_url = 0; |
155
|
0
|
|
|
|
|
|
my $enQ = undef; |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Report a page with a 404 error in the title if report_broken_links is |
158
|
|
|
|
|
|
|
# enabled. Also keep processing if we're looking for img src tags. |
159
|
0
|
0
|
0
|
|
|
|
if ($self->{report_broken_links} || $self->{image_checking}) { |
160
|
0
|
|
0
|
|
|
|
$enQ = $self->_choose_courteously || |
161
|
|
|
|
|
|
|
$self->_just_choose; |
162
|
|
|
|
|
|
|
} else { |
163
|
0
|
|
0
|
|
|
|
$enQ = $self->_choose_courteously || |
164
|
|
|
|
|
|
|
$self->_just_choose || |
165
|
|
|
|
|
|
|
return; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
|
my $url = $enQ->url; |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Skip this URL if it's in our excluded list. |
171
|
0
|
0
|
|
|
|
|
for (@$excludes) { $skip_url = 1 if $url =~ m/$_/; } |
|
0
|
|
|
|
|
|
|
172
|
0
|
0
|
|
|
|
|
next if $skip_url; |
173
|
|
|
|
|
|
|
|
174
|
0
|
|
|
|
|
|
$self->url($url); |
175
|
0
|
|
|
|
|
|
$self->_current_enQ($enQ); |
176
|
|
|
|
|
|
|
|
177
|
0
|
0
|
|
|
|
|
print "GET'ing: $url\n" if $VERBOSITY; |
178
|
|
|
|
|
|
|
|
179
|
0
|
|
|
|
|
|
my $response = $self->UA->request # no redirects &c is simple_ |
180
|
|
|
|
|
|
|
( HTTP::Request->new( GET => "$url" ) ); |
181
|
|
|
|
|
|
|
|
182
|
0
|
0
|
|
|
|
|
print STDERR "\a" if $self->bell; |
183
|
|
|
|
|
|
|
|
184
|
0
|
0
|
0
|
|
|
|
$response or |
185
|
|
|
|
|
|
|
carp "$url failed GET!" and next; |
186
|
|
|
|
|
|
|
|
187
|
0
|
|
|
|
|
|
push @{$self->{_courtesy_Queue}}, $enQ->domain; |
|
0
|
|
|
|
|
|
|
188
|
0
|
|
|
|
|
|
shift @{$self->{_courtesy_Queue}} |
|
0
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
if $self->{_courtesy_Queue} |
190
|
0
|
0
|
0
|
|
|
|
and @{$self->{_courtesy_Queue}} > 100; |
191
|
|
|
|
|
|
|
|
192
|
0
|
|
|
|
|
|
my $head = $response->headers_as_string; |
193
|
0
|
0
|
0
|
|
|
|
$head or |
194
|
|
|
|
|
|
|
carp "$url has no HEAD!" and |
195
|
|
|
|
|
|
|
next; # no headless webpages |
196
|
|
|
|
|
|
|
|
197
|
0
|
0
|
|
|
|
|
length($head) > 1_024 and $head = substr($head,0,1_024); |
198
|
|
|
|
|
|
|
|
199
|
0
|
0
|
|
|
|
|
print $head, "\n" if $VERBOSITY > 2; |
200
|
|
|
|
|
|
|
|
201
|
0
|
|
|
|
|
|
my $base; |
202
|
0
|
|
|
|
|
|
eval { $base = $response->base }; |
|
0
|
|
|
|
|
|
|
203
|
0
|
0
|
0
|
|
|
|
$base or |
204
|
|
|
|
|
|
|
carp "$url has no discernible BASE!" and |
205
|
|
|
|
|
|
|
next; # no baseless webpages |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# WE SHOULD also look for because some servers that we might want |
208
|
|
|
|
|
|
|
# to look at don't properly report the content-type |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# start over unless this is something we can read |
211
|
0
|
|
|
|
|
|
my $title = ''; |
212
|
0
|
|
|
|
|
|
my $description = ''; |
213
|
0
|
|
|
|
|
|
my $is_image = 0; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Make an exception for images. |
216
|
0
|
0
|
|
|
|
|
if ($self->{image_checking}) { |
217
|
0
|
0
|
|
|
|
|
if ($head =~ /Content\-Type:\s*image/i) { |
218
|
0
|
|
|
|
|
|
my ($img_size) = $head =~ /Content\-Length:\s*(\d+)/i; |
219
|
|
|
|
|
|
|
|
220
|
0
|
0
|
|
|
|
|
if ($img_size <= 0) { |
221
|
0
|
|
|
|
|
|
$title = $description = '404 Not Found'; |
222
|
0
|
|
|
|
|
|
next; |
223
|
|
|
|
|
|
|
} else { |
224
|
0
|
|
|
|
|
|
$is_image = 1; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
} else { |
228
|
0
|
0
|
0
|
|
|
|
lc($head) =~ /content-type:\s?(?:text|html)/ or |
229
|
|
|
|
|
|
|
carp "$url doesn't look like TEXT or HTML!" and |
230
|
|
|
|
|
|
|
next; # no weird media, movies, flash, etc |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
0
|
0
|
|
|
|
|
( $title ) = $head =~ m,[Tt]itle:\s*(.+)\n, |
234
|
|
|
|
|
|
|
unless $title; |
235
|
|
|
|
|
|
|
|
236
|
0
|
0
|
|
|
|
|
( $description ) = $head =~ |
237
|
|
|
|
|
|
|
/[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i |
238
|
|
|
|
|
|
|
unless $description; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Add this link to our dead links list if the title matches |
241
|
|
|
|
|
|
|
# a standard "404 Not Found" error. |
242
|
0
|
0
|
0
|
|
|
|
if ($title && $self->{report_broken_links}) { |
243
|
0
|
0
|
|
|
|
|
push(@{ $self->{broken_links} }, $url) |
|
0
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
if $title =~ /^\s*404\s+Not\s+Found\s*$/; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
0
|
0
|
|
|
|
|
$description = $self->_snip($description) if $description; |
248
|
|
|
|
|
|
|
|
249
|
0
|
0
|
0
|
|
|
|
my $page = $response->content or |
250
|
|
|
|
|
|
|
carp "Failed to fetch $url." and |
251
|
|
|
|
|
|
|
next; # no empty pages, start over with next url |
252
|
|
|
|
|
|
|
|
253
|
0
|
|
|
|
|
|
$self->{_current_Bytes} = length($page); |
254
|
0
|
|
|
|
|
|
$self->spyder_data($self->{_current_Bytes}); |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# we are going to use a digest to prevent parsing the identical |
257
|
|
|
|
|
|
|
# content received via a different url |
258
|
0
|
|
|
|
|
|
my $digest = md5_base64($page); # unique microtag of the page |
259
|
|
|
|
|
|
|
# so if we've seen it before, start over with the next URL |
260
|
0
|
0
|
0
|
|
|
|
$self->{_page_Memory}{$digest}++ and |
261
|
|
|
|
|
|
|
carp "Seen this page's content before: $url" |
262
|
|
|
|
|
|
|
and next; |
263
|
|
|
|
|
|
|
|
264
|
0
|
|
|
|
|
|
$self->{_page_content} = $page; |
265
|
0
|
0
|
|
|
|
|
print "PARSING: $url\n" if $VERBOSITY > 1; |
266
|
0
|
|
|
|
|
|
$self->{_spydered}{$url}++; |
267
|
0
|
|
|
|
|
|
$self->html_parser->parse($page); |
268
|
0
|
|
|
|
|
|
$self->html_parser->eof; |
269
|
|
|
|
|
|
|
|
270
|
0
|
0
|
|
|
|
|
$self->{_adjustment} = $self->_parse_for_terms if $self->terms; |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# make links absolute and fix bad spacing in link names, then turn |
273
|
|
|
|
|
|
|
# them into an Enqueue object |
274
|
0
|
|
|
|
|
|
for my $pair ( @{$self->{_enqueue_Objects}} ) { |
|
0
|
|
|
|
|
|
|
275
|
0
|
|
|
|
|
|
my $url; |
276
|
0
|
|
|
|
|
|
eval { |
277
|
0
|
|
|
|
|
|
$url = URI::URL::url($pair->[0], $base)->abs; |
278
|
|
|
|
|
|
|
}; |
279
|
0
|
|
|
|
|
|
my $name = _snip($pair->[1]); |
280
|
0
|
|
|
|
|
|
my $item = WWW::Spyder::Enqueue->new("$url",$name); |
281
|
0
|
|
|
|
|
|
$pair = $item; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
# put links into the queue(s) |
284
|
0
|
0
|
|
|
|
|
$self->_stack_urls() if $self->_links; |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# clean up text a bit. should this be here...? |
287
|
0
|
0
|
0
|
|
|
|
if ( $self->{_text} and ${$self->{_text}} ) { |
|
0
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
|
${$self->{_text}} =~ s/(?:\s*[\r\n]){3,}/\n\n/g; |
|
0
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# in the future Page object should be installed like parsers as a |
292
|
|
|
|
|
|
|
# reusable container |
293
|
|
|
|
|
|
|
# return |
294
|
0
|
|
0
|
|
|
|
my $Page = |
295
|
|
|
|
|
|
|
WWW::Spyder::Page->new( |
296
|
|
|
|
|
|
|
title => $title, |
297
|
|
|
|
|
|
|
text => $self->{_text}, |
298
|
|
|
|
|
|
|
raw => \$page, |
299
|
|
|
|
|
|
|
url => $enQ->url, |
300
|
|
|
|
|
|
|
domain => $enQ->domain, |
301
|
|
|
|
|
|
|
link_name => undef, |
302
|
|
|
|
|
|
|
link => undef, |
303
|
|
|
|
|
|
|
description => $description || '', |
304
|
|
|
|
|
|
|
pages_enQs => $self->_enqueue, |
305
|
|
|
|
|
|
|
); |
306
|
0
|
|
|
|
|
|
$self->_reset; #<<--clear out things that might remain |
307
|
0
|
|
|
|
|
|
return $Page; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
#-------------------------- |
311
|
|
|
|
|
|
|
sub _stack_urls { # should eventually be broken into stack and sift? |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# dual purpose, w/ terms it filters as long as there are no urls |
314
|
|
|
|
|
|
|
# passed, otherwise it's setting them to the top of the queues |
315
|
0
|
|
|
0
|
|
|
my ($self, @urls) = @_; |
316
|
|
|
|
|
|
|
|
317
|
0
|
0
|
0
|
|
|
|
print "Stacking " . join(', ', @urls) . "\n" |
318
|
|
|
|
|
|
|
if @urls and $VERBOSITY > 5; |
319
|
|
|
|
|
|
|
|
320
|
0
|
0
|
0
|
|
|
|
if ( $self->terms and not @urls ) { |
|
|
0
|
|
|
|
|
|
321
|
1
|
|
|
1
|
|
6
|
no warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1351
|
|
322
|
0
|
|
|
|
|
|
my @Qs = $self->_queues; |
323
|
0
|
|
|
|
|
|
for my $enQ ( @{$self->_enqueue} ) { |
|
0
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
my ( $url, $name ) = ( $enQ->url, $enQ->name ); |
325
|
|
|
|
|
|
|
|
326
|
0
|
0
|
|
|
|
|
next if $self->_seen($url); |
327
|
|
|
|
|
|
|
|
328
|
0
|
|
|
|
|
|
my $match = 0; |
329
|
0
|
|
|
|
|
|
while ( my ($term,$rx) = each %{$self->terms} ) { |
|
0
|
|
|
|
|
|
|
330
|
0
|
|
|
|
|
|
$match++ for $name =~ /$rx/g; |
331
|
|
|
|
|
|
|
} |
332
|
0
|
|
|
|
|
|
my $baseQ = 10; |
333
|
0
|
|
|
|
|
|
my $adjustment = $self->{_adjustment}; |
334
|
0
|
|
|
|
|
|
$baseQ -= $adjustment; # 4 to 0 |
335
|
|
|
|
|
|
|
|
336
|
0
|
0
|
0
|
|
|
|
push @{$self->{$baseQ}}, $enQ |
|
0
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
and next unless $match; |
338
|
|
|
|
|
|
|
|
339
|
0
|
0
|
|
|
|
|
if ( $VERBOSITY > 1 ) { |
340
|
0
|
|
|
|
|
|
print "NAME: $name\n"; |
341
|
0
|
|
|
|
|
|
printf " RATIO -->> %d\n", $match; |
342
|
|
|
|
|
|
|
} |
343
|
0
|
|
|
|
|
|
my $queue_index = sprintf "%d", |
344
|
|
|
|
|
|
|
$self->_term_count / $match; |
345
|
|
|
|
|
|
|
|
346
|
0
|
|
|
|
|
|
$queue_index -= $adjustment; |
347
|
0
|
0
|
|
|
|
|
$queue_index = 4 if $queue_index > 4; |
348
|
0
|
0
|
|
|
|
|
$queue_index = 0 if $queue_index < 0; |
349
|
0
|
|
|
|
|
|
my $queue = $Qs[$queue_index]; |
350
|
0
|
0
|
|
|
|
|
if ($VERBOSITY > 2) { |
351
|
0
|
|
|
|
|
|
print "Q:$queue [$queue_index] match: $match terms:", |
352
|
|
|
|
|
|
|
$self->_term_count, " Adjust: $adjustment\n\n"; |
353
|
|
|
|
|
|
|
} |
354
|
0
|
|
|
|
|
|
push @{$self->{$queue}}, $enQ; |
|
0
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
} elsif ( @urls > 0 ) { |
357
|
0
|
|
|
|
|
|
for my $url ( @urls ) { |
358
|
0
|
0
|
|
|
|
|
next if $self->_seen($url); |
359
|
0
|
|
|
|
|
|
my $queue = $self->_queues; |
360
|
0
|
0
|
|
|
|
|
carp "Placing $url in '$queue'\n" if $VERBOSITY > 2; |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
# unshift because seeding is priority |
363
|
0
|
|
|
|
|
|
unshift @{$self->{$queue}}, |
|
0
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
WWW::Spyder::Enqueue->new($url,undef); |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
} else { |
367
|
0
|
|
|
|
|
|
for my $enQ ( @{$self->_enqueue} ) { |
|
0
|
|
|
|
|
|
|
368
|
0
|
|
|
|
|
|
my ( $url, $name ) = ( $enQ->url, $enQ->name ); |
369
|
0
|
0
|
|
|
|
|
next if $self->_seen($url); |
370
|
0
|
|
|
|
|
|
my $queue = $self->_queues; |
371
|
0
|
|
|
|
|
|
push @{$self->{$queue}}, $enQ; |
|
0
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
#-------------------------- |
376
|
|
|
|
|
|
|
sub queue_count { |
377
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
378
|
0
|
|
|
|
|
|
my $count = 0; |
379
|
0
|
|
|
|
|
|
for my $Q ( $self->_queues ) { |
380
|
0
|
0
|
|
|
|
|
next unless ref($self->{$Q}) eq 'ARRAY'; |
381
|
0
|
|
|
|
|
|
$count += scalar @{$self->{$Q}}; |
|
0
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
} |
383
|
0
|
|
|
|
|
|
return $count; |
384
|
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
|
#-------------------------- |
386
|
|
|
|
|
|
|
sub spyder_time { |
387
|
0
|
|
|
0
|
1
|
|
my ($self,$raw) = @_; |
388
|
|
|
|
|
|
|
|
389
|
0
|
|
|
|
|
|
my $time = time() - $^T; |
390
|
0
|
0
|
|
|
|
|
return $time if $raw; |
391
|
|
|
|
|
|
|
|
392
|
0
|
|
|
|
|
|
my $day = int( $time / 86400 ); |
393
|
0
|
|
|
|
|
|
my $hour = int( $time / 3600 ) % 24; |
394
|
0
|
|
|
|
|
|
my $min = int( $time / 60 ) % 60; |
395
|
0
|
|
|
|
|
|
my $sec = $time % 60; |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
# also collect slept time! |
398
|
0
|
0
|
|
|
|
|
return sprintf "%d day%s %02d:%02d:%02d", |
399
|
|
|
|
|
|
|
$day, $day == 1?'':'s', $hour, $min, $sec; |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
#-------------------------- |
402
|
|
|
|
|
|
|
sub spyder_data { |
403
|
0
|
|
|
0
|
1
|
|
my ($self, $bytes) = @_; |
404
|
0
|
0
|
0
|
|
|
|
$self->{_bytes_GOT} += $bytes and return $bytes if $bytes; |
405
|
|
|
|
|
|
|
|
406
|
0
|
0
|
|
|
|
|
return 0 unless $self->{_bytes_GOT}; |
407
|
|
|
|
|
|
|
|
408
|
0
|
|
|
|
|
|
my $for_commas = int($self->{_bytes_GOT} / 1_024); |
409
|
|
|
|
|
|
|
|
410
|
0
|
|
|
|
|
|
for ( $for_commas ) { |
411
|
0
|
|
|
|
|
|
1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/; |
412
|
|
|
|
|
|
|
} |
413
|
0
|
|
|
|
|
|
return $for_commas; |
414
|
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
|
#-------------------------- |
416
|
|
|
|
|
|
|
sub spydered { |
417
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
418
|
|
|
|
|
|
|
return wantarray ? |
419
|
0
|
|
|
|
|
|
keys %{ $self->{_spydered} } : |
|
0
|
|
|
|
|
|
|
420
|
0
|
0
|
|
|
|
|
scalar keys %{ $self->{_spydered} }; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
#-------------------------- |
423
|
|
|
|
|
|
|
#sub exclude { # what about FILES TYPES!? |
424
|
|
|
|
|
|
|
# return undef; # not working yet! |
425
|
|
|
|
|
|
|
# my ($self,$thing) = @_; |
426
|
|
|
|
|
|
|
# if ( $thing =~ m<^[^:]{3,5}://> ) |
427
|
|
|
|
|
|
|
# { |
428
|
|
|
|
|
|
|
# return $self->{_Xklood}{_domain}{$thing}++; |
429
|
|
|
|
|
|
|
# } |
430
|
|
|
|
|
|
|
# elsif ( $thing ) |
431
|
|
|
|
|
|
|
# { |
432
|
|
|
|
|
|
|
# return $self->{_Xklood}{_name}{$thing}++; |
433
|
|
|
|
|
|
|
# } |
434
|
|
|
|
|
|
|
#} |
435
|
|
|
|
|
|
|
#-------------------------- |
436
|
|
|
|
|
|
|
#sub excluded_domains { |
437
|
|
|
|
|
|
|
# return undef; # not working yet! |
438
|
|
|
|
|
|
|
# my ($self) = @_; |
439
|
|
|
|
|
|
|
# return wantarray ? |
440
|
|
|
|
|
|
|
# keys %{$self->{_Xklood}{_domain}} : |
441
|
|
|
|
|
|
|
# [ keys %{$self->{_Xklood}{_domain}} ]; |
442
|
|
|
|
|
|
|
#} |
443
|
|
|
|
|
|
|
#-------------------------- |
444
|
|
|
|
|
|
|
#sub excluded_names { |
445
|
|
|
|
|
|
|
# return undef; # not working yet! |
446
|
|
|
|
|
|
|
# my ($self) = @_; |
447
|
|
|
|
|
|
|
# return wantarray ? |
448
|
|
|
|
|
|
|
# keys %{$self->{_Xklood}{_name}} : |
449
|
|
|
|
|
|
|
# [ keys %{$self->{_Xklood}{_name}} ]; |
450
|
|
|
|
|
|
|
#} |
451
|
|
|
|
|
|
|
#-------------------------- |
452
|
|
|
|
|
|
|
sub go_to_seed { |
453
|
0
|
|
|
0
|
1
|
|
my ( $self, $engine, $query ) = @_; |
454
|
0
|
|
|
|
|
|
carp "go_to_seed() is not functional yet!\n"; |
455
|
0
|
|
|
|
|
|
return; # NOT FUNCTIONAL |
456
|
0
|
|
|
|
|
|
my $seed = WWW::Spyder::Seed::get_seed($engine, $query); |
457
|
0
|
|
|
|
|
|
$self->seed($seed); |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
#-------------------------- |
460
|
|
|
|
|
|
|
sub verbosity { |
461
|
0
|
|
|
0
|
1
|
|
my ( $self, $verbosity ) = @_; |
462
|
0
|
0
|
0
|
|
|
|
carp "Not setting verbosity! Must be integer b/t 1 & 6!\n" |
463
|
|
|
|
|
|
|
and return |
464
|
|
|
|
|
|
|
unless $verbosity; |
465
|
0
|
|
|
|
|
|
$VERBOSITY = $verbosity; |
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
#-------------------------- |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
#-------------------------- |
470
|
|
|
|
|
|
|
# PRIVATE Spyder Methods |
471
|
|
|
|
|
|
|
#-------------------------- |
472
|
|
|
|
|
|
|
sub _reset { |
473
|
|
|
|
|
|
|
# RESET MORE THAN THIS!?! make sure all the memory space is clean that |
474
|
|
|
|
|
|
|
# needs be for clean iteration??? |
475
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
476
|
0
|
|
|
|
|
|
$self->{$_} = undef for qw( _linkText _linkSwitch _href _src |
477
|
|
|
|
|
|
|
_current_enQ _page_content |
478
|
|
|
|
|
|
|
_current_Bytes _alt _enqueue_Objects |
479
|
|
|
|
|
|
|
_text ); |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
#-------------------------- |
482
|
|
|
|
|
|
|
sub _current_enQ { |
483
|
0
|
|
|
0
|
|
|
my ($self, $enQ) = @_; |
484
|
0
|
|
|
|
|
|
my $last_enQ = $self->{_current_enQ}; |
485
|
0
|
0
|
|
|
|
|
$self->{_current_enQ} = $enQ if $enQ; |
486
|
0
|
|
|
|
|
|
return $last_enQ; #<<-so we can get last while setting a new one |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
#-------------------------- |
489
|
|
|
|
|
|
|
sub _enqueue { |
490
|
0
|
|
|
0
|
|
|
my ($self,$enQ) = @_; |
491
|
0
|
0
|
|
|
|
|
push @{$self->{_enqueue_Objects}}, $enQ if $enQ; |
|
0
|
|
|
|
|
|
|
492
|
0
|
|
|
|
|
|
return $self->{_enqueue_Objects}; |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
#-------------------------- |
495
|
|
|
|
|
|
|
sub _links { |
496
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
497
|
0
|
|
|
|
|
|
return [ map { $_->url } @{$self->_enqueue} ]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
#-------------------------- |
500
|
|
|
|
|
|
|
sub _seen { |
501
|
0
|
|
|
0
|
|
|
my ($self,$url) = @_; |
502
|
0
|
|
|
|
|
|
return $self->{_seenURLs}{$url}++; |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
#-------------------------- |
505
|
|
|
|
|
|
|
sub _parse_for_terms { |
506
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
507
|
0
|
|
|
|
|
|
$self->{_page_terms_matches} = 0; |
508
|
|
|
|
|
|
|
|
509
|
0
|
0
|
|
|
|
|
return 0 unless $self->{_text}; |
510
|
|
|
|
|
|
|
|
511
|
0
|
|
|
|
|
|
while ( my ($term,$rx) = each %{$self->terms} ) { |
|
0
|
|
|
|
|
|
|
512
|
0
|
|
|
|
|
|
$self->{_page_terms_matches}++ for |
513
|
|
|
|
|
|
|
$self->{_page_content} =~ /$rx/g; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
|
my $index = int( ( $self->{_page_terms_matches} / |
517
|
|
|
|
|
|
|
length($self->{_text}) ) * 1_000 ); |
518
|
|
|
|
|
|
|
# the algorithm might look it but isn't entirely arbitrary |
519
|
|
|
|
|
|
|
|
520
|
0
|
0
|
|
|
|
|
print " PARSE TERMS : $self->{_page_terms_matches} " . |
521
|
|
|
|
|
|
|
"/ $self->{_current_Bytes}\n" if $VERBOSITY > 1; |
522
|
|
|
|
|
|
|
|
523
|
0
|
0
|
|
|
|
|
return 7 if $index > 25; |
524
|
0
|
0
|
|
|
|
|
return 6 if $index > 18; |
525
|
0
|
0
|
|
|
|
|
return 5 if $index > 14; |
526
|
0
|
0
|
|
|
|
|
return 4 if $index > 11; |
527
|
0
|
0
|
|
|
|
|
return 3 if $index > 7; |
528
|
0
|
0
|
|
|
|
|
return 2 if $index > 3; |
529
|
0
|
0
|
|
|
|
|
return 1 if $index > 0; |
530
|
0
|
|
|
|
|
|
return 0; |
531
|
|
|
|
|
|
|
} |
532
|
|
|
|
|
|
|
#-------------------------- |
533
|
|
|
|
|
|
|
sub _install_html_parser { |
534
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
my $Parser = HTML::Parser->new |
537
|
|
|
|
|
|
|
( |
538
|
|
|
|
|
|
|
start_h => |
539
|
|
|
|
|
|
|
[sub { |
540
|
1
|
|
|
1
|
|
7
|
no warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
508
|
|
541
|
0
|
|
|
0
|
|
|
my ( $tag, $attr ) = @_; |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# Check for broken image links if requested. |
544
|
0
|
0
|
0
|
|
|
|
return if $tag !~ /^(?:a|img)$/ && ! $self->{image_checking}; |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
# need to deal with AREA tags from maps /^(?:a(?:rea)?|img)$/; |
547
|
0
|
|
|
|
|
|
$attr->{href} =~ s,#[^/]*$,,; |
548
|
0
|
0
|
|
|
|
|
$attr->{src} =~ s,#[^/]*$,, if $self->{image_checking}; |
549
|
0
|
0
|
|
|
|
|
return if lc($attr->{href}) =~ m,^\s*mailto:,; |
550
|
0
|
0
|
|
|
|
|
return if lc($attr->{href}) =~ m,^\s*file:,; |
551
|
0
|
0
|
|
|
|
|
return if lc($attr->{href}) =~ m,javascript:,; |
552
|
|
|
|
|
|
|
|
553
|
0
|
0
|
0
|
|
|
|
$self->{_src} ||= $attr->{src} if $self->{image_checking}; |
554
|
0
|
|
0
|
|
|
|
$self->{_href} ||= $attr->{href}; |
555
|
0
|
|
0
|
|
|
|
$self->{_alt} ||= $attr->{alt}; |
556
|
0
|
|
|
|
|
|
$self->{_linkSwitch} = 1; |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# Don't wait for the end handler if we have an image, as an image |
559
|
|
|
|
|
|
|
# src tag doesn't have an end. |
560
|
0
|
0
|
0
|
|
|
|
if ($attr->{src} && $self->{image_checking} && ! $attr->{href}) { |
|
|
|
0
|
|
|
|
|
561
|
0
|
|
0
|
|
|
|
$self->{_linkText} ||= $self->{_alt} || '+'; |
|
|
|
0
|
|
|
|
|
562
|
0
|
|
|
|
|
|
decode_entities($self->{_linkText}); |
563
|
|
|
|
|
|
|
|
564
|
0
|
|
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
0
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
[ $self->{_href}, $self->{_linkText} ]; |
566
|
|
|
|
|
|
|
|
567
|
0
|
0
|
0
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
0
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
[ $self->{_src}, $self->{_linkText} ] |
569
|
|
|
|
|
|
|
if $self->{_src} and $self->{image_checking}; |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# reset all our caching variables |
572
|
0
|
|
|
|
|
|
$self->{_linkSwitch} = $self->{_href} = $self->{_alt} = |
573
|
|
|
|
|
|
|
$self->{_src} = $self->{_linkText} = undef; |
574
|
|
|
|
|
|
|
|
575
|
0
|
|
|
|
|
|
return; |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
}, 'tagname, attr'], |
578
|
|
|
|
|
|
|
text_h => |
579
|
|
|
|
|
|
|
[sub { |
580
|
|
|
|
|
|
|
|
581
|
0
|
0
|
|
0
|
|
|
return unless(my $it = shift); |
582
|
0
|
0
|
|
|
|
|
return if $it =~ |
583
|
|
|
|
|
|
|
m/(?:\Q\E)/; |
584
|
0
|
|
|
|
|
|
${$self->{_text}} .= $it; |
|
0
|
|
|
|
|
|
|
585
|
0
|
0
|
|
|
|
|
$self->{_linkText} .= $it |
586
|
|
|
|
|
|
|
if $self->{_linkSwitch}; |
587
|
|
|
|
|
|
|
}, 'dtext'], |
588
|
|
|
|
|
|
|
end_h => |
589
|
|
|
|
|
|
|
[sub { |
590
|
0
|
|
|
0
|
|
|
my ( $tag ) = @_; |
591
|
1
|
|
|
1
|
|
13
|
no warnings; # only problem: Links |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
435
|
|
592
|
|
|
|
|
|
|
|
593
|
0
|
0
|
|
|
|
|
if ($self->{image_checking}) { |
594
|
0
|
0
|
0
|
|
|
|
return unless $tag eq 'a' or $self->{_linkSwitch} or |
|
|
|
0
|
|
|
|
|
595
|
|
|
|
|
|
|
$tag eq 'img'; |
596
|
|
|
|
|
|
|
} else { |
597
|
0
|
0
|
0
|
|
|
|
return unless $tag eq 'a' or $self->{_linkSwitch}; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
|
600
|
0
|
|
0
|
|
|
|
$self->{_linkText} ||= $self->{_alt} || '+'; |
|
|
|
0
|
|
|
|
|
601
|
0
|
|
|
|
|
|
decode_entities($self->{_linkText}); |
602
|
|
|
|
|
|
|
|
603
|
0
|
|
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
0
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
[ $self->{_href}, $self->{_linkText} ]; |
605
|
|
|
|
|
|
|
|
606
|
0
|
0
|
0
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
0
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
[ $self->{_src}, $self->{_linkText} ] |
608
|
|
|
|
|
|
|
if $self->{_src} and $self->{image_checking}; |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
# reset all our caching variables |
611
|
0
|
|
|
|
|
|
$self->{_linkSwitch} = $self->{_href} = $self->{_alt} = $self->{_src} = |
612
|
|
|
|
|
|
|
$self->{_linkText} = undef; |
613
|
0
|
|
|
|
|
|
}, 'tagname'], |
614
|
|
|
|
|
|
|
default_h => [""], |
615
|
|
|
|
|
|
|
); |
616
|
0
|
|
|
|
|
|
$Parser->ignore_elements(qw(script style)); |
617
|
0
|
|
|
|
|
|
$Parser->unbroken_text(1); |
618
|
0
|
|
|
|
|
|
$self->html_parser($Parser); |
619
|
|
|
|
|
|
|
} |
620
|
|
|
|
|
|
|
#-------------------------- |
621
|
|
|
|
|
|
|
sub _install_web_agent { |
622
|
0
|
|
|
0
|
|
|
my $self = shift; |
623
|
0
|
|
|
|
|
|
my $jar_jar = undef; |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
# If a LWP::UserAgent object was passed in to our constructor, use |
626
|
|
|
|
|
|
|
# it. |
627
|
0
|
0
|
|
|
|
|
if ($self->{UA}) { |
628
|
0
|
|
|
|
|
|
$self->UA( $self->{UA} ); |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# Otherwise, create a new one. |
631
|
|
|
|
|
|
|
} else { |
632
|
0
|
|
|
|
|
|
$self->UA( LWP::UserAgent->new ); |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
|
635
|
0
|
|
|
|
|
|
$self->UA->agent('Mozilla/5.0'); |
636
|
0
|
|
|
|
|
|
$self->UA->timeout(30); |
637
|
0
|
|
|
|
|
|
$self->UA->max_size(250_000); |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
# Get our cookie from our the jar passed in. |
640
|
0
|
0
|
|
|
|
|
if ($self->{UA}) { |
641
|
0
|
|
|
|
|
|
$jar_jar = $self->{UA}->cookie_jar(); |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
# Or else create a new cookie. |
644
|
|
|
|
|
|
|
} else { |
645
|
0
|
|
0
|
|
|
|
$jar_jar = HTTP::Cookies->new |
646
|
|
|
|
|
|
|
(file => $self->cookie_file || "$ENV{HOME}/spyderCookies", |
647
|
|
|
|
|
|
|
autosave => 1, |
648
|
|
|
|
|
|
|
max_cookie_size => 4096, |
649
|
|
|
|
|
|
|
max_cookies_per_domain => 5, ); |
650
|
|
|
|
|
|
|
} |
651
|
|
|
|
|
|
|
|
652
|
0
|
|
|
|
|
|
$self->UA->cookie_jar($jar_jar); |
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
#-------------------------- |
655
|
|
|
|
|
|
|
sub _install_exit_check { |
656
|
0
|
|
|
0
|
|
|
my ($self, $arg) = @_; |
657
|
0
|
|
|
|
|
|
my $class = ref $self; |
658
|
|
|
|
|
|
|
|
659
|
0
|
0
|
0
|
|
|
|
unless ( ref($arg) and ref($arg->{exit_on}) eq 'HASH' ) { |
660
|
1
|
|
|
1
|
|
6
|
no strict "refs"; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
310
|
|
661
|
0
|
|
|
|
|
|
*{$class."::_exit_check"} = |
662
|
0
|
0
|
|
0
|
|
|
sub { return 1 unless $self->queue_count; |
663
|
0
|
|
|
|
|
|
return 0; |
664
|
0
|
|
|
|
|
|
}; |
665
|
0
|
|
|
|
|
|
return; |
666
|
|
|
|
|
|
|
} |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
# checks can be: links => #, success => ratio, time => 10min... |
669
|
|
|
|
|
|
|
# a piece of code we're going to build up to eval into method-hood |
670
|
0
|
|
|
|
|
|
my $SUB = 'sub { my $self = shift; ' . |
671
|
|
|
|
|
|
|
'return 1 unless $self->queue_count; '; |
672
|
|
|
|
|
|
|
#------------------------------------------------------------ |
673
|
0
|
0
|
|
|
|
|
if ( $arg->{exit_on}{pages} ) { |
674
|
0
|
0
|
|
|
|
|
print "Installing EXIT on links: $arg->{exit_on}{pages}\n" |
675
|
|
|
|
|
|
|
if $VERBOSITY > 1; |
676
|
0
|
|
|
|
|
|
$SUB .= ' return 1 if ' . |
677
|
|
|
|
|
|
|
'$self->spydered >= ' .$arg->{exit_on}{pages} .';'; |
678
|
|
|
|
|
|
|
} |
679
|
|
|
|
|
|
|
#------------------------------------------------------------ |
680
|
0
|
0
|
|
|
|
|
if ( $arg->{exit_on}{success} ) { |
681
|
|
|
|
|
|
|
#set necessary obj value and add to sub code |
682
|
|
|
|
|
|
|
} |
683
|
|
|
|
|
|
|
#------------------------------------------------------------ |
684
|
0
|
0
|
|
|
|
|
if ( $arg->{exit_on}{time} ) { |
685
|
0
|
0
|
|
|
|
|
print "Installing EXIT on time: $arg->{exit_on}{time}\n" |
686
|
|
|
|
|
|
|
if $VERBOSITY > 1; |
687
|
|
|
|
|
|
|
|
688
|
0
|
|
|
|
|
|
my ($amount,$unit) = |
689
|
|
|
|
|
|
|
$arg->{exit_on}{time} =~ /^(\d+)\W*(\w+?)s?$/; |
690
|
|
|
|
|
|
|
# skip final "s" in case of hours, secs, mins |
691
|
|
|
|
|
|
|
|
692
|
0
|
|
|
|
|
|
my %times = ( hour => 3600, |
693
|
|
|
|
|
|
|
min => 60, |
694
|
|
|
|
|
|
|
sec => 1 ); |
695
|
|
|
|
|
|
|
|
696
|
0
|
|
|
|
|
|
my $time_factor = 0; |
697
|
0
|
|
|
|
|
|
for ( keys %times ) { |
698
|
0
|
0
|
|
|
|
|
next unless exists $times{$unit}; |
699
|
0
|
|
|
|
|
|
$time_factor = $amount * $times{$unit}; |
700
|
|
|
|
|
|
|
} |
701
|
0
|
|
|
|
|
|
$self->_exit_epoch($time_factor + $^T); |
702
|
|
|
|
|
|
|
|
703
|
0
|
|
|
|
|
|
$SUB .= q{ |
704
|
|
|
|
|
|
|
return 1 if $self->_exit_epoch < time(); |
705
|
|
|
|
|
|
|
}; |
706
|
|
|
|
|
|
|
} |
707
|
|
|
|
|
|
|
#------------------------------------------------------------ |
708
|
0
|
|
|
|
|
|
$SUB .= '}'; |
709
|
|
|
|
|
|
|
|
710
|
1
|
|
|
1
|
|
4
|
no strict "refs"; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
748
|
|
711
|
0
|
|
|
|
|
|
*{$class."::_exit_check"} = eval $SUB; |
|
0
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
#-------------------------- |
714
|
|
|
|
|
|
|
sub _choose_courteously { |
715
|
0
|
|
|
0
|
|
|
my $self = shift; |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
# w/o the switch and $i-- it acts a bit more depth first. w/ it, it's |
718
|
|
|
|
|
|
|
# basically hard head down breadth first |
719
|
0
|
0
|
|
|
|
|
print "CHOOSING courteously!\n" if $VERBOSITY > 1; |
720
|
0
|
|
|
|
|
|
for my $Q ( $self->_queues ) { |
721
|
0
|
0
|
|
|
|
|
print "Looking for URL in $Q\n" if $VERBOSITY > 2; |
722
|
0
|
0
|
0
|
|
|
|
next unless $self->{$Q} and @{$self->{$Q}} > 0; |
|
0
|
|
|
|
|
|
|
723
|
0
|
|
|
|
|
|
my %seen; |
724
|
0
|
|
|
|
|
|
my $total = scalar @{$self->{$Q}}; |
|
0
|
|
|
|
|
|
|
725
|
0
|
|
|
|
|
|
my $switch; |
726
|
0
|
|
|
|
|
|
for ( my $i = 0; $i < @{$self->{$Q}}; $i++ ) { |
|
0
|
|
|
|
|
|
|
727
|
0
|
|
|
|
|
|
my $enQ = $self->{$Q}[$i]; |
728
|
0
|
|
|
|
|
|
my ($url,$name) = ( $enQ->url, $enQ->name ); |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# if we see one again, we've reshuffled as much as is useful |
731
|
0
|
0
|
|
|
|
|
$seen{$url}++ |
732
|
|
|
|
|
|
|
and $switch = 1; # progress through to next Q |
733
|
|
|
|
|
|
|
|
734
|
0
|
0
|
|
|
|
|
return splice(@{$self->{$Q}},$i,1) |
|
0
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
unless $self->_courtesy_call($enQ); |
736
|
|
|
|
|
|
|
|
737
|
0
|
|
|
|
|
|
my $fair_bump = int( log( $total - $i ) / log(1.5) ); |
738
|
|
|
|
|
|
|
|
739
|
0
|
|
|
|
|
|
my $move_me_back = splice(@{$self->{$Q}},$i,1); |
|
0
|
|
|
|
|
|
|
740
|
0
|
|
|
|
|
|
splice(@{$self->{$Q}},($i+$fair_bump),0,$move_me_back); |
|
0
|
|
|
|
|
|
|
741
|
0
|
0
|
|
|
|
|
$i-- unless $switch; |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
} |
744
|
|
|
|
|
|
|
# we couldn't pick one courteously |
745
|
|
|
|
|
|
|
} # end of _choose_courteously() |
746
|
|
|
|
|
|
|
#-------------------------- |
747
|
|
|
|
|
|
|
sub _just_choose { |
748
|
0
|
|
|
0
|
|
|
my $self = shift; |
749
|
0
|
0
|
|
|
|
|
print "CHOOSING first up!\n" if $VERBOSITY > 1; |
750
|
|
|
|
|
|
|
|
751
|
0
|
|
|
|
|
|
my $enQ; |
752
|
0
|
|
|
|
|
|
for my $Q ( $self->_queues ) { |
753
|
0
|
0
|
|
|
|
|
next unless ref($self->{$Q}) eq 'ARRAY'; |
754
|
0
|
|
|
|
|
|
$enQ = shift @{$self->{$Q}}; |
|
0
|
|
|
|
|
|
|
755
|
0
|
|
|
|
|
|
last; |
756
|
|
|
|
|
|
|
} |
757
|
0
|
|
|
|
|
|
my $tax = $self->_courtesy_call($enQ); |
758
|
0
|
0
|
|
|
|
|
if ( $VERBOSITY > 4 ) { |
759
|
0
|
|
|
|
|
|
print ' QUEUE: '; |
760
|
0
|
0
|
|
|
|
|
print join("-:-", @{$self->{_courtesy_Queue}}), "\n" |
|
0
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
if $self->{_courtesy_Queue}; |
762
|
|
|
|
|
|
|
} |
763
|
0
|
|
|
|
|
|
my $sleep = int(rand($self->sleep_base)) + $tax; |
764
|
|
|
|
|
|
|
|
765
|
0
|
0
|
|
|
|
|
if ( $VERBOSITY ) { |
766
|
0
|
0
|
|
|
|
|
printf "COURTESY NAP %d second%s ", |
767
|
|
|
|
|
|
|
$sleep, $sleep == 1 ?'':'s'; |
768
|
0
|
0
|
|
|
|
|
printf "(Domain recently seen: %d time%s)\n", |
769
|
|
|
|
|
|
|
$tax, $tax == 1 ?'':'s'; |
770
|
|
|
|
|
|
|
} |
771
|
0
|
|
|
|
|
|
sleep $sleep; # courtesy to websites but human-ish w/ random |
772
|
0
|
|
|
|
|
|
$self->slept($sleep); |
773
|
0
|
|
|
|
|
|
return $enQ; |
774
|
|
|
|
|
|
|
} |
775
|
|
|
|
|
|
|
#-------------------------- |
776
|
|
|
|
|
|
|
sub _courtesy_call { |
777
|
0
|
|
|
0
|
|
|
my ($self,$enQ) = @_; |
778
|
0
|
0
|
|
|
|
|
return 0 unless $enQ; |
779
|
0
|
|
|
|
|
|
my $domain = $enQ->domain; |
780
|
|
|
|
|
|
|
|
781
|
0
|
0
|
|
|
|
|
print 'COURTESY check: ', $domain, "\n" if $VERBOSITY > 5; |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
# yes, we have seen it in the last whatever GETs |
784
|
0
|
|
|
|
|
|
my $seen = 0; |
785
|
0
|
|
|
|
|
|
$seen = scalar grep { $_ eq $domain } |
|
0
|
|
|
|
|
|
|
786
|
0
|
|
|
|
|
|
@{$self->{_courtesy_Queue}}; |
787
|
0
|
0
|
|
|
|
|
$seen = 10 if $seen > 10; |
788
|
0
|
|
|
|
|
|
return $seen; |
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
#-------------------------- |
791
|
|
|
|
|
|
|
sub _queues { # Q9 is purely for trash so it's not returned here |
792
|
|
|
|
|
|
|
return wantarray ? |
793
|
0
|
0
|
|
0
|
|
|
( 0 .. 9 ) : |
794
|
|
|
|
|
|
|
'0'; |
795
|
|
|
|
|
|
|
} |
796
|
|
|
|
|
|
|
#-------------------------- |
797
|
|
|
|
|
|
|
sub _snip { |
798
|
0
|
0
|
|
0
|
|
|
my $self = shift if ref($_[0]); |
799
|
0
|
|
|
|
|
|
my ( @text ) = @_; |
800
|
0
|
|
|
|
|
|
s/^\s+//, s/\s+$//, s/\s+/ /g for @text; |
801
|
0
|
0
|
|
|
|
|
return wantarray ? @text : shift @text; |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
#-------------------------- |
804
|
|
|
|
|
|
|
# Spyder ENDS |
805
|
|
|
|
|
|
|
#-------------------------- |
806
|
|
|
|
|
|
|
}# WWW::Spyder privacy ends |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
#-------------------------- |
810
|
|
|
|
|
|
|
package WWW::Spyder::Enqueue; |
811
|
|
|
|
|
|
|
#-------------------------- |
812
|
|
|
|
|
|
|
{ |
813
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
116
|
|
814
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
815
|
1
|
|
|
|
|
6
|
use overload( q{""} => '_stringify', |
816
|
1
|
|
|
1
|
|
6
|
fallback => 1 ); |
|
1
|
|
|
|
|
2
|
|
817
|
|
|
|
|
|
|
#--------------------------------------------------------------------- |
818
|
|
|
|
|
|
|
# 0 -->> URL |
819
|
|
|
|
|
|
|
# 1 -->> name, if any, of link URL was got from |
820
|
|
|
|
|
|
|
# 2 -->> domain |
821
|
|
|
|
|
|
|
#-------------------------- |
822
|
|
|
|
|
|
|
sub new { |
823
|
0
|
|
|
0
|
|
|
my ( $caller, $url, $name ) = @_; |
824
|
0
|
|
0
|
|
|
|
my $class = ref($caller) || $caller; |
825
|
0
|
0
|
|
|
|
|
croak "Here I am. " if ref $url; |
826
|
0
|
0
|
|
|
|
|
return undef unless $url; |
827
|
0
|
0
|
|
|
|
|
if ( length($url) > 512 ) { # that's toooo long, don't you think? |
828
|
0
|
|
|
|
|
|
$url = substr($url,0,512); |
829
|
|
|
|
|
|
|
} |
830
|
0
|
0
|
0
|
|
|
|
if ( $name and length($name) > 512 ) { |
831
|
0
|
|
|
|
|
|
$name = substr($url,0,509) . '...'; |
832
|
|
|
|
|
|
|
} |
833
|
0
|
0
|
|
|
|
|
$name = '-' unless $name; # need this to find a bug later |
834
|
0
|
|
|
|
|
|
my ( $domain ) = $url =~ m,^[^:]+:/+([^/]+),; |
835
|
0
|
|
|
|
|
|
bless [ $url, $name, lc($domain) ], $class; |
836
|
|
|
|
|
|
|
} |
837
|
|
|
|
|
|
|
#-------------------------- |
838
|
|
|
|
|
|
|
sub url { |
839
|
0
|
|
|
0
|
|
|
return $_[0]->[0]; |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
#-------------------------- |
842
|
|
|
|
|
|
|
sub name { |
843
|
0
|
|
|
0
|
|
|
return $_[0]->[1]; |
844
|
|
|
|
|
|
|
} |
845
|
|
|
|
|
|
|
#-------------------------- |
846
|
|
|
|
|
|
|
sub domain { |
847
|
0
|
|
|
0
|
|
|
return $_[0]->[2]; |
848
|
|
|
|
|
|
|
} |
849
|
|
|
|
|
|
|
#-------------------------- |
850
|
|
|
|
|
|
|
sub _stringify { |
851
|
0
|
|
|
0
|
|
|
return $_[0]->[0]; |
852
|
|
|
|
|
|
|
} |
853
|
|
|
|
|
|
|
#-------------------------- |
854
|
|
|
|
|
|
|
}#privacy for WWW::Spyder::Enqueue ends |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
#-------------------------- |
858
|
|
|
|
|
|
|
package WWW::Spyder::Page; |
859
|
|
|
|
|
|
|
#-------------------------- |
860
|
1
|
|
|
1
|
|
279
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
27
|
|
861
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
22
|
|
862
|
1
|
|
|
1
|
|
15
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
95
|
|
863
|
|
|
|
|
|
|
{ |
864
|
|
|
|
|
|
|
sub new { |
865
|
0
|
|
|
0
|
|
|
my ( $caller, %arg ) = @_; |
866
|
0
|
|
0
|
|
|
|
my $class = ref($caller) || $caller; |
867
|
0
|
|
|
|
|
|
my $self = bless {}, $class; |
868
|
|
|
|
|
|
|
|
869
|
0
|
|
|
|
|
|
while ( my ( $method, $val ) = each %arg ) { |
870
|
|
|
|
|
|
|
|
871
|
1
|
|
|
1
|
|
4
|
no strict "refs"; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
872
|
1
|
|
|
1
|
|
4
|
no warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
410
|
|
873
|
0
|
|
|
|
|
|
my $attribute = '_' . $method; |
874
|
|
|
|
|
|
|
|
875
|
0
|
0
|
|
|
|
|
if ( ref $val eq 'ARRAY' ) { |
876
|
0
|
|
|
|
|
|
*{"$class::$method"} = sub { |
877
|
0
|
|
|
0
|
|
|
my($self,$arg) = @_; |
878
|
0
|
0
|
|
|
|
|
return @{$self->{$attribute}} unless $arg; |
|
0
|
|
|
|
|
|
|
879
|
0
|
|
|
|
|
|
push(@{$self->{$attribute}}, @{$arg}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
880
|
0
|
|
|
|
|
|
}; |
881
|
|
|
|
|
|
|
} else { |
882
|
0
|
|
|
|
|
|
*{"$class::$method"} = sub { |
883
|
0
|
|
|
0
|
|
|
my($self,$arg) = @_; |
884
|
|
|
|
|
|
|
# get if already set and deref if needed |
885
|
0
|
0
|
0
|
|
|
|
if ( not $arg and exists $self->{$attribute} ) { |
886
|
0
|
|
|
|
|
|
return ref($self->{$attribute}) eq 'SCALAR' ? |
887
|
0
|
0
|
|
|
|
|
${$self->{$attribute}} : $self->{$attribute}; |
888
|
|
|
|
|
|
|
} |
889
|
0
|
0
|
|
|
|
|
$self->{$attribute} = $arg if $arg; #only set one time! |
890
|
0
|
|
|
|
|
|
}; |
891
|
|
|
|
|
|
|
} |
892
|
0
|
|
|
|
|
|
$self->$method($val); |
893
|
|
|
|
|
|
|
} |
894
|
0
|
|
|
|
|
|
return $self; |
895
|
|
|
|
|
|
|
} |
896
|
|
|
|
|
|
|
#-------------------------- |
897
|
|
|
|
|
|
|
sub links { |
898
|
0
|
|
|
0
|
|
|
my ( $self ) = @_; |
899
|
0
|
|
|
|
|
|
return map {$_->url} @{$self->{_pages_enQs}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
} |
901
|
|
|
|
|
|
|
#-------------------------- |
902
|
|
|
|
|
|
|
sub next_link { |
903
|
0
|
|
|
0
|
|
|
my ( $self ) = @_; |
904
|
0
|
|
|
|
|
|
shift @{$self->{_pages_enQs}}; |
|
0
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
} |
906
|
|
|
|
|
|
|
#-------------------------- |
907
|
|
|
|
|
|
|
}#privacy for ::Page ends |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
#-------------------------- |
911
|
|
|
|
|
|
|
package WWW::Spyder::Exclusions; |
912
|
|
|
|
|
|
|
#-------------------------- |
913
|
|
|
|
|
|
|
{ |
914
|
|
|
|
|
|
|
# THIS PACKAGE IS NOT BEING USED |
915
|
|
|
|
|
|
|
my %_domains = qw( |
916
|
|
|
|
|
|
|
ad.doubleclick.net 1 |
917
|
|
|
|
|
|
|
ads.clickagents.com 1 |
918
|
|
|
|
|
|
|
); |
919
|
|
|
|
|
|
|
my %_names = qw( |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
); |
922
|
|
|
|
|
|
|
#-------------------------- |
923
|
|
|
|
|
|
|
sub exclude_domain { |
924
|
0
|
|
|
0
|
|
|
$_domains{shift}++; |
925
|
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
|
#-------------------------- |
927
|
|
|
|
|
|
|
sub excluded { |
928
|
0
|
|
|
0
|
|
|
my $what = shift; |
929
|
0
|
0
|
|
|
|
|
exists $_domains{$what} || $_names{$what}; |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
#-------------------------- |
932
|
|
|
|
|
|
|
}#privacy ends |
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
#-------------------------- |
936
|
|
|
|
|
|
|
package WWW::Spyder::Seed; |
937
|
|
|
|
|
|
|
#-------------------------- |
938
|
|
|
|
|
|
|
{ |
939
|
1
|
|
|
1
|
|
11
|
use URI::Escape; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
48
|
|
940
|
1
|
|
|
1
|
|
4
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
270
|
|
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
my %engine_url = |
943
|
|
|
|
|
|
|
( |
944
|
|
|
|
|
|
|
google => 'http://www.google.com/search?q=', |
945
|
|
|
|
|
|
|
yahoo => 1 |
946
|
|
|
|
|
|
|
); |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
# should we exclude the search domain at this point? i think so because |
949
|
|
|
|
|
|
|
# otherwise we've introduced dozens of erroneous links and the engine |
950
|
|
|
|
|
|
|
# is gonna get hammered over time for it |
951
|
|
|
|
|
|
|
#-------------------------- |
952
|
|
|
|
|
|
|
sub get_seed { |
953
|
|
|
|
|
|
|
|
954
|
0
|
|
0
|
0
|
|
|
my $engine = shift || croak "Must provide search engine! " . |
955
|
|
|
|
|
|
|
join(', ', sort keys %engine_url) . "\n"; |
956
|
|
|
|
|
|
|
|
957
|
0
|
|
0
|
|
|
|
my $query = shift || croak "Must provide query terms!\n"; |
958
|
0
|
|
|
|
|
|
$query = uri_escape($query); |
959
|
|
|
|
|
|
|
|
960
|
0
|
0
|
|
|
|
|
croak "$engine is not a valid choice!\n" |
961
|
|
|
|
|
|
|
unless exists $engine_url{lc$engine}; |
962
|
|
|
|
|
|
|
|
963
|
0
|
|
|
|
|
|
return $engine_url{lc$engine} . $query; |
964
|
|
|
|
|
|
|
} |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
} # Privacy for WWW::Spyder::Seed ends |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
1; |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
# Plain Old D'errrrr |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
=pod |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
=head1 NAME |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
WWW::Spyder - a simple non-persistent web crawler. |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
=head1 VERSION |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
0.24 |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
=head1 SYNOPSIS |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
A web spider that returns plain text, HTML, and other information per |
985
|
|
|
|
|
|
|
page crawled and can determine what pages to get and parse based on |
986
|
|
|
|
|
|
|
supplied terms compared to the text in links as well as page content. |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
use WWW::Spyder; |
989
|
|
|
|
|
|
|
# Supply your own LWP::UserAgent-compatible agent. |
990
|
|
|
|
|
|
|
use WWW::Mechanize; |
991
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
my $start_url = "http://my-great-domain.com/"; |
993
|
|
|
|
|
|
|
my $mech = WWW::Mechanize->new(agent => "PreferredAgent/0.01") |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new( |
996
|
|
|
|
|
|
|
report_broken_links => 1, |
997
|
|
|
|
|
|
|
seed => $start_url, |
998
|
|
|
|
|
|
|
sleep_base => 5, |
999
|
|
|
|
|
|
|
UA => $mech |
1000
|
|
|
|
|
|
|
); |
1001
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
1002
|
|
|
|
|
|
|
# do something with the page... |
1003
|
|
|
|
|
|
|
} |
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
=head1 METHODS |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=over 2 |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
=item * $spyder->new() |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
Construct a new spyder object. Without at least the seed() set, or |
1012
|
|
|
|
|
|
|
go_to_seed() turned on, the spyder isn't ready to crawl. |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
$spyder = WWW::Spyder->new(shift||die"Gimme a URL!\n"); |
1015
|
|
|
|
|
|
|
# ...or... |
1016
|
|
|
|
|
|
|
$spyder = WWW::Spyder->new( %options ); |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
Options include: sleep_base (in seconds), exit_on (hash of methods and |
1019
|
|
|
|
|
|
|
settings), report_broken_links, image_checking (verifies the images pointed to |
1020
|
|
|
|
|
|
|
by tags, disable_cnap (disables the courtesy nap when verbose |
1021
|
|
|
|
|
|
|
output is enabled), and UA (you can pass in an instantiated LWP::UserAgent |
1022
|
|
|
|
|
|
|
object via UA, i.e. UA => $ua_obj). Examples below. |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
=item * $spyder->seed($url) |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
Adds a URL (or URLs) to the top of the queues for crawling. If the |
1027
|
|
|
|
|
|
|
spyder is constructed with a single scalar argument, that is considered |
1028
|
|
|
|
|
|
|
the seed. |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
=item * $spyder->bell([bool]) |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
This will print a bell ("\a") to STDERR on every successfully crawled |
1033
|
|
|
|
|
|
|
page. It might seem annoying but it is an excellent way to know your |
1034
|
|
|
|
|
|
|
spyder is behaving and working. True value turns it on. Right now it |
1035
|
|
|
|
|
|
|
can't be turned off. |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
=item * $spyder->spyder_time([bool]) |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
Returns raw seconds since I was created if given a |
1040
|
|
|
|
|
|
|
boolean value, otherwise returns "D day(s) HH::MM:SS." |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
=item * $spyder->terms([list of terms to match]) |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
The more terms, the more the spyder is going to grasp at. If you give |
1045
|
|
|
|
|
|
|
a straight list of strings, they will be turned into very open |
1046
|
|
|
|
|
|
|
regexes. E.g.: "king" would match "sulking" and "kinglet" but not |
1047
|
|
|
|
|
|
|
"King." It is case sensitive right now. If you want more specific |
1048
|
|
|
|
|
|
|
matching or different behavior, pass your own regexes instead of |
1049
|
|
|
|
|
|
|
strings. |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
$spyder->terms( qr/\bkings?\b/i, qr/\bqueens?\b/i ); |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
terms() is only settable once right now, then it's a done deal. |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
=item * $spyder->spyder_data() |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
A comma formatted number of kilobytes retrieved so far. B give |
1058
|
|
|
|
|
|
|
it an argument. It's a set/get routine. |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
=item * $spyder->slept() |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
Returns the total number of seconds the spyder has slept while |
1063
|
|
|
|
|
|
|
running. Useful for getting accurate page/time counts (spyder |
1064
|
|
|
|
|
|
|
performance) discounting the added courtesy naps. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=item * $spyder->UA->... |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
The user agent. It should be an L or a well-behaved |
1069
|
|
|
|
|
|
|
subclass like L. Here are the initialized values you |
1070
|
|
|
|
|
|
|
might want to tweak- |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
$spyder->UA->timeout(30); |
1073
|
|
|
|
|
|
|
$spyder->UA->max_size(250_000); |
1074
|
|
|
|
|
|
|
$spyder->UA->agent('Mozilla/5.0'); |
1075
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
Changing the agent name can hurt your spyder because some servers won't |
1077
|
|
|
|
|
|
|
return content unless it's requested by a "browser" they recognize. |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
You should probably add your email with from() as well. |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
$spyder->UA->from('bluefintuna@fish.net'); |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=item * $spyder->cookie_file([local_file]) |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
They live in $ENV{HOME}/spyderCookie by default but you can set your |
1086
|
|
|
|
|
|
|
own file if you prefer or want to save different cookie files for |
1087
|
|
|
|
|
|
|
different spyders. |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=item * $spyder->get_broken_links |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
Returns a reference to a list of broken link URLs if report_broken_links was |
1092
|
|
|
|
|
|
|
was enabled in the constructor. |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
=item * $spyder->go_to_seed |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
=item * $spyder->queue_count |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
=item * $spyder->show_attributes |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=item * $spyder->spydered |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=item * $spyder->crawl |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
Returns (and removes) a Spyder page object from the queue of spydered pages. |
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
=back |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
=head2 Sypder::Page methods |
1109
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
=over 6 |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
=item * $page->title |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
=item * $page->text |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
=item * $page->raw |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
=item * $page->url |
1119
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
=item * $page->domain |
1121
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
=item * $page->link_name |
1123
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
=item * $page->link |
1125
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
=item * $page->description |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
=item * $page->pages_enQs |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
=back |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
=head2 Weird courteous behavior |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
Courtesy didn't used to be weird, but that's another story. You will |
1135
|
|
|
|
|
|
|
probably notice that the courtesy routines force a sleep when a |
1136
|
|
|
|
|
|
|
recently seen domain is the only choice for a new link. The sleep is |
1137
|
|
|
|
|
|
|
partially randomized. This is to prevent the spyder from being |
1138
|
|
|
|
|
|
|
recognized in weblogs as a robot. |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
=head2 The web and courtesy |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
B, I beg of thee, exercise the most courtesy you can. Don't |
1143
|
|
|
|
|
|
|
let impatience get in the way. Bandwidth and server traffic are |
1144
|
|
|
|
|
|
|
C<$MONEY> for real. The web is an extremely disorganized and corrupted |
1145
|
|
|
|
|
|
|
database at the root but companies and individuals pay to keep it |
1146
|
|
|
|
|
|
|
available. The less pain you cause by banging away on a webserver with |
1147
|
|
|
|
|
|
|
a web agent, the more welcome the next web agent will be. |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
B: Google seems to be excluding generic LWP agents now. See, I |
1150
|
|
|
|
|
|
|
told you so. A single parallel robot can really hammer a major server, |
1151
|
|
|
|
|
|
|
even someone with as big a farm and as much bandwidth as Google. |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
=head2 VERBOSITY |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
=over 2 |
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
=item * $spyder->verbosity([1-6]) -OR- |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
=item * $WWW::Spyder::VERBOSITY = ... |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
Set it from 1 to 6 right now to get varying amounts of extra info to |
1162
|
|
|
|
|
|
|
STDOUT. It's an uneven scale and will be straightened out pretty soon. |
1163
|
|
|
|
|
|
|
If kids have a preference for sending the info to STDERR, I'll do |
1164
|
|
|
|
|
|
|
that. I might anyway. |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
=back |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=head1 SAMPLE USAGE |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
=head2 See "spyder-mini-bio" in this distribution |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
It's an extremely simple, but fairly cool pseudo bio-researcher. |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
=head2 Simple continually crawling spyder: |
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
In the following code snippet: |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
use WWW::Spyder; |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new( shift || die"Give me a URL!\n" ); |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
print '-'x70,"\n"; |
1185
|
|
|
|
|
|
|
print "Spydering: ", $page->title, "\n"; |
1186
|
|
|
|
|
|
|
print " URL: ", $page->url, "\n"; |
1187
|
|
|
|
|
|
|
print " Desc: ", $page->description || 'n/a', "\n"; |
1188
|
|
|
|
|
|
|
print '-'x70,"\n"; |
1189
|
|
|
|
|
|
|
while ( my $link = $page->next_link ) { |
1190
|
|
|
|
|
|
|
printf "%22s ->> %s\n", |
1191
|
|
|
|
|
|
|
length($link->name) > 22 ? |
1192
|
|
|
|
|
|
|
substr($link->name,0,19).'...' : $link->name, |
1193
|
|
|
|
|
|
|
length($link) > 43 ? |
1194
|
|
|
|
|
|
|
substr($link,0,40).'...' : $link; |
1195
|
|
|
|
|
|
|
} |
1196
|
|
|
|
|
|
|
} |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
as long as unique URLs are being found in the pages crawled, the |
1199
|
|
|
|
|
|
|
spyder will never stop. |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
Each "crawl" returns a page object which gives the following methods |
1202
|
|
|
|
|
|
|
to get information about the page. |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
=over 2 |
1205
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
=item * $page->links |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
URLs found on the page. |
1209
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
=item * $page->title |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
Page's Title if there is one. |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
=item * $page->text |
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
The parsed plain text out of the page. Uses HTML::Parser and tries to |
1217
|
|
|
|
|
|
|
ignore non-readable stuff like comments and scripts. |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
=item * $page->url |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
=item * $page->domain |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=item * $page->raw |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
The content returned by the server. Should be HTML. |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
=item * $page->description |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
The META description of the page if there is one. |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
=item * $page->links |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
Returns a list of the URLs in the page. Note: next_link() will shift |
1234
|
|
|
|
|
|
|
the available list of links() each time it's called. |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
=item * $link = $page->next_link |
1237
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
next_link() destructively returns the next URI-ish object in the page. |
1239
|
|
|
|
|
|
|
They are objects with three accessors. |
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
=back |
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
=over 6 |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
=item * $link->url |
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
This is also overloaded so that interpolating "$link" will get the |
1248
|
|
|
|
|
|
|
URL just as the method does. |
1249
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
=item * $link->name |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
=item * $link->domain |
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
=back |
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
=head2 Spyder that will give up the ghost... |
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
The following spyder is initialized to stop crawling when I of |
1259
|
|
|
|
|
|
|
its conditions are met: 10mins pass or 300 pages are crawled. |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
use WWW::Spyder; |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
my $url = shift || die "Please give me a URL to start!\n"; |
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new |
1266
|
|
|
|
|
|
|
(seed => $url, |
1267
|
|
|
|
|
|
|
sleep_base => 10, |
1268
|
|
|
|
|
|
|
exit_on => { pages => 300, |
1269
|
|
|
|
|
|
|
time => '10min', },); |
1270
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
print '-'x70,"\n"; |
1274
|
|
|
|
|
|
|
print "Spydering: ", $page->title, "\n"; |
1275
|
|
|
|
|
|
|
print " URL: ", $page->url, "\n"; |
1276
|
|
|
|
|
|
|
print " Desc: ", $page->description || '', "\n"; |
1277
|
|
|
|
|
|
|
print '-'x70,"\n"; |
1278
|
|
|
|
|
|
|
while ( my $link = $page->next_link ) { |
1279
|
|
|
|
|
|
|
printf "%22s ->> %s\n", |
1280
|
|
|
|
|
|
|
length($link->name) > 22 ? |
1281
|
|
|
|
|
|
|
substr($link->name,0,19).'...' : $link->name, |
1282
|
|
|
|
|
|
|
length($link) > 43 ? |
1283
|
|
|
|
|
|
|
substr($link,0,40).'...' : $link; |
1284
|
|
|
|
|
|
|
} |
1285
|
|
|
|
|
|
|
} |
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
=head2 Primitive page reader |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
use WWW::Spyder; |
1290
|
|
|
|
|
|
|
use Text::Wrap; |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
my $url = shift || die "Please give me a URL to start!\n"; |
1293
|
|
|
|
|
|
|
@ARGV or die "Please also give me a search term.\n"; |
1294
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new; |
1295
|
|
|
|
|
|
|
$spyder->seed($url); |
1296
|
|
|
|
|
|
|
$spyder->terms(@ARGV); |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
1299
|
|
|
|
|
|
|
print '-'x70,"\n * "; |
1300
|
|
|
|
|
|
|
print $page->title, "\n"; |
1301
|
|
|
|
|
|
|
print '-'x70,"\n"; |
1302
|
|
|
|
|
|
|
print wrap('','', $page->text); |
1303
|
|
|
|
|
|
|
sleep 60; |
1304
|
|
|
|
|
|
|
} |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
=head1 TIPS |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
If you are going to do anything important with it, implement some |
1309
|
|
|
|
|
|
|
signal blocking to prevent accidental problems and tie your gathered |
1310
|
|
|
|
|
|
|
information to a DB_File or some such. |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
You might want to load C. It should top the nice off |
1313
|
|
|
|
|
|
|
at your system's max and prevent your spyder from interfering with |
1314
|
|
|
|
|
|
|
your system. |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
You might want to to set $| = 1. |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
=head1 PRIVATE METHODS |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
=head2 are private but hack away if you're inclined |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
=head1 TO DO |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
I is conceived to live in a future namespace as a servant class |
1325
|
|
|
|
|
|
|
for a complex web research agent with simple interfaces to |
1326
|
|
|
|
|
|
|
pre-designed grammars for research reports; or self-designed |
1327
|
|
|
|
|
|
|
grammars/reports (might be implemented via Parse::FastDescent if that |
1328
|
|
|
|
|
|
|
lazy-bones Conway would just find another 5 hours in the paltry 32 |
1329
|
|
|
|
|
|
|
hour day he's presently working). |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
I'd like the thing to be able to parse RTF, PDF, and perhaps even |
1332
|
|
|
|
|
|
|
resource sections of image files but that isn't on the radar right |
1333
|
|
|
|
|
|
|
now. |
1334
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
The tests should work differently. Currently they ask for outside |
1336
|
|
|
|
|
|
|
resources without checking if there is either an open way to do it or |
1337
|
|
|
|
|
|
|
if the user approves of it. Bad form all around. |
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
=head1 TO DOABLE BY 1.0 |
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
Add 2-4 sample scripts that are a bit more useful. |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
There are many functions that should be under the programmer's control |
1344
|
|
|
|
|
|
|
and not buried in the spyder. They will emerge soon. I'd like to put |
1345
|
|
|
|
|
|
|
in hooks to allow the user to keep(), toss(), or exclude(), urls, link |
1346
|
|
|
|
|
|
|
names, and domains, while crawling. |
1347
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
Clean up some redundant, sloppy, and weird code. Probably change or |
1349
|
|
|
|
|
|
|
remove the AUTOLOAD. |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
Put in a go_to_seed() method and a subclass, ::Seed, with rules to |
1352
|
|
|
|
|
|
|
construct query URLs by search engine. It would be the autostart or the |
1353
|
|
|
|
|
|
|
fallback for perpetual spyders that run out of links. It would hit a |
1354
|
|
|
|
|
|
|
given or default search engine with the I's terms as the query. |
1355
|
|
|
|
|
|
|
Obviously this would only work with terms() defined. |
1356
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
Implement auto-exclusion for failure vs. success rates on names as well |
1358
|
|
|
|
|
|
|
as domains (maybe URI suffixes too). |
1359
|
|
|
|
|
|
|
|
1360
|
|
|
|
|
|
|
Turn length of courtesy queue into the breadth/depth setting? make it |
1361
|
|
|
|
|
|
|
automatically adjusting...? |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
Consistently found link names are excluded from term strength sorting? |
1364
|
|
|
|
|
|
|
Eg: "privacy policy," "read more," "copyright..." |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
Fix some image tag parsing problems and add area tag parsing. |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
Configuration for user:password by domain. |
1369
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
::Page objects become reusable so that a spyder only needs one. |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
::Enqueue objects become indexed so they are nixable from anywhere. |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
Expand exit_on routines to size, slept time, dwindling success ratio, |
1375
|
|
|
|
|
|
|
and maybe more. |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
Make methods to set "skepticism" and "effort" which will influence the |
1378
|
|
|
|
|
|
|
way the terms are used to keep, order, and toss URLs. |
1379
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
=head1 BE WARNED |
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
This module already does some extremely useful things but it's in its |
1383
|
|
|
|
|
|
|
infancy and it is conceived to live in a different namespace and |
1384
|
|
|
|
|
|
|
perhaps become more private as a subservient part of a parent class. |
1385
|
|
|
|
|
|
|
This may never happen but it's the idea. So don't put this into |
1386
|
|
|
|
|
|
|
production code yet. I am endeavoring to keep its interface constant |
1387
|
|
|
|
|
|
|
either way. That said, it could change completely. |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
=head2 Also! |
1390
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
This module saves cookies to the user's home. There will be more |
1392
|
|
|
|
|
|
|
control over cookies in the future, but that's how it is right now. |
1393
|
|
|
|
|
|
|
They live in $ENV{HOME}/spyderCookie. |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
=head2 Anche! |
1396
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
Robot Rules aren't respected. I endeavors to be polite as far |
1398
|
|
|
|
|
|
|
as server hits are concerned, but doesn't take "no" for answer right |
1399
|
|
|
|
|
|
|
now. I want to add this, and not just by domain, but by page settings. |
1400
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
=head1 UNDOCUMENTED FEATURES |
1402
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
A.k.a. Bugs. Don't be ridiculous! Bugs in B?! |
1404
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
There is a bug that is causing retrieval of image src tags, I think |
1406
|
|
|
|
|
|
|
but haven't tracked it down yet, as links. I also think the plain text |
1407
|
|
|
|
|
|
|
parsing has some problems which will be remedied shortly. |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
If you are building more than one spyder in the same script they are |
1410
|
|
|
|
|
|
|
going to share the same exit_on parameters because it's a |
1411
|
|
|
|
|
|
|
self-installing method. This will not always be so. |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
See B file for more open and past issues. |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
Let me know if you find any others. If you find one that is platform |
1416
|
|
|
|
|
|
|
specific, please send patch code/suggestion because I might not have |
1417
|
|
|
|
|
|
|
any idea how to fix it. |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
=head1 WHY C |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
I didn't want to use the more appropriate I because I think |
1422
|
|
|
|
|
|
|
there is a better one out there somewhere in the zeitgeist and the |
1423
|
|
|
|
|
|
|
namespace future of I is uncertain. It may end up a |
1424
|
|
|
|
|
|
|
semi-private part of a bigger family. And I may be King of Kenya |
1425
|
|
|
|
|
|
|
someday. One's got to dream. |
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
If you like I, have feedback, wishlist usage, better |
1428
|
|
|
|
|
|
|
algorithms/implementations for any part of it, please let me know! |
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
=head1 THANKS TO |
1431
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
Most all y'all. Especially Lincoln Stein, Gisle Aas, The Conway, |
1433
|
|
|
|
|
|
|
Raphael Manfredi, Gurusamy Sarathy, and plenty of others. |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
=head1 COMPARE WITH (PROBABLY PREFER) |
1436
|
|
|
|
|
|
|
|
1437
|
|
|
|
|
|
|
L, L, L, L, |
1438
|
|
|
|
|
|
|
L, and other kith and kin. |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
Copyright (c) 2001-2008, Ashley Pond V C<< >>. All |
1443
|
|
|
|
|
|
|
rights reserved. |
1444
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or |
1446
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. See L. |
1447
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
=head1 DISCLAIMER OF WARRANTY |
1449
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
1451
|
|
|
|
|
|
|
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
1452
|
|
|
|
|
|
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
1453
|
|
|
|
|
|
|
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER |
1454
|
|
|
|
|
|
|
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
1455
|
|
|
|
|
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE |
1456
|
|
|
|
|
|
|
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH |
1457
|
|
|
|
|
|
|
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL |
1458
|
|
|
|
|
|
|
NECESSARY SERVICING, REPAIR, OR CORRECTION. |
1459
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
1461
|
|
|
|
|
|
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
1462
|
|
|
|
|
|
|
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE |
1463
|
|
|
|
|
|
|
LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, |
1464
|
|
|
|
|
|
|
OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE |
1465
|
|
|
|
|
|
|
THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING |
1466
|
|
|
|
|
|
|
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A |
1467
|
|
|
|
|
|
|
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF |
1468
|
|
|
|
|
|
|
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF |
1469
|
|
|
|
|
|
|
SUCH DAMAGES. |
1470
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
=cut |