| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Spyder; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
24344
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
65
|
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
211
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1400
|
use HTML::Parser 3; |
|
|
1
|
|
|
|
|
12055
|
|
|
|
1
|
|
|
|
|
36
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
16172
|
use LWP::UserAgent; |
|
|
1
|
|
|
|
|
65533
|
|
|
|
1
|
|
|
|
|
27
|
|
|
9
|
1
|
|
|
1
|
|
859
|
use HTTP::Cookies; |
|
|
1
|
|
|
|
|
7079
|
|
|
|
1
|
|
|
|
|
24
|
|
|
10
|
1
|
|
|
1
|
|
738
|
use URI::URL; |
|
|
1
|
|
|
|
|
3938
|
|
|
|
1
|
|
|
|
|
50
|
|
|
11
|
1
|
|
|
1
|
|
6
|
use HTML::Entities; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
51
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
1
|
|
|
1
|
|
4
|
use Digest::MD5 "md5_base64"; # For making seen content key/index. |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
42
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
4
|
use Carp; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
233
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '0.24'; |
|
17
|
|
|
|
|
|
|
our $VERBOSITY ||= 0; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Methods |
|
20
|
|
|
|
|
|
|
#-------------------------- |
|
21
|
|
|
|
|
|
|
{ # make it all a bit more private |
|
22
|
|
|
|
|
|
|
my %_methods = (# these are methods & roots of our attribute names |
|
23
|
|
|
|
|
|
|
UA => undef, |
|
24
|
|
|
|
|
|
|
bell => undef, |
|
25
|
|
|
|
|
|
|
html_parser => undef, |
|
26
|
|
|
|
|
|
|
sleep_base => undef, |
|
27
|
|
|
|
|
|
|
cookie_file => undef, |
|
28
|
|
|
|
|
|
|
_exit_epoch => undef, |
|
29
|
|
|
|
|
|
|
_term_count => undef, |
|
30
|
|
|
|
|
|
|
); |
|
31
|
|
|
|
|
|
|
# Those may all get hardcoded eventually, but they're handy for now. |
|
32
|
|
|
|
|
|
|
#-------------------------- |
|
33
|
|
|
|
|
|
|
sub new { |
|
34
|
0
|
|
|
0
|
1
|
|
my ( $caller ) = shift; |
|
35
|
0
|
|
0
|
|
|
|
my $class = ref($caller) || $caller; |
|
36
|
0
|
|
|
|
|
|
my $self = bless {}, $class; |
|
37
|
|
|
|
|
|
|
|
|
38
|
0
|
|
|
|
|
|
my ( $seed, %arg ); |
|
39
|
0
|
0
|
|
|
|
|
if ( @_ == 1 ) { |
|
40
|
0
|
|
|
|
|
|
( $seed ) = @_; |
|
41
|
|
|
|
|
|
|
} |
|
42
|
0
|
|
|
|
|
|
%arg = ( broken_links => [], |
|
43
|
|
|
|
|
|
|
exit_on => undef, |
|
44
|
|
|
|
|
|
|
image_checking => 0, |
|
45
|
|
|
|
|
|
|
report_broken_links => 0, |
|
46
|
|
|
|
|
|
|
seed => undef, |
|
47
|
|
|
|
|
|
|
sleep => undef, |
|
48
|
|
|
|
|
|
|
sleep_base => 5, |
|
49
|
|
|
|
|
|
|
UA => undef |
|
50
|
|
|
|
|
|
|
); |
|
51
|
0
|
0
|
|
|
|
|
%arg = ( %arg, @_ ) unless @_ % 2; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Set our UA object if it was passed on to our constructor. |
|
54
|
0
|
0
|
|
|
|
|
$self->{UA} = $arg{UA} if $arg{UA}; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Turn on image checking if requested. img src tags will be checked if |
|
57
|
|
|
|
|
|
|
# image_checking is set to 1 in the constructor. |
|
58
|
0
|
0
|
|
|
|
|
$self->{image_checking} = $arg{image_checking} if $arg{image_checking}; |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Turn on broken link checking if requested. Broken link URIs can be |
|
61
|
|
|
|
|
|
|
# obtained via get_broken_links(). |
|
62
|
0
|
0
|
|
|
|
|
$self->{report_broken_links} = $arg{report_broken_links} |
|
63
|
|
|
|
|
|
|
if $arg{report_broken_links}; |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Install all our methods, either set once then get only or push/shift |
|
66
|
|
|
|
|
|
|
# array refs. |
|
67
|
0
|
|
|
|
|
|
for my $method ( %_methods ) { |
|
68
|
1
|
|
|
1
|
|
4
|
no strict "refs"; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
27
|
|
|
69
|
1
|
|
|
1
|
|
3
|
no warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
1621
|
|
|
70
|
0
|
|
|
|
|
|
my $attribute = '_' . $method; |
|
71
|
|
|
|
|
|
|
|
|
72
|
0
|
0
|
|
|
|
|
if ( ref $_methods{$method} eq 'ARRAY' ) { |
|
73
|
0
|
|
|
|
|
|
*{"$class::$method"} = sub { |
|
74
|
0
|
|
|
0
|
|
|
my($self,@args) = @_; |
|
75
|
0
|
0
|
|
|
|
|
return shift(@{$self->{$attribute}}) unless @args; |
|
|
0
|
|
|
|
|
|
|
|
76
|
0
|
|
|
|
|
|
push(@{$self->{$attribute}}, @args); |
|
|
0
|
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
|
}; |
|
78
|
|
|
|
|
|
|
} else { |
|
79
|
0
|
|
|
|
|
|
*{"$class::$method"} = sub { |
|
80
|
0
|
|
|
0
|
|
|
my($self,$arg) = @_; |
|
81
|
0
|
0
|
0
|
|
|
|
carp "You cannot reset $method!" |
|
82
|
|
|
|
|
|
|
if $arg and exists $self->{$attribute}; |
|
83
|
0
|
0
|
|
|
|
|
return $self->{$attribute} #get if already set |
|
84
|
|
|
|
|
|
|
if exists $self->{$attribute}; |
|
85
|
0
|
|
|
|
|
|
$self->{$attribute} = $arg; #only set one time! |
|
86
|
0
|
|
|
|
|
|
}; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
0
|
|
0
|
|
|
|
$seed ||= $arg{seed}; |
|
91
|
0
|
0
|
|
|
|
|
$self->seed($seed) if $seed; |
|
92
|
0
|
|
|
|
|
|
$self->sleep_base($arg{sleep_base}); |
|
93
|
0
|
0
|
|
|
|
|
$self->_install_exit_check(\%arg) unless $self->can('_exit_check'); |
|
94
|
0
|
|
|
|
|
|
$self->_install_html_parser; |
|
95
|
0
|
|
|
|
|
|
$self->_install_web_agent; |
|
96
|
|
|
|
|
|
|
|
|
97
|
0
|
|
|
|
|
|
return $self; |
|
98
|
|
|
|
|
|
|
} |
|
99
|
|
|
|
|
|
|
#-------------------------- |
|
100
|
|
|
|
|
|
|
sub terms { |
|
101
|
0
|
|
|
0
|
1
|
|
my ($self,@terms) = @_; |
|
102
|
0
|
0
|
0
|
|
|
|
if ( @terms and not exists $self->{_terms} ) { |
|
103
|
0
|
|
|
|
|
|
$self->_term_count(scalar @terms); # makes this set once op |
|
104
|
0
|
|
|
|
|
|
my %terms; |
|
105
|
0
|
|
|
|
|
|
$terms{$_} = qr/$_/ for @terms; |
|
106
|
0
|
|
|
|
|
|
$self->{_terms} = \%terms; |
|
107
|
|
|
|
|
|
|
} else { |
|
108
|
0
|
|
|
|
|
|
return $self->{_terms} |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
#-------------------------- |
|
112
|
|
|
|
|
|
|
sub show_attributes { |
|
113
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
|
114
|
0
|
|
|
|
|
|
return map {/^_(.+)$/} keys %{$self}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
#-------------------------- |
|
117
|
|
|
|
|
|
|
sub slept { |
|
118
|
0
|
|
|
0
|
1
|
|
my ($self, $time) = @_; |
|
119
|
0
|
0
|
|
|
|
|
$self->{_Slept} += $time if $time; |
|
120
|
0
|
0
|
|
|
|
|
return $self->{_Slept} unless $time; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
#-------------------------- |
|
123
|
|
|
|
|
|
|
sub seed { |
|
124
|
0
|
|
|
0
|
1
|
|
my ($self, $url) = @_; |
|
125
|
0
|
0
|
|
|
|
|
$url or croak "Must provide URL to seed()."; |
|
126
|
0
|
0
|
|
|
|
|
croak "You have passed something besides a plain URL to seed()!" |
|
127
|
|
|
|
|
|
|
if ref $url; |
|
128
|
0
|
|
|
|
|
|
$self->_stack_urls($url); |
|
129
|
0
|
|
|
|
|
|
return 1; # to the top of the stacks |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
#-------------------------- |
|
133
|
|
|
|
|
|
|
sub get_broken_links { |
|
134
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
135
|
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
|
return $self->{broken_links}; |
|
137
|
|
|
|
|
|
|
} |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
#-------------------------- |
|
140
|
|
|
|
|
|
|
sub crawl { |
|
141
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
142
|
0
|
|
0
|
|
|
|
my $opts = shift || undef; |
|
143
|
0
|
|
|
|
|
|
my $excludes = []; |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Exclude list option. |
|
146
|
0
|
0
|
|
|
|
|
if ( ref($opts->{exclude}) eq 'ARRAY' ) { |
|
147
|
0
|
|
|
|
|
|
$excludes = $opts->{exclude}; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
0
|
|
|
|
|
|
while ('I have pages to get...') { |
|
151
|
|
|
|
|
|
|
|
|
152
|
0
|
0
|
|
|
|
|
$self->_exit_check and return; |
|
153
|
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
|
my $skip_url = 0; |
|
155
|
0
|
|
|
|
|
|
my $enQ = undef; |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Report a page with a 404 error in the title if report_broken_links is |
|
158
|
|
|
|
|
|
|
# enabled. Also keep processing if we're looking for img src tags. |
|
159
|
0
|
0
|
0
|
|
|
|
if ($self->{report_broken_links} || $self->{image_checking}) { |
|
160
|
0
|
|
0
|
|
|
|
$enQ = $self->_choose_courteously || |
|
161
|
|
|
|
|
|
|
$self->_just_choose; |
|
162
|
|
|
|
|
|
|
} else { |
|
163
|
0
|
|
0
|
|
|
|
$enQ = $self->_choose_courteously || |
|
164
|
|
|
|
|
|
|
$self->_just_choose || |
|
165
|
|
|
|
|
|
|
return; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
|
my $url = $enQ->url; |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Skip this URL if it's in our excluded list. |
|
171
|
0
|
0
|
|
|
|
|
for (@$excludes) { $skip_url = 1 if $url =~ m/$_/; } |
|
|
0
|
|
|
|
|
|
|
|
172
|
0
|
0
|
|
|
|
|
next if $skip_url; |
|
173
|
|
|
|
|
|
|
|
|
174
|
0
|
|
|
|
|
|
$self->url($url); |
|
175
|
0
|
|
|
|
|
|
$self->_current_enQ($enQ); |
|
176
|
|
|
|
|
|
|
|
|
177
|
0
|
0
|
|
|
|
|
print "GET'ing: $url\n" if $VERBOSITY; |
|
178
|
|
|
|
|
|
|
|
|
179
|
0
|
|
|
|
|
|
my $response = $self->UA->request # no redirects &c is simple_ |
|
180
|
|
|
|
|
|
|
( HTTP::Request->new( GET => "$url" ) ); |
|
181
|
|
|
|
|
|
|
|
|
182
|
0
|
0
|
|
|
|
|
print STDERR "\a" if $self->bell; |
|
183
|
|
|
|
|
|
|
|
|
184
|
0
|
0
|
0
|
|
|
|
$response or |
|
185
|
|
|
|
|
|
|
carp "$url failed GET!" and next; |
|
186
|
|
|
|
|
|
|
|
|
187
|
0
|
|
|
|
|
|
push @{$self->{_courtesy_Queue}}, $enQ->domain; |
|
|
0
|
|
|
|
|
|
|
|
188
|
0
|
|
|
|
|
|
shift @{$self->{_courtesy_Queue}} |
|
|
0
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
if $self->{_courtesy_Queue} |
|
190
|
0
|
0
|
0
|
|
|
|
and @{$self->{_courtesy_Queue}} > 100; |
|
191
|
|
|
|
|
|
|
|
|
192
|
0
|
|
|
|
|
|
my $head = $response->headers_as_string; |
|
193
|
0
|
0
|
0
|
|
|
|
$head or |
|
194
|
|
|
|
|
|
|
carp "$url has no HEAD!" and |
|
195
|
|
|
|
|
|
|
next; # no headless webpages |
|
196
|
|
|
|
|
|
|
|
|
197
|
0
|
0
|
|
|
|
|
length($head) > 1_024 and $head = substr($head,0,1_024); |
|
198
|
|
|
|
|
|
|
|
|
199
|
0
|
0
|
|
|
|
|
print $head, "\n" if $VERBOSITY > 2; |
|
200
|
|
|
|
|
|
|
|
|
201
|
0
|
|
|
|
|
|
my $base; |
|
202
|
0
|
|
|
|
|
|
eval { $base = $response->base }; |
|
|
0
|
|
|
|
|
|
|
|
203
|
0
|
0
|
0
|
|
|
|
$base or |
|
204
|
|
|
|
|
|
|
carp "$url has no discernible BASE!" and |
|
205
|
|
|
|
|
|
|
next; # no baseless webpages |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# WE SHOULD also look for because some servers that we might want |
|
208
|
|
|
|
|
|
|
# to look at don't properly report the content-type |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# start over unless this is something we can read |
|
211
|
0
|
|
|
|
|
|
my $title = ''; |
|
212
|
0
|
|
|
|
|
|
my $description = ''; |
|
213
|
0
|
|
|
|
|
|
my $is_image = 0; |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Make an exception for images. |
|
216
|
0
|
0
|
|
|
|
|
if ($self->{image_checking}) { |
|
217
|
0
|
0
|
|
|
|
|
if ($head =~ /Content\-Type:\s*image/i) { |
|
218
|
0
|
|
|
|
|
|
my ($img_size) = $head =~ /Content\-Length:\s*(\d+)/i; |
|
219
|
|
|
|
|
|
|
|
|
220
|
0
|
0
|
|
|
|
|
if ($img_size <= 0) { |
|
221
|
0
|
|
|
|
|
|
$title = $description = '404 Not Found'; |
|
222
|
0
|
|
|
|
|
|
next; |
|
223
|
|
|
|
|
|
|
} else { |
|
224
|
0
|
|
|
|
|
|
$is_image = 1; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
} |
|
227
|
|
|
|
|
|
|
} else { |
|
228
|
0
|
0
|
0
|
|
|
|
lc($head) =~ /content-type:\s?(?:text|html)/ or |
|
229
|
|
|
|
|
|
|
carp "$url doesn't look like TEXT or HTML!" and |
|
230
|
|
|
|
|
|
|
next; # no weird media, movies, flash, etc |
|
231
|
|
|
|
|
|
|
} |
|
232
|
|
|
|
|
|
|
|
|
233
|
0
|
0
|
|
|
|
|
( $title ) = $head =~ m,[Tt]itle:\s*(.+)\n, |
|
234
|
|
|
|
|
|
|
unless $title; |
|
235
|
|
|
|
|
|
|
|
|
236
|
0
|
0
|
|
|
|
|
( $description ) = $head =~ |
|
237
|
|
|
|
|
|
|
/[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i |
|
238
|
|
|
|
|
|
|
unless $description; |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Add this link to our dead links list if the title matches |
|
241
|
|
|
|
|
|
|
# a standard "404 Not Found" error. |
|
242
|
0
|
0
|
0
|
|
|
|
if ($title && $self->{report_broken_links}) { |
|
243
|
0
|
0
|
|
|
|
|
push(@{ $self->{broken_links} }, $url) |
|
|
0
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
if $title =~ /^\s*404\s+Not\s+Found\s*$/; |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
|
|
247
|
0
|
0
|
|
|
|
|
$description = $self->_snip($description) if $description; |
|
248
|
|
|
|
|
|
|
|
|
249
|
0
|
0
|
0
|
|
|
|
my $page = $response->content or |
|
250
|
|
|
|
|
|
|
carp "Failed to fetch $url." and |
|
251
|
|
|
|
|
|
|
next; # no empty pages, start over with next url |
|
252
|
|
|
|
|
|
|
|
|
253
|
0
|
|
|
|
|
|
$self->{_current_Bytes} = length($page); |
|
254
|
0
|
|
|
|
|
|
$self->spyder_data($self->{_current_Bytes}); |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# we are going to use a digest to prevent parsing the identical |
|
257
|
|
|
|
|
|
|
# content received via a different url |
|
258
|
0
|
|
|
|
|
|
my $digest = md5_base64($page); # unique microtag of the page |
|
259
|
|
|
|
|
|
|
# so if we've seen it before, start over with the next URL |
|
260
|
0
|
0
|
0
|
|
|
|
$self->{_page_Memory}{$digest}++ and |
|
261
|
|
|
|
|
|
|
carp "Seen this page's content before: $url" |
|
262
|
|
|
|
|
|
|
and next; |
|
263
|
|
|
|
|
|
|
|
|
264
|
0
|
|
|
|
|
|
$self->{_page_content} = $page; |
|
265
|
0
|
0
|
|
|
|
|
print "PARSING: $url\n" if $VERBOSITY > 1; |
|
266
|
0
|
|
|
|
|
|
$self->{_spydered}{$url}++; |
|
267
|
0
|
|
|
|
|
|
$self->html_parser->parse($page); |
|
268
|
0
|
|
|
|
|
|
$self->html_parser->eof; |
|
269
|
|
|
|
|
|
|
|
|
270
|
0
|
0
|
|
|
|
|
$self->{_adjustment} = $self->_parse_for_terms if $self->terms; |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# make links absolute and fix bad spacing in link names, then turn |
|
273
|
|
|
|
|
|
|
# them into an Enqueue object |
|
274
|
0
|
|
|
|
|
|
for my $pair ( @{$self->{_enqueue_Objects}} ) { |
|
|
0
|
|
|
|
|
|
|
|
275
|
0
|
|
|
|
|
|
my $url; |
|
276
|
0
|
|
|
|
|
|
eval { |
|
277
|
0
|
|
|
|
|
|
$url = URI::URL::url($pair->[0], $base)->abs; |
|
278
|
|
|
|
|
|
|
}; |
|
279
|
0
|
|
|
|
|
|
my $name = _snip($pair->[1]); |
|
280
|
0
|
|
|
|
|
|
my $item = WWW::Spyder::Enqueue->new("$url",$name); |
|
281
|
0
|
|
|
|
|
|
$pair = $item; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
# put links into the queue(s) |
|
284
|
0
|
0
|
|
|
|
|
$self->_stack_urls() if $self->_links; |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# clean up text a bit. should this be here...? |
|
287
|
0
|
0
|
0
|
|
|
|
if ( $self->{_text} and ${$self->{_text}} ) { |
|
|
0
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
|
${$self->{_text}} =~ s/(?:\s*[\r\n]){3,}/\n\n/g; |
|
|
0
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# in the future Page object should be installed like parsers as a |
|
292
|
|
|
|
|
|
|
# reusable container |
|
293
|
|
|
|
|
|
|
# return |
|
294
|
0
|
|
0
|
|
|
|
my $Page = |
|
295
|
|
|
|
|
|
|
WWW::Spyder::Page->new( |
|
296
|
|
|
|
|
|
|
title => $title, |
|
297
|
|
|
|
|
|
|
text => $self->{_text}, |
|
298
|
|
|
|
|
|
|
raw => \$page, |
|
299
|
|
|
|
|
|
|
url => $enQ->url, |
|
300
|
|
|
|
|
|
|
domain => $enQ->domain, |
|
301
|
|
|
|
|
|
|
link_name => undef, |
|
302
|
|
|
|
|
|
|
link => undef, |
|
303
|
|
|
|
|
|
|
description => $description || '', |
|
304
|
|
|
|
|
|
|
pages_enQs => $self->_enqueue, |
|
305
|
|
|
|
|
|
|
); |
|
306
|
0
|
|
|
|
|
|
$self->_reset; #<<--clear out things that might remain |
|
307
|
0
|
|
|
|
|
|
return $Page; |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
} |
|
310
|
|
|
|
|
|
|
#-------------------------- |
|
311
|
|
|
|
|
|
|
sub _stack_urls { # should eventually be broken into stack and sift? |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# dual purpose, w/ terms it filters as long as there are no urls |
|
314
|
|
|
|
|
|
|
# passed, otherwise it's setting them to the top of the queues |
|
315
|
0
|
|
|
0
|
|
|
my ($self, @urls) = @_; |
|
316
|
|
|
|
|
|
|
|
|
317
|
0
|
0
|
0
|
|
|
|
print "Stacking " . join(', ', @urls) . "\n" |
|
318
|
|
|
|
|
|
|
if @urls and $VERBOSITY > 5; |
|
319
|
|
|
|
|
|
|
|
|
320
|
0
|
0
|
0
|
|
|
|
if ( $self->terms and not @urls ) { |
|
|
|
0
|
|
|
|
|
|
|
321
|
1
|
|
|
1
|
|
6
|
no warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
1351
|
|
|
322
|
0
|
|
|
|
|
|
my @Qs = $self->_queues; |
|
323
|
0
|
|
|
|
|
|
for my $enQ ( @{$self->_enqueue} ) { |
|
|
0
|
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
my ( $url, $name ) = ( $enQ->url, $enQ->name ); |
|
325
|
|
|
|
|
|
|
|
|
326
|
0
|
0
|
|
|
|
|
next if $self->_seen($url); |
|
327
|
|
|
|
|
|
|
|
|
328
|
0
|
|
|
|
|
|
my $match = 0; |
|
329
|
0
|
|
|
|
|
|
while ( my ($term,$rx) = each %{$self->terms} ) { |
|
|
0
|
|
|
|
|
|
|
|
330
|
0
|
|
|
|
|
|
$match++ for $name =~ /$rx/g; |
|
331
|
|
|
|
|
|
|
} |
|
332
|
0
|
|
|
|
|
|
my $baseQ = 10; |
|
333
|
0
|
|
|
|
|
|
my $adjustment = $self->{_adjustment}; |
|
334
|
0
|
|
|
|
|
|
$baseQ -= $adjustment; # 4 to 0 |
|
335
|
|
|
|
|
|
|
|
|
336
|
0
|
0
|
0
|
|
|
|
push @{$self->{$baseQ}}, $enQ |
|
|
0
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
and next unless $match; |
|
338
|
|
|
|
|
|
|
|
|
339
|
0
|
0
|
|
|
|
|
if ( $VERBOSITY > 1 ) { |
|
340
|
0
|
|
|
|
|
|
print "NAME: $name\n"; |
|
341
|
0
|
|
|
|
|
|
printf " RATIO -->> %d\n", $match; |
|
342
|
|
|
|
|
|
|
} |
|
343
|
0
|
|
|
|
|
|
my $queue_index = sprintf "%d", |
|
344
|
|
|
|
|
|
|
$self->_term_count / $match; |
|
345
|
|
|
|
|
|
|
|
|
346
|
0
|
|
|
|
|
|
$queue_index -= $adjustment; |
|
347
|
0
|
0
|
|
|
|
|
$queue_index = 4 if $queue_index > 4; |
|
348
|
0
|
0
|
|
|
|
|
$queue_index = 0 if $queue_index < 0; |
|
349
|
0
|
|
|
|
|
|
my $queue = $Qs[$queue_index]; |
|
350
|
0
|
0
|
|
|
|
|
if ($VERBOSITY > 2) { |
|
351
|
0
|
|
|
|
|
|
print "Q:$queue [$queue_index] match: $match terms:", |
|
352
|
|
|
|
|
|
|
$self->_term_count, " Adjust: $adjustment\n\n"; |
|
353
|
|
|
|
|
|
|
} |
|
354
|
0
|
|
|
|
|
|
push @{$self->{$queue}}, $enQ; |
|
|
0
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
} |
|
356
|
|
|
|
|
|
|
} elsif ( @urls > 0 ) { |
|
357
|
0
|
|
|
|
|
|
for my $url ( @urls ) { |
|
358
|
0
|
0
|
|
|
|
|
next if $self->_seen($url); |
|
359
|
0
|
|
|
|
|
|
my $queue = $self->_queues; |
|
360
|
0
|
0
|
|
|
|
|
carp "Placing $url in '$queue'\n" if $VERBOSITY > 2; |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
# unshift because seeding is priority |
|
363
|
0
|
|
|
|
|
|
unshift @{$self->{$queue}}, |
|
|
0
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
WWW::Spyder::Enqueue->new($url,undef); |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
} else { |
|
367
|
0
|
|
|
|
|
|
for my $enQ ( @{$self->_enqueue} ) { |
|
|
0
|
|
|
|
|
|
|
|
368
|
0
|
|
|
|
|
|
my ( $url, $name ) = ( $enQ->url, $enQ->name ); |
|
369
|
0
|
0
|
|
|
|
|
next if $self->_seen($url); |
|
370
|
0
|
|
|
|
|
|
my $queue = $self->_queues; |
|
371
|
0
|
|
|
|
|
|
push @{$self->{$queue}}, $enQ; |
|
|
0
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
} |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
#-------------------------- |
|
376
|
|
|
|
|
|
|
sub queue_count { |
|
377
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
|
378
|
0
|
|
|
|
|
|
my $count = 0; |
|
379
|
0
|
|
|
|
|
|
for my $Q ( $self->_queues ) { |
|
380
|
0
|
0
|
|
|
|
|
next unless ref($self->{$Q}) eq 'ARRAY'; |
|
381
|
0
|
|
|
|
|
|
$count += scalar @{$self->{$Q}}; |
|
|
0
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
} |
|
383
|
0
|
|
|
|
|
|
return $count; |
|
384
|
|
|
|
|
|
|
} |
|
385
|
|
|
|
|
|
|
#-------------------------- |
|
386
|
|
|
|
|
|
|
sub spyder_time { |
|
387
|
0
|
|
|
0
|
1
|
|
my ($self,$raw) = @_; |
|
388
|
|
|
|
|
|
|
|
|
389
|
0
|
|
|
|
|
|
my $time = time() - $^T; |
|
390
|
0
|
0
|
|
|
|
|
return $time if $raw; |
|
391
|
|
|
|
|
|
|
|
|
392
|
0
|
|
|
|
|
|
my $day = int( $time / 86400 ); |
|
393
|
0
|
|
|
|
|
|
my $hour = int( $time / 3600 ) % 24; |
|
394
|
0
|
|
|
|
|
|
my $min = int( $time / 60 ) % 60; |
|
395
|
0
|
|
|
|
|
|
my $sec = $time % 60; |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
# also collect slept time! |
|
398
|
0
|
0
|
|
|
|
|
return sprintf "%d day%s %02d:%02d:%02d", |
|
399
|
|
|
|
|
|
|
$day, $day == 1?'':'s', $hour, $min, $sec; |
|
400
|
|
|
|
|
|
|
} |
|
401
|
|
|
|
|
|
|
#-------------------------- |
|
402
|
|
|
|
|
|
|
sub spyder_data { |
|
403
|
0
|
|
|
0
|
1
|
|
my ($self, $bytes) = @_; |
|
404
|
0
|
0
|
0
|
|
|
|
$self->{_bytes_GOT} += $bytes and return $bytes if $bytes; |
|
405
|
|
|
|
|
|
|
|
|
406
|
0
|
0
|
|
|
|
|
return 0 unless $self->{_bytes_GOT}; |
|
407
|
|
|
|
|
|
|
|
|
408
|
0
|
|
|
|
|
|
my $for_commas = int($self->{_bytes_GOT} / 1_024); |
|
409
|
|
|
|
|
|
|
|
|
410
|
0
|
|
|
|
|
|
for ( $for_commas ) { |
|
411
|
0
|
|
|
|
|
|
1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/; |
|
412
|
|
|
|
|
|
|
} |
|
413
|
0
|
|
|
|
|
|
return $for_commas; |
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
#-------------------------- |
|
416
|
|
|
|
|
|
|
sub spydered { |
|
417
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
|
418
|
|
|
|
|
|
|
return wantarray ? |
|
419
|
0
|
|
|
|
|
|
keys %{ $self->{_spydered} } : |
|
|
0
|
|
|
|
|
|
|
|
420
|
0
|
0
|
|
|
|
|
scalar keys %{ $self->{_spydered} }; |
|
421
|
|
|
|
|
|
|
} |
|
422
|
|
|
|
|
|
|
#-------------------------- |
|
423
|
|
|
|
|
|
|
#sub exclude { # what about FILES TYPES!? |
|
424
|
|
|
|
|
|
|
# return undef; # not working yet! |
|
425
|
|
|
|
|
|
|
# my ($self,$thing) = @_; |
|
426
|
|
|
|
|
|
|
# if ( $thing =~ m<^[^:]{3,5}://> ) |
|
427
|
|
|
|
|
|
|
# { |
|
428
|
|
|
|
|
|
|
# return $self->{_Xklood}{_domain}{$thing}++; |
|
429
|
|
|
|
|
|
|
# } |
|
430
|
|
|
|
|
|
|
# elsif ( $thing ) |
|
431
|
|
|
|
|
|
|
# { |
|
432
|
|
|
|
|
|
|
# return $self->{_Xklood}{_name}{$thing}++; |
|
433
|
|
|
|
|
|
|
# } |
|
434
|
|
|
|
|
|
|
#} |
|
435
|
|
|
|
|
|
|
#-------------------------- |
|
436
|
|
|
|
|
|
|
#sub excluded_domains { |
|
437
|
|
|
|
|
|
|
# return undef; # not working yet! |
|
438
|
|
|
|
|
|
|
# my ($self) = @_; |
|
439
|
|
|
|
|
|
|
# return wantarray ? |
|
440
|
|
|
|
|
|
|
# keys %{$self->{_Xklood}{_domain}} : |
|
441
|
|
|
|
|
|
|
# [ keys %{$self->{_Xklood}{_domain}} ]; |
|
442
|
|
|
|
|
|
|
#} |
|
443
|
|
|
|
|
|
|
#-------------------------- |
|
444
|
|
|
|
|
|
|
#sub excluded_names { |
|
445
|
|
|
|
|
|
|
# return undef; # not working yet! |
|
446
|
|
|
|
|
|
|
# my ($self) = @_; |
|
447
|
|
|
|
|
|
|
# return wantarray ? |
|
448
|
|
|
|
|
|
|
# keys %{$self->{_Xklood}{_name}} : |
|
449
|
|
|
|
|
|
|
# [ keys %{$self->{_Xklood}{_name}} ]; |
|
450
|
|
|
|
|
|
|
#} |
|
451
|
|
|
|
|
|
|
#-------------------------- |
|
452
|
|
|
|
|
|
|
sub go_to_seed { |
|
453
|
0
|
|
|
0
|
1
|
|
my ( $self, $engine, $query ) = @_; |
|
454
|
0
|
|
|
|
|
|
carp "go_to_seed() is not functional yet!\n"; |
|
455
|
0
|
|
|
|
|
|
return; # NOT FUNCTIONAL |
|
456
|
0
|
|
|
|
|
|
my $seed = WWW::Spyder::Seed::get_seed($engine, $query); |
|
457
|
0
|
|
|
|
|
|
$self->seed($seed); |
|
458
|
|
|
|
|
|
|
} |
|
459
|
|
|
|
|
|
|
#-------------------------- |
|
460
|
|
|
|
|
|
|
sub verbosity { |
|
461
|
0
|
|
|
0
|
1
|
|
my ( $self, $verbosity ) = @_; |
|
462
|
0
|
0
|
0
|
|
|
|
carp "Not setting verbosity! Must be integer b/t 1 & 6!\n" |
|
463
|
|
|
|
|
|
|
and return |
|
464
|
|
|
|
|
|
|
unless $verbosity; |
|
465
|
0
|
|
|
|
|
|
$VERBOSITY = $verbosity; |
|
466
|
|
|
|
|
|
|
} |
|
467
|
|
|
|
|
|
|
#-------------------------- |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
#-------------------------- |
|
470
|
|
|
|
|
|
|
# PRIVATE Spyder Methods |
|
471
|
|
|
|
|
|
|
#-------------------------- |
|
472
|
|
|
|
|
|
|
sub _reset { |
|
473
|
|
|
|
|
|
|
# RESET MORE THAN THIS!?! make sure all the memory space is clean that |
|
474
|
|
|
|
|
|
|
# needs be for clean iteration??? |
|
475
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
|
476
|
0
|
|
|
|
|
|
$self->{$_} = undef for qw( _linkText _linkSwitch _href _src |
|
477
|
|
|
|
|
|
|
_current_enQ _page_content |
|
478
|
|
|
|
|
|
|
_current_Bytes _alt _enqueue_Objects |
|
479
|
|
|
|
|
|
|
_text ); |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
#-------------------------- |
|
482
|
|
|
|
|
|
|
sub _current_enQ { |
|
483
|
0
|
|
|
0
|
|
|
my ($self, $enQ) = @_; |
|
484
|
0
|
|
|
|
|
|
my $last_enQ = $self->{_current_enQ}; |
|
485
|
0
|
0
|
|
|
|
|
$self->{_current_enQ} = $enQ if $enQ; |
|
486
|
0
|
|
|
|
|
|
return $last_enQ; #<<-so we can get last while setting a new one |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
#-------------------------- |
|
489
|
|
|
|
|
|
|
sub _enqueue { |
|
490
|
0
|
|
|
0
|
|
|
my ($self,$enQ) = @_; |
|
491
|
0
|
0
|
|
|
|
|
push @{$self->{_enqueue_Objects}}, $enQ if $enQ; |
|
|
0
|
|
|
|
|
|
|
|
492
|
0
|
|
|
|
|
|
return $self->{_enqueue_Objects}; |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
#-------------------------- |
|
495
|
|
|
|
|
|
|
sub _links { |
|
496
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
|
497
|
0
|
|
|
|
|
|
return [ map { $_->url } @{$self->_enqueue} ]; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
#-------------------------- |
|
500
|
|
|
|
|
|
|
sub _seen { |
|
501
|
0
|
|
|
0
|
|
|
my ($self,$url) = @_; |
|
502
|
0
|
|
|
|
|
|
return $self->{_seenURLs}{$url}++; |
|
503
|
|
|
|
|
|
|
} |
|
504
|
|
|
|
|
|
|
#-------------------------- |
|
505
|
|
|
|
|
|
|
sub _parse_for_terms { |
|
506
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
|
507
|
0
|
|
|
|
|
|
$self->{_page_terms_matches} = 0; |
|
508
|
|
|
|
|
|
|
|
|
509
|
0
|
0
|
|
|
|
|
return 0 unless $self->{_text}; |
|
510
|
|
|
|
|
|
|
|
|
511
|
0
|
|
|
|
|
|
while ( my ($term,$rx) = each %{$self->terms} ) { |
|
|
0
|
|
|
|
|
|
|
|
512
|
0
|
|
|
|
|
|
$self->{_page_terms_matches}++ for |
|
513
|
|
|
|
|
|
|
$self->{_page_content} =~ /$rx/g; |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
|
my $index = int( ( $self->{_page_terms_matches} / |
|
517
|
|
|
|
|
|
|
length($self->{_text}) ) * 1_000 ); |
|
518
|
|
|
|
|
|
|
# the algorithm might look it but isn't entirely arbitrary |
|
519
|
|
|
|
|
|
|
|
|
520
|
0
|
0
|
|
|
|
|
print " PARSE TERMS : $self->{_page_terms_matches} " . |
|
521
|
|
|
|
|
|
|
"/ $self->{_current_Bytes}\n" if $VERBOSITY > 1; |
|
522
|
|
|
|
|
|
|
|
|
523
|
0
|
0
|
|
|
|
|
return 7 if $index > 25; |
|
524
|
0
|
0
|
|
|
|
|
return 6 if $index > 18; |
|
525
|
0
|
0
|
|
|
|
|
return 5 if $index > 14; |
|
526
|
0
|
0
|
|
|
|
|
return 4 if $index > 11; |
|
527
|
0
|
0
|
|
|
|
|
return 3 if $index > 7; |
|
528
|
0
|
0
|
|
|
|
|
return 2 if $index > 3; |
|
529
|
0
|
0
|
|
|
|
|
return 1 if $index > 0; |
|
530
|
0
|
|
|
|
|
|
return 0; |
|
531
|
|
|
|
|
|
|
} |
|
532
|
|
|
|
|
|
|
#-------------------------- |
|
533
|
|
|
|
|
|
|
sub _install_html_parser { |
|
534
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
my $Parser = HTML::Parser->new |
|
537
|
|
|
|
|
|
|
( |
|
538
|
|
|
|
|
|
|
start_h => |
|
539
|
|
|
|
|
|
|
[sub { |
|
540
|
1
|
|
|
1
|
|
7
|
no warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
508
|
|
|
541
|
0
|
|
|
0
|
|
|
my ( $tag, $attr ) = @_; |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# Check for broken image links if requested. |
|
544
|
0
|
0
|
0
|
|
|
|
return if $tag !~ /^(?:a|img)$/ && ! $self->{image_checking}; |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
# need to deal with AREA tags from maps /^(?:a(?:rea)?|img)$/; |
|
547
|
0
|
|
|
|
|
|
$attr->{href} =~ s,#[^/]*$,,; |
|
548
|
0
|
0
|
|
|
|
|
$attr->{src} =~ s,#[^/]*$,, if $self->{image_checking}; |
|
549
|
0
|
0
|
|
|
|
|
return if lc($attr->{href}) =~ m,^\s*mailto:,; |
|
550
|
0
|
0
|
|
|
|
|
return if lc($attr->{href}) =~ m,^\s*file:,; |
|
551
|
0
|
0
|
|
|
|
|
return if lc($attr->{href}) =~ m,javascript:,; |
|
552
|
|
|
|
|
|
|
|
|
553
|
0
|
0
|
0
|
|
|
|
$self->{_src} ||= $attr->{src} if $self->{image_checking}; |
|
554
|
0
|
|
0
|
|
|
|
$self->{_href} ||= $attr->{href}; |
|
555
|
0
|
|
0
|
|
|
|
$self->{_alt} ||= $attr->{alt}; |
|
556
|
0
|
|
|
|
|
|
$self->{_linkSwitch} = 1; |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# Don't wait for the end handler if we have an image, as an image |
|
559
|
|
|
|
|
|
|
# src tag doesn't have an end. |
|
560
|
0
|
0
|
0
|
|
|
|
if ($attr->{src} && $self->{image_checking} && ! $attr->{href}) { |
|
|
|
|
0
|
|
|
|
|
|
561
|
0
|
|
0
|
|
|
|
$self->{_linkText} ||= $self->{_alt} || '+'; |
|
|
|
|
0
|
|
|
|
|
|
562
|
0
|
|
|
|
|
|
decode_entities($self->{_linkText}); |
|
563
|
|
|
|
|
|
|
|
|
564
|
0
|
|
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
|
0
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
[ $self->{_href}, $self->{_linkText} ]; |
|
566
|
|
|
|
|
|
|
|
|
567
|
0
|
0
|
0
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
|
0
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
[ $self->{_src}, $self->{_linkText} ] |
|
569
|
|
|
|
|
|
|
if $self->{_src} and $self->{image_checking}; |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# reset all our caching variables |
|
572
|
0
|
|
|
|
|
|
$self->{_linkSwitch} = $self->{_href} = $self->{_alt} = |
|
573
|
|
|
|
|
|
|
$self->{_src} = $self->{_linkText} = undef; |
|
574
|
|
|
|
|
|
|
|
|
575
|
0
|
|
|
|
|
|
return; |
|
576
|
|
|
|
|
|
|
} |
|
577
|
|
|
|
|
|
|
}, 'tagname, attr'], |
|
578
|
|
|
|
|
|
|
text_h => |
|
579
|
|
|
|
|
|
|
[sub { |
|
580
|
|
|
|
|
|
|
|
|
581
|
0
|
0
|
|
0
|
|
|
return unless(my $it = shift); |
|
582
|
0
|
0
|
|
|
|
|
return if $it =~ |
|
583
|
|
|
|
|
|
|
m/(?:\Q\E)/; |
|
584
|
0
|
|
|
|
|
|
${$self->{_text}} .= $it; |
|
|
0
|
|
|
|
|
|
|
|
585
|
0
|
0
|
|
|
|
|
$self->{_linkText} .= $it |
|
586
|
|
|
|
|
|
|
if $self->{_linkSwitch}; |
|
587
|
|
|
|
|
|
|
}, 'dtext'], |
|
588
|
|
|
|
|
|
|
end_h => |
|
589
|
|
|
|
|
|
|
[sub { |
|
590
|
0
|
|
|
0
|
|
|
my ( $tag ) = @_; |
|
591
|
1
|
|
|
1
|
|
13
|
no warnings; # only problem: Links |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
435
|
|
|
592
|
|
|
|
|
|
|
|
|
593
|
0
|
0
|
|
|
|
|
if ($self->{image_checking}) { |
|
594
|
0
|
0
|
0
|
|
|
|
return unless $tag eq 'a' or $self->{_linkSwitch} or |
|
|
|
|
0
|
|
|
|
|
|
595
|
|
|
|
|
|
|
$tag eq 'img'; |
|
596
|
|
|
|
|
|
|
} else { |
|
597
|
0
|
0
|
0
|
|
|
|
return unless $tag eq 'a' or $self->{_linkSwitch}; |
|
598
|
|
|
|
|
|
|
} |
|
599
|
|
|
|
|
|
|
|
|
600
|
0
|
|
0
|
|
|
|
$self->{_linkText} ||= $self->{_alt} || '+'; |
|
|
|
|
0
|
|
|
|
|
|
601
|
0
|
|
|
|
|
|
decode_entities($self->{_linkText}); |
|
602
|
|
|
|
|
|
|
|
|
603
|
0
|
|
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
|
0
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
[ $self->{_href}, $self->{_linkText} ]; |
|
605
|
|
|
|
|
|
|
|
|
606
|
0
|
0
|
0
|
|
|
|
push @{$self->{_enqueue_Objects}}, |
|
|
0
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
[ $self->{_src}, $self->{_linkText} ] |
|
608
|
|
|
|
|
|
|
if $self->{_src} and $self->{image_checking}; |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
# reset all our caching variables |
|
611
|
0
|
|
|
|
|
|
$self->{_linkSwitch} = $self->{_href} = $self->{_alt} = $self->{_src} = |
|
612
|
|
|
|
|
|
|
$self->{_linkText} = undef; |
|
613
|
0
|
|
|
|
|
|
}, 'tagname'], |
|
614
|
|
|
|
|
|
|
default_h => [""], |
|
615
|
|
|
|
|
|
|
); |
|
616
|
0
|
|
|
|
|
|
$Parser->ignore_elements(qw(script style)); |
|
617
|
0
|
|
|
|
|
|
$Parser->unbroken_text(1); |
|
618
|
0
|
|
|
|
|
|
$self->html_parser($Parser); |
|
619
|
|
|
|
|
|
|
} |
|
620
|
|
|
|
|
|
|
#-------------------------- |
|
621
|
|
|
|
|
|
|
sub _install_web_agent { |
|
622
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
623
|
0
|
|
|
|
|
|
my $jar_jar = undef; |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
# If a LWP::UserAgent object was passed in to our constructor, use |
|
626
|
|
|
|
|
|
|
# it. |
|
627
|
0
|
0
|
|
|
|
|
if ($self->{UA}) { |
|
628
|
0
|
|
|
|
|
|
$self->UA( $self->{UA} ); |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# Otherwise, create a new one. |
|
631
|
|
|
|
|
|
|
} else { |
|
632
|
0
|
|
|
|
|
|
$self->UA( LWP::UserAgent->new ); |
|
633
|
|
|
|
|
|
|
} |
|
634
|
|
|
|
|
|
|
|
|
635
|
0
|
|
|
|
|
|
$self->UA->agent('Mozilla/5.0'); |
|
636
|
0
|
|
|
|
|
|
$self->UA->timeout(30); |
|
637
|
0
|
|
|
|
|
|
$self->UA->max_size(250_000); |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
# Get our cookie from our the jar passed in. |
|
640
|
0
|
0
|
|
|
|
|
if ($self->{UA}) { |
|
641
|
0
|
|
|
|
|
|
$jar_jar = $self->{UA}->cookie_jar(); |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
# Or else create a new cookie. |
|
644
|
|
|
|
|
|
|
} else { |
|
645
|
0
|
|
0
|
|
|
|
$jar_jar = HTTP::Cookies->new |
|
646
|
|
|
|
|
|
|
(file => $self->cookie_file || "$ENV{HOME}/spyderCookies", |
|
647
|
|
|
|
|
|
|
autosave => 1, |
|
648
|
|
|
|
|
|
|
max_cookie_size => 4096, |
|
649
|
|
|
|
|
|
|
max_cookies_per_domain => 5, ); |
|
650
|
|
|
|
|
|
|
} |
|
651
|
|
|
|
|
|
|
|
|
652
|
0
|
|
|
|
|
|
$self->UA->cookie_jar($jar_jar); |
|
653
|
|
|
|
|
|
|
} |
|
654
|
|
|
|
|
|
|
#-------------------------- |
|
655
|
|
|
|
|
|
|
sub _install_exit_check {

    # Install a per-class _exit_check() method that the crawl loop calls
    # to decide when to stop. With no exit_on hash the check only fires
    # when the URL queues are empty; otherwise the method body is built
    # up as source text and eval'd so the configured limits end up
    # hardcoded inside it.
    #
    # NOTE(review): the closure/eval'd sub captures this particular
    # $self while being installed as a class-wide method -- with more
    # than one spyder instance the last one installed wins; confirm
    # single-instance use is intended.
    my ($self, $arg) = @_;
    my $class = ref $self;

    unless ( ref($arg) and ref($arg->{exit_on}) eq 'HASH' ) {
        no strict "refs";
        *{$class."::_exit_check"} = sub {
            return 1 unless $self->queue_count;
            return 0;
        };
        return;
    }

    # checks can be: links => #, success => ratio, time => 10min...
    # a piece of code we're going to build up to eval into method-hood
    my $SUB = 'sub { my $self = shift; '
            . 'return 1 unless $self->queue_count; ';
    #------------------------------------------------------------
    if ( $arg->{exit_on}{pages} ) {
        print "Installing EXIT on links: $arg->{exit_on}{pages}\n"
            if $VERBOSITY > 1;
        $SUB .= ' return 1 if '
              . '$self->spydered >= ' . $arg->{exit_on}{pages} . ';';
    }
    #------------------------------------------------------------
    if ( $arg->{exit_on}{success} ) {
        # TODO: set necessary obj value and add to sub code
    }
    #------------------------------------------------------------
    if ( $arg->{exit_on}{time} ) {
        print "Installing EXIT on time: $arg->{exit_on}{time}\n"
            if $VERBOSITY > 1;

        # "10min", "2 hours", "30secs" -> amount + unit; the non-greedy
        # \w+? together with the trailing s? strips a final "s"
        # (hours -> hour, mins -> min, secs -> sec).
        my ($amount, $unit) =
            $arg->{exit_on}{time} =~ /^(\d+)\W*(\w+?)s?$/;

        my %times = ( hour => 3600,
                      min  => 60,
                      sec  => 1 );

        # Direct lookup replaces a former no-op loop over keys %times
        # that recomputed the identical product on every iteration.
        # Unrecognized units (e.g. "minutes" -> "minute") still fall
        # through to a factor of 0 -- an immediate deadline -- but now
        # warn instead of failing silently.
        my $time_factor = 0;
        if ( defined $unit and exists $times{$unit} ) {
            $time_factor = $amount * $times{$unit};
        }
        else {
            carp "Unrecognized time unit in '$arg->{exit_on}{time}'; "
               . "expected hour(s), min(s), or sec(s)";
        }
        $self->_exit_epoch($time_factor + $^T);

        $SUB .= q{
            return 1 if $self->_exit_epoch < time();
        };
    }
    #------------------------------------------------------------
    $SUB .= '}';

    no strict "refs";
    *{$class."::_exit_check"} = eval $SUB;
    # A syntax error in the assembled source used to be swallowed,
    # leaving _exit_check undef and crashing later at call time.
    croak "Could not install _exit_check: $@" if $@;
}
|
713
|
|
|
|
|
|
|
#-------------------------- |
|
714
|
|
|
|
|
|
|
sub _choose_courteously {
    my $self = shift;

    # Walk the priority queues (best first) looking for the first
    # enqueued URL whose domain has NOT been fetched recently; entries
    # that fail the courtesy check are demoted deeper into their queue
    # rather than taken.
    #
    # w/o the switch and $i-- it acts a bit more depth first. w/ it, it's
    # basically hard head down breadth first
    print "CHOOSING courteously!\n" if $VERBOSITY > 1;
    for my $Q ( $self->_queues ) {
        print "Looking for URL in $Q\n" if $VERBOSITY > 2;
        next unless $self->{$Q} and @{$self->{$Q}} > 0;
        my %seen;
        my $total = scalar @{$self->{$Q}};
        my $switch;
        for ( my $i = 0; $i < @{$self->{$Q}}; $i++ ) {
            my $enQ = $self->{$Q}[$i];
            # $name is extracted alongside $url but only $url is used
            # below (kept for parity with the original).
            my ($url, $name) = ( $enQ->url, $enQ->name );

            # if we see one again, we've reshuffled as much as is useful
            $seen{$url}++
                and $switch = 1;    # progress through to next Q

            # A zero courtesy "tax" means the domain is fresh: remove
            # the entry from the queue and hand it to the caller.
            return splice(@{$self->{$Q}}, $i, 1)
                unless $self->_courtesy_call($enQ);

            # How far back to demote a rejected entry; shrinks
            # (log base 1.5 of the remaining count) as we near the end.
            my $fair_bump = int( log( $total - $i ) / log(1.5) );

            my $move_me_back = splice(@{$self->{$Q}}, $i, 1);
            splice(@{$self->{$Q}}, ($i + $fair_bump), 0, $move_me_back);

            # Re-examine slot $i (a new entry slid into it) until we've
            # cycled once; after that, advance so the loop terminates.
            $i-- unless $switch;
        }
    }
    # we couldn't pick one courteously
    # NOTE(review): falls off the end here (no explicit return);
    # presumably callers treat the empty result as "no courteous
    # pick available" -- confirm against the crawl loop.
} # end of _choose_courteously()
|
746
|
|
|
|
|
|
|
#-------------------------- |
|
747
|
|
|
|
|
|
|
sub _just_choose {

    # Fallback picker: take the first available entry from the highest
    # priority queue, then nap long enough to stay polite to the
    # entry's domain. Returns the chosen Enqueue object (or undef when
    # every queue is empty).
    my $self = shift;
    print "CHOOSING first up!\n" if $VERBOSITY > 1;

    my $enQ;
    for my $Q ( $self->_queues ) {
        # Skip queues that aren't initialized OR hold no entries; the
        # old code stopped at the first array even when it was empty,
        # returning nothing while later queues still held URLs.
        next unless ref($self->{$Q}) eq 'ARRAY' and @{$self->{$Q}};
        $enQ = shift @{$self->{$Q}};
        last;
    }

    # Courtesy "tax": how often this domain was hit recently (0-10),
    # added directly to the nap below.
    my $tax = $self->_courtesy_call($enQ);
    if ( $VERBOSITY > 4 ) {
        print ' QUEUE: ';
        print join("-:-", @{$self->{_courtesy_Queue}}), "\n"
            if $self->{_courtesy_Queue};
    }

    # Randomized so the crawl doesn't look mechanical in server logs.
    my $sleep = int(rand($self->sleep_base)) + $tax;

    if ( $VERBOSITY ) {
        printf "COURTESY NAP %d second%s ",
            $sleep, $sleep == 1 ? '' : 's';
        printf "(Domain recently seen: %d time%s)\n",
            $tax, $tax == 1 ? '' : 's';
    }
    sleep $sleep;   # courtesy to websites but human-ish w/ random
    $self->slept($sleep);
    return $enQ;
}
|
775
|
|
|
|
|
|
|
#-------------------------- |
|
776
|
|
|
|
|
|
|
sub _courtesy_call {

    # Return how many times this entry's domain appears in the recent
    # fetch history (capped at 10); 0 means "fresh, fetch away". The
    # count doubles as extra seconds of courtesy sleep for the caller.
    my ($self, $enQ) = @_;
    return 0 unless $enQ;
    my $domain = $enQ->domain;

    print 'COURTESY check: ', $domain, "\n" if $VERBOSITY > 5;

    # yes, we have seen it in the last whatever GETs. Guard the deref:
    # before the first fetch _courtesy_Queue may not exist yet, and
    # dereferencing undef as an array is a fatal runtime error.
    my $seen = scalar grep { $_ eq $domain }
                      @{ $self->{_courtesy_Queue} || [] };
    $seen = 10 if $seen > 10;
    return $seen;
}
|
790
|
|
|
|
|
|
|
#-------------------------- |
|
791
|
|
|
|
|
|
|
sub _queues { # Q9 is purely for trash so it's not returned here
    # Queue hash-key names in priority order. List context gets every
    # key; scalar context gets only the top-priority key '0'.
    #
    # NOTE(review): despite the comment above, the list DOES include 9
    # (0 .. 9). Either the comment or the range is stale -- confirm
    # whether the trash queue Q9 should be excluded before changing
    # anything here.
    return wantarray ?
        ( 0 .. 9 ) :
        '0';
}
|
796
|
|
|
|
|
|
|
#-------------------------- |
|
797
|
|
|
|
|
|
|
sub _snip {

    # Normalize whitespace: trim leading/trailing runs and collapse
    # internal runs to single spaces. Callable as a method or a plain
    # function. Returns the whole list in list context, the first
    # string in scalar context.
    #
    # The old "my $self = shift if ref($_[0])" mixed a lexical
    # declaration with a statement modifier, which perlsyn documents
    # as undefined behavior; a bare conditional shift does the same
    # job safely.
    shift if ref $_[0];
    my @text = @_;
    for my $piece (@text) {
        $piece =~ s/^\s+//;
        $piece =~ s/\s+$//;
        $piece =~ s/\s+/ /g;
    }
    return wantarray ? @text : shift @text;
}
|
803
|
|
|
|
|
|
|
#-------------------------- |
|
804
|
|
|
|
|
|
|
# Spyder ENDS |
|
805
|
|
|
|
|
|
|
#-------------------------- |
|
806
|
|
|
|
|
|
|
}# WWW::Spyder privacy ends |
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
#-------------------------- |
|
810
|
|
|
|
|
|
|
package WWW::Spyder::Enqueue;
#--------------------------
{
    use Carp;
    #---------------------------------------------------------------------
    # Lightweight queue entry: a blessed array holding a URL, the link
    # text it was found under, and its lowercased domain. Interpolating
    # the object stringifies to its URL via the overload below.
    use overload( q{""} => '_stringify',
                  fallback => 1 );
    #---------------------------------------------------------------------
    # 0 -->> URL
    # 1 -->> name, if any, of link URL was got from
    # 2 -->> domain
    #--------------------------
    # new($url [, $name]): build an entry. Deliberately returns scalar
    # undef (not an empty list) when no URL is supplied.
    sub new {
        my ( $caller, $url, $name ) = @_;
        my $class = ref($caller) || $caller;
        croak "Here I am. " if ref $url;
        return undef unless $url;
        # Cap absurdly long URLs/names at 512 chars.
        if ( length($url) > 512 ) {
            $url = substr($url, 0, 512);
        }
        if ( $name and length($name) > 512 ) {
            # Fixed: this used to truncate $url, clobbering the link
            # name with the first 509 chars of the URL instead.
            $name = substr($name, 0, 509) . '...';
        }
        $name = '-' unless $name;   # need this to find a bug later
        # Domain = host part of scheme://host/...; strings without a
        # host part (e.g. mailto:) leave $domain undefined, hence the
        # guard below (avoids an uninitialized-value warning from lc).
        my ( $domain ) = $url =~ m,^[^:]+:/+([^/]+),;
        $domain = defined $domain ? lc $domain : '';
        bless [ $url, $name, $domain ], $class;
    }
    #--------------------------
    # Accessor: the URL.
    sub url {
        return $_[0]->[0];
    }
    #--------------------------
    # Accessor: the link text the URL was found under ('-' if none).
    sub name {
        return $_[0]->[1];
    }
    #--------------------------
    # Accessor: the lowercased domain ('' if no host part was found).
    sub domain {
        return $_[0]->[2];
    }
    #--------------------------
    # overload hook for q{""} stringification.
    sub _stringify {
        return $_[0]->[0];
    }
    #--------------------------
}#privacy for WWW::Spyder::Enqueue ends
|
855
|
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
#-------------------------- |
|
858
|
|
|
|
|
|
|
package WWW::Spyder::Page;
#--------------------------
use strict;
use warnings;
use Carp;
{
    # A crawled page. new() takes a flat hash of page data (title,
    # text, raw, url, description, pages_enQs => [...], etc.) and
    # installs an accessor for each key on the fly, so the attribute
    # set is whatever the crawler chooses to store. Accessors are
    # installed into the class symbol table, so they persist across
    # instances once created.
    sub new {
        my ( $caller, %arg ) = @_;
        my $class = ref($caller) || $caller;
        my $self = bless {}, $class;

        while ( my ( $method, $val ) = each %arg ) {
            no strict "refs";
            no warnings;
            my $attribute = '_' . $method;

            # Build the glob name by explicit concatenation: the old
            # interpolated form "$class::$method" never reliably
            # included $class (Perl parses "$class::" as a different,
            # package-qualified variable) and only worked by accident
            # because this code compiles inside the target package.
            if ( ref $val eq 'ARRAY' ) {
                # List attribute: a no-arg call returns the list, an
                # arrayref argument appends its elements.
                *{ $class . '::' . $method } = sub {
                    my ($self, $arg) = @_;
                    return @{ $self->{$attribute} } unless $arg;
                    push @{ $self->{$attribute} }, @{$arg};
                };
            }
            else {
                # Scalar attribute: set once; a false argument never
                # overwrites an existing value.
                *{ $class . '::' . $method } = sub {
                    my ($self, $arg) = @_;
                    # get if already set and deref if needed
                    if ( not $arg and exists $self->{$attribute} ) {
                        return ref($self->{$attribute}) eq 'SCALAR' ?
                            ${ $self->{$attribute} } : $self->{$attribute};
                    }
                    $self->{$attribute} = $arg if $arg;
                };
            }
            $self->$method($val);
        }
        return $self;
    }
    #--------------------------
    # All link URLs found on the page, in order. Guarded so a page
    # with no links returns an empty list instead of dying on an
    # undef dereference.
    sub links {
        my ( $self ) = @_;
        return map { $_->url } @{ $self->{_pages_enQs} || [] };
    }
    #--------------------------
    # Destructively return the next Enqueue object (undef when the
    # page has no links left, or never had any).
    sub next_link {
        my ( $self ) = @_;
        return unless $self->{_pages_enQs};
        shift @{ $self->{_pages_enQs} };
    }
    #--------------------------
}#privacy for ::Page ends
|
908
|
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
#-------------------------- |
|
911
|
|
|
|
|
|
|
package WWW::Spyder::Exclusions;
#--------------------------
{
    # THIS PACKAGE IS NOT BEING USED
    # Blacklist of ad/tracker domains (and, eventually, link names)
    # the spyder should never follow.
    my %_domains = qw(
                      ad.doubleclick.net 1
                      ads.clickagents.com 1
                     );
    my %_names = qw(
                   );
    #--------------------------
    # Add a domain to the exclusion list.
    sub exclude_domain {
        # Fixed: inside a hash subscript a bare "shift" was taken as
        # the literal string key "shift", not a call to shift(), so
        # the argument was silently ignored.
        $_domains{ shift() }++;
    }
    #--------------------------
    # True if the given domain (or link name) is excluded.
    sub excluded {
        my $what = shift;
        exists $_domains{$what} || $_names{$what};
    }
    #--------------------------
}#privacy ends
|
933
|
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
#-------------------------- |
|
936
|
|
|
|
|
|
|
package WWW::Spyder::Seed;
#--------------------------
{
    use URI::Escape;   # NOTE: CPAN (libwww-perl) module, not core Perl
    use Carp;

    # Search-engine query templates; a seed URL is template + escaped
    # query terms.
    #
    # NOTE(review): the yahoo entry is a placeholder ("1"), so
    # get_seed('yahoo', ...) currently returns the junk string
    # "1<query>" instead of croaking -- needs a real URL or removal.
    my %engine_url =
        (
         google => 'http://www.google.com/search?q=',
         yahoo => 1
        );

    # should we exclude the search domain at this point? i think so because
    # otherwise we've introduced dozens of erroneous links and the engine
    # is gonna get hammered over time for it
    #--------------------------
    # get_seed($engine, $query): build a crawl seed URL by appending
    # the URI-escaped query to the named engine's template (engine name
    # is matched case-insensitively). Croaks when the engine name is
    # missing/unknown or the query is missing/false.
    sub get_seed {

        my $engine = shift || croak "Must provide search engine! " .
            join(', ', sort keys %engine_url) . "\n";

        my $query = shift || croak "Must provide query terms!\n";
        $query = uri_escape($query);

        croak "$engine is not a valid choice!\n"
            unless exists $engine_url{lc$engine};

        return $engine_url{lc$engine} . $query;
    }

} # Privacy for WWW::Spyder::Seed ends
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
1; |
|
969
|
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
# Plain Old D'errrrr |
|
971
|
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
=pod |
|
973
|
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
=head1 NAME |
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
WWW::Spyder - a simple non-persistent web crawler. |
|
977
|
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
=head1 VERSION |
|
979
|
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
0.24 |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
983
|
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
A web spider that returns plain text, HTML, and other information per |
|
985
|
|
|
|
|
|
|
page crawled and can determine what pages to get and parse based on |
|
986
|
|
|
|
|
|
|
supplied terms compared to the text in links as well as page content. |
|
987
|
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
use WWW::Spyder; |
|
989
|
|
|
|
|
|
|
# Supply your own LWP::UserAgent-compatible agent. |
|
990
|
|
|
|
|
|
|
use WWW::Mechanize; |
|
991
|
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
my $start_url = "http://my-great-domain.com/"; |
|
993
|
|
|
|
|
|
|
my $mech = WWW::Mechanize->new(agent => "PreferredAgent/0.01") |
|
994
|
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new( |
|
996
|
|
|
|
|
|
|
report_broken_links => 1, |
|
997
|
|
|
|
|
|
|
seed => $start_url, |
|
998
|
|
|
|
|
|
|
sleep_base => 5, |
|
999
|
|
|
|
|
|
|
UA => $mech |
|
1000
|
|
|
|
|
|
|
); |
|
1001
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
|
1002
|
|
|
|
|
|
|
# do something with the page... |
|
1003
|
|
|
|
|
|
|
} |
|
1004
|
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
=head1 METHODS |
|
1006
|
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=over 2 |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
=item * $spyder->new() |
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
Construct a new spyder object. Without at least the seed() set, or |
|
1012
|
|
|
|
|
|
|
go_to_seed() turned on, the spyder isn't ready to crawl. |
|
1013
|
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
$spyder = WWW::Spyder->new(shift||die"Gimme a URL!\n"); |
|
1015
|
|
|
|
|
|
|
# ...or... |
|
1016
|
|
|
|
|
|
|
$spyder = WWW::Spyder->new( %options ); |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
Options include: sleep_base (in seconds), exit_on (hash of methods and |
|
1019
|
|
|
|
|
|
|
settings), report_broken_links, image_checking (verifies the images pointed to |
|
1020
|
|
|
|
|
|
|
by tags, disable_cnap (disables the courtesy nap when verbose |
|
1021
|
|
|
|
|
|
|
output is enabled), and UA (you can pass in an instantiated LWP::UserAgent |
|
1022
|
|
|
|
|
|
|
object via UA, i.e. UA => $ua_obj). Examples below. |
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
=item * $spyder->seed($url) |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
Adds a URL (or URLs) to the top of the queues for crawling. If the |
|
1027
|
|
|
|
|
|
|
spyder is constructed with a single scalar argument, that is considered |
|
1028
|
|
|
|
|
|
|
the seed. |
|
1029
|
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
=item * $spyder->bell([bool]) |
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
This will print a bell ("\a") to STDERR on every successfully crawled |
|
1033
|
|
|
|
|
|
|
page. It might seem annoying but it is an excellent way to know your |
|
1034
|
|
|
|
|
|
|
spyder is behaving and working. True value turns it on. Right now it |
|
1035
|
|
|
|
|
|
|
can't be turned off. |
|
1036
|
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
=item * $spyder->spyder_time([bool]) |
|
1038
|
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
Returns raw seconds since I was created if given a |
|
1040
|
|
|
|
|
|
|
boolean value, otherwise returns "D day(s) HH::MM:SS." |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
=item * $spyder->terms([list of terms to match]) |
|
1043
|
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
The more terms, the more the spyder is going to grasp at. If you give |
|
1045
|
|
|
|
|
|
|
a straight list of strings, they will be turned into very open |
|
1046
|
|
|
|
|
|
|
regexes. E.g.: "king" would match "sulking" and "kinglet" but not |
|
1047
|
|
|
|
|
|
|
"King." It is case sensitive right now. If you want more specific |
|
1048
|
|
|
|
|
|
|
matching or different behavior, pass your own regexes instead of |
|
1049
|
|
|
|
|
|
|
strings. |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
$spyder->terms( qr/\bkings?\b/i, qr/\bqueens?\b/i ); |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
terms() is only settable once right now, then it's a done deal. |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
=item * $spyder->spyder_data() |
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
A comma formatted number of kilobytes retrieved so far. B<Don't> give
|
1058
|
|
|
|
|
|
|
it an argument. It's a set/get routine. |
|
1059
|
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
=item * $spyder->slept() |
|
1061
|
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
Returns the total number of seconds the spyder has slept while |
|
1063
|
|
|
|
|
|
|
running. Useful for getting accurate page/time counts (spyder |
|
1064
|
|
|
|
|
|
|
performance) discounting the added courtesy naps. |
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=item * $spyder->UA->... |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
The user agent. It should be an L<LWP::UserAgent> or a well-behaved
|
1069
|
|
|
|
|
|
|
subclass like L<WWW::Mechanize>. Here are the initialized values you
|
1070
|
|
|
|
|
|
|
might want to tweak- |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
$spyder->UA->timeout(30); |
|
1073
|
|
|
|
|
|
|
$spyder->UA->max_size(250_000); |
|
1074
|
|
|
|
|
|
|
$spyder->UA->agent('Mozilla/5.0'); |
|
1075
|
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
Changing the agent name can hurt your spyder because some servers won't |
|
1077
|
|
|
|
|
|
|
return content unless it's requested by a "browser" they recognize. |
|
1078
|
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
You should probably add your email with from() as well. |
|
1080
|
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
$spyder->UA->from('bluefintuna@fish.net'); |
|
1082
|
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=item * $spyder->cookie_file([local_file]) |
|
1084
|
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
They live in $ENV{HOME}/spyderCookie by default but you can set your |
|
1086
|
|
|
|
|
|
|
own file if you prefer or want to save different cookie files for |
|
1087
|
|
|
|
|
|
|
different spyders. |
|
1088
|
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=item * $spyder->get_broken_links |
|
1090
|
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
Returns a reference to a list of broken link URLs if report_broken_links was |
|
1092
|
|
|
|
|
|
|
enabled in the constructor.
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
=item * $spyder->go_to_seed |
|
1095
|
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
=item * $spyder->queue_count |
|
1097
|
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
=item * $spyder->show_attributes |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=item * $spyder->spydered |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=item * $spyder->crawl |
|
1103
|
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
Returns (and removes) a Spyder page object from the queue of spydered pages. |
|
1105
|
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
=back |
|
1107
|
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
=head2 Spyder::Page methods
|
1109
|
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
=over 6 |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
=item * $page->title |
|
1113
|
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
=item * $page->text |
|
1115
|
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
=item * $page->raw |
|
1117
|
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
=item * $page->url |
|
1119
|
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
=item * $page->domain |
|
1121
|
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
=item * $page->link_name |
|
1123
|
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
=item * $page->link |
|
1125
|
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
=item * $page->description |
|
1127
|
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
=item * $page->pages_enQs |
|
1129
|
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
=back |
|
1131
|
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
=head2 Weird courteous behavior |
|
1133
|
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
Courtesy didn't used to be weird, but that's another story. You will |
|
1135
|
|
|
|
|
|
|
probably notice that the courtesy routines force a sleep when a |
|
1136
|
|
|
|
|
|
|
recently seen domain is the only choice for a new link. The sleep is |
|
1137
|
|
|
|
|
|
|
partially randomized. This is to prevent the spyder from being |
|
1138
|
|
|
|
|
|
|
recognized in weblogs as a robot. |
|
1139
|
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
=head2 The web and courtesy |
|
1141
|
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
B, I beg of thee, exercise the most courtesy you can. Don't |
|
1143
|
|
|
|
|
|
|
let impatience get in the way. Bandwidth and server traffic are |
|
1144
|
|
|
|
|
|
|
C<$MONEY> for real. The web is an extremely disorganized and corrupted |
|
1145
|
|
|
|
|
|
|
database at the root but companies and individuals pay to keep it |
|
1146
|
|
|
|
|
|
|
available. The less pain you cause by banging away on a webserver with |
|
1147
|
|
|
|
|
|
|
a web agent, the more welcome the next web agent will be. |
|
1148
|
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
B<Note>: Google seems to be excluding generic LWP agents now. See, I
|
1150
|
|
|
|
|
|
|
told you so. A single parallel robot can really hammer a major server, |
|
1151
|
|
|
|
|
|
|
even someone with as big a farm and as much bandwidth as Google. |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
=head2 VERBOSITY |
|
1154
|
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
=over 2 |
|
1156
|
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
=item * $spyder->verbosity([1-6]) -OR- |
|
1158
|
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
=item * $WWW::Spyder::VERBOSITY = ... |
|
1160
|
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
Set it from 1 to 6 right now to get varying amounts of extra info to |
|
1162
|
|
|
|
|
|
|
STDOUT. It's an uneven scale and will be straightened out pretty soon. |
|
1163
|
|
|
|
|
|
|
If kids have a preference for sending the info to STDERR, I'll do |
|
1164
|
|
|
|
|
|
|
that. I might anyway. |
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
=back |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=head1 SAMPLE USAGE |
|
1169
|
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
=head2 See "spyder-mini-bio" in this distribution |
|
1171
|
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
It's an extremely simple, but fairly cool pseudo bio-researcher. |
|
1173
|
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
=head2 Simple continually crawling spyder: |
|
1175
|
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
In the following code snippet: |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
use WWW::Spyder; |
|
1179
|
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new( shift || die"Give me a URL!\n" ); |
|
1181
|
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
|
1183
|
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
print '-'x70,"\n"; |
|
1185
|
|
|
|
|
|
|
print "Spydering: ", $page->title, "\n"; |
|
1186
|
|
|
|
|
|
|
print " URL: ", $page->url, "\n"; |
|
1187
|
|
|
|
|
|
|
print " Desc: ", $page->description || 'n/a', "\n"; |
|
1188
|
|
|
|
|
|
|
print '-'x70,"\n"; |
|
1189
|
|
|
|
|
|
|
while ( my $link = $page->next_link ) { |
|
1190
|
|
|
|
|
|
|
printf "%22s ->> %s\n", |
|
1191
|
|
|
|
|
|
|
length($link->name) > 22 ? |
|
1192
|
|
|
|
|
|
|
substr($link->name,0,19).'...' : $link->name, |
|
1193
|
|
|
|
|
|
|
length($link) > 43 ? |
|
1194
|
|
|
|
|
|
|
substr($link,0,40).'...' : $link; |
|
1195
|
|
|
|
|
|
|
} |
|
1196
|
|
|
|
|
|
|
} |
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
as long as unique URLs are being found in the pages crawled, the |
|
1199
|
|
|
|
|
|
|
spyder will never stop. |
|
1200
|
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
Each "crawl" returns a page object which gives the following methods |
|
1202
|
|
|
|
|
|
|
to get information about the page. |
|
1203
|
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
=over 2 |
|
1205
|
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
=item * $page->links |
|
1207
|
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
URLs found on the page. |
|
1209
|
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
=item * $page->title |
|
1211
|
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
Page's Title if there is one. |
|
1213
|
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
=item * $page->text |
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
The parsed plain text out of the page. Uses HTML::Parser and tries to |
|
1217
|
|
|
|
|
|
|
ignore non-readable stuff like comments and scripts. |
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
=item * $page->url |
|
1220
|
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
=item * $page->domain |
|
1222
|
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=item * $page->raw |
|
1224
|
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
The content returned by the server. Should be HTML. |
|
1226
|
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
=item * $page->description |
|
1228
|
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
The META description of the page if there is one. |
|
1230
|
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
=item * $page->links |
|
1232
|
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
Returns a list of the URLs in the page. Note: next_link() will shift |
|
1234
|
|
|
|
|
|
|
the available list of links() each time it's called. |
|
1235
|
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
=item * $link = $page->next_link |
|
1237
|
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
next_link() destructively returns the next URI-ish object in the page. |
|
1239
|
|
|
|
|
|
|
They are objects with three accessors. |
|
1240
|
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
=back |
|
1242
|
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
=over 6 |
|
1244
|
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
=item * $link->url |
|
1246
|
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
This is also overloaded so that interpolating "$link" will get the |
|
1248
|
|
|
|
|
|
|
URL just as the method does. |
|
1249
|
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
=item * $link->name |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
=item * $link->domain |
|
1253
|
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
=back |
|
1255
|
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
=head2 Spyder that will give up the ghost... |
|
1257
|
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
The following spyder is initialized to stop crawling when I of |
|
1259
|
|
|
|
|
|
|
its conditions are met: 10mins pass or 300 pages are crawled. |
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
use WWW::Spyder; |
|
1262
|
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
my $url = shift || die "Please give me a URL to start!\n"; |
|
1264
|
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new |
|
1266
|
|
|
|
|
|
|
(seed => $url, |
|
1267
|
|
|
|
|
|
|
sleep_base => 10, |
|
1268
|
|
|
|
|
|
|
exit_on => { pages => 300, |
|
1269
|
|
|
|
|
|
|
time => '10min', },); |
|
1270
|
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
|
1272
|
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
print '-'x70,"\n"; |
|
1274
|
|
|
|
|
|
|
print "Spydering: ", $page->title, "\n"; |
|
1275
|
|
|
|
|
|
|
print " URL: ", $page->url, "\n"; |
|
1276
|
|
|
|
|
|
|
print " Desc: ", $page->description || '', "\n"; |
|
1277
|
|
|
|
|
|
|
print '-'x70,"\n"; |
|
1278
|
|
|
|
|
|
|
while ( my $link = $page->next_link ) { |
|
1279
|
|
|
|
|
|
|
printf "%22s ->> %s\n", |
|
1280
|
|
|
|
|
|
|
length($link->name) > 22 ? |
|
1281
|
|
|
|
|
|
|
substr($link->name,0,19).'...' : $link->name, |
|
1282
|
|
|
|
|
|
|
length($link) > 43 ? |
|
1283
|
|
|
|
|
|
|
substr($link,0,40).'...' : $link; |
|
1284
|
|
|
|
|
|
|
} |
|
1285
|
|
|
|
|
|
|
} |
|
1286
|
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
=head2 Primitive page reader |
|
1288
|
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
use WWW::Spyder; |
|
1290
|
|
|
|
|
|
|
use Text::Wrap; |
|
1291
|
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
my $url = shift || die "Please give me a URL to start!\n"; |
|
1293
|
|
|
|
|
|
|
@ARGV or die "Please also give me a search term.\n"; |
|
1294
|
|
|
|
|
|
|
my $spyder = WWW::Spyder->new; |
|
1295
|
|
|
|
|
|
|
$spyder->seed($url); |
|
1296
|
|
|
|
|
|
|
$spyder->terms(@ARGV); |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
while ( my $page = $spyder->crawl ) { |
|
1299
|
|
|
|
|
|
|
print '-'x70,"\n * "; |
|
1300
|
|
|
|
|
|
|
print $page->title, "\n"; |
|
1301
|
|
|
|
|
|
|
print '-'x70,"\n"; |
|
1302
|
|
|
|
|
|
|
print wrap('','', $page->text); |
|
1303
|
|
|
|
|
|
|
sleep 60; |
|
1304
|
|
|
|
|
|
|
} |
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
=head1 TIPS |
|
1307
|
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
If you are going to do anything important with it, implement some |
|
1309
|
|
|
|
|
|
|
signal blocking to prevent accidental problems and tie your gathered |
|
1310
|
|
|
|
|
|
|
information to a DB_File or some such. |
|
1311
|
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
You might want to load C<POSIX> to C<nice> the process. It should top the nice off
|
1313
|
|
|
|
|
|
|
at your system's max and prevent your spyder from interfering with |
|
1314
|
|
|
|
|
|
|
your system. |
|
1315
|
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
You might want to set C<$| = 1>.
|
1317
|
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
=head1 PRIVATE METHODS |
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
=head2 are private but hack away if you're inclined |
|
1321
|
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
=head1 TO DO |
|
1323
|
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
B<WWW::Spyder> is conceived to live in a future namespace as a servant class
|
1325
|
|
|
|
|
|
|
for a complex web research agent with simple interfaces to |
|
1326
|
|
|
|
|
|
|
pre-designed grammars for research reports; or self-designed |
|
1327
|
|
|
|
|
|
|
grammars/reports (might be implemented via Parse::FastDescent if that |
|
1328
|
|
|
|
|
|
|
lazy-bones Conway would just find another 5 hours in the paltry 32 |
|
1329
|
|
|
|
|
|
|
hour day he's presently working). |
|
1330
|
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
I'd like the thing to be able to parse RTF, PDF, and perhaps even |
|
1332
|
|
|
|
|
|
|
resource sections of image files but that isn't on the radar right |
|
1333
|
|
|
|
|
|
|
now. |
|
1334
|
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
The tests should work differently. Currently they ask for outside |
|
1336
|
|
|
|
|
|
|
resources without checking if there is either an open way to do it or |
|
1337
|
|
|
|
|
|
|
if the user approves of it. Bad form all around. |
|
1338
|
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
=head1 TO DOABLE BY 1.0 |
|
1340
|
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
Add 2-4 sample scripts that are a bit more useful. |
|
1342
|
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
There are many functions that should be under the programmer's control |
|
1344
|
|
|
|
|
|
|
and not buried in the spyder. They will emerge soon. I'd like to put |
|
1345
|
|
|
|
|
|
|
in hooks to allow the user to keep(), toss(), or exclude(), urls, link |
|
1346
|
|
|
|
|
|
|
names, and domains, while crawling. |
|
1347
|
|
|
|
|
|
|
|
|
1348
|
|
|
|
|
|
|
Clean up some redundant, sloppy, and weird code. Probably change or |
|
1349
|
|
|
|
|
|
|
remove the AUTOLOAD. |
|
1350
|
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
Put in a go_to_seed() method and a subclass, ::Seed, with rules to |
|
1352
|
|
|
|
|
|
|
construct query URLs by search engine. It would be the autostart or the |
|
1353
|
|
|
|
|
|
|
fallback for perpetual spyders that run out of links. It would hit a |
|
1354
|
|
|
|
|
|
|
given or default search engine with the I<Spyder>'s terms as the query.
|
1355
|
|
|
|
|
|
|
Obviously this would only work with terms() defined. |
|
1356
|
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
Implement auto-exclusion for failure vs. success rates on names as well |
|
1358
|
|
|
|
|
|
|
as domains (maybe URI suffixes too). |
|
1359
|
|
|
|
|
|
|
|
|
1360
|
|
|
|
|
|
|
Turn length of courtesy queue into the breadth/depth setting? make it |
|
1361
|
|
|
|
|
|
|
automatically adjusting...? |
|
1362
|
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
Consistently found link names are excluded from term strength sorting? |
|
1364
|
|
|
|
|
|
|
Eg: "privacy policy," "read more," "copyright..." |
|
1365
|
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
Fix some image tag parsing problems and add area tag parsing. |
|
1367
|
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
Configuration for user:password by domain. |
|
1369
|
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
::Page objects become reusable so that a spyder only needs one. |
|
1371
|
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
::Enqueue objects become indexed so they are nixable from anywhere. |
|
1373
|
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
Expand exit_on routines to size, slept time, dwindling success ratio, |
|
1375
|
|
|
|
|
|
|
and maybe more. |
|
1376
|
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
Make methods to set "skepticism" and "effort" which will influence the |
|
1378
|
|
|
|
|
|
|
way the terms are used to keep, order, and toss URLs. |
|
1379
|
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
=head1 BE WARNED |
|
1381
|
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
This module already does some extremely useful things but it's in its |
|
1383
|
|
|
|
|
|
|
infancy and it is conceived to live in a different namespace and |
|
1384
|
|
|
|
|
|
|
perhaps become more private as a subservient part of a parent class. |
|
1385
|
|
|
|
|
|
|
This may never happen but it's the idea. So don't put this into |
|
1386
|
|
|
|
|
|
|
production code yet. I am endeavoring to keep its interface constant |
|
1387
|
|
|
|
|
|
|
either way. That said, it could change completely. |
|
1388
|
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
=head2 Also! |
|
1390
|
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
This module saves cookies to the user's home. There will be more |
|
1392
|
|
|
|
|
|
|
control over cookies in the future, but that's how it is right now. |
|
1393
|
|
|
|
|
|
|
They live in $ENV{HOME}/spyderCookie. |
|
1394
|
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
=head2 Anche! |
|
1396
|
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
Robot Rules aren't respected. B<WWW::Spyder> endeavors to be polite as far
|
1398
|
|
|
|
|
|
|
as server hits are concerned, but doesn't take "no" for answer right |
|
1399
|
|
|
|
|
|
|
now. I want to add this, and not just by domain, but by page settings. |
|
1400
|
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
=head1 UNDOCUMENTED FEATURES |
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
A.k.a. Bugs. Don't be ridiculous! Bugs in B<WWW::Spyder>?!
|
1404
|
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
There is a bug that is causing retrieval of image src tags, I think |
|
1406
|
|
|
|
|
|
|
but haven't tracked it down yet, as links. I also think the plain text |
|
1407
|
|
|
|
|
|
|
parsing has some problems which will be remedied shortly. |
|
1408
|
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
If you are building more than one spyder in the same script they are |
|
1410
|
|
|
|
|
|
|
going to share the same exit_on parameters because it's a |
|
1411
|
|
|
|
|
|
|
self-installing method. This will not always be so. |
|
1412
|
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
See the B<Changes> file for more open and past issues.
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
Let me know if you find any others. If you find one that is platform |
|
1416
|
|
|
|
|
|
|
specific, please send patch code/suggestion because I might not have |
|
1417
|
|
|
|
|
|
|
any idea how to fix it. |
|
1418
|
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
=head1 WHY C<Spyder>?
|
1420
|
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
I didn't want to use the more appropriate I<Spider> because I think
|
1422
|
|
|
|
|
|
|
there is a better one out there somewhere in the zeitgeist and the |
|
1423
|
|
|
|
|
|
|
namespace future of I<WWW::Spyder> is uncertain. It may end up a
|
1424
|
|
|
|
|
|
|
semi-private part of a bigger family. And I may be King of Kenya |
|
1425
|
|
|
|
|
|
|
someday. One's got to dream. |
|
1426
|
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
If you like I<Spyder>, have feedback, wishlist usage, better
|
1428
|
|
|
|
|
|
|
algorithms/implementations for any part of it, please let me know! |
|
1429
|
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
=head1 THANKS TO |
|
1431
|
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
Most all y'all. Especially Lincoln Stein, Gisle Aas, The Conway, |
|
1433
|
|
|
|
|
|
|
Raphael Manfredi, Gurusamy Sarathy, and plenty of others. |
|
1434
|
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
=head1 COMPARE WITH (PROBABLY PREFER) |
|
1436
|
|
|
|
|
|
|
|
|
1437
|
|
|
|
|
|
|
L<WWW::Mechanize>, L<WWW::Robot>, L<LWP::UserAgent>, L<LWP::RobotUA>,
|
1438
|
|
|
|
|
|
|
L<WWW::SimpleRobot>, and other kith and kin.
|
1439
|
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT |
|
1441
|
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
Copyright (c) 2001-2008, Ashley Pond V C<< ashley@cpan.org >>. All
|
1443
|
|
|
|
|
|
|
rights reserved. |
|
1444
|
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or |
|
1446
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. See L<perlartistic>.
|
1447
|
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
=head1 DISCLAIMER OF WARRANTY |
|
1449
|
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
|
1451
|
|
|
|
|
|
|
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
|
1452
|
|
|
|
|
|
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
|
1453
|
|
|
|
|
|
|
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER |
|
1454
|
|
|
|
|
|
|
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
1455
|
|
|
|
|
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE |
|
1456
|
|
|
|
|
|
|
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH |
|
1457
|
|
|
|
|
|
|
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL |
|
1458
|
|
|
|
|
|
|
NECESSARY SERVICING, REPAIR, OR CORRECTION. |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
|
1461
|
|
|
|
|
|
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
|
1462
|
|
|
|
|
|
|
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE |
|
1463
|
|
|
|
|
|
|
LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, |
|
1464
|
|
|
|
|
|
|
OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE |
|
1465
|
|
|
|
|
|
|
THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING |
|
1466
|
|
|
|
|
|
|
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A |
|
1467
|
|
|
|
|
|
|
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF |
|
1468
|
|
|
|
|
|
|
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF |
|
1469
|
|
|
|
|
|
|
SUCH DAMAGES. |
|
1470
|
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
=cut |