package WWW::Crawler::Lite;

use strict;
use warnings 'all';
use LWP::UserAgent;
use HTTP::Request::Common;
use WWW::RobotRules;
use URI::URL;
use HTML::LinkExtor;
use Time::HiRes 'usleep';
use Carp 'confess';

our $VERSION = '0.005';

sub new
{
  my ($class, %args) = @_;
  
  my $s = bless {
    url_pattern       => 'https?://.+',
    agent             => "WWW-Crawler-Lite/$VERSION $^O",
    http_accept       => [qw( text/html text/plain application/xhtml+xml )],
    on_new_urls       => sub { my @urls = @_; },
    on_bad_url        => sub { my ($bad_url) = @_; },
    on_response       => sub { my ($url, $http_response) = @_; },
    on_link           => sub { my ($from, $to, $text) = @_ },
    follow_ok         => sub { my ($url) = @_; return 1; },
    link_parser       => 'default',
    delay_seconds     => 1,
    disallowed        => [ ],
    %args,
    urls              => { },
    _responded_urls   => { },
    RUNNING           => 1,
    IS_INITIALIZING   => 1,
  }, $class;
  $s->{rules} = WWW::RobotRules->new( $s->agent );
  
  return $s;
}# end new()

# Public read-only properties:
sub agent           { shift->{agent} }
sub url_pattern     { shift->{url_pattern} }
sub delay_seconds   { shift->{delay_seconds} }
sub http_accept     { @{ shift->{http_accept} } }
sub is_initializing { shift->{IS_INITIALIZING} }
sub is_running      { shift->{RUNNING} }
sub rules           { shift->{rules} }

# Public method:
sub stop { shift->{RUNNING} = 0 }

# Public getters/setters:
sub on_new_urls
{
  my $s = shift;
  
  return @_ ? $s->{on_new_urls} = shift : $s->{on_new_urls};
}# end on_new_urls()

sub on_bad_url
{
  my $s = shift;
  
  return @_ ? $s->{on_bad_url} = shift : $s->{on_bad_url};
}# end on_bad_url()

sub on_response
{
  my $s = shift;
  
  return @_ ? $s->{on_response} = shift : $s->{on_response};
}# end on_response()

sub on_link
{
  my $s = shift;
  
  return @_ ? $s->{on_link} = shift : $s->{on_link};
}# end on_link()

sub follow_ok
{
  my $s = shift;
  
  return @_ ? $s->{follow_ok} = shift : $s->{follow_ok};
}# end follow_ok()

sub link_parser
{
  my $s = shift;
  
  return @_ ? $s->{link_parser} = shift : $s->{link_parser};
}# end link_parser()

sub url_count
{
  my ($s) = @_;
  
  return scalar( keys %{ $s->{urls} } );
}# end url_count()

sub crawl
{
  my ($s, %args) = @_;
  
  confess "Required param 'url' not provided" unless $args{url};
  
  my $ua = LWP::UserAgent->new( agent => $s->agent );
  $ua->add_handler( response_header => sub {
    my ($response, $ua, $h) = @_;
    my ($type) = split /\;/, ( $response->header('content-type') || '' )
      or die "no mime type provided by server";
    grep { $type =~ m{\Q$_\E}i } $s->http_accept
      or die "unwanted mime type '$type'";
  });
  
  # Try to find robots.txt:
  my ($proto, $domain) = $args{url} =~ m{^(https?)://(.*?)(?:/|$)};
  eval {
    local $SIG{__DIE__} = \&confess;
    my $robots_url = "$proto://$domain/robots.txt";
    my $res = $ua->request( GET $robots_url );
    
    # If robots.txt has extra newlines in it, the rules parser always allows (which is bad):
    (my $robots_txt = $res->content) =~ s/(?:\r?\n){2,}/\n/sg;
    $s->rules->parse( $robots_url, $robots_txt )
      if $res && $res->is_success && $res->content;
  };
  warn "Error fetching/parsing robots.txt: $@" if $@;
  
  $s->{urls}->{$args{url}} = 'taken';
  my $res = $ua->request( GET $args{url} );
  $s->_parse_result( $args{url}, $res );
  
  while( my $url = $s->_take_url() )
  {
    usleep( $s->delay_seconds * 1_000_000 );
    last unless $s->is_running;
    
    my $res = $ua->request( GET $url );
    my ($type) = split /\;/, $res->header('content-type');
    
    # Only parse responses that are of the correct MIME type:
    $s->_parse_result( $url, $res )
      if grep { $type =~ m{\Q$_\E}i } $s->http_accept;
  }# end while()
}# end crawl()

sub _take_url
{
  my ($s) = @_;
  
  my $url;
  SCOPE: {
    ($url) = grep { $s->rules->allowed( $_ ) }
             grep { $s->{urls}->{$_} eq 'new' }
             keys %{ $s->{urls} }
      or return;
    $s->{urls}->{$url} = 'taken';
  };
  return $url;
}# end _take_url()

sub _parse_result
{
  my ($s, $url, $res) = @_;
  
  my $base = $res->base;
  my @new_urls = ( );
  
  if( $s->link_parser eq 'HTML::LinkExtor' )
  {
    # This option added after the original regexp way was pointed out on perlmonks:
    # http://www.perlmonks.org/?node_id=946548
    my $cb = sub {
      my ($tag, %attrs) = @_;
      return unless uc($tag) eq 'A';
      if( $s->follow_ok->( $attrs{href} ) )
      {
        push @new_urls, { href => $attrs{href}, text => $attrs{title} || $attrs{alt} };
      }# end if()
    };
    my $parser = HTML::LinkExtor->new($cb, $base);
    $parser->parse( $res->content );
  }
  elsif( $s->link_parser eq 'default' )
  {
    # This method might be a bit naive, but HTML::LinkExtor (AFAIK) doesn't allow
    # me to get at the text within a hyperlink.
    # I'm open to alternatives and recognise the problems inherent in parsing
    # HTML with regexps.
    (my $tmp = $res->content) =~ s{<a\s+[^>]*?href\s*=\s*(['"].*?['"]|[^\s>]+)[^>]*>(.*?)</a>}{
      my ($href,$anchortext) = ( $1, $2 );
      if( $anchortext =~ m/<img/i )
      {
        my ($alt) = join ". ", $anchortext =~ m/alt\s*\=\s*"(.*?)"/sig;
        $anchortext =~ s/<img[^>]*?>//sig;
        $anchortext .= ". $alt" if $alt;
      }# end if()
      $anchortext =~ s{</?.*?[/>]}{}sg;
      if( my ($quote) = $href =~ m/^(['"])/ )
      {
        ($href) = $href =~ m/^$quote(.*?)$quote/;
      }
      else
      {
        ($href) = $href =~ m/^(\S+)/;
      }# end if()
      $href = "" unless defined($href);
      $href =~ s/\#.*$//;
      if( $href )
      {
        (my $new = url($href, $base)->abs->as_string) =~ s/\#.*$//;
        if( $s->follow_ok->( $new ) )
        {
          $anchortext =~ s/^\s+//s;
          $anchortext =~ s/\s+$//s;
          push @new_urls, { href => $new, text => $anchortext };
        }# end if()
      }# end if()
      "";
    }isgxe;
  }# end if()
| 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
233
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
103
 | 
   $s->on_response->( $url, $res );  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
235
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4242
 | 
   my %accepted_urls = ( );  | 
| 
236
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
   SCOPE: {  | 
| 
237
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     my $pattern = $s->url_pattern;  | 
| 
238
 | 
594
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1559
 | 
     map {  | 
| 
239
 | 
654
 | 
 
 | 
 
 | 
 
 | 
 
 | 
766
 | 
       $accepted_urls{$_}++;  | 
| 
240
 | 
594
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
2981
 | 
       $s->{urls}->{$_} ||= 'new';  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     grep {  | 
| 
243
 | 
654
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1223
 | 
       my $u = $_;  | 
| 
244
 | 
594
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4362
 | 
       m/$pattern/ &&  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ! exists($s->{urls}->{$u}) &&  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ! grep {  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $u =~ m{^https?://[^/]+?\Q$_\E.*}  | 
| 
248
 | 
654
 | 
  
100
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
4237
 | 
       } @{$s->{disallowed}} &&  | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $s->rules->allowed( $u )  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
251
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
17
 | 
     map { $_->{href} } @new_urls;  | 
| 
252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   };  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Send the event about this page linking to those other pages:  | 
| 
255
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
   my $pattern = $s->url_pattern;  | 
| 
256
 | 
617
 | 
 
 | 
 
 | 
 
 | 
 
 | 
487325
 | 
   map {  | 
| 
257
 | 
654
 | 
 
 | 
 
 | 
 
 | 
 
 | 
718
 | 
     $s->on_link->( $url, $_->{href}, $_->{text} );  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   grep {  | 
| 
260
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
     my $u = $_;  | 
| 
261
 | 
617
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10801
 | 
     $u->{href} =~ m/$pattern/ &&  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ! grep {  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $u->{href} =~ m{^https?://[^/]+?\Q$_\E.*}  | 
| 
264
 | 
654
 | 
  
100
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
4208
 | 
     } @{$s->{disallowed}} &&  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $s->rules->allowed( $u->{href} )  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } @new_urls;  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
268
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3407
 | 
   $s->on_new_urls->( keys(%accepted_urls) );  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }# end _parse_result()  | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;# return true:  | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
=pod

=head1 NAME

WWW::Crawler::Lite - A single-threaded crawler/spider for the web.

=head1 SYNOPSIS

  use WWW::Crawler::Lite;
  
  my %pages = ( );
  my $pattern = 'https?://example\.com\/';
  my %links = ( );
  my $downloaded = 0;
  
  my $crawler;
  $crawler = WWW::Crawler::Lite->new(
    agent       => 'MySuperBot/1.0',
    url_pattern => $pattern,
    http_accept => [qw( text/plain text/html application/xhtml+xml )],
    link_parser => 'default',
    on_response => sub {
      my ($url, $res) = @_;
      
      warn "$url contains " . $res->content;
      $downloaded++;
      $crawler->stop() if $downloaded > 5;
    },
    follow_ok   => sub {
      my ($url) = @_;
      # If you like this url and want to use it, then return a true value:
      return 1;
    },
    on_link     => sub {
      my ($from, $to, $text) = @_;
      
      return if exists($pages{$to}) && $pages{$to} eq 'BAD';
      $pages{$to}++;
      $links{$to} ||= [ ];
      push @{$links{$to}}, { from => $from, text => $text };
    },
    on_bad_url => sub {
      my ($url) = @_;
      
      # Mark this url as 'bad':
      $pages{$url} = 'BAD';
    }
  );
  $crawler->crawl( url => "http://example.com/" );
  
  warn "DONE!!!!!";
  
  use Data::Dumper;
  map {
    warn "$_ ($pages{$_} incoming links) -> " . Dumper($links{$_})
  } sort keys %links;

=head1 DESCRIPTION

C<WWW::Crawler::Lite> is a single-threaded spider/crawler for the web.  It can
be used within a mod_perl, CGI or Catalyst-style environment because it does not
fork or use threads.
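
For example, the crawler can run inline in an ordinary CGI request handler.
The following is a minimal sketch (a hypothetical script; the URL, the limit
and the handlers are illustrative only) that collects a handful of URLs and
prints them once the crawl stops:

  #!/usr/bin/perl -w
  # crawl-report.cgi - hypothetical example, not part of this distribution.
  use strict;
  use WWW::Crawler::Lite;
  
  my @found;
  my $crawler;
  $crawler = WWW::Crawler::Lite->new(
    url_pattern => 'https?://example\.com/',
    on_new_urls => sub { push @found, @_; },
    # Stop once enough links have been collected:
    on_response => sub { $crawler->stop() if @found > 20; },
  );
  $crawler->crawl( url => 'http://example.com/' );
  
  print "Content-Type: text/plain\n\n";
  print "$_\n" for @found;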

The callback-based interface is fast and simple, allowing you to focus on simply
processing the data that C<WWW::Crawler::Lite> extracts from the target website.

=head1 PUBLIC METHODS

=head2 new( %args )

Creates and returns a new C<WWW::Crawler::Lite> object.

The C<%args> hash is not required, but may contain the following elements:

=over 4

=item agent - String

Used as the user-agent string for HTTP requests.

B<Default> - C<WWW-Crawler-Lite/$VERSION $^O>

=item url_pattern - RegExp or String

New links that do not match this pattern will not be added to the processing queue.

B<Default:> C<https?://.+>

=item http_accept - ArrayRef

This can be used to filter out unwanted responses.

B<Default:> C<[qw( text/html text/plain application/xhtml+xml )]>

=item link_parser - String

Valid values: 'C<default>' and 'C<HTML::LinkExtor>'.

The default value is 'C<default>', which uses a naive regexp to do the link parsing.

The upshot of using 'C<default>' is that the regexp will also find the hyperlinked
text or alt-text (of a hyperlinked img tag) and give that to your 'C<on_link>'
handler.  (See the constructor sketch after this list.)

=item on_response($url, $response) - CodeRef

Called whenever a successful response is returned.

=item on_link($from, $to, $text) - CodeRef

Called whenever a new link is found.  Arguments are:

=over 8

=item $from

The URL that is linked *from*

=item $to

The URL that is linked *to*

=item $text

The anchor text (eg: the HTML within the link).

=back

=item on_bad_url($url) - CodeRef

Called whenever an unsuccessful response is received.

=item delay_seconds - Number

Indicates the length of time (in seconds) that the crawler should pause before making
each request.  This can be useful when you want to spider a website, not launch
a denial of service attack on it.

=back
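
To make the options above concrete, here is a small constructor sketch (the
values and the handler body are illustrative only).  It switches to the
C<HTML::LinkExtor> parser and slows the request rate down; C<disallowed> is
taken from the constructor defaults in the source, although it is not listed
above:

  my $crawler = WWW::Crawler::Lite->new(
    agent         => 'MyBot/0.1',          # instead of "WWW-Crawler-Lite/$VERSION $^O"
    url_pattern   => 'https?://(?:www\.)?example\.org/',
    link_parser   => 'HTML::LinkExtor',    # instead of the default regexp parser
    delay_seconds => 2.5,                  # pause between requests
    disallowed    => [ '/private' ],       # skip urls whose path begins with this
    on_link       => sub {
      my ($from, $to, $text) = @_;
      # With link_parser => 'HTML::LinkExtor', $text comes from the tag's
      # title or alt attribute rather than the anchor's inner HTML.
      print "$from -> $to\n";
    },
  );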

=head2 stop( )

Causes the crawler to stop processing its queue of URLs.
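
Typically you would call C<stop()> from inside one of your own callbacks once
some limit is reached.  A minimal sketch (the limit of 100 is arbitrary):

  my $seen = 0;
  $crawler->on_response( sub {
    my ($url, $res) = @_;
    $crawler->stop() if ++$seen >= 100;  # finish after 100 pages
  });

Once stopped, the loop inside C<crawl()> exits before the next request is made,
so C<crawl()> returns after the page currently in progress has been handled.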

=head1 AUTHOR

John Drago

=head1 COPYRIGHT

This software is Free software and may be used and redistributed under the same
terms as perl itself.

=cut