line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
3
|
|
|
3
|
|
71610
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
71
|
|
2
|
3
|
|
|
3
|
|
11
|
use warnings; |
|
3
|
|
|
|
|
1
|
|
|
3
|
|
|
|
|
86
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package HTTP::Async::Polite; |
5
|
3
|
|
|
3
|
|
9
|
use base 'HTTP::Async'; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
873
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.33'; |
8
|
|
|
|
|
|
|
|
9
|
3
|
|
|
3
|
|
10
|
use Carp; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
124
|
|
10
|
3
|
|
|
3
|
|
10
|
use Data::Dumper; |
|
3
|
|
|
|
|
2
|
|
|
3
|
|
|
|
|
112
|
|
11
|
3
|
|
|
3
|
|
9
|
use Time::HiRes qw( time sleep ); |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
12
|
|
12
|
3
|
|
|
3
|
|
229
|
use URI; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
1150
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
HTTP::Async::Polite - politely process multiple HTTP requests |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
See L - the usage is unchanged. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This L module allows you to have many requests going on at once. |
25
|
|
|
|
|
|
|
This can be very rude if you are fetching several pages from the same domain. |
26
|
|
|
|
|
|
|
This module add limits to the number of simultaneous requests to a given |
27
|
|
|
|
|
|
|
domain and adds an interval between the requests. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
In all other ways it is identical in use to the original L. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 NEW METHODS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 send_interval |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
Getter and setter for the C - the time in seconds to leave |
36
|
|
|
|
|
|
|
between each request for a given domain. By default this is set to 5 seconds. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=cut |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub send_interval { |
41
|
6
|
|
|
6
|
1
|
12
|
my $self = shift; |
42
|
|
|
|
|
|
|
return scalar @_ |
43
|
6
|
100
|
|
|
|
34
|
? $self->_set_opt( 'send_interval', @_ ) |
44
|
|
|
|
|
|
|
: $self->_get_opt('send_interval'); |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 OVERLOADED METHODS |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
These methods are overloaded but otherwise work exactly as the original |
50
|
|
|
|
|
|
|
methods did. The docs here just describe what they do differently. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=head2 new |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Sets the C value to the default of 5 seconds. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=cut |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub new { |
59
|
6
|
|
|
6
|
1
|
35
|
my $class = shift; |
60
|
|
|
|
|
|
|
|
61
|
6
|
|
|
|
|
31
|
my $self = $class->SUPER::new; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Set the interval between sends. |
64
|
6
|
|
|
|
|
16
|
$self->{opts}{send_interval} = 5; # seconds |
65
|
6
|
|
|
|
|
18
|
$class->_add_get_set_key('send_interval'); |
66
|
|
|
|
|
|
|
|
67
|
6
|
|
|
|
|
11
|
$self->_init(@_); |
68
|
|
|
|
|
|
|
|
69
|
6
|
|
|
|
|
7
|
return $self; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 add_with_opts |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Adds the request to the correct queue depending on the domain. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub add_with_opts { |
79
|
7
|
|
|
7
|
1
|
6
|
my $self = shift; |
80
|
7
|
|
|
|
|
5
|
my $req = shift; |
81
|
7
|
|
|
|
|
6
|
my $opts = shift; |
82
|
7
|
|
|
|
|
26
|
my $id = $self->_next_id; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# Instead of putting this request and opts directly onto the to_send array |
85
|
|
|
|
|
|
|
# instead get the domain and add it to the domain's queue. Store this |
86
|
|
|
|
|
|
|
# domain with the opts so that it is easy to get at. |
87
|
7
|
|
|
|
|
18
|
my $uri = URI->new( $req->uri ); |
88
|
7
|
|
|
|
|
378
|
my $host = $uri->host; |
89
|
7
|
|
|
|
|
148
|
my $port = $uri->port; |
90
|
7
|
|
|
|
|
112
|
my $domain = "$host:$port"; |
91
|
7
|
|
|
|
|
20
|
$opts->{_domain} = $domain; |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Get the domain array - create it if needed. |
94
|
7
|
|
100
|
|
|
42
|
my $domain_arrayref = $self->{domain_stats}{$domain}{to_send} ||= []; |
95
|
|
|
|
|
|
|
|
96
|
7
|
|
|
|
|
9
|
push @{$domain_arrayref}, [ $req, $id ]; |
|
7
|
|
|
|
|
12
|
|
97
|
7
|
|
|
|
|
23
|
$self->{id_opts}{$id} = $opts; |
98
|
|
|
|
|
|
|
|
99
|
7
|
|
|
|
|
19
|
$self->poke; |
100
|
|
|
|
|
|
|
|
101
|
7
|
|
|
|
|
20
|
return $id; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 to_send_count |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Returns the number of requests waiting to be sent. This is the number in the |
107
|
|
|
|
|
|
|
actual queue plus the number in each domain specific queue. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=cut |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub to_send_count { |
112
|
200
|
|
|
200
|
1
|
389
|
my $self = shift; |
113
|
200
|
|
|
|
|
923
|
$self->poke; |
114
|
|
|
|
|
|
|
|
115
|
200
|
|
|
|
|
183
|
my $count = scalar @{ $$self{to_send} }; |
|
200
|
|
|
|
|
383
|
|
116
|
|
|
|
|
|
|
|
117
|
390
|
|
|
|
|
577
|
$count += scalar @{ $self->{domain_stats}{$_}{to_send} } |
118
|
200
|
|
|
|
|
217
|
for keys %{ $self->{domain_stats} }; |
|
200
|
|
|
|
|
566
|
|
119
|
|
|
|
|
|
|
|
120
|
200
|
|
|
|
|
781
|
return $count; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub _process_to_send { |
124
|
605
|
|
|
605
|
|
517
|
my $self = shift; |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Go through the domain specific queues and add all requests that we can |
127
|
|
|
|
|
|
|
# to the real queue. |
128
|
605
|
|
|
|
|
418
|
foreach my $domain ( keys %{ $self->{domain_stats} } ) { |
|
605
|
|
|
|
|
1441
|
|
129
|
|
|
|
|
|
|
|
130
|
1177
|
|
|
|
|
1037
|
my $domain_stats = $self->{domain_stats}{$domain}; |
131
|
1177
|
100
|
|
|
|
718
|
next unless scalar @{ $domain_stats->{to_send} }; |
|
1177
|
|
|
|
|
2020
|
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# warn "TRYING TO ADD REQUEST FOR $domain"; |
134
|
|
|
|
|
|
|
# warn sleep 5; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Check that this request is good to go. |
137
|
966
|
100
|
|
|
|
1471
|
next if $domain_stats->{count}; |
138
|
933
|
100
|
100
|
|
|
2910
|
next unless time > ( $domain_stats->{next_send} || 0 ); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# We can add this request. |
141
|
7
|
|
|
|
|
14
|
$domain_stats->{count}++; |
142
|
7
|
|
|
|
|
19
|
push @{ $self->{to_send} }, shift @{ $domain_stats->{to_send} }; |
|
7
|
|
|
|
|
11
|
|
|
7
|
|
|
|
|
27
|
|
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Use the original to send the requests on the queue. |
146
|
605
|
|
|
|
|
1615
|
return $self->SUPER::_process_to_send; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _add_to_return_queue { |
150
|
7
|
|
|
7
|
|
6
|
my $self = shift; |
151
|
7
|
|
|
|
|
8
|
my $req_and_id = shift; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# decrement the count for this domain so that another request can start. |
154
|
|
|
|
|
|
|
# Also set the interval so that we don't scrape too fast. |
155
|
7
|
|
|
|
|
10
|
my $id = $req_and_id->[1]; |
156
|
7
|
|
|
|
|
11
|
my $domain = $self->{id_opts}{$id}{_domain}; |
157
|
7
|
|
|
|
|
9
|
my $domain_stat = $self->{domain_stats}{$domain}; |
158
|
7
|
|
|
|
|
14
|
my $interval = $self->_get_opt( 'send_interval', $id ); |
159
|
|
|
|
|
|
|
|
160
|
7
|
|
|
|
|
10
|
$domain_stat->{count}--; |
161
|
7
|
|
|
|
|
20
|
$domain_stat->{next_send} = time + $interval; |
162
|
|
|
|
|
|
|
|
163
|
7
|
|
|
|
|
26
|
return $self->SUPER::_add_to_return_queue($req_and_id); |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head1 SEE ALSO |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
L - the module that this one is based on. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head1 AUTHOR |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
Edmund von der Burg C<< >>. |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
L |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 LICENCE AND COPYRIGHT |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Copyright (c) 2006, Edmund von der Burg C<< >>. |
179
|
|
|
|
|
|
|
All rights reserved. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under |
182
|
|
|
|
|
|
|
the same terms as Perl itself. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head1 DISCLAIMER OF WARRANTY |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE |
187
|
|
|
|
|
|
|
SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE |
188
|
|
|
|
|
|
|
STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE |
189
|
|
|
|
|
|
|
SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, |
190
|
|
|
|
|
|
|
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
191
|
|
|
|
|
|
|
FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND |
192
|
|
|
|
|
|
|
PERFORMANCE OF THE SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, |
193
|
|
|
|
|
|
|
YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY |
196
|
|
|
|
|
|
|
COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE |
197
|
|
|
|
|
|
|
SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE TO YOU FOR DAMAGES, |
198
|
|
|
|
|
|
|
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING |
199
|
|
|
|
|
|
|
OUT OF THE USE OR INABILITY TO USE THE SOFTWARE (INCLUDING BUT NOT LIMITED TO |
200
|
|
|
|
|
|
|
LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR |
201
|
|
|
|
|
|
|
THIRD PARTIES OR A FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER |
202
|
|
|
|
|
|
|
SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE |
203
|
|
|
|
|
|
|
POSSIBILITY OF SUCH DAMAGES. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=cut |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
1; |
208
|
|
|
|
|
|
|
|