| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HTTP::Cache::Transparent; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
125054
|
use strict; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
110
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
our $VERSION = '1.3'; |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
HTTP::Cache::Transparent - Cache the result of http get-requests persistently. |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use LWP::Simple; |
|
14
|
|
|
|
|
|
|
use HTTP::Cache::Transparent; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
HTTP::Cache::Transparent::init( { |
|
17
|
|
|
|
|
|
|
BasePath => '/tmp/cache', |
|
18
|
|
|
|
|
|
|
} ); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
my $data = get( 'http://www.sn.no' ); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
An implementation of http get that keeps a local cache of fetched |
|
25
|
|
|
|
|
|
|
pages to avoid fetching the same data from the server if it hasn't |
|
26
|
|
|
|
|
|
|
been updated. The cache is stored on disk and is thus persistent |
|
27
|
|
|
|
|
|
|
between invocations. |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Uses the http-headers If-Modified-Since and ETag to let the server |
|
30
|
|
|
|
|
|
|
decide if the version in the cache is up-to-date or not. |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
The cache is implemented by modifying the LWP::UserAgent class to |
|
33
|
|
|
|
|
|
|
seamlessly cache the result of all requests that can be cached. |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 INITIALIZING THE CACHE |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
HTTP::Cache::Transparent provides an init-method that sets the |
|
38
|
|
|
|
|
|
|
parameters for the cache and overloads a method in LWP::UserAgent |
|
39
|
|
|
|
|
|
|
to activate the cache.After init has been called, the normal |
|
40
|
|
|
|
|
|
|
LWP-methods (LWP::Simple as well as the more full-fledged |
|
41
|
|
|
|
|
|
|
LWP::Request methods) should be used as usual. |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=over 4 |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=cut |
|
46
|
|
|
|
|
|
|
|
|
47
|
2
|
|
|
2
|
|
10
|
use Carp; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
164
|
|
|
48
|
2
|
|
|
2
|
|
540
|
use LWP::UserAgent; |
|
|
2
|
|
|
|
|
36595
|
|
|
|
2
|
|
|
|
|
80
|
|
|
49
|
2
|
|
|
2
|
|
13
|
use HTTP::Status qw/RC_NOT_MODIFIED RC_OK RC_PARTIAL_CONTENT status_message/; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
294
|
|
|
50
|
|
|
|
|
|
|
|
|
51
|
2
|
|
|
2
|
|
11
|
use Digest::MD5 qw/md5_hex/; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
91
|
|
|
52
|
2
|
|
|
2
|
|
1097
|
use IO::File; |
|
|
2
|
|
|
|
|
7739
|
|
|
|
2
|
|
|
|
|
245
|
|
|
53
|
2
|
|
|
2
|
|
1069
|
use File::Copy; |
|
|
2
|
|
|
|
|
5395
|
|
|
|
2
|
|
|
|
|
126
|
|
|
54
|
2
|
|
|
2
|
|
12
|
use File::Path; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
119
|
|
|
55
|
2
|
|
|
2
|
|
9
|
use Cwd; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
620
|
|
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# These are the response-headers that we should store in the |
|
58
|
|
|
|
|
|
|
# cache-entry and recreate when we return a cached response. |
|
59
|
|
|
|
|
|
|
my @cache_headers = qw/Content-Type Content-Encoding |
|
60
|
|
|
|
|
|
|
Content-Length Content-Range |
|
61
|
|
|
|
|
|
|
Last-Modified/; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
my $basepath; |
|
64
|
|
|
|
|
|
|
my $maxage; |
|
65
|
|
|
|
|
|
|
my $verbose; |
|
66
|
|
|
|
|
|
|
my $noupdate; |
|
67
|
|
|
|
|
|
|
my $approvecontent; |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my $org_simple_request; |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item init |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Initialize the HTTP cache. Takes a single parameter which is a |
|
74
|
|
|
|
|
|
|
hashref containing named arguments to the object. |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
HTTP::Cache::Transparent::init( { |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# Directory to store the cache in. |
|
79
|
|
|
|
|
|
|
BasePath => "/tmp/cache", |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# How many hours should items be kept in the cache |
|
82
|
|
|
|
|
|
|
# after they were last requested? |
|
83
|
|
|
|
|
|
|
# Default is 8*24. |
|
84
|
|
|
|
|
|
|
MaxAge => 8*24, |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Print progress-messages to STDERR. |
|
87
|
|
|
|
|
|
|
# Default is 0. |
|
88
|
|
|
|
|
|
|
Verbose => 1, |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# If a request is made for a url that has been requested |
|
91
|
|
|
|
|
|
|
# from the server less than NoUpdate seconds ago, the |
|
92
|
|
|
|
|
|
|
# response will be generated from the cache without |
|
93
|
|
|
|
|
|
|
# contacting the server. |
|
94
|
|
|
|
|
|
|
# Default is 0. |
|
95
|
|
|
|
|
|
|
NoUpdate => 15*60, |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# When a url has been downloaded and the response indicates that |
|
98
|
|
|
|
|
|
|
# has been modified compared to the content in the cache, |
|
99
|
|
|
|
|
|
|
# the ApproveContent callback is called with the HTTP::Response. |
|
100
|
|
|
|
|
|
|
# The callback shall return true if the response shall be used and |
|
101
|
|
|
|
|
|
|
# stored in the cache or false if the response shall be discarded |
|
102
|
|
|
|
|
|
|
# and the response in the cache used instead. |
|
103
|
|
|
|
|
|
|
# This mechanism can be used to work around servers that return errors |
|
104
|
|
|
|
|
|
|
# intermittently. The default is to accept all responses. |
|
105
|
|
|
|
|
|
|
ApproveContent => sub { return $_[0]->is_success }, |
|
106
|
|
|
|
|
|
|
} ); |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
The directory where the cache is stored must be writable. It must also only |
|
109
|
|
|
|
|
|
|
contain files created by HTTP::Cache::Transparent. |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
my $initialized = 0; |
|
114
|
|
|
|
|
|
|
sub init { |
|
115
|
1
|
|
|
1
|
1
|
1005
|
my( $arg ) = @_; |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
defined( $arg->{BasePath} ) |
|
118
|
1
|
50
|
|
|
|
4
|
or croak( "You must specify a BasePath" ); |
|
119
|
|
|
|
|
|
|
|
|
120
|
1
|
|
|
|
|
2
|
$basepath = $arg->{BasePath}; |
|
121
|
|
|
|
|
|
|
|
|
122
|
1
|
50
|
|
|
|
12
|
if( not -d $basepath ) { |
|
123
|
0
|
|
|
|
|
0
|
eval { mkpath($basepath) }; |
|
|
0
|
|
|
|
|
0
|
|
|
124
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
125
|
0
|
|
|
|
|
0
|
print STDERR "$basepath is not a directory and cannot be created: $@\n"; |
|
126
|
0
|
|
|
|
|
0
|
exit 1; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# Append a trailing slash if it is missing. |
|
131
|
1
|
|
|
|
|
7
|
$basepath =~ s%([^/])$%$1/%; |
|
132
|
|
|
|
|
|
|
|
|
133
|
1
|
|
50
|
|
|
6
|
$maxage = $arg->{MaxAge} || 8*24; |
|
134
|
1
|
|
50
|
|
|
4
|
$verbose = $arg->{Verbose} || 0; |
|
135
|
1
|
|
50
|
|
|
3
|
$noupdate = $arg->{NoUpdate} || 0; |
|
136
|
1
|
|
50
|
0
|
|
11
|
$approvecontent = $arg->{ApproveContent} || sub { return 1; }; |
|
|
0
|
|
|
|
|
0
|
|
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Make sure that LWP::Simple does not use its simplified |
|
139
|
|
|
|
|
|
|
# get-method that bypasses LWP::UserAgent. |
|
140
|
1
|
|
|
|
|
2
|
$LWP::Simple::FULL_LWP++; |
|
141
|
|
|
|
|
|
|
|
|
142
|
1
|
50
|
|
|
|
4
|
unless ($initialized++) { |
|
143
|
1
|
|
|
|
|
1
|
$org_simple_request = \&LWP::UserAgent::simple_request; |
|
144
|
|
|
|
|
|
|
|
|
145
|
2
|
|
|
2
|
|
18
|
no warnings; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
3497
|
|
|
146
|
1
|
|
|
|
|
4
|
*LWP::UserAgent::simple_request = \&_simple_request_cache |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item Initializing from use-line |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
An alternative way of initializing HTTP::Cache::Transparent is to supply |
|
153
|
|
|
|
|
|
|
parameters in the use-line. This allows you to write |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
use HTTP::Cache::Transparent ( BasePath => '/tmp/cache' ); |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
which is exactly equivalent to |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
use HTTP::Cache::Transparent; |
|
160
|
|
|
|
|
|
|
HTTP::Cache::Transparent::init( BasePath => '/tmp/cache' ); |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
The advantage to using this method is that you can do |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
perl -MHTTP::Cache::Transparent=BasePath,/tmp/cache myscript.pl |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
or even set the environment variable PERL5OPT |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
PERL5OPT=-MHTTP::Cache::Transparent=BasePath,/tmp/cache |
|
169
|
|
|
|
|
|
|
myscript.pl |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
and have all the http-requests performed by myscript.pl go through the |
|
172
|
|
|
|
|
|
|
cache without changing myscript.pl |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=back |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=cut |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
sub import { |
|
179
|
2
|
|
|
2
|
|
22
|
my( $module, %args ) = @_; |
|
180
|
2
|
50
|
|
|
|
2528
|
return if (scalar(keys(%args)) == 0); |
|
181
|
|
|
|
|
|
|
|
|
182
|
0
|
|
|
|
|
0
|
HTTP::Cache::Transparent::init( \%args ); |
|
183
|
|
|
|
|
|
|
} |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
END { |
|
186
|
2
|
|
|
2
|
|
1261
|
_remove_old_entries(); |
|
187
|
|
|
|
|
|
|
} |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub _simple_request_cache { |
|
190
|
3
|
|
|
3
|
|
10011753
|
my($self, $r, $content_cb, $read_size_hint) = @_; |
|
191
|
|
|
|
|
|
|
|
|
192
|
3
|
|
|
|
|
7
|
my $res; |
|
193
|
|
|
|
|
|
|
|
|
194
|
3
|
50
|
33
|
|
|
10
|
if( $r->method eq "GET" and |
|
|
|
|
33
|
|
|
|
|
|
195
|
|
|
|
|
|
|
not defined( $r->header( 'If-Modified-Since' ) ) and |
|
196
|
|
|
|
|
|
|
not defined( $content_cb ) ) { |
|
197
|
3
|
50
|
|
|
|
220
|
print STDERR "Fetching " . $r->uri |
|
198
|
|
|
|
|
|
|
if( $verbose ); |
|
199
|
|
|
|
|
|
|
|
|
200
|
3
|
|
|
|
|
9
|
my $url = $r->uri->as_string; |
|
201
|
3
|
|
|
|
|
74
|
my $key = $url; |
|
202
|
3
|
50
|
|
|
|
10
|
$key .= "\n" . $r->header('Range') |
|
203
|
|
|
|
|
|
|
if defined $r->header('Range'); |
|
204
|
|
|
|
|
|
|
|
|
205
|
3
|
|
|
|
|
382
|
my $filename = $basepath . _urlhash( $url ); |
|
206
|
|
|
|
|
|
|
|
|
207
|
3
|
|
|
|
|
4
|
my $fh; |
|
208
|
|
|
|
|
|
|
my $meta; |
|
209
|
|
|
|
|
|
|
|
|
210
|
3
|
100
|
|
|
|
83
|
if( -s $filename ) { |
|
211
|
2
|
50
|
|
|
|
21
|
$fh = new IO::File "< $filename" |
|
212
|
|
|
|
|
|
|
or die "Failed to read from $filename"; |
|
213
|
|
|
|
|
|
|
|
|
214
|
2
|
|
|
|
|
154
|
$meta = _read_meta( $fh ); |
|
215
|
|
|
|
|
|
|
|
|
216
|
2
|
50
|
|
|
|
10
|
if( $meta->{Url} eq $url ) { |
|
217
|
|
|
|
|
|
|
$meta->{'Range'} = "" |
|
218
|
2
|
50
|
|
|
|
10
|
unless defined( $meta->{'Range'} ); |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Check that the Range is the same for this request as |
|
221
|
|
|
|
|
|
|
# for the one in the cache. |
|
222
|
2
|
50
|
33
|
|
|
8
|
if( (not defined( $r->header( 'Range' ) ) ) or |
|
223
|
|
|
|
|
|
|
$r->header( 'Range' ) eq $meta->{'Range'} ) { |
|
224
|
|
|
|
|
|
|
$r->header( 'If-Modified-Since', $meta->{'Last-Modified'} ) |
|
225
|
2
|
50
|
|
|
|
79
|
if exists( $meta->{'Last-Modified'} ); |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
$r->header( 'If-None-Match', $meta->{ETag} ) |
|
228
|
2
|
50
|
|
|
|
79
|
if( exists( $meta->{ETag} ) ); |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
else { |
|
232
|
0
|
|
|
|
|
0
|
warn "Cache collision: $url and $meta->{Url} have the same md5sum"; |
|
233
|
|
|
|
|
|
|
} |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
|
|
236
|
3
|
100
|
100
|
|
|
93
|
if( defined( $meta->{'X-HCT-LastUpdated'} ) and |
|
237
|
|
|
|
|
|
|
$noupdate > (time - $meta->{'X-HCT-LastUpdated'} ) ) { |
|
238
|
1
|
50
|
|
|
|
3
|
print STDERR " from cache without checking with server.\n" |
|
239
|
|
|
|
|
|
|
if $verbose; |
|
240
|
|
|
|
|
|
|
|
|
241
|
1
|
|
|
|
|
9
|
$res = HTTP::Response->new( $meta->{Code} ); |
|
242
|
1
|
|
|
|
|
49
|
$res->request($r); |
|
243
|
1
|
|
|
|
|
10
|
_get_from_cachefile( $filename, $fh, $res, $meta ); |
|
244
|
1
|
50
|
|
|
|
61
|
$fh->close() |
|
245
|
|
|
|
|
|
|
if defined $fh;; |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# Set X-No-Server-Contact header as content delivered without contact with external server |
|
248
|
1
|
|
|
|
|
7
|
$res->header( "X-No-Server-Contact", 1 ); |
|
249
|
|
|
|
|
|
|
|
|
250
|
1
|
|
|
|
|
63
|
return $res; |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
|
|
253
|
2
|
|
|
|
|
7
|
$res = &$org_simple_request( $self, $r ); |
|
254
|
|
|
|
|
|
|
|
|
255
|
2
|
100
|
33
|
|
|
625472
|
if( $res->code == RC_NOT_MODIFIED ) { |
|
|
|
50
|
|
|
|
|
|
|
256
|
1
|
50
|
|
|
|
27
|
print STDERR " from cache.\n" |
|
257
|
|
|
|
|
|
|
if( $verbose ); |
|
258
|
|
|
|
|
|
|
|
|
259
|
1
|
|
|
|
|
9
|
_get_from_cachefile( $filename, $fh, $res, $meta ); |
|
260
|
|
|
|
|
|
|
|
|
261
|
1
|
50
|
|
|
|
38
|
$fh->close() |
|
262
|
|
|
|
|
|
|
if defined $fh;; |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# We need to rewrite the cache-entry to update X-HCT-LastUpdated |
|
265
|
1
|
|
|
|
|
12
|
_write_cache_entry( $filename, $url, $r, $res ); |
|
266
|
1
|
|
|
|
|
136
|
return $res; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
elsif( defined( $meta->{'X-HCT-LastUpdated'} ) |
|
269
|
0
|
|
|
|
|
0
|
and not &{$approvecontent}( $res ) ) { |
|
270
|
0
|
0
|
|
|
|
0
|
print STDERR " from cache since the response was not approved.\n" |
|
271
|
|
|
|
|
|
|
if( $verbose ); |
|
272
|
|
|
|
|
|
|
|
|
273
|
0
|
|
|
|
|
0
|
_get_from_cachefile( $filename, $fh, $res, $meta ); |
|
274
|
|
|
|
|
|
|
|
|
275
|
0
|
0
|
|
|
|
0
|
$fh->close() |
|
276
|
|
|
|
|
|
|
if defined $fh;; |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# Do NOT update the cache! |
|
279
|
|
|
|
|
|
|
|
|
280
|
0
|
|
|
|
|
0
|
return $res; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
else { |
|
283
|
1
|
50
|
|
|
|
20
|
$fh->close() |
|
284
|
|
|
|
|
|
|
if defined $fh;; |
|
285
|
|
|
|
|
|
|
|
|
286
|
1
|
|
|
|
|
11
|
my $content = $res->content; |
|
287
|
1
|
50
|
|
|
|
16
|
$content = "" if not defined $content; |
|
288
|
|
|
|
|
|
|
|
|
289
|
1
|
50
|
33
|
|
|
5
|
if( defined( $meta->{MD5} ) and |
|
290
|
|
|
|
|
|
|
md5_hex( $content ) eq $meta->{MD5} ) { |
|
291
|
0
|
|
|
|
|
0
|
$res->header( "X-Content-Unchanged", 1 ); |
|
292
|
0
|
0
|
|
|
|
0
|
print STDERR " unchanged" |
|
293
|
|
|
|
|
|
|
if( $verbose ); |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
1
|
50
|
|
|
|
3
|
print STDERR " from server.\n" |
|
297
|
|
|
|
|
|
|
if( $verbose ); |
|
298
|
|
|
|
|
|
|
|
|
299
|
1
|
50
|
33
|
|
|
27
|
_write_cache_entry( $filename, $url, $r, $res ) |
|
300
|
|
|
|
|
|
|
if( $res->code == RC_OK or |
|
301
|
|
|
|
|
|
|
$res->code == RC_PARTIAL_CONTENT ); |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
else { |
|
305
|
|
|
|
|
|
|
# We won't try to cache this request. |
|
306
|
0
|
|
|
|
|
0
|
$res = &$org_simple_request( $self, $r, |
|
307
|
|
|
|
|
|
|
$content_cb, $read_size_hint ); |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
|
|
310
|
1
|
|
|
|
|
134
|
return $res; |
|
311
|
|
|
|
|
|
|
} |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
sub _get_from_cachefile { |
|
314
|
2
|
|
|
2
|
|
5
|
my( $filename, $fh, $res, $meta ) = @_; |
|
315
|
|
|
|
|
|
|
|
|
316
|
2
|
|
|
|
|
3
|
my $content; |
|
317
|
|
|
|
|
|
|
my $buf; |
|
318
|
2
|
|
|
|
|
15
|
while ( $fh->read( $buf, 1024 ) > 0 ) { |
|
319
|
4
|
|
|
|
|
51
|
$content .= $buf; |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
|
|
322
|
2
|
|
|
|
|
14
|
$fh->close(); |
|
323
|
|
|
|
|
|
|
|
|
324
|
2
|
50
|
|
|
|
26
|
$content = "" if not defined $content; |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# Set last-accessed for cache-entry. |
|
327
|
2
|
|
|
|
|
3
|
my $mtime = time; |
|
328
|
2
|
|
|
|
|
77
|
utime( $mtime, $mtime, $filename ); |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
# modify response |
|
331
|
2
|
50
|
|
|
|
11
|
if( $HTTP::Message::VERSION >= 1.44 ) { |
|
332
|
2
|
|
|
|
|
14
|
$res->content_ref( \$content ); |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
else { |
|
335
|
0
|
|
|
|
|
0
|
$res->content( $content ); |
|
336
|
|
|
|
|
|
|
} |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# For HTTP::Cache::Transparent earlier than 0.4, |
|
339
|
|
|
|
|
|
|
# there is no Code in the cache. |
|
340
|
2
|
50
|
|
|
|
39
|
if( defined( $meta->{Code} ) ) { |
|
341
|
2
|
|
|
|
|
7
|
$res->code( $meta->{Code} ); |
|
342
|
|
|
|
|
|
|
} |
|
343
|
|
|
|
|
|
|
else { |
|
344
|
0
|
|
|
|
|
0
|
$res->code( RC_OK ); |
|
345
|
|
|
|
|
|
|
} |
|
346
|
2
|
|
50
|
|
|
18
|
$res->message(status_message($res->code) || "Unknown code"); |
|
347
|
|
|
|
|
|
|
|
|
348
|
2
|
|
|
|
|
47
|
foreach my $h (@cache_headers) { |
|
349
|
|
|
|
|
|
|
$res->header( $h, $meta->{$h} ) |
|
350
|
10
|
100
|
|
|
|
174
|
if defined( $meta->{ $h } ); |
|
351
|
|
|
|
|
|
|
} |
|
352
|
|
|
|
|
|
|
|
|
353
|
2
|
|
|
|
|
85
|
$res->header( "X-Cached", 1 ); |
|
354
|
2
|
|
|
|
|
145
|
$res->header( "X-Content-Unchanged", 1 ); |
|
355
|
|
|
|
|
|
|
} |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
# Read metadata and position filehandle at start of data. |
|
358
|
|
|
|
|
|
|
sub _read_meta { |
|
359
|
2
|
|
|
2
|
|
4
|
my( $fh ) = @_; |
|
360
|
2
|
|
|
|
|
4
|
my %meta; |
|
361
|
|
|
|
|
|
|
|
|
362
|
2
|
|
|
|
|
3
|
my( $key, $value ); |
|
363
|
2
|
|
|
|
|
3
|
do { |
|
364
|
18
|
|
|
|
|
62
|
my $line = <$fh>; |
|
365
|
18
|
|
|
|
|
57
|
( $key, $value ) = ($line =~ /(\S+)\s+(.*)[\n\r]*/); |
|
366
|
|
|
|
|
|
|
|
|
367
|
18
|
100
|
|
|
|
70
|
$meta{$key} = $value |
|
368
|
|
|
|
|
|
|
if( defined $value ); |
|
369
|
|
|
|
|
|
|
} while( defined( $value ) ); |
|
370
|
|
|
|
|
|
|
|
|
371
|
2
|
|
|
|
|
4
|
return \%meta; |
|
372
|
|
|
|
|
|
|
} |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# Write metadata and position filehandle where data should be written. |
|
375
|
|
|
|
|
|
|
sub _write_meta { |
|
376
|
2
|
|
|
2
|
|
2
|
my( $fh, $meta ) = @_; |
|
377
|
|
|
|
|
|
|
|
|
378
|
2
|
|
|
|
|
4
|
foreach my $key (sort keys( %{$meta} ) ) { |
|
|
2
|
|
|
|
|
15
|
|
|
379
|
16
|
|
|
|
|
45
|
print $fh "$key $meta->{$key}\n"; |
|
380
|
|
|
|
|
|
|
} |
|
381
|
|
|
|
|
|
|
|
|
382
|
2
|
|
|
|
|
6
|
print $fh "\n"; |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
sub _write_cache_entry { |
|
386
|
2
|
|
|
2
|
|
20
|
my( $filename, $url, $req, $res ) = @_; |
|
387
|
|
|
|
|
|
|
|
|
388
|
2
|
|
|
|
|
11
|
my $out_filename = "$filename.tmp$$"; |
|
389
|
2
|
50
|
|
|
|
16
|
my $fh = new IO::File "> $out_filename" |
|
390
|
|
|
|
|
|
|
or die "Failed to write to $out_filename"; |
|
391
|
|
|
|
|
|
|
|
|
392
|
2
|
|
|
|
|
361
|
my $meta; |
|
393
|
2
|
|
|
|
|
10
|
$meta->{Url} = $url; |
|
394
|
2
|
50
|
|
|
|
8
|
$meta->{ETag} = $res->header('ETag') |
|
395
|
|
|
|
|
|
|
if defined( $res->header('ETag') ); |
|
396
|
|
|
|
|
|
|
|
|
397
|
2
|
|
|
|
|
130
|
my $content = $res->content; |
|
398
|
2
|
50
|
|
|
|
27
|
$content = "" if not defined $content; |
|
399
|
|
|
|
|
|
|
|
|
400
|
2
|
|
|
|
|
21
|
$meta->{MD5} = md5_hex( $content ); |
|
401
|
2
|
50
|
|
|
|
11
|
$meta->{Range} = $req->header('Range') |
|
402
|
|
|
|
|
|
|
if defined( $req->header('Range') ); |
|
403
|
2
|
|
|
|
|
64
|
$meta->{Code} = $res->code; |
|
404
|
2
|
|
|
|
|
16
|
$meta->{'X-HCT-LastUpdated'} = time; |
|
405
|
|
|
|
|
|
|
|
|
406
|
2
|
|
|
|
|
4
|
foreach my $h (@cache_headers) { |
|
407
|
10
|
100
|
|
|
|
311
|
$meta->{$h} = $res->header( $h ) |
|
408
|
|
|
|
|
|
|
if defined $res->header( $h ); |
|
409
|
|
|
|
|
|
|
} |
|
410
|
|
|
|
|
|
|
|
|
411
|
2
|
|
|
|
|
92
|
_write_meta( $fh, $meta ); |
|
412
|
|
|
|
|
|
|
|
|
413
|
2
|
|
|
|
|
3
|
print $fh $content; |
|
414
|
2
|
|
|
|
|
13
|
$fh->close; |
|
415
|
|
|
|
|
|
|
|
|
416
|
2
|
50
|
|
|
|
107
|
move( $out_filename, $filename ) || unlink $out_filename; |
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub _urlhash { |
|
420
|
3
|
|
|
3
|
|
5
|
my( $url ) = @_; |
|
421
|
|
|
|
|
|
|
|
|
422
|
3
|
|
|
|
|
23
|
return md5_hex( $url ); |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
sub _remove_old_entries { |
|
426
|
2
|
100
|
66
|
2
|
|
45
|
if( defined( $basepath ) and -d( $basepath ) ) { |
|
427
|
1
|
|
|
|
|
10
|
my $oldcwd = getcwd(); |
|
428
|
1
|
|
|
|
|
15
|
chdir( $basepath ); |
|
429
|
|
|
|
|
|
|
|
|
430
|
1
|
|
|
|
|
74
|
my @files = glob("*"); |
|
431
|
1
|
|
|
|
|
3
|
foreach my $file (@files) { |
|
432
|
1
|
50
|
|
|
|
7
|
if( $file !~ m%^[0-9a-f]{32}$% ) { |
|
433
|
0
|
|
|
|
|
0
|
print STDERR "HTTP::Cache::Transparent: Unknown file found in cache directory: $basepath$file\n"; |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
else { |
|
436
|
1
|
|
|
|
|
7
|
my $age = (-M $file); |
|
437
|
|
|
|
|
|
|
# The file may have disappeared if another process has cleaned |
|
438
|
|
|
|
|
|
|
# the cache. |
|
439
|
1
|
50
|
33
|
|
|
10
|
if( defined($age) && ( $age*24 > $maxage ) ) { |
|
440
|
0
|
0
|
|
|
|
0
|
print STDERR "Deleting $file.\n" |
|
441
|
|
|
|
|
|
|
if( $verbose ); |
|
442
|
0
|
|
|
|
|
0
|
unlink( $file ); |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
1
|
|
|
|
|
24
|
chdir( $oldcwd ); |
|
448
|
|
|
|
|
|
|
} |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=head1 INSPECTING CACHE BEHAVIOR |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
The HTTP::Cache::Transparent inserts three special headers in the |
|
454
|
|
|
|
|
|
|
HTTP::Response object. These can be accessed via the |
|
455
|
|
|
|
|
|
|
HTTP::Response::header()-method. |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=over 4 |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=item X-Cached |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
This header is inserted and set to 1 if the response is delivered from |
|
462
|
|
|
|
|
|
|
the cache instead of from the server. |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
=item X-Content-Unchanged |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
This header is inserted and set to 1 if the content returned is the same |
|
467
|
|
|
|
|
|
|
as the content returned the last time this url was fetched. This header |
|
468
|
|
|
|
|
|
|
is always inserted and set to 1 when the response is delivered from |
|
469
|
|
|
|
|
|
|
the cache. |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=item X-No-Server-Contact |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
This header is inserted and set to 1 if the content returned has been |
|
474
|
|
|
|
|
|
|
delivered without any contact with the external server, i.e. no |
|
475
|
|
|
|
|
|
|
conditional or unconditional HTTP GET request has been sent, the content |
|
476
|
|
|
|
|
|
|
has been delivered directly from cache. This may be useful when seeking |
|
477
|
|
|
|
|
|
|
to control loading of the external server. |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=back |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=head1 LIMITATIONS |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
This module has a number of limitations that you should be aware of |
|
484
|
|
|
|
|
|
|
before using it. |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=over 4 |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
=item - |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
There is no upper limit to how much diskspace the cache requires. The |
|
491
|
|
|
|
|
|
|
only limiting mechanism is that data for urls that haven't been requested |
|
492
|
|
|
|
|
|
|
in the last MaxAge hours will be removed from the cache the next time |
|
493
|
|
|
|
|
|
|
the program exits. |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=item - |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
Currently, only get-requests that store the result in memory (i.e. do |
|
498
|
|
|
|
|
|
|
not use the option to have the result stored directly in a file or |
|
499
|
|
|
|
|
|
|
delivered via a callback) is cached. I intend to remove this limitation |
|
500
|
|
|
|
|
|
|
in a future version. |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=item - |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
The support for Ranges is a bit primitive. It creates a new object in |
|
505
|
|
|
|
|
|
|
the cache for each unique combination of url and range. This will work ok |
|
506
|
|
|
|
|
|
|
as long as you always request the same range(s) for a url. |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=item - |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
The cache doesn't properly check and store all headers in the HTTP |
|
511
|
|
|
|
|
|
|
request and response. Therefore, if you request the same url repeatedly |
|
512
|
|
|
|
|
|
|
with different sets of headers (cookies, accept-encoding etc), and these |
|
513
|
|
|
|
|
|
|
headers affect the response from the server, the cache may return the |
|
514
|
|
|
|
|
|
|
wrong response. |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=item - |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
HTTP::Cache::Transparent has not been tested with threads, and will |
|
519
|
|
|
|
|
|
|
most likely not work if you use them. |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
=back |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head1 CACHE FORMAT |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
The cache is stored on disk as one file per cached object. The filename |
|
526
|
|
|
|
|
|
|
is equal to the md5sum of the url and the Range-header if it exists. |
|
527
|
|
|
|
|
|
|
The file contains a set of |
|
528
|
|
|
|
|
|
|
key/value-pairs with metadata (one entry per line) followed by a blank |
|
529
|
|
|
|
|
|
|
line and then the actual data returned by the server. |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
The last modified date of the cache file is set to the time when the |
|
532
|
|
|
|
|
|
|
cache object was last requested by a user. |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=head1 AUTHOR |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
Mattias Holmlund, E$firstname -at- $lastname -dot- seE |
|
537
|
|
|
|
|
|
|
L |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=head1 GIT REPOSITORY |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
A git repository containing the source for this module can be found |
|
542
|
|
|
|
|
|
|
via http://git.holmlund.se/ |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Copyright (C) 2004-2007 by Mattias Holmlund |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
549
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
|
550
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=cut |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
1; |