line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTTP::OAI::UserAgent; |
2
|
|
|
|
|
|
|
|
3
|
11
|
|
|
11
|
|
65
|
use strict; |
|
11
|
|
|
|
|
18
|
|
|
11
|
|
|
|
|
275
|
|
4
|
11
|
|
|
11
|
|
46
|
use warnings; |
|
11
|
|
|
|
|
20
|
|
|
11
|
|
|
|
|
287
|
|
5
|
|
|
|
|
|
|
|
6
|
11
|
|
|
11
|
|
42
|
use vars qw(@ISA $ACCEPT); |
|
11
|
|
|
|
|
19
|
|
|
11
|
|
|
|
|
882
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '4.12'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Do not use eval() |
11
|
|
|
|
|
|
|
our $USE_EVAL = 1; |
12
|
|
|
|
|
|
|
# Ignore bad utf8 characters |
13
|
|
|
|
|
|
|
our $IGNORE_BAD_CHARS = 1; |
14
|
|
|
|
|
|
|
# Silence bad utf8 warnings |
15
|
|
|
|
|
|
|
our $SILENT_BAD_CHARS = 0; |
16
|
|
|
|
|
|
|
|
17
|
11
|
|
|
11
|
|
68
|
use constant MAX_UTF8_BYTES => 4; |
|
11
|
|
|
|
|
16
|
|
|
11
|
|
|
|
|
12705
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
require LWP::UserAgent; |
20
|
|
|
|
|
|
|
@ISA = qw(LWP::UserAgent); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
unless( $@ ) { |
23
|
|
|
|
|
|
|
$ACCEPT = "gzip"; |
24
|
|
|
|
|
|
|
} |
25
|
|
|
|
|
|
|
|
26
|
1
|
|
|
1
|
1
|
3
|
sub delay { shift->_elem( "delay", @_ ) } |
27
|
1
|
|
|
1
|
0
|
4
|
sub last_request_completed { shift->_elem( "last_request_completed", @_ ) } |
28
|
|
|
|
|
|
|
|
29
|
0
|
|
|
0
|
1
|
0
|
sub redirect_ok { 1 } |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub _oai { |
32
|
10
|
|
|
10
|
|
30
|
my( $self, @args ) = @_; |
33
|
10
|
50
|
|
|
|
26
|
my $cb = ref($args[0]) eq "CODE" ? shift @args : undef; |
34
|
10
|
|
|
|
|
26
|
my %args = @args; |
35
|
10
|
|
33
|
|
|
53
|
$cb = delete $args{onRecord} || $cb || $self->{onRecord}; |
36
|
|
|
|
|
|
|
|
37
|
10
|
|
50
|
|
|
42
|
my $handlers = delete $args{handlers} || {}; |
38
|
|
|
|
|
|
|
|
39
|
10
|
50
|
66
|
|
|
46
|
if( !$args{force} && (my @errors = HTTP::OAI::Repository::validate_request(%args)) ) { |
40
|
0
|
|
|
|
|
0
|
return new HTTP::OAI::Response( |
41
|
|
|
|
|
|
|
code=>503, |
42
|
|
|
|
|
|
|
message=>'Invalid Request (use \'force\' to force a non-conformant request): ' . $errors[0]->toString, |
43
|
|
|
|
|
|
|
errors=>\@errors |
44
|
|
|
|
|
|
|
); |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Get rid of any empty arguments |
48
|
10
|
|
|
|
|
25
|
for( keys %args ) { |
49
|
20
|
50
|
33
|
|
|
68
|
delete $args{$_} if !defined($args{$_}) || !length($args{$_}); |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
10
|
|
|
|
|
32
|
my $request = HTTP::Request->new( GET => $self->_buildurl(%args) ); |
53
|
|
|
|
|
|
|
|
54
|
10
|
|
|
|
|
991
|
delete $args{force}; |
55
|
|
|
|
|
|
|
|
56
|
10
|
|
|
|
|
52
|
my $response = HTTP::OAI::Response->new( |
57
|
|
|
|
|
|
|
%args, |
58
|
|
|
|
|
|
|
handlers => $handlers, |
59
|
|
|
|
|
|
|
onRecord => $cb, |
60
|
|
|
|
|
|
|
); |
61
|
10
|
|
|
|
|
38
|
$response->request( $request ); |
62
|
10
|
|
|
|
|
125
|
my $parser = XML::LibXML->new( |
63
|
|
|
|
|
|
|
Handler => HTTP::OAI::SAX::Trace->new( |
64
|
|
|
|
|
|
|
Handler => HTTP::OAI::SAX::Text->new( |
65
|
|
|
|
|
|
|
Handler => $response |
66
|
|
|
|
|
|
|
) ) ); |
67
|
10
|
|
|
|
|
598
|
$parser->{content_length} = 0; |
68
|
10
|
|
|
|
|
60
|
$parser->{content_buffer} = Encode::encode('UTF-8',''); |
69
|
|
|
|
|
|
|
|
70
|
10
|
|
|
|
|
608
|
HTTP::OAI::Debug::trace( $args{verb} . " " . ref($parser) . "->parse_chunk()" ); |
71
|
10
|
|
|
|
|
14
|
my $r; |
72
|
|
|
|
|
|
|
{ |
73
|
10
|
|
|
|
|
13
|
local $SIG{__DIE__}; |
|
10
|
|
|
|
|
30
|
|
74
|
|
|
|
|
|
|
$r = $self->SUPER::request($request,sub { |
75
|
14
|
|
|
14
|
|
27659
|
$self->lwp_callback( $parser, @_ ) |
76
|
10
|
|
|
|
|
53
|
}); |
77
|
10
|
100
|
100
|
|
|
208161
|
if( $r->is_success && !defined $r->headers->header( 'Client-Aborted' ) ) |
78
|
|
|
|
|
|
|
{ |
79
|
1
|
|
|
|
|
48
|
eval { $self->lwp_endparse( $parser ) }; |
|
1
|
|
|
|
|
10
|
|
80
|
1
|
50
|
|
|
|
70
|
if( $@ ) |
81
|
|
|
|
|
|
|
{ |
82
|
0
|
|
|
|
|
0
|
$r->headers->header( 'Client-Aborted', 'die' ); |
83
|
0
|
|
|
|
|
0
|
$r->headers->header( 'X-Died', $@ ); |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
} |
87
|
10
|
100
|
66
|
|
|
363
|
if( defined($r->headers->header( 'Client-Aborted' )) && $r->headers->header( 'Client-Aborted' ) eq 'die' ) |
88
|
|
|
|
|
|
|
{ |
89
|
8
|
|
|
|
|
478
|
my $err = $r->headers->header( 'X-Died' ); |
90
|
8
|
50
|
|
|
|
230
|
if( $err eq "done" ) |
91
|
|
|
|
|
|
|
{ |
92
|
8
|
|
|
|
|
33
|
$r->code(200); |
93
|
8
|
|
|
|
|
66
|
$r->message("OK"); |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
else |
96
|
|
|
|
|
|
|
{ |
97
|
0
|
|
|
|
|
0
|
$r->code(500); |
98
|
0
|
|
|
|
|
0
|
$r->message( 'An error occurred while parsing: ' . $err ); |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
10
|
|
|
|
|
163
|
my $cnt_len = $parser->{content_length}; |
103
|
10
|
|
|
|
|
51
|
undef $parser; |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# OAI retry-after |
106
|
10
|
50
|
33
|
|
|
32
|
if( defined($r) && ( $r->code == 503 || $r->code == 429 ) && defined(my $timeout = $r->headers->header('Retry-After')) ) { |
|
|
50
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
50
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
107
|
0
|
0
|
|
|
|
0
|
if( $self->{recursion}++ > 10 ) { |
108
|
0
|
|
|
|
|
0
|
$r->code(500); |
109
|
0
|
|
|
|
|
0
|
$r->message("Server did not give a response after 10 retries"); |
110
|
0
|
|
|
|
|
0
|
return $r; |
111
|
|
|
|
|
|
|
} |
112
|
0
|
0
|
0
|
|
|
0
|
if( !$timeout or $timeout =~ /\D/ or $timeout < 0 or $timeout > 86400 ) { |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
113
|
0
|
|
|
|
|
0
|
$r->code(500); |
114
|
0
|
|
0
|
|
|
0
|
$r->message("Server specified an unsupported duration to wait (\"".($timeout||'null')."\""); |
115
|
0
|
|
|
|
|
0
|
return $r; |
116
|
|
|
|
|
|
|
} |
117
|
0
|
|
|
|
|
0
|
HTTP::OAI::Debug::trace( "Waiting $timeout seconds" ); |
118
|
0
|
|
|
|
|
0
|
sleep($timeout+10); # We wait an extra 10 secs for safety |
119
|
0
|
|
|
|
|
0
|
return $self->_oai(@args); |
120
|
|
|
|
|
|
|
# Got an empty response |
121
|
|
|
|
|
|
|
} elsif( defined($r) && $r->is_success && $cnt_len == 0 ) { |
122
|
0
|
0
|
|
|
|
0
|
if( $self->{recursion}++ > 10 ) { |
123
|
0
|
|
|
|
|
0
|
$r->code(500); |
124
|
0
|
|
|
|
|
0
|
$r->message("No content in server response"); |
125
|
0
|
|
|
|
|
0
|
return $r; |
126
|
|
|
|
|
|
|
} |
127
|
0
|
|
|
|
|
0
|
HTTP::OAI::Debug::trace( "Retrying on empty response" ); |
128
|
0
|
|
|
|
|
0
|
sleep(5); |
129
|
0
|
|
|
|
|
0
|
return $self->_oai(@args); |
130
|
|
|
|
|
|
|
# An HTTP error occurred |
131
|
|
|
|
|
|
|
} elsif( $r->is_error ) { |
132
|
1
|
|
|
|
|
72
|
return $r; |
133
|
|
|
|
|
|
|
# An error occurred during parsing |
134
|
|
|
|
|
|
|
} elsif( $@ ) { |
135
|
0
|
0
|
|
|
|
0
|
$r->code(my $code = $@ =~ /read timeout/ ? 504 : 600); |
136
|
0
|
|
|
|
|
0
|
$r->message($@); |
137
|
0
|
|
|
|
|
0
|
return $r; |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# access the original response via previous |
141
|
9
|
|
|
|
|
263
|
$response->previous($r); |
142
|
|
|
|
|
|
|
|
143
|
9
|
|
|
|
|
86
|
return $response; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub request |
147
|
|
|
|
|
|
|
{ |
148
|
1
|
|
|
1
|
1
|
125
|
my( $self, @args ) = @_; |
149
|
|
|
|
|
|
|
|
150
|
1
|
|
|
|
|
6
|
my $delay = $self->delay; |
151
|
1
|
50
|
|
|
|
12
|
if( defined $delay ) |
152
|
|
|
|
|
|
|
{ |
153
|
0
|
0
|
|
|
|
0
|
if( ref($delay) eq "CODE" ) |
154
|
|
|
|
|
|
|
{ |
155
|
0
|
|
|
|
|
0
|
$delay = &$delay( $self->last_request_completed ); |
156
|
|
|
|
|
|
|
} |
157
|
0
|
0
|
|
|
|
0
|
select(undef,undef,undef,$delay) if $delay > 0; |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
1
|
|
|
|
|
9
|
my $r = $self->SUPER::request( @args ); |
161
|
|
|
|
|
|
|
|
162
|
1
|
|
|
|
|
18949
|
$self->last_request_completed( time ); |
163
|
|
|
|
|
|
|
|
164
|
1
|
|
|
|
|
23
|
return $r; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub lwp_badchar |
168
|
|
|
|
|
|
|
{ |
169
|
0
|
|
|
0
|
0
|
0
|
my $codepoint = sprintf('U+%04x', ord($_[2])); |
170
|
0
|
0
|
|
|
|
0
|
unless( $SILENT_BAD_CHARS ) |
171
|
|
|
|
|
|
|
{ |
172
|
0
|
|
|
|
|
0
|
warn "Bad Unicode character $codepoint at byte offset ".$_[1]->{content_length}." from ".$_[1]->{request}->uri."\n"; |
173
|
|
|
|
|
|
|
} |
174
|
0
|
|
|
|
|
0
|
return $codepoint; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub lwp_endparse |
178
|
|
|
|
|
|
|
{ |
179
|
1
|
|
|
1
|
0
|
3
|
my( $self, $parser ) = @_; |
180
|
|
|
|
|
|
|
|
181
|
1
|
|
|
|
|
3
|
my $utf8 = $parser->{content_buffer}; |
182
|
|
|
|
|
|
|
# Replace bad chars with '?' |
183
|
1
|
50
|
33
|
|
|
13
|
if( $IGNORE_BAD_CHARS and length($utf8) ) { |
184
|
0
|
|
|
0
|
|
0
|
$utf8 = Encode::decode('UTF-8', $utf8, sub { $self->lwp_badchar($parser, @_) }); |
|
0
|
|
|
|
|
0
|
|
185
|
|
|
|
|
|
|
} |
186
|
1
|
50
|
|
|
|
4
|
if( length($utf8) > 0 ) |
187
|
|
|
|
|
|
|
{ |
188
|
0
|
|
|
|
|
0
|
_ccchars($utf8); # Fix control chars |
189
|
0
|
|
|
|
|
0
|
$parser->{content_length} += length($utf8); |
190
|
0
|
|
|
|
|
0
|
$parser->parse_chunk($utf8); |
191
|
|
|
|
|
|
|
} |
192
|
1
|
|
|
|
|
2
|
delete($parser->{content_buffer}); |
193
|
1
|
|
|
|
|
4
|
$parser->parse_chunk('', 1); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub lwp_callback |
197
|
|
|
|
|
|
|
{ |
198
|
14
|
|
|
14
|
0
|
23
|
my( $self, $parser ) = @_; |
199
|
|
|
|
|
|
|
|
200
|
11
|
|
|
11
|
|
111
|
use bytes; # fixing utf-8 will need byte semantics |
|
11
|
|
|
|
|
20
|
|
|
11
|
|
|
|
|
84
|
|
201
|
|
|
|
|
|
|
|
202
|
14
|
|
|
|
|
68
|
$parser->{content_buffer} .= $_[2]; |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
do |
205
|
|
|
|
|
|
|
{ |
206
|
|
|
|
|
|
|
# FB_QUIET won't split multi-byte chars on input |
207
|
14
|
|
|
|
|
77
|
my $utf8 = Encode::decode('UTF-8', $parser->{content_buffer}, Encode::FB_QUIET); |
208
|
|
|
|
|
|
|
|
209
|
14
|
50
|
|
|
|
651
|
if( length($utf8) > 0 ) |
210
|
|
|
|
|
|
|
{ |
211
|
11
|
|
|
11
|
|
6480
|
use utf8; |
|
11
|
|
|
|
|
148
|
|
|
11
|
|
|
|
|
62
|
|
212
|
14
|
|
|
|
|
41
|
_ccchars($utf8); # Fix control chars |
213
|
14
|
|
|
|
|
23
|
$parser->{content_length} += length($utf8); |
214
|
14
|
|
|
|
|
35
|
$parser->parse_chunk($utf8); |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
6
|
50
|
|
|
|
62
|
if( length($parser->{content_buffer}) > MAX_UTF8_BYTES ) |
218
|
|
|
|
|
|
|
{ |
219
|
0
|
|
|
|
|
0
|
$parser->{content_buffer} =~ s/^([\x80-\xff]{1,4})//s; |
220
|
0
|
|
|
|
|
0
|
my $badbytes = $1; |
221
|
0
|
0
|
|
|
|
0
|
if( length($badbytes) == 0 ) |
222
|
|
|
|
|
|
|
{ |
223
|
0
|
|
|
|
|
0
|
Carp::confess "Internal error - bad bytes but not in 0x80-0xff range???"; |
224
|
|
|
|
|
|
|
} |
225
|
0
|
0
|
|
|
|
0
|
if( $IGNORE_BAD_CHARS ) |
226
|
|
|
|
|
|
|
{ |
227
|
|
|
|
|
|
|
$badbytes = join('', map { |
228
|
0
|
|
|
|
|
0
|
$self->lwp_badchar($parser, $_) |
|
0
|
|
|
|
|
0
|
|
229
|
|
|
|
|
|
|
} split //, $badbytes); |
230
|
|
|
|
|
|
|
} |
231
|
0
|
|
|
|
|
0
|
$parser->parse_chunk( $badbytes ); |
232
|
|
|
|
|
|
|
} |
233
|
14
|
|
|
|
|
25
|
} while( length($parser->{content_buffer}) > MAX_UTF8_BYTES ); |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub _ccchars { |
237
|
14
|
|
|
14
|
|
589
|
$_[0] =~ s/([\x00-\x08\x0b-\x0c\x0e-\x1f])/sprintf("\\%04d",ord($1))/seg; |
|
0
|
|
|
|
|
0
|
|
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
sub _buildurl { |
241
|
10
|
|
|
10
|
|
26
|
my( $self, %args ) = @_; |
242
|
|
|
|
|
|
|
|
243
|
10
|
50
|
|
|
|
22
|
Carp::confess "Requires verb parameter" unless $args{'verb'}; |
244
|
|
|
|
|
|
|
|
245
|
10
|
|
|
|
|
28
|
my $uri = URI->new( $self->baseURL ); |
246
|
10
|
100
|
|
|
|
570
|
return $uri->as_string if $uri->scheme eq "file"; |
247
|
|
|
|
|
|
|
|
248
|
1
|
50
|
33
|
|
|
17
|
if( defined($args{resumptionToken}) && !$args{force} ) { |
249
|
0
|
|
|
|
|
0
|
$uri->query_form(verb=>$args{'verb'},resumptionToken=>$args{'resumptionToken'}); |
250
|
|
|
|
|
|
|
} else { |
251
|
1
|
|
|
|
|
2
|
delete $args{force}; |
252
|
|
|
|
|
|
|
# http://www.cshc.ubc.ca/oai/ breaks if verb isn't first, doh |
253
|
1
|
|
|
|
|
16
|
$uri->query_form(verb=>delete($args{'verb'}),%args); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
1
|
|
|
|
|
84
|
return $uri->as_string; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
sub decompress { |
260
|
0
|
|
|
0
|
0
|
|
my ($response) = @_; |
261
|
0
|
|
|
|
|
|
my $type = $response->headers->header("Content-Encoding"); |
262
|
0
|
0
|
|
|
|
|
return $response->{_content_filename} unless defined($type); |
263
|
0
|
0
|
|
|
|
|
if( $type eq 'gzip' ) { |
264
|
0
|
|
|
|
|
|
my $filename = File::Temp->new( UNLINK => 1 ); |
265
|
0
|
0
|
|
|
|
|
my $gz = Compress::Zlib::gzopen($response->{_content_filename}, "r") or die $!; |
266
|
0
|
|
|
|
|
|
my ($buffer,$c); |
267
|
0
|
|
|
|
|
|
my $fh = IO::File->new($filename,"w"); |
268
|
0
|
|
|
|
|
|
binmode($fh,":utf8"); |
269
|
0
|
|
|
|
|
|
while( ($c = $gz->gzread($buffer)) > 0 ) { |
270
|
0
|
|
|
|
|
|
print $fh $buffer; |
271
|
|
|
|
|
|
|
} |
272
|
0
|
|
|
|
|
|
$fh->close(); |
273
|
0
|
|
|
|
|
|
$gz->gzclose(); |
274
|
0
|
0
|
|
|
|
|
die "Error decompressing gziped response: " . $gz->gzerror() if -1 == $c; |
275
|
0
|
|
|
|
|
|
return $response->{_content_filename} = $filename; |
276
|
|
|
|
|
|
|
} else { |
277
|
0
|
|
|
|
|
|
die "Unsupported compression returned: $type\n"; |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
1; |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
__END__ |