line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Copyright (c) 1996-1998 LUB NetLab |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
4
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
5
|
|
|
|
|
|
|
# the Free Software Foundation; either version 1, or (at your option) |
6
|
|
|
|
|
|
|
# any later version. |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
9
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
10
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11
|
|
|
|
|
|
|
# GNU General Public License for more details. |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
14
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
15
|
|
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
16
|
|
|
|
|
|
|
# |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# NO WARRANTY |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
21
|
|
|
|
|
|
|
# FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
22
|
|
|
|
|
|
|
# OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
23
|
|
|
|
|
|
|
# PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED |
24
|
|
|
|
|
|
|
# OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
25
|
|
|
|
|
|
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS |
26
|
|
|
|
|
|
|
# TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE |
27
|
|
|
|
|
|
|
# PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, |
28
|
|
|
|
|
|
|
# REPAIR OR CORRECTION. |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
# IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
31
|
|
|
|
|
|
|
# WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
32
|
|
|
|
|
|
|
# REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, |
33
|
|
|
|
|
|
|
# INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING |
34
|
|
|
|
|
|
|
# OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED |
35
|
|
|
|
|
|
|
# TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY |
36
|
|
|
|
|
|
|
# YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER |
37
|
|
|
|
|
|
|
# PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE |
38
|
|
|
|
|
|
|
# POSSIBILITY OF SUCH DAMAGES. |
39
|
|
|
|
|
|
|
# |
40
|
|
|
|
|
|
|
# Copyright (c) 1996-1998 LUB NetLab |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# $Id: UA.pm 257 2008-09-03 08:23:32Z anders $ |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# COMB/XWI/UA.pm - harvesting robots with XWI interface |
46
|
|
|
|
|
|
|
# v0.01 by Yong Cao, 1997-08-08 |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
package Combine::UA; |
49
|
|
|
|
|
|
|
|
50
|
1
|
|
|
1
|
|
632
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
39
|
|
51
|
1
|
|
|
1
|
|
511
|
use Combine::Config; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
30
|
|
52
|
1
|
|
|
1
|
|
1289
|
use LWP::UserAgent; |
|
1
|
|
|
|
|
55460
|
|
|
1
|
|
|
|
|
39
|
|
53
|
1
|
|
|
1
|
|
11
|
use HTTP::Date; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1060
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
my $expGar; |
56
|
|
|
|
|
|
|
my $userAgentGetIfModifiedSince; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub TruncatingUserAgent { |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# This function returns an LWP::UserAgent that truncates incoming data |
61
|
|
|
|
|
|
|
# when a number of bytes, that's dictated by Combine's configuration set, |
62
|
|
|
|
|
|
|
# has been received. |
63
|
|
|
|
|
|
|
# |
64
|
|
|
|
|
|
|
# Experiments (1999-02-02) have shown that the truncation is approximate |
65
|
|
|
|
|
|
|
# in that the resulting document size may vary up or down a few percents |
66
|
|
|
|
|
|
|
# or kilobytes. |
67
|
|
|
|
|
|
|
|
68
|
0
|
|
|
0
|
0
|
|
my $ua = new LWP::UserAgent(); |
69
|
|
|
|
|
|
|
# $ua->max_size(COMB::Config::GetMaxDocSize()); #Problem with webservers returning 206 partial content in a multipart |
70
|
0
|
|
|
|
|
|
$ua->timeout(Combine::Config::Get('UAtimeout')); |
71
|
0
|
|
|
|
|
|
$ua->agent("Combine/3 http://combine.it.lth.se/"); |
72
|
0
|
|
|
|
|
|
$ua->from(Combine::Config::Get('Operator-Email')); |
73
|
0
|
|
|
|
|
|
$ua->default_header('Accept-Encoding' => 'gzip'); |
74
|
0
|
0
|
|
|
|
|
if (Combine::Config::Get('httpProxy')) { |
75
|
0
|
|
|
|
|
|
$ua->proxy(['http', 'https'], Combine::Config::Get('httpProxy')); |
76
|
|
|
|
|
|
|
} |
77
|
0
|
|
|
|
|
|
$expGar = Combine::Config::Get('WaitIntervalExpirationGuaranteed'); |
78
|
0
|
|
|
|
|
|
$userAgentGetIfModifiedSince = Combine::Config::Get('UserAgentGetIfModifiedSince'); |
79
|
0
|
|
|
|
|
|
return $ua; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub fetch { # use get-if-modified-since |
84
|
0
|
|
|
0
|
0
|
|
my ($xwi, $since) = @_; |
85
|
0
|
|
|
|
|
|
my ($url_str, $ua, $req, $resp, $code, $msg, $method, $type, $ext); |
86
|
0
|
|
|
|
|
|
$ua = TruncatingUserAgent(); |
87
|
|
|
|
|
|
|
#FIX! $since = $jcf->ftime unless $since; |
88
|
0
|
0
|
|
|
|
|
$since = time - $expGar unless $since; |
89
|
0
|
|
|
|
|
|
$url_str = $xwi->url; |
90
|
0
|
|
|
|
|
|
$type = ''; #FIX $jcf->typ; |
91
|
0
|
|
|
|
|
|
$method = "GET"; |
92
|
0
|
0
|
|
|
|
|
if ( $type ) { |
93
|
0
|
0
|
|
|
|
|
$method = "HEAD" unless defined(${Combine::Config::Get('converters')}{$type}); |
|
0
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
} else { |
95
|
0
|
0
|
|
|
|
|
if ( $url_str =~ m/\.([^\/\s\.]+)\s*$/ ) { |
96
|
0
|
|
|
|
|
|
$ext = $1; |
97
|
0
|
|
|
|
|
|
$ext =~ tr/A-Z/a-z/; |
98
|
0
|
0
|
|
|
|
|
$method = "HEAD" if defined(${Combine::Config::Get('binext')}{$ext}); |
|
0
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
} |
101
|
0
|
0
|
|
|
|
|
if ( $method eq "HEAD" ) { |
102
|
0
|
|
|
|
|
|
$req = new HTTP::Request 'HEAD'=> $url_str; |
103
|
0
|
0
|
|
|
|
|
$req->header('If-Modified-Since' => &time2str($since)) |
104
|
|
|
|
|
|
|
if $userAgentGetIfModifiedSince; |
105
|
0
|
0
|
|
|
|
|
if (Combine::Config::Get('UserAgentFollowRedirects')) { $resp = $ua->request($req); } |
|
0
|
|
|
|
|
|
|
106
|
0
|
|
|
|
|
|
else { $resp = $ua->simple_request($req); } |
107
|
0
|
|
|
|
|
|
$code = $resp->code; |
108
|
0
|
|
|
|
|
|
$msg = $resp->message(); |
109
|
0
|
|
|
|
|
|
$method = ""; |
110
|
0
|
0
|
|
|
|
|
if ( $code eq "200" ) { |
111
|
0
|
|
|
|
|
|
$type = $resp->header("content-type"); |
112
|
0
|
0
|
0
|
|
|
|
$method = "GET" if $type and defined(${Combine::Config::Get('converters')}{$type}); |
|
0
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
} |
115
|
0
|
0
|
|
|
|
|
if ( $method eq "GET" ) { |
116
|
0
|
|
|
|
|
|
$req = new HTTP::Request 'GET'=> $url_str; |
117
|
0
|
0
|
|
|
|
|
$req->header('If-Modified-Since' => &time2str($since)) |
118
|
|
|
|
|
|
|
if $userAgentGetIfModifiedSince; |
119
|
0
|
0
|
|
|
|
|
if (Combine::Config::Get('UserAgentFollowRedirects')) { $resp = $ua->request($req); } |
|
0
|
|
|
|
|
|
|
120
|
0
|
|
|
|
|
|
else { $resp = $ua->simple_request($req); } |
121
|
0
|
|
|
|
|
|
$code = $resp->code; |
122
|
0
|
|
|
|
|
|
$msg = $resp->message(); |
123
|
|
|
|
|
|
|
# print "$url_str; " . &time2str($since) ."; $code; $msg\n"; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
0
|
|
|
|
|
|
my @cs=$resp->header('Content-Type'); |
127
|
0
|
|
|
|
|
|
foreach my $c (@cs) { |
128
|
0
|
|
|
|
|
|
$xwi->meta_add('content-type',$c); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
0
|
|
|
|
|
|
$xwi->stat($code); |
132
|
|
|
|
|
|
|
#BEHÖVS??? $xwi->url($url_str); |
133
|
0
|
|
|
|
|
|
$xwi->server($resp->header("server")); |
134
|
0
|
|
|
|
|
|
$xwi->etag($resp->header("etag")); |
135
|
0
|
|
|
|
|
|
my $t = $resp->content_type; |
136
|
0
|
|
|
|
|
|
$xwi->type($t); |
137
|
0
|
|
|
|
|
|
$t = $resp->content_language; |
138
|
0
|
0
|
|
|
|
|
if (defined($t)) {$xwi->meta_add('content-language',$t);} |
|
0
|
|
|
|
|
|
|
139
|
0
|
|
|
|
|
|
$xwi->length($resp->header("content-length")); |
140
|
0
|
|
|
|
|
|
$xwi->location($resp->header("location")); |
141
|
0
|
|
|
|
|
|
$xwi->base($resp->base); |
142
|
|
|
|
|
|
|
#Numeric gives error message '... too small' |
143
|
|
|
|
|
|
|
# $xwi->expiryDate(&check_date($resp->expires)); |
144
|
0
|
|
|
|
|
|
$xwi->modifiedDate(&check_date($resp->header("last-modified"))); |
145
|
0
|
|
|
|
|
|
$xwi->expiryDate(&check_date($resp->header("expires"))); |
146
|
|
|
|
|
|
|
#? $xwi->checkedDate(&check_date($resp->header("date"))); |
147
|
0
|
0
|
|
|
|
|
$xwi->checkedDate(time) unless $xwi->checkedDate; |
148
|
0
|
0
|
0
|
|
|
|
if ($code eq "200" or $code eq "206") { |
149
|
0
|
0
|
0
|
|
|
|
if ( $method eq "GET" and length($resp->content_ref) > 0 ) { |
150
|
0
|
|
|
|
|
|
$xwi->truncated($resp->headers()->header('X-Content-Range')); |
151
|
|
|
|
|
|
|
} |
152
|
0
|
0
|
|
|
|
|
if ($resp->decoded_content( 'ref' => 1 )) { |
153
|
0
|
|
|
|
|
|
$xwi->content($resp->decoded_content( 'ref' => 1 )); |
154
|
|
|
|
|
|
|
} else { |
155
|
0
|
|
|
|
|
|
$xwi->content($resp->content_ref); |
156
|
|
|
|
|
|
|
#CHECK if gzip encoded anyhow? |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
} |
159
|
0
|
|
|
|
|
|
return ($code, $msg); |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub check_date { # makes sure the date is in a correct format (UnixTime) |
163
|
0
|
|
|
0
|
0
|
|
my ($str) = @_; |
164
|
0
|
|
|
|
|
|
my $tim = undef; |
165
|
0
|
0
|
|
|
|
|
if ( $str ) { |
166
|
0
|
|
|
|
|
|
eval { $tim = &str2time( $str ) }; |
|
0
|
|
|
|
|
|
|
167
|
0
|
|
|
|
|
|
return $tim; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
1; |