| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Link::Tester::Complex; |
|
2
|
|
|
|
|
|
|
$REVISION=q$Revision: 1.8 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ ); |
|
3
|
|
|
|
|
|
|
|
|
4
|
3
|
|
|
3
|
|
18
|
use Carp qw(carp cluck croak); |
|
|
3
|
|
|
|
|
7
|
|
|
|
3
|
|
|
|
|
378
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
WWW::Link::Tester::Complex - a careful tester for broken links |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use WWW::Link::Test::Complex |
|
13
|
|
|
|
|
|
|
$ua=create_a_user_agent(); |
|
14
|
|
|
|
|
|
|
$link=get_a_link_object(); |
|
15
|
|
|
|
|
|
|
WWW::Link::Test::Complex::test_link($ua, $link); |
|
16
|
|
|
|
|
|
|
WWW::Link::Tester::Simple::Test($url) |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
This is a link testing module based on the work of Phil Mitchell at |
|
22
|
|
|
|
|
|
|
Harvard College. The aim is to test very carefully if a link is |
|
23
|
|
|
|
|
|
|
really there. |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
N.B. I have done the minimum reasonable edits on the file so that any |
|
26
|
|
|
|
|
|
|
later improvements can be easily added. This means that the module |
|
27
|
|
|
|
|
|
|
contains and sections of code which are not relevant to |
|
28
|
|
|
|
|
|
|
LinkController. |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 ROBOT LOGIC |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
This system should be controlled by the robot logic of the user agent it |
|
33
|
|
|
|
|
|
|
uses provided that the robot returns a 4xx response code. |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 AUTHOR |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
Copyright (c) 2000 by the President and Fellows of Harvard College |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
|
40
|
|
|
|
|
|
|
it under the terms of the GNU General Public License as published by |
|
41
|
|
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or (at |
|
42
|
|
|
|
|
|
|
your option) any later version. |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Please see the source code for further details |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=cut |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
############################################################################ |
|
49
|
|
|
|
|
|
|
# |
|
50
|
|
|
|
|
|
|
# Copyright (c) 2000 by the President and Fellows of Harvard College |
|
51
|
|
|
|
|
|
|
# |
|
52
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
|
53
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
|
54
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or (at |
|
55
|
|
|
|
|
|
|
# your option) any later version. |
|
56
|
|
|
|
|
|
|
# |
|
57
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, but |
|
58
|
|
|
|
|
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of |
|
59
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
60
|
|
|
|
|
|
|
# General Public License for more details. |
|
61
|
|
|
|
|
|
|
# |
|
62
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
63
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
|
64
|
|
|
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
|
65
|
|
|
|
|
|
|
# USA. |
|
66
|
|
|
|
|
|
|
# |
|
67
|
|
|
|
|
|
|
# Contact information: |
|
68
|
|
|
|
|
|
|
# |
|
69
|
|
|
|
|
|
|
# Phil Mitchell |
|
70
|
|
|
|
|
|
|
# Office for Information Systems |
|
71
|
|
|
|
|
|
|
# Harvard University |
|
72
|
|
|
|
|
|
|
# philip_mitchell at harvard.edu |
|
73
|
|
|
|
|
|
|
# |
|
74
|
|
|
|
|
|
|
############################################################################# |
|
75
|
|
|
|
|
|
|
# |
|
76
|
|
|
|
|
|
|
# When called without args, this script reads a list of URLs, one per line, |
|
77
|
|
|
|
|
|
|
# from $INPUT_FILE, extracts the url from each record, and tries to access |
|
78
|
|
|
|
|
|
|
# the url using the appropriate protocol. This includes following redirects |
|
79
|
|
|
|
|
|
|
# until either: |
|
80
|
|
|
|
|
|
|
# 1. the target page is successfully received; or |
|
81
|
|
|
|
|
|
|
# 2. a page cycle is detected; or |
|
82
|
|
|
|
|
|
|
# 3. a bad server or page request is detected; or |
|
83
|
|
|
|
|
|
|
# 4. a maximum number of redirects ($MAX_REDIRECTS) is exceeded. |
|
84
|
|
|
|
|
|
|
# |
|
85
|
|
|
|
|
|
|
#...deleted... |
|
86
|
|
|
|
|
|
|
# |
|
87
|
|
|
|
|
|
|
# Protocols supported: http, https, ftp, gopher, file, telnet. |
|
88
|
|
|
|
|
|
|
# |
|
89
|
|
|
|
|
|
|
# Status codes: |
|
90
|
|
|
|
|
|
|
# Success: All successful response codes have the form: |
|
91
|
|
|
|
|
|
|
# 2xx. Because we limit the size of responses we accept, we get a |
|
92
|
|
|
|
|
|
|
# lot of 206's in addition to 200's. |
|
93
|
|
|
|
|
|
|
# UNSUPPORTED_PROTOCOL: |
|
94
|
|
|
|
|
|
|
# Linkcheck handles {http, https, ftp, gopher, file, telnet}. Other |
|
95
|
|
|
|
|
|
|
# protocols will get this error. More commonly, it is the result of a |
|
96
|
|
|
|
|
|
|
# typo (eg. "thttp://"). |
|
97
|
|
|
|
|
|
|
# MALFORMED_URL: The url is syntactically incorrect. EG., |
|
98
|
|
|
|
|
|
|
# "http:/www.domain.com". |
|
99
|
|
|
|
|
|
|
# TELNET_FAILURE: Couldn't open the requested telnet connection. |
|
100
|
|
|
|
|
|
|
# HTTP_0_9_FAIL: Failed HTTP/0.9 connection (0.9 does not return |
|
101
|
|
|
|
|
|
|
# status codes). |
|
102
|
|
|
|
|
|
|
# REDIRECT_LIMIT_EXCEEDED: |
|
103
|
|
|
|
|
|
|
# Too many redirections. This error code should not normally be |
|
104
|
|
|
|
|
|
|
# received, it is in place to catch infinite redirect cycles. |
|
105
|
|
|
|
|
|
|
# UNKNOWN_ERROR: Rarely, LWP or HTTP modules will die, reporting an |
|
106
|
|
|
|
|
|
|
# error that is not useful to us. This error code should |
|
107
|
|
|
|
|
|
|
# not normally be received; it |
|
108
|
|
|
|
|
|
|
# will generally be corrected in subsequent passes. |
|
109
|
|
|
|
|
|
|
# |
|
110
|
|
|
|
|
|
|
# There are various configurable parameters documented below. In |
|
111
|
|
|
|
|
|
|
# addition to setting the input and output filenames, the most |
|
112
|
|
|
|
|
|
|
# important ones are those that control the timeout, the number of |
|
113
|
|
|
|
|
|
|
# retries, and the time between retries. These settings have an |
|
114
|
|
|
|
|
|
|
# important effect on the accuracy of results. |
|
115
|
|
|
|
|
|
|
# |
|
116
|
|
|
|
|
|
|
# Accuracy of results: |
|
117
|
|
|
|
|
|
|
# |
|
118
|
|
|
|
|
|
|
# Informal tests (results can be found at the end of this script) have |
|
119
|
|
|
|
|
|
|
# shown that: (1) a timeout of 30 sec is adequate; increasing to 60 |
|
120
|
|
|
|
|
|
|
# sec is not useful; 10 seconds is too short. (2) The absolute number |
|
121
|
|
|
|
|
|
|
# of recheck passes is less important than spreading them over |
|
122
|
|
|
|
|
|
|
# time. Reasonable results are obtained with 3 recheck passes, each |
|
123
|
|
|
|
|
|
|
# separated by 8 hours of sleep. |
|
124
|
|
|
|
|
|
|
# |
|
125
|
|
|
|
|
|
|
# In our set of about 10,000 urls, a first pass produces about 800 |
|
126
|
|
|
|
|
|
|
# (8%) bad urls. Subsequent passes will reduce that to about 650 |
|
127
|
|
|
|
|
|
|
# (6.5%). The use of telnet retry will reach another 25% of those |
|
128
|
|
|
|
|
|
|
# apparently bad urls. The estimate of total bad urls in our sample is |
|
129
|
|
|
|
|
|
|
# thus 4.5%. That list of bad urls is consistent across distinct runs |
|
130
|
|
|
|
|
|
|
# of the link checker at greater than 99%. Handchecking of a large |
|
131
|
|
|
|
|
|
|
# sample from this final list indicates a high degree of accuracy. |
|
132
|
|
|
|
|
|
|
# |
|
133
|
|
|
|
|
|
|
# Notes: |
|
134
|
|
|
|
|
|
|
# |
|
135
|
|
|
|
|
|
|
# - A "page cycle" is the use of a redirect or refresh tag to cycle through |
|
136
|
|
|
|
|
|
|
# a list of one or more pages for data refresh purposes. |
|
137
|
|
|
|
|
|
|
# |
|
138
|
|
|
|
|
|
|
# Design Notes: |
|
139
|
|
|
|
|
|
|
# |
|
140
|
|
|
|
|
|
|
# - Cookies: This version accepts all cookies. This allows it to handle some |
|
141
|
|
|
|
|
|
|
# URLs which require cookies. |
|
142
|
|
|
|
|
|
|
# |
|
143
|
|
|
|
|
|
|
# - Timeout bug: Due to an apparent bug in the interaction between |
|
144
|
|
|
|
|
|
|
# Solaris and certain web servers, some http responses come back |
|
145
|
|
|
|
|
|
|
# improperly terminated. As a result, LWP times out and reports a |
|
146
|
|
|
|
|
|
|
# server error when a (nearly) valid response has been received. To |
|
147
|
|
|
|
|
|
|
# avoid this, we open a telnet connection to the relevant port |
|
148
|
|
|
|
|
|
|
# (usually 80) and do a manual GET on the url. Telnet will also time |
|
149
|
|
|
|
|
|
|
# out in this case, but telnet.pm provides a dump of the partial |
|
150
|
|
|
|
|
|
|
# response received, and we use this. |
|
151
|
|
|
|
|
|
|
# |
|
152
|
|
|
|
|
|
|
# - WWW unreliability: Any given access to a server on the web is |
|
153
|
|
|
|
|
|
|
# subject to various kinds of flakiness. To avoid false reports of |
|
154
|
|
|
|
|
|
|
# bad servers, it is essential to re-test all errors, preferably over |
|
155
|
|
|
|
|
|
|
# a period of hours or days. This script completes a first pass |
|
156
|
|
|
|
|
|
|
# through all urls, typically taking 8 hours or more on 10,000 |
|
157
|
|
|
|
|
|
|
# urls. Then it performs additional ($RECHECKS) passes on all urls |
|
158
|
|
|
|
|
|
|
# that received error codes. It sleeps ($HOURS_TO_SLEEP) between |
|
159
|
|
|
|
|
|
|
# passes to improve the chances of getting a valid return code. |
|
160
|
|
|
|
|
|
|
# |
|
161
|
|
|
|
|
|
|
# - Redirects and cycles: The challenge is to follow redirects all |
|
162
|
|
|
|
|
|
|
# the way to the end of the line, but know when to stop. It is |
|
163
|
|
|
|
|
|
|
# complicated by the fact that some sites use the meta refresh tag |
|
164
|
|
|
|
|
|
|
# for their redirection, and by the fact that some sites have |
|
165
|
|
|
|
|
|
|
# infinite loop cycles for page refresh purposes. Five distinct cases |
|
166
|
|
|
|
|
|
|
# have been identified: |
|
167
|
|
|
|
|
|
|
# |
|
168
|
|
|
|
|
|
|
# 1. Proper redirect, using Location header. (Action: Follow redirect.) |
|
169
|
|
|
|
|
|
|
# 2. Proper meta refresh, on a single page. (Action: Detect cycle |
|
170
|
|
|
|
|
|
|
# and exit.) |
|
171
|
|
|
|
|
|
|
# 3. Proper meta refresh, on a cycle of pages. (Action:Detect |
|
172
|
|
|
|
|
|
|
# cycle and exit.) |
|
173
|
|
|
|
|
|
|
# 4. Redirect using meta refresh. (Action: Follow redirect.) |
|
174
|
|
|
|
|
|
|
# 5. Redirect loop on a single page for setting cookies. (Action: |
|
175
|
|
|
|
|
|
|
# Follow redirect.) |
|
176
|
|
|
|
|
|
|
# |
|
177
|
|
|
|
|
|
|
# Maintenance and Future Development Notes: |
|
178
|
|
|
|
|
|
|
# |
|
179
|
|
|
|
|
|
|
# - 401's and 403's: Currently does not handle authentication; just |
|
180
|
|
|
|
|
|
|
# reports these as errors. |
|
181
|
|
|
|
|
|
|
# |
|
182
|
|
|
|
|
|
|
# - Cookie warnings: With perl's -w option, many warnings will be |
|
183
|
|
|
|
|
|
|
# received about Cookies.pm. This seems to be due to the fact that |
|
184
|
|
|
|
|
|
|
# Cookies.pm does not cleanly handle incorrectly formatted |
|
185
|
|
|
|
|
|
|
# cookies. As far as I know, these warnings may be safely ignored. |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Author: Phil Mitchell |
|
188
|
|
|
|
|
|
|
# Date: 02/22/01 |
|
189
|
|
|
|
|
|
|
# Version: 1.5 |
|
190
|
|
|
|
|
|
|
# |
|
191
|
|
|
|
|
|
|
############################################################################# |
|
192
|
|
|
|
|
|
|
|
|
193
|
3
|
|
|
3
|
|
16
|
use WWW::Link::Tester; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
230
|
|
|
194
|
|
|
|
|
|
|
@ISA="WWW::Link::Tester"; |
|
195
|
|
|
|
|
|
|
|
|
196
|
3
|
|
|
3
|
|
15
|
use strict; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
96
|
|
|
197
|
3
|
|
|
3
|
|
3629
|
use LWP::UserAgent; |
|
|
3
|
|
|
|
|
59750
|
|
|
|
3
|
|
|
|
|
113
|
|
|
198
|
3
|
|
|
3
|
|
33
|
use HTTP::Response; |
|
|
3
|
|
|
|
|
8
|
|
|
|
3
|
|
|
|
|
76
|
|
|
199
|
3
|
|
|
3
|
|
16
|
use HTTP::Message; |
|
|
3
|
|
|
|
|
9
|
|
|
|
3
|
|
|
|
|
78
|
|
|
200
|
3
|
|
|
3
|
|
16
|
use HTTP::Status; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
1317
|
|
|
201
|
3
|
|
|
3
|
|
30
|
use HTTP::Headers; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
141
|
|
|
202
|
3
|
|
|
3
|
|
18
|
use HTTP::Request; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
76
|
|
|
203
|
3
|
|
|
3
|
|
4036
|
use HTTP::Cookies; |
|
|
3
|
|
|
|
|
27558
|
|
|
|
3
|
|
|
|
|
104
|
|
|
204
|
3
|
|
|
3
|
|
5056
|
use Net::Telnet; |
|
|
3
|
|
|
|
|
176731
|
|
|
|
3
|
|
|
|
|
304
|
|
|
205
|
|
|
|
|
|
|
#use LWP::Debug qw(+); |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
########################################### |
|
208
|
|
|
|
|
|
|
# Global variables |
|
209
|
|
|
|
|
|
|
########################################### |
|
210
|
|
|
|
|
|
|
|
|
211
|
3
|
|
|
|
|
1608
|
use vars qw( |
|
212
|
|
|
|
|
|
|
%url_hash |
|
213
|
|
|
|
|
|
|
$HTTP_DEFAULT_PORT |
|
214
|
|
|
|
|
|
|
$HTTP_VERSION |
|
215
|
|
|
|
|
|
|
$ADMIN_EMAIL |
|
216
|
|
|
|
|
|
|
$MAX_REDIRECTS |
|
217
|
|
|
|
|
|
|
$RECHECKS |
|
218
|
|
|
|
|
|
|
$HOURS_TO_SLEEP |
|
219
|
|
|
|
|
|
|
$AGENT_TIMEOUT |
|
220
|
|
|
|
|
|
|
$AGENT_MAX_RESPONSE |
|
221
|
|
|
|
|
|
|
$INPUT_FILE |
|
222
|
|
|
|
|
|
|
$OUTPUT_FILE |
|
223
|
|
|
|
|
|
|
$TMP_FILE |
|
224
|
|
|
|
|
|
|
$TELNET_LOGFILE |
|
225
|
|
|
|
|
|
|
$ADMIN_LOGFILE |
|
226
|
|
|
|
|
|
|
$REDIRECT_LIMIT_EXCEEDED |
|
227
|
|
|
|
|
|
|
$UNSUPPORTED_PROTOCOL |
|
228
|
|
|
|
|
|
|
$MALFORMED_URL |
|
229
|
|
|
|
|
|
|
$HTTP_0_9_OKAY |
|
230
|
|
|
|
|
|
|
$HTTP_0_9_FAIL |
|
231
|
|
|
|
|
|
|
$UNKNOWN_ERROR |
|
232
|
|
|
|
|
|
|
$VERBOSE |
|
233
|
|
|
|
|
|
|
$DEBUG |
|
234
|
|
|
|
|
|
|
$LOGGING |
|
235
|
|
|
|
|
|
|
$TELNET_SUCCESS |
|
236
|
|
|
|
|
|
|
$TELNET_FAILURE |
|
237
|
|
|
|
|
|
|
$agent |
|
238
|
|
|
|
|
|
|
$telnetAgent |
|
239
|
|
|
|
|
|
|
$cookieJar |
|
240
|
|
|
|
|
|
|
$redirectCount |
|
241
|
3
|
|
|
3
|
|
28
|
); |
|
|
3
|
|
|
|
|
9
|
|
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
########################################### |
|
246
|
|
|
|
|
|
|
# Configurable parameters |
|
247
|
|
|
|
|
|
|
########################################### |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
$ADMIN_EMAIL = ''; # If non-empty, script will send confirmation and result stats. |
|
250
|
|
|
|
|
|
|
$AGENT_TIMEOUT = 10; # In seconds, time for http agent to wait. 10 secs is often too |
|
251
|
|
|
|
|
|
|
# short, leads to spurious reports of server errors. Longer than |
|
252
|
|
|
|
|
|
|
# 30 secs not usually helpful. |
|
253
|
|
|
|
|
|
|
$AGENT_MAX_RESPONSE = 524288; # In bytes, max response to accept. Mainly want to |
|
254
|
|
|
|
|
|
|
# avoid being swamped by something huge. |
|
255
|
|
|
|
|
|
|
$MAX_REDIRECTS = 15; # Number of redirects to tolerate before giving up. Should never hit |
|
256
|
|
|
|
|
|
|
# this limit; it's here to avoid infinite loop. |
|
257
|
|
|
|
|
|
|
$RECHECKS = 3; # Number of recheck passes to recheck urls that return error codes. Note |
|
258
|
|
|
|
|
|
|
# that every server error automatically gets one retry via telnet. |
|
259
|
|
|
|
|
|
|
$HOURS_TO_SLEEP = 0; # Number of hours to sleep between recheck passes. |
|
260
|
|
|
|
|
|
|
$HTTP_DEFAULT_PORT = 80; |
|
261
|
|
|
|
|
|
|
$HTTP_VERSION = 'HTTP/1.0'; # Perl's HTTP module defaults to 0.9 |
|
262
|
|
|
|
|
|
|
$INPUT_FILE = "CURRENT.URLS.TXT"; |
|
263
|
|
|
|
|
|
|
$INPUT_FILE = "smalltest.txt"; |
|
264
|
|
|
|
|
|
|
$OUTPUT_FILE = "OUT.URLS.TXT"; |
|
265
|
|
|
|
|
|
|
$ADMIN_LOGFILE = "admin_logfile.txt"; # Log for result stats. |
|
266
|
|
|
|
|
|
|
$VERBOSE = 1; # If 1, print processing status to stdout |
|
267
|
|
|
|
|
|
|
$DEBUG = 0; # If 1, provides additional output to stdout; mainly HTTP headers. |
|
268
|
|
|
|
|
|
|
$LOGGING = 1; # Enable logging to $ADMIN_LOGFILE. |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
########################################### |
|
271
|
|
|
|
|
|
|
# Misc. initializations |
|
272
|
|
|
|
|
|
|
########################################### |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
$TMP_FILE = "tmp.txt"; |
|
275
|
|
|
|
|
|
|
$TELNET_LOGFILE = "telnet_logfile.txt"; # Used internally to buffer data. |
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# Response codes. All successful response codes have the form: 2xx. |
|
278
|
|
|
|
|
|
|
$REDIRECT_LIMIT_EXCEEDED = 'REDIRECT_LIMIT_EXCEEDED'; |
|
279
|
|
|
|
|
|
|
$UNSUPPORTED_PROTOCOL = 'UNSUPPORTED_PROTOCOL'; |
|
280
|
|
|
|
|
|
|
$MALFORMED_URL = 'MALFORMED_URL'; |
|
281
|
|
|
|
|
|
|
$TELNET_FAILURE = 'TELNET_FAILURE'; |
|
282
|
|
|
|
|
|
|
$HTTP_0_9_FAIL = 'HTTP_0_9_FAIL'; |
|
283
|
|
|
|
|
|
|
$UNKNOWN_ERROR = 'UNKNOWN_ERROR'; |
|
284
|
|
|
|
|
|
|
$TELNET_SUCCESS = 299; # Mimic a successful HTTP code |
|
285
|
|
|
|
|
|
|
$HTTP_0_9_OKAY = 298; |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=head1 test_link |
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
This function acts as glue between follow_url and LinkController. It |
|
290
|
|
|
|
|
|
|
returns a constructed HTTP::Response. This will mean that information |
|
291
|
|
|
|
|
|
|
is lost since we actually often have created the code from another |
|
292
|
|
|
|
|
|
|
response. |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=cut |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub new { |
|
297
|
4
|
|
|
4
|
0
|
84
|
my $proto = shift; |
|
298
|
4
|
|
33
|
|
|
28
|
my $class = ref($proto) || $proto; |
|
299
|
4
|
|
|
|
|
9
|
my $self = {}; |
|
300
|
4
|
|
|
|
|
14
|
$self->{"user_agent"}=shift; |
|
301
|
4
|
|
|
|
|
21
|
bless $self, $class; |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
|
|
304
|
3
|
|
|
3
|
|
19
|
use vars qw($redirect_count $redirects %convert); |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
9772
|
|
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
%convert=( |
|
307
|
|
|
|
|
|
|
$REDIRECT_LIMIT_EXCEEDED => RC_REDIRECT_LIMIT_EXCEEDED, |
|
308
|
|
|
|
|
|
|
$UNSUPPORTED_PROTOCOL => RC_PROTOCOL_UNSUPPORTED, |
|
309
|
|
|
|
|
|
|
$MALFORMED_URL => RC_PROTOCOL_UNSUPPORTED, |
|
310
|
|
|
|
|
|
|
$TELNET_FAILURE => RC_NOT_FOUND, |
|
311
|
|
|
|
|
|
|
$HTTP_0_9_FAIL => RC_INTERNAL_SERVER_ERROR, |
|
312
|
|
|
|
|
|
|
$UNKNOWN_ERROR => RC_BAD_REQUEST, |
|
313
|
|
|
|
|
|
|
); |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
sub get_response { |
|
317
|
14
|
|
|
14
|
0
|
20
|
my $self=shift; |
|
318
|
14
|
|
|
|
|
18
|
my $link=shift; |
|
319
|
14
|
|
|
|
|
22
|
$redirects=[]; |
|
320
|
14
|
|
|
|
|
21
|
$redirect_count=0; |
|
321
|
14
|
|
|
|
|
40
|
%url_hash=(); |
|
322
|
14
|
|
|
|
|
38
|
my $code=$self->follow_url($link->url()); |
|
323
|
14
|
|
|
|
|
25
|
scalar (keys %convert); |
|
324
|
14
|
|
|
|
|
51
|
CONVERT: while (my ($key,$value) = each %convert) { |
|
325
|
81
|
100
|
|
|
|
345
|
$code eq $key && do { |
|
326
|
4
|
|
|
|
|
6
|
$code=$value; |
|
327
|
4
|
|
|
|
|
9
|
last CONVERT; |
|
328
|
|
|
|
|
|
|
}; |
|
329
|
|
|
|
|
|
|
} |
|
330
|
14
|
50
|
|
|
|
217
|
print STDERR "COMPLEX generated response code $code\n" |
|
331
|
|
|
|
|
|
|
if $self->{verbose}; |
|
332
|
|
|
|
|
|
|
#cluck and die here generate coredumps!!!???! in perl 5.6.0 on Linux |
|
333
|
|
|
|
|
|
|
# cluck STDERR "COMPLEX generated response code $code"; |
|
334
|
14
|
50
|
|
|
|
57
|
die "non numeric response code generated" . $code |
|
335
|
|
|
|
|
|
|
unless $code =~ m/[1-9][0-9]+/; |
|
336
|
14
|
|
|
|
|
52
|
my $response=HTTP::Response->new($code); |
|
337
|
|
|
|
|
|
|
|
|
338
|
14
|
50
|
|
|
|
573
|
die "response: $response not reference" unless ref $response ; |
|
339
|
|
|
|
|
|
|
|
|
340
|
14
|
|
|
|
|
51
|
return $response, @$redirects |
|
341
|
|
|
|
|
|
|
} |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# Set up the web agents and helpers. |
|
344
|
|
|
|
|
|
|
# $agent = new LWP::UserAgent; |
|
345
|
|
|
|
|
|
|
# $agent->timeout($AGENT_TIMEOUT); |
|
346
|
|
|
|
|
|
|
# $agent->max_size($AGENT_MAX_RESPONSE); |
|
347
|
|
|
|
|
|
|
$cookieJar = new HTTP::Cookies; |
|
348
|
|
|
|
|
|
|
$telnetAgent = new Net::Telnet(Timeout => $AGENT_TIMEOUT, |
|
349
|
|
|
|
|
|
|
Errmode => 'return'); |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
my ($url, $result, $newResult, %results, $outputStr, $urlCount, |
|
352
|
|
|
|
|
|
|
$count, $recheckCount, %resultSummary); |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
########################################### |
|
355
|
|
|
|
|
|
|
# check_for_meta_refresh |
|
356
|
|
|
|
|
|
|
########################################### |
|
357
|
|
|
|
|
|
|
# Routine that searches input string for something of the form: |
|
358
|
|
|
|
|
|
|
# |
|
359
|
|
|
|
|
|
|
# It is tolerant of extra whitespace, single or no quotes instead of |
|
360
|
|
|
|
|
|
|
# doublequotes, spaces around equals signs, and extra verbiage, and is |
|
361
|
|
|
|
|
|
|
# case-insensitive. |
|
362
|
|
|
|
|
|
|
# Call with: String of content to be searched |
|
363
|
|
|
|
|
|
|
# Returns: url, if a meta refresh is found; otherwise returns |
|
364
|
|
|
|
|
|
|
# empty string. |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
sub check_for_meta_refresh { |
|
367
|
8
|
50
|
|
8
|
0
|
85
|
if ($DEBUG) { print "check_for_meta_refresh()...\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
368
|
8
|
|
|
|
|
8
|
my $inputStr = shift; |
|
369
|
8
|
50
|
|
|
|
17
|
if ($inputStr =~ |
|
370
|
|
|
|
|
|
|
m{ #" |
|
371
|
|
|
|
|
|
|
]+? url |
|
372
|
|
|
|
|
|
|
\s* = \s* ["']? ([^"' >]+) ["']? [^>]+? > |
|
373
|
|
|
|
|
|
|
}ix) |
|
374
|
|
|
|
|
|
|
{ |
|
375
|
0
|
|
|
|
|
0
|
return $1; |
|
376
|
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
else { |
|
378
|
8
|
|
|
|
|
16
|
return ""; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
}#end check_for_meta_refresh |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
########################################### |
|
383
|
|
|
|
|
|
|
# follow_url |
|
384
|
|
|
|
|
|
|
########################################### |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# Tries to access a given url. The main case is HTTP protocol, but |
|
387
|
|
|
|
|
|
|
# also handles any protocol handled by LWP, plus telnet. For telnet, |
|
388
|
|
|
|
|
|
|
# just tries to open a connection. For HTTP, follows redirects until |
|
389
|
|
|
|
|
|
|
# a final status code is received or until $MAX_REDIRECTS is |
|
390
|
|
|
|
|
|
|
# exceeded. Accepts all cookies. To avoid infinite loops, detects page |
|
391
|
|
|
|
|
|
|
# refresh cycles. |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# Call with: url, and optional second arg of referring url which is |
|
394
|
|
|
|
|
|
|
# used to absolutize url. |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# Returns: HTTP status code, or internal response codes (see above). |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
sub follow_url { |
|
399
|
30
|
|
|
30
|
0
|
39
|
my $self=shift; |
|
400
|
30
|
|
|
|
|
65
|
my $agent=$self->{"user_agent"}; |
|
401
|
30
|
|
|
|
|
46
|
my ($url, $referrer) = @_; |
|
402
|
30
|
|
|
|
|
38
|
my $VERBOSE=$self->{"verbose"}; |
|
403
|
|
|
|
|
|
|
|
|
404
|
30
|
50
|
|
|
|
90
|
return $MALFORMED_URL unless $url; |
|
405
|
30
|
|
|
|
|
36
|
my ($response, $protocol, $host, $port, $ping, $telnetResult, |
|
406
|
|
|
|
|
|
|
$request, $statusCode, $new_url); |
|
407
|
30
|
50
|
33
|
|
|
105
|
if ($VERBOSE || $DEBUG) { print "follow_url(): $url\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
408
|
30
|
|
|
|
|
93
|
$url_hash{$url} = 1; # Track all urls in each run, to detect cycles. |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# Note: It is crucial to hash this url BEFORE absolutizing it, b/c |
|
411
|
|
|
|
|
|
|
# we will test for cycles before absolutizing. |
|
412
|
|
|
|
|
|
|
|
|
413
|
30
|
100
|
|
|
|
53
|
if ($referrer) { $url = make_url_absolute($url, $referrer); } |
|
|
16
|
|
|
|
|
34
|
|
|
414
|
30
|
100
|
|
|
|
74
|
if (keys(%url_hash) > $MAX_REDIRECTS) { |
|
415
|
1
|
50
|
|
|
|
7
|
if ($VERBOSE) { print "Redirect limit exceeded.\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
416
|
1
|
|
|
|
|
118
|
return $REDIRECT_LIMIT_EXCEEDED; |
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# EXTRACT PROTOCOL, HOST, AND (OPTIONAL) PORT. |
|
420
|
29
|
|
|
|
|
184
|
$url =~ m{ ^\s* ([a-z]+) :// ([^/:]+) }ix; |
|
421
|
29
|
100
|
66
|
|
|
142
|
if (!($1 && $2)) { |
|
422
|
3
|
50
|
|
|
|
8
|
if ($VERBOSE) { print "URL not well-formed.\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
423
|
3
|
|
|
|
|
9
|
return $MALFORMED_URL; |
|
424
|
|
|
|
|
|
|
} |
|
425
|
|
|
|
|
|
|
else { |
|
426
|
26
|
|
|
|
|
46
|
$protocol = $1; |
|
427
|
26
|
|
|
|
|
45
|
$host = $2; |
|
428
|
|
|
|
|
|
|
} |
|
429
|
26
|
|
|
|
|
125
|
$url =~ m{ \w+ :// [^/]+ : (\d+) }x; # Extract port |
|
430
|
26
|
50
|
|
|
|
58
|
if ($1) { $port = $1; } |
|
|
26
|
|
|
|
|
51
|
|
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
# HANDLE TELNET REQUESTS -- just see if we can open the connection. |
|
433
|
26
|
50
|
|
|
|
50
|
if ($protocol =~ /^telnet$/i) { |
|
434
|
0
|
0
|
|
|
|
0
|
if ($port) { |
|
435
|
0
|
|
|
|
|
0
|
$ping = $telnetAgent->open(Host => $host, |
|
436
|
|
|
|
|
|
|
Port => $port); |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
else { |
|
439
|
0
|
|
|
|
|
0
|
$ping = $telnetAgent->open(Host => $host); |
|
440
|
|
|
|
|
|
|
} |
|
441
|
0
|
0
|
|
|
|
0
|
if (!$ping) { return $TELNET_FAILURE; } |
|
|
0
|
|
|
|
|
0
|
|
|
442
|
0
|
|
|
|
|
0
|
else { return $TELNET_SUCCESS; } |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# HANDLE ALL OTHER REQUESTS (HTTP, HTTPS, FTP, GOPHER, FILE) |
|
446
|
26
|
50
|
|
|
|
73
|
if (!$agent->is_protocol_supported($protocol)) { |
|
447
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "Protocol not supported.\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
448
|
0
|
|
|
|
|
0
|
return $UNSUPPORTED_PROTOCOL; |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
# Use eval to avoid aborting if LWP or HTTP sends "die". |
|
451
|
26
|
|
|
|
|
237
|
eval { |
|
452
|
26
|
|
|
|
|
88
|
$request = HTTP::Request->new(GET => $url); |
|
453
|
26
|
|
|
|
|
2780
|
$request->protocol($HTTP_VERSION); |
|
454
|
26
|
|
|
|
|
255
|
$cookieJar->add_cookie_header($request); |
|
455
|
26
|
50
|
|
|
|
4130
|
if ($DEBUG) { print "\nRequest: \n", $request->as_string; } |
|
|
0
|
|
|
|
|
0
|
|
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# Use simple_request so we don't follow redirects automatically |
|
458
|
26
|
|
|
|
|
76
|
$response = $agent->simple_request($request); |
|
459
|
26
|
|
|
|
|
2067
|
$cookieJar->extract_cookies($response); |
|
460
|
26
|
|
|
|
|
2309
|
$statusCode = $response->code; |
|
461
|
|
|
|
|
|
|
}; |
|
462
|
26
|
50
|
|
|
|
236
|
if ($@) { |
|
463
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "LWP or HTTP error: $@\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
464
|
0
|
0
|
|
|
|
0
|
if ($LOGGING) { print STDERR "LWP or HTTP error: $@\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
465
|
0
|
|
|
|
|
0
|
return $UNKNOWN_ERROR; |
|
466
|
|
|
|
|
|
|
} |
|
467
|
26
|
50
|
|
|
|
51
|
if ($DEBUG) { print "Status: $statusCode\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
468
|
26
|
50
|
|
|
|
44
|
if ($DEBUG) { print "\nResponse Header: \n", $response->headers->as_string; } |
|
|
0
|
|
|
|
|
0
|
|
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
# Note: In case of timeout, agent sets $statusCode to server error. |
|
471
|
26
|
100
|
|
|
|
128
|
if ($statusCode =~ /2../) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
472
|
8
|
50
|
|
|
|
17
|
if ($VERBOSE) { print "Good response, checking for meta refresh tag...\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
473
|
8
|
|
|
|
|
30
|
$new_url = check_for_meta_refresh($response->content); |
|
474
|
8
|
50
|
|
|
|
18
|
if ($new_url ne "") { |
|
475
|
0
|
0
|
|
|
|
0
|
if (exists($url_hash{$new_url})) { |
|
476
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "This url already visited ... returning $statusCode.\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
477
|
0
|
|
|
|
|
0
|
return $statusCode; } |
|
478
|
|
|
|
|
|
|
else { |
|
479
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "Refresh to: $new_url\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
480
|
0
|
|
|
|
|
0
|
return $self->follow_url($new_url, $url); |
|
481
|
|
|
|
|
|
|
} |
|
482
|
|
|
|
|
|
|
} |
|
483
|
8
|
|
|
|
|
60
|
else { return $statusCode;} |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
elsif ($statusCode =~ /3../) { |
|
486
|
16
|
|
|
|
|
19
|
$redirect_count++; |
|
487
|
16
|
50
|
|
|
|
32
|
if ($VERBOSE) { print "Proper redirect...\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
488
|
|
|
|
|
|
|
# Note that we don't check for page cycles here. Some sites |
|
489
|
|
|
|
|
|
|
# will redirect to the same page while setting cookies, but |
|
490
|
|
|
|
|
|
|
# eventually they'll stop. |
|
491
|
16
|
|
|
|
|
38
|
$new_url = $response->headers->header('Location'); |
|
492
|
16
|
|
|
|
|
741
|
push @$redirects, $new_url; |
|
493
|
16
|
50
|
|
|
|
112
|
if ($VERBOSE) { print "Redirect to: $new_url\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
494
|
16
|
|
|
|
|
158
|
return $self->follow_url($new_url, $url); |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
elsif ($statusCode =~ /4../) { |
|
497
|
2
|
50
|
|
|
|
8
|
if ($VERBOSE) { print "Client error...\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
498
|
2
|
|
|
|
|
15
|
return $statusCode; |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
elsif ($statusCode =~ /5../) { |
|
501
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "Server error...\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
# You might be tempted to do a retry right here. It is problematic |
|
504
|
|
|
|
|
|
|
# b/c you need to do another follow_url, but that will clash with |
|
505
|
|
|
|
|
|
|
# url_hash -- it will look like a page cycle. But if you do the |
|
506
|
|
|
|
|
|
|
# retry by hand w/ a simple request, you don't handle all the |
|
507
|
|
|
|
|
|
|
# cases properly. What we do is retry once using telnet, and leave |
|
508
|
|
|
|
|
|
|
# other retries to subsequent passes following main loop. |
|
509
|
|
|
|
|
|
|
|
|
510
|
0
|
0
|
|
|
|
0
|
if ($protocol =~ /^http$/i) { # Only works for HTTP requests. |
|
511
|
0
|
|
|
|
|
0
|
$telnetResult = |
|
512
|
|
|
|
|
|
|
$self->telnet_http_retry($host, $url, $request, $port); |
|
513
|
0
|
0
|
|
|
|
0
|
if ($telnetResult ne 'FAIL') { |
|
514
|
0
|
|
|
|
|
0
|
$statusCode = $telnetResult; |
|
515
|
|
|
|
|
|
|
} |
|
516
|
|
|
|
|
|
|
} |
|
517
|
0
|
|
|
|
|
0
|
return $statusCode; |
|
518
|
|
|
|
|
|
|
} # end 5xx case. |
|
519
|
|
|
|
|
|
|
else { # Everything else case. |
|
520
|
0
|
|
|
|
|
0
|
return $statusCode; |
|
521
|
|
|
|
|
|
|
} |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
} # end sub follow_url |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
########################################### |
|
526
|
|
|
|
|
|
|
# get_location_header |
|
527
|
|
|
|
|
|
|
########################################### |
|
528
|
|
|
|
|
|
|
# Extracts the url from the Location field of an HTTP redirect. |
|
529
|
|
|
|
|
|
|
# Call with: ref to array of header lines, w or w/o body at end. |
|
530
|
|
|
|
|
|
|
# Returns: URL found in Location header, or empty string. |
|
531
|
|
|
|
|
|
|
sub get_location_header { |
|
532
|
|
|
|
|
|
|
|
|
533
|
0
|
0
|
0
|
0
|
0
|
0
|
if ($VERBOSE || $DEBUG) { print "Looking for location header... \n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
534
|
0
|
|
|
|
|
0
|
my ($headersRef) = @_; |
|
535
|
0
|
|
|
|
|
0
|
my $line; |
|
536
|
|
|
|
|
|
|
|
|
537
|
0
|
|
|
|
|
0
|
while ($line = shift @$headersRef) { |
|
538
|
0
|
0
|
|
|
|
0
|
if ($DEBUG) { print "Checking line: $line\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
539
|
0
|
0
|
|
|
|
0
|
last if $line =~ /^\s$/; |
|
540
|
0
|
0
|
|
|
|
0
|
if ($line =~ m{^Location: \s* (\S+)}x) { |
|
541
|
0
|
0
|
|
|
|
0
|
if ($DEBUG) { print "Line found: $line\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
542
|
0
|
|
|
|
|
0
|
return $1; |
|
543
|
|
|
|
|
|
|
} |
|
544
|
|
|
|
|
|
|
} |
|
545
|
0
|
|
|
|
|
0
|
return ""; |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
} # end sub get_location_header |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
########################################### |
|
550
|
|
|
|
|
|
|
# make_url_absolute |
|
551
|
|
|
|
|
|
|
########################################### |
|
552
|
|
|
|
|
|
|
# Make a relative url absolute by appending it to path of old url. |
|
553
|
|
|
|
|
|
|
# Call with: a fully qualified url as second arg, which will provide |
|
554
|
|
|
|
|
|
|
# path info for relative url which is first arg. |
|
555
|
|
|
|
|
|
|
# Returns: new absolute url |
|
556
|
|
|
|
|
|
|
sub make_url_absolute { |
|
557
|
|
|
|
|
|
|
|
|
558
|
16
|
50
|
|
16
|
0
|
33
|
if ($DEBUG) { print "make_url_absolute()...\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
559
|
16
|
|
|
|
|
22
|
my ($new_url, $old_url) = @_; |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# Test to see if it's already absolute (starts w/ a syntactically correct scheme) |
|
562
|
16
|
50
|
|
|
|
82
|
if ($new_url =~ m{^[a-z]+://}i) { |
|
563
|
16
|
|
|
|
|
34
|
return $new_url; |
|
564
|
|
|
|
|
|
|
} |
|
565
|
|
|
|
|
|
|
|
|
566
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Adding path to relative url: $new_url\n"; } |
|
|
0
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
# Case 1: new url is relative to root; it starts with slash, and |
|
568
|
|
|
|
|
|
|
# should be appended to raw domain name. |
|
569
|
0
|
0
|
|
|
|
|
if ($new_url =~ m{^/} ) { |
|
|
|
0
|
|
|
|
|
|
|
570
|
0
|
|
|
|
|
|
$old_url =~ m{ (\w+ :// [^/]+) }x; |
|
571
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Case 1: append to $1\n"; } |
|
|
0
|
|
|
|
|
|
|
|
572
|
0
|
|
|
|
|
|
return $1 . $new_url; |
|
573
|
|
|
|
|
|
|
} |
|
574
|
|
|
|
|
|
|
# For cases 2 & 3, assume new url is relative to current directory; |
|
575
|
|
|
|
|
|
|
# Case 2: old url contains a trailing slash, eg. http://www.fib.com/bigfib/; |
|
576
|
|
|
|
|
|
|
# may or may not contain trailing filename |
|
577
|
|
|
|
|
|
|
elsif ($old_url =~ m{ (\w+://\S+/) }x ) { |
|
578
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Case 2: append to $1\n"; } |
|
|
0
|
|
|
|
|
|
|
|
579
|
0
|
|
|
|
|
|
return $1 . $new_url; |
|
580
|
|
|
|
|
|
|
} |
|
581
|
|
|
|
|
|
|
# Case 3: old url has no trailing slash, eg. http://www.fab.net |
|
582
|
|
|
|
|
|
|
else { |
|
583
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Case 3: append to $old_url/\n"; } |
|
|
0
|
|
|
|
|
|
|
|
584
|
0
|
|
|
|
|
|
return "$old_url/$new_url"; |
|
585
|
|
|
|
|
|
|
} |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
} # End make_url_absolute |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
########################################### |
|
590
|
|
|
|
|
|
|
# telnet_http_retry |
|
591
|
|
|
|
|
|
|
########################################### |
|
592
|
|
|
|
|
|
|
# Open a telnet connection to a host and try an HTTP GET for an |
|
593
|
|
|
|
|
|
|
# url. The response is processed according to status code similarly to |
|
594
|
|
|
|
|
|
|
# follow_url, and calls follow_url to handle redirects. Uses an LWP |
|
595
|
|
|
|
|
|
|
# request object b/c that's a convenient way to stick cookies into the |
|
596
|
|
|
|
|
|
|
# request string. |
|
597
|
|
|
|
|
|
|
# Note: Handles the Solaris/LWP bug (cf notes above) by reading the |
|
598
|
|
|
|
|
|
|
# telnet.pm input_log if telnet times out. |
|
599
|
|
|
|
|
|
|
# Call with: hostname, absolute url, LWP request object, and optional |
|
600
|
|
|
|
|
|
|
# port (default is $HTTP_DEFAULT_PORT). |
|
601
|
|
|
|
|
|
|
# Returns: status code, or 'FAIL' if can't make telnet connection. |
|
602
|
|
|
|
|
|
|
sub telnet_http_retry { |
|
603
|
0
|
|
|
0
|
0
|
|
my $self=shift; |
|
604
|
0
|
0
|
0
|
|
|
|
if ($VERBOSE || $DEBUG) { |
|
605
|
0
|
|
|
|
|
|
print "Telnet HTTP retry...\n"; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
0
|
|
|
|
|
|
my ($host, $url, $request, $port) = @_; |
|
608
|
0
|
|
|
|
|
|
my ($telnetAgent, @lines, @buffer, $statusLine, $line, $logfileHandle, |
|
609
|
|
|
|
|
|
|
$httpVersion, $statusCode, $message, $contentStr, $new_url); |
|
610
|
0
|
0
|
|
|
|
|
open(LOGFILE, "+>$TELNET_LOGFILE") || warn "Can't open $TELNET_LOGFILE.\n"; |
|
611
|
0
|
0
|
0
|
|
|
|
if (!$port || $port !~ /^\d+$/) { |
|
612
|
0
|
|
|
|
|
|
$port = $HTTP_DEFAULT_PORT; |
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
# Create agent and open connection. |
|
615
|
0
|
|
|
|
|
|
$telnetAgent = Net::Telnet->new(Host => $host, |
|
616
|
|
|
|
|
|
|
Port => $port, |
|
617
|
|
|
|
|
|
|
Input_log => $TELNET_LOGFILE, |
|
618
|
|
|
|
|
|
|
Timeout => $AGENT_TIMEOUT, |
|
619
|
|
|
|
|
|
|
Errmode => "return"); |
|
620
|
0
|
0
|
|
|
|
|
return 'FAIL' unless $telnetAgent; # Can't open telnet connection. |
|
621
|
0
|
|
|
|
|
|
$telnetAgent->max_buffer_length($AGENT_MAX_RESPONSE); |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# Send the request. |
|
624
|
0
|
|
|
|
|
|
$telnetAgent->print($request->as_string, "\n"); |
|
625
|
|
|
|
|
|
|
# Get the response as array of lines. |
|
626
|
0
|
|
|
|
|
|
while (@buffer = $telnetAgent->getlines) { |
|
627
|
0
|
|
|
|
|
|
push (@lines, @buffer); |
|
628
|
|
|
|
|
|
|
} |
|
629
|
0
|
0
|
|
|
|
|
if ($telnetAgent->timed_out) { |
|
630
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
631
|
0
|
|
|
|
|
|
print "Telnet http timed out. Using input log...\n"; |
|
632
|
|
|
|
|
|
|
} |
|
633
|
0
|
|
|
|
|
|
undef @lines; |
|
634
|
0
|
|
|
|
|
|
while () { |
|
635
|
0
|
|
|
|
|
|
push (@lines, $_); |
|
636
|
|
|
|
|
|
|
} |
|
637
|
0
|
0
|
|
|
|
|
close LOGFILE or warn "Problem closing $TELNET_LOGFILE.\n"; |
|
638
|
|
|
|
|
|
|
} |
|
639
|
0
|
0
|
|
|
|
|
if (!@lines) { |
|
640
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
641
|
0
|
|
|
|
|
|
print "No data received.\n"; |
|
642
|
|
|
|
|
|
|
} |
|
643
|
0
|
|
|
|
|
|
return 'FAIL'; |
|
644
|
|
|
|
|
|
|
} |
|
645
|
0
|
0
|
|
|
|
|
if ($DEBUG) { |
|
646
|
0
|
|
|
|
|
|
print @lines,"\n"; |
|
647
|
|
|
|
|
|
|
} |
|
648
|
0
|
|
|
|
|
|
$statusLine = shift @lines; |
|
649
|
|
|
|
|
|
|
# We can only process status line and headers if the response is HTTP/1.0 or |
|
650
|
|
|
|
|
|
|
# better. This regexp copied from LWP::Protocol::http.pm. |
|
651
|
0
|
0
|
|
|
|
|
if ($statusLine =~ /^(HTTP\/\d+\.\d+)[ \t]+(\d+)[ \t]*([^\012]*)\012/) { |
|
652
|
|
|
|
|
|
|
# HTTP/1.0 response or better |
|
653
|
0
|
|
|
|
|
|
($httpVersion, $statusCode, $message) = ($1, $2, $3); |
|
654
|
0
|
|
|
|
|
|
chomp $message; |
|
655
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
656
|
0
|
|
|
|
|
|
print "Status line: $httpVersion $statusCode $message \n\n"; |
|
657
|
|
|
|
|
|
|
} |
|
658
|
|
|
|
|
|
|
|
|
659
|
0
|
0
|
|
|
|
|
if ($statusCode =~ /2../) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
660
|
0
|
|
|
|
|
|
while ($line = shift @lines) { # Flatten array of lines. |
|
661
|
0
|
|
|
|
|
|
$contentStr .= $line; |
|
662
|
|
|
|
|
|
|
} |
|
663
|
0
|
|
|
|
|
|
$new_url = check_for_meta_refresh($contentStr); |
|
664
|
0
|
0
|
|
|
|
|
if ($new_url ne "") { |
|
665
|
0
|
0
|
|
|
|
|
if (exists($url_hash{$new_url})) { |
|
666
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
667
|
0
|
|
|
|
|
|
print "This url already visited ... returning $statusCode.\n"; |
|
668
|
|
|
|
|
|
|
} |
|
669
|
0
|
|
|
|
|
|
return $statusCode; |
|
670
|
|
|
|
|
|
|
} else { |
|
671
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
672
|
0
|
|
|
|
|
|
print "Refresh to: $new_url\n"; |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
# Return whatever status code we get from new url |
|
675
|
0
|
|
|
|
|
|
return $self->follow_url($new_url, $url); |
|
676
|
|
|
|
|
|
|
} |
|
677
|
|
|
|
|
|
|
} else { |
|
678
|
0
|
|
|
|
|
|
return $statusCode; |
|
679
|
|
|
|
|
|
|
} |
|
680
|
|
|
|
|
|
|
} elsif ($statusCode =~ /3../) { |
|
681
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
682
|
0
|
|
|
|
|
|
print "Proper redirect...\n"; |
|
683
|
|
|
|
|
|
|
} |
|
684
|
0
|
|
|
|
|
|
$new_url = get_location_header(\@lines); |
|
685
|
0
|
0
|
|
|
|
|
if ($new_url ne "") { |
|
686
|
0
|
0
|
|
|
|
|
if (exists($url_hash{$new_url})) { |
|
687
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
688
|
0
|
|
|
|
|
|
print "This url already visited ... returning $statusCode.\n"; |
|
689
|
|
|
|
|
|
|
} |
|
690
|
0
|
|
|
|
|
|
return $statusCode; |
|
691
|
|
|
|
|
|
|
} else { |
|
692
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
693
|
0
|
|
|
|
|
|
print "Redirect to: $new_url\n"; |
|
694
|
|
|
|
|
|
|
} |
|
695
|
|
|
|
|
|
|
# Return whatever status code we get from new url |
|
696
|
0
|
|
|
|
|
|
return $self->follow_url($new_url, $url); |
|
697
|
|
|
|
|
|
|
} |
|
698
|
|
|
|
|
|
|
} else { |
|
699
|
0
|
|
|
|
|
|
return $statusCode; |
|
700
|
|
|
|
|
|
|
} |
|
701
|
|
|
|
|
|
|
} elsif ($statusCode =~ m{4.. | 5..}x) { |
|
702
|
0
|
|
|
|
|
|
return $statusCode; |
|
703
|
|
|
|
|
|
|
} |
|
704
|
|
|
|
|
|
|
} # if valid status line |
|
705
|
|
|
|
|
|
|
else { |
|
706
|
0
|
|
|
|
|
|
unshift(@lines, $statusLine); |
|
707
|
|
|
|
|
|
|
} |
|
708
|
|
|
|
|
|
|
# If no status line, could be HTTP/0.9 server, which just sends |
|
709
|
|
|
|
|
|
|
# back content. If it contains a tag like , assume it's |
|
710
|
|
|
|
|
|
|
# okay. |
|
711
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
|
712
|
0
|
|
|
|
|
|
print "Assuming HTTP/0.9 or less... \n"; |
|
713
|
|
|
|
|
|
|
} |
|
714
|
0
|
|
|
|
|
|
while ($line = shift @lines) { # Flatten array of lines. |
|
715
|
0
|
|
|
|
|
|
$contentStr .= $line; |
|
716
|
|
|
|
|
|
|
} |
|
717
|
0
|
0
|
|
|
|
|
if ($contentStr =~ /
|
|
718
|
0
|
|
|
|
|
|
return $HTTP_0_9_OKAY; |
|
719
|
|
|
|
|
|
|
} else { |
|
720
|
0
|
|
|
|
|
|
return $HTTP_0_9_FAIL; |
|
721
|
|
|
|
|
|
|
} |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
} # end sub telnet_http_retry |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
########################################### |
|
726
|
|
|
|
|
|
|
# END (Unused snippets and test results, below) |
|
727
|
|
|
|
|
|
|
########################################### |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# NOTES: |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
# 1. It would be nice to have a robust facility for absolutizing |
|
732
|
|
|
|
|
|
|
# URLs. I tried using URI.pm for this purpose and found it to be not |
|
733
|
|
|
|
|
|
|
# robust. EG., it allows the construction of: http:/www.yahoo.com, |
|
734
|
|
|
|
|
|
|
# which is not well-formed. |
|
735
|
|
|
|
|
|
|
# 2. Tolerance of meta refresh tag match? |
|
736
|
|
|
|
|
|
|
# 3. some duplicate code went from follow_url to the |
|
737
|
|
|
|
|
|
|
# telnet_http_retry; could be factored. |
|
738
|
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
1; #Spoilt children / happy / required even |