line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package URI::Heuristic; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
URI::Heuristic - Expand URI using heuristics |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use URI::Heuristic qw(uf_uristr); |
10
|
|
|
|
|
|
|
$u = uf_uristr("example"); # http://www.example.com |
11
|
|
|
|
|
|
|
$u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol |
12
|
|
|
|
|
|
|
$u = uf_uristr("aas"); # http://www.aas.no |
13
|
|
|
|
|
|
|
$u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi |
14
|
|
|
|
|
|
|
$u = uf_uristr("/etc/passwd"); # file:/etc/passwd |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
This module provides functions that expand strings into real absolute |
19
|
|
|
|
|
|
|
URIs using some built-in heuristics. Strings that already represent |
20
|
|
|
|
|
|
|
absolute URIs (i.e. that start with a C part) are never modified |
21
|
|
|
|
|
|
|
and are returned unchanged. The main use of these functions is to |
22
|
|
|
|
|
|
|
allow abbreviated URIs similar to what many web browsers allow for URIs |
23
|
|
|
|
|
|
|
typed in by the user. |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
The following functions are provided: |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=over 4 |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=item uf_uristr($str) |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
Tries to make the argument string |
32
|
|
|
|
|
|
|
into a proper absolute URI string. The "uf_" prefix stands for "User |
33
|
|
|
|
|
|
|
Friendly". Under MacOS, it assumes that any string with a common URL |
34
|
|
|
|
|
|
|
scheme (http, ftp, etc.) is a URL rather than a local path. So don't name |
35
|
|
|
|
|
|
|
your volumes after common URL schemes and expect uf_uristr() to construct |
36
|
|
|
|
|
|
|
valid file: URL's on those volumes for you, because it won't. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=item uf_uri($str) |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Works the same way as uf_uristr() but |
41
|
|
|
|
|
|
|
returns a C object. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=back |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 ENVIRONMENT |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
If the hostname portion of a URI does not contain any dots, then |
48
|
|
|
|
|
|
|
certain qualified guesses are made. These guesses are governed by |
49
|
|
|
|
|
|
|
the following environment variables: |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=over 10 |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=item COUNTRY |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
The two-letter country code (ISO 3166) for your location. If |
56
|
|
|
|
|
|
|
the domain name of your host ends with two letters, then it is taken |
57
|
|
|
|
|
|
|
to be the default country. See also L. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
If COUNTRY is not set, these standard environment variables are |
62
|
|
|
|
|
|
|
examined and country (not language) information possibly found in them |
63
|
|
|
|
|
|
|
is used as the default country. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=item URL_GUESS_PATTERN |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Contains a space-separated list of URL patterns to try. The string |
68
|
|
|
|
|
|
|
"ACME" is for some reason used as a placeholder for the host name in |
69
|
|
|
|
|
|
|
the URL provided. Example: |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com" |
72
|
|
|
|
|
|
|
export URL_GUESS_PATTERN |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Specifying URL_GUESS_PATTERN disables any guessing rules based on |
75
|
|
|
|
|
|
|
country. An empty URL_GUESS_PATTERN disables any guessing that |
76
|
|
|
|
|
|
|
involves host name lookups. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=back |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 COPYRIGHT |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Copyright 1997-1998, Gisle Aas |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or |
85
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=cut |
88
|
|
|
|
|
|
|
|
89
|
1
|
|
|
1
|
|
66051
|
use strict; |
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
29
|
|
90
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
91
|
|
|
|
|
|
|
|
92
|
1
|
|
|
1
|
|
13
|
use Exporter 5.57 'import'; |
|
1
|
|
|
|
|
26
|
|
|
1
|
|
|
|
|
1668
|
|
93
|
|
|
|
|
|
|
our @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr); |
94
|
|
|
|
|
|
|
our $VERSION = '5.19'; |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
our ($MY_COUNTRY, $DEBUG); |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub MY_COUNTRY() { |
99
|
16
|
|
|
16
|
0
|
30
|
for ($MY_COUNTRY) { |
100
|
16
|
100
|
|
|
|
40
|
return $_ if defined; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# First try the environment. |
103
|
7
|
|
|
|
|
13
|
$_ = $ENV{COUNTRY}; |
104
|
7
|
50
|
|
|
|
11
|
return $_ if defined; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Try the country part of LC_ALL and LANG from environment |
107
|
7
|
|
|
|
|
19
|
my @srcs = ($ENV{LC_ALL}, $ENV{LANG}); |
108
|
|
|
|
|
|
|
# ...and HTTP_ACCEPT_LANGUAGE before those if present |
109
|
7
|
100
|
|
|
|
14
|
if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) { |
110
|
|
|
|
|
|
|
# TODO: q-value processing/ordering |
111
|
3
|
|
|
|
|
8
|
for $httplang (split(/\s*,\s*/, $httplang)) { |
112
|
3
|
50
|
|
|
|
14
|
if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) { |
113
|
3
|
|
|
|
|
10
|
unshift(@srcs, "${1}_${2}"); |
114
|
3
|
|
|
|
|
5
|
last; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
} |
118
|
7
|
|
|
|
|
12
|
for (@srcs) { |
119
|
9
|
50
|
|
|
|
16
|
next unless defined; |
120
|
9
|
100
|
|
|
|
39
|
return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Last bit of domain name. This may access the network. |
124
|
2
|
|
|
|
|
9
|
require Net::Domain; |
125
|
2
|
|
|
|
|
8
|
my $fqdn = Net::Domain::hostfqdn(); |
126
|
2
|
100
|
|
|
|
15
|
$_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/; |
127
|
2
|
100
|
|
|
|
10
|
return $_ if defined; |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Give up. Defined but false. |
130
|
1
|
|
|
|
|
5
|
return ($_ = 0); |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
our %LOCAL_GUESSING = |
135
|
|
|
|
|
|
|
( |
136
|
|
|
|
|
|
|
'us' => [qw(www.ACME.gov www.ACME.mil)], |
137
|
|
|
|
|
|
|
'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)], |
138
|
|
|
|
|
|
|
'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)], |
139
|
|
|
|
|
|
|
'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)], |
140
|
|
|
|
|
|
|
# send corrections and new entries to |
141
|
|
|
|
|
|
|
); |
142
|
|
|
|
|
|
|
# Backwards compatibility; uk != United Kingdom in ISO 3166 |
143
|
|
|
|
|
|
|
$LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb}; |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub uf_uristr ($) |
147
|
|
|
|
|
|
|
{ |
148
|
26
|
|
|
26
|
1
|
2259
|
local($_) = @_; |
149
|
26
|
50
|
|
|
|
62
|
print STDERR "uf_uristr: resolving $_\n" if $DEBUG; |
150
|
26
|
50
|
|
|
|
50
|
return unless defined; |
151
|
|
|
|
|
|
|
|
152
|
26
|
|
|
|
|
80
|
s/^\s+//; |
153
|
26
|
|
|
|
|
60
|
s/\s+$//; |
154
|
|
|
|
|
|
|
|
155
|
26
|
50
|
100
|
|
|
302
|
if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) { |
|
|
100
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
156
|
0
|
|
|
|
|
0
|
$_ = "http://$_"; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
} elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) { |
159
|
6
|
|
|
|
|
25
|
$_ = lc($1) . "://$_"; |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
} elsif ($^O ne "MacOS" && |
162
|
|
|
|
|
|
|
(m,^/, || # absolute file name |
163
|
|
|
|
|
|
|
m,^\.\.?/, || # relative file name |
164
|
|
|
|
|
|
|
m,^[a-zA-Z]:[/\\],) # dosish file name |
165
|
|
|
|
|
|
|
) |
166
|
|
|
|
|
|
|
{ |
167
|
3
|
|
|
|
|
9
|
$_ = "file:$_"; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
} elsif ($^O eq "MacOS" && m/:/) { |
170
|
|
|
|
|
|
|
# potential MacOS file name |
171
|
0
|
0
|
|
|
|
0
|
unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) { |
172
|
0
|
|
|
|
|
0
|
require URI::file; |
173
|
0
|
|
|
|
|
0
|
my $a = URI::file->new($_)->as_string; |
174
|
0
|
0
|
|
|
|
0
|
$_ = ($a =~ m/^file:/) ? $a : "file:$a"; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
} elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) { |
177
|
2
|
|
|
|
|
9
|
$_ = "mailto:$_"; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
} elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified |
180
|
12
|
50
|
|
|
|
79
|
if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) { |
181
|
12
|
|
|
|
|
28
|
my $host = $1; |
182
|
|
|
|
|
|
|
|
183
|
12
|
|
|
|
|
18
|
my $scheme = "http"; |
184
|
12
|
100
|
|
|
|
28
|
if (/^:(\d+)\b/) { |
185
|
|
|
|
|
|
|
# Some more or less well known ports |
186
|
3
|
100
|
|
|
|
26
|
if ($1 =~ /^[56789]?443$/) { |
|
|
100
|
|
|
|
|
|
187
|
1
|
|
|
|
|
3
|
$scheme = "https"; |
188
|
|
|
|
|
|
|
} elsif ($1 eq "21") { |
189
|
1
|
|
|
|
|
3
|
$scheme = "ftp"; |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
12
|
100
|
66
|
|
|
47
|
if ($host !~ /\./ && $host ne "localhost") { |
194
|
9
|
|
|
|
|
15
|
my @guess; |
195
|
9
|
100
|
|
|
|
27
|
if (exists $ENV{URL_GUESS_PATTERN}) { |
196
|
2
|
|
|
|
|
10
|
@guess = map { s/\bACME\b/$host/; $_ } |
|
2
|
|
|
|
|
7
|
|
197
|
2
|
|
|
|
|
9
|
split(' ', $ENV{URL_GUESS_PATTERN}); |
198
|
|
|
|
|
|
|
} else { |
199
|
7
|
100
|
|
|
|
17
|
if (MY_COUNTRY()) { |
200
|
6
|
|
|
|
|
11
|
my $special = $LOCAL_GUESSING{MY_COUNTRY()}; |
201
|
6
|
100
|
|
|
|
12
|
if ($special) { |
202
|
3
|
|
|
|
|
7
|
my @special = @$special; |
203
|
3
|
|
|
|
|
7
|
push(@guess, map { s/\bACME\b/$host/; $_ } |
|
9
|
|
|
|
|
29
|
|
|
9
|
|
|
|
|
21
|
|
204
|
|
|
|
|
|
|
@special); |
205
|
|
|
|
|
|
|
} else { |
206
|
3
|
|
|
|
|
10
|
push(@guess, "www.$host." . MY_COUNTRY()); |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
7
|
|
|
|
|
48
|
push(@guess, map "www.$host.$_", |
210
|
|
|
|
|
|
|
"com", "org", "net", "edu", "int"); |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
|
214
|
9
|
|
|
|
|
13
|
my $guess; |
215
|
9
|
|
|
|
|
17
|
for $guess (@guess) { |
216
|
9
|
50
|
|
|
|
17
|
print STDERR "uf_uristr: gethostbyname('$guess.')..." |
217
|
|
|
|
|
|
|
if $DEBUG; |
218
|
9
|
100
|
|
|
|
24
|
if (gethostbyname("$guess.")) { |
219
|
8
|
50
|
|
|
|
75
|
print STDERR "yes\n" if $DEBUG; |
220
|
8
|
|
|
|
|
11
|
$host = $guess; |
221
|
8
|
|
|
|
|
19
|
last; |
222
|
|
|
|
|
|
|
} |
223
|
1
|
50
|
|
|
|
13
|
print STDERR "no\n" if $DEBUG; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
12
|
|
|
|
|
32
|
$_ = "$scheme://$host$_"; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
} else { |
229
|
|
|
|
|
|
|
# pure junk, just return it unchanged... |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
} |
233
|
26
|
50
|
|
|
|
55
|
print STDERR "uf_uristr: ==> $_\n" if $DEBUG; |
234
|
|
|
|
|
|
|
|
235
|
26
|
|
|
|
|
131
|
$_; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
sub uf_uri ($) |
239
|
|
|
|
|
|
|
{ |
240
|
0
|
|
|
0
|
1
|
0
|
require URI; |
241
|
0
|
|
|
|
|
0
|
URI->new(uf_uristr($_[0])); |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# legacy |
245
|
|
|
|
|
|
|
*uf_urlstr = \*uf_uristr; |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
sub uf_url ($) |
248
|
|
|
|
|
|
|
{ |
249
|
5
|
|
|
5
|
0
|
434
|
require URI::URL; |
250
|
5
|
|
|
|
|
16
|
URI::URL->new(uf_uristr($_[0])); |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
1; |