File Coverage

blib/lib/URI/Heuristic.pm
Criterion Covered Total %
statement 71 78 91.0
branch 46 62 74.1
condition 7 12 58.3
subroutine 6 7 85.7
pod 2 4 50.0
total 132 163 80.9


line stmt bran cond sub pod time code
1             package URI::Heuristic;
2              
3             =head1 NAME
4              
5             URI::Heuristic - Expand URI using heuristics
6              
7             =head1 SYNOPSIS
8              
9             use URI::Heuristic qw(uf_uristr);
10             $u = uf_uristr("example"); # http://www.example.com
11             $u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol
12             $u = uf_uristr("aas"); # http://www.aas.no
13             $u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi
14             $u = uf_uristr("/etc/passwd"); # file:/etc/passwd
15              
16             =head1 DESCRIPTION
17              
18             This module provides functions that expand strings into real absolute
19             URIs using some built-in heuristics. Strings that already represent
20             absolute URIs (i.e. that start with a C part) are never modified
21             and are returned unchanged. The main use of these functions is to
22             allow abbreviated URIs similar to what many web browsers allow for URIs
23             typed in by the user.
24              
25             The following functions are provided:
26              
27             =over 4
28              
29             =item uf_uristr($str)
30              
31             Tries to make the argument string
32             into a proper absolute URI string. The "uf_" prefix stands for "User
33             Friendly". Under MacOS, it assumes that any string with a common URL
34             scheme (http, ftp, etc.) is a URL rather than a local path. So don't name
35             your volumes after common URL schemes and expect uf_uristr() to construct
36             valid file: URL's on those volumes for you, because it won't.
37              
38             =item uf_uri($str)
39              
40             Works the same way as uf_uristr() but
41             returns a C object.
42              
43             =back
44              
45             =head1 ENVIRONMENT
46              
47             If the hostname portion of a URI does not contain any dots, then
48             certain qualified guesses are made. These guesses are governed by
49             the following environment variables:
50              
51             =over 10
52              
53             =item COUNTRY
54              
55             The two-letter country code (ISO 3166) for your location. If
56             the domain name of your host ends with two letters, then it is taken
57             to be the default country. See also L.
58              
59             =item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG
60              
61             If COUNTRY is not set, these standard environment variables are
62             examined and country (not language) information possibly found in them
63             is used as the default country.
64              
65             =item URL_GUESS_PATTERN
66              
67             Contains a space-separated list of URL patterns to try. The string
68             "ACME" is for some reason used as a placeholder for the host name in
69             the URL provided. Example:
70              
71             URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
72             export URL_GUESS_PATTERN
73              
74             Specifying URL_GUESS_PATTERN disables any guessing rules based on
75             country. An empty URL_GUESS_PATTERN disables any guessing that
76             involves host name lookups.
77              
78             =back
79              
80             =head1 COPYRIGHT
81              
82             Copyright 1997-1998, Gisle Aas
83              
84             This library is free software; you can redistribute it and/or
85             modify it under the same terms as Perl itself.
86              
87             =cut
88              
89 1     1   66051 use strict;
  1         9  
  1         29  
90 1     1   5 use warnings;
  1         2  
  1         29  
91              
92 1     1   13 use Exporter 5.57 'import';
  1         26  
  1         1668  
93             our @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr);
94             our $VERSION = '5.19';
95              
96             our ($MY_COUNTRY, $DEBUG);
97              
98             sub MY_COUNTRY() {
99 16     16 0 30 for ($MY_COUNTRY) {
100 16 100       40 return $_ if defined;
101              
102             # First try the environment.
103 7         13 $_ = $ENV{COUNTRY};
104 7 50       11 return $_ if defined;
105              
106             # Try the country part of LC_ALL and LANG from environment
107 7         19 my @srcs = ($ENV{LC_ALL}, $ENV{LANG});
108             # ...and HTTP_ACCEPT_LANGUAGE before those if present
109 7 100       14 if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) {
110             # TODO: q-value processing/ordering
111 3         8 for $httplang (split(/\s*,\s*/, $httplang)) {
112 3 50       14 if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) {
113 3         10 unshift(@srcs, "${1}_${2}");
114 3         5 last;
115             }
116             }
117             }
118 7         12 for (@srcs) {
119 9 50       16 next unless defined;
120 9 100       39 return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/;
121             }
122              
123             # Last bit of domain name. This may access the network.
124 2         9 require Net::Domain;
125 2         8 my $fqdn = Net::Domain::hostfqdn();
126 2 100       15 $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
127 2 100       10 return $_ if defined;
128              
129             # Give up. Defined but false.
130 1         5 return ($_ = 0);
131             }
132             }
133              
134             our %LOCAL_GUESSING =
135             (
136             'us' => [qw(www.ACME.gov www.ACME.mil)],
137             'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
138             'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
139             'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
140             # send corrections and new entries to
141             );
142             # Backwards compatibility; uk != United Kingdom in ISO 3166
143             $LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb};
144              
145              
146             sub uf_uristr ($)
147             {
148 26     26 1 2259 local($_) = @_;
149 26 50       62 print STDERR "uf_uristr: resolving $_\n" if $DEBUG;
150 26 50       50 return unless defined;
151              
152 26         80 s/^\s+//;
153 26         60 s/\s+$//;
154              
155 26 50 100     302 if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) {
    100 33        
    100 33        
    50          
    100          
    100          
156 0         0 $_ = "http://$_";
157              
158             } elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) {
159 6         25 $_ = lc($1) . "://$_";
160              
161             } elsif ($^O ne "MacOS" &&
162             (m,^/, || # absolute file name
163             m,^\.\.?/, || # relative file name
164             m,^[a-zA-Z]:[/\\],) # dosish file name
165             )
166             {
167 3         9 $_ = "file:$_";
168              
169             } elsif ($^O eq "MacOS" && m/:/) {
170             # potential MacOS file name
171 0 0       0 unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) {
172 0         0 require URI::file;
173 0         0 my $a = URI::file->new($_)->as_string;
174 0 0       0 $_ = ($a =~ m/^file:/) ? $a : "file:$a";
175             }
176             } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
177 2         9 $_ = "mailto:$_";
178              
179             } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified
180 12 50       79 if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) {
181 12         28 my $host = $1;
182              
183 12         18 my $scheme = "http";
184 12 100       28 if (/^:(\d+)\b/) {
185             # Some more or less well known ports
186 3 100       26 if ($1 =~ /^[56789]?443$/) {
    100          
187 1         3 $scheme = "https";
188             } elsif ($1 eq "21") {
189 1         3 $scheme = "ftp";
190             }
191             }
192              
193 12 100 66     47 if ($host !~ /\./ && $host ne "localhost") {
194 9         15 my @guess;
195 9 100       27 if (exists $ENV{URL_GUESS_PATTERN}) {
196 2         10 @guess = map { s/\bACME\b/$host/; $_ }
  2         7  
197 2         9 split(' ', $ENV{URL_GUESS_PATTERN});
198             } else {
199 7 100       17 if (MY_COUNTRY()) {
200 6         11 my $special = $LOCAL_GUESSING{MY_COUNTRY()};
201 6 100       12 if ($special) {
202 3         7 my @special = @$special;
203 3         7 push(@guess, map { s/\bACME\b/$host/; $_ }
  9         29  
  9         21  
204             @special);
205             } else {
206 3         10 push(@guess, "www.$host." . MY_COUNTRY());
207             }
208             }
209 7         48 push(@guess, map "www.$host.$_",
210             "com", "org", "net", "edu", "int");
211             }
212              
213              
214 9         13 my $guess;
215 9         17 for $guess (@guess) {
216 9 50       17 print STDERR "uf_uristr: gethostbyname('$guess.')..."
217             if $DEBUG;
218 9 100       24 if (gethostbyname("$guess.")) {
219 8 50       75 print STDERR "yes\n" if $DEBUG;
220 8         11 $host = $guess;
221 8         19 last;
222             }
223 1 50       13 print STDERR "no\n" if $DEBUG;
224             }
225             }
226 12         32 $_ = "$scheme://$host$_";
227              
228             } else {
229             # pure junk, just return it unchanged...
230              
231             }
232             }
233 26 50       55 print STDERR "uf_uristr: ==> $_\n" if $DEBUG;
234              
235 26         131 $_;
236             }
237              
238             sub uf_uri ($)
239             {
240 0     0 1 0 require URI;
241 0         0 URI->new(uf_uristr($_[0]));
242             }
243              
244             # legacy
245             *uf_urlstr = \*uf_uristr;
246              
247             sub uf_url ($)
248             {
249 5     5 0 434 require URI::URL;
250 5         16 URI::URL->new(uf_uristr($_[0]));
251             }
252              
253             1;