File Coverage

blib/lib/URI/Heuristic.pm
Criterion Covered Total %
statement 73 80 91.2
branch 48 64 75.0
condition 10 15 66.6
subroutine 6 7 85.7
pod 2 4 50.0
total 139 170 81.7


line stmt bran cond sub pod time code
1             package URI::Heuristic;
2              
3             =head1 NAME
4              
5             URI::Heuristic - Expand URI using heuristics
6              
7             =head1 SYNOPSIS
8              
9             use URI::Heuristic qw(uf_uristr);
10             $u = uf_uristr("example"); # http://www.example.com
11             $u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol
12             $u = uf_uristr("aas"); # http://www.aas.no
13             $u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi
14             $u = uf_uristr("/etc/passwd"); # file:/etc/passwd
15              
16             =head1 DESCRIPTION
17              
18             This module provides functions that expand strings into real absolute
19             URIs using some built-in heuristics. Strings that already represent
20             absolute URIs (i.e. that start with a C part) are never modified
21             and are returned unchanged. The main use of these functions is to
22             allow abbreviated URIs similar to what many web browsers allow for URIs
23             typed in by the user.
24              
25             The following functions are provided:
26              
27             =over 4
28              
29             =item uf_uristr($str)
30              
31             Tries to make the argument string
32             into a proper absolute URI string. The "uf_" prefix stands for "User
33             Friendly". Under MacOS, it assumes that any string with a common URL
34             scheme (http, ftp, etc.) is a URL rather than a local path. So don't name
35             your volumes after common URL schemes and expect uf_uristr() to construct
36             valid file: URL's on those volumes for you, because it won't.
37              
38             =item uf_uri($str)
39              
40             Works the same way as uf_uristr() but
41             returns a C object.
42              
43             =back
44              
45             =head1 ENVIRONMENT
46              
47             If the hostname portion of a URI does not contain any dots, then
48             certain qualified guesses are made. These guesses are governed by
49             the following environment variables:
50              
51             =over 10
52              
53             =item COUNTRY
54              
55             The two-letter country code (ISO 3166) for your location. If
56             the domain name of your host ends with two letters, then it is taken
57             to be the default country. See also L.
58              
59             =item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG
60              
61             If COUNTRY is not set, these standard environment variables are
62             examined and country (not language) information possibly found in them
63             is used as the default country.
64              
65             =item URL_GUESS_PATTERN
66              
67             Contains a space-separated list of URL patterns to try. The string
68             "ACME" is for some reason used as a placeholder for the host name in
69             the URL provided. Example:
70              
71             URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
72             export URL_GUESS_PATTERN
73              
74             Specifying URL_GUESS_PATTERN disables any guessing rules based on
75             country. An empty URL_GUESS_PATTERN disables any guessing that
76             involves host name lookups.
77              
78             =back
79              
80             =head1 COPYRIGHT
81              
82             Copyright 1997-1998, Gisle Aas
83              
84             This library is free software; you can redistribute it and/or
85             modify it under the same terms as Perl itself.
86              
87             =cut
88              
89 1     1   103088 use strict;
  1         2  
  1         45  
90 1     1   7 use warnings;
  1         8  
  1         84  
91              
92 1     1   8 use Exporter 5.57 'import';
  1         30  
  1         3404  
93             our @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr);
94             our $VERSION = '5.34';
95              
96             our ($MY_COUNTRY, $DEBUG);
97              
98             sub MY_COUNTRY() {
99 16     16 0 32 for ($MY_COUNTRY) {
100 16 100       48 return $_ if defined;
101              
102             # First try the environment.
103 7         15 $_ = $ENV{COUNTRY};
104 7 50       17 return $_ if defined;
105              
106             # Try the country part of LC_ALL and LANG from environment
107 7         34 my @srcs = ($ENV{LC_ALL}, $ENV{LANG});
108             # ...and HTTP_ACCEPT_LANGUAGE before those if present
109 7 100       24 if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) {
110             # TODO: q-value processing/ordering
111 3         14 for $httplang (split(/\s*,\s*/, $httplang)) {
112 3 50       18 if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) {
113 3         14 unshift(@srcs, "${1}_${2}");
114 3         7 last;
115             }
116             }
117             }
118 7         14 for (@srcs) {
119 9 50       20 next unless defined;
120 9 100       59 return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/;
121             }
122              
123             # Last bit of domain name. This may access the network.
124 2         16 require Net::Domain;
125 2         6 my $fqdn = Net::Domain::hostfqdn();
126 2 100       18 $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
127 2 100       11 return $_ if defined;
128              
129             # Give up. Defined but false.
130 1         6 return ($_ = 0);
131             }
132             }
133              
134             our %LOCAL_GUESSING =
135             (
136             'us' => [qw(www.ACME.gov www.ACME.mil)],
137             'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
138             'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
139             'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
140             # send corrections and new entries to
141             );
142             # Backwards compatibility; uk != United Kingdom in ISO 3166
143             $LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb};
144              
145              
146             sub uf_uristr ($)
147             {
148 28     28 1 502453 local($_) = @_;
149 28 50       92 print STDERR "uf_uristr: resolving $_\n" if $DEBUG;
150 28 50       74 return unless defined;
151              
152 28         121 s/^\s+//;
153 28         74 s/\s+$//;
154              
155 28 50 100     553 if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) {
    100 100        
    100 33        
    100 33        
    50          
    100          
    100          
156 0         0 $_ = "http://$_";
157              
158             } elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) {
159 6         28 $_ = lc($1) . "://$_";
160              
161             } elsif (
162             m,^//, || m,^[\\][\\],) # UNC-like file name
163             {
164 2         9 s{[\\]}{/}g;
165 2         6 $_ = "smb:$_";
166             } elsif ($^O ne "MacOS" &&
167             (m,^/, || # absolute file name
168             m,^\.\.?/, || # relative file name
169             m,^[a-zA-Z]:[/\\],) # dosish file name
170             )
171             {
172 3         9 $_ = "file:$_";
173              
174             } elsif ($^O eq "MacOS" && m/:/) {
175             # potential MacOS file name
176 0 0       0 unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) {
177 0         0 require URI::file;
178 0         0 my $a = URI::file->new($_)->as_string;
179 0 0       0 $_ = ($a =~ m/^file:/) ? $a : "file:$a";
180             }
181             } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
182 2         8 $_ = "mailto:$_";
183              
184             } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified
185 12 50       105 if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) {
186 12         36 my $host = $1;
187              
188 12         19 my $scheme = "http";
189 12 100       40 if (/^:(\d+)\b/) {
190             # Some more or less well known ports
191 3 100       16 if ($1 =~ /^[56789]?443$/) {
    100          
192 1         3 $scheme = "https";
193             } elsif ($1 eq "21") {
194 1         3 $scheme = "ftp";
195             }
196             }
197              
198 12 100 66     56 if ($host !~ /\./ && $host ne "localhost") {
199 9         15 my @guess;
200 9 100       57 if (exists $ENV{URL_GUESS_PATTERN}) {
201 2         9 @guess = map { s/\bACME\b/$host/; $_ }
  2         7  
202 2         9 split(' ', $ENV{URL_GUESS_PATTERN});
203             } else {
204 7 100       20 if (MY_COUNTRY()) {
205 6         14 my $special = $LOCAL_GUESSING{MY_COUNTRY()};
206 6 100       14 if ($special) {
207 3         10 my @special = @$special;
208 3         15 push(@guess, map { s/\bACME\b/$host/; $_ }
  9         47  
  9         31  
209             @special);
210             } else {
211 3         11 push(@guess, "www.$host." . MY_COUNTRY());
212             }
213             }
214 7         69 push(@guess, map "www.$host.$_",
215             "com", "org", "net", "edu", "int");
216             }
217              
218              
219 9         17 my $guess;
220 9         18 for $guess (@guess) {
221 9 50       25 print STDERR "uf_uristr: gethostbyname('$guess.')..."
222             if $DEBUG;
223 9 100       37 if (gethostbyname("$guess.")) {
224 8 50       89 print STDERR "yes\n" if $DEBUG;
225 8         15 $host = $guess;
226 8         24 last;
227             }
228 1 50       24 print STDERR "no\n" if $DEBUG;
229             }
230             }
231 12         39 $_ = "$scheme://$host$_";
232              
233             } else {
234             # pure junk, just return it unchanged...
235              
236             }
237             }
238 28 50       93 print STDERR "uf_uristr: ==> $_\n" if $DEBUG;
239              
240 28         158 $_;
241             }
242              
243             sub uf_uri ($)
244             {
245 0     0 1 0 require URI;
246 0         0 URI->new(uf_uristr($_[0]));
247             }
248              
249             # legacy
250             *uf_urlstr = \*uf_uristr;
251              
252             sub uf_url ($)
253             {
254 5     5 0 884 require URI::URL;
255 5         16 URI::URL->new(uf_uristr($_[0]));
256             }
257              
258             1;