| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package URI::Heuristic; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | =head1 NAME | 
| 4 |  |  |  |  |  |  |  | 
| 5 |  |  |  |  |  |  | URI::Heuristic - Expand URI using heuristics | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | use URI::Heuristic qw(uf_uristr); | 
| 10 |  |  |  |  |  |  | $u = uf_uristr("example");          # http://www.example.com | 
| 11 |  |  |  |  |  |  | $u = uf_uristr("www.sol.no/sol");   # http://www.sol.no/sol | 
| 12 |  |  |  |  |  |  | $u = uf_uristr("aas");              # http://www.aas.no | 
| 13 |  |  |  |  |  |  | $u = uf_uristr("ftp.funet.fi");     # ftp://ftp.funet.fi | 
| 14 |  |  |  |  |  |  | $u = uf_uristr("/etc/passwd");      # file:/etc/passwd | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | This module provides functions that expand strings into real absolute | 
| 19 |  |  |  |  |  |  | URIs using some built-in heuristics.  Strings that already represent | 
| 20 |  |  |  |  |  |  | absolute URIs (i.e. that start with a C part) are never modified | 
| 21 |  |  |  |  |  |  | and are returned unchanged.  The main use of these functions is to | 
| 22 |  |  |  |  |  |  | allow abbreviated URIs similar to what many web browsers allow for URIs | 
| 23 |  |  |  |  |  |  | typed in by the user. | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | The following functions are provided: | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | =over 4 | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | =item uf_uristr($str) | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | Tries to make the argument string | 
| 32 |  |  |  |  |  |  | into a proper absolute URI string.  The "uf_" prefix stands for "User | 
| 33 |  |  |  |  |  |  | Friendly".  Under MacOS, it assumes that any string with a common URL | 
| 34 |  |  |  |  |  |  | scheme (http, ftp, etc.) is a URL rather than a local path.  So don't name | 
| 35 |  |  |  |  |  |  | your volumes after common URL schemes and expect uf_uristr() to construct | 
| 36 |  |  |  |  |  |  | valid file: URL's on those volumes for you, because it won't. | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | =item uf_uri($str) | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | Works the same way as uf_uristr() but | 
| 41 |  |  |  |  |  |  | returns a C object. | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | =back | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | =head1 ENVIRONMENT | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | If the hostname portion of a URI does not contain any dots, then | 
| 48 |  |  |  |  |  |  | certain qualified guesses are made.  These guesses are governed by | 
| 49 |  |  |  |  |  |  | the following environment variables: | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | =over 10 | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | =item COUNTRY | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | The two-letter country code (ISO 3166) for your location.  If | 
| 56 |  |  |  |  |  |  | the domain name of your host ends with two letters, then it is taken | 
| 57 |  |  |  |  |  |  | to be the default country. See also L. | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | =item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | If COUNTRY is not set, these standard environment variables are | 
| 62 |  |  |  |  |  |  | examined and country (not language) information possibly found in them | 
| 63 |  |  |  |  |  |  | is used as the default country. | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | =item URL_GUESS_PATTERN | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | Contains a space-separated list of URL patterns to try.  The string | 
| 68 |  |  |  |  |  |  | "ACME" is for some reason used as a placeholder for the host name in | 
| 69 |  |  |  |  |  |  | the URL provided.  Example: | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com" | 
| 72 |  |  |  |  |  |  | export URL_GUESS_PATTERN | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | Specifying URL_GUESS_PATTERN disables any guessing rules based on | 
| 75 |  |  |  |  |  |  | country.  An empty URL_GUESS_PATTERN disables any guessing that | 
| 76 |  |  |  |  |  |  | involves host name lookups. | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | =back | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | =head1 COPYRIGHT | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | Copyright 1997-1998, Gisle Aas | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | This library is free software; you can redistribute it and/or | 
| 85 |  |  |  |  |  |  | modify it under the same terms as Perl itself. | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | =cut | 
| 88 |  |  |  |  |  |  |  | 
| 89 | 1 |  |  | 1 |  | 68355 | use strict; | 
|  | 1 |  |  |  |  | 11 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 90 | 1 |  |  | 1 |  | 6 | use warnings; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 91 |  |  |  |  |  |  |  | 
| 92 | 1 |  |  | 1 |  | 14 | use Exporter 5.57 'import'; | 
|  | 1 |  |  |  |  | 25 |  | 
|  | 1 |  |  |  |  | 1707 |  | 
| 93 |  |  |  |  |  |  | our @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr); | 
| 94 |  |  |  |  |  |  | our $VERSION = '5.20'; | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | our ($MY_COUNTRY, $DEBUG); | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | sub MY_COUNTRY() { | 
| 99 | 16 |  |  | 16 | 0 | 26 | for ($MY_COUNTRY) { | 
| 100 | 16 | 100 |  |  |  | 39 | return $_ if defined; | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | # First try the environment. | 
| 103 | 7 |  |  |  |  | 12 | $_ = $ENV{COUNTRY}; | 
| 104 | 7 | 50 |  |  |  | 14 | return $_ if defined; | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | # Try the country part of LC_ALL and LANG from environment | 
| 107 | 7 |  |  |  |  | 16 | my @srcs = ($ENV{LC_ALL}, $ENV{LANG}); | 
| 108 |  |  |  |  |  |  | # ...and HTTP_ACCEPT_LANGUAGE before those if present | 
| 109 | 7 | 100 |  |  |  | 14 | if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) { | 
| 110 |  |  |  |  |  |  | # TODO: q-value processing/ordering | 
| 111 | 3 |  |  |  |  | 8 | for $httplang (split(/\s*,\s*/, $httplang)) { | 
| 112 | 3 | 50 |  |  |  | 15 | if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) { | 
| 113 | 3 |  |  |  |  | 10 | unshift(@srcs, "${1}_${2}"); | 
| 114 | 3 |  |  |  |  | 5 | last; | 
| 115 |  |  |  |  |  |  | } | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  | } | 
| 118 | 7 |  |  |  |  | 12 | for (@srcs) { | 
| 119 | 9 | 50 |  |  |  | 15 | next unless defined; | 
| 120 | 9 | 100 |  |  |  | 38 | return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/; | 
| 121 |  |  |  |  |  |  | } | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | # Last bit of domain name.  This may access the network. | 
| 124 | 2 |  |  |  |  | 11 | require Net::Domain; | 
| 125 | 2 |  |  |  |  | 5 | my $fqdn = Net::Domain::hostfqdn(); | 
| 126 | 2 | 100 |  |  |  | 16 | $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/; | 
| 127 | 2 | 100 |  |  |  | 7 | return $_ if defined; | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | # Give up.  Defined but false. | 
| 130 | 1 |  |  |  |  | 5 | return ($_ = 0); | 
| 131 |  |  |  |  |  |  | } | 
| 132 |  |  |  |  |  |  | } | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | our %LOCAL_GUESSING = | 
| 135 |  |  |  |  |  |  | ( | 
| 136 |  |  |  |  |  |  | 'us' => [qw(www.ACME.gov www.ACME.mil)], | 
| 137 |  |  |  |  |  |  | 'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)], | 
| 138 |  |  |  |  |  |  | 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)], | 
| 139 |  |  |  |  |  |  | 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)], | 
| 140 |  |  |  |  |  |  | # send corrections and new entries to | 
| 141 |  |  |  |  |  |  | ); | 
| 142 |  |  |  |  |  |  | # Backwards compatibility; uk != United Kingdom in ISO 3166 | 
| 143 |  |  |  |  |  |  | $LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb}; | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | sub uf_uristr ($) | 
| 147 |  |  |  |  |  |  | { | 
| 148 | 26 |  |  | 26 | 1 | 2227 | local($_) = @_; | 
| 149 | 26 | 50 |  |  |  | 65 | print STDERR "uf_uristr: resolving $_\n" if $DEBUG; | 
| 150 | 26 | 50 |  |  |  | 50 | return unless defined; | 
| 151 |  |  |  |  |  |  |  | 
| 152 | 26 |  |  |  |  | 83 | s/^\s+//; | 
| 153 | 26 |  |  |  |  | 57 | s/\s+$//; | 
| 154 |  |  |  |  |  |  |  | 
| 155 | 26 | 50 | 100 |  |  | 335 | if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) { | 
|  |  | 100 | 33 |  |  |  |  | 
|  |  | 100 | 33 |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
| 156 | 0 |  |  |  |  | 0 | $_ = "http://$_"; | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | } elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) { | 
| 159 | 6 |  |  |  |  | 26 | $_ = lc($1) . "://$_"; | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | } elsif ($^O ne "MacOS" && | 
| 162 |  |  |  |  |  |  | (m,^/,      ||          # absolute file name | 
| 163 |  |  |  |  |  |  | m,^\.\.?/, ||          # relative file name | 
| 164 |  |  |  |  |  |  | m,^[a-zA-Z]:[/\\],)    # dosish file name | 
| 165 |  |  |  |  |  |  | ) | 
| 166 |  |  |  |  |  |  | { | 
| 167 | 3 |  |  |  |  | 10 | $_ = "file:$_"; | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | } elsif ($^O eq "MacOS" && m/:/) { | 
| 170 |  |  |  |  |  |  | # potential MacOS file name | 
| 171 | 0 | 0 |  |  |  | 0 | unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) { | 
| 172 | 0 |  |  |  |  | 0 | require URI::file; | 
| 173 | 0 |  |  |  |  | 0 | my $a = URI::file->new($_)->as_string; | 
| 174 | 0 | 0 |  |  |  | 0 | $_ = ($a =~ m/^file:/) ? $a : "file:$a"; | 
| 175 |  |  |  |  |  |  | } | 
| 176 |  |  |  |  |  |  | } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) { | 
| 177 | 2 |  |  |  |  | 6 | $_ = "mailto:$_"; | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) {      # no scheme specified | 
| 180 | 12 | 50 |  |  |  | 90 | if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) { | 
| 181 | 12 |  |  |  |  | 27 | my $host = $1; | 
| 182 |  |  |  |  |  |  |  | 
| 183 | 12 |  |  |  |  | 16 | my $scheme = "http"; | 
| 184 | 12 | 100 |  |  |  | 31 | if (/^:(\d+)\b/) { | 
| 185 |  |  |  |  |  |  | # Some more or less well known ports | 
| 186 | 3 | 100 |  |  |  | 27 | if ($1 =~ /^[56789]?443$/) { | 
|  |  | 100 |  |  |  |  |  | 
| 187 | 1 |  |  |  |  | 3 | $scheme = "https"; | 
| 188 |  |  |  |  |  |  | } elsif ($1 eq "21") { | 
| 189 | 1 |  |  |  |  | 3 | $scheme = "ftp"; | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  |  | 
| 193 | 12 | 100 | 66 |  |  | 45 | if ($host !~ /\./ && $host ne "localhost") { | 
| 194 | 9 |  |  |  |  | 12 | my @guess; | 
| 195 | 9 | 100 |  |  |  | 28 | if (exists $ENV{URL_GUESS_PATTERN}) { | 
| 196 | 2 |  |  |  |  | 8 | @guess = map { s/\bACME\b/$host/; $_ } | 
|  | 2 |  |  |  |  | 5 |  | 
| 197 | 2 |  |  |  |  | 8 | split(' ', $ENV{URL_GUESS_PATTERN}); | 
| 198 |  |  |  |  |  |  | } else { | 
| 199 | 7 | 100 |  |  |  | 20 | if (MY_COUNTRY()) { | 
| 200 | 6 |  |  |  |  | 10 | my $special = $LOCAL_GUESSING{MY_COUNTRY()}; | 
| 201 | 6 | 100 |  |  |  | 14 | if ($special) { | 
| 202 | 3 |  |  |  |  | 7 | my @special = @$special; | 
| 203 | 3 |  |  |  |  | 7 | push(@guess, map { s/\bACME\b/$host/; $_ } | 
|  | 9 |  |  |  |  | 30 |  | 
|  | 9 |  |  |  |  | 21 |  | 
| 204 |  |  |  |  |  |  | @special); | 
| 205 |  |  |  |  |  |  | } else { | 
| 206 | 3 |  |  |  |  | 8 | push(@guess, "www.$host." . MY_COUNTRY()); | 
| 207 |  |  |  |  |  |  | } | 
| 208 |  |  |  |  |  |  | } | 
| 209 | 7 |  |  |  |  | 43 | push(@guess, map "www.$host.$_", | 
| 210 |  |  |  |  |  |  | "com", "org", "net", "edu", "int"); | 
| 211 |  |  |  |  |  |  | } | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  |  | 
| 214 | 9 |  |  |  |  | 18 | my $guess; | 
| 215 | 9 |  |  |  |  | 17 | for $guess (@guess) { | 
| 216 | 9 | 50 |  |  |  | 16 | print STDERR "uf_uristr: gethostbyname('$guess.')..." | 
| 217 |  |  |  |  |  |  | if $DEBUG; | 
| 218 | 9 | 100 |  |  |  | 28 | if (gethostbyname("$guess.")) { | 
| 219 | 8 | 50 |  |  |  | 75 | print STDERR "yes\n" if $DEBUG; | 
| 220 | 8 |  |  |  |  | 12 | $host = $guess; | 
| 221 | 8 |  |  |  |  | 18 | last; | 
| 222 |  |  |  |  |  |  | } | 
| 223 | 1 | 50 |  |  |  | 11 | print STDERR "no\n" if $DEBUG; | 
| 224 |  |  |  |  |  |  | } | 
| 225 |  |  |  |  |  |  | } | 
| 226 | 12 |  |  |  |  | 27 | $_ = "$scheme://$host$_"; | 
| 227 |  |  |  |  |  |  |  | 
| 228 |  |  |  |  |  |  | } else { | 
| 229 |  |  |  |  |  |  | # pure junk, just return it unchanged... | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | } | 
| 232 |  |  |  |  |  |  | } | 
| 233 | 26 | 50 |  |  |  | 56 | print STDERR "uf_uristr: ==> $_\n" if $DEBUG; | 
| 234 |  |  |  |  |  |  |  | 
| 235 | 26 |  |  |  |  | 131 | $_; | 
| 236 |  |  |  |  |  |  | } | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | sub uf_uri ($) | 
| 239 |  |  |  |  |  |  | { | 
| 240 | 0 |  |  | 0 | 1 | 0 | require URI; | 
| 241 | 0 |  |  |  |  | 0 | URI->new(uf_uristr($_[0])); | 
| 242 |  |  |  |  |  |  | } | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | # legacy | 
| 245 |  |  |  |  |  |  | *uf_urlstr = \*uf_uristr; | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | sub uf_url ($) | 
| 248 |  |  |  |  |  |  | { | 
| 249 | 5 |  |  | 5 | 0 | 429 | require URI::URL; | 
| 250 | 5 |  |  |  |  | 13 | URI::URL->new(uf_uristr($_[0])); | 
| 251 |  |  |  |  |  |  | } | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | 1; |