line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head2 link_url |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
C looks at a candidate. |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=cut |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
package WWW::Link_Controller::URL; |
8
|
|
|
|
|
|
|
$REVISION=q$Revision: 1.8 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ ); |
9
|
1
|
|
|
1
|
|
1716
|
use URI; |
|
1
|
|
|
|
|
7931
|
|
|
1
|
|
|
|
|
27
|
|
10
|
1
|
|
|
1
|
|
8
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
83
|
|
11
|
1
|
|
|
1
|
|
17
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
47
|
|
12
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1336
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $verbose; |
15
|
|
|
|
|
|
|
our $no_warn; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
#charset definitions |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub RESERVED () { '[;/?:@&=+$]';} |
20
|
|
|
|
|
|
|
sub ALPHA () { '[A-Za-z]'; } |
21
|
|
|
|
|
|
|
sub ALPHA_NUM () { '[A-Za-z0-9]'; } |
22
|
|
|
|
|
|
|
sub SCHEME_CHAR () { '[A-Za-z0-9+.-]'; } |
23
|
|
|
|
|
|
|
sub MARK () { "[-_.!~*'()]"; } |
24
|
|
|
|
|
|
|
sub UNRESERVED () { '(?:' . ALPHA_NUM . '|' . MARK . ')' }; |
25
|
|
|
|
|
|
|
sub ESCAPE () { '[%]'; } |
26
|
|
|
|
|
|
|
sub SCHEME () { '(?:(?i)[a-z][a-z0-9+.-]*)';} |
27
|
|
|
|
|
|
|
sub ABSURI () { SCHEME . ':' . '/';} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub CONTROL () { '[\x00-\x1F\x7F]'; } |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
#N.B. % and # normally included here. We don't include % since it is |
32
|
|
|
|
|
|
|
#allowed as the escape character. and we don't include # since within |
33
|
|
|
|
|
|
|
#LinkController we consider the fragment as part of the URL in |
34
|
|
|
|
|
|
|
#contradiction with the standards, since we may be interested to check |
35
|
|
|
|
|
|
|
#that the exact fragment of the resource exists |
36
|
|
|
|
|
|
|
sub DELIMS () { '[<>"]'; } |
37
|
|
|
|
|
|
|
sub UNWISE () { '[{}|\^[]`]'; } |
38
|
|
|
|
|
|
|
sub EXCLUDED () { '(?:'. CONTROL .'|'. DELIMS .'|'. UNWISE .'|'.' '.'|'.'#'.')' }; |
39
|
|
|
|
|
|
|
sub URIC () { '(?:' . RESERVED . '|' . UNRESERVED . '|' . ESCAPE . ')' }; |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
#sub AUTHORITY () { '(?:(?i)[a-z][a-z0-9+.-]*)';} |
42
|
|
|
|
|
|
|
#sub NET_PATH () { '//' . AUTHORITY . '/' . ABS_PATH; } |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head2 verify_url |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
verify url checks that a url is a valid and possible uri in the terms |
48
|
|
|
|
|
|
|
of RFC2396 |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub verify_url ($) { |
53
|
17
|
|
|
17
|
1
|
91
|
my $url=shift; |
54
|
17
|
|
|
|
|
18
|
my $control=CONTROL; |
55
|
|
|
|
|
|
|
# we don't print it out directly.. maybe we shouldn't even print out |
56
|
|
|
|
|
|
|
# the warning. |
57
|
17
|
50
|
|
|
|
60
|
do { carp "url $url contains control characters" unless $no_warn; |
|
1
|
100
|
|
|
|
3
|
|
58
|
1
|
|
|
|
|
2
|
return undef; } if $url =~ m/$control/; |
59
|
16
|
|
|
|
|
17
|
my $exclude=EXCLUDED; |
60
|
16
|
|
|
|
|
17
|
my $ex; |
61
|
16
|
50
|
|
|
|
187
|
do { carp "url $url contains excluded character: $ex" unless $no_warn; |
|
2
|
100
|
|
|
|
5
|
|
62
|
2
|
|
|
|
|
5
|
return undef; } if ($ex) = $url =~ m/($exclude)/; |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
#try to identify invalid schemes. The problem here is that it's possible |
65
|
|
|
|
|
|
|
#to have a : elsewhere in a URL so we have to be very careful. |
66
|
|
|
|
|
|
|
|
67
|
14
|
|
|
|
|
18
|
my $scheme=$url; |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
#chop off anything which is definitely not the scheme.. this gets rid of |
70
|
|
|
|
|
|
|
#the second part of any paths etc. This protects us against relative urls |
71
|
|
|
|
|
|
|
#which have a : in them (N.B. |
72
|
|
|
|
|
|
|
|
73
|
14
|
|
|
|
|
30
|
$scheme =~ s,[#/].*,, ; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
#now keep the bit preceeding the : |
76
|
|
|
|
|
|
|
|
77
|
14
|
|
|
|
|
46
|
($scheme) = $scheme =~ m/^([^:]*):/; |
78
|
|
|
|
|
|
|
|
79
|
14
|
100
|
|
|
|
34
|
if ( defined $scheme ) { |
80
|
13
|
|
|
|
|
13
|
my $scheme_re= '^' . ALPHA .'('. ALPHA_NUM ."|". SCHEME_CHAR .')*$' ; |
81
|
13
|
50
|
|
|
|
69
|
do { carp "url $url has illegal scheme: $scheme" unless $no_warn; |
|
3
|
100
|
|
|
|
6
|
|
82
|
3
|
|
|
|
|
8
|
return undef; } unless $scheme =~ m/$scheme_re/; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
11
|
|
|
|
|
27
|
return 1; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 untaint_url |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Used in our CGI bin programs, untaint_url takes a scalar and returns |
91
|
|
|
|
|
|
|
it untainted if and only if it's contains only valid url characters |
92
|
|
|
|
|
|
|
and it is a valid url according to verify_url. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
A fundamental assumption in using this function is that your software |
95
|
|
|
|
|
|
|
can handle B which looks like a valid URL, even if it isn't |
96
|
|
|
|
|
|
|
a valid url. E.g. C. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=cut |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
sub untaint_url { |
101
|
2
|
|
|
2
|
1
|
14
|
my $url=shift; |
102
|
2
|
|
|
|
|
3
|
my $re='^'. URIC .'+$'; |
103
|
2
|
|
|
|
|
44
|
my ($ret)= $url =~ m/($re)/; |
104
|
2
|
100
|
|
|
|
6
|
defined $ret or do { |
105
|
|
|
|
|
|
|
# $url =~ y/[A-Za-z0-9]/_/c;# clean url so we can print it out |
106
|
1
|
50
|
|
|
|
4
|
warn "bad url passed to url_untaint" unless $no_warn; |
107
|
1
|
|
|
|
|
2
|
return undef; |
108
|
|
|
|
|
|
|
}; |
109
|
1
|
50
|
|
|
|
3
|
return undef unless verify_url($ret); |
110
|
1
|
|
|
|
|
4
|
return $ret; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 verify_fragment |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Fragments have fairly free syntax but RFC 2396 says clearly they |
116
|
|
|
|
|
|
|
should conform to the same character set as URIs. Unfortunately, it |
117
|
|
|
|
|
|
|
seems that many people put spaces in their fragments in contradiction |
118
|
|
|
|
|
|
|
with the RFC since it works in HTML in practice. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
We choose not to accept those and people should be able to change over? |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
If it turns out, as it probably will, that there is a real need for |
123
|
|
|
|
|
|
|
spaces in cross references to other people's documents which can't be |
124
|
|
|
|
|
|
|
fixed then maybe we will have to reconsider. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=cut |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub verify_fragment ($) { |
129
|
2
|
|
|
2
|
1
|
3
|
my $fragment=shift; |
130
|
2
|
50
|
|
|
|
8
|
defined $fragment or return undef; |
131
|
0
|
|
|
|
|
0
|
my $control=CONTROL; |
132
|
|
|
|
|
|
|
# we don't print it out directly.. maybe we shouldn't even print out |
133
|
|
|
|
|
|
|
# the warning. |
134
|
0
|
0
|
|
|
|
0
|
do { carp "url $fragment contains control characters" unless $no_warn; |
|
0
|
0
|
|
|
|
0
|
|
135
|
0
|
|
|
|
|
0
|
return undef; } if $fragment =~ m/$control/; |
136
|
0
|
|
|
|
|
0
|
my $exclude=EXCLUDED; |
137
|
0
|
|
|
|
|
0
|
my $ex; |
138
|
0
|
0
|
|
|
|
0
|
do { carp "url $fragment contains excluded character: $ex" unless $no_warn; |
|
0
|
0
|
|
|
|
0
|
|
139
|
0
|
|
|
|
|
0
|
return undef; } if ($ex) = $fragment =~ m/($exclude)/; |
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
0
|
return 1; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub extract_fragment { |
145
|
2
|
|
|
2
|
0
|
3
|
my $link=shift; |
146
|
2
|
|
|
|
|
10
|
my ($url,$fragment)= $link =~ m/([^#]*)(?:#(.*))?/; |
147
|
2
|
50
|
|
|
|
6
|
$::verbose & 16 and do { |
148
|
0
|
0
|
|
|
|
0
|
print STDERR "URL is $url and fragment is $fragment\n" |
149
|
|
|
|
|
|
|
if defined $fragment; |
150
|
0
|
0
|
|
|
|
0
|
print STDERR "URL is $url no fragment\n" unless defined $fragment; |
151
|
|
|
|
|
|
|
}; |
152
|
2
|
|
|
|
|
5
|
return $url,$fragment; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub fixup_link_url ($$) { |
156
|
2
|
|
|
2
|
0
|
21
|
my $link=shift; |
157
|
2
|
|
|
|
|
4
|
my $base=shift; |
158
|
2
|
50
|
|
|
|
6
|
croak "usage link_url(,)" unless defined $link; |
159
|
|
|
|
|
|
|
|
160
|
2
|
|
|
|
|
7
|
my ($url,$fragment)=extract_fragment($link); |
161
|
|
|
|
|
|
|
|
162
|
2
|
50
|
|
|
|
5
|
unless (verify_url($url)) { |
163
|
0
|
0
|
|
|
|
0
|
warn "dropping url: $url" unless $no_warn; |
164
|
0
|
|
|
|
|
0
|
return undef; |
165
|
|
|
|
|
|
|
}; |
166
|
|
|
|
|
|
|
|
167
|
2
|
50
|
|
|
|
5
|
unless (verify_fragment($fragment)) { |
168
|
2
|
50
|
|
|
|
5
|
warn "dropping illegal fragment: $fragment for url $url" if defined $fragment; |
169
|
2
|
|
|
|
|
3
|
$fragment=undef; |
170
|
|
|
|
|
|
|
}; |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
$url =~ m,^(?:ftp|gopher|http|https|ldap|rsync|telnet):(?:[^/]|.[^/]), |
173
|
2
|
50
|
|
|
|
8
|
and do { |
174
|
0
|
|
|
|
|
0
|
warn "ERROR: ignoring relative url with scheme $url"; |
175
|
0
|
|
|
|
|
0
|
return undef; |
176
|
|
|
|
|
|
|
}; |
177
|
|
|
|
|
|
|
|
178
|
2
|
|
|
|
|
13
|
my $urlo=URI->new($url); |
179
|
2
|
|
|
|
|
9055
|
my $aurlo=$urlo->abs($base); |
180
|
2
|
|
|
|
|
553
|
my $ret_url; |
181
|
2
|
100
|
|
|
|
7
|
if ( URI::eq($urlo,$aurlo) ) { |
182
|
1
|
|
|
|
|
322
|
$ret_url = $url; |
183
|
|
|
|
|
|
|
} else { |
184
|
1
|
|
|
|
|
10
|
$ret_url=$aurlo->as_string(); |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
$ret_url =~ m,^(?:ftp|gopher|http|https|ldap|rsync|telnet):(?:[^/]|.[^/]), |
188
|
2
|
50
|
|
|
|
26
|
and do { |
189
|
0
|
|
|
|
|
0
|
warn "ERROR: abs(url) $url gave $ret_url"; |
190
|
0
|
|
|
|
|
0
|
return undef; |
191
|
|
|
|
|
|
|
}; |
192
|
|
|
|
|
|
|
|
193
|
2
|
50
|
33
|
|
|
15
|
print STDERR "fixed up link name $url\n" |
194
|
|
|
|
|
|
|
if $::verbose & 16 and defined $url; |
195
|
2
|
|
|
|
|
36
|
return $ret_url; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
99; |