File Coverage

blib/lib/WWW/Link_Controller/URL.pm

Criterion	Covered	Total	%
statement	63	81	77.7
branch	26	54	48.1
condition	1	3	33.3
subroutine	9	9	100.0
pod	3	5	60.0
total	102	152	67.1

line	stmt	bran	cond	sub	pod	time	code
1							=head2 link_url
2
3							C looks at a candidate.
4
5							=cut
6
7							package WWW::Link_Controller::URL;
8							$REVISION=q$Revision: 1.8 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ );
9	1			1		1716	use URI;
	1					7931
	1					27
10	1			1		8	use Carp;
	1					2
	1					83
11	1			1		17	use strict;
	1					2
	1					47
12	1			1		6	use warnings;
	1					2
	1					1336
13
14							our $verbose;
15							our $no_warn;
16
17							#charset definitions
18
19							sub RESERVED () { '[;/?:@&=+$]';}
20							sub ALPHA () { '[A-Za-z]'; }
21							sub ALPHA_NUM () { '[A-Za-z0-9]'; }
22							sub SCHEME_CHAR () { '[A-Za-z0-9+.-]'; }
23							sub MARK () { "[-_.!~*'()]"; }
24							sub UNRESERVED () { '(?:' . ALPHA_NUM . '\|' . MARK . ')' };
25							sub ESCAPE () { '[%]'; }
26							sub SCHEME () { '(?:(?i)[a-z][a-z0-9+.-]*)';}
27							sub ABSURI () { SCHEME . ':' . '/';}
28
29							sub CONTROL () { '[\x00-\x1F\x7F]'; }
30
31							#N.B. % and # normally included here. We don't include % since it is
32							#allowed as the escape character. and we don't include # since within
33							#LinkController we consider the fragment as part of the URL in
34							#contradiction with the standards, since we may be interested to check
35							#that the exact fragment of the resource exists
36							sub DELIMS () { '[<>"]'; }
37							sub UNWISE () { '[{}\|\^[]`]'; }
38							sub EXCLUDED () { '(?:'. CONTROL .'\|'. DELIMS .'\|'. UNWISE .'\|'.' '.'\|'.'#'.')' };
39							sub URIC () { '(?:' . RESERVED . '\|' . UNRESERVED . '\|' . ESCAPE . ')' };
40
41							#sub AUTHORITY () { '(?:(?i)[a-z][a-z0-9+.-]*)';}
42							#sub NET_PATH () { '//' . AUTHORITY . '/' . ABS_PATH; }
43
44
45							=head2 verify_url
46
47							verify url checks that a url is a valid and possible uri in the terms
48							of RFC2396
49
50							=cut
51
52							sub verify_url ($) {
53	17			17	1	91	my $url=shift;
54	17					18	my $control=CONTROL;
55							# we don't print it out directly.. maybe we shouldn't even print out
56							# the warning.
57	17	50				60	do { carp "url $url contains control characters" unless $no_warn;
	1	100				3
58	1					2	return undef; } if $url =~ m/$control/;
59	16					17	my $exclude=EXCLUDED;
60	16					17	my $ex;
61	16	50				187	do { carp "url $url contains excluded character: $ex" unless $no_warn;
	2	100				5
62	2					5	return undef; } if ($ex) = $url =~ m/($exclude)/;
63
64							#try to identify invalid schemes. The problem here is that it's possible
65							#to have a : elsewhere in a URL so we have to be very careful.
66
67	14					18	my $scheme=$url;
68
69							#chop off anything which is definitely not the scheme.. this gets rid of
70							#the second part of any paths etc. This protects us against relative urls
71							#which have a : in them (N.B.
72
73	14					30	$scheme =~ s,[#/].*,, ;
74
75							#now keep the bit preceeding the :
76
77	14					46	($scheme) = $scheme =~ m/^([^:]*):/;
78
79	14	100				34	if ( defined $scheme ) {
80	13					13	my $scheme_re= '^' . ALPHA .'('. ALPHA_NUM ."\|". SCHEME_CHAR .')*$' ;
81	13	50				69	do { carp "url $url has illegal scheme: $scheme" unless $no_warn;
	3	100				6
82	3					8	return undef; } unless $scheme =~ m/$scheme_re/;
83							}
84
85	11					27	return 1;
86							}
87
88							=head2 untaint_url
89
90							Used in our CGI bin programs, untaint_url takes a scalar and returns
91							it untainted if and only if it's contains only valid url characters
92							and it is a valid url according to verify_url.
93
94							A fundamental assumption in using this function is that your software
95							can handle B which looks like a valid URL, even if it isn't
96							a valid url. E.g. C.
97
98							=cut
99
100							sub untaint_url {
101	2			2	1	14	my $url=shift;
102	2					3	my $re='^'. URIC .'+$';
103	2					44	my ($ret)= $url =~ m/($re)/;
104	2	100				6	defined $ret or do {
105							# $url =~ y/[A-Za-z0-9]/_/c;# clean url so we can print it out
106	1	50				4	warn "bad url passed to url_untaint" unless $no_warn;
107	1					2	return undef;
108							};
109	1	50				3	return undef unless verify_url($ret);
110	1					4	return $ret;
111							}
112
113							=head2 verify_fragment
114
115							Fragments have fairly free syntax but RFC 2396 says clearly they
116							should conform to the same character set as URIs. Unfortunately, it
117							seems that many people put spaces in their fragments in contradiction
118							with the RFC since it works in HTML in practice.
119
120							We choose not to accept those and people should be able to change over?
121
122							If it turns out, as it probably will, that there is a real need for
123							spaces in cross references to other people's documents which can't be
124							fixed then maybe we will have to reconsider.
125
126							=cut
127
128							sub verify_fragment ($) {
129	2			2	1	3	my $fragment=shift;
130	2	50				8	defined $fragment or return undef;
131	0					0	my $control=CONTROL;
132							# we don't print it out directly.. maybe we shouldn't even print out
133							# the warning.
134	0	0				0	do { carp "url $fragment contains control characters" unless $no_warn;
	0	0				0
135	0					0	return undef; } if $fragment =~ m/$control/;
136	0					0	my $exclude=EXCLUDED;
137	0					0	my $ex;
138	0	0				0	do { carp "url $fragment contains excluded character: $ex" unless $no_warn;
	0	0				0
139	0					0	return undef; } if ($ex) = $fragment =~ m/($exclude)/;
140
141	0					0	return 1;
142							}
143
144							sub extract_fragment {
145	2			2	0	3	my $link=shift;
146	2					10	my ($url,$fragment)= $link =~ m/([^#])(?:#(.))?/;
147	2	50				6	$::verbose & 16 and do {
148	0	0				0	print STDERR "URL is $url and fragment is $fragment\n"
149							if defined $fragment;
150	0	0				0	print STDERR "URL is $url no fragment\n" unless defined $fragment;
151							};
152	2					5	return $url,$fragment;
153							}
154
155							sub fixup_link_url ($$) {
156	2			2	0	21	my $link=shift;
157	2					4	my $base=shift;
158	2	50				6	croak "usage link_url(,)" unless defined $link;
159
160	2					7	my ($url,$fragment)=extract_fragment($link);
161
162	2	50				5	unless (verify_url($url)) {
163	0	0				0	warn "dropping url: $url" unless $no_warn;
164	0					0	return undef;
165							};
166
167	2	50				5	unless (verify_fragment($fragment)) {
168	2	50				5	warn "dropping illegal fragment: $fragment for url $url" if defined $fragment;
169	2					3	$fragment=undef;
170							};
171
172							$url =~ m,^(?:ftp\|gopher\|http\|https\|ldap\|rsync\|telnet):(?:[^/]\|.[^/]),
173	2	50				8	and do {
174	0					0	warn "ERROR: ignoring relative url with scheme $url";
175	0					0	return undef;
176							};
177
178	2					13	my $urlo=URI->new($url);
179	2					9055	my $aurlo=$urlo->abs($base);
180	2					553	my $ret_url;
181	2	100				7	if ( URI::eq($urlo,$aurlo) ) {
182	1					322	$ret_url = $url;
183							} else {
184	1					10	$ret_url=$aurlo->as_string();
185							}
186
187							$ret_url =~ m,^(?:ftp\|gopher\|http\|https\|ldap\|rsync\|telnet):(?:[^/]\|.[^/]),
188	2	50				26	and do {
189	0					0	warn "ERROR: abs(url) $url gave $ret_url";
190	0					0	return undef;
191							};
192
193	2	50	33			15	print STDERR "fixed up link name $url\n"
194							if $::verbose & 16 and defined $url;
195	2					36	return $ret_url;
196							}
197
198							99;