File Coverage

blib/lib/Mail/SpamAssassin/RegistryBoundaries.pm
Criterion Covered Total %
statement 85 90 94.4
branch 42 54 77.7
condition 4 15 26.6
subroutine 11 11 100.0
pod 3 5 60.0
total 145 175 82.8


line stmt bran cond sub pod time code
1             # The (extremely complex) rules for domain delegation.
2              
3             # <@LICENSE>
4             # Licensed to the Apache Software Foundation (ASF) under one or more
5             # contributor license agreements. See the NOTICE file distributed with
6             # this work for additional information regarding copyright ownership.
7             # The ASF licenses this file to you under the Apache License, Version 2.0
8             # (the "License"); you may not use this file except in compliance with
9             # the License. You may obtain a copy of the License at:
10             #
11             # http://www.apache.org/licenses/LICENSE-2.0
12             #
13             # Unless required by applicable law or agreed to in writing, software
14             # distributed under the License is distributed on an "AS IS" BASIS,
15             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16             # See the License for the specific language governing permissions and
17             # limitations under the License.
18             # </@LICENSE>
19              
20             =head1 NAME
21              
22             Mail::SpamAssassin::RegistryBoundaries - domain delegation rules
23              
24             =cut
25              
26             package Mail::SpamAssassin::RegistryBoundaries;
27              
28 40     40   322 use strict;
  40         96  
  40         1188  
29 40     40   246 use warnings;
  40         108  
  40         1331  
30             # use bytes;
31 40     40   269 use re 'taint';
  40         140  
  40         2279  
32              
33             our @ISA = qw();
34              
35 40     40   288 use Mail::SpamAssassin::Logger;
  40         117  
  40         2734  
36 40     40   320 use Mail::SpamAssassin::Constants qw(:ip);
  40         105  
  40         4854  
37 40     40   349 use Mail::SpamAssassin::Util qw(is_fqdn_valid);
  40         88  
  40         50099  
38              
39             my $IP_ADDRESS = IP_ADDRESS;
40              
41             # called from SpamAssassin->init() to create $self->{util_rb}
42             sub new {
43 90     90 0 257 my $class = shift;
44 90   33     655 $class = ref($class) || $class;
45              
46 90         283 my ($main) = @_;
47             my $self = {
48             'main' => $main,
49             'conf' => $main->{conf},
50 90         478 };
51 90         408 bless ($self, $class);
52              
53             # Initialize valid_tlds_re for schemeless uri parsing, FreeMail etc
54 90 50 33     744 if ($self->{conf}->{valid_tlds} && %{$self->{conf}->{valid_tlds}}) {
  90         489  
55             # International domain names are already in ASCII-compatible encoding (ACE)
56             my $tlds =
57             '(?<![a-zA-Z0-9-])(?:'. # make sure tld starts at boundary
58 90         256 join('|', keys %{$self->{conf}->{valid_tlds}}).
  90         43836  
59             ')(?!(?:[a-zA-Z0-9-]|\.[a-zA-Z0-9]))'; # make sure it ends
60             # Perl 5.10+ trie optimizes lists, no need for fancy regex optimizing
61 90 50       3989 if (eval { $self->{valid_tlds_re} = qr/$tlds/i; 1; }) {
  90         477512  
  90         6134  
62             dbg("config: registryboundaries: %d tlds loaded",
63 90         262 scalar keys %{$self->{conf}->{valid_tlds}});
  90         1362  
64             } else {
65 0         0 warn "config: registryboundaries: failed to compile valid_tlds_re: $@\n";
66 0         0 $self->{valid_tlds_re} = qr/no_tlds_defined/;
67             }
68             }
69             else {
70             # Failsafe in case no tlds defined, we don't want this to match everything..
71 0         0 $self->{valid_tlds_re} = qr/no_tlds_defined/;
72             warn "config: registryboundaries: no tlds defined, need to run sa-update\n"
73 0 0       0 if !$self->{main}->{ignore_site_cf_files};
74             }
75              
76 90         630 $self;
77             }
78              
79             # This is required because the .us domain is nuts. See split_domain.
80             our %US_STATES = qw(
81             ak 1 al 1 ar 1 az 1 ca 1 co 1 ct 1 dc 1 de 1 fl 1 ga 1 gu 1 hi 1 ia 1 id 1 il 1 in 1 ks 1 ky 1 la 1 ma 1 md 1 me 1 mi 1
82             mn 1 mo 1 ms 1 mt 1 nc 1 nd 1 ne 1 nh 1 nj 1 nm 1 nv 1 ny 1 oh 1 ok 1 or 1 pa 1 pr 1 ri 1 sc 1 sd 1 tn 1 tx 1 ut 1 va 1
83             vi 1 vt 1 wa 1 wi 1 wv 1 wy 1
84             );
85              
86             ###########################################################################
87              
88             =head1 METHODS
89              
90             =over 4
91              
92             =item ($hostname, $domain) = split_domain ($fqdn)
93              
94             Cut a fully-qualified hostname into the hostname part and the domain
95             part, splitting at the DNS registry boundary.
96              
97             Examples:
98              
99             "www.foo.com" => ( "www", "foo.com" )
100             "www.foo.co.uk" => ( "www", "foo.co.uk" )
101              
102             =cut
103              
104             sub split_domain {
105 424     424 1 570 my $self = shift;
106 424         747 my $domain = lc shift;
107              
108 424         638 my $hostname = '';
109              
110 424 50 33     1630 if (defined $domain && $domain ne '') {
111             # www..spamassassin.org -> www.spamassassin.org
112 424         839 $domain =~ tr/././s;
113              
114             # leading/trailing dots
115 424         817 $domain =~ s/^\.+//;
116 424         1000 $domain =~ s/\.+$//;
117              
118             # Split scalar domain into components
119 424         1078 my @domparts = split(/\./, $domain);
120 424         651 my @hostname;
121              
122 424         927 while (@domparts > 1) { # go until we find the TLD
123 678 100       1924 if (@domparts == 4) {
    100          
    100          
124 21 0 0     77 if ($domparts[3] eq 'us' &&
      33        
125             (($domparts[0] eq 'pvt' && $domparts[1] eq 'k12') ||
126             ($domparts[0] =~ /^c[io]$/)))
127             {
128             # http://www.neustar.us/policies/docs/rfc_1480.txt
129             # "Fire-Dept.CI.Los-Angeles.CA.US"
130             # "<school-name>.PVT.K12.<state>.US"
131 0 0       0 last if ($US_STATES{$domparts[2]});
132             }
133             }
134             elsif (@domparts == 3) {
135             # http://www.neustar.us/policies/docs/rfc_1480.txt
136             # demon.co.uk
137             # esc.edu.ar
138             # [^\.]+\.${US_STATES}\.us
139 227 100       502 if ($domparts[2] eq 'us') {
140 3 50       16 last if ($US_STATES{$domparts[1]});
141             }
142             else {
143 224         534 my $temp = join(".", @domparts);
144 224 100       759 last if ($self->{conf}->{three_level_domains}{$temp});
145             }
146             }
147             elsif (@domparts == 2) {
148             # co.uk, etc.
149 419         843 my $temp = join(".", @domparts);
150 419 100       1362 last if ($self->{conf}->{two_level_domains}{$temp});
151             }
152 659         1822 push(@hostname, shift @domparts);
153             }
154              
155             # Look for a sub-delegated TLD
156             # use @domparts to skip trying to match on TLDs that can't possibly
157             # match, but keep in mind that the hostname can be blank, so 4TLD needs 4,
158             # 3TLD needs 3, 2TLD needs 2 ...
159             #
160 424 100       1161 unshift @domparts, pop @hostname if @hostname;
161 424         907 $domain = join(".", @domparts);
162 424         965 $hostname = join(".", @hostname);
163             }
164              
165 424         1368 ($hostname, $domain);
166             }
167              
168             ###########################################################################
169              
170             =item $domain = trim_domain($fqdn)
171              
172             Cut a fully-qualified hostname into the hostname part and the domain
173             part, returning just the domain.
174              
175             Examples:
176              
177             "www.foo.com" => "foo.com"
178             "www.foo.co.uk" => "foo.co.uk"
179              
180             =cut
181              
182             sub trim_domain {
183 424     424 1 29045 my $self = shift;
184 424         617 my $domain = shift;
185              
186 424         995 my ($host, $dom) = $self->split_domain($domain);
187 424         1039 return $dom;
188             }
189              
190             ###########################################################################
191              
192             =item $ok = is_domain_valid($dom)
193              
194             Return C<1> if the domain is valid, C<undef> otherwise. A valid domain
195             (a) does not contain whitespace, (b) contains at least one dot, and (c)
196             uses a valid TLD or ccTLD.
197              
198             =back
199              
200             =cut
201              
202             sub is_domain_valid {
203 475     475 1 968 my ($self, $dom) = @_;
204              
205 475 50       922 return 0 unless defined $dom;
206              
207             # domains don't have whitespace
208 475 50       1195 return 0 if ($dom =~ /\s/);
209              
210             # ensure it ends in a known-valid TLD, and has at least 1 dot
211 475 100       1818 return 0 unless ($dom =~ /\.([^.]+)$/);
212 473 100       2117 return 0 unless ($self->{conf}->{valid_tlds}{lc $1});
213              
214 446         1133 return 1; # nah, it's ok.
215             }
216              
217             #
218              
219             sub uri_to_domain {
220 590     590 0 19439 my $self = shift;
221 590         1218 my $uri = lc shift;
222              
223             # Javascript is not going to help us, so return.
224             # Likewise ignore cid, file
225 590 100       1879 return if ($uri =~ /^(?:javascript|cid|file):/);
226              
227 586 100       1449 if ($uri =~ s/^mailto://) { # handle mailto: specially
228 38         84 $uri =~ s/\?.*//; # drop parameters ?subject= etc
229             # note above, Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
230             # uri_list_canonicalize should have made versions without ? &
231             # Keep testing with & here just in case..
232 38 100       118 return if $uri =~ /\@.*?\@/; # abort if multiple @
233 36 100       180 return unless $uri =~ s/.*@//; # drop username or abort
234             } else {
235 548         1977 $uri =~ s{^[a-z]+:/{0,2}}{}gs; # drop the protocol
236             # strip path, CGI params, fragment. note: bug 4213 shows that "&" should
237             # *not* be likewise stripped here -- it's permitted in hostnames by
238             # some common MUAs!
239 548         1337 $uri =~ s{[/?#].*}{}gs;
240 548         951 $uri =~ s{^[^/]*\@}{}gs; # drop username/passwd
241 548         973 $uri =~ s{:\d*$}{}gs; # port, bug 4191: sometimes the # is missing
242             }
243              
244             # skip undecoded URIs if the encoded bits shouldn't be.
245             # we'll see the decoded version as well. see url_encode()
246 581 100       1248 return if $uri =~ /\%(?:2[1-9a-f]|[3-6][0-9a-f]|7[0-9a-e])/;
247              
248 580         907 my $host = $uri; # unstripped/full domain name
249 580         771 my $domain = $host;
250              
251             # keep IPs intact
252 580 100       8774 if ($host !~ /^$IP_ADDRESS$/) {
253             # check that it's a valid hostname/fqdn
254 528 100       1609 return unless is_fqdn_valid($host);
255             # ignore invalid TLDs
256 441 100       1113 return unless $self->is_domain_valid($host);
257             # get rid of hostname part of domain, understanding delegation
258 415         880 $domain = $self->trim_domain($host);
259             }
260            
261             # $uri is now the domain only, optionally return unstripped host name
262 467 100       2239 return !wantarray ? $domain : ($domain, $host);
263             }
264              
265             1;
266