| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # <@LICENSE> | 
| 2 |  |  |  |  |  |  | # Licensed to the Apache Software Foundation (ASF) under one or more | 
| 3 |  |  |  |  |  |  | # contributor license agreements.  See the NOTICE file distributed with | 
| 4 |  |  |  |  |  |  | # this work for additional information regarding copyright ownership. | 
| 5 |  |  |  |  |  |  | # The ASF licenses this file to you under the Apache License, Version 2.0 | 
| 6 |  |  |  |  |  |  | # (the "License"); you may not use this file except in compliance with | 
| 7 |  |  |  |  |  |  | # the License.  You may obtain a copy of the License at: | 
| 8 |  |  |  |  |  |  | # | 
| 9 |  |  |  |  |  |  | #     http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  |  |  |  |  |  | # | 
| 11 |  |  |  |  |  |  | # Unless required by applicable law or agreed to in writing, software | 
| 12 |  |  |  |  |  |  | # distributed under the License is distributed on an "AS IS" BASIS, | 
| 13 |  |  |  |  |  |  | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
| 14 |  |  |  |  |  |  | # See the License for the specific language governing permissions and | 
| 15 |  |  |  |  |  |  | # limitations under the License. | 
| 16 |  |  |  |  |  |  | # </@LICENSE> | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | # HTML decoding TODOs | 
| 19 |  |  |  |  |  |  | # - add URIs to list for faster URI testing | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | package Mail::SpamAssassin::HTML; | 
| 22 |  |  |  |  |  |  |  | 
| 23 | 40 |  |  | 40 |  | 243 | use strict; | 
|  | 40 |  |  |  |  | 92 |  | 
|  | 40 |  |  |  |  | 1252 |  | 
| 24 | 40 |  |  | 40 |  | 232 | use warnings; | 
|  | 40 |  |  |  |  | 93 |  | 
|  | 40 |  |  |  |  | 1264 |  | 
| 25 | 40 |  |  | 40 |  | 218 | use re 'taint'; | 
|  | 40 |  |  |  |  | 96 |  | 
|  | 40 |  |  |  |  | 1820 |  | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | require 5.008;     # need basic Unicode support for HTML::Parser::utf8_mode | 
| 28 |  |  |  |  |  |  | # require 5.008008;  # Bug 3787; [perl #37950]: Malformed UTF-8 character ... | 
| 29 |  |  |  |  |  |  |  | 
| 30 | 40 |  |  | 40 |  | 22213 | use HTML::Parser 3.43 (); | 
|  | 40 |  |  |  |  | 226256 |  | 
|  | 40 |  |  |  |  | 1175 |  | 
| 31 | 40 |  |  | 40 |  | 308 | use Mail::SpamAssassin::Logger; | 
|  | 40 |  |  |  |  | 80 |  | 
|  | 40 |  |  |  |  | 2461 |  | 
| 32 | 40 |  |  | 40 |  | 254 | use Mail::SpamAssassin::Constants qw(:sa); | 
|  | 40 |  |  |  |  | 82 |  | 
|  | 40 |  |  |  |  | 4630 |  | 
| 33 | 40 |  |  | 40 |  | 272 | use Mail::SpamAssassin::Util qw(untaint_var); | 
|  | 40 |  |  |  |  | 84 |  | 
|  | 40 |  |  |  |  | 246941 |  | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | our @ISA = qw(HTML::Parser); | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | # elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!) | 
| 38 |  |  |  |  |  |  | # does not include XML | 
| 39 |  |  |  |  |  |  | my %elements = map {; $_ => 1 } | 
| 40 |  |  |  |  |  |  | # strict | 
| 41 |  |  |  |  |  |  | qw( a abbr acronym address area b base bdo big blockquote body br button caption cite code col colgroup dd del dfn div dl dt em fieldset form h1 h2 h3 h4 h5 h6 head hr html i img input ins kbd label legend li link map meta noscript object ol optgroup option p param pre q samp script select small span strong style sub sup table tbody td textarea tfoot th thead title tr tt ul var ), | 
| 42 |  |  |  |  |  |  | # loose | 
| 43 |  |  |  |  |  |  | qw( applet basefont center dir font frame frameset iframe isindex menu noframes s strike u ), | 
| 44 |  |  |  |  |  |  | # non-standard tags | 
| 45 |  |  |  |  |  |  | qw( nobr x-sigsep x-tab ), | 
| 46 |  |  |  |  |  |  | ; | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | # elements that we want to render, but not count as valid | 
| 49 |  |  |  |  |  |  | my %tricks = map {; $_ => 1 } | 
| 50 |  |  |  |  |  |  | # non-standard and non-valid tags | 
| 51 |  |  |  |  |  |  | qw( bgsound embed listing plaintext xmp ), | 
| 52 |  |  |  |  |  |  | # other non-standard tags handled in popfile | 
| 53 |  |  |  |  |  |  | #   blink ilayer multicol noembed nolayer spacer wbr | 
| 54 |  |  |  |  |  |  | ; | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | # elements that change text style | 
| 57 |  |  |  |  |  |  | my %elements_text_style = map {; $_ => 1 } | 
| 58 |  |  |  |  |  |  | qw( body font table tr th td big small basefont marquee span p div a ), | 
| 59 |  |  |  |  |  |  | ; | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | # elements that insert whitespace | 
| 62 |  |  |  |  |  |  | my %elements_whitespace = map {; $_ => 1 } | 
| 63 |  |  |  |  |  |  | qw( br div li th td dt dd p hr blockquote pre embed listing plaintext xmp title | 
| 64 |  |  |  |  |  |  | h1 h2 h3 h4 h5 h6 ), | 
| 65 |  |  |  |  |  |  | ; | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | # elements that push URIs | 
| 68 |  |  |  |  |  |  | my %elements_uri = map {; $_ => 1 } | 
| 69 |  |  |  |  |  |  | qw( body table tr td a area link img frame iframe embed script form base bgsound ), | 
| 70 |  |  |  |  |  |  | ; | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | # style attribute not accepted | 
| 73 |  |  |  |  |  |  | #my %elements_no_style = map {; $_ => 1 } | 
| 74 |  |  |  |  |  |  | #  qw( base basefont head html meta param script style title ), | 
| 75 |  |  |  |  |  |  | #; | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | # permitted element attributes | 
| 78 |  |  |  |  |  |  | my %ok_attributes; | 
| 79 |  |  |  |  |  |  | $ok_attributes{basefont}{$_} = 1 for qw( color face size ); | 
| 80 |  |  |  |  |  |  | $ok_attributes{body}{$_} = 1 for qw( text bgcolor link alink vlink background ); | 
| 81 |  |  |  |  |  |  | $ok_attributes{font}{$_} = 1 for qw( color face size ); | 
| 82 |  |  |  |  |  |  | $ok_attributes{marquee}{$_} = 1 for qw( bgcolor background ); | 
| 83 |  |  |  |  |  |  | $ok_attributes{table}{$_} = 1 for qw( bgcolor style ); | 
| 84 |  |  |  |  |  |  | $ok_attributes{td}{$_} = 1 for qw( bgcolor style ); | 
| 85 |  |  |  |  |  |  | $ok_attributes{th}{$_} = 1 for qw( bgcolor style ); | 
| 86 |  |  |  |  |  |  | $ok_attributes{tr}{$_} = 1 for qw( bgcolor style ); | 
| 87 |  |  |  |  |  |  | $ok_attributes{span}{$_} = 1 for qw( style ); | 
| 88 |  |  |  |  |  |  | $ok_attributes{p}{$_} = 1 for qw( style ); | 
| 89 |  |  |  |  |  |  | $ok_attributes{div}{$_} = 1 for qw( style ); | 
| 90 |  |  |  |  |  |  | $ok_attributes{a}{$_} = 1 for qw( style ); | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | sub new { | 
| 93 | 7 |  |  | 7 | 1 | 29 | my ($class, $character_semantics_input, $character_semantics_output) = @_; | 
| 94 | 7 |  |  |  |  | 294 | my $self = $class->SUPER::new( | 
| 95 |  |  |  |  |  |  | api_version => 3, | 
| 96 |  |  |  |  |  |  | handlers => [ | 
| 97 |  |  |  |  |  |  | start_document => ["html_start", "self"], | 
| 98 |  |  |  |  |  |  | start => ["html_tag", "self,tagname,attr,'+1'"], | 
| 99 |  |  |  |  |  |  | end_document => ["html_end", "self"], | 
| 100 |  |  |  |  |  |  | end => ["html_tag", "self,tagname,attr,'-1'"], | 
| 101 |  |  |  |  |  |  | text => ["html_text", "self,dtext"], | 
| 102 |  |  |  |  |  |  | comment => ["html_comment", "self,text"], | 
| 103 |  |  |  |  |  |  | declaration => ["html_declaration", "self,text"], | 
| 104 |  |  |  |  |  |  | ], | 
| 105 |  |  |  |  |  |  | marked_sections => 1); | 
| 106 | 7 |  |  |  |  | 1205 | $self->{SA_character_semantics_input} = $character_semantics_input; | 
| 107 |  |  |  |  |  |  | $self->{SA_encode_results} = | 
| 108 | 7 |  | 33 |  |  | 43 | $character_semantics_input && !$character_semantics_output; | 
| 109 | 7 |  |  |  |  | 27 | $self; | 
| 110 |  |  |  |  |  |  | } | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | sub html_start { | 
| 113 | 7 |  |  | 7 | 0 | 23 | my ($self) = @_; | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | # trigger HTML_MESSAGE | 
| 116 | 7 |  |  |  |  | 44 | $self->put_results(html => 1); | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  | # initial display attributes | 
| 119 | 7 |  |  |  |  | 39 | $self->{basefont} = 3; | 
| 120 |  |  |  |  |  |  | my %default = (tag => "default", | 
| 121 |  |  |  |  |  |  | fgcolor => "#000000", | 
| 122 |  |  |  |  |  |  | bgcolor => "#ffffff", | 
| 123 | 7 |  |  |  |  | 79 | size => $self->{basefont}); | 
| 124 | 7 |  |  |  |  | 19 | push @{ $self->{text_style} }, \%default; | 
|  | 7 |  |  |  |  | 124 |  | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | sub html_end { | 
| 128 | 7 |  |  | 7 | 0 | 28 | my ($self) = @_; | 
| 129 |  |  |  |  |  |  |  | 
| 130 | 7 |  |  |  |  | 35 | delete $self->{text_style}; | 
| 131 |  |  |  |  |  |  |  | 
| 132 | 7 |  |  |  |  | 19 | my @uri; | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | # add the canonicalized version of each uri to the detail list | 
| 135 | 7 | 50 |  |  |  | 31 | if (defined $self->{uri}) { | 
| 136 | 7 |  |  |  |  | 24 | @uri = keys %{$self->{uri}}; | 
|  | 7 |  |  |  |  | 55 |  | 
| 137 |  |  |  |  |  |  | } | 
| 138 |  |  |  |  |  |  |  | 
| 139 |  |  |  |  |  |  | # these keep backward compatibility, albeit a little wasteful | 
| 140 | 7 |  |  |  |  | 66 | $self->put_results(uri => \@uri); | 
| 141 | 7 |  |  |  |  | 31 | $self->put_results(anchor => $self->{anchor}); | 
| 142 |  |  |  |  |  |  |  | 
| 143 | 7 |  |  |  |  | 171 | $self->put_results(uri_detail => $self->{uri}); | 
| 144 | 7 |  |  |  |  | 168 | $self->put_results(uri_truncated => $self->{uri_truncated}); | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | # final results scalars | 
| 147 | 7 |  |  |  |  | 32 | $self->put_results(image_area => $self->{image_area}); | 
| 148 | 7 |  |  |  |  | 46 | $self->put_results(length => $self->{length}); | 
| 149 | 7 |  |  |  |  | 25 | $self->put_results(min_size => $self->{min_size}); | 
| 150 | 7 |  |  |  |  | 209 | $self->put_results(max_size => $self->{max_size}); | 
| 151 | 7 | 50 |  |  |  | 38 | if (exists $self->{tags}) { | 
| 152 |  |  |  |  |  |  | $self->put_results(closed_extra_ratio => | 
| 153 | 7 |  |  |  |  | 39 | ($self->{closed_extra} / $self->{tags})); | 
| 154 |  |  |  |  |  |  | } | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | # final result arrays | 
| 157 | 7 |  |  |  |  | 37 | $self->put_results(comment => $self->{comment}); | 
| 158 | 7 |  |  |  |  | 34 | $self->put_results(script => $self->{script}); | 
| 159 | 7 |  |  |  |  | 47 | $self->put_results(title => $self->{title}); | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | # final result hashes | 
| 162 | 7 |  |  |  |  | 31 | $self->put_results(inside => $self->{inside}); | 
| 163 |  |  |  |  |  |  |  | 
| 164 |  |  |  |  |  |  | # end-of-document result values that don't require looking at the text | 
| 165 | 7 | 50 |  |  |  | 27 | if (exists $self->{backhair}) { | 
| 166 | 0 |  |  |  |  | 0 | $self->put_results(backhair_count => scalar keys %{ $self->{backhair} }); | 
|  | 0 |  |  |  |  | 0 |  | 
| 167 |  |  |  |  |  |  | } | 
| 168 | 7 | 50 | 33 |  |  | 65 | if (exists $self->{elements} && exists $self->{tags}) { | 
| 169 |  |  |  |  |  |  | $self->put_results(bad_tag_ratio => | 
| 170 | 7 |  |  |  |  | 38 | ($self->{tags} - $self->{elements}) / $self->{tags}); | 
| 171 |  |  |  |  |  |  | } | 
| 172 | 7 | 50 | 33 |  |  | 81 | if (exists $self->{elements_seen} && exists $self->{tags_seen}) { | 
| 173 |  |  |  |  |  |  | $self->put_results(non_element_ratio => | 
| 174 |  |  |  |  |  |  | ($self->{tags_seen} - $self->{elements_seen}) / | 
| 175 | 7 |  |  |  |  | 36 | $self->{tags_seen}); | 
| 176 |  |  |  |  |  |  | } | 
| 177 | 7 | 50 | 33 |  |  | 80 | if (exists $self->{tags} && exists $self->{obfuscation}) { | 
| 178 |  |  |  |  |  |  | $self->put_results(obfuscation_ratio => | 
| 179 | 0 |  |  |  |  | 0 | $self->{obfuscation} / $self->{tags}); | 
| 180 |  |  |  |  |  |  | } | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | sub put_results { | 
| 184 | 136 |  |  | 136 | 0 | 173 | my $self = shift; | 
| 185 | 136 |  |  |  |  | 320 | my %results = @_; | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 136 |  |  |  |  | 356 | while (my ($k, $v) = each %results) { | 
| 188 | 136 |  |  |  |  | 483 | $self->{results}{$k} = $v; | 
| 189 |  |  |  |  |  |  | } | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | sub get_results { | 
| 193 | 7 |  |  | 7 | 0 | 19 | my ($self) = @_; | 
| 194 |  |  |  |  |  |  |  | 
| 195 | 7 |  |  |  |  | 23 | return $self->{results}; | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  | sub get_rendered_text { | 
| 199 | 21 |  |  | 21 | 0 | 38 | my $self = shift; | 
| 200 | 21 |  |  |  |  | 57 | my %options = @_; | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 21 | 100 |  |  |  | 55 | return join('', @{ $self->{text} }) unless %options; | 
|  | 7 |  |  |  |  | 135 |  | 
| 203 |  |  |  |  |  |  |  | 
| 204 | 14 |  |  |  |  | 31 | my $mask; | 
| 205 | 14 |  |  |  |  | 71 | while (my ($k, $v) = each %options) { | 
| 206 | 14 | 50 |  |  |  | 66 | next if !defined $self->{"text_$k"}; | 
| 207 | 14 | 50 |  |  |  | 44 | if (!defined $mask) { | 
| 208 | 14 | 100 |  |  |  | 149 | $mask |= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"}; | 
| 209 |  |  |  |  |  |  | } | 
| 210 |  |  |  |  |  |  | else { | 
| 211 | 0 | 0 |  |  |  | 0 | $mask &= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"}; | 
| 212 |  |  |  |  |  |  | } | 
| 213 |  |  |  |  |  |  | } | 
| 214 |  |  |  |  |  |  |  | 
| 215 | 14 |  |  |  |  | 45 | my $text = ''; | 
| 216 | 14 |  |  |  |  | 26 | my $i = 0; | 
| 217 | 14 | 100 |  |  |  | 21 | for (@{ $self->{text} }) { $text .= $_ if vec($mask, $i++, 1); } | 
|  | 14 |  |  |  |  | 42 |  | 
|  | 1372 |  |  |  |  | 2008 |  | 
| 218 | 14 |  |  |  |  | 91 | return $text; | 
| 219 |  |  |  |  |  |  | } | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | sub parse { | 
| 222 | 7 |  |  | 7 | 1 | 28 | my ($self, $text) = @_; | 
| 223 |  |  |  |  |  |  |  | 
| 224 | 7 |  |  |  |  | 23 | $self->{image_area} = 0; | 
| 225 | 7 |  |  |  |  | 23 | $self->{title_index} = -1; | 
| 226 | 7 |  |  |  |  | 14 | $self->{max_size} = 3;	# start at default size | 
| 227 | 7 |  |  |  |  | 16 | $self->{min_size} = 3;	# start at default size | 
| 228 | 7 |  |  |  |  | 22 | $self->{closed_html} = 0; | 
| 229 | 7 |  |  |  |  | 16 | $self->{closed_body} = 0; | 
| 230 | 7 |  |  |  |  | 18 | $self->{closed_extra} = 0; | 
| 231 | 7 |  |  |  |  | 16 | $self->{text} = [];		# rendered text | 
| 232 | 7 |  |  |  |  | 51 | $self->{length} += untaint_var(length($text)); | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | # NOTE: We *only* need to fix the rendering when we verify that it | 
| 235 |  |  |  |  |  |  | # differs from what people see in their MUA.  Testing is best done with | 
| 236 |  |  |  |  |  |  | # the most common MUAs and browsers, if you catch my drift. | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | # NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we | 
| 239 |  |  |  |  |  |  | # don't need to fix them here. | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | # # (outdated claim) HTML::Parser converts   into a question mark ("?") | 
| 242 |  |  |  |  |  |  | # # for some reason, so convert them to spaces.  Confirmed in 3.31, at least. | 
| 243 |  |  |  |  |  |  | # ... Actually it doesn't, it is correctly coverted into Unicode NBSP, | 
| 244 |  |  |  |  |  |  | # nevertheless it does not hurt to treat it as a space. | 
| 245 | 7 |  |  |  |  | 49 | $text =~ s/ / /g; | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | # bug 4695: we want "<br/>" to be treated the same as "<br>", and | 
| 248 |  |  |  |  |  |  | # the HTML::Parser API won't do it for us | 
| 249 | 7 |  |  |  |  | 205 | $text =~ s/<(\w+)\s*\/>/<$1>/gi; | 
| 250 |  |  |  |  |  |  |  | 
| 251 | 7 | 50 |  |  |  | 101 | if (!$self->UNIVERSAL::can('utf8_mode')) { | 
| 252 |  |  |  |  |  |  | # utf8_mode is cleared by default, only warn if it would need to be set | 
| 253 |  |  |  |  |  |  | warn "message: cannot set utf8_mode, module HTML::Parser is too old\n" | 
| 254 | 0 | 0 |  |  |  | 0 | if !$self->{SA_character_semantics_input}; | 
| 255 |  |  |  |  |  |  | } else { | 
| 256 | 7 | 50 |  |  |  | 50 | $self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1); | 
| 257 | 7 |  |  |  |  | 25 | my $utf8_mode = $self->SUPER::utf8_mode; | 
| 258 | 7 | 50 |  |  |  | 65 | dbg("message: HTML::Parser utf8_mode %s", | 
| 259 |  |  |  |  |  |  | $utf8_mode ? "on (assumed UTF-8 octets)" | 
| 260 |  |  |  |  |  |  | : "off (default, assumed Unicode characters)"); | 
| 261 |  |  |  |  |  |  | } | 
| 262 | 7 |  |  |  |  | 87 | $self->SUPER::parse($text); | 
| 263 |  |  |  |  |  |  |  | 
| 264 |  |  |  |  |  |  | # bug 7437: deal gracefully with HTML::Parser misbehavior on unclosed <style> and <script> tags | 
| 265 |  |  |  |  |  |  | # (typically from not passing the entire message to spamc, but possibly a DoS attack) | 
| 266 | 7 |  | 66 |  |  | 61 | $self->SUPER::parse("</style>") while exists $self->{inside}{style} && $self->{inside}{style} > 0; | 
| 267 | 7 |  | 33 |  |  | 49 | $self->SUPER::parse("</script>") while exists $self->{inside}{script} && $self->{inside}{script} > 0; | 
| 268 |  |  |  |  |  |  |  | 
| 269 | 7 |  |  |  |  | 116 | $self->SUPER::eof; | 
| 270 |  |  |  |  |  |  |  | 
| 271 | 7 |  |  |  |  | 35 | return $self->{text}; | 
| 272 |  |  |  |  |  |  | } | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | sub html_tag { | 
| 275 | 824 |  |  | 824 | 0 | 1662 | my ($self, $tag, $attr, $num) = @_; | 
| 276 | 824 | 50 |  |  |  | 1424 | utf8::encode($tag) if $self->{SA_encode_results}; | 
| 277 |  |  |  |  |  |  |  | 
| 278 | 824 |  |  |  |  | 1402 | my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@); | 
| 279 |  |  |  |  |  |  |  | 
| 280 | 824 | 50 | 33 |  |  | 1611 | if (exists $elements{$tag} || $maybe_namespace) { | 
| 281 | 824 |  |  |  |  | 1027 | $self->{elements}++; | 
| 282 | 824 | 100 |  |  |  | 1417 | $self->{elements_seen}++ if !exists $self->{inside}{$tag}; | 
| 283 |  |  |  |  |  |  | } | 
| 284 | 824 |  |  |  |  | 967 | $self->{tags}++; | 
| 285 | 824 | 100 |  |  |  | 1364 | $self->{tags_seen}++ if !exists $self->{inside}{$tag}; | 
| 286 | 824 |  |  |  |  | 1458 | $self->{inside}{$tag} += $num; | 
| 287 | 824 | 100 |  |  |  | 1351 | if ($self->{inside}{$tag} < 0) { | 
| 288 | 17 |  |  |  |  | 46 | $self->{inside}{$tag} = 0; | 
| 289 | 17 |  |  |  |  | 67 | $self->{closed_extra}++; | 
| 290 |  |  |  |  |  |  | } | 
| 291 |  |  |  |  |  |  |  | 
| 292 | 824 | 50 |  |  |  | 1154 | return if $maybe_namespace; | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | # ignore non-elements | 
| 295 | 824 | 50 | 33 |  |  | 1635 | if (exists $elements{$tag} || exists $tricks{$tag}) { | 
| 296 | 824 | 100 |  |  |  | 1951 | $self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag}; | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | # bug 5009: things like <p> and </p> both need dealing with | 
| 299 | 824 | 100 |  |  |  | 1906 | $self->html_whitespace($tag) if exists $elements_whitespace{$tag}; | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | # start tags | 
| 302 | 824 | 100 |  |  |  | 1628 | if ($num == 1) { | 
| 303 | 464 | 100 |  |  |  | 960 | $self->html_uri($tag, $attr) if exists $elements_uri{$tag}; | 
| 304 | 464 |  |  |  |  | 1013 | $self->html_tests($tag, $attr, $num); | 
| 305 |  |  |  |  |  |  | } | 
| 306 |  |  |  |  |  |  | # end tags | 
| 307 |  |  |  |  |  |  | else { | 
| 308 | 360 | 100 |  |  |  | 562 | $self->{closed_html} = 1 if $tag eq "html"; | 
| 309 | 360 | 100 |  |  |  | 1918 | $self->{closed_body} = 1 if $tag eq "body"; | 
| 310 |  |  |  |  |  |  | } | 
| 311 |  |  |  |  |  |  | } | 
| 312 |  |  |  |  |  |  | } | 
| 313 |  |  |  |  |  |  |  | 
| 314 |  |  |  |  |  |  | sub html_whitespace { | 
| 315 | 202 |  |  | 202 | 0 | 324 | my ($self, $tag) = @_; | 
| 316 |  |  |  |  |  |  |  | 
| 317 |  |  |  |  |  |  | # ordered by frequency of tag groups, note: whitespace is always "visible" | 
| 318 | 202 | 100 | 100 |  |  | 902 | if ($tag eq "br" || $tag eq "div") { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 319 | 93 |  |  |  |  | 179 | $self->display_text("\n", whitespace => 1); | 
| 320 |  |  |  |  |  |  | } | 
| 321 |  |  |  |  |  |  | elsif ($tag =~ /^(?:li|t[hd]|d[td]|embed|h\d)$/) { | 
| 322 | 15 |  |  |  |  | 26 | $self->display_text(" ", whitespace => 1); | 
| 323 |  |  |  |  |  |  | } | 
| 324 |  |  |  |  |  |  | elsif ($tag =~ /^(?:p|hr|blockquote|pre|listing|plaintext|xmp|title)$/) { | 
| 325 | 94 |  |  |  |  | 208 | $self->display_text("\n\n", whitespace => 1); | 
| 326 |  |  |  |  |  |  | } | 
| 327 |  |  |  |  |  |  | } | 
| 328 |  |  |  |  |  |  |  | 
| 329 |  |  |  |  |  |  | # puts the uri onto the internal array | 
| 330 |  |  |  |  |  |  | # note: uri may be blank (<a href=""></a> obfuscation, etc.) | 
| 331 |  |  |  |  |  |  | sub push_uri { | 
| 332 | 131 |  |  | 131 | 0 | 207 | my ($self, $type, $uri) = @_; | 
| 333 |  |  |  |  |  |  |  | 
| 334 | 131 |  |  |  |  | 185 | $uri = $self->canon_uri($uri); | 
| 335 | 131 | 50 |  |  |  | 241 | utf8::encode($uri) if $self->{SA_encode_results}; | 
| 336 |  |  |  |  |  |  |  | 
| 337 | 131 |  | 50 |  |  | 460 | my $target = target_uri($self->{base_href} || "", $uri); | 
| 338 |  |  |  |  |  |  |  | 
| 339 |  |  |  |  |  |  | # skip things like <iframe src="" ...> | 
| 340 | 131 | 50 |  |  |  | 589 | $self->{uri}->{$uri}->{types}->{$type} = 1  if $uri ne ''; | 
| 341 |  |  |  |  |  |  | } | 
| 342 |  |  |  |  |  |  |  | 
| 343 |  |  |  |  |  |  | sub canon_uri { | 
| 344 | 249 |  |  | 249 | 0 | 334 | my ($self, $uri) = @_; | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | # URIs don't have leading/trailing whitespace ... | 
| 347 | 249 |  |  |  |  | 458 | $uri =~ s/^\s+//; | 
| 348 | 249 |  |  |  |  | 375 | $uri =~ s/\s+$//; | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | # Make sure all the URIs are nice and short | 
| 351 | 249 | 50 |  |  |  | 441 | if (length $uri > MAX_URI_LENGTH) { | 
| 352 | 0 |  |  |  |  | 0 | $self->{'uri_truncated'} = 1; | 
| 353 | 0 |  |  |  |  | 0 | $uri = substr $uri, 0, MAX_URI_LENGTH; | 
| 354 |  |  |  |  |  |  | } | 
| 355 |  |  |  |  |  |  |  | 
| 356 | 249 |  |  |  |  | 425 | return $uri; | 
| 357 |  |  |  |  |  |  | } | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  | sub html_uri { | 
| 360 | 149 |  |  | 149 | 0 | 267 | my ($self, $tag, $attr) = @_; | 
| 361 |  |  |  |  |  |  |  | 
| 362 |  |  |  |  |  |  | # ordered by frequency of tag groups | 
| 363 | 149 | 100 |  |  |  | 588 | if ($tag =~ /^(?:body|table|tr|td)$/) { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 364 | 24 | 100 |  |  |  | 65 | if (defined $attr->{background}) { | 
| 365 | 6 |  |  |  |  | 13 | $self->push_uri($tag, $attr->{background}); | 
| 366 |  |  |  |  |  |  | } | 
| 367 |  |  |  |  |  |  | } | 
| 368 |  |  |  |  |  |  | elsif ($tag =~ /^(?:a|area|link)$/) { | 
| 369 | 118 | 50 |  |  |  | 245 | if (defined $attr->{href}) { | 
| 370 | 118 |  |  |  |  | 240 | $self->push_uri($tag, $attr->{href}); | 
| 371 |  |  |  |  |  |  | } | 
| 372 |  |  |  |  |  |  | } | 
| 373 |  |  |  |  |  |  | elsif ($tag =~ /^(?:img|frame|iframe|embed|script|bgsound)$/) { | 
| 374 | 7 | 50 |  |  |  | 14 | if (defined $attr->{src}) { | 
| 375 | 7 |  |  |  |  | 11 | $self->push_uri($tag, $attr->{src}); | 
| 376 |  |  |  |  |  |  | } | 
| 377 |  |  |  |  |  |  | } | 
| 378 |  |  |  |  |  |  | elsif ($tag eq "form") { | 
| 379 | 0 | 0 |  |  |  | 0 | if (defined $attr->{action}) { | 
| 380 | 0 |  |  |  |  | 0 | $self->push_uri($tag, $attr->{action}); | 
| 381 |  |  |  |  |  |  | } | 
| 382 |  |  |  |  |  |  | } | 
| 383 |  |  |  |  |  |  | elsif ($tag eq "base") { | 
| 384 | 0 | 0 |  |  |  | 0 | if (my $uri = $attr->{href}) { | 
| 385 | 0 |  |  |  |  | 0 | $uri = $self->canon_uri($uri); | 
| 386 |  |  |  |  |  |  |  | 
| 387 |  |  |  |  |  |  | # use <BASE HREF="URI"> to turn relative links into absolute links | 
| 388 |  |  |  |  |  |  |  | 
| 389 |  |  |  |  |  |  | # even if it is a base URI, handle like a normal URI as well | 
| 390 | 0 |  |  |  |  | 0 | $self->push_uri($tag, $uri); | 
| 391 |  |  |  |  |  |  |  | 
| 392 |  |  |  |  |  |  | # a base URI will be ignored by browsers unless it is an absolute | 
| 393 |  |  |  |  |  |  | # URI of a standard protocol | 
| 394 | 0 | 0 |  |  |  | 0 | if ($uri =~ m@^(?:https?|ftp):/{0,2}@i) { | 
| 395 |  |  |  |  |  |  | # remove trailing filename, if any; base URIs can have the | 
| 396 |  |  |  |  |  |  | # form of "http://foo.com/index.html" | 
| 397 | 0 |  |  |  |  | 0 | $uri =~ s@^([a-z]+:/{0,2}[^/]+/.*?)[^/\.]+\.[^/\.]{2,4}$@$1@i; | 
| 398 |  |  |  |  |  |  |  | 
| 399 |  |  |  |  |  |  | # Make sure it ends in a slash | 
| 400 | 0 | 0 |  |  |  | 0 | $uri .= "/" unless $uri =~ m@/$@; | 
| 401 | 0 | 0 |  |  |  | 0 | utf8::encode($uri) if $self->{SA_encode_results}; | 
| 402 | 0 |  |  |  |  | 0 | $self->{base_href} = $uri; | 
| 403 |  |  |  |  |  |  | } | 
| 404 |  |  |  |  |  |  | } | 
| 405 |  |  |  |  |  |  | } | 
| 406 |  |  |  |  |  |  | } | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | # this might not be quite right, may need to pay attention to table nesting | 
| 409 |  |  |  |  |  |  | sub close_table_tag { | 
| 410 | 15 |  |  | 15 | 0 | 18 | my ($self, $tag) = @_; | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  | # don't close if never opened | 
| 413 | 15 | 50 |  |  |  | 16 | return unless grep { $_->{tag} eq $tag } @{ $self->{text_style} }; | 
|  | 66 |  |  |  |  | 107 |  | 
|  | 15 |  |  |  |  | 22 |  | 
| 414 |  |  |  |  |  |  |  | 
| 415 | 0 |  |  |  |  | 0 | my $top; | 
| 416 | 0 |  | 0 |  |  | 0 | while (@{ $self->{text_style} } && ($top = $self->{text_style}[-1]->{tag})) { | 
|  | 0 |  |  |  |  | 0 |  | 
| 417 | 0 | 0 | 0 |  |  | 0 | if (($tag eq "td" && ($top eq "font" || $top eq "td")) || | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 418 |  |  |  |  |  |  | ($tag eq "tr" && $top =~ /^(?:font|td|tr)$/)) | 
| 419 |  |  |  |  |  |  | { | 
| 420 | 0 |  |  |  |  | 0 | pop @{ $self->{text_style} }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 421 |  |  |  |  |  |  | } | 
| 422 |  |  |  |  |  |  | else { | 
| 423 | 0 |  |  |  |  | 0 | last; | 
| 424 |  |  |  |  |  |  | } | 
| 425 |  |  |  |  |  |  | } | 
| 426 |  |  |  |  |  |  | } | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | sub close_tag { | 
| 429 | 292 |  |  | 292 | 0 | 403 | my ($self, $tag) = @_; | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | # don't close if never opened | 
| 432 | 292 | 100 |  |  |  | 330 | return if !grep { $_->{tag} eq $tag } @{ $self->{text_style} }; | 
|  | 1099 |  |  |  |  | 1984 |  | 
|  | 292 |  |  |  |  | 503 |  | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | # close everything up to and including tag | 
| 435 | 277 |  |  |  |  | 359 | while (my %current = %{ pop @{ $self->{text_style} } }) { | 
|  | 288 |  |  |  |  | 285 |  | 
|  | 288 |  |  |  |  | 1428 |  | 
| 436 | 288 | 100 |  |  |  | 927 | last if $current{tag} eq $tag; | 
| 437 |  |  |  |  |  |  | } | 
| 438 |  |  |  |  |  |  | } | 
| 439 |  |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | sub text_style { | 
| 441 | 604 |  |  | 604 | 0 | 944 | my ($self, $tag, $attr, $num) = @_; | 
| 442 |  |  |  |  |  |  |  | 
| 443 |  |  |  |  |  |  | # treat <th> as <td> | 
| 444 | 604 | 50 |  |  |  | 936 | $tag = "td" if $tag eq "th"; | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | # open | 
| 447 | 604 | 100 |  |  |  | 921 | if ($num == 1) { | 
| 448 |  |  |  |  |  |  | # HTML browsers generally only use first <body> for colors, | 
| 449 |  |  |  |  |  |  | # so only push if we haven't seen a body tag yet | 
| 450 | 304 | 100 |  |  |  | 480 | if ($tag eq "body") { | 
| 451 |  |  |  |  |  |  | # TODO: skip if we've already seen body | 
| 452 |  |  |  |  |  |  | } | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | # change basefont (only change size) | 
| 455 | 304 | 0 | 33 |  |  | 423 | if ($tag eq "basefont" && | 
|  |  |  | 0 |  |  |  |  | 
| 456 |  |  |  |  |  |  | exists $attr->{size} && $attr->{size} =~ /^\s*(\d+)/) | 
| 457 |  |  |  |  |  |  | { | 
| 458 | 0 |  |  |  |  | 0 | $self->{basefont} = $1; | 
| 459 | 0 |  |  |  |  | 0 | return; | 
| 460 |  |  |  |  |  |  | } | 
| 461 |  |  |  |  |  |  |  | 
| 462 |  |  |  |  |  |  | # close elements with optional end tags | 
| 463 | 304 | 100 | 100 |  |  | 917 | $self->close_table_tag($tag) if ($tag eq "td" || $tag eq "tr"); | 
| 464 |  |  |  |  |  |  |  | 
| 465 |  |  |  |  |  |  | # copy current text state | 
| 466 | 304 |  |  |  |  | 354 | my %new = %{ $self->{text_style}[-1] }; | 
|  | 304 |  |  |  |  | 1227 |  | 
| 467 |  |  |  |  |  |  |  | 
| 468 |  |  |  |  |  |  | # change tag name! | 
| 469 | 304 |  |  |  |  | 550 | $new{tag} = $tag; | 
| 470 |  |  |  |  |  |  |  | 
| 471 |  |  |  |  |  |  | # big and small tags | 
| 472 | 304 | 50 |  |  |  | 782 | if ($tag eq "big") { | 
| 473 | 0 |  |  |  |  | 0 | $new{size} += 1; | 
| 474 | 0 |  |  |  |  | 0 | push @{ $self->{text_style} }, \%new; | 
|  | 0 |  |  |  |  | 0 |  | 
| 475 | 0 |  |  |  |  | 0 | return; | 
| 476 |  |  |  |  |  |  | } | 
| 477 | 304 | 50 |  |  |  | 436 | if ($tag eq "small") { | 
| 478 | 0 |  |  |  |  | 0 | $new{size} -= 1; | 
| 479 | 0 |  |  |  |  | 0 | push @{ $self->{text_style} }, \%new; | 
|  | 0 |  |  |  |  | 0 |  | 
| 480 | 0 |  |  |  |  | 0 | return; | 
| 481 |  |  |  |  |  |  | } | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  | # tag attributes | 
| 484 | 304 |  |  |  |  | 728 | for my $name (keys %$attr) { | 
| 485 | 583 | 100 |  |  |  | 1077 | next unless exists $ok_attributes{$tag}{$name}; | 
| 486 | 244 | 100 | 66 |  |  | 832 | if ($name eq "text" || $name eq "color") { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
| 487 |  |  |  |  |  |  | # two different names for text color | 
| 488 | 90 |  |  |  |  | 196 | $new{fgcolor} = name_to_rgb($attr->{$name}); | 
| 489 |  |  |  |  |  |  | } | 
| 490 |  |  |  |  |  |  | elsif ($name eq "size") { | 
| 491 | 105 | 50 |  |  |  | 481 | if ($attr->{size} =~ /^\s*([+-]\d+)/) { | 
|  |  | 50 |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | # relative font size | 
| 493 | 0 |  |  |  |  | 0 | $new{size} = $self->{basefont} + $1; | 
| 494 |  |  |  |  |  |  | } | 
| 495 |  |  |  |  |  |  | elsif ($attr->{size} =~ /^\s*(\d+)/) { | 
| 496 |  |  |  |  |  |  | # absolute font size | 
| 497 | 105 |  |  |  |  | 309 | $new{size} = $1; | 
| 498 |  |  |  |  |  |  | } | 
| 499 |  |  |  |  |  |  | } | 
| 500 |  |  |  |  |  |  | elsif ($name eq 'style') { | 
| 501 | 8 |  |  |  |  | 14 | $new{style} = $attr->{style}; | 
| 502 | 8 |  |  |  |  | 20 | my @parts = split(/;/, $new{style}); | 
| 503 | 8 |  |  |  |  | 13 | foreach (@parts) { | 
| 504 | 8 | 50 |  |  |  | 47 | if (/^\s*(background-)?color:\s*(.+)\s*$/i) { | 
|  |  | 50 |  |  |  |  |  | 
| 505 | 0 | 0 |  |  |  | 0 | my $whcolor = $1 ? 'bgcolor' : 'fgcolor'; | 
| 506 | 0 |  |  |  |  | 0 | my $value = lc $2; | 
| 507 |  |  |  |  |  |  |  | 
| 508 | 0 | 0 |  |  |  | 0 | if ($value =~ /rgb/) { | 
| 509 | 0 |  |  |  |  | 0 | $value =~ tr/0-9,//cd; | 
| 510 | 0 |  |  |  |  | 0 | my @rgb = split(/,/, $value); | 
| 511 |  |  |  |  |  |  | $new{$whcolor} = sprintf("#%02x%02x%02x", | 
| 512 | 0 | 0 |  |  |  | 0 | map { !$_ ? 0 : $_ > 255 ? 255 : $_ } | 
|  | 0 | 0 |  |  |  | 0 |  | 
| 513 |  |  |  |  |  |  | @rgb[0..2]); | 
| 514 |  |  |  |  |  |  | } | 
| 515 |  |  |  |  |  |  | else { | 
| 516 | 0 |  |  |  |  | 0 | $new{$whcolor} = name_to_rgb($value); | 
| 517 |  |  |  |  |  |  | } | 
| 518 |  |  |  |  |  |  | } | 
| 519 |  |  |  |  |  |  | elsif (/^\s*([a-z_-]+)\s*:\s*(\S.*?)\s*$/i) { | 
| 520 |  |  |  |  |  |  | # "display: none", "visibility: hidden", etc. | 
| 521 | 0 |  |  |  |  | 0 | $new{'style_'.$1} = $2; | 
| 522 |  |  |  |  |  |  | } | 
| 523 |  |  |  |  |  |  | } | 
| 524 |  |  |  |  |  |  | } | 
| 525 |  |  |  |  |  |  | elsif ($name eq "bgcolor") { | 
| 526 |  |  |  |  |  |  | # overwrite with hex value, $new{bgcolor} is set below | 
| 527 | 6 |  |  |  |  | 14 | $attr->{bgcolor} = name_to_rgb($attr->{bgcolor}); | 
| 528 |  |  |  |  |  |  | } | 
| 529 |  |  |  |  |  |  | else { | 
| 530 |  |  |  |  |  |  | # attribute is probably okay | 
| 531 | 35 |  |  |  |  | 57 | $new{$name} = $attr->{$name}; | 
| 532 |  |  |  |  |  |  | } | 
| 533 |  |  |  |  |  |  |  | 
| 534 | 244 | 100 |  |  |  | 706 | if ($new{size} > $self->{max_size}) { | 
|  |  | 50 |  |  |  |  |  | 
| 535 | 15 |  |  |  |  | 33 | $self->{max_size} = $new{size}; | 
| 536 |  |  |  |  |  |  | } | 
| 537 |  |  |  |  |  |  | elsif ($new{size} < $self->{min_size}) { | 
| 538 | 0 |  |  |  |  | 0 | $self->{min_size} = $new{size}; | 
| 539 |  |  |  |  |  |  | } | 
| 540 |  |  |  |  |  |  | } | 
| 541 | 304 |  |  |  |  | 395 | push @{ $self->{text_style} }, \%new; | 
|  | 304 |  |  |  |  | 670 |  | 
| 542 |  |  |  |  |  |  | } | 
| 543 |  |  |  |  |  |  | # explicitly close a tag | 
| 544 |  |  |  |  |  |  | else { | 
| 545 | 300 | 100 |  |  |  | 497 | if ($tag ne "body") { | 
| 546 |  |  |  |  |  |  | # don't close body since browsers seem to render text after </body> | 
| 547 | 292 |  |  |  |  | 497 | $self->close_tag($tag); | 
| 548 |  |  |  |  |  |  | } | 
| 549 |  |  |  |  |  |  | } | 
| 550 |  |  |  |  |  |  | } | 
| 551 |  |  |  |  |  |  |  | 
| 552 |  |  |  |  |  |  | sub html_font_invisible { | 
| 553 | 215 |  |  | 215 | 0 | 317 | my ($self, $text) = @_; | 
| 554 |  |  |  |  |  |  |  | 
| 555 | 215 |  |  |  |  | 299 | my $fg = $self->{text_style}[-1]->{fgcolor}; | 
| 556 | 215 |  |  |  |  | 311 | my $bg = $self->{text_style}[-1]->{bgcolor}; | 
| 557 | 215 |  |  |  |  | 269 | my $size = $self->{text_style}[-1]->{size}; | 
| 558 | 215 |  |  |  |  | 283 | my $display = $self->{text_style}[-1]->{style_display}; | 
| 559 | 215 |  |  |  |  | 284 | my $visibility = $self->{text_style}[-1]->{style_visibility}; | 
| 560 |  |  |  |  |  |  |  | 
| 561 |  |  |  |  |  |  | # invisibility | 
| 562 | 215 | 50 |  |  |  | 920 | if (substr($fg,-6) eq substr($bg,-6)) { | 
|  |  | 50 |  |  |  |  |  | 
| 563 | 0 |  |  |  |  | 0 | $self->put_results(font_low_contrast => 1); | 
| 564 | 0 |  |  |  |  | 0 | return 1; | 
| 565 |  |  |  |  |  |  | # near-invisibility | 
| 566 |  |  |  |  |  |  | } elsif ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) { | 
| 567 | 215 |  |  |  |  | 681 | my ($r1, $g1, $b1) = (hex($1), hex($2), hex($3)); | 
| 568 |  |  |  |  |  |  |  | 
| 569 | 215 | 50 |  |  |  | 576 | if ($bg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) { | 
| 570 | 215 |  |  |  |  | 418 | my ($r2, $g2, $b2) = (hex($1), hex($2), hex($3)); | 
| 571 |  |  |  |  |  |  |  | 
| 572 | 215 |  |  |  |  | 297 | my $r = ($r1 - $r2); | 
| 573 | 215 |  |  |  |  | 233 | my $g = ($g1 - $g2); | 
| 574 | 215 |  |  |  |  | 238 | my $b = ($b1 - $b2); | 
| 575 |  |  |  |  |  |  |  | 
| 576 |  |  |  |  |  |  | # geometric distance weighted by brightness | 
| 577 |  |  |  |  |  |  | # maximum distance is 191.151823601032 | 
| 578 | 215 |  |  |  |  | 752 | my $distance = ((0.2126*$r)**2 + (0.7152*$g)**2 + (0.0722*$b)**2)**0.5; | 
| 579 |  |  |  |  |  |  |  | 
| 580 |  |  |  |  |  |  | # the text is very difficult to read if the distance is under 12, | 
| 581 |  |  |  |  |  |  | # a limit of 14 to 16 might be okay if the usage significantly | 
| 582 |  |  |  |  |  |  | # increases (near-invisible text is at about 0.95% of spam and | 
| 583 |  |  |  |  |  |  | # 1.25% of HTML spam right now), but please test any changes first | 
| 584 | 215 | 50 |  |  |  | 434 | if ($distance < 12) { | 
| 585 | 0 |  |  |  |  | 0 | $self->put_results(font_low_contrast => 1); | 
| 586 | 0 |  |  |  |  | 0 | return 1; | 
| 587 |  |  |  |  |  |  | } | 
| 588 |  |  |  |  |  |  | } | 
| 589 |  |  |  |  |  |  | } | 
| 590 |  |  |  |  |  |  |  | 
| 591 |  |  |  |  |  |  |  | 
| 592 |  |  |  |  |  |  | # invalid color | 
| 593 | 215 | 50 | 33 |  |  | 664 | if ($fg eq 'invalid' or $bg eq 'invalid') { | 
| 594 | 0 |  |  |  |  | 0 | $self->put_results(font_invalid_color => 1); | 
| 595 | 0 |  |  |  |  | 0 | return 1; | 
| 596 |  |  |  |  |  |  | } | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | # size too small | 
| 599 | 215 | 50 |  |  |  | 342 | if ($size <= 1) { | 
| 600 | 0 |  |  |  |  | 0 | return 1; | 
| 601 |  |  |  |  |  |  | } | 
| 602 |  |  |  |  |  |  |  | 
| 603 |  |  |  |  |  |  | # <span style="display: none"> | 
| 604 | 215 | 50 | 33 |  |  | 367 | if ($display && lc $display eq 'none') { | 
| 605 | 0 |  |  |  |  | 0 | return 1; | 
| 606 |  |  |  |  |  |  | } | 
| 607 |  |  |  |  |  |  |  | 
| 608 | 215 | 50 | 33 |  |  | 339 | if ($visibility && lc $visibility eq 'hidden') { | 
| 609 | 0 |  |  |  |  | 0 | return 1; | 
| 610 |  |  |  |  |  |  | } | 
| 611 |  |  |  |  |  |  |  | 
| 612 | 215 |  |  |  |  | 387 | return 0; | 
| 613 |  |  |  |  |  |  | } | 
| 614 |  |  |  |  |  |  |  | 
| 615 |  |  |  |  |  |  | sub html_tests { | 
| 616 | 464 |  |  | 464 | 0 | 777 | my ($self, $tag, $attr, $num) = @_; | 
| 617 |  |  |  |  |  |  |  | 
| 618 | 464 | 100 | 100 |  |  | 988 | if ($tag eq "font" && exists $attr->{face}) { | 
| 619 | 34 | 100 |  |  |  | 260 | if ($attr->{face} !~ /^'?[a-z ][a-z -]*[a-z](?:,\s*[a-z][a-z -]*[a-z])*'?$/i) { | 
| 620 | 24 |  |  |  |  | 60 | $self->put_results(font_face_bad => 1); | 
| 621 |  |  |  |  |  |  | } | 
| 622 |  |  |  |  |  |  | } | 
| 623 | 464 | 100 | 100 |  |  | 781 | if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) { | 
|  |  |  | 100 |  |  |  |  | 
| 624 | 1 |  |  |  |  | 2 | my $uri = $self->{anchor_last}; | 
| 625 | 1 | 50 |  |  |  | 10 | utf8::encode($uri) if $self->{SA_encode_results}; | 
| 626 | 1 |  |  |  |  | 3 | $self->{uri}->{$uri}->{anchor_text}->[-1] .= "<img>\n"; | 
| 627 | 1 |  |  |  |  | 2 | $self->{anchor}->[-1] .= "<img>\n"; | 
| 628 |  |  |  |  |  |  | } | 
| 629 |  |  |  |  |  |  |  | 
| 630 | 464 | 50 | 66 |  |  | 740 | if ($tag eq "img" && exists $attr->{width} && exists $attr->{height}) { | 
|  |  |  | 33 |  |  |  |  | 
| 631 | 7 |  |  |  |  | 8 | my $width = 0; | 
| 632 | 7 |  |  |  |  | 6 | my $height = 0; | 
| 633 | 7 |  |  |  |  | 14 | my $area = 0; | 
| 634 |  |  |  |  |  |  |  | 
| 635 |  |  |  |  |  |  | # assume 800x600 screen for percentage values | 
| 636 | 7 | 50 |  |  |  | 33 | if ($attr->{width} =~ /^(\d+)(\%)?$/) { | 
| 637 | 0 |  |  |  |  | 0 | $width = $1; | 
| 638 | 0 | 0 | 0 |  |  | 0 | $width *= 8 if (defined $2 && $2 eq "%"); | 
| 639 |  |  |  |  |  |  | } | 
| 640 | 7 | 50 |  |  |  | 17 | if ($attr->{height} =~ /^(\d+)(\%)?$/) { | 
| 641 | 0 |  |  |  |  | 0 | $height = $1; | 
| 642 | 0 | 0 | 0 |  |  | 0 | $height *= 6 if (defined $2 && $2 eq "%"); | 
| 643 |  |  |  |  |  |  | } | 
| 644 |  |  |  |  |  |  | # guess size | 
| 645 | 7 | 50 |  |  |  | 15 | $width = 200 if $width <= 0; | 
| 646 | 7 | 50 |  |  |  | 9 | $height = 200 if $height <= 0; | 
| 647 | 7 | 50 | 33 |  |  | 19 | if ($width > 0 && $height > 0) { | 
| 648 | 7 |  |  |  |  | 8 | $area = $width * $height; | 
| 649 | 7 |  |  |  |  | 11 | $self->{image_area} += $area; | 
| 650 |  |  |  |  |  |  | } | 
| 651 |  |  |  |  |  |  | } | 
| 652 | 464 | 0 | 33 |  |  | 749 | if ($tag eq "form" && exists $attr->{action}) { | 
| 653 | 0 | 0 |  |  |  | 0 | $self->put_results(form_action_mailto => 1) if $attr->{action} =~ /mailto:/i | 
| 654 |  |  |  |  |  |  | } | 
| 655 | 464 | 50 | 33 |  |  | 1246 | if ($tag eq "object" || $tag eq "embed") { | 
| 656 | 0 |  |  |  |  | 0 | $self->put_results(embeds => 1); | 
| 657 |  |  |  |  |  |  | } | 
| 658 |  |  |  |  |  |  |  | 
| 659 |  |  |  |  |  |  | # special text delimiters - <a> and <title> | 
| 660 | 464 | 100 |  |  |  | 1291 | if ($tag eq "a") { | 
| 661 |  |  |  |  |  |  | my $uri = $self->{anchor_last} = | 
| 662 | 118 | 50 |  |  |  | 267 | (exists $attr->{href} ? $self->canon_uri($attr->{href}) : ""); | 
| 663 | 118 | 50 |  |  |  | 213 | utf8::encode($uri) if $self->{SA_encode_results}; | 
| 664 | 118 |  |  |  |  | 129 | push(@{$self->{uri}->{$uri}->{anchor_text}}, ''); | 
|  | 118 |  |  |  |  | 334 |  | 
| 665 | 118 |  |  |  |  | 146 | push(@{$self->{anchor}}, ''); | 
|  | 118 |  |  |  |  | 214 |  | 
| 666 |  |  |  |  |  |  | } | 
| 667 | 464 | 100 |  |  |  | 708 | if ($tag eq "title") { | 
| 668 | 1 |  |  |  |  | 2 | $self->{title_index}++; | 
| 669 | 1 |  |  |  |  | 2 | $self->{title}->[$self->{title_index}] = ""; | 
| 670 |  |  |  |  |  |  | } | 
| 671 |  |  |  |  |  |  |  | 
| 672 | 464 | 100 | 66 |  |  | 3373 | if ($tag eq "meta" && | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
| 673 |  |  |  |  |  |  | exists $attr->{'http-equiv'} && | 
| 674 |  |  |  |  |  |  | exists $attr->{content} && | 
| 675 |  |  |  |  |  |  | $attr->{'http-equiv'} =~ /Content-Type/i && | 
| 676 |  |  |  |  |  |  | $attr->{content} =~ /\bcharset\s*=\s*["']?([^"']+)/i) | 
| 677 |  |  |  |  |  |  | { | 
| 678 | 1 | 50 |  |  |  | 11 | $self->{charsets} .= exists $self->{charsets} ? " $1" : $1; | 
| 679 |  |  |  |  |  |  | } | 
| 680 |  |  |  |  |  |  | } | 
| 681 |  |  |  |  |  |  |  | 
| 682 |  |  |  |  |  |  | sub display_text { | 
| 683 | 686 |  |  | 686 | 0 | 880 | my $self = shift; | 
| 684 | 686 |  |  |  |  | 842 | my $text = shift; | 
| 685 | 686 |  |  |  |  | 1009 | my %display = @_; | 
| 686 |  |  |  |  |  |  |  | 
| 687 |  |  |  |  |  |  | # Unless it's specified to be invisible, then it's not invisible. ;) | 
| 688 | 686 | 50 |  |  |  | 1142 | if (!exists $display{invisible}) { | 
| 689 | 686 |  |  |  |  | 895 | $display{invisible} = 0; | 
| 690 |  |  |  |  |  |  | } | 
| 691 |  |  |  |  |  |  |  | 
| 692 | 686 | 100 |  |  |  | 981 | if ($display{whitespace}) { | 
| 693 |  |  |  |  |  |  | # trim trailing whitespace from previous element if it was not whitespace | 
| 694 |  |  |  |  |  |  | # and it was not invisible | 
| 695 | 202 | 100 | 100 |  |  | 211 | if (@{ $self->{text} } && | 
|  | 202 |  | 66 |  |  | 703 |  | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
| 696 |  |  |  |  |  |  | (!defined $self->{text_whitespace} || | 
| 697 |  |  |  |  |  |  | !vec($self->{text_whitespace}, $#{$self->{text}}, 1)) && | 
| 698 |  |  |  |  |  |  | (!defined $self->{text_invisible} || | 
| 699 |  |  |  |  |  |  | !vec($self->{text_invisible}, $#{$self->{text}}, 1))) | 
| 700 |  |  |  |  |  |  | { | 
| 701 | 185 |  |  |  |  | 465 | $self->{text}->[-1] =~ s/ $//; | 
| 702 |  |  |  |  |  |  | } | 
| 703 |  |  |  |  |  |  | } | 
| 704 |  |  |  |  |  |  | else { | 
| 705 |  |  |  |  |  |  | # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0 | 
| 706 | 484 |  |  |  |  | 2373 | $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs; | 
| 707 |  |  |  |  |  |  | # trim leading whitespace if previous element was whitespace | 
| 708 |  |  |  |  |  |  | # and current element is not invisible | 
| 709 | 484 | 100 | 66 |  |  | 1074 | if (@{ $self->{text} } && !$display{invisible} && | 
|  | 484 |  | 100 |  |  | 2182 |  | 
|  |  |  | 100 |  |  |  |  | 
| 710 |  |  |  |  |  |  | defined $self->{text_whitespace} && | 
| 711 | 293 |  |  |  |  | 817 | vec($self->{text_whitespace}, $#{$self->{text}}, 1)) | 
| 712 |  |  |  |  |  |  | { | 
| 713 | 185 |  |  |  |  | 483 | $text =~ s/^ //; | 
| 714 |  |  |  |  |  |  | } | 
| 715 |  |  |  |  |  |  | } | 
| 716 | 686 |  |  |  |  | 862 | push @{ $self->{text} }, $text; | 
|  | 686 |  |  |  |  | 1398 |  | 
| 717 | 686 |  |  |  |  | 1863 | while (my ($k, $v) = each %display) { | 
| 718 | 888 |  |  |  |  | 1288 | my $textvar = "text_".$k; | 
| 719 | 888 | 100 |  |  |  | 1501 | if (!exists $self->{$textvar}) { $self->{$textvar} = ''; } | 
|  | 13 |  |  |  |  | 62 |  | 
| 720 | 888 |  |  |  |  | 1019 | vec($self->{$textvar}, $#{$self->{text}}, 1) = $v; | 
|  | 888 |  |  |  |  | 5968 |  | 
| 721 |  |  |  |  |  |  | } | 
| 722 |  |  |  |  |  |  | } | 
| 723 |  |  |  |  |  |  |  | 
| 724 |  |  |  |  |  |  | sub html_text { | 
| 725 | 485 |  |  | 485 | 0 | 882 | my ($self, $text) = @_; | 
| 726 | 485 | 50 |  |  |  | 826 | utf8::encode($text) if $self->{SA_encode_results}; | 
| 727 |  |  |  |  |  |  |  | 
| 728 |  |  |  |  |  |  | # text that is not part of body | 
| 729 | 485 | 50 | 33 |  |  | 850 | if (exists $self->{inside}{script} && $self->{inside}{script} > 0) | 
| 730 |  |  |  |  |  |  | { | 
| 731 | 0 |  |  |  |  | 0 | push @{ $self->{script} }, $text; | 
|  | 0 |  |  |  |  | 0 |  | 
| 732 | 0 |  |  |  |  | 0 | return; | 
| 733 |  |  |  |  |  |  | } | 
| 734 | 485 | 100 | 100 |  |  | 883 | if (exists $self->{inside}{style} && $self->{inside}{style} > 0) { | 
| 735 | 1 |  |  |  |  | 4 | return; | 
| 736 |  |  |  |  |  |  | } | 
| 737 |  |  |  |  |  |  |  | 
| 738 |  |  |  |  |  |  | # text that is part of body and also stored separately | 
| 739 | 484 | 100 | 100 |  |  | 1354 | if (exists $self->{inside}{a} && $self->{inside}{a} > 0) { | 
| 740 |  |  |  |  |  |  | # this doesn't worry about nested anchors | 
| 741 | 117 |  |  |  |  | 189 | my $uri = $self->{anchor_last}; | 
| 742 | 117 | 50 |  |  |  | 187 | utf8::encode($uri) if $self->{SA_encode_results}; | 
| 743 | 117 |  |  |  |  | 254 | $self->{uri}->{$uri}->{anchor_text}->[-1] .= $text; | 
| 744 | 117 |  |  |  |  | 236 | $self->{anchor}->[-1] .= $text; | 
| 745 |  |  |  |  |  |  | } | 
| 746 | 484 | 100 | 100 |  |  | 960 | if (exists $self->{inside}{title} && $self->{inside}{title} > 0) { | 
| 747 | 1 |  |  |  |  | 4 | $self->{title}->[$self->{title_index}] .= $text; | 
| 748 |  |  |  |  |  |  | } | 
| 749 |  |  |  |  |  |  |  | 
| 750 | 484 |  |  |  |  | 544 | my $invisible_for_bayes = 0; | 
| 751 |  |  |  |  |  |  |  | 
| 752 |  |  |  |  |  |  | # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0 | 
| 753 | 484 | 100 |  |  |  | 1841 | if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) { | 
| 754 | 215 |  |  |  |  | 452 | $invisible_for_bayes = $self->html_font_invisible($text); | 
| 755 |  |  |  |  |  |  | } | 
| 756 |  |  |  |  |  |  |  | 
| 757 | 484 | 100 |  |  |  | 904 | if (exists $self->{text}->[-1]) { | 
| 758 |  |  |  |  |  |  | # ideas discarded since they would be easy to evade: | 
| 759 |  |  |  |  |  |  | # 1. using \w or [A-Za-z] instead of \S or non-punctuation | 
| 760 |  |  |  |  |  |  | # 2. exempting certain tags | 
| 761 |  |  |  |  |  |  | # no re "strict";  # since perl 5.21.8: Ranges of ASCII printables... | 
| 762 | 477 | 50 | 66 |  |  | 1369 | if ($text =~ /^[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]/s && | 
| 763 |  |  |  |  |  |  | $self->{text}->[-1] =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s) | 
| 764 |  |  |  |  |  |  | { | 
| 765 | 0 |  |  |  |  | 0 | $self->{obfuscation}++; | 
| 766 |  |  |  |  |  |  | } | 
| 767 | 477 | 100 |  |  |  | 1254 | if ($self->{text}->[-1] =~ | 
| 768 |  |  |  |  |  |  | /\b([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\z/s) | 
| 769 |  |  |  |  |  |  | { | 
| 770 | 107 |  |  |  |  | 223 | my $start = length($1); | 
| 771 | 107 | 50 |  |  |  | 232 | if ($text =~ /^([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\b/s) { | 
| 772 | 0 |  |  |  |  | 0 | $self->{backhair}->{$start . "_" . length($1)}++; | 
| 773 |  |  |  |  |  |  | } | 
| 774 |  |  |  |  |  |  | } | 
| 775 |  |  |  |  |  |  | } | 
| 776 |  |  |  |  |  |  |  | 
| 777 | 484 | 50 |  |  |  | 721 | if ($invisible_for_bayes) { | 
| 778 | 0 |  |  |  |  | 0 | $self->display_text($text, invisible => 1); | 
| 779 |  |  |  |  |  |  | } | 
| 780 |  |  |  |  |  |  | else { | 
| 781 | 484 |  |  |  |  | 869 | $self->display_text($text); | 
| 782 |  |  |  |  |  |  | } | 
| 783 |  |  |  |  |  |  | } | 
| 784 |  |  |  |  |  |  |  | 
| 785 |  |  |  |  |  |  | # note: $text includes <!-- and --> | 
| 786 |  |  |  |  |  |  | sub html_comment { | 
| 787 | 1 |  |  | 1 | 0 | 5 | my ($self, $text) = @_; | 
| 788 | 1 | 50 |  |  |  | 8 | utf8::encode($text) if $self->{SA_encode_results}; | 
| 789 |  |  |  |  |  |  |  | 
| 790 | 1 |  |  |  |  | 2 | push @{ $self->{comment} }, $text; | 
|  | 1 |  |  |  |  | 6 |  | 
| 791 |  |  |  |  |  |  | } | 
| 792 |  |  |  |  |  |  |  | 
| 793 |  |  |  |  |  |  | sub html_declaration { | 
| 794 | 0 |  |  | 0 | 0 | 0 | my ($self, $text) = @_; | 
| 795 | 0 | 0 |  |  |  | 0 | utf8::encode($text) if $self->{SA_encode_results}; | 
| 796 |  |  |  |  |  |  |  | 
| 797 | 0 | 0 |  |  |  | 0 | if ($text =~ /^<!doctype/i) { | 
| 798 | 0 |  |  |  |  | 0 | my $tag = "!doctype"; | 
| 799 | 0 |  |  |  |  | 0 | $self->{elements}++; | 
| 800 | 0 |  |  |  |  | 0 | $self->{tags}++; | 
| 801 | 0 |  |  |  |  | 0 | $self->{inside}{$tag} = 0; | 
| 802 |  |  |  |  |  |  | } | 
| 803 |  |  |  |  |  |  | } | 
| 804 |  |  |  |  |  |  |  | 
| 805 |  |  |  |  |  |  | ########################################################################### | 
| 806 |  |  |  |  |  |  |  | 
| 807 |  |  |  |  |  |  | my %html_color = ( | 
| 808 |  |  |  |  |  |  | # HTML 4 defined 16 colors | 
| 809 |  |  |  |  |  |  | aqua => 0x00ffff, | 
| 810 |  |  |  |  |  |  | black => 0x000000, | 
| 811 |  |  |  |  |  |  | blue => 0x0000ff, | 
| 812 |  |  |  |  |  |  | fuchsia => 0xff00ff, | 
| 813 |  |  |  |  |  |  | gray => 0x808080, | 
| 814 |  |  |  |  |  |  | green => 0x008000, | 
| 815 |  |  |  |  |  |  | lime => 0x00ff00, | 
| 816 |  |  |  |  |  |  | maroon => 0x800000, | 
| 817 |  |  |  |  |  |  | navy => 0x000080, | 
| 818 |  |  |  |  |  |  | olive => 0x808000, | 
| 819 |  |  |  |  |  |  | purple => 0x800080, | 
| 820 |  |  |  |  |  |  | red => 0xff0000, | 
| 821 |  |  |  |  |  |  | silver => 0xc0c0c0, | 
| 822 |  |  |  |  |  |  | teal => 0x008080, | 
| 823 |  |  |  |  |  |  | white => 0xffffff, | 
| 824 |  |  |  |  |  |  | yellow => 0xffff00, | 
| 825 |  |  |  |  |  |  | # colors specified in CSS3 color module | 
| 826 |  |  |  |  |  |  | aliceblue => 0xf0f8ff, | 
| 827 |  |  |  |  |  |  | antiquewhite => 0xfaebd7, | 
| 828 |  |  |  |  |  |  | aqua => 0x00ffff, | 
| 829 |  |  |  |  |  |  | aquamarine => 0x7fffd4, | 
| 830 |  |  |  |  |  |  | azure => 0xf0ffff, | 
| 831 |  |  |  |  |  |  | beige => 0xf5f5dc, | 
| 832 |  |  |  |  |  |  | bisque => 0xffe4c4, | 
| 833 |  |  |  |  |  |  | black => 0x000000, | 
| 834 |  |  |  |  |  |  | blanchedalmond => 0xffebcd, | 
| 835 |  |  |  |  |  |  | blue => 0x0000ff, | 
| 836 |  |  |  |  |  |  | blueviolet => 0x8a2be2, | 
| 837 |  |  |  |  |  |  | brown => 0xa52a2a, | 
| 838 |  |  |  |  |  |  | burlywood => 0xdeb887, | 
| 839 |  |  |  |  |  |  | cadetblue => 0x5f9ea0, | 
| 840 |  |  |  |  |  |  | chartreuse => 0x7fff00, | 
| 841 |  |  |  |  |  |  | chocolate => 0xd2691e, | 
| 842 |  |  |  |  |  |  | coral => 0xff7f50, | 
| 843 |  |  |  |  |  |  | cornflowerblue => 0x6495ed, | 
| 844 |  |  |  |  |  |  | cornsilk => 0xfff8dc, | 
| 845 |  |  |  |  |  |  | crimson => 0xdc143c, | 
| 846 |  |  |  |  |  |  | cyan => 0x00ffff, | 
| 847 |  |  |  |  |  |  | darkblue => 0x00008b, | 
| 848 |  |  |  |  |  |  | darkcyan => 0x008b8b, | 
| 849 |  |  |  |  |  |  | darkgoldenrod => 0xb8860b, | 
| 850 |  |  |  |  |  |  | darkgray => 0xa9a9a9, | 
| 851 |  |  |  |  |  |  | darkgreen => 0x006400, | 
| 852 |  |  |  |  |  |  | darkgrey => 0xa9a9a9, | 
| 853 |  |  |  |  |  |  | darkkhaki => 0xbdb76b, | 
| 854 |  |  |  |  |  |  | darkmagenta => 0x8b008b, | 
| 855 |  |  |  |  |  |  | darkolivegreen => 0x556b2f, | 
| 856 |  |  |  |  |  |  | darkorange => 0xff8c00, | 
| 857 |  |  |  |  |  |  | darkorchid => 0x9932cc, | 
| 858 |  |  |  |  |  |  | darkred => 0x8b0000, | 
| 859 |  |  |  |  |  |  | darksalmon => 0xe9967a, | 
| 860 |  |  |  |  |  |  | darkseagreen => 0x8fbc8f, | 
| 861 |  |  |  |  |  |  | darkslateblue => 0x483d8b, | 
| 862 |  |  |  |  |  |  | darkslategray => 0x2f4f4f, | 
| 863 |  |  |  |  |  |  | darkslategrey => 0x2f4f4f, | 
| 864 |  |  |  |  |  |  | darkturquoise => 0x00ced1, | 
| 865 |  |  |  |  |  |  | darkviolet => 0x9400d3, | 
| 866 |  |  |  |  |  |  | deeppink => 0xff1493, | 
| 867 |  |  |  |  |  |  | deepskyblue => 0x00bfff, | 
| 868 |  |  |  |  |  |  | dimgray => 0x696969, | 
| 869 |  |  |  |  |  |  | dimgrey => 0x696969, | 
| 870 |  |  |  |  |  |  | dodgerblue => 0x1e90ff, | 
| 871 |  |  |  |  |  |  | firebrick => 0xb22222, | 
| 872 |  |  |  |  |  |  | floralwhite => 0xfffaf0, | 
| 873 |  |  |  |  |  |  | forestgreen => 0x228b22, | 
| 874 |  |  |  |  |  |  | fuchsia => 0xff00ff, | 
| 875 |  |  |  |  |  |  | gainsboro => 0xdcdcdc, | 
| 876 |  |  |  |  |  |  | ghostwhite => 0xf8f8ff, | 
| 877 |  |  |  |  |  |  | gold => 0xffd700, | 
| 878 |  |  |  |  |  |  | goldenrod => 0xdaa520, | 
| 879 |  |  |  |  |  |  | gray => 0x808080, | 
| 880 |  |  |  |  |  |  | green => 0x008000, | 
| 881 |  |  |  |  |  |  | greenyellow => 0xadff2f, | 
| 882 |  |  |  |  |  |  | grey => 0x808080, | 
| 883 |  |  |  |  |  |  | honeydew => 0xf0fff0, | 
| 884 |  |  |  |  |  |  | hotpink => 0xff69b4, | 
| 885 |  |  |  |  |  |  | indianred => 0xcd5c5c, | 
| 886 |  |  |  |  |  |  | indigo => 0x4b0082, | 
| 887 |  |  |  |  |  |  | ivory => 0xfffff0, | 
| 888 |  |  |  |  |  |  | khaki => 0xf0e68c, | 
| 889 |  |  |  |  |  |  | lavender => 0xe6e6fa, | 
| 890 |  |  |  |  |  |  | lavenderblush => 0xfff0f5, | 
| 891 |  |  |  |  |  |  | lawngreen => 0x7cfc00, | 
| 892 |  |  |  |  |  |  | lemonchiffon => 0xfffacd, | 
| 893 |  |  |  |  |  |  | lightblue => 0xadd8e6, | 
| 894 |  |  |  |  |  |  | lightcoral => 0xf08080, | 
| 895 |  |  |  |  |  |  | lightcyan => 0xe0ffff, | 
| 896 |  |  |  |  |  |  | lightgoldenrodyellow => 0xfafad2, | 
| 897 |  |  |  |  |  |  | lightgray => 0xd3d3d3, | 
| 898 |  |  |  |  |  |  | lightgreen => 0x90ee90, | 
| 899 |  |  |  |  |  |  | lightgrey => 0xd3d3d3, | 
| 900 |  |  |  |  |  |  | lightpink => 0xffb6c1, | 
| 901 |  |  |  |  |  |  | lightsalmon => 0xffa07a, | 
| 902 |  |  |  |  |  |  | lightseagreen => 0x20b2aa, | 
| 903 |  |  |  |  |  |  | lightskyblue => 0x87cefa, | 
| 904 |  |  |  |  |  |  | lightslategray => 0x778899, | 
| 905 |  |  |  |  |  |  | lightslategrey => 0x778899, | 
| 906 |  |  |  |  |  |  | lightsteelblue => 0xb0c4de, | 
| 907 |  |  |  |  |  |  | lightyellow => 0xffffe0, | 
| 908 |  |  |  |  |  |  | lime => 0x00ff00, | 
| 909 |  |  |  |  |  |  | limegreen => 0x32cd32, | 
| 910 |  |  |  |  |  |  | linen => 0xfaf0e6, | 
| 911 |  |  |  |  |  |  | magenta => 0xff00ff, | 
| 912 |  |  |  |  |  |  | maroon => 0x800000, | 
| 913 |  |  |  |  |  |  | mediumaquamarine => 0x66cdaa, | 
| 914 |  |  |  |  |  |  | mediumblue => 0x0000cd, | 
| 915 |  |  |  |  |  |  | mediumorchid => 0xba55d3, | 
| 916 |  |  |  |  |  |  | mediumpurple => 0x9370db, | 
| 917 |  |  |  |  |  |  | mediumseagreen => 0x3cb371, | 
| 918 |  |  |  |  |  |  | mediumslateblue => 0x7b68ee, | 
| 919 |  |  |  |  |  |  | mediumspringgreen => 0x00fa9a, | 
| 920 |  |  |  |  |  |  | mediumturquoise => 0x48d1cc, | 
| 921 |  |  |  |  |  |  | mediumvioletred => 0xc71585, | 
| 922 |  |  |  |  |  |  | midnightblue => 0x191970, | 
| 923 |  |  |  |  |  |  | mintcream => 0xf5fffa, | 
| 924 |  |  |  |  |  |  | mistyrose => 0xffe4e1, | 
| 925 |  |  |  |  |  |  | moccasin => 0xffe4b5, | 
| 926 |  |  |  |  |  |  | navajowhite => 0xffdead, | 
| 927 |  |  |  |  |  |  | navy => 0x000080, | 
| 928 |  |  |  |  |  |  | oldlace => 0xfdf5e6, | 
| 929 |  |  |  |  |  |  | olive => 0x808000, | 
| 930 |  |  |  |  |  |  | olivedrab => 0x6b8e23, | 
| 931 |  |  |  |  |  |  | orange => 0xffa500, | 
| 932 |  |  |  |  |  |  | orangered => 0xff4500, | 
| 933 |  |  |  |  |  |  | orchid => 0xda70d6, | 
| 934 |  |  |  |  |  |  | palegoldenrod => 0xeee8aa, | 
| 935 |  |  |  |  |  |  | palegreen => 0x98fb98, | 
| 936 |  |  |  |  |  |  | paleturquoise => 0xafeeee, | 
| 937 |  |  |  |  |  |  | palevioletred => 0xdb7093, | 
| 938 |  |  |  |  |  |  | papayawhip => 0xffefd5, | 
| 939 |  |  |  |  |  |  | peachpuff => 0xffdab9, | 
| 940 |  |  |  |  |  |  | peru => 0xcd853f, | 
| 941 |  |  |  |  |  |  | pink => 0xffc0cb, | 
| 942 |  |  |  |  |  |  | plum => 0xdda0dd, | 
| 943 |  |  |  |  |  |  | powderblue => 0xb0e0e6, | 
| 944 |  |  |  |  |  |  | purple => 0x800080, | 
| 945 |  |  |  |  |  |  | red => 0xff0000, | 
| 946 |  |  |  |  |  |  | rosybrown => 0xbc8f8f, | 
| 947 |  |  |  |  |  |  | royalblue => 0x4169e1, | 
| 948 |  |  |  |  |  |  | saddlebrown => 0x8b4513, | 
| 949 |  |  |  |  |  |  | salmon => 0xfa8072, | 
| 950 |  |  |  |  |  |  | sandybrown => 0xf4a460, | 
| 951 |  |  |  |  |  |  | seagreen => 0x2e8b57, | 
| 952 |  |  |  |  |  |  | seashell => 0xfff5ee, | 
| 953 |  |  |  |  |  |  | sienna => 0xa0522d, | 
| 954 |  |  |  |  |  |  | silver => 0xc0c0c0, | 
| 955 |  |  |  |  |  |  | skyblue => 0x87ceeb, | 
| 956 |  |  |  |  |  |  | slateblue => 0x6a5acd, | 
| 957 |  |  |  |  |  |  | slategray => 0x708090, | 
| 958 |  |  |  |  |  |  | slategrey => 0x708090, | 
| 959 |  |  |  |  |  |  | snow => 0xfffafa, | 
| 960 |  |  |  |  |  |  | springgreen => 0x00ff7f, | 
| 961 |  |  |  |  |  |  | steelblue => 0x4682b4, | 
| 962 |  |  |  |  |  |  | tan => 0xd2b48c, | 
| 963 |  |  |  |  |  |  | teal => 0x008080, | 
| 964 |  |  |  |  |  |  | thistle => 0xd8bfd8, | 
| 965 |  |  |  |  |  |  | tomato => 0xff6347, | 
| 966 |  |  |  |  |  |  | turquoise => 0x40e0d0, | 
| 967 |  |  |  |  |  |  | violet => 0xee82ee, | 
| 968 |  |  |  |  |  |  | wheat => 0xf5deb3, | 
| 969 |  |  |  |  |  |  | white => 0xffffff, | 
| 970 |  |  |  |  |  |  | whitesmoke => 0xf5f5f5, | 
| 971 |  |  |  |  |  |  | yellow => 0xffff00, | 
| 972 |  |  |  |  |  |  | yellowgreen => 0x9acd32, | 
| 973 |  |  |  |  |  |  | ); | 
| 974 |  |  |  |  |  |  |  | 
| 975 |  |  |  |  |  |  | sub name_to_rgb_old { | 
| 976 | 0 |  |  | 0 | 0 | 0 | my $color = lc $_[0]; | 
| 977 |  |  |  |  |  |  |  | 
| 978 |  |  |  |  |  |  | # note: Mozilla strips leading and trailing whitespace at this point, | 
| 979 |  |  |  |  |  |  | # but IE does not | 
| 980 |  |  |  |  |  |  |  | 
| 981 |  |  |  |  |  |  | # named colors | 
| 982 | 0 |  |  |  |  | 0 | my $hex = $html_color{$color}; | 
| 983 | 0 | 0 |  |  |  | 0 | if (defined $hex) { | 
| 984 | 0 |  |  |  |  | 0 | return sprintf("#%06x", $hex); | 
| 985 |  |  |  |  |  |  | } | 
| 986 |  |  |  |  |  |  |  | 
| 987 |  |  |  |  |  |  | # Flex Hex: John Graham-Cumming, http://www.jgc.org/pdf/lisa2004.pdf | 
| 988 |  |  |  |  |  |  | # strip optional # character | 
| 989 | 0 |  |  |  |  | 0 | $color =~ s/^#//; | 
| 990 |  |  |  |  |  |  | # pad right-hand-side to a multiple of three | 
| 991 | 0 | 0 |  |  |  | 0 | $color .= "0" x (3 - (length($color) % 3)) if (length($color) % 3); | 
| 992 |  |  |  |  |  |  | # split into triplets | 
| 993 | 0 |  |  |  |  | 0 | my $length = length($color) / 3; | 
| 994 | 0 |  |  |  |  | 0 | my @colors = ($color =~ /(.{$length})(.{$length})(.{$length})/); | 
| 995 |  |  |  |  |  |  | # truncate each color to a DWORD, take MSB, left pad nibbles | 
| 996 | 0 |  |  |  |  | 0 | foreach (@colors) { s/.*(.{8})$/$1/; s/(..).*/$1/; s/^(.)$/0$1/ }; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 997 |  |  |  |  |  |  | # the color | 
| 998 | 0 |  |  |  |  | 0 | $color = join("", @colors); | 
| 999 |  |  |  |  |  |  | # replace non-hex characters with 0 | 
| 1000 | 0 |  |  |  |  | 0 | $color =~ tr/0-9a-f/0/c; | 
| 1001 |  |  |  |  |  |  |  | 
| 1002 | 0 |  |  |  |  | 0 | return "#" . $color; | 
| 1003 |  |  |  |  |  |  | } | 
| 1004 |  |  |  |  |  |  |  | 
| 1005 |  |  |  |  |  |  | sub name_to_rgb { | 
| 1006 | 124 |  |  | 124 | 0 | 9090 | my $color = lc $_[0]; | 
| 1007 | 124 |  |  |  |  | 160 | my $before = $color; | 
| 1008 |  |  |  |  |  |  |  | 
| 1009 |  |  |  |  |  |  | # strip leading and ending whitespace | 
| 1010 | 124 |  |  |  |  | 468 | $color =~ s/^\s*//; | 
| 1011 | 124 |  |  |  |  | 507 | $color =~ s/\s*$//; | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 |  |  |  |  |  |  | # named colors | 
| 1014 | 124 |  |  |  |  | 254 | my $hex = $html_color{$color}; | 
| 1015 | 124 | 100 |  |  |  | 247 | if (defined $hex) { | 
| 1016 | 4 |  |  |  |  | 33 | return sprintf("#%06x", $hex); | 
| 1017 |  |  |  |  |  |  | } | 
| 1018 |  |  |  |  |  |  |  | 
| 1019 |  |  |  |  |  |  | # IF NOT A NAME, IT SHOULD BE A HEX COLOR, HEX SHORTHAND or rgb values | 
| 1020 | 120 | 100 |  |  |  | 353 | if ($color =~ m/^[#a-f0-9]*$|rgb\([\d%, ]*\)/i) { | 
| 1021 |  |  |  |  |  |  |  | 
| 1022 |  |  |  |  |  |  | #Convert the RGB values to hex values so we can fall through on the programming | 
| 1023 |  |  |  |  |  |  |  | 
| 1024 |  |  |  |  |  |  | #RGB PERCENTS TO HEX | 
| 1025 | 105 | 100 |  |  |  | 203 | if ($color =~ m/rgb\((\d+)%,\s*(\d+)%,\s*(\d+)%\s*\)/i) { | 
| 1026 | 2 |  |  |  |  | 15 | $color = "#".dec2hex(int($1/100*255)).dec2hex(int($2/100*255)).dec2hex(int($3/100*255)); | 
| 1027 |  |  |  |  |  |  | } | 
| 1028 |  |  |  |  |  |  |  | 
| 1029 |  |  |  |  |  |  | #RGB DEC TO HEX | 
| 1030 | 105 | 100 |  |  |  | 225 | if ($color =~ m/rgb\((\d+),\s*(\d+),\s*(\d+)\s*\)/i) { | 
| 1031 | 2 |  |  |  |  | 7 | $color = "#".dec2hex($1).dec2hex($2).dec2hex($3); | 
| 1032 |  |  |  |  |  |  | } | 
| 1033 |  |  |  |  |  |  |  | 
| 1034 |  |  |  |  |  |  | #PARSE THE HEX | 
| 1035 | 105 | 100 |  |  |  | 248 | if ($color =~ m/^#/) { | 
| 1036 |  |  |  |  |  |  | # strip to hex only | 
| 1037 | 99 |  |  |  |  | 309 | $color =~ s/[^a-f0-9]//ig; | 
| 1038 |  |  |  |  |  |  |  | 
| 1039 |  |  |  |  |  |  | # strip to 6 if greater than 6 | 
| 1040 | 99 | 100 |  |  |  | 215 | if (length($color) > 6) { | 
| 1041 | 1 |  |  |  |  | 3 | $color=substr($color,0,6); | 
| 1042 |  |  |  |  |  |  | } | 
| 1043 |  |  |  |  |  |  |  | 
| 1044 |  |  |  |  |  |  | # strip to 3 if length < 6) | 
| 1045 | 99 | 50 | 66 |  |  | 349 | if (length($color) > 3 && length($color) < 6) { | 
| 1046 | 0 |  |  |  |  | 0 | $color=substr($color,0,3); | 
| 1047 |  |  |  |  |  |  | } | 
| 1048 |  |  |  |  |  |  |  | 
| 1049 |  |  |  |  |  |  | # pad right-hand-side to a multiple of three | 
| 1050 | 99 | 100 |  |  |  | 214 | $color .= "0" x (3 - (length($color) % 3)) if (length($color) % 3); | 
| 1051 |  |  |  |  |  |  |  | 
| 1052 |  |  |  |  |  |  | #DUPLICATE SHORTHAND HEX | 
| 1053 | 99 | 100 |  |  |  | 199 | if (length($color) == 3) { | 
| 1054 | 3 |  |  |  |  | 15 | $color =~ m/(.)(.)(.)/; | 
| 1055 | 3 |  |  |  |  | 17 | $color = "$1$1$2$2$3$3"; | 
| 1056 |  |  |  |  |  |  | } | 
| 1057 |  |  |  |  |  |  |  | 
| 1058 |  |  |  |  |  |  | } else { | 
| 1059 | 6 |  |  |  |  | 17 | return "invalid"; | 
| 1060 |  |  |  |  |  |  | } | 
| 1061 |  |  |  |  |  |  |  | 
| 1062 |  |  |  |  |  |  | } else { | 
| 1063 |  |  |  |  |  |  | #INVALID | 
| 1064 |  |  |  |  |  |  |  | 
| 1065 |  |  |  |  |  |  | #??RETURN BLACK SINCE WE DO NOT KNOW HOW THE MUA / BROWSER WILL PARSE | 
| 1066 |  |  |  |  |  |  | #$color = "000000"; | 
| 1067 |  |  |  |  |  |  |  | 
| 1068 | 15 |  |  |  |  | 34 | return "invalid"; | 
| 1069 |  |  |  |  |  |  | } | 
| 1070 |  |  |  |  |  |  |  | 
| 1071 |  |  |  |  |  |  | #print "DEBUG: before/after name_to_rgb new version: $before/$color\n"; | 
| 1072 |  |  |  |  |  |  |  | 
| 1073 | 99 |  |  |  |  | 245 | return "#" . $color; | 
| 1074 |  |  |  |  |  |  | } | 
| 1075 |  |  |  |  |  |  |  | 
| 1076 |  |  |  |  |  |  | sub dec2hex { | 
| 1077 | 12 |  |  | 12 | 0 | 25 | my ($dec) = @_; | 
| 1078 | 12 |  |  |  |  | 25 | my ($pre) = ''; | 
| 1079 |  |  |  |  |  |  |  | 
| 1080 | 12 | 50 |  |  |  | 25 | if ($dec < 16) { | 
| 1081 | 0 |  |  |  |  | 0 | $pre = '0'; | 
| 1082 |  |  |  |  |  |  | } | 
| 1083 |  |  |  |  |  |  |  | 
| 1084 | 12 |  |  |  |  | 136 | return sprintf("$pre%lx", $dec); | 
| 1085 |  |  |  |  |  |  | } | 
| 1086 |  |  |  |  |  |  |  | 
| 1087 |  |  |  |  |  |  |  | 
| 1088 | 40 |  |  | 40 |  | 512 | use constant URI_STRICT => 0; | 
|  | 40 |  |  |  |  | 127 |  | 
|  | 40 |  |  |  |  | 38434 |  | 
| 1089 |  |  |  |  |  |  |  | 
| 1090 |  |  |  |  |  |  | # resolving relative URIs as defined in RFC 2396 (steps from section 5.2) | 
| 1091 |  |  |  |  |  |  | # using draft http://www.gbiv.com/protocols/uri/rev-2002/rfc2396bis.html | 
| 1092 |  |  |  |  |  |  | sub _parse_uri { | 
| 1093 | 346 |  |  | 346 |  | 419 | my ($u) = @_; | 
| 1094 | 346 |  |  |  |  | 351 | my %u; | 
| 1095 | 346 |  |  |  |  | 1677 | ($u{scheme}, $u{authority}, $u{path}, $u{query}, $u{fragment}) = | 
| 1096 |  |  |  |  |  |  | $u =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|; | 
| 1097 | 346 |  |  |  |  | 1398 | return %u; | 
| 1098 |  |  |  |  |  |  | } | 
| 1099 |  |  |  |  |  |  |  | 
| 1100 |  |  |  |  |  |  | sub _remove_dot_segments { | 
| 1101 | 170 |  |  | 170 |  | 264 | my ($input) = @_; | 
| 1102 | 170 |  |  |  |  | 203 | my $output = ""; | 
| 1103 |  |  |  |  |  |  |  | 
| 1104 | 170 |  |  |  |  | 261 | $input =~ s@^(?:\.\.?/)@/@; | 
| 1105 |  |  |  |  |  |  |  | 
| 1106 | 170 |  |  |  |  | 280 | while ($input) { | 
| 1107 | 298 | 100 |  |  |  | 1110 | if ($input =~ s@^/\.(?:$|/)@/@) { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 1108 |  |  |  |  |  |  | } | 
| 1109 |  |  |  |  |  |  | elsif ($input =~ s@^/\.\.(?:$|/)@/@) { | 
| 1110 | 20 |  |  |  |  | 56 | $output =~ s@/?[^/]*$@@; | 
| 1111 |  |  |  |  |  |  | } | 
| 1112 |  |  |  |  |  |  | elsif ($input =~ s@(/?[^/]*)@@) { | 
| 1113 | 269 |  |  |  |  | 666 | $output .= $1; | 
| 1114 |  |  |  |  |  |  | } | 
| 1115 |  |  |  |  |  |  | } | 
| 1116 | 170 |  |  |  |  | 321 | return $output; | 
| 1117 |  |  |  |  |  |  | } | 
| 1118 |  |  |  |  |  |  |  | 
| 1119 |  |  |  |  |  |  | sub _merge_uri { | 
| 1120 | 92 |  |  | 92 |  | 135 | my ($base_authority, $base_path, $r_path) = @_; | 
| 1121 |  |  |  |  |  |  |  | 
| 1122 | 92 | 50 | 66 |  |  | 215 | if (defined $base_authority && !$base_path) { | 
| 1123 | 0 |  |  |  |  | 0 | return "/" . $r_path; | 
| 1124 |  |  |  |  |  |  | } | 
| 1125 |  |  |  |  |  |  | else { | 
| 1126 | 92 | 100 |  |  |  | 153 | if ($base_path =~ m|/|) { | 
| 1127 | 34 |  |  |  |  | 393 | $base_path =~ s|(?<=/)[^/]*$||; | 
| 1128 |  |  |  |  |  |  | } | 
| 1129 |  |  |  |  |  |  | else { | 
| 1130 | 58 |  |  |  |  | 76 | $base_path = ""; | 
| 1131 |  |  |  |  |  |  | } | 
| 1132 | 92 |  |  |  |  | 247 | return $base_path . $r_path; | 
| 1133 |  |  |  |  |  |  | } | 
| 1134 |  |  |  |  |  |  | } | 
| 1135 |  |  |  |  |  |  |  | 
| 1136 |  |  |  |  |  |  | sub target_uri { | 
| 1137 | 173 |  |  | 173 | 0 | 11325 | my ($base, $r) = @_; | 
| 1138 |  |  |  |  |  |  |  | 
| 1139 | 173 |  |  |  |  | 349 | my %r = _parse_uri($r);	# parsed relative URI | 
| 1140 | 173 |  |  |  |  | 329 | my %base = _parse_uri($base);	# parsed base URI | 
| 1141 | 173 |  |  |  |  | 242 | my %t;			# generated temporary URI | 
| 1142 |  |  |  |  |  |  |  | 
| 1143 | 173 | 100 | 100 |  |  | 529 | if ((not URI_STRICT) and | 
|  |  |  | 100 |  |  |  |  | 
| 1144 |  |  |  |  |  |  | (defined $r{scheme} && defined $base{scheme}) and | 
| 1145 |  |  |  |  |  |  | ($r{scheme} eq $base{scheme})) | 
| 1146 |  |  |  |  |  |  | { | 
| 1147 | 1 |  |  |  |  | 4 | undef $r{scheme}; | 
| 1148 |  |  |  |  |  |  | } | 
| 1149 |  |  |  |  |  |  |  | 
| 1150 | 173 | 100 |  |  |  | 291 | if (defined $r{scheme}) { | 
| 1151 | 74 |  |  |  |  | 123 | $t{scheme} = $r{scheme}; | 
| 1152 | 74 |  |  |  |  | 110 | $t{authority} = $r{authority}; | 
| 1153 | 74 |  |  |  |  | 134 | $t{path} = _remove_dot_segments($r{path}); | 
| 1154 | 74 |  |  |  |  | 126 | $t{query} = $r{query}; | 
| 1155 |  |  |  |  |  |  | } | 
| 1156 |  |  |  |  |  |  | else { | 
| 1157 | 99 | 100 |  |  |  | 133 | if (defined $r{authority}) { | 
| 1158 | 1 |  |  |  |  | 2 | $t{authority} = $r{authority}; | 
| 1159 | 1 |  |  |  |  | 3 | $t{path} = _remove_dot_segments($r{path}); | 
| 1160 | 1 |  |  |  |  | 2 | $t{query} = $r{query}; | 
| 1161 |  |  |  |  |  |  | } | 
| 1162 |  |  |  |  |  |  | else { | 
| 1163 | 98 | 100 |  |  |  | 149 | if ($r{path} eq "") { | 
| 1164 | 3 |  |  |  |  | 5 | $t{path} = $base{path}; | 
| 1165 | 3 | 100 |  |  |  | 6 | if (defined $r{query}) { | 
| 1166 | 1 |  |  |  |  | 3 | $t{query} = $r{query}; | 
| 1167 |  |  |  |  |  |  | } | 
| 1168 |  |  |  |  |  |  | else { | 
| 1169 | 2 |  |  |  |  | 3 | $t{query} = $base{query}; | 
| 1170 |  |  |  |  |  |  | } | 
| 1171 |  |  |  |  |  |  | } | 
| 1172 |  |  |  |  |  |  | else { | 
| 1173 | 95 | 100 |  |  |  | 148 | if ($r{path} =~ m|^/|) { | 
| 1174 | 3 |  |  |  |  | 9 | $t{path} = _remove_dot_segments($r{path}); | 
| 1175 |  |  |  |  |  |  | } | 
| 1176 |  |  |  |  |  |  | else { | 
| 1177 | 92 |  |  |  |  | 160 | $t{path} = _merge_uri($base{authority}, $base{path}, $r{path}); | 
| 1178 | 92 |  |  |  |  | 153 | $t{path} = _remove_dot_segments($t{path}); | 
| 1179 |  |  |  |  |  |  | } | 
| 1180 | 95 |  |  |  |  | 152 | $t{query} = $r{query}; | 
| 1181 |  |  |  |  |  |  | } | 
| 1182 | 98 |  |  |  |  | 130 | $t{authority} = $base{authority}; | 
| 1183 |  |  |  |  |  |  | } | 
| 1184 | 99 |  |  |  |  | 123 | $t{scheme} = $base{scheme}; | 
| 1185 |  |  |  |  |  |  | } | 
| 1186 | 173 |  |  |  |  | 214 | $t{fragment} = $r{fragment}; | 
| 1187 |  |  |  |  |  |  |  | 
| 1188 |  |  |  |  |  |  | # recompose URI | 
| 1189 | 173 |  |  |  |  | 203 | my $result = ""; | 
| 1190 | 173 | 100 |  |  |  | 285 | if ($t{scheme}) { | 
|  |  | 50 |  |  |  |  |  | 
| 1191 | 115 |  |  |  |  | 207 | $result .= $t{scheme} . ":"; | 
| 1192 |  |  |  |  |  |  | } | 
| 1193 |  |  |  |  |  |  | elsif (defined $t{authority}) { | 
| 1194 |  |  |  |  |  |  | # this block is not part of the RFC | 
| 1195 |  |  |  |  |  |  | # TODO: figure out what MUAs actually do with unschemed URIs | 
| 1196 |  |  |  |  |  |  | # maybe look at URI::Heuristic | 
| 1197 | 0 | 0 |  |  |  | 0 | if ($t{authority} =~ /^www\d*\./i) { | 
|  |  | 0 |  |  |  |  |  | 
| 1198 |  |  |  |  |  |  | # some spammers are using unschemed URIs to escape filters | 
| 1199 | 0 |  |  |  |  | 0 | $result .= "http:"; | 
| 1200 |  |  |  |  |  |  | } | 
| 1201 |  |  |  |  |  |  | elsif ($t{authority} =~ /^ftp\d*\./i) { | 
| 1202 | 0 |  |  |  |  | 0 | $result .= "ftp:"; | 
| 1203 |  |  |  |  |  |  | } | 
| 1204 |  |  |  |  |  |  | } | 
| 1205 | 173 | 100 |  |  |  | 281 | if ($t{authority}) { | 
| 1206 | 106 |  |  |  |  | 185 | $result .= "//" . $t{authority}; | 
| 1207 |  |  |  |  |  |  | } | 
| 1208 | 173 |  |  |  |  | 264 | $result .= $t{path}; | 
| 1209 | 173 | 100 |  |  |  | 262 | if (defined $t{query}) { | 
| 1210 | 12 |  |  |  |  | 21 | $result .= "?" . $t{query}; | 
| 1211 |  |  |  |  |  |  | } | 
| 1212 | 173 | 100 |  |  |  | 239 | if (defined $t{fragment}) { | 
| 1213 | 6 |  |  |  |  | 10 | $result .= "#" . $t{fragment}; | 
| 1214 |  |  |  |  |  |  | } | 
| 1215 | 173 |  |  |  |  | 482 | return $result; | 
| 1216 |  |  |  |  |  |  | } | 
| 1217 |  |  |  |  |  |  |  | 
| 1218 |  |  |  |  |  |  | 1; | 
| 1219 |  |  |  |  |  |  | __END__ |