| blib/lib/W3C/XHTML/HTMLCompatChecker.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % | 
| statement | 7 | 10 | 70.0 | 
| branch | n/a | ||
| condition | n/a | ||
| subroutine | 3 | 4 | 75.0 | 
| pod | n/a | ||
| total | 10 | 14 | 71.4 | 
| line | stmt | bran | cond | sub | pod | time | code | 
|---|---|---|---|---|---|---|---|
| 1 | package W3C::XHTML::HTMLCompatChecker; | ||||||
| 2 | |||||||
| 3 | 3 | 3 | 2281 | use strict; | |||
| 3 | 6 | ||||||
| 3 | 116 | ||||||
| 4 | 3 | 3 | 17 | use warnings; | |||
| 3 | 6 | ||||||
| 3 | 545 | ||||||
| 5 | |||||||
| 6 | require Exporter; | ||||||
| 7 | our @ISA = qw(Exporter); | ||||||
| 8 | our %EXPORT_TAGS = ( 'all' => [ qw() ] ); | ||||||
| 9 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | ||||||
| 10 | our @EXPORT = qw(); | ||||||
| 11 | our $VERSION = sprintf "%d.%03d",q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/; | ||||||
| 12 | 0 | 0 | sub Version { $VERSION; } | ||||
| 13 | |||||||
| 14 | 3 | 3 | 5088 | use XML::Parser; | |||
| 0 | |||||||
| 0 | |||||||
| 15 | use URI; | ||||||
| 16 | use LWP::UserAgent; | ||||||
| 17 | |||||||
| 18 | # Define global constants | ||||||
| 19 | use constant TRUE => 1; | ||||||
| 20 | use constant FALSE => 0; | ||||||
| 21 | |||||||
| 22 | |||||||
| 23 | use constant APPC_FOUND_XML_DECL => 1; # http://www.w3.org/TR/xhtml1/#C_1 | ||||||
| 24 | use constant APPC_FOUND_XML_PI => 2; # http://www.w3.org/TR/xhtml1/#C_1 | ||||||
| 25 | use constant APPC_MISSING_SPACE => 3; # http://www.w3.org/TR/xhtml1/#C_2 | ||||||
| 26 | use constant APPC_UNMINIMIZED => 4; # http://www.w3.org/TR/xhtml1/#C_2 | ||||||
| 27 | use constant APPC_MINIMIZED => 5; # http://www.w3.org/TR/xhtml1/#C_3 | ||||||
| 28 | use constant APPC_MANY_ISINDEX => 6; # http://www.w3.org/TR/xhtml1/#C_6 | ||||||
| 29 | use constant APPC_ONLY_LANG => 7; # http://www.w3.org/TR/xhtml1/#C_7 | ||||||
| 30 | use constant APPC_ONLY_XML_LANG => 8; # http://www.w3.org/TR/xhtml1/#C_7 | ||||||
| 31 | use constant APPC_APOS_IN_ATTR => 9; # http://www.w3.org/TR/xhtml1/#C_16 | ||||||
| 32 | use constant APPC_APOS_IN_ELEM => 10; # http://www.w3.org/TR/xhtml1/#C_16 | ||||||
| 33 | |||||||
| 34 | use constant APPC_ERRO => 0; # @@ | ||||||
| 35 | use constant APPC_WARN => 1; # @@ | ||||||
| 36 | use constant APPC_INFO => 2; # @@ | ||||||
| 37 | use constant APPC_HINT => 3; # @@ | ||||||
| 38 | |||||||
| 39 | use constant SEVERITY_NAMES => | ||||||
| 40 | { | ||||||
| 41 | APPC_ERRO, "Error", | ||||||
| 42 | APPC_WARN, "Warning", | ||||||
| 43 | APPC_INFO, "Info", | ||||||
| 44 | APPC_HINT, "Hint", | ||||||
| 45 | }; | ||||||
| 46 | |||||||
| 47 | use constant CRITERIA => | ||||||
| 48 | { | ||||||
| 49 | APPC_FOUND_XML_DECL, [ 1, APPC_INFO, "XML declarations are problematic" ], | ||||||
| 50 | APPC_FOUND_XML_PI, [ 1, APPC_INFO, "XML processing instructions are problematic" ], | ||||||
| 51 | APPC_MISSING_SPACE,     [ 2, APPC_ERRO, " | ||||||
| 52 | APPC_UNMINIMIZED,       [ 2, APPC_ERRO, "For empty elements you shall use | ||||||
| 53 | APPC_MINIMIZED,         [ 3, APPC_ERRO, "For non-empty elements, you shall use | ||||||
| 54 | APPC_ONLY_LANG,         [ 7, APPC_ERRO, " | ||||||
| 55 | APPC_ONLY_XML_LANG,     [ 7, APPC_ERRO, " | ||||||
| 56 | APPC_MANY_ISINDEX,      [10, APPC_WARN, "Avoid more than one | ||||||
| 57 | APPC_APOS_IN_ATTR, [16, APPC_ERRO, "You must write ' as e.g. ' for legacy user agents" ], | ||||||
| 58 | APPC_APOS_IN_ELEM, [16, APPC_ERRO, "You must write ' as e.g. ' for legacy user agents" ], | ||||||
| 59 | }; | ||||||
| 60 | |||||||
| 61 | use constant GUIDELINE_TITLES => | ||||||
| 62 | { | ||||||
| 63 | 1, "Processing Instructions and the XML Declaration", | ||||||
| 64 | 2, "Empty Elements", | ||||||
| 65 | 3, "Element Minimization and Empty Element Content", | ||||||
| 66 | 6, "Isindex", | ||||||
| 67 | 7, "The lang and xml:lang Attributes", | ||||||
| 68 | 16, "The Named Character Reference '", | ||||||
| 69 | }; | ||||||
| 70 | |||||||
| 71 | use constant EMPTY_ELEMENTS => { map { $_ => 1 } | ||||||
| 72 | qw/ | ||||||
| 73 | base basefont link area hr img | ||||||
| 74 | meta param input isindex col br | ||||||
| 75 | / }; | ||||||
| 76 | |||||||
| 77 | # global variables... | ||||||
| 78 | our $ISINDEX = 0; | ||||||
| 79 | our $IS_RELEVANT_DOC = 1; # whether the checker is relevant to the doctype of the document being processed. | ||||||
| 80 | our $IS_RELEVANT_CT = 1; # whether the checker is relevant to the media type of the document being processed. | ||||||
| 81 | our $IS_WF = 1; # whether the document is at least well-formed XML | ||||||
| 82 | our @MESSAGES; | ||||||
| 83 | |||||||
| 84 | |||||||
| 85 | ########################### | ||||||
| 86 | # usual package interface # | ||||||
| 87 | ########################### | ||||||
| 88 | sub new | ||||||
| 89 | { | ||||||
| 90 | my $self = {}; | ||||||
| 91 | my $proto = shift; | ||||||
| 92 | my $class = ref($proto) || $proto; | ||||||
| 93 | bless($self, $class); | ||||||
| 94 | return $self; | ||||||
| 95 | } | ||||||
| 96 | |||||||
| 97 | |||||||
| 98 | |||||||
| 99 | |||||||
| 100 | ## Helper functions ####################################################### | ||||||
| 101 | sub is_empty_element { EMPTY_ELEMENTS->{shift()} } | ||||||
| 102 | sub is_isindex_element { shift eq "isindex" } | ||||||
| 103 | sub is_inside_head { shift->within_element("head") } | ||||||
| 104 | |||||||
| 105 | sub report_problem | ||||||
| 106 | { | ||||||
| 107 | my $exp = shift; | ||||||
| 108 | my $cod = shift; | ||||||
| 109 | my $loc = shift; | ||||||
| 110 | |||||||
| 111 | my $str = $exp->recognized_string; | ||||||
| 112 | my $lin = $exp->current_line; | ||||||
| 113 | my $col = $exp->current_column; | ||||||
| 114 | my $off = $exp->current_byte; | ||||||
| 115 | |||||||
| 116 | # determine position after skipping $loc, e.g. if there is | ||||||
| 117 | # | ||||||
| 118 | # 
 | ||||||
| 119 | # xml:lang = "de" | ||||||
| 120 | # class = "a b c d e f g" | ||||||
| 121 | # id = "example"/> | ||||||
| 122 | # | ||||||
| 123 | # the error is the / and it would be unhelpful to point at | ||||||
| 124 | # the < as expat would do in this case. | ||||||
| 125 | |||||||
| 126 | my $left = substr $str, 0, $loc; | ||||||
| 127 | my $lines = $left =~ y/\n//; # @@ does \n always work? | ||||||
| 128 | $left =~ s/^.*\n//s; # @@ does \n always work? | ||||||
| 129 | my $chars = length $left; | ||||||
| 130 | |||||||
| 131 | # set new positions | ||||||
| 132 | my $posy = $lin + $lines; # advance pointer | ||||||
| 133 | my $posx = $lines ? $chars : $col + $chars; # advance or replace | ||||||
| 134 | my $posxy = $off + $loc; # advance pointer | ||||||
| 135 | |||||||
| 136 | my $stext = SEVERITY_NAMES->{CRITERIA->{$cod}->[1]}; | ||||||
| 137 | my $mtext = CRITERIA->{$cod}->[2]; | ||||||
| 138 | my $cnum = CRITERIA->{$cod}->[0]; | ||||||
| 139 | my $gtitle = GUIDELINE_TITLES->{$cnum}; | ||||||
| 140 | |||||||
| 141 | push @MESSAGES, {severity => $stext, line => $posy, column => $posx + 1, cnum => $cnum, message_text => $mtext, guideline_title => $gtitle} | ||||||
| 142 | |||||||
| 143 | } | ||||||
| 144 | |||||||
| 145 | |||||||
| 146 | ## Pre-Parsing routines ################################################### | ||||||
| 147 | # make sure we are actually handling XHTML 1.0 documents served as text/html | ||||||
| 148 | # some code taken from W3C Markup Validator Codebase | ||||||
| 149 | |||||||
| 150 | sub parse_content_type { | ||||||
| 151 | my $Content_Type = shift; | ||||||
| 152 | my ($ct, @others) = split /\s*;\s*/, lc $Content_Type; | ||||||
| 153 | #print p($ct); | ||||||
| 154 | if ($ct ne "text/html") { | ||||||
| 155 | $IS_RELEVANT_CT = 0; | ||||||
| 156 | } | ||||||
| 157 | return $ct; | ||||||
| 158 | } | ||||||
| 159 | |||||||
| 160 | |||||||
| 161 | ## Handler for XML::Parser ################################################ | ||||||
| 162 | |||||||
| 163 | sub _start | ||||||
| 164 | { | ||||||
| 165 | my $exp = shift; | ||||||
| 166 | my $ele = shift; | ||||||
| 167 | my %att = @_; | ||||||
| 168 | my $str = $exp->recognized_string; | ||||||
| 169 | my $lin = $exp->current_line; | ||||||
| 170 | my $col = $exp->current_column; | ||||||
| 171 | my $off = $exp->current_byte; | ||||||
| 172 | my $end = length($str) - 1; | ||||||
| 173 | |||||||
| 174 | # check for multiple isindex elements | ||||||
| 175 | if (is_isindex_element($ele) and | ||||||
| 176 | is_inside_head($exp) and | ||||||
| 177 | $ISINDEX++) | ||||||
| 178 | { | ||||||
| 179 | report_problem($exp, APPC_MANY_ISINDEX, 0); | ||||||
| 180 | } | ||||||
| 181 | |||||||
| 182 | if ($str =~ m|/>$|) | ||||||
| 183 | { | ||||||
| 184 | # check for preceding space in empty element tag | ||||||
| 185 | if ($str !~ m|[ \x0d\x0a\t]/>$|) | ||||||
| 186 | { | ||||||
| 187 | report_problem($exp, APPC_MISSING_SPACE, $end - 1); | ||||||
| 188 | } | ||||||
| 189 | |||||||
| 190 | # check that empty element tags are used only for | ||||||
| 191 | # elements declared as EMPTY in the DTD | ||||||
| 192 | if (!is_empty_element($ele)) | ||||||
| 193 | { | ||||||
| 194 | report_problem($exp, APPC_MINIMIZED, $end - 1); | ||||||
| 195 | } | ||||||
| 196 | } | ||||||
| 197 | |||||||
| 198 | # check that elements declared as EMPTY use empty element tags | ||||||
| 199 | if (is_empty_element($ele)) | ||||||
| 200 | { | ||||||
| 201 | if ($str !~ m|/>$|) | ||||||
| 202 | { | ||||||
| 203 | report_problem($exp, APPC_UNMINIMIZED, $end); | ||||||
| 204 | } | ||||||
| 205 | } | ||||||
| 206 | |||||||
| 207 | # check for ' in attribute values | ||||||
| 208 | if ($str =~ m|'|) | ||||||
| 209 | { | ||||||
| 210 | local $_ = $str; | ||||||
| 211 | my $len = 0; | ||||||
| 212 | |||||||
| 213 | while(s/^(.*?)'//) | ||||||
| 214 | { | ||||||
| 215 | $len += length $1; | ||||||
| 216 | report_problem($exp, APPC_APOS_IN_ATTR, $len); | ||||||
| 217 | |||||||
| 218 | } | ||||||
| 219 | } | ||||||
| 220 | |||||||
| 221 | # check for ... | ||||||
| 222 | if (exists $att{'lang'} && not exists $att{'xml:lang'}) | ||||||
| 223 | { | ||||||
| 224 | report_problem($exp, APPC_ONLY_LANG, $end); | ||||||
| 225 | } | ||||||
| 226 | |||||||
| 227 | # check for ... | ||||||
| 228 | if (exists $att{'xml:lang'} && not exists $att{'lang'}) | ||||||
| 229 | { | ||||||
| 230 | report_problem($exp, APPC_ONLY_XML_LANG, $end); | ||||||
| 231 | } | ||||||
| 232 | } | ||||||
| 233 | |||||||
| 234 | sub _char | ||||||
| 235 | { | ||||||
| 236 | my $exp = shift; | ||||||
| 237 | my $txt = shift; | ||||||
| 238 | my $str = $exp->recognized_string; | ||||||
| 239 | my $lin = $exp->current_line; | ||||||
| 240 | my $col = $exp->current_column; | ||||||
| 241 | my $off = $exp->current_byte; | ||||||
| 242 | |||||||
| 243 | # check for ' in parsed character data | ||||||
| 244 | if ($str =~ /'/) | ||||||
| 245 | { | ||||||
| 246 | local $_ = $str; | ||||||
| 247 | my $len = 0; | ||||||
| 248 | |||||||
| 249 | while(s/^(.*?)'//) | ||||||
| 250 | { | ||||||
| 251 | $len += length $1; | ||||||
| 252 | report_problem($exp, APPC_APOS_IN_ELEM, $len); | ||||||
| 253 | |||||||
| 254 | } | ||||||
| 255 | } | ||||||
| 256 | } | ||||||
| 257 | |||||||
| 258 | sub _proc | ||||||
| 259 | { | ||||||
| 260 | # check for XML processing instructions | ||||||
| 261 | report_problem(shift, APPC_FOUND_XML_PI, 0); | ||||||
| 262 | } | ||||||
| 263 | |||||||
| 264 | sub _xmldecl | ||||||
| 265 | { | ||||||
| 266 | # check for XML declaration | ||||||
| 267 | report_problem(shift, APPC_FOUND_XML_DECL, 0); | ||||||
| 268 | } | ||||||
| 269 | |||||||
| 270 | sub _doctype | ||||||
| 271 | { | ||||||
| 272 | my $exp = shift; | ||||||
| 273 | my $doctypename = shift; | ||||||
| 274 | my $doctypesys = shift; | ||||||
| 275 | my $doctypepub = shift; | ||||||
| 276 | my $doctypeint = shift; | ||||||
| 277 | if (defined $doctypename) { | ||||||
| 278 | $IS_RELEVANT_DOC = 0 if ($doctypename ne "html"); | ||||||
| 279 | } | ||||||
| 280 | if(defined $doctypesys) { | ||||||
| 281 | $_ = $doctypesys; | ||||||
| 282 | $IS_RELEVANT_DOC = 0 if (not /http:\/\/www.w3.org\/.*\/xhtml.*.dtd/); | ||||||
| 283 | #$IS_RELEVANT_DOC = 0 if (not /http:\/\/www.w3.org\/.*\/xhtml1\/DTD\/xhtml1-(strict|transitional|frameset).dtd/); | ||||||
| 284 | } | ||||||
| 285 | if (defined $doctypepub) { | ||||||
| 286 | $_ = $doctypepub; | ||||||
| 287 | $IS_RELEVANT_DOC = 0 if (not /-\/\/W3C\/\/DTD XHTML .*\/\/EN/); | ||||||
| 288 | # we choose to accept checking any XHTML - could be stricter and only check for XHTML 1.0 | ||||||
| 289 | #$IS_RELEVANT_DOC = 0 if (not /-\/\/W3C\/\/DTD XHTML 1.0 (Strict|Transitional|Frameset)\/\/EN/); | ||||||
| 290 | } | ||||||
| 291 | if (defined $doctypeint) # there should be no internal subset | ||||||
| 292 | { | ||||||
| 293 | $IS_RELEVANT_DOC = 0 if (length $doctypeint); | ||||||
| 294 | } | ||||||
| 295 | $IS_RELEVANT_DOC = 0 if ((not defined $doctypesys) and (not defined $doctypepub)); # should not happen with XHTML 1.0 | ||||||
| 296 | } | ||||||
| 297 | |||||||
| 298 | |||||||
| 299 | |||||||
| 300 | ## Main ################################################################### | ||||||
| 301 | |||||||
| 302 | sub check_uri { | ||||||
| 303 | my $self = shift; | ||||||
| 304 | my $uri = shift; | ||||||
| 305 | my $any_xhtml = 0; # by default, only check XHTML 1.0 docs served as text/html | ||||||
| 306 | my @local_messages; | ||||||
| 307 | if (@_) { | ||||||
| 308 | my @anyxhtmlarry = @_; | ||||||
| 309 | if (int (@anyxhtmlarry) eq 2) | ||||||
| 310 | { | ||||||
| 311 | my $any_xhtml_varname=shift; | ||||||
| 312 | $any_xhtml = shift; | ||||||
| 313 | if ($any_xhtml ne "1") {$any_xhtml = 0} | ||||||
| 314 | } | ||||||
| 315 | } | ||||||
| 316 | |||||||
| 317 | # body... | ||||||
| 318 | my @messages; | ||||||
| 319 | |||||||
| 320 | if (defined $uri and length $uri and URI->new($uri)->scheme eq "http") | ||||||
| 321 | { | ||||||
| 322 | my $ua = LWP::UserAgent->new; | ||||||
| 323 | my $response = $ua->get($uri); | ||||||
| 324 | my $xml = undef; | ||||||
| 325 | my $ct = undef; | ||||||
| 326 | my @content_type_values = undef; | ||||||
| 327 | if ($response->is_success) { | ||||||
| 328 | $xml = $response->content; | ||||||
| 329 | @content_type_values = $response->header('Content-Type'); | ||||||
| 330 | $ct = $content_type_values[0]; | ||||||
| 331 | @messages = $self->check_content($xml); | ||||||
| 332 | } | ||||||
| 333 | if (defined $ct and length $ct) { | ||||||
| 334 | $ct = &parse_content_type($ct); | ||||||
| 335 | } | ||||||
| 336 | if ($IS_RELEVANT_CT eq 0 and $any_xhtml eq 0) { | ||||||
| 337 | push @local_messages, {severity => "Abort", message_text => "not text/html"}; | ||||||
| 338 | return @local_messages; | ||||||
| 339 | } | ||||||
| 340 | } | ||||||
| 341 | else { | ||||||
| 342 | push @local_messages, {severity => "Abort", message_text => "Bad URI"}; | ||||||
| 343 | return @local_messages; | ||||||
| 344 | } | ||||||
| 345 | return @messages; | ||||||
| 346 | } | ||||||
| 347 | |||||||
| 348 | sub check_content { | ||||||
| 349 | my $self = shift; | ||||||
| 350 | my $xml = shift; | ||||||
| 351 | my $any_xhtml = 0; # by default, only check XHTML 1.0 docs | ||||||
| 352 | my @local_messages; | ||||||
| 353 | |||||||
| 354 | if (@_) { | ||||||
| 355 | my @anyxhtmlarry = @_; | ||||||
| 356 | if (int (@anyxhtmlarry) eq 2) | ||||||
| 357 | { | ||||||
| 358 | my $any_xhtml_varname=shift; | ||||||
| 359 | $any_xhtml = shift; | ||||||
| 360 | if ($any_xhtml ne "1") {$any_xhtml = 0} | ||||||
| 361 | } | ||||||
| 362 | } | ||||||
| 363 | |||||||
| 364 | if (defined $xml and length $xml) | ||||||
| 365 | { | ||||||
| 366 | my $p = XML::Parser->new; | ||||||
| 367 | $p->setHandlers(Doctype => \&_doctype); | ||||||
| 368 | |||||||
| 369 | eval { $p->parsestring($xml); }; | ||||||
| 370 | #$output->param(is_relevant_ct => $IS_RELEVANT_CT); | ||||||
| 371 | #$output->param(is_relevant_doctype => $IS_RELEVANT_DOC); | ||||||
| 372 | |||||||
| 373 | if ($@) # not well-formed | ||||||
| 374 | { | ||||||
| 375 | $IS_WF = 0; | ||||||
| 376 | my $wf_errors = join '', $@; | ||||||
| 377 | push @local_messages, {severity => "Abort", message_text => "Content is not well-formed XML"}; | ||||||
| 378 | return @local_messages; | ||||||
| 379 | #$output->param(info_count => 1); | ||||||
| 380 | #$output->param(wf_errors => $wf_errors); | ||||||
| 381 | } | ||||||
| 382 | elsif (not $IS_RELEVANT_DOC) | ||||||
| 383 | { | ||||||
| 384 | if ($any_xhtml){ | ||||||
| 385 | push @local_messages, {severity => "Abort", message_text => "Content is not XHTML"}; | ||||||
| 386 | } | ||||||
| 387 | else { | ||||||
| 388 | push @local_messages, {severity => "Abort", message_text => "Content is not XHTML 1.0"}; | ||||||
| 389 | |||||||
| 390 | } | ||||||
| 391 | return @local_messages; | ||||||
| 392 | |||||||
| 393 | } | ||||||
| 394 | else # woot, Well-formed, and relevant. Let's get to work. | ||||||
| 395 | { | ||||||
| 396 | my $p = XML::Parser->new; | ||||||
| 397 | $p->setHandlers(Char => \&_char, | ||||||
| 398 | Proc => \&_proc, | ||||||
| 399 | Start => \&_start, | ||||||
| 400 | XMLDecl => \&_xmldecl); | ||||||
| 401 | eval { $p->parsestring($xml); }; | ||||||
| 402 | return @MESSAGES; | ||||||
| 403 | } | ||||||
| 404 | } | ||||||
| 405 | else { | ||||||
| 406 | return -1; | ||||||
| 407 | } | ||||||
| 408 | } | ||||||
| 409 | |||||||
| 410 | |||||||
| 411 | package W3C::XHTML::HTMLCompatChecker; | ||||||
| 412 | 1; | ||||||
| 413 | |||||||
| 414 | |||||||
| 415 | __END__ |