File Coverage

blib/lib/Test/XHTML/Valid.pm

Criterion	Covered	Total	%
statement	30	30	100.0
branch			n/a
condition			n/a
subroutine	10	10	100.0
pod			n/a
total	40	40	100.0

line	stmt	sub	time	code
1				package Test::XHTML::Valid;
2
3	8	8	24382	use strict;
	8		17
	8		308
4	8	8	44	use warnings;
	8		15
	8		293
5
6	8	8	40	use vars qw($VERSION);
	8		14
	8		576
7				$VERSION = '0.12';
8
9				#----------------------------------------------------------------------------
10
11				=head1 NAME
12
13				Test::XHTML::Valid - test web page DTD validation.
14
15				=head1 SYNOPSIS
16
17				my $txv = Test::XHTML::Valid->new();
18
19				$txv->ignore_list(@IGNORE); # patterns to ignore
20
21				# dynamic pages
22				$txv->process_root($opt{url}); # test all pages beneath root
23				$txv->process_link($opt{link}); # test single link
24				$txv->process_url_list($opt{ulist}); # test list of links
25
26				# static pages
27				$txv->process_path($opt{path}); # test all files within path
28				$txv->process_file($opt{file}); # test single file
29				$txv->process_file_list($opt{flist}); # test list of files
30
31				# XML strings
32				$txv->process_xml($xml); # test XML as a string
33
34				$txv->process_retries(); # retest any network failures
35				my $results = $txv->process_results();
36
37				$txv->content(); # for further testing of single page content
38				$txv->errors(); # all current errors reported
39				$txv->clear(); # clear all current errors and results
40
41				$txv->retrieve_url($url); # retrieve URL, no testing required
42				$txv->retrieve_file($file); # retrieve file, no testing required
43
44				$txv->logfile($file); # logfile for verbose messages
45				$txv->logclean(1); # 1 = create/overwrite, 0 = append (default)
46
47				=head1 DESCRIPTION
48
49				Using either URLs or flat files, this module attempts to validate web pages
50				according to the DTD schema specified within each page.
51
52				=cut
53
54				# -------------------------------------
55				# Library Modules
56
57	8	8	40	use base qw(Class::Accessor::Fast);
	8		15
	8		7564
58
59	8	8	32760	use File::Basename;
	8		19
	8		986
60	8	8	8579	use File::Find::Rule;
	8		77112
	8		74
61	8	8	471	use File::Path;
	8		18
	8		600
62	8	8	1026	use IO::File;
	8		11439
	8		1423
63	8	8	11314	use WWW::Mechanize;
	8		1628366
	8		704
64	8	8	23137	use XML::Catalogs::HTML -libxml; # load DTDs and ENTs locally
	8		122117
	8		101
65				use XML::LibXML '1.70';
66
67				# -------------------------------------
68				# Variables
69
70				my @IGNORE = (
71				qr/^mailto/,
72				qr/\.(xml\|txt\|pdf\|doc\|odt\|odp\|ods)$/i,
73				qr/\.(tgz\|gz\|bz2\|rar\|zip)$/i,
74				qr/\.(mp4\|avi\|wmv)$/i,
75				qr/\.(jpg\|bmp\|gif\|png)$/i,
76				);
77
78				my @RESULTS = qw( PAGES PASS FAIL NET );
79
80				# -------------------------------------
81				# Singletons
82
83				my $parser = XML::LibXML->new;
84				$parser->validation(1);
85
86				# -------------------------------------
87				# Public Methods
88
89				sub new {
90				my $proto = shift; # get the class name
91				my $class = ref($proto) \|\| $proto;
92
93				# private data
94				my %hash = @_;
95				my $self = {};
96				$self->{RESULTS}{$_} = 0 for(@RESULTS);
97				push @{ $self->{IGNORE} }, @IGNORE;
98
99				# store access to a Mechanize object
100				$self->{mech} = $hash{mech} \|\| WWW::Mechanize->new();
101
102				bless ($self, $class);
103				return $self;
104				}
105
106				sub DESTROY {
107				my $self = shift;
108				}
109
110				__PACKAGE__->mk_accessors(qw( logfile logclean ));
111
112				sub ignore_list { _ignore_list(@_); }
113
114				sub retrieve_url { _retrieve_url(@_); }
115				sub retrieve_file { _retrieve_file(@_); }
116
117				sub process_root { _process_root(@_); }
118				sub process_link { _process_link(@_); }
119				sub process_url_list { _process_ulist(@_); }
120
121				sub process_path { _process_path(@_); }
122				sub process_file { _process_file(@_); }
123				sub process_file_list { _process_flist(@_); }
124
125				sub process_xml { _process_xml(@_); }
126
127				sub process_retries { _process_retries(@_); }
128				sub process_results { _process_results(@_); }
129
130				sub content { my $self = shift; return $self->{CONTENT}; }
131				sub errors { my $self = shift; return $self->{ERRORS}; }
132				sub clear { my $self = shift; $self->{ERRORS} = undef; $self->_reset_results(); }
133				sub errstr { my $self = shift; return $self->_print_errors(); }
134
135				# -------------------------------------
136				# Private Methods
137
138				# single dynamic root, no additional processing required
139				sub _retrieve_url {
140				my $self = shift;
141				my $url = shift \|\| return;
142				$self->{ROOT} = $url;
143				$self->{mech}->get( $url );
144				if($self->{mech}->success()) {
145				$self->{CONTENT} = $self->{mech}->content;
146				} else {
147				$self->{CONTENT} = undef;
148				}
149				}
150
151				# single dynamic root
152				sub _process_root {
153				my $self = shift;
154				my $url = shift \|\| return;
155				$self->{ROOT} = $url;
156				$self->_process_pages($url);
157				}
158
159				# single dynamic page
160				sub _process_link {
161				my $self = shift;
162				my $link = shift;
163				$self->_process_page(type => 'url', page => $link);
164				}
165
166				# list of dynamic pages
167				sub _process_ulist {
168				my $self = shift;
169				my $file = shift;
170				my $fh = IO::File->new($file,'r') or die "Cannot open file [$file]: $!\n";
171				while(<$fh>) {
172				next if(/^\s*$/ \|\| /^\#/);
173				chomp;
174				$self->_process_page(type => 'url', page => $_);
175				}
176				$fh->close;
177				}
178
179				# single static page, no additional processing required
180				sub _retrieve_file {
181				my $self = shift;
182				my $file = shift \|\| return;
183				$self->{CONTENT} = undef;
184				my $fh = IO::File->new($file,'r') or die "Cannot open file [$file]: $!\n";
185				while(<$fh>) {
186				$self->{CONTENT} .= $_;
187				}
188				$fh->close;
189				}
190
191				# static pages
192				sub _process_flist {
193				my $self = shift;
194				my $file = shift;
195				my $fh = IO::File->new($file,'r') or die "Cannot open file [$file]: $!\n";
196				while(<$fh>) {
197				next if(/^\s*$/ \|\| /^\#/);
198				chomp;
199				$self->_process_page(type => 'file', page => $_);
200				}
201				$fh->close;
202				}
203
204				sub _process_file {
205				my $self = shift;
206				my $file = shift;
207				$self->_process_page(type => 'file', page => $file);
208				}
209
210				sub _process_path {
211				my $self = shift;
212				my $path = shift;
213				my @files = File::Find::Rule->file()->name(qr/\.html?/)->in($path);
214				$self->_process_page(type => 'file', page => $_) for(@files);
215				}
216
217				sub _process_xml {
218				my $self = shift;
219				my $text = shift;
220				$self->_process_page(type => 'xml', content => $text);
221				}
222
223				sub _process_results {
224				my $self = shift;
225				my %results = map {$_ => $self->{RESULTS}{$_}} @RESULTS;
226				$self->_log( sprintf "%8s%d\n", "$_:", $results{$_} ) for(@RESULTS);
227				return \%results;
228				}
229
230				sub _reset_results {
231				my $self = shift;
232				$self->{RESULTS}{$_} = 0 for(@RESULTS);
233				}
234
235				sub _print_errors {
236				my $self = shift;
237				my $str = "\nErrors:\n" ;
238				my $i = 1;
239				for my $error (@{$self->{ERRORS}}) {
240				$str .= "$i. $error->{message}\n";
241				$i++;
242				}
243				return $str;
244				}
245
246				# -------------------------------------
247				# Subroutines
248
249				sub _process_pages {
250				my $self = shift;
251				my $url = shift;
252				my (@links,%seen);
253
254				push @links, $url;
255				while(@links) {
256				my $page = shift @links;
257				next if($seen{$page});
258				$self->{mech}->get( $page );
259				unless($self->{mech}->success()) {
260				push @{ $self->{RETRIES} }, {type => 'url', page => $page};
261				next;
262				}
263
264				$seen{$page} = 1;
265				my @hrefs = map {$_->url_abs()} $self->{mech}->links();
266				for my $href (reverse sort @hrefs) {
267				next if($seen{$href});
268				next if($self->_ignore($href));
269				unshift @links, $href;
270				}
271
272				$self->_process_page(type => 'url', page => $page, content => $self->{mech}->content);
273				}
274				}
275
276				sub _process_page {
277				my $self = shift;
278				my %page = @_;
279				$self->{RESULTS}{PAGES}++;
280
281				unless($page{type} && $page{type} =~ /^(file\|url\|xml)$/) {
282				$self->{CONTENT} = undef;
283				die "Unknown format type: $page{type}\n";
284				}
285
286				sleep(1);
287				if($page{type} =~ /^(file\|url)$/) { $self->_log( "Parsing $page{page}: " ); }
288				elsif($page{type} eq 'xml') { $self->_log( "Parsing XML string: " ); }
289
290				if($page{content}) {
291				$self->{CONTENT} = $page{content};
292				} else {
293
294				if($page{type} eq 'file') {
295				$self->_retrieve_file($page{page});
296				$page{content} = $self->{CONTENT};
297
298				} elsif($page{type} eq 'url') {
299				eval { $self->{mech}->get( $page{page} ) };
300				if($@) {
301				push @{ $self->{ERRORS} }, {page => $page{page}, error => $@, message => _parse_message($@)};
302				$self->_log( "FAIL\n$@\n" );
303				$self->{RESULTS}{FAIL}++;
304				return;
305				}
306
307				unless($self->{mech}->success()) {
308				$self->{CONTENT} = undef;
309				push @{ $self->{RETRIES} }, {type => 'url', page => $page{page}};
310				return;
311				} else {
312				$page{content} = $self->{mech}->content;
313				$self->{CONTENT} = $page{content};
314				}
315				} elsif($page{type} eq 'xml') {
316				die "no content provided\n";
317				}
318				}
319
320				eval {
321				$parser->parse_string( $page{content} );
322				};
323
324				# XML::LibXML doesn't explain failures to access the external DTD, so
325				# these lines are a reference to that fact.
326				# See also - http://use.perl.org/~perigrin/journal/31137
327				if($@ =~ /Operation in progress/i \|\|
328				$@ =~ /validity error : No declaration for attribute xmlns of element (html\|body\|div)/) {
329				$self->_log( "RETRY\n" );
330				push @{ $self->{RETRIES} }, {type => $page{type}, page => $page{page}};
331				$self->{RETRIES}->[-1]{content} = $page{content} if($page{type} eq 'xml');
332				} elsif($@) {
333				push @{ $self->{ERRORS} }, {page => $page{page}, error => $@, content => $page{content}, message => _parse_message($@)};
334				$self->_log( "FAIL\n$@\n" );
335				$self->{RESULTS}{FAIL}++;
336				} else {
337				$self->_log( "PASS\n" );
338				$self->{RESULTS}{PASS}++;
339				}
340				}
341
342				sub _process_retries {
343				my $self = shift;
344				return unless($self->{RETRIES});
345
346				for my $page (sort @{ $self->{RETRIES} }) {
347				sleep(1);
348
349				if($page->{type} eq 'file') {
350				$self->_log( "Parsing $page->{page}: " );
351				eval {
352				$parser->parse_file($page->{page});
353				};
354				} elsif($page->{type} eq 'url') {
355				$self->_log( "Parsing $page->{page}: " );
356				$self->{mech}->get( $page->{page} );
357				unless($self->{mech}->success()) {
358				$self->_log( "NET FAILURE\n" );
359				$self->{RESULTS}{NET}++;
360				next;
361				}
362				$page->{content} = $self->{mech}->{content};
363				eval {
364				$parser->parse_string($page->{content});
365				};
366				} elsif($page->{type} eq 'xml') {
367				$self->_log( "Parsing XML string: " );
368				eval {
369				$parser->parse_string( $page->{content} );
370				};
371				} else {
372				die "Unknown format type: $page->{type}\n";
373				}
374
375				# XML::LibXML doesn't explain failures to access the external DTD, so
376				# these lines are a reference to that fact.
377				# See also - http://use.perl.org/~perigrin/journal/31137
378				if($@ =~ /Operation in progress/i \|\|
379				$@ =~ /validity error : No declaration for attribute xmlns of element (html\|body\|div)/) {
380				$self->_log( "NET FAILURE\n" );
381				$self->{RESULTS}{NET}++;
382				} elsif($@) {
383				push @{ $self->{ERRORS} }, {page => $page->{page}, error => $@, content => $page->{content}, message => _parse_message($@)};
384				$self->_log( "FAIL\n$@\n" );
385				$self->{RESULTS}{FAIL}++;
386				} else {
387				$self->_log( "PASS\n" );
388				$self->{RESULTS}{PASS}++;
389				}
390				}
391				}
392
393				sub _parse_message {
394				my $e = shift;
395
396				return $e unless($e && ref($e));
397				while (defined $e->{_prev}) { $e = $e->{_prev} };
398				#return $e->{message};
399				return "[$e->{line}:$e->{column}] $e->{message}";
400				}
401
402				sub _ignore {
403				my $self = shift;
404				my $url = shift or return 1; # ignore blank URLs
405
406				for my $ignore (@{$self->{IGNORE}}) {
407				return 1 if($url =~ $ignore);
408				}
409
410				# no non-http or external links
411				return 1 if($url !~ /^http/);
412				return 1 if($url =~ /^http/ && $url !~ /^$self->{ROOT}/);
413
414				# ignore revisiting the base
415				return 1 if(index("$self->{ROOT}",$url) == 0);
416
417				return 0;
418				}
419
420				sub _ignore_list {
421				my $self = shift;
422				push @{ $self->{IGNORE} }, @_;
423				}
424
425				sub _log {
426				my $self = shift;
427				my $log = $self->logfile or return;
428				mkpath(dirname($log)) unless(-f $log);
429
430				my $mode = $self->logclean ? 'w+' : 'a+';
431				$self->logclean(0);
432
433				my $fh = IO::File->new($log,$mode) or die "Cannot write to log file [$log]: $!\n";
434				print $fh @_;
435				$fh->close;
436				}
437
438				1;
439
440				__END__