File Coverage

blib/lib/W3C/LogValidator/LinkChecker.pm
Criterion Covered Total %
statement 10 49 20.4
branch 0 22 0.0
condition 0 15 0.0
subroutine 4 7 57.1
pod n/a
total 14 93 15.0


line stmt bran cond sub pod time code
1             # Copyright (c) 2005 the World Wide Web Consortium :
2             # Keio University,
3             # European Research Consortium for Informatics and Mathematics
4             # Massachusetts Institute of Technology.
5             # written by olivier Thereaux for W3C
6             #
7             # $Id: LinkChecker.pm,v 1.7 2006/01/18 04:35:35 ot Exp $
8              
9             package W3C::LogValidator::LinkChecker;
10 1     1   620 use strict;
  1         2  
  1         28  
11 1     1   5 use warnings;
  1         2  
  1         28  
12 1     1   6 use Config;
  1         1  
  1         867  
13              
14              
15             require Exporter;
16             our @ISA = qw(Exporter);
17             our %EXPORT_TAGS = ( 'all' => [ qw() ] );
18             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
19             our @EXPORT = qw();
20             our $VERSION = sprintf "%d.%03d",q$Revision: 1.7 $ =~ /(\d+)\.(\d+)/;
21              
22              
23             ###########################
24             # usual package interface #
25             ###########################
26             our $verbose = 1;
27             our %config;
28              
29             sub new
30             {
31 0     0     my $self = {};
32 0           my $proto = shift;
33 0   0       my $class = ref($proto) || $proto;
34             # mandatory vars for the API
35 0           @{$self->{URIs}} = undef;
  0            
36             # internal stuff here
37             # $self->{FOO} = undef;
38              
39             # don't change this
40 0 0         if (@_) {%config = %{(shift)};}
  0            
  0            
41 0 0         if (exists $config{verbose}) {$verbose = $config{verbose}}
  0            
42 0           bless($self, $class);
43 0           return $self;
44             }
45              
46              
47             sub uris
48             {
49 0     0     my $self = shift;
50 0 0         if (@_) { @{$self->{URIs}} = @_ }
  0            
  0            
51 0           return @{$self->{URIs}};
  0            
52             }
53              
54              
55             # internal routines
56             #sub foobar
57             #{
58             # my $self = shift;
59             # ...
60             #}
61              
62              
63             sub path_checklink
64             {
65 0     0     my $self = shift;
66 0           my $cl_path;
67              
68 0           my $found = 0;
69 0 0         if (exists $config{checklink}){
70 0           $cl_path = $config{checklink};
71            
72 0 0 0       if ( (-e $cl_path) && (-r $cl_path) && (-x $cl_path)) {
      0        
73 0           $found = 1;
74 0           return $cl_path;
75             }
76             }
77 0 0         if ($found == 0) {
78 0           foreach ("$Config{scriptdirexp}/checklink", "$Config{binexp}/checklink",
79             '/usr/bin/checklink', '/bin/checklink', '/usr/local/bin/checklink', './checklink'){
80 0           $cl_path = $_;
81 0 0         print "looking for checklink at: $cl_path..." if ($verbose >1);
82            
83 0 0 0       if ((-e $cl_path) && (-r $cl_path) && (-x $cl_path)) {
      0        
84 0           $found = 1;
85 0 0         print "found!\n" if ($verbose >1);
86 0           return $cl_path;
87             }
88             else {
89 0 0         print "\n" if ($verbose >1);
90             }
91             }
92             }
93 0 0         if ($found == 0) { die("checklink not found") }
  0            
94             }
95              
96             #########################################
97             # Actual subroutine to check the list of uris #
98             #########################################
99              
100              
101             sub process_list
102             {
103             my $self = shift;
104             my $max_invalid = undef;
105             if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
106             else {$max_invalid = 0}
107             my $max_documents = undef;
108             if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
109             else {$max_documents = 0}
110             print "Now Using the Link Checker module :\n" if $verbose;
111             my $name = "";
112             if (exists $config{ServerName}) {$name = $config{ServerName}}
113            
114             my @uris = undef;
115             my $uri;
116             my $checklink;
117             $checklink = $self->path_checklink();
118              
119             my %hits;
120             # Opening the file with the hits and URIs data
121             if (defined ($config{tmpfile}))
122             {
123 1     1   1510 use DB_File;
  0            
  0            
124             my $tmp_file = $config{tmpfile};
125             tie (%hits, 'DB_File', "$tmp_file", O_RDONLY) ||
126             die ("Cannot create or open $tmp_file");
127             @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
128             }
129             elsif ($self->uris())
130             {
131             @uris = $self->uris();
132             foreach my $uri (@uris) { $hits{$uri} = 0 }
133             }
134              
135             print "\n (This may take a long time if you have many files to validate)\n" if ($verbose eq 1);
136             print "\n" if ($verbose > 2); # trying to breathe in the debug volume...
137              
138             # require W3C::LinkChecker; # TODO when the link checker is nicely modularized
139             my @result;
140             my @result_head;
141              
142             push @result_head, "Rank";
143             push @result_head, "Hits";
144             push @result_head, "#Error(s)";
145             push @result_head, "Address";
146             my $total_census = 0;
147             my $invalid_census = 0;
148             my $last_invalid_position = 0;
149            
150             while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
151             {
152             $uri = shift (@uris);
153             # $self->new_doc();
154             my $uri_orig = $uri;
155             $total_census++;
156             print " processing #$total_census $uri..." if ($verbose > 1);
157              
158             # FIXME at some point we will use the library instead of running the script
159             #open(LINK, "$checklink $uri 2>/dev/null |");
160             open LINK, "-|" or do {
161             require File::Spec;
162             open STDERR, "> " . File::Spec->devnull or die $!;
163             exec $checklink, $uri;
164             die "Can't execute $checklink: $!";
165             };
166             my $num_errs = 0;
167             print "\n" if ($verbose > 2);
168             while () {
169             my $line = $_;
170            
171             if (($line =~ /To do: The link is broken/) or ($line =~ /To do: There are broken fragments/) or ($line =~ /To do: The hostname could not be resolved. This link needs to be fixed/)){
172             $num_errs += 1;
173             print $line if ($verbose > 2);
174             }
175            
176             }
177             print " " if ($verbose > 2);
178              
179             if ($num_errs > 0) {
180             print " $num_errs broken link(s)\n" if ($verbose > 1);
181             my @result_tmp;
182             push @result_tmp, $total_census;
183             push @result_tmp, $hits{$uri_orig};
184             push @result_tmp, $num_errs;
185             push @result_tmp, $uri_orig;
186             push @result, [@result_tmp];
187             $invalid_census++;
188             $last_invalid_position = $total_census;
189             }
190             else {
191             print " OK.\n" if ($verbose > 1);
192             }
193              
194              
195             }
196            
197             print "Done!\n" if $verbose;
198              
199              
200              
201             print "invalid_census $invalid_census \n" if ($verbose > 2 );
202             my $intro = "Here are the most popular document(s) with broken links \nthat I could find in the logs for $name.";
203             my $outro;
204             if ($invalid_census) # we found invalid docs
205             {
206             if ($invalid_census eq 1) # let's repect grammar here
207             {
208             $intro=~ s/are/is/;
209             $intro=~ s/ //;
210             $intro=~ s/document\(s\)/document/;
211             }
212             $intro =~s//$invalid_census/;
213             my $ratio = 10000*$invalid_census/$total_census;
214             $ratio = int($ratio)/100;
215             if ($last_invalid_position eq $total_census )
216             # usual case
217             {
218             $outro="Conclusion :
219             I had to check $last_invalid_position document(s) in order to find $invalid_census HTML documents with broken links.
220             This means that about $ratio\% of your most popular documents needs fixing.";
221             }
222             else
223             # we didn't find as many invalid docs as requested
224             {
225             if ($max_invalid) {
226              
227             $outro= "Conclusion :
228             You asked for $max_invalid document with broken links but I could only find $invalid_census
229             by processing (all the) $total_census document(s) in your logs.
230             This means that about $ratio\% of your most popular documents needs fixing.";}
231             else # max_invalid set to 0, user asked for all invalid docs
232             { $outro= "Conclusion :
233             I found $invalid_census documents with broken links
234             by processing (all the) $total_census document(s) in your logs.
235             This means that about $ratio\% of your most popular documents needs fixing.";}
236             }
237             }
238             elsif (!$total_census)
239             {
240             $intro="There was nothing to check in this log.";
241             $outro="";
242             }
243             else # everything was actually OK!
244             {
245             $intro=~s/ //;
246             $outro="I couldn't find any document with broken links in this log. Congratulations!";
247             }
248             if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
249             {
250             $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?";
251             }
252              
253             if (defined ($config{tmpfile}))
254             {
255             untie %hits;
256             }
257             # Here is what the module will return. The hash will be sent to
258             # the output module
259              
260             my %returnhash;
261             # the name of the module
262             $returnhash{"name"}="Link Checker";
263             #intro
264             $returnhash{"intro"}=$intro;
265             #Headers for the result table
266             @{$returnhash{"thead"}}=@result_head;
267             # data for the results table
268             @{$returnhash{"trows"}}= @result;
269             #outro
270             $returnhash{"outro"}=$outro;
271             return %returnhash;
272             }
273              
274             package W3C::LogValidator::LinkChecker;
275              
276             1;
277              
278             __END__