line
stmt
bran
cond
sub
pod
time
code
1
# Copyright (c) 2005 the World Wide Web Consortium :
2
# Keio University,
3
# European Research Consortium for Informatics and Mathematics
4
# Massachusetts Institute of Technology.
5
# written by olivier Thereaux for W3C
6
#
7
# $Id: LinkChecker.pm,v 1.7 2006/01/18 04:35:35 ot Exp $
8
9
package W3C::LogValidator::LinkChecker;
10
1
1
620
use strict;
1
2
1
28
11
1
1
5
use warnings;
1
2
1
28
12
1
1
6
use Config;
1
1
1
867
13
14
15
require Exporter;
16
our @ISA = qw(Exporter);
17
our %EXPORT_TAGS = ( 'all' => [ qw() ] );
18
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
19
our @EXPORT = qw();
20
our $VERSION = sprintf "%d.%03d",q$Revision: 1.7 $ =~ /(\d+)\.(\d+)/;
21
22
23
###########################
24
# usual package interface #
25
###########################
26
our $verbose = 1;
27
our %config;
28
29
sub new
30
{
31
0
0
my $self = {};
32
0
my $proto = shift;
33
0
0
my $class = ref($proto) || $proto;
34
# mandatory vars for the API
35
0
@{$self->{URIs}} = undef;
0
36
# internal stuff here
37
# $self->{FOO} = undef;
38
39
# don't change this
40
0
0
if (@_) {%config = %{(shift)};}
0
0
41
0
0
if (exists $config{verbose}) {$verbose = $config{verbose}}
0
42
0
bless($self, $class);
43
0
return $self;
44
}
45
46
47
sub uris
48
{
49
0
0
my $self = shift;
50
0
0
if (@_) { @{$self->{URIs}} = @_ }
0
0
51
0
return @{$self->{URIs}};
0
52
}
53
54
55
# internal routines
56
#sub foobar
57
#{
58
# my $self = shift;
59
# ...
60
#}
61
62
63
sub path_checklink
64
{
65
0
0
my $self = shift;
66
0
my $cl_path;
67
68
0
my $found = 0;
69
0
0
if (exists $config{checklink}){
70
0
$cl_path = $config{checklink};
71
72
0
0
0
if ( (-e $cl_path) && (-r $cl_path) && (-x $cl_path)) {
0
73
0
$found = 1;
74
0
return $cl_path;
75
}
76
}
77
0
0
if ($found == 0) {
78
0
foreach ("$Config{scriptdirexp}/checklink", "$Config{binexp}/checklink",
79
'/usr/bin/checklink', '/bin/checklink', '/usr/local/bin/checklink', './checklink'){
80
0
$cl_path = $_;
81
0
0
print "looking for checklink at: $cl_path..." if ($verbose >1);
82
83
0
0
0
if ((-e $cl_path) && (-r $cl_path) && (-x $cl_path)) {
0
84
0
$found = 1;
85
0
0
print "found!\n" if ($verbose >1);
86
0
return $cl_path;
87
}
88
else {
89
0
0
print "\n" if ($verbose >1);
90
}
91
}
92
}
93
0
0
if ($found == 0) { die("checklink not found") }
0
94
}
95
96
#########################################
97
# Actual subroutine to check the list of uris #
98
#########################################
99
100
101
sub process_list
102
{
103
my $self = shift;
104
my $max_invalid = undef;
105
if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
106
else {$max_invalid = 0}
107
my $max_documents = undef;
108
if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
109
else {$max_documents = 0}
110
print "Now Using the Link Checker module :\n" if $verbose;
111
my $name = "";
112
if (exists $config{ServerName}) {$name = $config{ServerName}}
113
114
my @uris = undef;
115
my $uri;
116
my $checklink;
117
$checklink = $self->path_checklink();
118
119
my %hits;
120
# Opening the file with the hits and URIs data
121
if (defined ($config{tmpfile}))
122
{
123
1
1
1510
use DB_File;
0
0
124
my $tmp_file = $config{tmpfile};
125
tie (%hits, 'DB_File', "$tmp_file", O_RDONLY) ||
126
die ("Cannot create or open $tmp_file");
127
@uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
128
}
129
elsif ($self->uris())
130
{
131
@uris = $self->uris();
132
foreach my $uri (@uris) { $hits{$uri} = 0 }
133
}
134
135
print "\n (This may take a long time if you have many files to validate)\n" if ($verbose eq 1);
136
print "\n" if ($verbose > 2); # trying to breathe in the debug volume...
137
138
# require W3C::LinkChecker; # TODO when the link checker is nicely modularized
139
my @result;
140
my @result_head;
141
142
push @result_head, "Rank";
143
push @result_head, "Hits";
144
push @result_head, "#Error(s)";
145
push @result_head, "Address";
146
my $total_census = 0;
147
my $invalid_census = 0;
148
my $last_invalid_position = 0;
149
150
while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
151
{
152
$uri = shift (@uris);
153
# $self->new_doc();
154
my $uri_orig = $uri;
155
$total_census++;
156
print " processing #$total_census $uri..." if ($verbose > 1);
157
158
# FIXME at some point we will use the library instead of running the script
159
#open(LINK, "$checklink $uri 2>/dev/null |");
160
open LINK, "-|" or do {
161
require File::Spec;
162
open STDERR, "> " . File::Spec->devnull or die $!;
163
exec $checklink, $uri;
164
die "Can't execute $checklink: $!";
165
};
166
my $num_errs = 0;
167
print "\n" if ($verbose > 2);
168
while ( ) {
169
my $line = $_;
170
171
if (($line =~ /To do: The link is broken/) or ($line =~ /To do: There are broken fragments/) or ($line =~ /To do: The hostname could not be resolved. This link needs to be fixed/)){
172
$num_errs += 1;
173
print $line if ($verbose > 2);
174
}
175
176
}
177
print " " if ($verbose > 2);
178
179
if ($num_errs > 0) {
180
print " $num_errs broken link(s)\n" if ($verbose > 1);
181
my @result_tmp;
182
push @result_tmp, $total_census;
183
push @result_tmp, $hits{$uri_orig};
184
push @result_tmp, $num_errs;
185
push @result_tmp, $uri_orig;
186
push @result, [@result_tmp];
187
$invalid_census++;
188
$last_invalid_position = $total_census;
189
}
190
else {
191
print " OK.\n" if ($verbose > 1);
192
}
193
194
195
}
196
197
print "Done!\n" if $verbose;
198
199
200
201
print "invalid_census $invalid_census \n" if ($verbose > 2 );
202
my $intro = "Here are the most popular document(s) with broken links \nthat I could find in the logs for $name.";
203
my $outro;
204
if ($invalid_census) # we found invalid docs
205
{
206
if ($invalid_census eq 1) # let's repect grammar here
207
{
208
$intro=~ s/are/is/;
209
$intro=~ s/ //;
210
$intro=~ s/document\(s\)/document/;
211
}
212
$intro =~s//$invalid_census/;
213
my $ratio = 10000*$invalid_census/$total_census;
214
$ratio = int($ratio)/100;
215
if ($last_invalid_position eq $total_census )
216
# usual case
217
{
218
$outro="Conclusion :
219
I had to check $last_invalid_position document(s) in order to find $invalid_census HTML documents with broken links.
220
This means that about $ratio\% of your most popular documents needs fixing.";
221
}
222
else
223
# we didn't find as many invalid docs as requested
224
{
225
if ($max_invalid) {
226
227
$outro= "Conclusion :
228
You asked for $max_invalid document with broken links but I could only find $invalid_census
229
by processing (all the) $total_census document(s) in your logs.
230
This means that about $ratio\% of your most popular documents needs fixing.";}
231
else # max_invalid set to 0, user asked for all invalid docs
232
{ $outro= "Conclusion :
233
I found $invalid_census documents with broken links
234
by processing (all the) $total_census document(s) in your logs.
235
This means that about $ratio\% of your most popular documents needs fixing.";}
236
}
237
}
238
elsif (!$total_census)
239
{
240
$intro="There was nothing to check in this log.";
241
$outro="";
242
}
243
else # everything was actually OK!
244
{
245
$intro=~s/ //;
246
$outro="I couldn't find any document with broken links in this log. Congratulations!";
247
}
248
if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
249
{
250
$outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?";
251
}
252
253
if (defined ($config{tmpfile}))
254
{
255
untie %hits;
256
}
257
# Here is what the module will return. The hash will be sent to
258
# the output module
259
260
my %returnhash;
261
# the name of the module
262
$returnhash{"name"}="Link Checker";
263
#intro
264
$returnhash{"intro"}=$intro;
265
#Headers for the result table
266
@{$returnhash{"thead"}}=@result_head;
267
# data for the results table
268
@{$returnhash{"trows"}}= @result;
269
#outro
270
$returnhash{"outro"}=$outro;
271
return %returnhash;
272
}
273
274
package W3C::LogValidator::LinkChecker;
275
276
1;
277
278
__END__