| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # Copyright (c) 2002-2005 the World Wide Web Consortium : | 
| 2 |  |  |  |  |  |  | #       Keio University, | 
| 3 |  |  |  |  |  |  | #       European Research Consortium for Informatics and Mathematics | 
| 4 |  |  |  |  |  |  | #       Massachusetts Institute of Technology. | 
| 5 |  |  |  |  |  |  | # written by Olivier Thereaux  for W3C | 
| 6 |  |  |  |  |  |  | # | 
| 7 |  |  |  |  |  |  | # $Id: LogValidator.pm,v 1.25 2008/11/18 16:49:57 ot Exp $ | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | package W3C::LogValidator; | 
| 10 | 1 |  |  | 1 |  | 785 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 39 |  | 
| 11 | 1 |  |  | 1 |  | 5 | no strict "refs"; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 287 |  | 
| 12 |  |  |  |  |  |  | require Exporter; | 
| 13 |  |  |  |  |  |  | our @ISA = qw(Exporter); | 
| 14 |  |  |  |  |  |  | our %EXPORT_TAGS = ( 'all' => [ qw() ] ); | 
| 15 |  |  |  |  |  |  | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | 
| 16 |  |  |  |  |  |  | our @EXPORT = qw(); | 
| 17 |  |  |  |  |  |  | our $VERSION = sprintf "%d.%03d",q$Revision: 1.25 $ =~ /(\d+)\.(\d+)/; | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | our %config; | 
| 20 |  |  |  |  |  |  | our $output=""; | 
| 21 |  |  |  |  |  |  | our $config_filename = undef; | 
| 22 |  |  |  |  |  |  | our $verbose; | 
| 23 |  |  |  |  |  |  | our %cmdline_conf; | 
| 24 |  |  |  |  |  |  | our %hits; # hash URI->hits | 
| 25 |  |  |  |  |  |  | our %referers; | 
| 26 |  |  |  |  |  |  | our %mimetypes; | 
| 27 |  |  |  |  |  |  | our %HTTPcodes; | 
| 28 |  |  |  |  |  |  | our $output_proc; | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | ########################### | 
| 31 |  |  |  |  |  |  | # usual package interface # | 
| 32 |  |  |  |  |  |  | ########################### | 
| 33 |  |  |  |  |  |  | sub new | 
| 34 |  |  |  |  |  |  | { | 
| 35 | 0 |  |  | 0 |  |  | my $self  = {}; | 
| 36 | 0 |  |  |  |  |  | my $proto = shift; | 
| 37 | 0 |  | 0 |  |  |  | my $class = ref($proto) || $proto; | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | # server config is imported from the config module | 
| 40 | 1 |  |  | 1 |  | 671 | use W3C::LogValidator::Config; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 421 |  | 
| 41 | 0 | 0 |  |  |  |  | if (@_) | 
| 42 |  |  |  |  |  |  | { | 
| 43 | 0 |  |  |  |  |  | $config_filename = shift; | 
| 44 |  |  |  |  |  |  | #		print "using config filename  $config_filename \n"; #debug | 
| 45 | 0 | 0 |  |  |  |  | if ($config_filename) | 
| 46 |  |  |  |  |  |  | { | 
| 47 | 0 |  |  |  |  |  | %config = W3C::LogValidator::Config->new($config_filename)->configure(); | 
| 48 |  |  |  |  |  |  | } | 
| 49 |  |  |  |  |  |  | else | 
| 50 |  |  |  |  |  |  | { | 
| 51 | 0 |  |  |  |  |  | %config = W3C::LogValidator::Config->new()->configure(); | 
| 52 |  |  |  |  |  |  | } | 
| 53 |  |  |  |  |  |  | } | 
| 54 |  |  |  |  |  |  | else | 
| 55 | 0 |  |  |  |  |  | { %config = W3C::LogValidator::Config->new()->configure(); } | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | # processing other options given at the command line | 
| 58 | 0 | 0 |  |  |  |  | if (@_) | 
| 59 |  |  |  |  |  |  | { | 
| 60 | 0 |  |  |  |  |  | %cmdline_conf= %{(shift)}; | 
|  | 0 |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | } | 
| 62 |  |  |  |  |  |  | # verbosity : overriding config if given at command line | 
| 63 | 0 | 0 |  |  |  |  | if (defined($cmdline_conf{verbose})) | 
|  |  | 0 |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | { | 
| 65 | 0 |  |  |  |  |  | ($config{LogProcessor}{verbose}) = $cmdline_conf{verbose}; | 
| 66 | 0 |  |  |  |  |  | $verbose = $cmdline_conf{verbose}; | 
| 67 |  |  |  |  |  |  | } | 
| 68 |  |  |  |  |  |  | # setting default verbosity if not given | 
| 69 |  |  |  |  |  |  | elsif (! defined($config{LogProcessor}{verbose}) ) | 
| 70 |  |  |  |  |  |  | { | 
| 71 | 0 |  |  |  |  |  | ($config{LogProcessor}{verbose}) = 1; | 
| 72 | 0 |  |  |  |  |  | $verbose = 1; | 
| 73 |  |  |  |  |  |  | } | 
| 74 |  |  |  |  |  |  | # output : overriding config if given at command line | 
| 75 | 0 | 0 |  |  |  |  | if ( defined($cmdline_conf{"UseOutputModule"}) ) | 
|  |  | 0 |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | { | 
| 77 | 0 |  |  |  |  |  | $config{LogProcessor}{UseOutputModule} = $cmdline_conf{UseOutputModule}; | 
| 78 |  |  |  |  |  |  | } | 
| 79 |  |  |  |  |  |  | elsif (! defined($config{LogProcessor}{UseOutputModule})) | 
| 80 |  |  |  |  |  |  | { | 
| 81 | 0 |  |  |  |  |  | $config{LogProcessor}{UseOutputModule} = "W3C::LogValidator::Output::Raw"; | 
| 82 |  |  |  |  |  |  | } | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | # output to file | 
| 85 |  |  |  |  |  |  | # no "default value, will output to console if not set! | 
| 86 | 0 | 0 |  |  |  |  | if ( defined($cmdline_conf{"OutputTo"} ) ) | 
| 87 |  |  |  |  |  |  | { | 
| 88 | 0 |  |  |  |  |  | $config{LogProcessor}{OutputTo} = $cmdline_conf{"OutputTo"}; | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | # same for e-mail address to send to | 
| 92 |  |  |  |  |  |  | # overrding conf file info with cmdline info | 
| 93 | 0 | 0 |  |  |  |  | if ( defined($cmdline_conf{"ServerAdmin"}) ) | 
| 94 |  |  |  |  |  |  | { | 
| 95 | 0 |  |  |  |  |  | $config{LogProcessor}{"ServerAdmin"} = $cmdline_conf{"ServerAdmin"}; | 
| 96 |  |  |  |  |  |  | } | 
| 97 |  |  |  |  |  |  |  | 
| 98 | 1 |  |  | 1 |  | 2931 | use File::Temp qw/ /; | 
|  | 1 |  |  |  |  | 14822 |  | 
|  | 1 |  |  |  |  | 1098 |  | 
| 99 | 0 |  |  |  |  |  | my $tmpdir = File::Spec->tmpdir; | 
| 100 | 0 |  |  |  |  |  | $config{LogProcessor}{tmpfile} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); | 
| 101 | 0 |  |  |  |  |  | $config{LogProcessor}{tmpfile_HTTP_codes} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); | 
| 102 | 0 |  |  |  |  |  | $config{LogProcessor}{tmpfile_mime_types} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); | 
| 103 | 0 |  |  |  |  |  | $config{LogProcessor}{tmpfile_referers} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); | 
| 104 | 0 |  |  |  |  |  | bless($self, $class); | 
| 105 | 0 |  |  |  |  |  | return $self; | 
| 106 |  |  |  |  |  |  | } | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | sub sorted_uris | 
| 110 |  |  |  |  |  |  | { | 
| 111 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 112 | 0 | 0 |  |  |  |  | print "sorting logs: " if $verbose; # non-quiet mode | 
| 113 | 0 |  |  |  |  |  | my @uris = sort { $hits{$b} <=> $hits{$a} } | 
|  | 0 |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | keys %hits; | 
| 115 |  |  |  |  |  |  |  | 
| 116 | 0 |  |  |  |  |  | my $theuri; | 
| 117 |  |  |  |  |  |  | my  $theuri_hit; | 
| 118 | 0 |  |  |  |  |  | my @theuriarry; | 
| 119 | 0 |  |  |  |  |  | @theuriarry = @uris; | 
| 120 | 0 |  | 0 |  |  |  | while ( (@theuriarry) and ($verbose > 1)) | 
| 121 |  |  |  |  |  |  | { | 
| 122 | 0 |  |  |  |  |  | $theuri = shift (@theuriarry); | 
| 123 | 0 |  |  |  |  |  | $theuri_hit = $hits{$theuri}; | 
| 124 | 0 |  |  |  |  |  | print "	$theuri $theuri_hit\n"; | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  |  | 
| 128 | 0 | 0 |  |  |  |  | print "Done!\n" if $verbose; # non-quiet mode | 
| 129 | 0 |  |  |  |  |  | return @uris; | 
| 130 |  |  |  |  |  |  | } | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | ################### | 
| 133 |  |  |  |  |  |  | # Server routines # | 
| 134 |  |  |  |  |  |  | ################### | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | sub add_uri | 
| 137 |  |  |  |  |  |  | # usage $self->add_uri('http://foobar') | 
| 138 |  |  |  |  |  |  | { | 
| 139 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 140 | 0 | 0 |  |  |  |  | if (@_) | 
| 141 |  |  |  |  |  |  | { | 
| 142 | 0 |  |  |  |  |  | my $uri = shift; | 
| 143 | 0 | 0 |  |  |  |  | next unless defined($uri); | 
| 144 | 0 | 0 |  |  |  |  | if ( exists($hits{$uri}) ) | 
| 145 |  |  |  |  |  |  | { | 
| 146 | 0 |  |  |  |  |  | $hits{$uri} = $hits{$uri}+1; | 
| 147 |  |  |  |  |  |  | } | 
| 148 |  |  |  |  |  |  | else | 
| 149 | 0 |  |  |  |  |  | { $hits{$uri} = 1 } | 
| 150 |  |  |  |  |  |  | } | 
| 151 |  |  |  |  |  |  | } | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | sub add_referer | 
| 154 |  |  |  |  |  |  | # usage $self->add_referer($uri, $referer) | 
| 155 |  |  |  |  |  |  | { | 
| 156 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 157 | 0 | 0 |  |  |  |  | if (@_) | 
| 158 |  |  |  |  |  |  | { | 
| 159 | 0 |  |  |  |  |  | my $uri = shift; | 
| 160 | 0 |  |  |  |  |  | my $referer = shift; | 
| 161 | 0 |  |  |  |  |  | $referer =~ s/^"(.*)"$/$1/; | 
| 162 | 0 |  |  |  |  |  | my $preferedref = $config{LogProcessor}{RefererMatch}; | 
| 163 | 0 | 0 | 0 |  |  |  | if (($referer ne "-") and ( $referer =~ /$preferedref/)) | 
| 164 |  |  |  |  |  |  | { | 
| 165 |  |  |  |  |  |  |  | 
| 166 | 0 | 0 |  |  |  |  | if (exists $referers{"$uri : $referer"}) | 
| 167 |  |  |  |  |  |  | # nth time this referer is mentioned for $uri, incrementing | 
| 168 |  |  |  |  |  |  | { | 
| 169 | 0 |  |  |  |  |  | $referers{"$uri : $referer"} += 1; | 
| 170 |  |  |  |  |  |  | } | 
| 171 |  |  |  |  |  |  | else | 
| 172 |  |  |  |  |  |  | # first time this referer is mentioned for $uri | 
| 173 |  |  |  |  |  |  | { | 
| 174 | 0 |  |  |  |  |  | $referers{"$uri : $referer"} = 1; | 
| 175 |  |  |  |  |  |  | } | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  | } | 
| 178 |  |  |  |  |  |  | } | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | sub add_mime_type | 
| 181 |  |  |  |  |  |  | # record the mime type known for a given logged resource | 
| 182 |  |  |  |  |  |  | # usage $self->add_mime_type('http://foobar', "text/html") | 
| 183 |  |  |  |  |  |  | { | 
| 184 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 185 | 0 | 0 |  |  |  |  | if (@_) | 
| 186 |  |  |  |  |  |  | { | 
| 187 | 0 |  |  |  |  |  | my $uri = shift; | 
| 188 | 0 |  |  |  |  |  | my $mime_type = shift; | 
| 189 | 0 | 0 |  |  |  |  | next unless defined($uri); | 
| 190 | 0 | 0 |  |  |  |  | if (! exists($mimetypes{$uri}) ) | 
| 191 | 0 |  |  |  |  |  | { $mimetypes{$uri} = $mime_type; } | 
| 192 |  |  |  |  |  |  | } | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | sub add_HTTP_code | 
| 196 |  |  |  |  |  |  | # record the returned HTTP Code for a given logged resource | 
| 197 |  |  |  |  |  |  | # usage $self->add_HTTP_code('http://foobar', "200") | 
| 198 |  |  |  |  |  |  | # NOTE: doesn't cover if that code changes throughout the log file - TODO fix that? | 
| 199 |  |  |  |  |  |  | { | 
| 200 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 201 | 0 | 0 |  |  |  |  | if (@_) | 
| 202 |  |  |  |  |  |  | { | 
| 203 | 0 |  |  |  |  |  | my $uri = shift; | 
| 204 | 0 |  |  |  |  |  | my $HTTP_code = shift; | 
| 205 | 0 | 0 |  |  |  |  | next unless defined($uri); | 
| 206 | 0 | 0 |  |  |  |  | if (! exists($HTTPcodes{$uri}) ) | 
| 207 |  |  |  |  |  |  | { | 
| 208 | 0 |  |  |  |  |  | $HTTPcodes{$uri} = $HTTP_code; | 
| 209 |  |  |  |  |  |  | } | 
| 210 |  |  |  |  |  |  | } | 
| 211 |  |  |  |  |  |  | } | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | sub read_logfiles | 
| 214 |  |  |  |  |  |  | # just looping | 
| 215 |  |  |  |  |  |  | { | 
| 216 |  |  |  |  |  |  | my $self = shift; | 
| 217 |  |  |  |  |  |  | my $current_logfile; | 
| 218 | 1 |  |  | 1 |  | 118945 | use DB_File; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | my $tmp_file = $config{LogProcessor}{tmpfile}; | 
| 220 |  |  |  |  |  |  | tie (%hits, 'DB_File', "$tmp_file") || | 
| 221 |  |  |  |  |  |  | die ("Cannot create or open $tmp_file"); | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | # TODO this should probably be triggered (on or off) by an option rather than always on | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | my $tmp_file_referers = $config{LogProcessor}{tmpfile_referers}; | 
| 226 |  |  |  |  |  |  | tie (%referers, 'DB_File', "$tmp_file_referers") || | 
| 227 |  |  |  |  |  |  | die ("Cannot create or open $tmp_file_referers"); | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | my $tmp_file_mime_types = $config{LogProcessor}{tmpfile_mime_types}; | 
| 230 |  |  |  |  |  |  | tie (%mimetypes, 'DB_File', "$tmp_file_mime_types") || | 
| 231 |  |  |  |  |  |  | die ("Cannot create or open $tmp_file_mime_types"); | 
| 232 |  |  |  |  |  |  |  | 
| 233 |  |  |  |  |  |  | my $tmp_file_HTTP_codes = $config{LogProcessor}{tmpfile_HTTP_codes}; | 
| 234 |  |  |  |  |  |  | tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes") || | 
| 235 |  |  |  |  |  |  | die ("Cannot create or open $tmp_file_HTTP_codes"); | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | print "Reading logfiles: " if ($verbose); #non-quiet mode | 
| 238 |  |  |  |  |  |  | print "\n" if ($verbose >1); # verbose or above, we'll have details so linebreak | 
| 239 |  |  |  |  |  |  | my @logfiles = @{$config{LogProcessor}{LogFiles}}; | 
| 240 |  |  |  |  |  |  | foreach $current_logfile (@logfiles) | 
| 241 |  |  |  |  |  |  | { | 
| 242 |  |  |  |  |  |  | $self->read_logfile($current_logfile); | 
| 243 |  |  |  |  |  |  | } | 
| 244 |  |  |  |  |  |  |  | 
| 245 |  |  |  |  |  |  | untie %hits; | 
| 246 |  |  |  |  |  |  | untie %HTTPcodes; | 
| 247 |  |  |  |  |  |  | untie %mimetypes; | 
| 248 |  |  |  |  |  |  | untie %referers; | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | print "Done! \n" if ($verbose); #non-quiet mode | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | } | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | sub read_logfile | 
| 257 |  |  |  |  |  |  | #read logfile, push uris  one by one with the appropriate sub | 
| 258 |  |  |  |  |  |  | { | 
| 259 |  |  |  |  |  |  | my $self = shift; | 
| 260 |  |  |  |  |  |  | my $tmp_record; | 
| 261 |  |  |  |  |  |  | my $entriesperlogfile = $config{LogProcessor}{EntriesPerLogfile}; | 
| 262 |  |  |  |  |  |  | my $allskiphosts = ($config{LogProcessor}{ExcludeHosts}) ? $config{LogProcessor}{ExcludeHosts} : ""; # default to none | 
| 263 |  |  |  |  |  |  | my @skiphostsregex = split(" ", $allskiphosts); | 
| 264 |  |  |  |  |  |  | my $entriescounter=0; | 
| 265 |  |  |  |  |  |  | my $skip_thishost = 0; | 
| 266 |  |  |  |  |  |  | if (@_) | 
| 267 |  |  |  |  |  |  | { | 
| 268 |  |  |  |  |  |  | my $logfile = shift; | 
| 269 |  |  |  |  |  |  | if (open (LOGFILE, "$logfile")) { | 
| 270 |  |  |  |  |  |  | print "	$logfile...\n" if ($verbose > 1); # verbose or above | 
| 271 |  |  |  |  |  |  | $entriescounter=0; | 
| 272 |  |  |  |  |  |  | while ( (($entriescounter < $entriesperlogfile ) or (!$entriesperlogfile)) # limit number of entries | 
| 273 |  |  |  |  |  |  | and ($tmp_record = )) | 
| 274 |  |  |  |  |  |  |  | 
| 275 |  |  |  |  |  |  | { | 
| 276 |  |  |  |  |  |  | $tmp_record =~ chomp; | 
| 277 |  |  |  |  |  |  | my $logtype = $config{LogProcessor}{LogType}{$logfile}; | 
| 278 |  |  |  |  |  |  | if ($tmp_record) # not a blank line | 
| 279 |  |  |  |  |  |  | { | 
| 280 |  |  |  |  |  |  | my $tmp_record_remote_addr = $self->find_remote_addr($tmp_record, $logtype); | 
| 281 |  |  |  |  |  |  | if ($tmp_record_remote_addr) # not a blank remote host or address | 
| 282 |  |  |  |  |  |  | { | 
| 283 |  |  |  |  |  |  | $skip_thishost = 0; | 
| 284 |  |  |  |  |  |  | foreach my $skipexpression (@skiphostsregex) | 
| 285 |  |  |  |  |  |  | { | 
| 286 |  |  |  |  |  |  | if( $tmp_record_remote_addr =~ /$skipexpression/ ) | 
| 287 |  |  |  |  |  |  | { | 
| 288 |  |  |  |  |  |  | print " Skipping " . $tmp_record_remote_addr . " because it matches the ExcludeHosts pattern " . $skipexpression. "\n" if ($verbose > 2); | 
| 289 |  |  |  |  |  |  | $skip_thishost = 1; | 
| 290 |  |  |  |  |  |  | } | 
| 291 |  |  |  |  |  |  | } | 
| 292 |  |  |  |  |  |  | } | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | my $tmp_record_uri = $self->find_uri($tmp_record, $logtype); | 
| 295 |  |  |  |  |  |  | my $tmp_record_HTTP_method = $self->find_HTTP_Method($tmp_record, $logtype); | 
| 296 |  |  |  |  |  |  | my $tmp_record_mime_type = $self->find_mime_type($tmp_record, $logtype); | 
| 297 |  |  |  |  |  |  | my $tmp_record_HTTP_code = $self->find_HTTP_code($tmp_record, $logtype); | 
| 298 |  |  |  |  |  |  | my $tmp_record_referer = $self->find_referer($tmp_record, $logtype); | 
| 299 |  |  |  |  |  |  | if ( | 
| 300 |  |  |  |  |  |  | ($skip_thishost == 0) | 
| 301 |  |  |  |  |  |  | and | 
| 302 |  |  |  |  |  |  | ($tmp_record_HTTP_method eq "GET") | 
| 303 |  |  |  |  |  |  | and | 
| 304 |  |  |  |  |  |  | ($self->no_cgi($tmp_record) or ($config{LogProcessor}{ExcludeCGI} eq 0)) | 
| 305 |  |  |  |  |  |  | ) { | 
| 306 |  |  |  |  |  |  | $self->add_uri($tmp_record_uri); | 
| 307 |  |  |  |  |  |  | $self->add_mime_type($tmp_record_uri, $tmp_record_mime_type); | 
| 308 |  |  |  |  |  |  | $self->add_HTTP_code($tmp_record_uri,$tmp_record_HTTP_code); | 
| 309 |  |  |  |  |  |  | $self->add_referer($tmp_record_uri,$tmp_record_referer); | 
| 310 |  |  |  |  |  |  | } | 
| 311 |  |  |  |  |  |  | } | 
| 312 |  |  |  |  |  |  | $entriescounter++; | 
| 313 |  |  |  |  |  |  | } | 
| 314 |  |  |  |  |  |  | print "	  added $entriescounter URIs.\n" if ($verbose > 2); | 
| 315 |  |  |  |  |  |  | close LOGFILE; | 
| 316 |  |  |  |  |  |  | } elsif ($logfile) { | 
| 317 |  |  |  |  |  |  | die "could not open log file $logfile : $!"; | 
| 318 |  |  |  |  |  |  | } | 
| 319 |  |  |  |  |  |  | } | 
| 320 |  |  |  |  |  |  | } | 
| 321 |  |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  | sub no_cgi | 
| 323 |  |  |  |  |  |  | { | 
| 324 |  |  |  |  |  |  | my $self = shift; | 
| 325 |  |  |  |  |  |  | if (@_) | 
| 326 |  |  |  |  |  |  | { | 
| 327 |  |  |  |  |  |  | my $tmp_uri = shift; | 
| 328 |  |  |  |  |  |  | if (defined $tmp_uri) { | 
| 329 |  |  |  |  |  |  | return (!($tmp_uri =~ /\?/)) | 
| 330 |  |  |  |  |  |  | } | 
| 331 |  |  |  |  |  |  | } | 
| 332 |  |  |  |  |  |  | } | 
| 333 |  |  |  |  |  |  |  | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | sub find_uri | 
| 336 |  |  |  |  |  |  | # finds the "real" URI from a log record | 
| 337 |  |  |  |  |  |  | { | 
| 338 |  |  |  |  |  |  | my $self = shift; | 
| 339 |  |  |  |  |  |  | if (@_) | 
| 340 |  |  |  |  |  |  | { | 
| 341 |  |  |  |  |  |  | my $tmprecord = shift; | 
| 342 |  |  |  |  |  |  | my @record_arry; | 
| 343 |  |  |  |  |  |  | @record_arry = split(" ", $tmprecord); | 
| 344 |  |  |  |  |  |  | # hardcoded to most apache log formats, included common and combined | 
| 345 |  |  |  |  |  |  | # for the moment... TODO | 
| 346 |  |  |  |  |  |  | my $logtype = shift; | 
| 347 |  |  |  |  |  |  | # print "log type $logtype" if ($verbose > 2); | 
| 348 |  |  |  |  |  |  | if ($logtype eq "plain") | 
| 349 |  |  |  |  |  |  | { | 
| 350 |  |  |  |  |  |  | $tmprecord = $record_arry[0]; | 
| 351 |  |  |  |  |  |  | $tmprecord = $self->remove_duplicates($tmprecord); | 
| 352 |  |  |  |  |  |  | } | 
| 353 |  |  |  |  |  |  | else #common combined or full or w3c | 
| 354 |  |  |  |  |  |  | { | 
| 355 |  |  |  |  |  |  | $tmprecord = $record_arry[6]; | 
| 356 |  |  |  |  |  |  | $tmprecord = $self->remove_duplicates($tmprecord); | 
| 357 |  |  |  |  |  |  | if( !( $tmprecord =~ m/^https?\:/ ) ) { | 
| 358 |  |  |  |  |  |  | $tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord); | 
| 359 |  |  |  |  |  |  | sub find_remote_addr | 
| 360 |  |  |  |  |  |  | # finds the returned HTTP code from a log record, if available | 
| 361 |  |  |  |  |  |  | { | 
| 362 |  |  |  |  |  |  | my $self = shift; | 
| 363 |  |  |  |  |  |  | if (@_) | 
| 364 |  |  |  |  |  |  | { | 
| 365 |  |  |  |  |  |  | my $tmprecord = shift; | 
| 366 |  |  |  |  |  |  | my @record_arry; | 
| 367 |  |  |  |  |  |  | @record_arry = split(" ", $tmprecord); | 
| 368 |  |  |  |  |  |  | # hardcoded to most apache log formats, included common and combined | 
| 369 |  |  |  |  |  |  | # for the moment... TODO | 
| 370 |  |  |  |  |  |  | my $logtype = shift; | 
| 371 |  |  |  |  |  |  | # print "log type $logtype" if ($verbose > 2); | 
| 372 |  |  |  |  |  |  | if ($logtype eq "plain") | 
| 373 |  |  |  |  |  |  | { | 
| 374 |  |  |  |  |  |  | $tmprecord = ""; | 
| 375 |  |  |  |  |  |  | } | 
| 376 |  |  |  |  |  |  | else #common combined full or w3c | 
| 377 |  |  |  |  |  |  | { | 
| 378 |  |  |  |  |  |  | $tmprecord = $record_arry[0]; | 
| 379 |  |  |  |  |  |  | } | 
| 380 |  |  |  |  |  |  | #print "Remote Addr $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); | 
| 381 |  |  |  |  |  |  | return $tmprecord; | 
| 382 |  |  |  |  |  |  | } | 
| 383 |  |  |  |  |  |  | } | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  | } | 
| 387 |  |  |  |  |  |  | #print "$tmprecord \n" if ($verbose > 2); | 
| 388 |  |  |  |  |  |  | return $tmprecord; | 
| 389 |  |  |  |  |  |  | } | 
| 390 |  |  |  |  |  |  | } | 
| 391 |  |  |  |  |  |  |  | 
| 392 |  |  |  |  |  |  | sub find_HTTP_Method | 
| 393 |  |  |  |  |  |  | # finds the returned HTTP Method from a log record, if available | 
| 394 |  |  |  |  |  |  | { | 
| 395 |  |  |  |  |  |  | my $self = shift; | 
| 396 |  |  |  |  |  |  | if (@_) | 
| 397 |  |  |  |  |  |  | { | 
| 398 |  |  |  |  |  |  | my $tmprecord = shift; | 
| 399 |  |  |  |  |  |  | my @record_arry; | 
| 400 |  |  |  |  |  |  | @record_arry = split(" ", $tmprecord); | 
| 401 |  |  |  |  |  |  | # hardcoded to most apache log formats, included common and combined | 
| 402 |  |  |  |  |  |  | # for the moment... TODO | 
| 403 |  |  |  |  |  |  | my $logtype = shift; | 
| 404 |  |  |  |  |  |  | # print "log type $logtype" if ($verbose > 2); | 
| 405 |  |  |  |  |  |  | if ($logtype eq "plain") | 
| 406 |  |  |  |  |  |  | { | 
| 407 |  |  |  |  |  |  | # we consider each of those GETs | 
| 408 |  |  |  |  |  |  | $tmprecord = "GET"; | 
| 409 |  |  |  |  |  |  | } | 
| 410 |  |  |  |  |  |  | else #common combined full or w3c | 
| 411 |  |  |  |  |  |  | { | 
| 412 |  |  |  |  |  |  | $tmprecord = $record_arry[5]; | 
| 413 |  |  |  |  |  |  | $tmprecord =~ s/^"//; | 
| 414 |  |  |  |  |  |  | } | 
| 415 |  |  |  |  |  |  | #print "HTTP Code $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); | 
| 416 |  |  |  |  |  |  | return $tmprecord; | 
| 417 |  |  |  |  |  |  | } | 
| 418 |  |  |  |  |  |  | } | 
| 419 |  |  |  |  |  |  |  | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | sub find_HTTP_code | 
| 422 |  |  |  |  |  |  | # finds the returned HTTP code from a log record, if available | 
| 423 |  |  |  |  |  |  | { | 
| 424 |  |  |  |  |  |  | my $self = shift; | 
| 425 |  |  |  |  |  |  | if (@_) | 
| 426 |  |  |  |  |  |  | { | 
| 427 |  |  |  |  |  |  | my $tmprecord = shift; | 
| 428 |  |  |  |  |  |  | my @record_arry; | 
| 429 |  |  |  |  |  |  | @record_arry = split(" ", $tmprecord); | 
| 430 |  |  |  |  |  |  | # hardcoded to most apache log formats, included common and combined | 
| 431 |  |  |  |  |  |  | # for the moment... TODO | 
| 432 |  |  |  |  |  |  | my $logtype = shift; | 
| 433 |  |  |  |  |  |  | # print "log type $logtype" if ($verbose > 2); | 
| 434 |  |  |  |  |  |  | if ($logtype eq "plain") | 
| 435 |  |  |  |  |  |  | { | 
| 436 |  |  |  |  |  |  | $tmprecord = ""; | 
| 437 |  |  |  |  |  |  | } | 
| 438 |  |  |  |  |  |  | else #common combined full or w3c | 
| 439 |  |  |  |  |  |  | { | 
| 440 |  |  |  |  |  |  | $tmprecord = $record_arry[8]; | 
| 441 |  |  |  |  |  |  | } | 
| 442 |  |  |  |  |  |  | #print "HTTP Code $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); | 
| 443 |  |  |  |  |  |  | return $tmprecord; | 
| 444 |  |  |  |  |  |  | } | 
| 445 |  |  |  |  |  |  | } | 
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  | sub find_referer | 
| 448 |  |  |  |  |  |  | # finds the referrer info from a log record, if available | 
| 449 |  |  |  |  |  |  | { | 
| 450 |  |  |  |  |  |  | my $self = shift; | 
| 451 |  |  |  |  |  |  | if (@_) | 
| 452 |  |  |  |  |  |  | { | 
| 453 |  |  |  |  |  |  | my $tmprecord = shift; | 
| 454 |  |  |  |  |  |  | my @record_arry; | 
| 455 |  |  |  |  |  |  | @record_arry = split(" ", $tmprecord); | 
| 456 |  |  |  |  |  |  | # hardcoded to most apache log formats, included common and combined | 
| 457 |  |  |  |  |  |  | # for the moment... TODO | 
| 458 |  |  |  |  |  |  | my $logtype = shift; | 
| 459 |  |  |  |  |  |  | # print "log type $logtype" if ($verbose > 2); | 
| 460 |  |  |  |  |  |  | if ( ($logtype eq "plain") or ($logtype eq "common")) | 
| 461 |  |  |  |  |  |  | { | 
| 462 |  |  |  |  |  |  | $tmprecord = ""; | 
| 463 |  |  |  |  |  |  | } | 
| 464 |  |  |  |  |  |  | else #combined or full or w3c | 
| 465 |  |  |  |  |  |  | { | 
| 466 |  |  |  |  |  |  | $tmprecord = $record_arry[10]; | 
| 467 |  |  |  |  |  |  | } | 
| 468 |  |  |  |  |  |  | #print "referrer $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); | 
| 469 |  |  |  |  |  |  | return $tmprecord; | 
| 470 |  |  |  |  |  |  | } | 
| 471 |  |  |  |  |  |  | } | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  | sub find_mime_type | 
| 474 |  |  |  |  |  |  | # only for W3c extended log format - find the mime type for the resource | 
| 475 |  |  |  |  |  |  | { | 
| 476 |  |  |  |  |  |  | my $self = shift; | 
| 477 |  |  |  |  |  |  | if (@_) | 
| 478 |  |  |  |  |  |  | { | 
| 479 |  |  |  |  |  |  | my $tmprecord = shift; | 
| 480 |  |  |  |  |  |  | my @record_arry; | 
| 481 |  |  |  |  |  |  | @record_arry = split(' ', $tmprecord); | 
| 482 |  |  |  |  |  |  | # hardcoded to most apache log formats, included common and combined | 
| 483 |  |  |  |  |  |  | # for the moment... TODO | 
| 484 |  |  |  |  |  |  | my $logtype = shift; | 
| 485 |  |  |  |  |  |  | # print "log type $logtype" if ($verbose > 2); | 
| 486 |  |  |  |  |  |  | if ($logtype eq "w3c") | 
| 487 |  |  |  |  |  |  | { | 
| 488 |  |  |  |  |  |  |  | 
| 489 |  |  |  |  |  |  | $tmprecord = pop @record_arry; | 
| 490 |  |  |  |  |  |  | } | 
| 491 |  |  |  |  |  |  | else # all other formats | 
| 492 |  |  |  |  |  |  | { | 
| 493 |  |  |  |  |  |  | $tmprecord = ""; | 
| 494 |  |  |  |  |  |  | } | 
| 495 |  |  |  |  |  |  | #print "mime type $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); | 
| 496 |  |  |  |  |  |  | return $tmprecord; | 
| 497 |  |  |  |  |  |  | } | 
| 498 |  |  |  |  |  |  | } | 
| 499 |  |  |  |  |  |  |  | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  |  | 
| 502 |  |  |  |  |  |  | sub remove_duplicates | 
| 503 |  |  |  |  |  |  | # removes "directory index" suffixes such as index.html, etc | 
| 504 |  |  |  |  |  |  | # so that http://foobar/ and http://foobar/index.html be counted as one resource | 
| 505 |  |  |  |  |  |  | # also removes URI fragments | 
| 506 |  |  |  |  |  |  | { | 
| 507 |  |  |  |  |  |  | my $self = shift; | 
| 508 |  |  |  |  |  |  | my $tmprecord; | 
| 509 |  |  |  |  |  |  | if (@_) { $tmprecord = shift;} | 
| 510 |  |  |  |  |  |  |  | 
| 511 |  |  |  |  |  |  | # remove frags | 
| 512 |  |  |  |  |  |  | $tmprecord =~ s/\#.*$// if ($tmprecord); | 
| 513 |  |  |  |  |  |  |  | 
| 514 |  |  |  |  |  |  | # remove indexes | 
| 515 |  |  |  |  |  |  | my $index_file; | 
| 516 |  |  |  |  |  |  | foreach $index_file (split (" ",$config{LogProcessor}{DirectoryIndex})) | 
| 517 |  |  |  |  |  |  | { | 
| 518 |  |  |  |  |  |  | $tmprecord =~ s/$index_file$// if ($tmprecord); | 
| 519 |  |  |  |  |  |  | } | 
| 520 |  |  |  |  |  |  | return $tmprecord; | 
| 521 |  |  |  |  |  |  | } | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  |  | 
| 524 |  |  |  |  |  |  | sub hit | 
| 525 |  |  |  |  |  |  | { | 
| 526 |  |  |  |  |  |  | my $self = shift; | 
| 527 |  |  |  |  |  |  | my $uri=undef; | 
| 528 |  |  |  |  |  |  | if (@_) {$uri=shift} | 
| 529 |  |  |  |  |  |  | return $hits{$uri}; | 
| 530 |  |  |  |  |  |  | } | 
| 531 |  |  |  |  |  |  |  | 
| 532 |  |  |  |  |  |  | sub config_module | 
| 533 |  |  |  |  |  |  | { | 
| 534 |  |  |  |  |  |  | my $self = shift; | 
| 535 |  |  |  |  |  |  | my $module_used; #= undef; | 
| 536 |  |  |  |  |  |  | if (@_) | 
| 537 |  |  |  |  |  |  | { | 
| 538 |  |  |  |  |  |  | $module_used = shift; | 
| 539 |  |  |  |  |  |  | } | 
| 540 |  |  |  |  |  |  | my %tmpconfig = %{$config{LogProcessor}}; | 
| 541 |  |  |  |  |  |  | #add module specific variables, override if necessary. | 
| 542 |  |  |  |  |  |  | if ( ($module_used) and (defined ($config{$module_used}))) | 
| 543 |  |  |  |  |  |  | { | 
| 544 |  |  |  |  |  |  | foreach my $modkey (keys %{$config{$module_used}}) | 
| 545 |  |  |  |  |  |  | { | 
| 546 |  |  |  |  |  |  | if ( $config{$module_used}{$modkey} ) | 
| 547 |  |  |  |  |  |  | { | 
| 548 |  |  |  |  |  |  | $tmpconfig{$modkey} = $config{$module_used}{$modkey} | 
| 549 |  |  |  |  |  |  | } | 
| 550 |  |  |  |  |  |  | } | 
| 551 |  |  |  |  |  |  | } | 
| 552 |  |  |  |  |  |  | return %tmpconfig; | 
| 553 |  |  |  |  |  |  | } | 
| 554 |  |  |  |  |  |  |  | 
| 555 |  |  |  |  |  |  |  | 
| 556 |  |  |  |  |  |  | sub use_modules | 
| 557 |  |  |  |  |  |  | { | 
| 558 |  |  |  |  |  |  | my $self = shift; | 
| 559 |  |  |  |  |  |  | my @modules; | 
| 560 |  |  |  |  |  |  | # the value of the hash may be an array or a single value, | 
| 561 |  |  |  |  |  |  | # we have to check this | 
| 562 |  |  |  |  |  |  | if (defined @{ $config{LogProcessor}{UseValidationModule} }) | 
| 563 |  |  |  |  |  |  | { | 
| 564 |  |  |  |  |  |  | @modules = @{$config{LogProcessor}{UseValidationModule}} | 
| 565 |  |  |  |  |  |  | } | 
| 566 |  |  |  |  |  |  | else # single entry that we push in an array | 
| 567 |  |  |  |  |  |  | { | 
| 568 |  |  |  |  |  |  | push @modules, $config{LogProcessor}{UseValidationModule}; | 
| 569 |  |  |  |  |  |  | } | 
| 570 |  |  |  |  |  |  | foreach my $module_to_use (@modules) | 
| 571 |  |  |  |  |  |  | { | 
| 572 |  |  |  |  |  |  | my $output_tmp = ""; | 
| 573 |  |  |  |  |  |  | eval "use $module_to_use"; | 
| 574 |  |  |  |  |  |  | my $process_module; | 
| 575 |  |  |  |  |  |  | my %mod_config=$self->config_module($module_to_use); | 
| 576 |  |  |  |  |  |  | $process_module = $module_to_use->new(\%mod_config); | 
| 577 |  |  |  |  |  |  | #	$process_module->uris($self->sorted_uris); # not used anymore | 
| 578 |  |  |  |  |  |  | my %results = $process_module->process_list; | 
| 579 |  |  |  |  |  |  | my $shut_up = 0; | 
| 580 |  |  |  |  |  |  | if ( exists $config{LogProcessor}{QuietIfNoReport} ) | 
| 581 |  |  |  |  |  |  | { | 
| 582 |  |  |  |  |  |  | $shut_up = $config{LogProcessor}{QuietIfNoReport}; | 
| 583 |  |  |  |  |  |  | } | 
| 584 |  |  |  |  |  |  | # We're applying the output module and getting its (potential) output | 
| 585 |  |  |  |  |  |  | if ($shut_up and int(@{$results{"trows"}}) == 0) | 
| 586 |  |  |  |  |  |  | { | 
| 587 |  |  |  |  |  |  | print "nothing interesting to report - skipping\n" if ($verbose >1) | 
| 588 |  |  |  |  |  |  | } | 
| 589 |  |  |  |  |  |  | else { | 
| 590 |  |  |  |  |  |  | $output_tmp = $output_proc->output(\%results); | 
| 591 |  |  |  |  |  |  | $output = $output.$output_tmp; | 
| 592 |  |  |  |  |  |  | } | 
| 593 |  |  |  |  |  |  | # TODO maybe make this a hash, one output string per output module used | 
| 594 |  |  |  |  |  |  | # that would allow us to have several output modules at the time... | 
| 595 |  |  |  |  |  |  | # is this very useful? | 
| 596 |  |  |  |  |  |  | } | 
| 597 |  |  |  |  |  |  | } | 
| 598 |  |  |  |  |  |  |  | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | sub process | 
| 601 |  |  |  |  |  |  | # this is the main routine | 
| 602 |  |  |  |  |  |  | # processes the logfile, sorts the uris, and uses the chosen modules | 
| 603 |  |  |  |  |  |  | { | 
| 604 |  |  |  |  |  |  | my $self = shift; | 
| 605 |  |  |  |  |  |  | if ($verbose > 2) #debug | 
| 606 |  |  |  |  |  |  | { | 
| 607 |  |  |  |  |  |  | print "showing general config : \n"; | 
| 608 |  |  |  |  |  |  | foreach my $key ( keys %config) | 
| 609 |  |  |  |  |  |  | { | 
| 610 |  |  |  |  |  |  | my %modname = %{$config{$key}}; | 
| 611 |  |  |  |  |  |  | print "Module	$key\n"; | 
| 612 |  |  |  |  |  |  | foreach my $modkey (keys %{$config{$key}}) | 
| 613 |  |  |  |  |  |  | { | 
| 614 |  |  |  |  |  |  | my $value = $config{$key}{$modkey}; | 
| 615 |  |  |  |  |  |  | print " $modkey $value\n"; | 
| 616 |  |  |  |  |  |  | } | 
| 617 |  |  |  |  |  |  | } | 
| 618 |  |  |  |  |  |  | print "End of config\n\n" | 
| 619 |  |  |  |  |  |  | } | 
| 620 |  |  |  |  |  |  |  | 
| 621 |  |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | $self->read_logfiles; | 
| 623 |  |  |  |  |  |  | my $outputmodule = $config{LogProcessor}{UseOutputModule}; | 
| 624 |  |  |  |  |  |  | eval "use $outputmodule"; | 
| 625 |  |  |  |  |  |  | $output_proc = $outputmodule->new(\%{$config{LogProcessor}}); | 
| 626 |  |  |  |  |  |  | $self->use_modules; | 
| 627 |  |  |  |  |  |  | $output_proc->finish($output); | 
| 628 |  |  |  |  |  |  |  | 
| 629 |  |  |  |  |  |  | } | 
| 630 |  |  |  |  |  |  |  | 
| 631 |  |  |  |  |  |  | package W3C::LogValidator; | 
| 632 |  |  |  |  |  |  | 1; | 
| 633 |  |  |  |  |  |  |  | 
| 634 |  |  |  |  |  |  | __END__ |