File Coverage

lib/Apache/Logmonster.pm
Criterion Covered Total %
statement 122 547 22.3
branch 32 330 9.7
condition 10 71 14.0
subroutine 23 40 57.5
pod 12 21 57.1
total 199 1009 19.7


line stmt bran cond sub pod time code
1             package Apache::Logmonster;
2              
3 3     3   43572 use strict;
  3         6  
  3         127  
4 3     3   16 use warnings;
  3         6  
  3         175  
5              
6             our $VERSION = '5.36';
7              
8 3     3   15 use Carp;
  3         20  
  3         296  
9 3     3   5413 use Compress::Zlib;
  3         269795  
  3         1499  
10 3     3   27 use Cwd;
  3         6  
  3         222  
11             #use Data::Dumper;
12 3     3   3042 use Date::Parse;
  3         27163  
  3         453  
13 3     3   2979 use FileHandle;
  3         3003  
  3         19  
14 3     3   1556 use File::Basename;
  3         7  
  3         283  
15 3     3   2998 use File::Copy;
  3         11252  
  3         192  
16 3     3   2812 use Regexp::Log;
  3         5055  
  3         119  
17              
18 3     3   22 use lib 'lib';
  3         8  
  3         28  
19 3     3   3587 use Apache::Logmonster::Utility;
  3         12  
  3         185  
20 3     3   2198 use Regexp::Log::Monster;
  3         7  
  3         11926  
21             my ( $util, $err, %fhs, $debug );
22              
23             sub new {
24 3     3 1 3288 my $class = shift;
25 3   50     28 $debug = shift || 0;
26              
27 3 50       24 my $self = {
28             conf => undef,
29             debug => $debug ? 1 : 0,
30             };
31              
32 3         10 bless( $self, $class );
33              
34 3         16 $self->get_util();
35              
36 3         13 return $self;
37             };
38              
39             sub check_awstats_file {
40 0     0 1 0 my $self = shift;
41 0         0 my $domain = shift;
42              
43 0         0 my $conf = $self->{'conf'};
44 0   0     0 my $confdir = $conf->{confdir} || '/etc/awstats';
45 0   0     0 my $statsdir = $conf->{statsdir} || '/var/db/awstats';
46              
47 0         0 my $adc = "$confdir/awstats.$domain.conf";
48              
49 0 0       0 return if -f $adc;
50              
51 0         0 $util->file_write( $adc,
52             lines => [
53             <<"EO_AWSTATS_VHOST"
54             Include "$confdir/awstats.model.conf"
55             SiteDomain = $domain
56             DirData = $statsdir/$domain
57             HostAliases = $domain localhost 127.0.0.1
58             EO_AWSTATS_VHOST
59             ],
60             debug => 0,
61             );
62             };
63              
64             sub check_config {
65 1     1 1 31080 my $self = shift;
66 1         20 my $conf = $self->{'conf'};
67              
68 1         11 $err = "performing sanity tests";
69 1 50       16 $self->_progress_begin($err) if $debug;
70              
71 1 50       32 print "\n\t verbose mode $debug\n" if $debug > 1;
72              
73 1 50       11 if ( $debug > 1 ) {
74 0         0 print "\t clean mode ";
75 0 0       0 print $conf->{'clean'} ? "enabled.\n" : "disabled.\n";
76             }
77              
78 1         12 my $tmpdir = $conf->{tmpdir};
79 1 50       15 print "\t temporary working directory is $tmpdir.\n" if $debug > 1;
80              
81 1 50       66 if ( ! -d $tmpdir ) {
82 0 0       0 print "\t temp dir does not existing, creating..." if $debug > 1;
83 0 0       0 if ( !mkdir $tmpdir, oct('0755') ) {
84 0         0 die "FATAL: The directory $tmpdir does not exist and I could not "
85             . "create it. Edit logmonster.conf or create it.\n";
86             }
87 0 0       0 print "done.\n" if $debug > 1;
88              
89             # this will fail unless we're root, but that should not matter much
90 0 0       0 print "\t setting permissions on temp dir..." if $debug > 1;
91 0 0 0     0 $util->chown( $tmpdir,
      0        
92             uid => $conf->{'log_user'} || 'www',
93             gid => $conf->{'log_group'} || 'www',
94             debug => $debug > 1 ? 1 : 0,
95             fatal => 0,
96             );
97 0 0       0 print "done.\n" if $debug > 1;
98             }
99              
100 1 50 33     153 if ( !-w $tmpdir || !-r $tmpdir ) {
101 0         0 croak "FATAL: \$tmpdir ($tmpdir) must be read and writable!";
102             }
103              
104 1 50       9 if ( $conf->{'clean'} ) {
105 1 50       32 if ( !$util->clean_tmp_dir( $tmpdir, debug => 1, fatal=>0 ) ) {
106 0         0 croak "\nfailed to clean out $tmpdir";
107             }
108             }
109              
110 1 50       13 die "\nFATAL: you must edit logmonster.conf and set default_vhost!\n"
111             if ! defined $conf->{'default_vhost'};
112              
113 1 50       11 if ( $conf->{'time_offset'} ) {
114 0         0 my ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date( debug=>0 );
115              
116 0   0     0 my $interval = $self->{rotation_interval} || 'day';
117 0         0 my $bump = $conf->{time_offset};
118 0         0 my $logbase = $conf->{logbase};
119              
120 0 0       0 my $how_far_back = $interval eq "hour" ? .04 # back 1 hour
    0          
121             : $interval eq "month" ? $dd + 1 # last month
122             : 1; # 1 day
123              
124 0         0 ( $dd, $mm, $yy, $lm, $hh, $mn )
125             = $util->get_the_date( bump => $bump + $how_far_back, debug => 0 );
126              
127 0 0       0 die "OK then, try again.\n"
128             if ! $util->yes_or_no(
129             "\nDoes the date $yy/$mm/$dd look correct? ");
130             }
131              
132 1 50       7 $self->_progress_end('passed') if $debug == 1;
133              
134 1         37 return 1;
135             };
136              
137             sub compress_log_file {
138 0     0 1 0 my $self = shift;
139 0         0 my $host = shift;
140 0         0 my $logfile = shift;
141              
142 0         0 my $debug = $self->{'debug'};
143              
144 0 0 0     0 unless ( $host && $logfile ) {
145 0         0 croak "compress_log_file: called incorrectly!";
146             }
147              
148 0         0 my $REPORT = $self->{'report'};
149              
150 0 0       0 if ( $host eq "localhost" ) {
151 0         0 my $gzip = $util->find_bin( 'gzip', debug => 0 );
152              
153 0 0       0 if ( !-e $logfile ) {
154 0         0 print $REPORT "compress_log_file: $logfile does not exist!\n";
155 0 0       0 if ( -e "$logfile.gz" ) {
156 0         0 print $REPORT " already compressed as $logfile.gz!\n";
157 0         0 return 1;
158             }
159 0         0 return;
160             }
161              
162 0         0 my $cmd = "$gzip $logfile";
163 0 0       0 $self->_progress("gzipping localhost:$logfile") if $debug;
164 0         0 print $REPORT "syscmd: $cmd\n";
165 0         0 my $r = $util->syscmd( $cmd, debug => 0 );
166 0 0       0 print $REPORT "syscmd: error result: $r\n" if ( $r != 0 );
167              
168 0         0 return 1;
169             }
170              
171 0 0       0 $self->_progress_begin("checking $host for $logfile") if $debug;
172              
173             # $host is remote, so we interact via SSH
174 0         0 my $ssh = $util->find_bin( "ssh", debug => 0 );
175 0         0 my $cmd = "$ssh $host test -f $logfile";
176              
177 0         0 print $REPORT "compress_log_file: checking $host\n";
178 0         0 print $REPORT "\tsyscmd: $cmd\n";
179              
180 0 0       0 $self->_progress_continue() if $debug;
181              
182             # does the file exist?
183 0 0       0 if ( !$util->syscmd( $cmd, debug => 0, fatal => 0 ) ) {
184 0 0       0 $self->_progress_continue() if $debug;
185              
186             # does file.gz exist?
187 0 0       0 if ( -f "$logfile.gz" ) {
188 0         0 $err = "ALREADY COMPRESSED";
189 0         0 print $REPORT "\t$err\n";
190              
191 0 0       0 $self->_progress_end($err) if $debug;
192              
193 0         0 return 1;
194             }
195 0 0       0 $self->_progress_end("no") if $debug;
196              
197 0         0 print $REPORT "no\n";
198 0         0 return;
199             }
200              
201 0 0       0 $self->_progress_end("yes") if $debug;
202              
203 0         0 print $REPORT "yes\n";
204              
205 0         0 $err = "compressing log file on $host";
206 0 0       0 $self->_progress_begin($err) if $debug;
207              
208 0         0 $cmd = "$ssh $host gzip $logfile";
209 0         0 print $REPORT "\tcompressing\n\tsyscmd: $cmd \n";
210              
211 0 0       0 $self->_progress_continue() if !$debug;
212              
213 0         0 my $r = $util->syscmd( $cmd, debug => 0, fatal => 0 );
214 0 0       0 if ( !$r ) {
215 0         0 print $REPORT "\terror result: $r\n";
216 0         0 return;
217             }
218             $debug
219 0 0       0 ? print "done\n"
220             : $self->_progress_end();
221              
222 0         0 return 1;
223             };
224              
225             sub consolidate_logfile {
226              
227 0     0 1 0 my $self = shift;
228 0         0 my $host = shift;
229 0         0 my $remote_logfile = shift;
230 0         0 my $local_logfile = shift;
231              
232 0         0 my $dry_run = $self->{'dry_run'};
233 0         0 my $debug = $self->{'debug'};
234 0         0 my $REPORT = $self->{'report'};
235              
236 0         0 my ( $r, $size );
237              
238             # retrieve yesterdays log files
239 0 0       0 if ( $host eq "localhost" ) {
240 0         0 $err
241             = "consolidate_logfile: checking localhost for\n\t $remote_logfile...";
242 0 0       0 $self->_progress_begin($err) if $debug;
243 0         0 print $REPORT $err;
244              
245             # requires "use File::Copy"
246 0         0 $r = copy $remote_logfile, $local_logfile;
247 0 0       0 print $REPORT "FAILED: $!\n" unless ($r);
248              
249 0         0 $size = ( stat $local_logfile )[7];
250              
251 0 0       0 if ( $size > 1000000 ) { $size = sprintf "%.2f MB", $size / 1000000; }
  0         0  
252 0         0 else { $size = sprintf "%.2f KB", $size / 1000; }
253              
254 0         0 $err = "retrieved $size\n";
255 0 0       0 $self->_progress_end($err) if $debug;
256 0         0 print $REPORT $err;
257 0         0 return 1;
258             }
259              
260 0 0       0 return 1 if $dry_run;
261              
262 0         0 my $scp = $util->find_bin( "scp", debug => 0 );
263 0         0 $scp .= " -q";
264              
265 0 0       0 $self->_progress_begin("\tconsolidate_logfile: fetching") if $debug;
266              
267 0         0 print $REPORT
268             "\tsyscmd: $scp \n\t\t$host:$remote_logfile \n\t\t$local_logfile\n";
269              
270 0         0 $r = $util->syscmd( "$scp $host:$remote_logfile $local_logfile",
271             debug => 0
272             );
273 0 0       0 print $REPORT "syscmd: error result: $r\n" if !$r;
274              
275 0         0 $size = ( stat $local_logfile )[7];
276 0 0       0 if ( !$size ) {
277 0         0 $err = "FAILED. No logfiles retrieved!";
278 0 0       0 $self->_progress_end($err) if $debug;
279 0         0 print $REPORT "\t $err \n";
280 0         0 return;
281             }
282              
283 0 0       0 if ( $size > 1000000 ) { $size = sprintf "%.2f MB", $size / 1000000; }
  0         0  
284 0         0 else { $size = sprintf "%.2f KB", $size / 1000; }
285              
286 0         0 $err = "retrieved $size";
287 0 0       0 $self->_progress_end($err) if $debug;
288 0         0 print $REPORT "\t $err\n";
289              
290 0         0 return 1;
291             };
292              
293             sub feed_the_machine {
294 0     0 1 0 my $self = shift;
295 0         0 my $domains_ref = shift;
296              
297 0 0 0     0 if ( !$domains_ref || ref $domains_ref ne 'HASH' ) {
298 0         0 croak "feed_the_machine: invalid parameters passed.";
299             }
300              
301 0         0 my $debug = $self->{'debug'};
302 0         0 my $conf = $self->{'conf'};
303 0         0 my $REPORT = $self->{'report'};
304 0         0 my $interval = $self->{'rotation_interval'};
305              
306 0         0 my ( $cmd, $r );
307              
308 0         0 my $tmpdir = $conf->{'tmpdir'};
309 0         0 my $processor = $conf->{'processor'};
310              
311 0         0 foreach my $file ( $util->get_dir_files( "$tmpdir/doms" ) ) {
312 0 0       0 next if ( $file =~ /\.bak$/ );
313              
314 3     3   24 use File::Basename;
  3         7  
  3         4141  
315 0         0 my $domain = fileparse($file);
316 0         0 my $statsdir = "$conf->{'statsdir'}/$domain";
317              
318 0         0 $util->cwd_source_dir( $statsdir, debug => 0 );
319              
320 0 0       0 if ( ! -d $statsdir ) {
321 0 0       0 print "skipping $file because $statsdir is not a directory.\n"
322             if $debug;
323 0         0 next;
324             };
325              
326             # allow domain to select their stats processor
327 0 0       0 if ( -f "$statsdir/.processor" ) {
328 0         0 $processor = `head -n1 $statsdir/.processor`;
329 0         0 chomp $processor;
330             }
331              
332 0 0       0 if ( $processor eq "webalizer" ) {
    0          
    0          
333 0         0 my $webalizer
334             = $util->find_bin( "webalizer", debug => 0 );
335 0 0       0 $webalizer .= " -q" if !$debug;
336 0 0 0     0 $webalizer .= " -p"
337             if ( $interval eq "hour" || $interval eq "day" );
338 0         0 $cmd = "$webalizer -n $domain -o $statsdir $file";
339 0 0       0 printf "$webalizer -n %-20s -o $statsdir\n", $domain if $debug;
340 0         0 printf $REPORT "$webalizer -n %-20s -o $statsdir\n", $domain;
341             }
342             elsif ( $processor eq "http-analyze" ) {
343 0         0 my $http_analyze
344             = $util->find_bin( "http-analyze", debug => 0 );
345 0 0 0     0 $http_analyze .= " -d"
346             if ( $interval eq "hour" || $interval eq "day" );
347 0 0       0 $http_analyze .= " -m" if ( $interval eq "month" );
348 0         0 $cmd = "$http_analyze -S $domain -o $statsdir $file";
349 0 0       0 printf "$http_analyze -S %-20s -o $statsdir\n", $domain if $debug;
350 0         0 printf $REPORT "$http_analyze -S %-20s -o $statsdir\n", $domain;
351             }
352             elsif ( $processor eq "awstats" ) {
353 0         0 $self->check_awstats_file( $domain );
354              
355 0         0 my $aws_cgi = "/usr/local/www/awstats/cgi-bin"; # freebsd port location
356 0 0       0 $aws_cgi = "/usr/local/www/cgi-bin" unless -d $aws_cgi;
357 0 0       0 $aws_cgi = "/var/www/cgi-bin" unless -d $aws_cgi;
358              
359 0         0 my $awstats = $util->find_bin( "awstats.pl",
360             debug => 0,
361             dir => $aws_cgi,
362             );
363 0         0 $cmd = "$awstats -config=$domain -logfile=$file";
364 0 0       0 printf "$awstats for \%-20s to $statsdir\n", $domain if $debug;
365 0         0 printf $REPORT "$awstats for \%-20s to $statsdir\n", $domain;
366             }
367             else {
368 0         0 $err = "Sorry, that is not supported! Valid options are: webalizer, http-analyze, and awstats.\n";
369 0         0 print $err;
370 0         0 print $REPORT $err;
371             }
372              
373 0 0       0 unless ( $self->{'dry_run'} ) {
374 0 0       0 print "running $processor!\n" if $debug;
375 0 0       0 print $REPORT "syscmd: $cmd\n" if $debug;
376 0         0 $r = $util->syscmd( $cmd, debug => 0 );
377 0 0       0 print $REPORT "syscmd: error result: $r\n" if ( $r != 0 );
378             }
379              
380              
381 0 0       0 if ( $conf->{'clean'} ) {
382 0         0 $util->file_delete( $file, debug => 0 );
383 0         0 next;
384             }
385              
386 0         0 print "\nDon't forget about $file\n";
387 0         0 print $REPORT "\nDon't forget about $file\n";
388             }
389             };
390              
391             sub fetch_log_files {
392              
393 0     0 1 0 my $self = shift;
394 0         0 my $debug = $self->{'debug'};
395 0         0 my $conf = $self->{'conf'};
396 0         0 my $dry_run = $self->{'dry_run'};
397              
398 0         0 my $r;
399              
400             # in a format like this: /var/log/apache/200?/09/25
401 0         0 my $logdir = $self->get_log_dir();
402 0         0 my $tmpdir = $conf->{'tmpdir'};
403              
404 0         0 my $access_log = "$logdir/" . $conf->{'access'};
405 0         0 my $error_log = "$logdir/" . $conf->{'error'};
406              
407 0 0       0 print "fetch_log_files: warming up.\n" if $debug > 1;
408              
409             WEBHOST:
410 0         0 foreach my $webserver ( split( / /, $conf->{'hosts'} ) ) {
411 0         0 my $compressed = 0;
412              
413 0 0       0 if ( !$dry_run ) {
414              
415             # compress yesterdays log files
416 0         0 $self->compress_log_file( $webserver, $error_log );
417              
418 0 0       0 if ( !$self->compress_log_file( $webserver, $access_log ) ) {
419              
420             # if there is no compressed logfile, there is no point in
421             # trying to retrieve.
422 0         0 next WEBHOST;
423             }
424             }
425              
426 0         0 my $local_logfile = "$tmpdir/$webserver-" . $conf->{'access'} . ".gz";
427              
428 0         0 $self->consolidate_logfile(
429             $webserver, # hostname to retrieve from
430             "$access_log.gz", # the logfile to fetch
431             $local_logfile, # where to put it
432             );
433             }
434              
435 0         0 return 1;
436             };
437              
438             sub get_log_dir {
439              
440 5     5 1 12 my $self = shift;
441 5         8 my $debug = $self->{'debug'};
442 5         9 my $conf = $self->{'conf'};
443              
444 5   50     29 my $interval = $self->{'rotation_interval'} || "day";
445              
446 5 50       23 unless ($conf) {
447 0         0 croak "get_log_dir: \$conf is not set!\n";
448             }
449              
450 5         10 my $bump = $conf->{'time_offset'};
451 5         8 my $logbase = $conf->{'logbase'};
452              
453 5         25 my ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date( debug => 0 );
454              
455 5 50       24 my $how_far_back = $interval eq "hour"
    50          
456             ? .04 # back 1 hour
457             : $interval eq "month" ? $dd + 1 # last month
458             : 1; # 1 day
459              
460 5 50       11 if ($bump) {
461 0 0       0 ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date(
462             bump => $bump + $how_far_back,
463             debug => $debug > 1 ? 1 : 0,
464             );
465             }
466             else {
467 5 50       23 ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date(
468             bump => $how_far_back,
469             debug => $debug > 1 ? 1 : 0,
470             );
471             }
472              
473 5 0       32 my $logdir
    50          
    50          
474             = $interval eq "hour" ? "$logbase/$yy/$mm/$dd/$hh"
475             : $interval eq "day" ? "$logbase/$yy/$mm/$dd"
476             : $interval eq "month" ? "$logbase/$yy/$mm"
477             : "$logbase";
478              
479 5 50       16 print "get_log_dir: using $logdir\n" if $debug > 1;
480 5         27 return $logdir;
481             };
482              
483             sub get_log_files {
484 0     0 0 0 my $self = shift;
485 0 0       0 my $dir = shift or die "missing dir argument";
486              
487 0         0 my @logs = glob("$dir/*.gz");
488              
489 0         0 my $debug = $self->{debug};
490 0         0 my $REPORT = $self->{report};
491              
492             # make sure we have logs to process
493 0 0 0     0 if ( !$logs[0] or $logs[0] eq '' ) {
494 0         0 $err = "WARNING: No web server log files found!\n";
495 0 0       0 print $err if $debug;
496 0         0 print $REPORT $err;
497 0         0 return;
498             }
499              
500 0 0       0 if ( $debug > 1 ) {
501 0         0 print "found logfiles \n\t" . join( "\n\t", @logs ) . "\n";
502             };
503              
504 0         0 return @logs;
505             };
506              
507             sub get_config {
508 1     1 0 6 my ($self, $file, $config) = @_;
509              
510 1 50 33     8 if ( $config && ref $config eq 'HASH' ) {
511 0         0 $self->{conf} = $config;
512 0         0 return $config;
513             }
514              
515 1 50 33     5 return $self->{conf} if (defined $self->{conf} && ref $self->{conf});
516              
517 1   50     7 $self->{conf} = $util->parse_config( $file || 'logmonster.conf' );
518 1         4 return $self->{conf};
519             };
520              
521             sub get_util {
522 4     4 0 344 my $self = shift;
523 4 100       18 return $util if ref $util;
524 3     3   25 use lib 'lib';
  3         7  
  3         32  
525 3         32 require Apache::Logmonster::Utility;
526 3         43 $self->{util} = $util = Apache::Logmonster::Utility->new( debug => $self->{debug} );
527 3         7 return $util;
528             };
529              
530             sub report_hits {
531              
532 1     1 1 29 my $self = shift;
533 1         2 my $logdir = shift;
534 1         4 my $debug = $self->{'debug'};
535              
536 1         3 $self->{'debug'} = 0; # hush get_log_dir
537 1   33     5 $logdir ||= $self->get_log_dir();
538              
539 1         5 my $vhost_count_summary = $logdir . "/HitsPerVhost.txt";
540              
541             # fail if $vhost_count_summary is not present
542 1 50 33     54 unless ( $vhost_count_summary
      33        
543             && -e $vhost_count_summary
544             && -f $vhost_count_summary )
545             {
546 0         0 print
547             "report_hits: ERROR: hit summary file is missing. It should have"
548             . " been at: $vhost_count_summary. Report FAILURE.\n";
549 0         0 return;
550             }
551              
552 1 50       8 print "report_hits: reporting summary from file $vhost_count_summary\n"
553             if $debug;
554              
555 1         11 my @lines = $util->file_read( $vhost_count_summary,
556             debug => $debug,
557             fatal => 0,
558             );
559              
560 1         2 my $lines_in_array = @lines;
561              
562 1 50       5 if ( $lines_in_array > 0 ) {
563 1         143 print join( ':', @lines ) . "\n";
564 1         12 return 1;
565             }
566              
567 0 0       0 print "report_hits: no entries found!\n" if $debug;
568 0         0 return;
569             };
570              
571             sub report_close {
572 2     2 0 485 my $self = shift;
573 2         8 my $fh = shift;
574              
575 2 50       13 if ($fh) {
576 2         101 close($fh);
577 2         35 return 1;
578             }
579              
580 0         0 carp "report_close: was not passed a valid filehandle!";
581 0         0 return;
582             };
583              
584             sub report_open {
585 2     2 0 725 my $self = shift;
586 2         3 my $vhost = shift;
587 2         4 my $debug = $self->{'debug'};
588              
589 2 50       8 $vhost || croak "report_open: no filename passed!";
590              
591 2         6 my $logdir = $self->get_log_dir();
592              
593 2 50 33     64 unless ( $logdir && -w $logdir ) {
594 2         488 print "\tNOTICE!\nreport_open: logdir $logdir is not writable!\n";
595 2         5 $logdir = "/tmp";
596             }
597              
598 2         9 my $report_file = "$logdir/$vhost.txt";
599 2         4 my $REPORT;
600              
601 2 50       304 if ( !open $REPORT, ">", $report_file ) {
602 0         0 carp "couldn't open $report_file for write: $!";
603 0         0 return;
604             }
605              
606 2 50       6 print "\n *** this report is saved in $report_file *** \n" if $debug;
607 2         10 return $REPORT;
608             };
609              
610             sub sort_vhost_logs {
611              
612             ############################################
613             # Usage : see t/Logmonster.t for usage example
614             # Purpose : since the log entries for each host are concatenated, they are
615             # no longer in cronological order. Most stats post-processors
616             # require that log entries be in chrono order so this sorts them
617             # based on their log entry date, which also resolves any timezone
618             # differences.
619             # Returns : boolean, 1 for success
620             # Parameters : conf - hashref of setting from logmonster.conf
621             # report
622              
623 0     0 1   my $self = shift;
624 0           my $debug = $self->{'debug'};
625 0           my $conf = $self->{'conf'};
626 0           my $REPORT = $self->{'report'};
627              
628 0           my ( %beastie, %sortme );
629              
630 0   0       my $dir = $conf->{'tmpdir'} || croak "tmpdir not set in \$conf";
631              
632 0 0         if ( $self->{'host_count'} < 2 ) {
633 0 0         print "sort_vhost_logs: only one log host, skipping sort.\n"
634             if $debug;
635 0           return 1; # sort not needed with only one host
636             }
637              
638 0 0         $self->_progress_begin("sort_vhost_logs: sorting each vhost logfile...")
639             if $debug == 1;
640              
641 0           my $lines = 0;
642 0           my ($SORTED, $UNSORTED);
643              
644             VHOST_FILE:
645 0           foreach
646             my $file ( $util->get_dir_files( "$dir/doms", fatal => 0 ) )
647             {
648 0           undef %beastie; # clear the hash
649 0           undef %sortme;
650              
651 0 0         if ( -s $file > 10000000 ) {
652 0 0         print "\nsort_vhost_logs: logfile $file is greater than 10MB\n"
653             if $debug;
654 0           print $REPORT
655             "sort_vhost_logs: logfile $file is greater than 10MB\n";
656             }
657              
658 0 0         unless ( open $UNSORTED, '<', $file ) {
659 0           warn
660             "\nsort_vhost_logs: WARN: could not open input file $file: $!";
661 0           next VHOST_FILE;
662             }
663              
664             # make sure we can write out the results before doing all the work
665 0 0         unless ( open $SORTED, ">", "$file.sorted" ) {
666 0 0         print
667             "\n sort_vhost_logs: FAILED: could not open output file $file: $!\n"
668             if $debug;
669 0           next VHOST_FILE;
670             }
671              
672 0 0         $self->_progress_begin(" sorting $file...") if $debug > 1;
673              
674 0           while (<$UNSORTED>) {
675 0 0         $self->_progress_continue() if $debug > 1;
676 0           chomp;
677             ###
678             # Per Earl Ruby, switched from / / to /\s+/ so that naughty modules like
679             # Apache::Register that insert extra spaces in the Log output won't mess
680             # up logmonsters parsing.
681             # @log_entry_fields = split(/ /, $_) => @log.. = split(/\s+/, $_)
682             ###
683             # sample log entry
684             #216.220.22.182 - - [16/Jun/2004:09:37:51 -0400] "GET /images/google-g.jpg HTTP/1.1" 200 539 "http://www.tnpi.biz/internet/mail/toaster/" "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.6) Gecko/20040113" www.thenetworkpeople.biz
685              
686             # From an Apache log entry, we first split apart the line based on whitespace
687              
688             my @log_entry_fields
689 0           = split( /\s+/, $_ ); # split the log entry into fields
690              
691             # Then we use substr to extract the middle 26 characters:
692             # 16/Jun/2004:09:37:51 -0400
693             #
694             # We could also use a regexp to do this but substr is more efficient and we
695             # can safely expect the date format of ApacheLog to remain constant.
696              
697 0           my $rawdate
698             = substr( "$log_entry_fields[3] $log_entry_fields[4]", 1,
699             26 );
700              
701             # then we convert that date string to a numeric string that we can use for sorting.
702              
703 0           my $date = str2time($rawdate);
704              
705             # Finally, we put the entire line into the hash beastie (keyed with $lines,
706             # an incrementing number) and create a second hash ($sortme) with the
707             # same key but the value is the timestamp.
708              
709 0           $beastie{$lines} = $_;
710 0           $sortme{$lines} = $date;
711              
712 0           $lines++;
713             }
714 0 0         close($UNSORTED)
715             || croak "sort_vhost_logs: Gack, could not close $file: $!\n";
716 0 0         $self->_progress_end() if $debug > 1;
717              
718             # We create an array (because elements in arrays stay in order) of line
719             # numbers based on the sortme hash, sorted based on date
720              
721 0 0         my @sorted = sort {
722 0           ( $sortme{$a} <=> $sortme{$b} )
723             || ( $sortme{$a} cmp $sortme{$b} );
724             } ( keys(%sortme) );
725              
726 0           foreach (@sorted) {
727              
728             # iterate through @sorted, adding the corresponding lines from %beastie to the file
729 0           print $SORTED "$beastie{$_}\n";
730             }
731 0           close $SORTED;
732              
733 0 0         move( "$file.sorted", $file )
734             or carp
735             "sort_vhost_logs: could not replace $file with $file.sorted: $!\n";
736              
737 0 0         $self->_progress_continue() if $debug == 1;
738             }
739              
740 0 0         $self->_progress_end() if $debug == 1;
741              
742 0           return 1;
743             };
744              
745             sub split_logs_to_vhosts {
746 0     0 1   my $self = shift;
747              
748 0           my $debug = $self->{'debug'};
749 0           my $conf = $self->{'conf'};
750 0           my $REPORT = $self->{'report'};
751              
752 0           my ( %count, %orphans, $bad );
753              
754 0           my $dir = $conf->{'tmpdir'}; # normally /var/log/(apache|http)/tmp
755 0   0       my $countlog = $conf->{'CountLog'} || 1;
756              
757 0           my @webserver_logs = $self->get_log_files($dir);
758              
759 0 0         if ( !-d "$dir/doms" ) {
760 0 0         if ( !mkdir "$dir/doms", oct('0755') ) {
761 0           $err = "FATAL: couldn't create $dir/doms: $!\n";
762 0           print $REPORT $err;
763 0           die $err;
764             }
765             }
766              
767 0 0         print "\t output working dirs is $dir/doms\n" if $debug > 1;
768              
769             # use my Regexp::Log::Monster
770 0           my $regexp_parser = Regexp::Log::Monster->new(
771             format => ':logmonster',
772             capture => [qw( host vhost status bytes ref ua )],
773             );
774              
775             # Apache fields
776             # host, ident, auth, date, request, status, bytes, referer, agent, vhost
777             # returned from parser (available for capture) as:
778             # host, rfc, authuser, date, ts, request, req, status, bytes, referer, ref, useragent, ua, vhost
779              
780 0           my @captured_fields = $regexp_parser->capture;
781 0           my $re = $regexp_parser->regexp;
782              
783 0           foreach my $file (@webserver_logs) {
784              
785 0 0         my $gz = gzopen( $file, 'rb' ) or do {
786 0           warn "Couldn't open $file: $gzerrno";
787 0           next;
788             };
789              
790 0           my $lines = 0;
791 0 0         $self->_progress_begin("\t parsing entries from $file") if $debug;
792              
793 0           while ( $gz->gzreadline($_) > 0 ) {
794 0           chomp $_;
795 0           $lines++;
796 0 0 0       $self->_progress_continue() if ( $debug && $lines =~ /00$/ );
797              
798 0           my %data;
799 0           @data{@captured_fields} = /$re/; # no need for /o, a compiled regexp
800              
801             # make sure the log format has the vhost tag appended
802 0           my $vhost = $data{'vhost'};
803 0 0 0       if ( !$vhost || $vhost eq '-' ) {
804             #print "Invalid log entries! Read the FAQ!\n" if $debug;
805 0 0         print $_ . "\n" if $debug > 2;
806 0           $vhost = $conf->{default_vhost};
807 0           $bad++;
808             };
809              
810 0           $vhost = lc($vhost);
811              
812 0           $self->spam_check(\%data, \%count);
813              
814 0 0         if ( ! $fhs{$vhost} ) {
815 0           $self->open_vhost_handle( $vhost );
816             };
817 0 0         if ( $fhs{$vhost} ) {
818 0           my $fh = $fhs{$vhost};
819 0           print $fh "$_\n";
820 0           $count{$vhost}++;
821 0           next;
822             };
823 0 0         print "\nthe main domain for $vhost is missing!\n" if $debug > 1;
824 0           $orphans{$vhost} = $vhost;
825             };
826 0           $gz->gzclose();
827              
828 0 0         $self->_progress_end() if $debug;
829             };
830              
831 0           $self->report_matches( \%count, \%orphans);
832 0           $self->report_spam_hits( \%count );
833 0           $self->report_bad_hits( $bad );
834              
835 0           return \%fhs;
836             };
837              
838             sub spam_check {
839 0     0 0   my ($self, $data, $count) = @_;
840 0           my $conf = $self->{conf};
841              
842 0 0         return if ! $conf->{spam_check};
843              
844 0           my $spam_score = 0;
845              
846             # check for spam quotient
847 0 0         if ( $data->{status} ) {
848 0 0         if ( $data->{status} == 404 ) { # check for 404 status
849 0           $spam_score++; # a 404 alone is not a sign of naughtiness
850             }
851              
852 0 0         if ( $data->{status} == 412 ) { # httpd config slapping them
853 0           $spam_score++;
854             }
855              
856 0 0         if ( $data->{status} == 403 ) { # httpd config slapping them
857 0           $spam_score += 2;
858             }
859             }
860              
861             # nearly all of my referer spam has a # ending the referer string
862 0 0 0       if ( $data->{ref} && $data->{ref} =~ /#$/ ) {
863 0           $spam_score += 2;
864             }
865              
866             # should check for invalid/suspect useragent strings here
867 0 0         if ( $data->{ua} ) {
868 0 0         $spam_score +=
    0          
869             $data->{ua} =~ /crazy/ixms ? 1
870             : $data->{ua} =~ /email/i ? 3
871             # : $data->{ua} =~ /windows/ ? 1
872             : 0;
873             }
874              
875             # if we fail more than one spam test...
876 0 0         if ( $spam_score > 2 ) {
877 0           $count->{spam}++;
878 0 0 0       if ( defined $data->{bytes}
879             && $data->{bytes} =~ /[0-9]+/ )
880             {
881 0           $count->{bytes} += $data->{bytes};
882             }
883              
884 0           $count->{spam_agents}{ $data->{ua} }++;
885 0           $count->{spam_referers}{ $data->{ref} }++;
886              
887             # printf "%3s - %30s - %30s \n", $data->{status},
888             # $data->{ref}, $data->{ua};
889 0           next; # skips processing the line
890             }
891              
892             # TODO: also keep track of ham referers, and print in referer spam reports, so
893             # that I can see which UA are entirely spammers and block them in my Apache
894             # config.
895             # else {
896             # $count->{ham_referers}{$data->{ref}}++;
897             # }
898             };
899              
900             sub open_vhost_handle {
901 0     0 0   my $self = shift;
902 0           my $vhost = shift;
903              
904 0           my $fh = new FileHandle; # create a file handle for each ServerName
905 0           $fhs{$vhost} = $fh; # store in a hash keyed off the domain name
906              
907 0           my $debug = $self->{debug};
908              
909 0           my $dir = $self->{conf}{tmpdir}; # normally /var/log/(apache|http)/tmp
910 0 0         open( $fh, '>', "$dir/doms/$vhost" ) and do {
911 0 0         if ( $debug > 1 ) {
912 0           print " ";
913 0           printf "opening file for %35s...ok\n", $vhost;
914             }
915 0           return $fh;
916             };
917              
918 0           print " ";
919 0           printf "opening file for %35s...FAILED.\n", $vhost;
920 0           return;
921             };
922              
923             sub report_bad_hits {
924 0     0 0   my ($self, $bad) = @_;
925              
926 0 0         return if ! $bad;
927              
928 0           my $conf = $self->{conf};
929 0           my $debug = $self->{debug};
930 0           my $REPORT = $self->{report};
931              
932 0 0         printf "Default: %15.0f lines to $conf->{default_vhost}.\n", $bad if $debug;
933 0           my $msg = "\nSee the FAQ (logging) to see why records get assigned to the default vhost.\n\n";
934 0 0         print $msg if $debug;
935 0           print $REPORT $msg;
936             };
937              
938             sub report_matches {
939 0     0 0   my ($self, $count, $orphans ) = @_;
940              
941 0           my $debug = $self->{debug};
942 0           my $conf = $self->{conf};
943 0           my $REPORT = $self->{report};
944 0   0       my $countlog = $conf->{CountLog} || 1;
945              
946 0 0         print "\n\t\t\t Matched Entries\n\n" if $debug;
947 0           print $REPORT "\n\t\t Matched Entries\n\n";
948              
949 0           my $HitLog = '';
950 0 0         $HitLog = $self->report_open("HitsPerVhost") if $countlog;
951              
952 0           foreach my $key ( keys %fhs ) {
953 0           close $fhs{$key};
954              
955 0 0         if ( $count->{$key} ) {
956 0 0         printf " %15.0f lines to %s\n", $count->{$key}, $key if $debug;
957 0           printf $REPORT " %15.0f lines to %s\n", $count->{$key}, $key;
958 0 0         print $HitLog "$key:$count->{$key}\n" if $countlog;
959             }
960             }
961 0 0         $self->report_close( $HitLog, $debug ) if $countlog;
962              
963 0 0         print "\n" if $debug;
964 0           print $REPORT "\n";
965              
966 0           foreach my $key ( keys %$orphans ) {
967 0 0         if ( $count->{$key} ) {
968 0 0         printf "Orphans: %15.0f lines to %s\n", $count->{$key}, $key if $debug;
969 0           printf $REPORT "Orphans: %15.0f lines to %s\n", $count->{$key}, $key;
970             }
971             }
972             };
973              
974             sub report_spam_hits {
975 0     0 1   my ($self, $count ) = @_;
976              
977 0 0         return if ! $count->{spam};
978              
979 0           my $conf = $self->{conf};
980 0           my $debug = $self->{debug};
981              
982 0 0         if ( $conf->{report_spam_user_agents} ) {
983              
984 0 0         if ( $debug ) {
985 0           printf "Referer spammers hit you $count->{spam} times";
986              
987 0           my $bytes = $count->{bytes};
988 0 0         if ( $bytes ) {
989 0 0         if ( $bytes > 1000000000 ) {
    0          
990 0           $bytes = sprintf "%.2f GB", $bytes / 1000000000;
991             }
992             elsif ( $bytes > 1000000 ) {
993 0           $bytes = sprintf "%.2f MB", $bytes / 1000000;
994             }
995             else {
996 0           $bytes = sprintf "%.2f KB", $bytes / 1000;
997             }
998              
999 0           print " and wasted $bytes of your bandwidth.";
1000             }
1001 0           print "\n\n";
1002             };
1003              
1004 0           my $REPORT = $self->{report};
1005 0           printf $REPORT "Referer Spam: %15.0f lines\n", $count->{spam};
1006              
1007 0           my $spamagents = $count->{spam_agents};
1008 0           foreach my $value ( sort { $spamagents->{$b} cmp $spamagents->{$a} } keys %$spamagents ) {
  0            
1009 0           print "\t $spamagents->{$value} \t $value\n";
1010             }
1011             }
1012              
1013 0 0         if ( $conf->{report_spam_referrers} ) { # This report can get very long
1014 0           my $sr = $count->{spam_referers};
1015 0           foreach ( sort { $sr->{$b} <=> $sr->{$a} } keys %$sr ) {
  0            
1016 0           print "$sr->{$_} \t $_\n";
1017             }
1018             }
1019             }
1020              
1021              
1022             sub _progress {
1023 0     0     my ($self, $mess) = @_;
1024 0           print {*STDERR} "$mess.\n";
  0            
1025 0           return;
1026             };
1027             sub _progress_begin {
1028 0     0     my ($self, $phase) = @_;
1029 0           print {*STDERR} "$phase...";
  0            
1030 0           return;
1031             };
1032             sub _progress_continue {
1033 0     0     print {*STDERR} '.';
  0            
1034 0           return;
1035             };
1036             sub _progress_end {
1037 0     0     my ($self,$mess) = @_;
1038 0 0         if ( $mess ) {
1039 0           print {*STDERR} "$mess\n";
  0            
1040             }
1041             else {
1042 0           print {*STDERR} "done\n";
  0            
1043             };
1044 0           return;
1045             };
1046              
1047             1;
1048             __END__