File Coverage

lib/Apache/Logmonster.pm
Criterion Covered Total %
statement 122 545 22.3
branch 32 330 9.7
condition 10 71 14.0
subroutine 23 40 57.5
pod 12 21 57.1
total 199 1007 19.7


line stmt bran cond sub pod time code
1             package Apache::Logmonster;
2              
3 3     3   28064 use strict;
  3         7  
  3         122  
4 3     3   17 use warnings;
  3         6  
  3         162  
5              
6             our $VERSION = '3.12';
7              
8 3     3   24 use Carp;
  3         23  
  3         267  
9 3     3   5616 use Compress::Zlib;
  3         291421  
  3         854  
10 3     3   25 use Cwd;
  3         6  
  3         198  
11             #use Data::Dumper;
12 3     3   2496 use Date::Parse;
  3         23600  
  3         377  
13 3     3   2411 use FileHandle;
  3         2419  
  3         15  
14 3     3   1193 use File::Basename;
  3         3  
  3         194  
15 3     3   2378 use File::Copy;
  3         9263  
  3         168  
16 3     3   2349 use Regexp::Log;
  3         5094  
  3         98  
17              
18 3     3   51 use lib 'lib';
  3         4  
  3         27  
19 3     3   3245 use Apache::Logmonster::Utility;
  3         12  
  3         180  
20 3     3   2343 use Regexp::Log::Monster;
  3         7  
  3         6011  
21             my ( $util, $err, %fhs, $debug );
22              
23             sub new {
24 3     3 1 3077 my $class = shift;
25 3   50     28 $debug = shift || 0;
26              
27 3 50       20 my $self = {
28             conf => undef,
29             debug => $debug ? 1 : 0,
30             };
31              
32 3         9 bless( $self, $class );
33              
34 3         13 $self->get_util();
35              
36 3         11 return $self;
37             };
38              
39             sub check_awstats_file {
40 0     0 1 0 my $self = shift;
41 0         0 my $domain = shift;
42              
43 0         0 my $conf = $self->{'conf'};
44 0   0     0 my $confdir = $conf->{confdir} || '/etc/awstats';
45 0   0     0 my $statsdir = $conf->{statsdir} || '/var/db/awstats';
46              
47 0         0 my $adc = "$confdir/awstats.$domain.conf";
48              
49 0 0       0 return if -f $adc;
50              
51 0         0 $util->file_write( $adc,
52             lines => [
53             <<"EO_AWSTATS_VHOST"
54             Include "$confdir/awstats.model.conf"
55             SiteDomain = $domain
56             DirData = $statsdir/$domain
57             HostAliases = $domain localhost 127.0.0.1
58             EO_AWSTATS_VHOST
59             ],
60             debug => 0,
61             );
62             };
63              
64             sub check_config {
65 1     1 1 7720 my $self = shift;
66 1         9 my $conf = $self->{'conf'};
67              
68 1         10 $err = "performing sanity tests";
69 1 50       18 $self->_progress_begin($err) if $debug;
70              
71 1 50       15 print "\n\t verbose mode $debug\n" if $debug > 1;
72              
73 1 50       13 if ( $debug > 1 ) {
74 0         0 print "\t clean mode ";
75 0 0       0 print $conf->{'clean'} ? "enabled.\n" : "disabled.\n";
76             }
77              
78 1         6 my $tmpdir = $conf->{tmpdir};
79 1 50       8 print "\t temporary working directory is $tmpdir.\n" if $debug > 1;
80              
81 1 50       57 if ( ! -d $tmpdir ) {
82 0 0       0 print "\t temp dir does not existing, creating..." if $debug > 1;
83 0 0       0 if ( !mkdir $tmpdir, oct('0755') ) {
84 0         0 die "FATAL: The directory $tmpdir does not exist and I could not "
85             . "create it. Edit logmonster.conf or create it.\n";
86             }
87 0 0       0 print "done.\n" if $debug > 1;
88              
89             # this will fail unless we're root, but that should not matter much
90 0 0       0 print "\t setting permissions on temp dir..." if $debug > 1;
91 0 0 0     0 $util->chown( $tmpdir,
      0        
92             uid => $conf->{'log_user'} || 'www',
93             gid => $conf->{'log_group'} || 'www',
94             debug => $debug > 1 ? 1 : 0,
95             fatal => 0,
96             );
97 0 0       0 print "done.\n" if $debug > 1;
98             }
99              
100 1 50 33     64 if ( !-w $tmpdir || !-r $tmpdir ) {
101 0         0 croak "FATAL: \$tmpdir ($tmpdir) must be read and writable!";
102             }
103              
104 1 50       15 if ( $conf->{'clean'} ) {
105 1 50       24 if ( !$util->clean_tmp_dir( $tmpdir, debug => 1, fatal=>0 ) ) {
106 0         0 croak "\nfailed to clean out $tmpdir";
107             }
108             }
109              
110 1 50       10 die "\nFATAL: you must edit logmonster.conf and set default_vhost!\n"
111             if ! defined $conf->{'default_vhost'};
112              
113 1 50       9 if ( $conf->{'time_offset'} ) {
114 0         0 my ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date( debug=>0 );
115              
116 0   0     0 my $interval = $self->{rotation_interval} || 'day';
117 0         0 my $bump = $conf->{time_offset};
118 0         0 my $logbase = $conf->{logbase};
119              
120 0 0       0 my $how_far_back = $interval eq "hour" ? .04 # back 1 hour
    0          
121             : $interval eq "month" ? $dd + 1 # last month
122             : 1; # 1 day
123              
124 0         0 ( $dd, $mm, $yy, $lm, $hh, $mn )
125             = $util->get_the_date( bump => $bump + $how_far_back, debug => 0 );
126              
127 0 0       0 die "OK then, try again.\n"
128             if ! $util->yes_or_no(
129             "\nDoes the date $yy/$mm/$dd look correct? ");
130             }
131              
132 1 50       5 $self->_progress_end('passed') if $debug == 1;
133              
134 1         39 return 1;
135             };
136              
137             sub compress_log_file {
138 0     0 1 0 my $self = shift;
139 0         0 my $host = shift;
140 0         0 my $logfile = shift;
141              
142 0         0 my $debug = $self->{'debug'};
143              
144 0 0 0     0 unless ( $host && $logfile ) {
145 0         0 croak "compress_log_file: called incorrectly!";
146             }
147              
148 0         0 my $REPORT = $self->{'report'};
149              
150 0 0       0 if ( $host eq "localhost" ) {
151 0         0 my $gzip = $util->find_bin( 'gzip', debug => 0 );
152              
153 0 0       0 if ( !-e $logfile ) {
154 0         0 print $REPORT "compress_log_file: $logfile does not exist!\n";
155 0 0       0 if ( -e "$logfile.gz" ) {
156 0         0 print $REPORT " already compressed as $logfile.gz!\n";
157 0         0 return 1;
158             }
159 0         0 return;
160             }
161              
162 0         0 my $cmd = "$gzip $logfile";
163 0 0       0 $self->_progress("gzipping localhost:$logfile") if $debug;
164 0         0 print $REPORT "syscmd: $cmd\n";
165 0         0 my $r = $util->syscmd( $cmd, debug => 0 );
166 0 0       0 print $REPORT "syscmd: error result: $r\n" if ( $r != 0 );
167              
168 0         0 return 1;
169             }
170              
171 0 0       0 $self->_progress_begin("checking $host for $logfile") if $debug;
172              
173             # $host is remote, so we interact via SSH
174 0         0 my $ssh = $util->find_bin( "ssh", debug => 0 );
175 0         0 my $cmd = "$ssh $host test -f $logfile";
176              
177 0         0 print $REPORT "compress_log_file: checking $host\n";
178 0         0 print $REPORT "\tsyscmd: $cmd\n";
179              
180 0 0       0 $self->_progress_continue() if $debug;
181              
182             # does the file exist?
183 0 0       0 if ( !$util->syscmd( $cmd, debug => 0, fatal => 0 ) ) {
184 0 0       0 $self->_progress_continue() if $debug;
185              
186             # does file.gz exist?
187 0 0       0 if ( -f "$logfile.gz" ) {
188 0         0 $err = "ALREADY COMPRESSED";
189 0         0 print $REPORT "\t$err\n";
190              
191 0 0       0 $self->_progress_end($err) if $debug;
192              
193 0         0 return 1;
194             }
195 0 0       0 $self->_progress_end("no") if $debug;
196              
197 0         0 print $REPORT "no\n";
198 0         0 return;
199             }
200              
201 0 0       0 $self->_progress_end("yes") if $debug;
202              
203 0         0 print $REPORT "yes\n";
204              
205 0         0 $err = "compressing log file on $host";
206 0 0       0 $self->_progress_begin($err) if $debug;
207              
208 0         0 $cmd = "$ssh $host gzip $logfile";
209 0         0 print $REPORT "\tcompressing\n\tsyscmd: $cmd \n";
210              
211 0 0       0 $self->_progress_continue() if !$debug;
212              
213 0         0 my $r = $util->syscmd( $cmd, debug => 0, fatal => 0 );
214 0 0       0 if ( !$r ) {
215 0         0 print $REPORT "\terror result: $r\n";
216 0         0 return;
217             }
218             $debug
219 0 0       0 ? print "done\n"
220             : $self->_progress_end();
221              
222 0         0 return 1;
223             };
224              
225             sub consolidate_logfile {
226              
227 0     0 1 0 my $self = shift;
228 0         0 my $host = shift;
229 0         0 my $remote_logfile = shift;
230 0         0 my $local_logfile = shift;
231              
232 0         0 my $dry_run = $self->{'dry_run'};
233 0         0 my $debug = $self->{'debug'};
234 0         0 my $REPORT = $self->{'report'};
235              
236 0         0 my ( $r, $size );
237              
238             # retrieve yesterdays log files
239 0 0       0 if ( $host eq "localhost" ) {
240 0         0 $err
241             = "consolidate_logfile: checking localhost for\n\t $remote_logfile...";
242 0 0       0 $self->_progress_begin($err) if $debug;
243 0         0 print $REPORT $err;
244              
245             # requires "use File::Copy"
246 0         0 $r = copy $remote_logfile, $local_logfile;
247 0 0       0 print $REPORT "FAILED: $!\n" unless ($r);
248              
249 0         0 $size = ( stat $local_logfile )[7];
250              
251 0 0       0 if ( $size > 1000000 ) { $size = sprintf "%.2f MB", $size / 1000000; }
  0         0  
252 0         0 else { $size = sprintf "%.2f KB", $size / 1000; }
253              
254 0         0 $err = "retrieved $size\n";
255 0 0       0 $self->_progress_end($err) if $debug;
256 0         0 print $REPORT $err;
257 0         0 return 1;
258             }
259              
260 0 0       0 return 1 if $dry_run;
261              
262 0         0 my $scp = $util->find_bin( "scp", debug => 0 );
263 0         0 $scp .= " -q";
264              
265 0 0       0 $self->_progress_begin("\tconsolidate_logfile: fetching") if $debug;
266              
267 0         0 print $REPORT
268             "\tsyscmd: $scp \n\t\t$host:$remote_logfile \n\t\t$local_logfile\n";
269              
270 0         0 $r = $util->syscmd( "$scp $host:$remote_logfile $local_logfile",
271             debug => 0
272             );
273 0 0       0 print $REPORT "syscmd: error result: $r\n" if !$r;
274              
275 0         0 $size = ( stat $local_logfile )[7];
276 0 0       0 if ( !$size ) {
277 0         0 $err = "FAILED. No logfiles retrieved!";
278 0 0       0 $self->_progress_end($err) if $debug;
279 0         0 print $REPORT "\t $err \n";
280 0         0 return;
281             }
282              
283 0 0       0 if ( $size > 1000000 ) { $size = sprintf "%.2f MB", $size / 1000000; }
  0         0  
284 0         0 else { $size = sprintf "%.2f KB", $size / 1000; }
285              
286 0         0 $err = "retrieved $size";
287 0 0       0 $self->_progress_end($err) if $debug;
288 0         0 print $REPORT "\t $err\n";
289              
290 0         0 return 1;
291             };
292              
293             sub feed_the_machine {
294 0     0 1 0 my $self = shift;
295 0         0 my $domains_ref = shift;
296              
297 0 0 0     0 if ( !$domains_ref || ref $domains_ref ne 'HASH' ) {
298 0         0 croak "feed_the_machine: invalid parameters passed.";
299             }
300              
301 0         0 my $debug = $self->{'debug'};
302 0         0 my $conf = $self->{'conf'};
303 0         0 my $REPORT = $self->{'report'};
304 0         0 my $interval = $self->{'rotation_interval'};
305              
306 0         0 my ( $cmd, $r );
307              
308 0         0 my $tmpdir = $conf->{'tmpdir'};
309 0         0 my $processor = $conf->{'processor'};
310              
311 0         0 foreach my $file ( $util->get_dir_files( "$tmpdir/doms" ) ) {
312 0 0       0 next if ( $file =~ /\.bak$/ );
313              
314 3     3   22 use File::Basename;
  3         4  
  3         4520  
315 0         0 my $domain = fileparse($file);
316 0         0 my $statsdir = "$conf->{'statsdir'}/$domain";
317              
318 0         0 $util->cwd_source_dir( $statsdir, debug => 0 );
319              
320 0 0       0 if ( ! -d $statsdir ) {
321 0 0       0 print "skipping $file because $statsdir is not a directory.\n"
322             if $debug;
323 0         0 next;
324             };
325              
326             # allow domain to select their stats processor
327 0 0       0 if ( -f "$statsdir/.processor" ) {
328 0         0 $processor = `head -n1 $statsdir/.processor`;
329 0         0 chomp $processor;
330             }
331              
332 0 0       0 if ( $processor eq "webalizer" ) {
    0          
    0          
333 0         0 my $webalizer
334             = $util->find_bin( "webalizer", debug => 0 );
335 0 0       0 $webalizer .= " -q" if !$debug;
336 0 0 0     0 $webalizer .= " -p"
337             if ( $interval eq "hour" || $interval eq "day" );
338 0         0 $cmd = "$webalizer -n $domain -o $statsdir $file";
339 0 0       0 printf "$webalizer -n %-20s -o $statsdir\n", $domain if $debug;
340 0         0 printf $REPORT "$webalizer -n %-20s -o $statsdir\n", $domain;
341             }
342             elsif ( $processor eq "http-analyze" ) {
343 0         0 my $http_analyze
344             = $util->find_bin( "http-analyze", debug => 0 );
345 0 0 0     0 $http_analyze .= " -d"
346             if ( $interval eq "hour" || $interval eq "day" );
347 0 0       0 $http_analyze .= " -m" if ( $interval eq "month" );
348 0         0 $cmd = "$http_analyze -S $domain -o $statsdir $file";
349 0 0       0 printf "$http_analyze -S %-20s -o $statsdir\n", $domain if $debug;
350 0         0 printf $REPORT "$http_analyze -S %-20s -o $statsdir\n", $domain;
351             }
352             elsif ( $processor eq "awstats" ) {
353 0         0 $self->check_awstats_file( $domain );
354              
355 0         0 my $aws_cgi = "/usr/local/www/awstats/cgi-bin"; # freebsd port location
356 0 0       0 $aws_cgi = "/usr/local/www/cgi-bin" unless -d $aws_cgi;
357 0 0       0 $aws_cgi = "/var/www/cgi-bin" unless -d $aws_cgi;
358              
359 0         0 my $awstats = $util->find_bin( "awstats.pl",
360             debug => 0,
361             dir => $aws_cgi,
362             );
363 0         0 $cmd = "$awstats -config=$domain -logfile=$file";
364 0 0       0 printf "$awstats for \%-20s to $statsdir\n", $domain if $debug;
365 0         0 printf $REPORT "$awstats for \%-20s to $statsdir\n", $domain;
366             }
367             else {
368 0         0 $err = "Sorry, that is not supported! Valid options are: webalizer, http-analyze, and awstats.\n";
369 0         0 print $err;
370 0         0 print $REPORT $err;
371             }
372              
373 0 0       0 unless ( $self->{'dry_run'} ) {
374 0 0       0 print "running $processor!\n" if $debug;
375 0 0       0 print $REPORT "syscmd: $cmd\n" if $debug;
376 0         0 $r = $util->syscmd( $cmd, debug => 0 );
377 0 0       0 print $REPORT "syscmd: error result: $r\n" if ( $r != 0 );
378             }
379              
380              
381 0 0       0 if ( $conf->{'clean'} ) {
382 0         0 $util->file_delete( $file, debug => 0 );
383 0         0 next;
384             }
385              
386 0         0 print "\nDon't forget about $file\n";
387 0         0 print $REPORT "\nDon't forget about $file\n";
388             }
389             };
390              
391             sub fetch_log_files {
392              
393 0     0 1 0 my $self = shift;
394 0         0 my $debug = $self->{'debug'};
395 0         0 my $conf = $self->{'conf'};
396 0         0 my $dry_run = $self->{'dry_run'};
397              
398 0         0 my $r;
399              
400             # in a format like this: /var/log/apache/200?/09/25
401 0         0 my $logdir = $self->get_log_dir();
402 0         0 my $tmpdir = $conf->{'tmpdir'};
403              
404 0         0 my $access_log = "$logdir/" . $conf->{'access'};
405 0         0 my $error_log = "$logdir/" . $conf->{'error'};
406              
407 0 0       0 print "fetch_log_files: warming up.\n" if $debug > 1;
408              
409             WEBHOST:
410 0         0 foreach my $webserver ( split( / /, $conf->{'hosts'} ) ) {
411 0         0 my $compressed = 0;
412              
413 0 0       0 if ( !$dry_run ) {
414              
415             # compress yesterdays log files
416 0         0 $self->compress_log_file( $webserver, $error_log );
417              
418 0 0       0 if ( !$self->compress_log_file( $webserver, $access_log ) ) {
419              
420             # if there is no compressed logfile, there is no point in
421             # trying to retrieve.
422 0         0 next WEBHOST;
423             }
424             }
425              
426 0         0 my $local_logfile = "$tmpdir/$webserver-" . $conf->{'access'} . ".gz";
427              
428 0         0 $self->consolidate_logfile(
429             $webserver, # hostname to retrieve from
430             "$access_log.gz", # the logfile to fetch
431             $local_logfile, # where to put it
432             );
433             }
434              
435 0         0 return 1;
436             };
437              
438             sub get_log_dir {
439              
440 5     5 1 12 my $self = shift;
441 5         11 my $debug = $self->{'debug'};
442 5         8 my $conf = $self->{'conf'};
443              
444 5   50     41 my $interval = $self->{'rotation_interval'} || "day";
445              
446 5 50       15 unless ($conf) {
447 0         0 croak "get_log_dir: \$conf is not set!\n";
448             }
449              
450 5         11 my $bump = $conf->{'time_offset'};
451 5         16 my $logbase = $conf->{'logbase'};
452              
453 5         27 my ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date( debug => 0 );
454              
455 5 50       24 my $how_far_back = $interval eq "hour"
    50          
456             ? .04 # back 1 hour
457             : $interval eq "month" ? $dd + 1 # last month
458             : 1; # 1 day
459              
460 5 50       11 if ($bump) {
461 0 0       0 ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date(
462             bump => $bump + $how_far_back,
463             debug => $debug > 1 ? 1 : 0,
464             );
465             }
466             else {
467 5 50       29 ( $dd, $mm, $yy, $lm, $hh, $mn ) = $util->get_the_date(
468             bump => $how_far_back,
469             debug => $debug > 1 ? 1 : 0,
470             );
471             }
472              
473 5 0       34 my $logdir
    50          
    50          
474             = $interval eq "hour" ? "$logbase/$yy/$mm/$dd/$hh"
475             : $interval eq "day" ? "$logbase/$yy/$mm/$dd"
476             : $interval eq "month" ? "$logbase/$yy/$mm"
477             : "$logbase";
478              
479 5 50       14 print "get_log_dir: using $logdir\n" if $debug > 1;
480 5         24 return $logdir;
481             };
482              
483             sub get_log_files {
484 0     0 0 0 my $self = shift;
485 0 0       0 my $dir = shift or die "missing dir argument";
486              
487 0         0 my @logs = <$dir/*.gz>;
488              
489 0         0 my $debug = $self->{debug};
490 0         0 my $REPORT = $self->{report};
491              
492             # make sure we have logs to process
493 0 0 0     0 if ( !$logs[0] or $logs[0] eq '' ) {
494 0         0 $err = "WARNING: No web server log files found!\n";
495 0 0       0 print $err if $debug;
496 0         0 print $REPORT $err;
497 0         0 return;
498             }
499              
500 0 0       0 if ( $debug > 1 ) {
501 0         0 print "found logfiles \n\t" . join( "\n\t", @logs ) . "\n";
502             };
503              
504 0         0 return @logs;
505             };
506              
507             sub get_config {
508 1     1 0 4 my ($self, $file, $config) = @_;
509              
510 1 50 33     7 if ( $config && ref $config eq 'HASH' ) {
511 0         0 $self->{conf} = $config;
512 0         0 return $config;
513             }
514              
515 1 50 33     6 return $self->{conf} if (defined $self->{conf} && ref $self->{conf});
516              
517 1   50     7 $self->{conf} = $util->parse_config( $file || 'logmonster.conf' );
518 1         3 return $self->{conf};
519             };
520              
521             sub get_util {
522 4     4 0 581 my $self = shift;
523 4 100       15 return $util if ref $util;
524 3     3   21 use lib 'lib';
  3         6  
  3         29  
525 3         21 require Apache::Logmonster::Utility;
526 3         38 $self->{util} = $util = Apache::Logmonster::Utility->new( debug => $self->{debug} );
527 3         8 return $util;
528             };
529              
530             sub report_hits {
531              
532 1     1 1 22 my $self = shift;
533 1         3 my $logdir = shift;
534 1         3 my $debug = $self->{'debug'};
535              
536 1         3 $self->{'debug'} = 0; # hush get_log_dir
537 1   33     5 $logdir ||= $self->get_log_dir();
538              
539 1         4 my $vhost_count_summary = $logdir . "/HitsPerVhost.txt";
540              
541             # fail if $vhost_count_summary is not present
542 1 50 33     51 unless ( $vhost_count_summary
      33        
543             && -e $vhost_count_summary
544             && -f $vhost_count_summary )
545             {
546 0         0 print
547             "report_hits: ERROR: hit summary file is missing. It should have"
548             . " been at: $vhost_count_summary. Report FAILURE.\n";
549 0         0 return;
550             }
551              
552 1 50       10 print "report_hits: reporting summary from file $vhost_count_summary\n"
553             if $debug;
554              
555 1         17 my @lines = $util->file_read( $vhost_count_summary,
556             debug => $debug,
557             fatal => 0,
558             );
559              
560 1         3 my $lines_in_array = @lines;
561              
562 1 50       5 if ( $lines_in_array > 0 ) {
563 1         137 print join( ':', @lines ) . "\n";
564 1         8 return 1;
565             }
566              
567 0 0       0 print "report_hits: no entries found!\n" if $debug;
568 0         0 return;
569             };
570              
571             sub report_close {
572 2     2 0 461 my $self = shift;
573 2         6 my $fh = shift;
574              
575 2 50       16 if ($fh) {
576 2         100 close($fh);
577 2         42 return 1;
578             }
579              
580 0         0 carp "report_close: was not passed a valid filehandle!";
581 0         0 return;
582             };
583              
584             sub report_open {
585 2     2 0 652 my $self = shift;
586 2         5 my $vhost = shift;
587 2         7 my $debug = $self->{'debug'};
588              
589 2 50       7 $vhost || croak "report_open: no filename passed!";
590              
591 2         8 my $logdir = $self->get_log_dir();
592              
593 2 50 33     53 unless ( $logdir && -w $logdir ) {
594 2         478 print "\tNOTICE!\nreport_open: logdir $logdir is not writable!\n";
595 2         6 $logdir = "/tmp";
596             }
597              
598 2         7 my $report_file = "$logdir/$vhost.txt";
599 2         2 my $REPORT;
600              
601 2 50       231 if ( !open $REPORT, ">", $report_file ) {
602 0         0 carp "couldn't open $report_file for write: $!";
603 0         0 return;
604             }
605              
606 2 50       9 print "\n *** this report is saved in $report_file *** \n" if $debug;
607 2         8 return $REPORT;
608             };
609              
610             sub sort_vhost_logs {
611              
612             ############################################
613             # Usage : see t/Logmonster.t for usage example
614             # Purpose : since the log entries for each host are concatenated, they are
615             # no longer in cronological order. Most stats post-processors
616             # require that log entries be in chrono order so this sorts them
617             # based on their log entry date, which also resolves any timezone
618             # differences.
619             # Returns : boolean, 1 for success
620             # Parameters : conf - hashref of setting from logmonster.conf
621             # report
622              
623 0     0 1   my $self = shift;
624 0           my $debug = $self->{'debug'};
625 0           my $conf = $self->{'conf'};
626 0           my $REPORT = $self->{'report'};
627              
628 0           my ( %beastie, %sortme );
629              
630 0   0       my $dir = $conf->{'tmpdir'} || croak "tmpdir not set in \$conf";
631              
632 0 0         if ( $self->{'host_count'} < 2 ) {
633 0 0         print "sort_vhost_logs: only one log host, skipping sort.\n"
634             if $debug;
635 0           return 1; # sort not needed with only one host
636             }
637              
638 0 0         $self->_progress_begin("sort_vhost_logs: sorting each vhost logfile...")
639             if $debug == 1;
640              
641 0           my $lines = 0;
642              
643             VHOST_FILE:
644 0           foreach
645             my $file ( $util->get_dir_files( "$dir/doms", fatal => 0 ) )
646             {
647 0           undef %beastie; # clear the hash
648 0           undef %sortme;
649              
650 0 0         if ( -s $file > 10000000 ) {
651 0 0         print "\nsort_vhost_logs: logfile $file is greater than 10MB\n"
652             if $debug;
653 0           print $REPORT
654             "sort_vhost_logs: logfile $file is greater than 10MB\n";
655             }
656              
657 0 0         unless ( open UNSORTED, "<", $file ) {
658 0           warn
659             "\nsort_vhost_logs: WARN: could not open input file $file: $!";
660 0           next VHOST_FILE;
661             }
662              
663             # make sure we can write out the results before doing all the work
664 0 0         unless ( open SORTED, ">", "$file.sorted" ) {
665 0 0         print
666             "\n sort_vhost_logs: FAILED: could not open output file $file: $!\n"
667             if $debug;
668 0           next VHOST_FILE;
669             }
670              
671 0 0         $self->_progress_begin(" sorting $file...") if $debug > 1;
672              
673 0           while () {
674 0 0         $self->_progress_continue() if $debug > 1;
675 0           chomp;
676             ###
677             # Per Earl Ruby, switched from / / to /\s+/ so that naughty modules like
678             # Apache::Register that insert extra spaces in the Log output won't mess
679             # up logmonsters parsing.
680             # @log_entry_fields = split(/ /, $_) => @log.. = split(/\s+/, $_)
681             ###
682             # sample log entry
683             #216.220.22.182 - - [16/Jun/2004:09:37:51 -0400] "GET /images/google-g.jpg HTTP/1.1" 200 539 "http://www.tnpi.biz/internet/mail/toaster/" "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.6) Gecko/20040113" www.thenetworkpeople.biz
684              
685             # From an Apache log entry, we first split apart the line based on whitespace
686              
687             my @log_entry_fields
688 0           = split( /\s+/, $_ ); # split the log entry into fields
689              
690             # Then we use substr to extract the middle 26 characters:
691             # 16/Jun/2004:09:37:51 -0400
692             #
693             # We could also use a regexp to do this but substr is more efficient and we
694             # can safely expect the date format of ApacheLog to remain constant.
695              
696 0           my $rawdate
697             = substr( "$log_entry_fields[3] $log_entry_fields[4]", 1,
698             26 );
699              
700             # then we convert that date string to a numeric string that we can use for sorting.
701              
702 0           my $date = str2time($rawdate);
703              
704             # Finally, we put the entire line into the hash beastie (keyed with $lines,
705             # an incrementing number) and create a second hash ($sortme) with the
706             # same key but the value is the timestamp.
707              
708 0           $beastie{$lines} = $_;
709 0           $sortme{$lines} = $date;
710              
711 0           $lines++;
712             }
713 0 0         close(UNSORTED)
714             || croak "sort_vhost_logs: Gack, could not close $file: $!\n";
715 0 0         $self->_progress_end() if $debug > 1;
716              
717             # We create an array (because elements in arrays stay in order) of line
718             # numbers based on the sortme hash, sorted based on date
719              
720 0 0         my @sorted = sort {
721 0           ( $sortme{$a} <=> $sortme{$b} )
722             || ( $sortme{$a} cmp $sortme{$b} );
723             } ( keys(%sortme) );
724              
725 0           foreach (@sorted) {
726              
727             # iterate through @sorted, adding the corresponding lines from %beastie to the file
728 0           print SORTED "$beastie{$_}\n";
729             }
730 0           close SORTED;
731              
732 0 0         move( "$file.sorted", $file )
733             or carp
734             "sort_vhost_logs: could not replace $file with $file.sorted: $!\n";
735              
736 0 0         $self->_progress_continue() if $debug == 1;
737             }
738              
739 0 0         $self->_progress_end() if $debug == 1;
740              
741 0           return 1;
742             };
743              
744             sub split_logs_to_vhosts {
745 0     0 1   my $self = shift;
746              
747 0           my $debug = $self->{'debug'};
748 0           my $conf = $self->{'conf'};
749 0           my $REPORT = $self->{'report'};
750              
751 0           my ( %count, %orphans, $bad );
752              
753 0           my $dir = $conf->{'tmpdir'}; # normally /var/log/(apache|http)/tmp
754 0   0       my $countlog = $conf->{'CountLog'} || 1;
755              
756 0           my @webserver_logs = $self->get_log_files($dir);
757              
758 0 0         if ( !-d "$dir/doms" ) {
759 0 0         if ( !mkdir "$dir/doms", oct('0755') ) {
760 0           $err = "FATAL: couldn't create $dir/doms: $!\n";
761 0           print $REPORT $err;
762 0           die $err;
763             }
764             }
765              
766 0 0         print "\t output working dirs is $dir/doms\n" if $debug > 1;
767              
768             # use my Regexp::Log::Monster
769 0           my $regexp_parser = Regexp::Log::Monster->new(
770             format => ':logmonster',
771             capture => [qw( host vhost status bytes ref ua )],
772             );
773              
774             # Apache fields
775             # host, ident, auth, date, request, status, bytes, referer, agent, vhost
776             # returned from parser (available for capture) as:
777             # host, rfc, authuser, date, ts, request, req, status, bytes, referer, ref, useragent, ua, vhost
778              
779 0           my @captured_fields = $regexp_parser->capture;
780 0           my $re = $regexp_parser->regexp;
781              
782 0           foreach my $file (@webserver_logs) {
783              
784 0 0         my $gz = gzopen( $file, 'rb' ) or do {
785 0           warn "Couldn't open $file: $gzerrno";
786 0           next;
787             };
788              
789 0           my $lines = 0;
790 0 0         $self->_progress_begin("\t parsing entries from $file") if $debug;
791              
792 0           while ( $gz->gzreadline($_) > 0 ) {
793 0           chomp $_;
794 0           $lines++;
795 0 0 0       $self->_progress_continue() if ( $debug && $lines =~ /00$/ );
796              
797 0           my %data;
798 0           @data{@captured_fields} = /$re/; # no need for /o, a compiled regexp
799              
800             # make sure the log format has the vhost tag appended
801 0           my $vhost = $data{'vhost'};
802 0 0 0       if ( !$vhost || $vhost eq '-' ) {
803             #print "Invalid log entries! Read the FAQ!\n" if $debug;
804 0 0         print $_ . "\n" if $debug > 2;
805 0           $vhost = $conf->{default_vhost};
806 0           $bad++;
807             };
808              
809 0           $vhost = lc($vhost);
810              
811 0           $self->spam_check(\%data, \%count);
812              
813 0 0         if ( ! $fhs{$vhost} ) {
814 0           $self->open_vhost_handle( $vhost );
815             };
816 0 0         if ( $fhs{$vhost} ) {
817 0           my $fh = $fhs{$vhost};
818 0           print $fh "$_\n";
819 0           $count{$vhost}++;
820 0           next;
821             };
822 0 0         print "\nthe main domain for $vhost is missing!\n" if $debug > 1;
823 0           $orphans{$vhost} = $vhost;
824             };
825 0           $gz->gzclose();
826              
827 0 0         $self->_progress_end() if $debug;
828             };
829              
830 0           $self->report_matches( \%count, \%orphans);
831 0           $self->report_spam_hits( \%count );
832 0           $self->report_bad_hits( $bad );
833              
834 0           return \%fhs;
835             };
836              
837             sub spam_check {
838 0     0 0   my ($self, $data, $count) = @_;
839 0           my $conf = $self->{conf};
840              
841 0 0         return if ! $conf->{spam_check};
842              
843 0           my $spam_score = 0;
844              
845             # check for spam quotient
846 0 0         if ( $data->{status} ) {
847 0 0         if ( $data->{status} == 404 ) { # check for 404 status
848 0           $spam_score++; # a 404 alone is not a sign of naughtiness
849             }
850              
851 0 0         if ( $data->{status} == 412 ) { # httpd config slapping them
852 0           $spam_score++;
853             }
854              
855 0 0         if ( $data->{status} == 403 ) { # httpd config slapping them
856 0           $spam_score += 2;
857             }
858             }
859              
860             # nearly all of my referer spam has a # ending the referer string
861 0 0 0       if ( $data->{ref} && $data->{ref} =~ /#$/ ) {
862 0           $spam_score += 2;
863             }
864              
865             # should check for invalid/suspect useragent strings here
866 0 0         if ( $data->{ua} ) {
867 0 0         $spam_score +=
    0          
868             $data->{ua} =~ /crazy/ixms ? 1
869             : $data->{ua} =~ /email/i ? 3
870             # : $data->{ua} =~ /windows/ ? 1
871             : 0;
872             }
873              
874             # if we fail more than one spam test...
875 0 0         if ( $spam_score > 2 ) {
876 0           $count->{spam}++;
877 0 0 0       if ( defined $data->{bytes}
878             && $data->{bytes} =~ /[0-9]+/ )
879             {
880 0           $count->{bytes} += $data->{bytes};
881             }
882              
883 0           $count->{spam_agents}{ $data->{ua} }++;
884 0           $count->{spam_referers}{ $data->{ref} }++;
885              
886             # printf "%3s - %30s - %30s \n", $data->{status},
887             # $data->{ref}, $data->{ua};
888 0           next; # skips processing the line
889             }
890              
891             # TODO: also keep track of ham referers, and print in referer spam reports, so
892             # that I can see which UA are entirely spammers and block them in my Apache
893             # config.
894             # else {
895             # $count->{ham_referers}{$data->{ref}}++;
896             # }
897             };
898              
899             sub open_vhost_handle {
900 0     0 0   my $self = shift;
901 0           my $vhost = shift;
902              
903 0           my $fh = new FileHandle; # create a file handle for each ServerName
904 0           $fhs{$vhost} = $fh; # store in a hash keyed off the domain name
905              
906 0           my $debug = $self->{debug};
907              
908 0           my $dir = $self->{conf}{tmpdir}; # normally /var/log/(apache|http)/tmp
909 0 0         open( $fh, '>', "$dir/doms/$vhost" ) and do {
910 0 0         if ( $debug > 1 ) {
911 0           print " ";
912 0           printf "opening file for %35s...ok\n", $vhost;
913             }
914 0           return $fh;
915             };
916              
917 0           print " ";
918 0           printf "opening file for %35s...FAILED.\n", $vhost;
919 0           return;
920             };
921              
922             sub report_bad_hits {
923 0     0 0   my ($self, $bad) = @_;
924              
925 0 0         return if ! $bad;
926              
927 0           my $conf = $self->{conf};
928 0           my $debug = $self->{debug};
929 0           my $REPORT = $self->{report};
930              
931 0 0         printf "Default: %15.0f lines to $conf->{default_vhost}.\n", $bad if $debug;
932 0           my $msg = "\nSee the FAQ (logging) to see why records get assigned to the default vhost.\n\n";
933 0 0         print $msg if $debug;
934 0           print $REPORT $msg;
935             };
936              
937             sub report_matches {
938 0     0 0   my ($self, $count, $orphans ) = @_;
939              
940 0           my $debug = $self->{debug};
941 0           my $conf = $self->{conf};
942 0           my $REPORT = $self->{report};
943 0   0       my $countlog = $conf->{CountLog} || 1;
944              
945 0 0         print "\n\t\t\t Matched Entries\n\n" if $debug;
946 0           print $REPORT "\n\t\t Matched Entries\n\n";
947              
948 0 0         my $HitLog = $self->report_open("HitsPerVhost") if $countlog;
949              
950 0           foreach my $key ( keys %fhs ) {
951 0           close $fhs{$key};
952              
953 0 0         if ( $count->{$key} ) {
954 0 0         printf " %15.0f lines to %s\n", $count->{$key}, $key if $debug;
955 0           printf $REPORT " %15.0f lines to %s\n", $count->{$key}, $key;
956 0 0         print $HitLog "$key:$count->{$key}\n" if $countlog;
957             }
958             }
959 0 0         $self->report_close( $HitLog, $debug ) if $countlog;
960              
961 0 0         print "\n" if $debug;
962 0           print $REPORT "\n";
963              
964 0           foreach my $key ( keys %$orphans ) {
965 0 0         if ( $count->{$key} ) {
966 0 0         printf "Orphans: %15.0f lines to %s\n", $count->{$key}, $key if $debug;
967 0           printf $REPORT "Orphans: %15.0f lines to %s\n", $count->{$key}, $key;
968             }
969             }
970             };
971              
972             sub report_spam_hits {
973 0     0 1   my ($self, $count ) = @_;
974              
975 0 0         return if ! $count->{spam};
976              
977 0           my $conf = $self->{conf};
978 0           my $debug = $self->{debug};
979              
980 0 0         if ( $conf->{report_spam_user_agents} ) {
981              
982 0 0         if ( $debug ) {
983 0           printf "Referer spammers hit you $count->{spam} times";
984              
985 0           my $bytes = $count->{bytes};
986 0 0         if ( $bytes ) {
987 0 0         if ( $bytes > 1000000000 ) {
    0          
988 0           $bytes = sprintf "%.2f GB", $bytes / 1000000000;
989             }
990             elsif ( $bytes > 1000000 ) {
991 0           $bytes = sprintf "%.2f MB", $bytes / 1000000;
992             }
993             else {
994 0           $bytes = sprintf "%.2f KB", $bytes / 1000;
995             }
996              
997 0           print " and wasted $bytes of your bandwidth.";
998             }
999 0           print "\n\n";
1000             };
1001              
1002 0           my $REPORT = $self->{report};
1003 0           printf $REPORT "Referer Spam: %15.0f lines\n", $count->{spam};
1004              
1005 0           my $spamagents = $count->{spam_agents};
1006 0           foreach my $value ( sort { $spamagents->{$b} cmp $spamagents->{$a} } keys %$spamagents ) {
  0            
1007 0           print "\t $spamagents->{$value} \t $value\n";
1008             }
1009             }
1010              
1011 0 0         if ( $conf->{report_spam_referrers} ) { # This report can get very long
1012 0           my $sr = $count->{spam_referers};
1013 0           foreach ( sort { $sr->{$b} <=> $sr->{$a} } keys %$sr ) {
  0            
1014 0           print "$sr->{$_} \t $_\n";
1015             }
1016             }
1017             }
1018              
1019              
1020             sub _progress {
1021 0     0     my ($self, $mess) = @_;
1022 0           print {*STDERR} "$mess.\n";
  0            
1023 0           return;
1024             };
1025             sub _progress_begin {
1026 0     0     my ($self, $phase) = @_;
1027 0           print {*STDERR} "$phase...";
  0            
1028 0           return;
1029             };
1030             sub _progress_continue {
1031 0     0     print {*STDERR} '.';
  0            
1032 0           return;
1033             };
1034             sub _progress_end {
1035 0     0     my ($self,$mess) = @_;
1036 0 0         if ( $mess ) {
1037 0           print {*STDERR} "$mess\n";
  0            
1038             }
1039             else {
1040 0           print {*STDERR} "done\n";
  0            
1041             };
1042 0           return;
1043             };
1044              
1045             1;
1046             __END__