File Coverage

blib/lib/Logfile/Read.pm
Criterion Covered Total %
statement 296 296 100.0
branch 122 122 100.0
condition 26 26 100.0
subroutine 36 36 100.0
pod 6 6 100.0
total 486 486 100.0


line stmt bran cond sub pod time code
1              
2             package Logfile::Read;
3              
4             =head1 NAME
5              
6             Logfile::Read - read log files
7              
8             =head1 SYNOPSIS
9              
10             use Logfile::Read ();
11             my $file = new Logfile::Read('/var/log/messages');
12             while (<$file>) {
13             # process the line
14             }
15              
16             and later in different process
17              
18             my $file = new Logfile::Read('/var/log/messages');
19              
20             and continue reading where we've left out the last time. Also possible
21             is to explicitly save the current position:
22              
23             my $file = new Logfile::Read('/var/log/messages',
24             { autocommit => 0 });
25             my $line = $file->getline();
26             $file->commit();
27              
28             =cut
29              
30 4     4   110154 use strict;
  4         9  
  4         157  
31 4     4   17 use warnings FATAL => 'all';
  4         9  
  4         220  
32              
33             our $VERSION = '0.6';
34              
35 4     4   4066 use Symbol ();
  4         3938  
  4         90  
36 4     4   3398 use IO::File ();
  4         39391  
  4         88  
37 4     4   4383 use Digest::SHA ();
  4         847434  
  4         124  
38 4     4   38 use File::Spec ();
  4         9  
  4         98  
39 4     4   19 use Fcntl qw( O_RDWR O_CREAT );
  4         8  
  4         234  
40 4     4   21 use Cwd ();
  4         6  
  4         16782  
41              
42             sub new {
43 40     40 1 86184 my $class = shift;
44              
45 40         203 my $self = Symbol::gensym();
46 40         679 bless $self, $class;
47 40         479 tie *$self, $self;
48              
49 40 100       172 if (@_) {
50 39 100       170 $self->open(@_) or return;
51             }
52              
53 32         171 return $self;
54             }
55              
56             my $STATUS_SUBDIR = '.logfile-read-status';
57             my $CHECK_LENGTH = 512;
58             sub open {
59 39     39 1 66 my $self = shift;
60              
61 39         67 my $filename = shift;
62 39 100 100     247 if (@_ and ref $_[-1] eq 'HASH') {
63 7         30 *$self->{opts} = pop @_;
64             }
65 39 100       218 if (not exists *$self->{opts}{autocommit}) {
66 35         111 *$self->{opts}{autocommit} = 1;
67             }
68              
69 39         181 my ($archive, $offset, $checksum) = $self->_load_data_from_status($filename);
70 39 100       454 return unless defined $offset;
71              
72 35         88 my $need_commit = *$self->{opts}{autocommit};
73 35 100       92 if (not defined $checksum) {
74 11         13 $need_commit = 1;
75             }
76              
77 35 100       196 my ($fh, $content) = $self->_open(defined $archive ? $filename . $archive : $filename, $offset);
78 35 100 100     529 if (not defined $fh) {
    100          
    100          
79 4 100       16 if (not defined $archive) {
80 2         19 return;
81             }
82 2         11 my ($older_fh, $older_archive, $older_content) = $self->_get_archive($archive, 'older', $offset, $checksum);
83 2 100       13 if (defined $older_fh) {
84 1         2 $fh = $older_fh;
85 1         2 $content = $older_content;
86 1         3 $archive = $older_archive;
87             } else {
88 1         6 return;
89             }
90             } elsif (not defined $checksum) {
91 9         40 $content = $self->_seek_to($fh, 0);
92             } elsif (not defined $content
93             or $checksum ne Digest::SHA::sha256_hex($content)) {
94 9         40 my ($older_fh, $older_archive, $older_content) = $self->_get_archive($archive, 'older', $offset, $checksum);
95 9 100       31 if (defined $older_fh) {
96 5         19 $fh->close();
97 5         73 $fh = $older_fh;
98 5         14 $content = $older_content;
99 5         16 $archive = $older_archive;
100             } else {
101 4         17 $content = $self->_seek_to($fh, 0);
102             }
103             }
104              
105 32         67 my $layers = $_[0];
106 32 100 100     136 if (defined $layers and $layers =~ /<:/) {
107 2         10 $layers =~ s!<:!<:scalar:!;
108             } else {
109 30         49 $layers = '<:scalar';
110             }
111              
112 32         61 my $buffer = '';
113 32         93 *$self->{int_buffer} = \$buffer;
114 32         49 my $int_fh;
115 32     3   58 eval { open $int_fh, $layers, *$self->{int_buffer} };
  32     2   613  
  3         36  
  3         11  
  3         30  
  2         2394  
  2         5  
  2         15  
116 32 100       30247 if ($@) {
117 1         8 warn "$@\n";
118 1         108 return;
119             };
120 31         104 *$self->{int_fh} = $int_fh;
121              
122 31         72 *$self->{_fh} = $fh;
123 31         98 *$self->{data_array} = [ $content ];
124 31         107 *$self->{data_length} = length $content;
125 31         64 *$self->{archive} = $archive;
126              
127 31 100       74 if ($need_commit) {
128 28         102 $self->commit();
129             }
130 31         1432 1;
131             }
132              
133             sub _open {
134 75     75   142 my ($self, $filename, $offset) = @_;
135 75 100       423 my $fh = new IO::File or return;
136 73 100       4669 $fh->open($filename, '<:raw') or return;
137              
138 65 100       3731 if ($offset > 0) {
139 33         109 my $content = $self->_seek_to($fh, $offset);
140 33         98 return ($fh, $content);
141             }
142 32         89 return ($fh, '');
143             }
144              
145             sub _fh {
146 2067     2067   2326 *{$_[0]}->{_fh};
  2067         4234  
147             }
148              
149             sub _seek_to {
150 46     46   97 my ($self, $fh, $offset) = @_;
151              
152 46         79 my $offset_start = $offset - $CHECK_LENGTH;
153 46 100       120 $offset_start = 0 if $offset_start < 0;
154              
155             # no point in checking the return value, seek will
156             # go beyond the end of the file anyway
157 46         222 $fh->seek($offset_start, 0);
158              
159 46         417 my $buffer = '';
160 46         126 while ($offset - $offset_start > 0) {
161 36         177 my $read = $fh->read($buffer, $offset - $offset_start, length($buffer));
162             # $read is not defined for example when we try to read directory
163 36 100 100     806 last if not defined $read or $read <= 0;
164 28         79 $offset_start += $read;
165             }
166 46 100       102 if ($offset_start == $offset) {
167 38         102 return $buffer;
168             } else {
169 8         16 return;
170             }
171             }
172              
173             sub _load_data_from_status {
174 39     39   86 my ($self, $log_filename) = @_;
175 39         2074 my $abs_filename = Cwd::abs_path($log_filename);
176 39 100       129 if (not defined $abs_filename) {
177             # can we access the file at all?
178 1         7 warn "Cannot access file [$log_filename]\n";
179 1         10 return;
180             }
181 38         923 my @abs_stat = stat $abs_filename;
182 38 100 100     1178 if (defined $abs_stat[1] and (stat $log_filename)[1] == $abs_stat[1]) {
183 35         70 $log_filename = $abs_filename;
184             }
185              
186 38         144 *$self->{filename} = $log_filename;
187              
188 38         95 my $status_filename = *$self->{opts}{status_file};
189 38 100       125 if (not defined $status_filename) {
190 35         1789 $status_filename = Digest::SHA::sha256_hex($log_filename);
191             }
192 38         184 my $status_dir = *$self->{opts}{status_dir};
193 38 100       134 if (not defined $status_dir) {
    100          
194 35         73 $status_dir = $STATUS_SUBDIR;
195             } elsif ($status_dir eq '') {
196 1         2 $status_dir = '.';
197             }
198 38 100       501 if (not -d $status_dir) {
199 4         265 mkdir $status_dir, 0775;
200             }
201 38         764 my $status_path = File::Spec->catfile($status_dir, $status_filename);
202 38         358 my $status_fh = new IO::File $status_path, O_RDWR | O_CREAT;
203 38 100       203399 if (not defined $status_fh) {
204 1         23 warn "Error reading/creating status file [$status_path]\n";
205 1         14 return;
206             }
207 37         287 *$self->{status_fh} = $status_fh;
208              
209 37         2072 my $status_line = <$status_fh>;
210 37         93 my ($offset, $checksum, $archive_filename) = (0, undef, undef);
211 37 100       116 if (defined $status_line) {
212 26 100       275 if (not $status_line =~ /^File \[(.+?)\] (?:archive \[(.+)\] )?offset \[(\d+)\] checksum \[([0-9a-z]+)\]\n/) {
213 1         10 warn "Status file [$status_path] has bad format\n";
214 1         9 return;
215             }
216 25         65 my $check_filename = $1;
217 25         54 $archive_filename = $2;
218 25         55 $offset = $3;
219 25         95 $checksum = $4;
220 25 100       83 if ($check_filename ne $log_filename) {
221 1         23 warn "Status file [$status_path] is for file [$check_filename] while expected [$log_filename]\n";
222 1         11 return;
223             }
224             }
225              
226 35         175 return ($archive_filename, $offset, $checksum);
227             }
228              
229             sub _save_offset_to_status {
230 62     62   100 my ($self, $offset) = @_;
231 62         119 my $log_filename = *$self->{filename};
232 62         90 my $status_fh = *$self->{status_fh};
233 62         150 my $checksum = $self->_get_current_checksum();
234 62         771 $status_fh->seek(0, 0);
235 62 100       687 my $archive_text = defined *$self->{archive} ? " archive [@{[ *$self->{archive} ]}]" : '';
  15         73  
236 62         399 $status_fh->printflush("File [$log_filename]$archive_text offset [$offset] checksum [$checksum]\n");
237 62         4279 $status_fh->truncate($status_fh->tell);
238             }
239              
240             sub _push_to_data {
241 2056     2056   2393 my $self = shift;
242 2056         2447 my $chunk = shift;
243 2056 100       8773 if (length($chunk) >= $CHECK_LENGTH) {
244 1         5 *$self->{data_array} = [ substr $chunk, -$CHECK_LENGTH ];
245 1         4 *$self->{data_length} = $CHECK_LENGTH;
246 1         3 return;
247             }
248 2055         3228 my $data = *$self->{data_array};
249 2055         2797 my $data_length = *$self->{data_length};
250 2055         3552 push @$data, $chunk;
251 2055         2276 $data_length += length($chunk);
252 2055         5183 while ($data_length - length($data->[0]) >= $CHECK_LENGTH) {
253 1895         2371 $data_length -= length($data->[0]);
254 1895         4502 shift @$data;
255             }
256 2055         3981 *$self->{data_length} = $data_length;
257             }
258              
259             sub _get_current_checksum {
260 66     66   113 my $self = shift;
261 66         101 my $data_length = *$self->{data_length};
262 66         93 my $data = *$self->{data_array};
263 66         103 my $i = 0;
264 66         558 my $digest = new Digest::SHA('sha256');
265 66 100       1432 if ($data_length > $CHECK_LENGTH) {
266 1         21 $digest->add(substr($data->[0], $data_length - $CHECK_LENGTH));
267 1         2 $i++;
268             }
269 66         189 for (; $i <= $#$data; $i++) {
270 136         569 $digest->add($data->[$i]);
271             }
272 66         666 return $digest->hexdigest();
273             }
274              
275             sub _get_archive {
276 36     36   670 my ($self, $start, $older_newer, $offset, $checksum) = @_;
277 36         111 my @types = ( '-', '.' );
278 36         141 my $start_num;
279 36 100       90 if (defined $start) {
280 27         81 @types = substr($start, 0, 1);
281 27         52 $start_num = substr($start, 1);
282             }
283 36         79 my $filename = *$self->{filename};
284 36         83 for my $t (@types) {
285 43         85 my $srt;
286 43 100       105 if ($t eq '.') {
287 30 100       65 if ($older_newer eq 'newer') {
288 17     98   78 $srt = sub { $_[1] <=> $_[0] };
  98         388  
289             } else {
290 13     34   56 $srt = sub { $_[0] <=> $_[1] };
  34         132  
291             }
292             } else {
293 13 100       27 if ($older_newer eq 'newer') {
294 4     14   18 $srt = sub { $_[0] cmp $_[1] };
  14         53  
295             } else {
296 9     2   56 $srt = sub { $_[1] cmp $_[0] };
  2         8  
297             }
298             }
299 50         128 my @archives = map { "$t$_" } # make it a suffix
  36         65  
300 125 100       767 sort { $srt->($a, $b) } # sort properly
301 125         413 grep { not defined $start_num or $srt->($_, $start_num) == 1} # only newer / older
302 125         292 grep { /^[0-9]+$/ } # only numerical suffixes
303 43         6881 map { substr($_, length($filename) + 1) } # only get the numerical suffixes
304             glob "$filename$t*"; # we look at file.1, file.2 or file-20091231, ...
305 43 100 100     700 if ($older_newer eq 'newer' and -f $filename) {
306 17         32 push @archives, '';
307             }
308 43         125 for my $a (@archives) {
309 40   100     349 my ($fh, $content) = $self->_open($filename . $a, ($offset || 0));
310 40 100       601 if (not defined $fh) {
311 6         66 next;
312             }
313 34 100       64 if (defined $checksum) {
314 14 100 100     133 if (defined $content
315             and $checksum eq Digest::SHA::sha256_hex($content)) {
316 9         68 return ($fh, $a, $content);
317             }
318             } else {
319 20 100       149 return ($fh, ($a eq '' ? undef : $a), $content);
320             }
321 5         19 $fh->close();
322             }
323             }
324 7         23 return;
325             }
326              
327             sub _close_status {
328 31     31   53 my ($self, $offset) = @_;
329 31         92 my $status_fh = delete *$self->{status_fh};
330 31 100       170 $status_fh->close() if defined $status_fh;
331             }
332              
333             sub _getline {
334 2067     2067   2516 my $self = shift;
335 2067         3621 my $fh = $self->_fh;
336 2067 100       4693 if (defined $fh) {
337 2066         2775 my $buffer_ref = *$self->{int_buffer};
338 2089         2617 DO_GETLINE:
339             my $ret = undef;
340 2089         43775 $$buffer_ref = $fh->getline();
341 2089 100       55457 if (not defined $$buffer_ref) {
342             # we are at the end of the current file
343             # we need to check if the file was rotated
344             # in the meantime
345 33         438 my @fh_stat = stat($fh);
346 33         81 my $filename = *$self->{filename};
347 33 100       1025 my @file_stat = stat($filename . ( defined *$self->{archive} ? *$self->{archive} : '' ));
348 33 100 100     399 if (not @file_stat or "@fh_stat[0, 1]" ne "@file_stat[0, 1]") {
    100          
349             # our file was rotated, or generally
350             # is no longer where it was when
351             # we started to read
352 4         26 my ($older_fh, $older_archive, $older_content)
353             = $self->_get_archive(*$self->{archive}, 'older', $fh->tell, $self->_get_current_checksum);
354 4 100       13 if (not defined $older_fh) {
355             # we have lost the file / sync
356 1         4 return;
357             }
358 3         21 *$self->{_fh}->close();
359 3         46 *$self->{_fh} = $fh = $older_fh;
360 3         13 *$self->{data_array} = [ $older_content ];
361 3         9 *$self->{data_length} = length $older_content;
362 3         6 *$self->{archive} = $older_archive;
363 3         52 goto DO_GETLINE;
364             } elsif (defined *$self->{archive}) {
365             # our file was not rotated
366             # however, if our file is in fact
367             # a rotate file, we should go to the
368             # next one
369 21         104 my ($newer_fh, $newer_archive) = $self->_get_archive(*$self->{archive}, 'newer');
370 21 100       191 if (not defined $newer_fh) {
371 1         5 return;
372             }
373 20         330 *$self->{_fh}->close();
374 20         291 *$self->{_fh} = $fh = $newer_fh;
375 20         505 *$self->{data_array} = [ '' ];
376 20         44 *$self->{data_length} = 0;
377 20         30 *$self->{archive} = $newer_archive;
378 20         493 goto DO_GETLINE;
379             }
380 8         35 return;
381             }
382 2056         4253 $self->_push_to_data($$buffer_ref);
383 2056         3418 seek(*$self->{int_fh}, 0, 0);
384 2056         43666 my $line = *$self->{int_fh}->getline();
385 2056         53463 return $line;
386             } else {
387 1         3 return undef;
388             }
389             }
390              
391             sub getline {
392 43     43 1 1530 my $self = shift;
393 43         118 my $ret = $self->_getline();
394 4     4   31 no warnings 'uninitialized';
  4         8  
  4         525  
395 43 100       160 if (*$self->{opts}{autocommit} == 2) {
396 1         5 $self->commit();
397             }
398 43         223 return $ret;
399             }
400              
401             sub getlines {
402 7     7 1 19 my $self = shift;
403 7         13 my @out;
404 7         12 while (1) {
405 2024         4257 my $l = $self->_getline();
406 2024 100       4415 if (not defined $l) {
407 7         24 last;
408             }
409 2017         3418 push @out, $l;
410             }
411 4     4   69 no warnings 'uninitialized';
  4         7  
  4         1485  
412 7 100       29 if (*$self->{opts}{autocommit} == 2) {
413 1         3 $self->commit();
414             }
415 7         435 @out;
416             }
417              
418             sub commit {
419 62     62 1 97 my $self = shift;
420 62         108 my $fh = *$self->{_fh};
421 62         247 my $offset = $fh->tell;
422 62         445 $self->_save_offset_to_status($offset);
423             }
424              
425             sub close {
426 31     31 1 2646 my $self = shift;
427 31 100       151 if (*$self->{opts}{autocommit}) {
428 27         66 $self->commit();
429             }
430 31         1193 $self->_close_status();
431 31         576 my $fh = delete *$self->{_fh};
432 31 100       147 $fh->close() if defined $fh;
433             }
434              
435             sub TIEHANDLE() {
436 41 100   41   1245 if (ref $_[0]) {
437             # if we already have object, probably called from new(),
438             # just return that
439 40         135 return $_[0];
440             } else {
441 1         2 my $class = shift;
442 1         5 return $class->new(@_);
443             }
444             }
445              
446             sub READLINE() {
447 29 100   29   18137 goto &getlines if wantarray;
448 24         117 goto &getline;
449             }
450              
451             sub CLOSE() {
452 1     1   3 my $self = shift;
453 1         5 $self->close();
454             }
455              
456             sub DESTROY() {
457 39     39   3908 my $self = shift;
458 39 100       723 $self->close() if defined *$self->{_fh};
459             }
460              
461             1;
462              
463             =head1 DESCRIPTION
464              
465             Log files are files that are generated by various running programs.
466             They are generally only appended to. When parsing information from
467             log files, it is important to only read each record / line once,
468             both for performance and for accounting and statistics reasons.
469              
470             The C provides an easy way to achieve the
471             read-just-once processing of log files.
472              
473             The module remembers for each file the position where it left
474             out the last time, in external status file, and upon next invocation
475             it seeks to the remembered position. It also stores checksum
476             of 512 bytes before that position, and if the checksum does not
477             match the file content the next time it is read, it will try to
478             find the rotated file and read the end of it before advancing to
479             newer rotated file or to the current log file.
480              
481             Both .num and -date suffixed rotated files are supported.
482              
483             =head1 METHODS
484              
485             =over 4
486              
487             =item new()
488              
489             =item new( FILENAME [,MODE [,PERMS]], [ { attributes } ] )
490              
491             =item new( FILENAME, IOLAYERS, [ { attributes } ] )
492              
493             Constructor, creates new C object. Like C,
494             it passes any parameters to method C; it actually creates
495             an C handle internally.
496              
497             Returns new object, or undef upon error.
498              
499             =item open( FILENAME [,MODE [,PERMS]], [ { attributes } ] )
500              
501             =item open( FILENAME, IOLAYERS, [ { attributes } ] )
502              
503             Opens the file using C. If the file was read before, the
504             offset where the reading left out the last time is read from an
505             external file in the ./.logfile-read-status directory and seek is
506             made to that offset, to continue reading at the last remembered
507             position.
508              
509             If however checksum, which is also stored with the offset, does not
510             match the current content of the file (512 bytes before the offset
511             are checked), the module assumes that the file was rotated / reused
512             / truncated in the mean time since the last read. It will try to
513             find the checksum among the rotated files. If no match is found,
514             it will reset the offset to zero and start from the beginning of
515             the file.
516              
517             Returns true, or undef upon error.
518              
519             The attributes are passed as an optional hashref of key => value
520             pairs. The supported attribute is
521              
522             =over 4
523              
524             =item autocommit
525              
526             Value 0 means that no saving takes place; you need to save explicitly
527             using the commit() method.
528              
529             Value 1 (the default) means that position is saved when the object is
530             closed via explicit close() call, or when it is destroyed. The value
531             is also saved upon the first open.
532              
533             Value 2 causes the position to be save in all cases as value 1,
534             plus after each successful read.
535              
536             =item status_dir
537              
538             The attribute specifies the directory (or subdirectory of current
539             directory) which is used to hold status files. By default,
540             ./.logfile-read-status directory is used. To store the status
541             files in the current directory, pass empty string or dot (.).
542              
543             =item status_file
544              
545             The attribute specifies the name of the status file which is used to
546             hold the offset and SHA256 checksum of 512 bytes before the offset.
547             By default, SHA256 of the full (absolute) logfile filename is used
548             as the status file name.
549              
550             =back
551              
552             =item commit()
553              
554             Explicitly save the current position and checksum in the status file.
555              
556             Returns true, or undef upon error.
557              
558             =item close()
559              
560             Closes the internal filehandle. It stores the current position
561             and checksum in an external file in the ./.logfile-read-status
562             directory.
563              
564             Returns true, or undef upon error.
565              
566             =item getline()
567              
568             Line <$fh> in scalar context.
569              
570             =item getlines()
571              
572             Line <$fh> in list context.
573              
574             =back
575              
576             =head1 AUTHOR AND LICENSE
577              
578             Copyright (c) 2010 Jan Pazdziora.
579              
580             Logfile::Read is free software. You can redistribute it and/or modify
581             it under the terms of either:
582              
583             a) the GNU General Public License, version 2 or 3;
584              
585             b) the Artistic License, either the original or version 2.0.
586