File Coverage

blib/lib/App/RecordStream/Operation/normalizetime.pm
Criterion Covered Total %
statement 54 63 85.7
branch 8 14 57.1
condition 6 6 100.0
subroutine 7 9 77.7
pod 0 5 0.0
total 75 97 77.3


line stmt bran cond sub pod time code
1             package App::RecordStream::Operation::normalizetime;
2              
3             # We need to set DM5 for the backend with modern DateManip.
4             # TODO: use backend 6
5             BEGIN {
6 2     2   884 $Date::Manip::Backend = 'DM5';
7             }
8              
9             our $VERSION = "4.0.25";
10              
11 2     2   9 use strict;
  2         3  
  2         31  
12              
13 2     2   17 use base qw(App::RecordStream::Operation);
  2         2  
  2         145  
14              
15 2     2   681 use Date::Manip qw (ParseDate UnixDate ParseDateDelta Delta_Format);
  2         86617  
  2         975  
16              
17             sub init {
18 3     3 0 6 my $this = shift;
19 3         8 my $args = shift;
20              
21 3         10 my $key;
22             my $threshold;
23 3         0 my $strict;
24 3         0 my $epoch;
25              
26 3         18 my $spec = {
27             "key|k=s" => \$key,
28             "strict|s" => \$strict,
29             "epoch|e" => \$epoch,
30             "threshold|n=s" => \$threshold,
31             };
32              
33 3         13 $this->parse_options($args, $spec);
34              
35 3 50       8 die('Must specify --key') unless ( $key );
36 3 50       10 die('Must specify --threshold') unless ( $threshold );
37              
38             # if threshold is not a number, assume its a parsable string
39 3 50       20 if ( not ($threshold =~ m/^[0-9.]+$/) )
40             {
41 0         0 my $delta = ParseDateDelta($threshold);
42 0         0 $threshold = Delta_Format($delta, 0, '%st');
43              
44 0 0       0 unless ( $threshold =~ m/^[0-9.]+$/ ) {
45 0         0 die "Threshold passed isn't a number or parsable, "
46             . "see perldoc Date::Manip for parseable formats\n";
47             }
48             }
49              
50 3         5 my $sanitized_key = $key;
51 3         9 $sanitized_key =~ s!/!_!;
52              
53 3         7 $this->{'KEY'} = $key;
54 3         6 $this->{'SANITIZED_KEY'} = $sanitized_key;
55 3         6 $this->{'STRICT'} = $strict;
56 3         5 $this->{'EPOCH'} = $epoch;
57 3         24 $this->{'THRESHOLD'} = $threshold;
58             }
59              
60              
61             sub accept_record {
62 17     17 0 18 my $this = shift;
63 17         18 my $record = shift;
64              
65 17         22 my $key = $this->{'KEY'};
66 17         20 my $threshold = $this->{'THRESHOLD'};
67 17         21 my $strict = $this->{'STRICT'};
68 17         21 my $sanitized_key = $this->{'SANITIZED_KEY'};
69 17         22 my $prior_normalized_value = $this->{'PRIOR_NORMALIZED_VALUE'};
70              
71 17         17 my $value = ${$record->guess_key_from_spec($key)};
  17         44  
72              
73 17         21 my $time = $value;
74 17 100       33 if ( ! $this->{'EPOCH'} ) {
75 12         26 $time = UnixDate( ParseDate( $value ), "%s" );
76 12 50       33517 die "I can't understand Key: $key, with value: $value" unless $time;
77             }
78              
79 17         48 my $normalized_time_cur_period = int( $time / $threshold ) * $threshold;
80 17         30 my $normalized_time_prior_period = $normalized_time_cur_period - $threshold;
81              
82 17         19 my $normalized_time;
83 17 100 100     70 if( !$strict && defined( $prior_normalized_value ) && $prior_normalized_value == $normalized_time_prior_period ) {
      100        
84 3         3 $normalized_time = $prior_normalized_value;
85             } else {
86 14         17 $normalized_time = $normalized_time_cur_period;
87 14         26 $prior_normalized_value = $normalized_time_cur_period;
88 14         34 $this->{'PRIOR_NORMALIZED_VALUE'} = $normalized_time_cur_period;
89             }
90              
91 17         33 $record->{"n_$sanitized_key"} = $normalized_time;
92 17         89 $this->push_record($record);
93              
94 17         63 return 1;
95             }
96              
97             sub add_help_types {
98 3     3 0 5 my $this = shift;
99              
100 3         22 $this->use_help_type('keyspecs');
101 3         11 $this->add_help_type(
102             'full',
103             \&full_help,
104             'Indepth description of normalization alogrithm'
105             );
106             }
107              
108             sub full_help {
109 0     0 0   print <<FULL_HELP;
110             Full Help
111              
112             This recs processor will generate normalized versions of date/time values and
113             add this value as another attribute to the record stream. Used in conjunction
114             with recs-collate you can aggregate information over the normalized time. For
115             example if you use
116             recs-normalized -k date --n 1 | recs-collate -k n_date -a firstrec
117             then this picks a single record from a stream to serve in placement of lots of
118             records which are close to each other in time.
119              
120             The normalized time value generated depends on whether or not you are using
121             strict normalization or not. The default is to use non-strict.
122              
123             The use of the optional --epoch argument indicates that the date/time values
124             are expressed in epoch seconds. This argument both speeds up the execution of
125             an invocation (due to avoiding the expensive perl Date:Manip executions) and is
126             required for correctness when the values are epoch seconds.
127              
128             1. When using strict normalization then time is chunked up into fixed segments
129             of --threshold seconds in each segment with the first segment occurring on
130             January 1st 1970 at 0:00. So if the threshold is 60 seconds then the following
131             record stream would be produced
132              
133             date n_date
134             1:00:00 1:00:00
135             1:00:14 1:00:00
136             1:00:59 1:00:00
137             1:02:05 1:02:00
138             1:02:55 1:02:00
139             1:03:15 1:03:00
140              
141              
142             2. When not using strict normalization then the time is again chunked up into
143             fixed segments however the actual segment assigned to a value depends on the
144             segement chunk seen in the prior record.
145              
146             The logic used is the following:
147             - a time is distilled down to a representative sample where the precision is
148             defined by the --threshold. For example if you said that the threshold is
149             10 (seconds) then 10:22:01 and 10:22:09 would both become 10:22:00. 10:22:10
150             would be 10:22:10.
151             - as you can tell the representative values is the first second within the range
152             that you define, with one exception
153             - if the representative value of the prior record is in the prior range to the
154             current representative value then the prior record value will be used
155              
156             So if the threshold is 60 seconds then the following record stream would be produced
157              
158             date n_date
159             1:00:00 1:00:00
160             1:00:59 1:00:00
161             1:02:05 1:02:00
162             1:02:55 1:02:00
163             1:03:15 1:02:00 ** Note - still matches prior representative value **
164             1:05:59 1:05:00
165             1:06:15 1:05:00 ** Note - matches prior entry **
166             1:07:01 1:07:00 ** Note - since the 1:05 and 1:06 had the same representative **
167             ** value then this is considered a new representative time slice **
168              
169             Basically a 60 second threshold will match the current minute and the next minute unless
170             the prior minute was seen and then the 60 second threshold matches the current minute and
171             the prior minute.
172              
173              
174             Example usage: if you have log records for "out of memory" exceptions which may occur multiple
175             times because of exception catching and logging then you can distill them all down to a
176             single logical event and then count the number of occurrences for a host via:
177              
178             grep "OutOfMemory" logs |\
179             recs-frommultire --re 'host=@([^:]*):' --re 'date=^[A-Za-z]* (.*) GMT ' |\
180             recs-normalizetime --key date --threshold 300 | \
181             recs-collate --perfect --key n_date -a firstrec | \
182             recs-collate --perfect --key firstrec_host -a count=count
183              
184             FULL_HELP
185             }
186              
187             sub usage {
188 0     0 0   my $this = shift;
189              
190 0           my $options = [
191             ['key|-k <key>', 'Single Key field containing the date/time may be a key spec, see \'--help-keyspecs\' for more info'],
192             ['epoch|-e', 'Assumes date/time field is expressed in epoch seconds (optional, defaults to non-epoch)'],
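193             ['threshold|-n <time range>', 'Number of seconds in the range.  May also be a duration string like \'1 week\' or \'5 minutes\', parsable by Date::Manip'],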
194             ['strict|-s', 'Apply strict normalization (defaults to non-strict)'],
195             ];
196              
197 0           my $args_string = $this->options_string($options);
198              
199 0           return <<USAGE;
200             Usage: recs-normalizetime <args> [<files>]
201             __FORMAT_TEXT__
202             Given a single key field containing a date/time value this recs processor
203             will construct a normalized version of the value and place this new value
204             into a field named "n_<key>" (where <key> is the key field appearing in
205             the args).
206             __FORMAT_TEXT__
207              
208             Arguments:
209             $args_string
210              
211             Examples:
212             # Tag records with normalized time in 5 minute buckets from the date field
213             ... | recs-normalizetime --strict --key date -n 300
214              
215             # Normalize time with fuzzy normalization into 1 minute buckets from the
216             # epoch-relative 'time' field
217             ... | recs-normalizetime --key time -e -n 60
218              
219             #Get 1 week buckets
220             ... | recs-normalizetime --key timestamp -n '1 week'
221             USAGE
222             }
223              
224             1;