| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package App::RecordStream::Operation::normalizetime; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | # We need to set DM5 for the backend with modern DateManip. | 
| 4 |  |  |  |  |  |  | # TODO: use backend 6 | 
| 5 |  |  |  |  |  |  | BEGIN { | 
| 6 | 2 |  |  | 2 |  | 884 | $Date::Manip::Backend = 'DM5'; | 
| 7 |  |  |  |  |  |  | } | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | our $VERSION = "4.0.25"; | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 2 |  |  | 2 |  | 9 | use strict; | 
|  | 2 |  |  |  |  | 3 |  | 
|  | 2 |  |  |  |  | 31 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 | 2 |  |  | 2 |  | 17 | use base qw(App::RecordStream::Operation); | 
|  | 2 |  |  |  |  | 2 |  | 
|  | 2 |  |  |  |  | 145 |  | 
| 14 |  |  |  |  |  |  |  | 
| 15 | 2 |  |  | 2 |  | 681 | use Date::Manip qw (ParseDate UnixDate ParseDateDelta Delta_Format); | 
|  | 2 |  |  |  |  | 86617 |  | 
|  | 2 |  |  |  |  | 975 |  | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | sub init { | 
| 18 | 3 |  |  | 3 | 0 | 6 | my $this = shift; | 
| 19 | 3 |  |  |  |  | 8 | my $args = shift; | 
| 20 |  |  |  |  |  |  |  | 
| 21 | 3 |  |  |  |  | 10 | my $key; | 
| 22 |  |  |  |  |  |  | my $threshold; | 
| 23 | 3 |  |  |  |  | 0 | my $strict; | 
| 24 | 3 |  |  |  |  | 0 | my $epoch; | 
| 25 |  |  |  |  |  |  |  | 
| 26 | 3 |  |  |  |  | 18 | my $spec = { | 
| 27 |  |  |  |  |  |  | "key|k=s"       => \$key, | 
| 28 |  |  |  |  |  |  | "strict|s"      => \$strict, | 
| 29 |  |  |  |  |  |  | "epoch|e"       => \$epoch, | 
| 30 |  |  |  |  |  |  | "threshold|n=s" => \$threshold, | 
| 31 |  |  |  |  |  |  | }; | 
| 32 |  |  |  |  |  |  |  | 
| 33 | 3 |  |  |  |  | 13 | $this->parse_options($args, $spec); | 
| 34 |  |  |  |  |  |  |  | 
| 35 | 3 | 50 |  |  |  | 8 | die('Must specify --key') unless ( $key ); | 
| 36 | 3 | 50 |  |  |  | 10 | die('Must specify --threshold') unless ( $threshold ); | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | # if threshold is not a number, assume it's a parsable string | 
| 39 | 3 | 50 |  |  |  | 20 | if ( not ($threshold =~ m/^[0-9.]+$/) ) | 
| 40 |  |  |  |  |  |  | { | 
| 41 | 0 |  |  |  |  | 0 | my $delta = ParseDateDelta($threshold); | 
| 42 | 0 |  |  |  |  | 0 | $threshold = Delta_Format($delta, 0, '%st'); | 
| 43 |  |  |  |  |  |  |  | 
| 44 | 0 | 0 |  |  |  | 0 | unless ( $threshold =~ m/^[0-9.]+$/ ) { | 
| 45 | 0 |  |  |  |  | 0 | die "Threshold passed isn't a number or parsable, " | 
| 46 |  |  |  |  |  |  | . "see perldoc Date::Manip for parseable formats\n"; | 
| 47 |  |  |  |  |  |  | } | 
| 48 |  |  |  |  |  |  | } | 
| 49 |  |  |  |  |  |  |  | 
| 50 | 3 |  |  |  |  | 5 | my $sanitized_key = $key; | 
| 51 | 3 |  |  |  |  | 9 | $sanitized_key =~ s!/!_!; | 
| 52 |  |  |  |  |  |  |  | 
| 53 | 3 |  |  |  |  | 7 | $this->{'KEY'}           = $key; | 
| 54 | 3 |  |  |  |  | 6 | $this->{'SANITIZED_KEY'} = $sanitized_key; | 
| 55 | 3 |  |  |  |  | 6 | $this->{'STRICT'}        = $strict; | 
| 56 | 3 |  |  |  |  | 5 | $this->{'EPOCH'}         = $epoch; | 
| 57 | 3 |  |  |  |  | 24 | $this->{'THRESHOLD'}     = $threshold; | 
| 58 |  |  |  |  |  |  | } | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | sub accept_record { | 
| 62 | 17 |  |  | 17 | 0 | 18 | my $this   = shift; | 
| 63 | 17 |  |  |  |  | 18 | my $record = shift; | 
| 64 |  |  |  |  |  |  |  | 
| 65 | 17 |  |  |  |  | 22 | my $key                    = $this->{'KEY'}; | 
| 66 | 17 |  |  |  |  | 20 | my $threshold              = $this->{'THRESHOLD'}; | 
| 67 | 17 |  |  |  |  | 21 | my $strict                 = $this->{'STRICT'}; | 
| 68 | 17 |  |  |  |  | 21 | my $sanitized_key          = $this->{'SANITIZED_KEY'}; | 
| 69 | 17 |  |  |  |  | 22 | my $prior_normalized_value = $this->{'PRIOR_NORMALIZED_VALUE'}; | 
| 70 |  |  |  |  |  |  |  | 
| 71 | 17 |  |  |  |  | 17 | my $value = ${$record->guess_key_from_spec($key)}; | 
|  | 17 |  |  |  |  | 44 |  | 
| 72 |  |  |  |  |  |  |  | 
| 73 | 17 |  |  |  |  | 21 | my $time = $value; | 
| 74 | 17 | 100 |  |  |  | 33 | if ( ! $this->{'EPOCH'} ) { | 
| 75 | 12 |  |  |  |  | 26 | $time = UnixDate( ParseDate( $value ), "%s" ); | 
| 76 | 12 | 50 |  |  |  | 33517 | die "I can't understand Key: $key, with value: $value" unless $time; | 
| 77 |  |  |  |  |  |  | } | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 17 |  |  |  |  | 48 | my $normalized_time_cur_period = int( $time / $threshold ) * $threshold; | 
| 80 | 17 |  |  |  |  | 30 | my $normalized_time_prior_period = $normalized_time_cur_period - $threshold; | 
| 81 |  |  |  |  |  |  |  | 
| 82 | 17 |  |  |  |  | 19 | my $normalized_time; | 
| 83 | 17 | 100 | 100 |  |  | 70 | if( !$strict && defined( $prior_normalized_value ) && $prior_normalized_value == $normalized_time_prior_period ) { | 
|  |  |  | 100 |  |  |  |  | 
| 84 | 3 |  |  |  |  | 3 | $normalized_time = $prior_normalized_value; | 
| 85 |  |  |  |  |  |  | } else { | 
| 86 | 14 |  |  |  |  | 17 | $normalized_time = $normalized_time_cur_period; | 
| 87 | 14 |  |  |  |  | 26 | $prior_normalized_value = $normalized_time_cur_period; | 
| 88 | 14 |  |  |  |  | 34 | $this->{'PRIOR_NORMALIZED_VALUE'} = $normalized_time_cur_period; | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  |  | 
| 91 | 17 |  |  |  |  | 33 | $record->{"n_$sanitized_key"} = $normalized_time; | 
| 92 | 17 |  |  |  |  | 89 | $this->push_record($record); | 
| 93 |  |  |  |  |  |  |  | 
| 94 | 17 |  |  |  |  | 63 | return 1; | 
| 95 |  |  |  |  |  |  | } | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | sub add_help_types { | 
| 98 | 3 |  |  | 3 | 0 | 5 | my $this = shift; | 
| 99 |  |  |  |  |  |  |  | 
| 100 | 3 |  |  |  |  | 22 | $this->use_help_type('keyspecs'); | 
| 101 | 3 |  |  |  |  | 11 | $this->add_help_type( | 
| 102 |  |  |  |  |  |  | 'full', | 
| 103 |  |  |  |  |  |  | \&full_help, | 
| 104 |  |  |  |  |  |  | 'In-depth description of normalization algorithm' | 
| 105 |  |  |  |  |  |  | ); | 
| 106 |  |  |  |  |  |  | } | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | sub full_help { | 
| 109 | 0 |  |  | 0 | 0 |  | print <<FULL_HELP; | 
| 110 |  |  |  |  |  |  | Full Help | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | This recs processor will generate normalized versions of date/time values and | 
| 113 |  |  |  |  |  |  | add this value as another attribute to the record stream.  Used in conjunction | 
| 114 |  |  |  |  |  |  | with recs-collate you can aggregate information over the normalized time.  For | 
| 115 |  |  |  |  |  |  | example if you use | 
| 116 |  |  |  |  |  |  | recs-normalizetime -k date -n 1 | recs-collate -k n_date -a firstrec | 
| 117 |  |  |  |  |  |  | then this picks a single record from a stream to serve in place of lots of | 
| 118 |  |  |  |  |  |  | records which are close to each other in time. | 
| 119 |  |  |  |  |  |  |  | 
| 120 |  |  |  |  |  |  | The normalized time value generated depends on whether or not you are using | 
| 121 |  |  |  |  |  |  | strict normalization.  The default is to use non-strict. | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | The use of the optional --epoch argument indicates that the date/time values | 
| 124 |  |  |  |  |  |  | are expressed in epoch seconds.  This argument both speeds up the execution of | 
| 125 |  |  |  |  |  |  | an invocation (due to avoiding the expensive perl Date::Manip executions) and is | 
| 126 |  |  |  |  |  |  | required for correctness when the values are epoch seconds. | 
| 127 |  |  |  |  |  |  |  | 
| 128 |  |  |  |  |  |  | 1.  When using strict normalization then time is chunked up into fixed segments | 
| 129 |  |  |  |  |  |  | of --threshold seconds in each segment with the first segment occurring on | 
| 130 |  |  |  |  |  |  | January 1st 1970 at 0:00.  So if the threshold is 60 seconds then the following | 
| 131 |  |  |  |  |  |  | record stream would be produced | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | date      n_date | 
| 134 |  |  |  |  |  |  | 1:00:00   1:00:00 | 
| 135 |  |  |  |  |  |  | 1:00:14   1:00:00 | 
| 136 |  |  |  |  |  |  | 1:00:59   1:00:00 | 
| 137 |  |  |  |  |  |  | 1:02:05   1:02:00 | 
| 138 |  |  |  |  |  |  | 1:02:55   1:02:00 | 
| 139 |  |  |  |  |  |  | 1:03:15   1:03:00 | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | 2.  When not using strict normalization then the time is again chunked up into | 
| 143 |  |  |  |  |  |  | fixed segments however the actual segment assigned to a value depends on the | 
| 144 |  |  |  |  |  |  | segment chunk seen in the prior record. | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | The logic used is the following: | 
| 147 |  |  |  |  |  |  | - a time is distilled down to a representative sample where the precision is | 
| 148 |  |  |  |  |  |  | defined by the --threshold.  For example if you said that the threshold is | 
| 149 |  |  |  |  |  |  | 10 (seconds) then 10:22:01 and 10:22:09 would both become 10:22:00.  10:22:10 | 
| 150 |  |  |  |  |  |  | would be 10:22:10. | 
| 151 |  |  |  |  |  |  | - as you can tell the representative value is the first second within the range | 
| 152 |  |  |  |  |  |  | that you define, with one exception | 
| 153 |  |  |  |  |  |  | - if the representative value of the prior record is in the prior range to the | 
| 154 |  |  |  |  |  |  | current representative value then the prior record value will be used | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | So if the threshold is 60 seconds then the following record stream would be produced | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | date      n_date | 
| 159 |  |  |  |  |  |  | 1:00:00   1:00:00 | 
| 160 |  |  |  |  |  |  | 1:00:59   1:00:00 | 
| 161 |  |  |  |  |  |  | 1:02:05   1:02:00 | 
| 162 |  |  |  |  |  |  | 1:02:55   1:02:00 | 
| 163 |  |  |  |  |  |  | 1:03:15   1:02:00     ** Note - still matches prior representative value               ** | 
| 164 |  |  |  |  |  |  | 1:05:59   1:05:00 | 
| 165 |  |  |  |  |  |  | 1:06:15   1:05:00     ** Note - matches prior entry                                    ** | 
| 166 |  |  |  |  |  |  | 1:07:01   1:07:00     ** Note - since the 1:05 and 1:06 had the same representative    ** | 
| 167 |  |  |  |  |  |  | ** value then this is considered a new representative time slice ** | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | Basically a 60 second threshold will match the current minute and the next minute unless | 
| 170 |  |  |  |  |  |  | the prior minute was seen and then the 60 second threshold matches the current minute and | 
| 171 |  |  |  |  |  |  | the prior minute. | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | Example usage: if you have log records for "out of memory" exceptions which may occur multiple | 
| 175 |  |  |  |  |  |  | times because of exception catching and logging then you can distill them all down to a | 
| 176 |  |  |  |  |  |  | single logical event and then count the number of occurrences for a host via: | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | grep "OutOfMemory" logs |\ | 
| 179 |  |  |  |  |  |  | recs-frommultire --re 'host=@([^:]*):' --re 'date=^[A-Za-z]* (.*) GMT ' |\ | 
| 180 |  |  |  |  |  |  | recs-normalizetime --key date --threshold 300 | \ | 
| 181 |  |  |  |  |  |  | recs-collate --perfect --key n_date -a firstrec | \ | 
| 182 |  |  |  |  |  |  | recs-collate --perfect --key firstrec_host -a count=count | 
| 183 |  |  |  |  |  |  |  | 
| 184 |  |  |  |  |  |  | FULL_HELP | 
| 185 |  |  |  |  |  |  | } | 
| 186 |  |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  | sub usage { | 
| 188 | 0 |  |  | 0 | 0 |  | my $this = shift; | 
| 189 |  |  |  |  |  |  |  | 
| 190 | 0 |  |  |  |  |  | my $options = [ | 
| 191 |  |  |  |  |  |  | ['key|-k <key>', 'Single Key field containing the date/time may be a key spec, see \'--help-keyspecs\' for more info'], | 
| 192 |  |  |  |  |  |  | ['epoch|-e', 'Assumes date/time field is expressed in epoch seconds (optional, defaults to non-epoch)'], | 
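| 193 |  |  |  |  |  |  | ['threshold|-n <time range>', 'Number of seconds in the range.  May also be a duration string like \'1 week\' or \'5 minutes\', parsable by Date::Manip'], | 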
| 194 |  |  |  |  |  |  | ['strict|-s', 'Apply strict normalization (defaults to non-strict)'], | 
| 195 |  |  |  |  |  |  | ]; | 
| 196 |  |  |  |  |  |  |  | 
| 197 | 0 |  |  |  |  |  | my $args_string = $this->options_string($options); | 
| 198 |  |  |  |  |  |  |  | 
| 199 | 0 |  |  |  |  |  | return <<USAGE; | 
| 200 |  |  |  |  |  |  | Usage: recs-normalizetime <args> [<files>] | 
| 201 |  |  |  |  |  |  | __FORMAT_TEXT__ | 
| 202 |  |  |  |  |  |  | Given a single key field containing a date/time value this recs processor | 
| 203 |  |  |  |  |  |  | will construct a normalized version of the value and place this new value | 
| 204 |  |  |  |  |  |  | into a field named "n_<key>" (where <key> is the key field appearing in | 
| 205 |  |  |  |  |  |  | the args). | 
| 206 |  |  |  |  |  |  | __FORMAT_TEXT__ | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | Arguments: | 
| 209 |  |  |  |  |  |  | $args_string | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | Examples: | 
| 212 |  |  |  |  |  |  | # Tag records with normalized time in 5 minute buckets from the date field | 
| 213 |  |  |  |  |  |  | ... | recs-normalizetime --strict --key date -n 300 | 
| 214 |  |  |  |  |  |  |  | 
| 215 |  |  |  |  |  |  | # Normalize time with fuzzy normalization into 1 minute buckets from the | 
| 216 |  |  |  |  |  |  | # epoch-relative 'time' field | 
| 217 |  |  |  |  |  |  | ... | recs-normalizetime --key time -e -n 60 | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | # Get 1 week buckets | 
| 220 |  |  |  |  |  |  | ... | recs-normalizetime --key timestamp -n '1 week' | 
| 221 |  |  |  |  |  |  | USAGE | 
| 222 |  |  |  |  |  |  | } | 
| 223 |  |  |  |  |  |  |  | 
| 224 |  |  |  |  |  |  | 1; |