File Coverage

blib/lib/Fsdb/Filter/dbcolmovingstats.pm
Criterion Covered Total %
statement 21 107 19.6
branch 0 38 0.0
condition 0 12 0.0
subroutine 7 16 43.7
pod 5 5 100.0
total 33 178 18.5


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             #
4             # dbcolmovingstats.pm
5             # Copyright (C) 1991-2015 by John Heidemann
6             # $Id: 356dd222b0dd3e651903369cd0cfd06ae9ca2a54 $
7             #
8             # This program is distributed under terms of the GNU general
9             # public license, version 2. See the file COPYING
10             # in $dblibdir for details.
11             #
12              
13             package Fsdb::Filter::dbcolmovingstats;
14              
15             =head1 NAME
16              
17             dbcolmovingstats - compute moving statistics over a window of a column of data
18              
19             =head1 SYNOPSIS
20              
21             dbcolmovingstats [-am] [-w window-width] [-e EmptyValue] column
22              
23             =head1 DESCRIPTION
24              
25             Compute moving statistics over a COLUMN of data.
26             Records containing non-numeric data are considered null
27             do not contribute to the stats (optionally they are treated as zeros
28             with C<-a>).
29              
30             Currently we compute mean and sample standard deviation.
31             (Note we only compute sample standard deviation,
32             not full population.)
33             Optionally, with C<-m> we also compute median.
34             (Currently there is no support for generalized quantiles.)
35              
36             Values before a sufficient number have been accumulated are given the
37             empty value (if specified with C<-e>).
38             If no empty value is given, stats are computed on as many are possible if no empty
39             value is specified.
40              
41             Dbcolmovingstats runs in O(1) memory, but must buffer a full window of data.
42             Quantiles currently will repeatedly sort the window and so may perform
43             poorly with wide windows.
44              
45              
46             =head1 OPTIONS
47              
48             =over 4
49              
50             =item B<-a> or B<--include-non-numeric>
51              
52             Compute stats over all records (treat non-numeric records
53             as zero rather than just ignoring them).
54              
55             =item B<-w> or B<--window> WINDOW
56              
57             WINDOW of how many items to accumulate (defaults to 10).
58             (For compatibility with fsdb-1.x, B<-n> is also supported.)
59              
60             =item B<-m> or B<--median>
61              
62             Show median of the window in addition to mean.
63              
64             =item B<-e E> or B<--empty E>
65              
66             Give value E as the value for empty (null) records.
67             This null value is then output before a full window is accumulated.
68              
69             =item B<-f FORMAT> or B<--format FORMAT>
70              
71             Specify a L-style format for output mean and standard deviation.
72             Defaults to C<%.5g>.
73              
74             =back
75              
76             Eventually we expect to support other options of L.
77              
78             =for comment
79             begin_standard_fsdb_options
80              
81             This module also supports the standard fsdb options:
82              
83             =over 4
84              
85             =item B<-d>
86              
87             Enable debugging output.
88              
89             =item B<-i> or B<--input> InputSource
90              
91             Read from InputSource, typically a file name, or C<-> for standard input,
92             or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
93              
94             =item B<-o> or B<--output> OutputDestination
95              
96             Write to OutputDestination, typically a file name, or C<-> for standard output,
97             or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
98              
99             =item B<--autorun> or B<--noautorun>
100              
101             By default, programs process automatically,
102             but Fsdb::Filter objects in Perl do not run until you invoke
103             the run() method.
104             The C<--(no)autorun> option controls that behavior within Perl.
105              
106             =item B<--help>
107              
108             Show help.
109              
110             =item B<--man>
111              
112             Show full manual.
113              
114             =back
115              
116             =for comment
117             end_standard_fsdb_options
118              
119              
120             =head1 SAMPLE USAGE
121              
122             =head2 Input:
123              
124             #fsdb date epoch count
125             19980201 886320000 6
126             19980202 886406400 8
127             19980203 886492800 19
128             19980204 886579200 53
129             19980205 886665600 20
130             19980206 886752000 18
131             19980207 886838400 5
132             19980208 886924800 9
133             19980209 887011200 22
134             19980210 887097600 22
135             19980211 887184000 36
136             19980212 887270400 26
137             19980213 887356800 23
138             19980214 887443200 6
139              
140             =head2 Command:
141              
142             cat data.fsdb | dbmovingstats -e - -w 4 count
143              
144             =head2 Output:
145              
146              
147             #fsdb date epoch count moving_mean moving_stddev
148             19980201 886320000 6 - -
149             19980202 886406400 8 - -
150             19980203 886492800 19 - -
151             19980204 886579200 53 21.5 21.764
152             19980205 886665600 20 25 19.442
153             19980206 886752000 18 27.5 17.02
154             19980207 886838400 5 24 20.445
155             19980208 886924800 9 13 7.1647
156             19980209 887011200 22 13.5 7.8528
157             19980210 887097600 22 14.5 8.8129
158             19980211 887184000 36 22.25 11.026
159             19980212 887270400 26 26.5 6.6081
160             19980213 887356800 23 26.75 6.3966
161             19980214 887443200 6 22.75 12.473
162             # | dbcolmovingstats -e - -n 4 count
163              
164              
165             =head1 SEE ALSO
166              
167             L.
168             L.
169             L.
170             L.
171              
172             =head1 BUGS
173              
174             Currently there is no support for generalized quantiles.
175              
176              
177             =head1 CLASS FUNCTIONS
178              
179             =cut
180              
181             @ISA = qw(Fsdb::Filter);
182             $VERSION = 2.0;
183              
184 1     1   5000 use strict;
  1         2  
  1         28  
185 1     1   6 use Pod::Usage;
  1         1  
  1         99  
186 1     1   7 use Carp;
  1         2  
  1         47  
187              
188 1     1   5 use Fsdb::Filter;
  1         2  
  1         16  
189 1     1   5 use Fsdb::IO::Reader;
  1         1  
  1         21  
190 1     1   4 use Fsdb::IO::Writer;
  1         4  
  1         20  
191 1     1   4 use Fsdb::Support qw($is_numeric_regexp);
  1         2  
  1         842  
192              
193              
194             =head2 new
195              
196             $filter = new Fsdb::Filter::dbcolmovingstats(@arguments);
197              
198             Create a new dbcolmovingstats object, taking command-line arguments.
199              
200             =cut
201              
202             sub new ($@) {
203 0     0 1   my $class = shift @_;
204 0           my $self = $class->SUPER::new(@_);
205 0           bless $self, $class;
206 0           $self->set_defaults;
207 0           $self->parse_options(@_);
208 0           $self->SUPER::post_new();
209 0           return $self;
210             }
211              
212              
213             =head2 set_defaults
214              
215             $filter->set_defaults();
216              
217             Internal: set up defaults.
218              
219             =cut
220              
221             sub set_defaults ($) {
222 0     0 1   my($self) = @_;
223 0           $self->SUPER::set_defaults();
224 0           $self->{_empty} = undef;
225 0           $self->{_format} = "%.5g";
226 0           $self->{_include_non_numeric} = undef;
227 0           $self->{_window} = 10;
228 0           $self->{_median} = undef;
229             }
230              
231             =head2 parse_options
232              
233             $filter->parse_options(@ARGV);
234              
235             Internal: parse command-line arguments.
236              
237             =cut
238              
239             sub parse_options ($@) {
240 0     0 1   my $self = shift @_;
241              
242 0           my(@argv) = @_;
243             $self->get_options(
244             \@argv,
245 0     0     'help|?' => sub { pod2usage(1); },
246 0     0     'man' => sub { pod2usage(-verbose => 2); },
247             'a|include-non-numeric!' => \$self->{_include_non_numeric},
248             'autorun!' => \$self->{_autorun},
249             'close!' => \$self->{_close},
250             'd|debug+' => \$self->{_debug},
251             'e|empty=s' => \$self->{_empty},
252             'f|format=s' => \$self->{_format},
253 0     0     'i|input=s' => sub { $self->parse_io_option('input', @_); },
254             'log!' => \$self->{_logprog},
255             'm|median!' => \$self->{_median},
256 0     0     'o|output=s' => sub { $self->parse_io_option('output', @_); },
257             'w|n|window=i' => \$self->{_window},
258 0 0         ) or pod2usage(2);
259 0           $self->parse_target_column(\@argv);
260             }
261              
262             =head2 setup
263              
264             $filter->setup();
265              
266             Internal: setup, parse headers.
267              
268             =cut
269              
270             sub setup ($) {
271 0     0 1   my($self) = @_;
272              
273 0 0         pod2usage(2) if (!defined($self->{_target_column}));
274              
275 0           $self->finish_io_option('input', -comment_handler => $self->create_pass_comments_sub);
276 0           $self->{_target_coli} = $self->{_in}->col_to_i($self->{_target_column});
277             croak $self->{_prog} . ": target column " . $self->{_target_column} . " is not in input stream.\n"
278 0 0         if (!defined($self->{_target_coli}));
279              
280 0           $self->finish_io_option('output', -clone => $self->{_in}, -outputheader => 'delay');
281 0           my(@new_cols) = qw(moving_mean moving_stddev);
282 0 0         push (@new_cols, "moving_median") if ($self->{_median});
283 0           foreach (@new_cols) {
284             $self->{_out}->col_create($_)
285 0 0         or croak $self->{_prog} . ": cannot create column $_ (maybe it already existed?)\n";
286             };
287 0           my $write_fastpath_sub = $self->{_out}->fastpath_sub();
288             }
289              
290             =head2 run
291              
292             $filter->run();
293              
294             Internal: run over each rows.
295              
296             =cut
297             sub run ($) {
298 0     0 1   my($self) = @_;
299              
300 0           my $read_fastpath_sub = $self->{_in}->fastpath_sub();
301 0           my $write_fastpath_sub = $self->{_out}->fastpath_sub();
302              
303 0           my $coli = $self->{_target_coli};
304 0           my $mean_coli = $self->{_out}->col_to_i('moving_mean');
305 0           my $stddev_coli = $self->{_out}->col_to_i('moving_stddev');
306 0           my $doing_median = defined($self->{_median});
307 0           my $median_coli = undef;
308 0 0         $median_coli = $self->{_out}->col_to_i('moving_median') if ($doing_median);
309 0           my $req_acc = $self->{_window};
310 0           my $empty_value = $self->{_empty};
311              
312 0           my(@d) = ();
313 0           my($sx) = 0;
314 0           my($sxx) = 0;
315             # my($minmaxinit) = 0;
316              
317             #
318             # Read and process the data.
319             #
320 0           my $fref;
321 0           while ($fref = &$read_fastpath_sub()) {
322 0           my $x = $fref->[$coli];
323 0 0         croak $self->{_prog} . ": null data value.\n" if (!defined($x));
324 0           my $x_is_valid = 1;
325 0 0         if ($x !~ /$is_numeric_regexp/) {
326 0 0         if ($self->{_include_non_numeric}) {
327 0           $x = 0;
328             } else {
329 0           $x_is_valid = undef;
330             };
331             };
332              
333 0 0         if ($x_is_valid) {
334 0           push(@d, $x);
335 0           $sx += $x;
336 0           $sxx += $x * $x;
337             };
338             # print SAVE_DATA "$x\n" if ($save_data);
339 0 0         if ($#d >= $req_acc) {
340 0           my $ox = shift @d;
341 0           $sx -= $ox;
342 0           $sxx -= $ox * $ox;
343             };
344              
345             # if (!$minmaxinit) {
346             # $min = $max = $x;
347             # $minmaxinit = 1;
348             # } else {
349             # $min = $x if ($x < $min);
350             # $max = $x if ($x > $max);
351             # };
352              
353 0           my $mean;
354             my $stddev;
355 0           my $median;
356 0           my $n = $#d+1;
357              
358 0           $mean = $sx / $n;
359 0 0         my($sqrt_part) = ($n <= 1 ? 0 : ($sxx - $n * $mean * $mean) / ($n - 1));
360             # Sqrt_part can go negtiave if we have floating point rounding in the subtraction,
361             # so protect against that.
362 0 0         $stddev = ($sqrt_part > 0 ? sqrt($sqrt_part) : 0);
363             #
364             # We get different results for different FP precision.
365             # Result TEST/dbcolmovingstats_rounding_error is unstable
366             # for the run of values that are all 0.8244.
367             # See
368             # and .
369             # Fix: map values where the relative standard deviation is small
370             # to a fixed value. This seems (empircally) to happen when stddev/mean < 1e-7
371             # which corresponds to IEEE single precision floating point.
372             #
373             # In practice, this case comes up only with strings of identical values
374             # where the moving stddev drops to zero +/- rounding error.
375             #
376 0 0 0       if ($mean != 0 && $stddev != 0 && $stddev / $mean < 1e-7) {
      0        
377 0           $stddev = 1e-7;
378             };
379 0 0         if ($doing_median) {
380 0           my $median_i = int($n / 2);
381             # 1 -> 0.5 -> [0]
382             # 2 -> 1 -> [1]
383             # 3 -> 1.5 -> [1]
384             # 4 -> 2 -> [2]
385             # 5 -> 2.5 -> [2]
386             # 6 -> 3 -> [3] ---we take the upper of the medians when there are two values
387             # Note that my understanding is sorting every time,
388             # in C, is faster than maintaining fancy data structures in Perl.
389 0           my(@sorted_d) = sort { $a <=> $b } @d;
  0            
390 0           $median = $sorted_d[$median_i]; # . ':' . join(",", @sorted_d);
391             };
392              
393 0 0 0       if ($n < $req_acc && defined($empty_value)) {
394 0           $mean = $stddev = $median = $empty_value;
395             } else {
396 0           $mean = $self->numeric_formatting($mean);
397 0 0         $stddev = $self->numeric_formatting($stddev) if (defined($stddev));
398             };
399 0 0 0       $stddev = $empty_value if (!defined($stddev) && !defined($empty_value));
400 0           $fref->[$mean_coli] = $mean;
401 0           $fref->[$stddev_coli] = $stddev;
402 0 0         $fref->[$median_coli] = $median if ($doing_median);
403 0           &$write_fastpath_sub($fref);
404             };
405             }
406              
407             =head1 AUTHOR and COPYRIGHT
408              
409             Copyright (C) 1991-2015 by John Heidemann
410              
411             This program is distributed under terms of the GNU general
412             public license, version 2. See the file COPYING
413             with the distribution for details.
414              
415             =cut
416              
417             1;