File Coverage

blib/lib/Fsdb/Filter/dbrvstatdiff.pm
Criterion Covered Total %
statement 21 153 13.7
branch 0 56 0.0
condition 0 12 0.0
subroutine 7 16 43.7
pod 5 5 100.0
total 33 242 13.6


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2              
3             #
4             # dbrvstatdiff
5             #
6             # Copyright (C) 1991-2015 by John Heidemann
7             # $Id: ae7f061d6c1ab7aa2bd1243c98877ac4cdde0c3d $
8             #
9             # This program is distributed under terms of the GNU general
10             # public license, version 2. See the file COPYING
11             # in $dblibdir for details.
12             #
13              
14             package Fsdb::Filter::dbrvstatdiff;
15              
16             =head1 NAME
17              
18             dbrvstatdiff - evaluate statistical differences between two random variables
19              
20             =head1 SYNOPSIS
21              
22             dbrvstatdiff [-f format] [-c ConfRating]
23             [-h HypothesizedDifference] m1c sd1c n1c m2c sd2c n2c
24              
25             OR
26              
27             dbrvstatdiff [-f format] [-c ConfRating] m1c n1c m2c n2c
28              
29             =head1 DESCRIPTION
30              
31             Produce statistics on the difference of sets of random variables.
32             If a hypothesized difference is given
33             (with C<-h>), to does a Student's t-test.
34              
35             Random variables are specified by:
36              
37             =over 4
38              
39             =item C, C
40              
41             The column names of means of random variables.
42              
43             =item C, C
44              
45             The column names of standard deviations of random variables.
46              
47             =item C, C
48              
49             Counts of number of samples for each random variable
50              
51             =back
52              
53             These values can be computed with L.
54              
55             Creates up to ten new columns:
56              
57             =over 4
58              
59             =item C
60              
61             The difference of RV 2 - RV 1.
62              
63             =item C
64              
65             The percentage difference (RV2-RV1)/1
66              
67             =item C and C
68              
69             The half half confidence intervals and low and high values
70             for absolute and relative confidence.
71              
72             =item C
73              
74             The T-test value for the given hypothesized difference.
75              
76             =item C
77              
78             Given the confidence rating, does the test pass? Will be
79             either "rejected" or "not-rejected".
80              
81             =item C
82              
83             The hypothesized value that is break-even point
84             for the T-test.
85              
86             =item C
87              
88             Break-even point as a percent of m1c.
89              
90             =back
91              
92             Confidence intervals are not printed if standard deviations are not provided.
93             Confidence intervals assume normal distributions with common variances.
94              
95             T-tests are only computed if a hypothesized difference is provided.
96             Hypothesized differences should be proceeded by <=, >=, =.
97             T-tests assume normal distributions with common variances.
98              
99             =head1 OPTIONS
100              
101             =over 4
102              
103             =item B<-c FRACTION> or B<--confidence FRACTION>
104              
105             Specify FRACTION for the confidence interval.
106             Defaults to 0.95 for a 95% confidence factor
107             (alpha = 0.05).
108              
109             =item B<-f FORMAT> or B<--format FORMAT>
110              
111             Specify a L-style format for output statistics.
112             Defaults to C<%.5g>.
113              
114             =item B<-h DIFF> or B<--hypothesis DIFF>
115              
116             Specify the hypothesized difference as C,
117             where C is something like C=0> or C=0>, etc.
118              
119             =back
120              
121             =for comment
122             begin_standard_fsdb_options
123              
124             This module also supports the standard fsdb options:
125              
126             =over 4
127              
128             =item B<-d>
129              
130             Enable debugging output.
131              
132             =item B<-i> or B<--input> InputSource
133              
134             Read from InputSource, typically a file name, or C<-> for standard input,
135             or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
136              
137             =item B<-o> or B<--output> OutputDestination
138              
139             Write to OutputDestination, typically a file name, or C<-> for standard output,
140             or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
141              
142             =item B<--autorun> or B<--noautorun>
143              
144             By default, programs process automatically,
145             but Fsdb::Filter objects in Perl do not run until you invoke
146             the run() method.
147             The C<--(no)autorun> option controls that behavior within Perl.
148              
149             =item B<--help>
150              
151             Show help.
152              
153             =item B<--man>
154              
155             Show full manual.
156              
157             =back
158              
159             =for comment
160             end_standard_fsdb_options
161              
162              
163             =head1 SAMPLE USAGE
164              
165             =head2 Input:
166              
167             #fsdb title mean2 stddev2 n2 mean1 stddev1 n1
168             example6.12 0.17 0.0020 5 0.22 0.0010 4
169              
170             =head2 Command:
171              
172             cat data.fsdb | dbrvstatdiff mean2 stddev2 n2 mean1 stddev1 n1
173              
174             =head2 Output:
175              
176             #fsdb title mean2 stddev2 n2 mean1 stddev1 n1 diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high
177             example6.12 0.17 0.0020 5 0.22 0.0010 4 0.05 29.412 0.0026138 0.047386 0.052614 1.5375 27.874 30.949
178             # | dbrvstatdiff mean2 stddev2 n2 mean1 stddev1 n1
179              
180             =head2 Input 2:
181              
182             (example 7.10 from Scheaffer and McClave):
183              
184             #fsdb title x2 sd2 n2 x1 sd1 n1
185             example7.10 9 35.22 24.44 9 31.56 20.03
186              
187             =head2 Command 2:
188              
189             dbrvstatdiff -h '<=0' x2 sd2 n2 x1 sd1 n1
190              
191             =head2 Output 2:
192              
193             #fsdb title n1 x1 sd1 n2 x2 sd2 diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high t_test t_test_result
194             example7.10 9 35.22 24.44 9 31.56 20.03 3.66 0.11597 4.7125 -1.0525 8.3725 0.14932 -0.033348 0.26529 1.6465 not-rejected
195             # | /global/us/edu/ucla/cs/ficus/users/johnh/BIN/DB/dbrvstatdiff -h <=0 x2 sd2 n2 x1 sd1 n1
196              
197              
198             =head2 Case 3:
199              
200             A common use case is to have one file with a set of trials
201             from two experiments, and to use dbrvstatdiff to see if they are different.
202              
203             =head3 Input 3:
204              
205             #fsdb case trial value
206             a 1 1
207             a 2 1.1
208             a 3 0.9
209             a 4 1
210             a 5 1.1
211             b 1 2
212             b 2 2.1
213             b 3 1.9
214             b 4 2
215             b 5 1.9
216              
217             =head2 Command 3:
218              
219             cat two_trial.fsdb |
220             dbmultistats -k case value |
221             dbcolcopylast mean stddev n |
222             dbrow '_case eq "b"' |
223             dbrvstatdiff -h '=0' mean stddev n copylast_mean copylast_stddev copylast_n |
224             dblistize
225              
226             =head3 Output 3:
227              
228             #fsdb -R C case mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n copylast_mean copylast_stddev copylast_n diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high t_test t_test_result t_test_break t_test_break_pct
229             case: b
230             mean: 1.98
231             stddev: 0.083666
232             pct_rsd: 4.2256
233             conf_range: 0.10387
234             conf_low: 1.8761
235             conf_high: 2.0839
236             conf_pct: 0.95
237             sum: 9.9
238             sum_squared: 19.63
239             min: 1.9
240             max: 2.1
241             n: 5
242             copylast_mean: 1.02
243             copylast_stddev: 0.083666
244             copylast_n: 5
245             diff: -0.96
246             diff_pct: -48.485
247             diff_conf_half: 0.12202
248             diff_conf_low: -1.082
249             diff_conf_high: -0.83798
250             diff_conf_pct_half: 6.1627
251             diff_conf_pct_low: -54.648
252             diff_conf_pct_high: -42.322
253             t_test: -18.142
254             t_test_result: rejected
255             t_test_break: -1.082
256             t_test_break_pct: -54.648
257            
258             # | dbmultistats -k case value
259             # | dbcolcopylast mean stddev n
260             # | dbrow _case eq "b"
261             # | dbrvstatdiff -h =0 mean stddev n copylast_mean copylast_stddev copylast_n
262             # | dbfilealter -R C
263              
264             (So one cannot say that they are statistically equal.)
265              
266              
267             =head1 SEE ALSO
268              
269             L.
270             L.
271             L.
272              
273              
274             =head1 CLASS FUNCTIONS
275              
276             =cut
277              
278             @ISA = qw(Fsdb::Filter);
279             $VERSION = 2.0;
280              
281 1     1   6230 use strict;
  1         1  
  1         24  
282 1     1   3 use Pod::Usage;
  1         1  
  1         64  
283 1     1   4 use Carp;
  1         1  
  1         37  
284              
285 1     1   3 use Fsdb::Filter;
  1         1  
  1         13  
286 1     1   3 use Fsdb::IO::Reader;
  1         1  
  1         17  
287 1     1   3 use Fsdb::IO::Writer;
  1         1  
  1         21  
288 1     1   369 use Fsdb::Support::TDistribution qw(t_distribution);
  1         2  
  1         1157  
289              
290              
291             =head2 new
292              
293             $filter = new Fsdb::Filter::dbrvstatdiff(@arguments);
294              
295             Create a new dbrvstatdiff object, taking command-line arguments.
296              
297             =cut
298              
299             sub new ($@) {
300 0     0 1   my $class = shift @_;
301 0           my $self = $class->SUPER::new(@_);
302 0           bless $self, $class;
303 0           $self->set_defaults;
304 0           $self->parse_options(@_);
305 0           $self->SUPER::post_new();
306 0           return $self;
307             }
308              
309              
310             =head2 set_defaults
311              
312             $filter->set_defaults();
313              
314             Internal: set up defaults.
315              
316             =cut
317              
318             sub set_defaults ($) {
319 0     0 1   my($self) = @_;
320 0           $self->SUPER::set_defaults();
321 0           $self->{_confidence_fraction} = 0.95;
322 0           $self->{_format} = "%.5g";
323 0           $self->{_hypothesis} = undef;
324 0           $self->{_hyp_diff} = undef;
325 0           $self->{_hyp_class} = undef;
326 0           $self->{_arg_fields} = [];
327            
328 0           $self->{_do_confidence} = undef;
329             # $self->{_do_t_test} = undef; just check for defined(hypothesis)
330             }
331              
332             =head2 parse_options
333              
334             $filter->parse_options(@ARGV);
335              
336             Internal: parse command-line arguments.
337              
338             =cut
339              
340             sub parse_options ($@) {
341 0     0 1   my $self = shift @_;
342              
343 0           my(@argv) = @_;
344             $self->get_options(
345             \@argv,
346 0     0     'help|?' => sub { pod2usage(1); },
347 0     0     'man' => sub { pod2usage(-verbose => 2); },
348             'autorun!' => \$self->{_autorun},
349             'c|confidence=f' => \$self->{_confidence_fraction},
350             'close!' => \$self->{_close},
351             'd|debug+' => \$self->{_debug},
352             'f|format=s' => \$self->{_format},
353             'h|hypothesis=s' => \$self->{_hypothesis},
354 0     0     'i|input=s' => sub { $self->parse_io_option('input', @_); },
355             'log!' => \$self->{_logprog},
356 0     0     'o|output=s' => sub { $self->parse_io_option('output', @_); },
357 0 0         ) or pod2usage(2);
358 0           push (@{$self->{_arg_fields}}, @argv);
  0            
359             }
360              
361             =head2 setup
362              
363             $filter->setup();
364              
365             Internal: setup, parse headers.
366              
367             =cut
368              
369             sub setup ($) {
370 0     0 1   my($self) = @_;
371              
372             #
373             # valid hypothesis, if any?
374             #
375 0 0         if (defined($self->{_hypothesis})) {
376 0           my($hyp_class, $hyp_diff) = ($self->{_hypothesis} =~ /^\s*([<>=!]+)\s*([0-9\.]+)/);
377 0 0         $hyp_class = '' if (!defined($hyp_class));
378 0 0         $self->{_hyp_class} = -1 if ($hyp_class eq '<=');
379 0 0         $self->{_hyp_class} = 0 if ($hyp_class =~ /^==?$/);
380 0 0         $self->{_hyp_class} = 1 if ($hyp_class eq '>=');
381             croak $self->{_prog} . ": bad hypothesis specification ``" .
382             $self->{_hypothesis} . "''; not of format <=N, =N, >=N, where N is some number.\n"
383 0 0 0       if (!defined($self->{_hyp_class}) || !defined($hyp_diff));
384 0           $self->{_hyp_diff} = $hyp_diff;
385             };
386              
387             #
388             # what exactly are we doing?
389             #
390 0 0         if ($#{$self->{_arg_fields}} == 5) {
  0 0          
391 0           ($self->{_m1_column}, $self->{_ss1_column}, $self->{_n1_column}, $self->{_m2_column}, $self->{_ss2_column}, $self->{_n2_column}) = @{$self->{_arg_fields}};
  0            
392 0           $self->{_do_confidence} = 1;
393 0           } elsif ($#{$self->{_arg_fields}} == 3) {
394 0           ($self->{_m1_column}, $self->{_n1_column}, $self->{_m2_column}, $self->{_n2_column}) = @{$self->{_arg_fields}};
  0            
395 0           $self->{_ss1_column} = $self->{_ss2_column} = undef;
396             croak $self->{_prog} . ": T-tests require standard deviations, but none were given as arguments.\n"
397 0 0         if (defined($self->{_hyp_diff}));
398             } else {
399 0           carp $self->{_prog} . ": confusing number of fields given; cannot identify desired type of stats.\n";
400 0           pod2usage(2);
401             };
402              
403             #
404             # finish up IO
405             #
406 0           $self->finish_io_option('input', -comment_handler => $self->create_pass_comments_sub);
407              
408             # check selected columns
409 0           foreach (qw(_m1_column _ss1_column _n1_column _m2_column _ss2_column _n2_column)) {
410 0           my $coli = $_;
411 0 0         next if (!defined($self->{$_})); # maybe unspecified sum-of-squares
412 0           $coli =~ s/_column/_coli/;
413 0           $self->{$coli} = $self->{_in}->col_to_i($self->{$_});
414             croak $self->{_prog} . ": unknown selected input column ``$_''.\n"
415 0 0         if (!defined($self->{$coli}));
416             };
417              
418 0           $self->finish_io_option('output', -clone => $self->{_in}, -outputheader => 'delay');
419 0           my(@new_columns) = qw(diff diff_pct);
420             push(@new_columns, qw(diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high))
421 0 0         if (defined($self->{_do_confidence}));
422             push(@new_columns, qw(t_test t_test_result t_test_break t_test_break_pct))
423 0 0         if (defined($self->{_hypothesis}));
424 0           foreach (@new_columns) {
425             $self->{_out}->col_create($_)
426 0 0         or croak $self->{_prog} . ": cannot create column ``$_'' (maybe it already existed?)\n";
427 0           $self->{"_${_}_coli"} = $self->{_out}->col_to_i($_);
428 0 0         defined($self->{"_${_}_coli"}) or croak "internal error\n";
429             };
430              
431             }
432              
433             =head2 run
434              
435             $filter->run();
436              
437             Internal: run over each rows.
438              
439             =cut
440             sub run ($) {
441 0     0 1   my($self) = @_;
442              
443 0           my $conf_alpha = (1.0 - $self->{_confidence_fraction}) / 2.0;
444              
445 0           my $read_fastpath_sub = $self->{_in}->fastpath_sub();
446 0           my $write_fastpath_sub = $self->{_out}->fastpath_sub();
447              
448 0           my $m1c = $self->{_m1_coli};
449 0           my $m2c = $self->{_m2_coli};
450 0           my $n1c = $self->{_n1_coli};
451 0           my $n2c = $self->{_n2_coli};
452 0           my $ss1c = $self->{_ss1_coli};
453 0           my $ss2c = $self->{_ss2_coli};
454              
455 0           my $empty = $self->{_empty};
456 0           my $hyp_diff = $self->{_hyp_diff};
457 0           my $hyp_class = $self->{_hyp_class};
458 0           my $format = $self->{_format};
459              
460 0           my $diff_f = $self->{_diff_coli};
461 0           my $diff_pct_f = $self->{_diff_pct_coli};
462 0           my $diff_conf_half_f = $self->{_diff_conf_half_coli};
463 0           my $diff_conf_low_f = $self->{_diff_conf_low_coli};
464 0           my $diff_conf_high_f = $self->{_diff_conf_high_coli};
465 0           my $diff_conf_pct_half_f = $self->{_diff_conf_pct_half_coli};
466 0           my $diff_conf_pct_low_f = $self->{_diff_conf_pct_low_coli};
467 0           my $diff_conf_pct_high_f = $self->{_diff_conf_pct_high_coli};
468 0           my $t_test_f = $self->{_t_test_coli};
469 0           my $t_test_result_f = $self->{_t_test_result_coli};
470 0           my $t_test_break_f = $self->{_t_test_break_coli};
471 0           my $t_test_break_pct_f = $self->{_t_test_break_pct_coli};
472              
473 0           my $fref;
474 0           my($diff_conf_half, $diff_conf_low, $diff_conf_high,
475             $diff_conf_pct_half, $diff_conf_pct_low, $diff_conf_pct_high,
476             $t_test, $t_test_result,
477             $t_test_break, $t_test_break_pct);
478 0           while ($fref = &$read_fastpath_sub()) {
479            
480 0           my $diff = $fref->[$m2c] - $fref->[$m1c];
481 0           my $diff_pct;
482 0 0         if ($fref->[$m1c] == 0.0) {
483 0           $diff_pct = $empty;
484             } else {
485 0           $diff_pct = $diff / $fref->[$m1c] * 100.0;
486             };
487 0           $diff_conf_half = $diff_conf_low = $diff_conf_high =
488             $diff_conf_pct_half = $diff_conf_pct_low = $diff_conf_pct_high =
489             $t_test = $t_test_result =
490             $t_test_break = $t_test_break_pct = $empty;
491 0 0 0       if ($self->{_do_confidence} && $fref->[$n1c] > 1 && $fref->[$n2c] > 1) {
      0        
492             # basic stuff
493 0           my $degrees_of_freedom = ($fref->[$n1c] + $fref->[$n2c] - 2);
494 0           my $ssp = (($fref->[$n1c] - 1) * $fref->[$ss1c] * $fref->[$ss1c] + ($fref->[$n2c] - 1) * $fref->[$ss2c] * $fref->[$ss2c]) / $degrees_of_freedom;
495 0           my $sqrt_ssp_inverses = sqrt($ssp * (1/$fref->[$n1c] + 1/$fref->[$n2c]));
496 0           my $t_value = t_distribution($degrees_of_freedom, $conf_alpha);
497            
498             # confidence intervals
499 0           $diff_conf_half = $t_value * $sqrt_ssp_inverses;
500 0           $diff_conf_low = $diff - $diff_conf_half;
501 0           $diff_conf_high = $diff + $diff_conf_half;
502            
503             # t-test
504 0 0 0       if (defined($self->{_hypothesis}) && $sqrt_ssp_inverses != 0.0) {
505 0           $t_test = ($diff - $hyp_diff) / $sqrt_ssp_inverses;
506 0           $t_test_result = "not-rejected";
507 0 0         if ($hyp_class < 0) {
    0          
508 0 0         $t_test_result = "rejected"
509             if ($t_test > $t_value);
510             } elsif ($hyp_class > 0) {
511 0 0         $t_test_result = "rejected"
512             if ($t_test < $t_value);
513             } else {
514 0 0         $t_test_result = "rejected"
515             if (abs($t_test) > $t_value);
516             };
517             # also compute the break-even point
518 0           $t_test_break = $diff - $t_value * $sqrt_ssp_inverses;
519 0 0         $t_test_break_pct = $t_test_break / $fref->[$m1c] * 100.0
520             if ($fref->[$m1c] != 0.0);
521             };
522            
523             # percentages
524 0 0         if ($fref->[$m1c] != 0.0) {
525 0           $diff_conf_pct_half = $diff_conf_half / $fref->[$m1c] * 100.0;
526 0           $diff_conf_pct_low = $diff_conf_low / $fref->[$m1c] * 100.0;
527 0           $diff_conf_pct_high = $diff_conf_high / $fref->[$m1c] * 100.0;
528             };
529             };
530            
531            
532 0           $fref->[$diff_f] = sprintf("$format", $diff);
533 0           $fref->[$diff_pct_f] = sprintf("$format", $diff_pct);
534 0 0         if ($self->{_do_confidence}) {
535 0           $fref->[$diff_conf_half_f] = sprintf("$format", $diff_conf_half);
536 0           $fref->[$diff_conf_low_f] = sprintf("$format", $diff_conf_low);
537 0           $fref->[$diff_conf_high_f] = sprintf("$format", $diff_conf_high);
538 0           $fref->[$diff_conf_pct_half_f] = sprintf("$format", $diff_conf_pct_half);
539 0           $fref->[$diff_conf_pct_low_f] = sprintf("$format", $diff_conf_pct_low);
540 0           $fref->[$diff_conf_pct_high_f] = sprintf("$format", $diff_conf_pct_high);
541             };
542 0 0         if (defined($self->{_hypothesis})) {
543 0           $fref->[$t_test_f] = sprintf("$format", $t_test);
544 0           $fref->[$t_test_result_f] = $t_test_result;
545 0           $fref->[$t_test_break_f] = sprintf("$format", $t_test_break);
546 0           $fref->[$t_test_break_pct_f] = sprintf("$format", $t_test_break_pct);
547             };
548            
549 0           &$write_fastpath_sub($fref);
550             };
551             }
552              
553              
554             =head1 AUTHOR and COPYRIGHT
555              
556             Copyright (C) 1991-2015 by John Heidemann
557              
558             This program is distributed under terms of the GNU general
559             public license, version 2. See the file COPYING
560             with the distribution for details.
561              
562             =cut
563              
564             1;