line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl -w |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# dbcolstats.pm |
5
|
|
|
|
|
|
|
# Copyright (C) 1991-2015 by John Heidemann |
6
|
|
|
|
|
|
|
# $Id: b8f85fa383507a09ebfc72e644fadd6e1d5ceed0 $ |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# This program is distributed under terms of the GNU general |
9
|
|
|
|
|
|
|
# public license, version 2. See the file COPYING |
10
|
|
|
|
|
|
|
# in $dblibdir for details. |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
package Fsdb::Filter::dbcolstats; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 NAME |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
dbcolstats - compute statistics on a fsdb column |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
dbcolstats [-amS] [-c ConfidenceFraction] [-q NumberOfQuantiles] column |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
Compute statistics over a COLUMN of data. |
26
|
|
|
|
|
|
|
Records containing non-numeric data are considered null |
27
|
|
|
|
|
|
|
and do not contribute to the stats (with the C<-a> option |
28
|
|
|
|
|
|
|
they are treated as zeros). |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Confidence intervals are a t-test (+/- (t_{a/2})*s/sqrt(n)) |
31
|
|
|
|
|
|
|
and assume the population takes a normal distribution |
32
|
|
|
|
|
|
|
with a small number of samples (< 100). |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
By default, |
35
|
|
|
|
|
|
|
all statistics are computed as a population I<sample> (with an ``n-1'' term), |
36
|
|
|
|
|
|
|
not as representing the whole population (using ``n''). |
37
|
|
|
|
|
|
|
Select between them with B<--sample> or B<--nosample>. |
38
|
|
|
|
|
|
|
When you measure the entire population, use the latter option. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
The output of this program is probably best looked at after |
41
|
|
|
|
|
|
|
reformatting with L<dblistize>. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Dbcolstats runs in O(1) memory. Median or quantile requires sorting the |
44
|
|
|
|
|
|
|
data and invokes dbsort. Sorting will run in constant RAM but |
45
|
|
|
|
|
|
|
O(number of records) disk space. If median or quantile is required |
46
|
|
|
|
|
|
|
and the data is already sorted, dbcolstats will run more efficiently with |
47
|
|
|
|
|
|
|
the -S option. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 OPTIONS |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=over 4 |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item B<-a> or B<--include-non-numeric> |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Compute stats over all records (treat non-numeric records |
57
|
|
|
|
|
|
|
as zero rather than just ignoring them). |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item B<-c FRACTION> or B<--confidence FRACTION> |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Specify FRACTION for the confidence interval. |
62
|
|
|
|
|
|
|
Defaults to 0.95 for a 95% confidence factor. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item B<-f FORMAT> or B<--format FORMAT> |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Specify a L<printf(3)>-style format for output statistics. |
67
|
|
|
|
|
|
|
Defaults to C<%.5g>. |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=item B<-m> or B<--median> |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Compute median value. (Will sort data if necessary.) |
72
|
|
|
|
|
|
|
(Median is the quantile for N=2.) |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item B<-q N> or B<--quantile N> |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Compute quantile (quartile when N is 4), |
77
|
|
|
|
|
|
|
or an arbitrary quantile for other values of N, |
78
|
|
|
|
|
|
|
reporting the scores that are each 1/Nth of the way across the population. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item B<--sample> |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Compute I<sample> population statistics |
83
|
|
|
|
|
|
|
(e.g., the sample standard deviation), |
84
|
|
|
|
|
|
|
assuming I<n-1> degrees of freedom. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item B<--nosample> |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Compute I<whole> population statistics |
89
|
|
|
|
|
|
|
(e.g., the population standard deviation). |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=item B<-S> or B<--pre-sorted> |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Assume data is already sorted. |
94
|
|
|
|
|
|
|
With one -S, we check and confirm this precondition. |
95
|
|
|
|
|
|
|
When repeated, we skip the check. |
96
|
|
|
|
|
|
|
(This flag is ignored if neither median nor quantiles are requested.) |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=item B<--parallelism=N> or C<-j N> |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Allow sorting to happen in parallel. |
101
|
|
|
|
|
|
|
Defaults on. |
102
|
|
|
|
|
|
|
(Only relevant if using non-pre-sorted data with quantiles.) |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item B<-F> or B<--fs> or B<--fieldseparator> S |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Specify the field (column) separator as C<S>. |
107
|
|
|
|
|
|
|
See L<dbfilealter> for valid field separators. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=item B<-T TmpDir> |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
where to put temporary data. |
112
|
|
|
|
|
|
|
Only used if median or quantiles are requested. |
113
|
|
|
|
|
|
|
Also uses environment variable TMPDIR, if -T is |
114
|
|
|
|
|
|
|
not specified. |
115
|
|
|
|
|
|
|
Default is /tmp. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=item B<-k KeyField> |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Do multi-stats, grouped by each key. |
120
|
|
|
|
|
|
|
Assumes keys are sorted. (Use dbmultistats to guarantee sorting order.) |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item B<--output-on-no-input> |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Enables null output (all fields are "-", n is 0) |
125
|
|
|
|
|
|
|
if we get input with a schema but no records. |
126
|
|
|
|
|
|
|
Without this option, just output the schema but no rows. |
127
|
|
|
|
|
|
|
Default: no output if no input. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=back |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=for comment |
133
|
|
|
|
|
|
|
begin_standard_fsdb_options |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
This module also supports the standard fsdb options: |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=over 4 |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=item B<-d> |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Enable debugging output. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=item B<-i> or B<--input> InputSource |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Read from InputSource, typically a file name, or C<-> for standard input, |
146
|
|
|
|
|
|
|
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=item B<-o> or B<--output> OutputDestination |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
Write to OutputDestination, typically a file name, or C<-> for standard output, |
151
|
|
|
|
|
|
|
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=item B<--autorun> or B<--noautorun> |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
By default, programs process automatically, |
156
|
|
|
|
|
|
|
but Fsdb::Filter objects in Perl do not run until you invoke |
157
|
|
|
|
|
|
|
the run() method. |
158
|
|
|
|
|
|
|
The C<--(no)autorun> option controls that behavior within Perl. |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item B<--help> |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Show help. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=item B<--man> |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Show full manual. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=back |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=for comment |
171
|
|
|
|
|
|
|
end_standard_fsdb_options |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head1 SAMPLE USAGE |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head2 Input: |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
#fsdb absdiff |
179
|
|
|
|
|
|
|
0 |
180
|
|
|
|
|
|
|
0.046953 |
181
|
|
|
|
|
|
|
0.072074 |
182
|
|
|
|
|
|
|
0.075413 |
183
|
|
|
|
|
|
|
0.094088 |
184
|
|
|
|
|
|
|
0.096602 |
185
|
|
|
|
|
|
|
# | /home/johnh/BIN/DB/dbrow |
186
|
|
|
|
|
|
|
# | /home/johnh/BIN/DB/dbcol event clock |
187
|
|
|
|
|
|
|
# | dbrowdiff clock |
188
|
|
|
|
|
|
|
# | /home/johnh/BIN/DB/dbcol absdiff |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 Command: |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
cat data.fsdb | dbcolstats absdiff |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head2 Output: |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
#fsdb mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n |
197
|
|
|
|
|
|
|
0.064188 0.036194 56.387 0.037989 0.026199 0.10218 0.95 0.38513 0.031271 0 0.096602 6 |
198
|
|
|
|
|
|
|
# | /home/johnh/BIN/DB/dbrow |
199
|
|
|
|
|
|
|
# | /home/johnh/BIN/DB/dbcol event clock |
200
|
|
|
|
|
|
|
# | dbrowdiff clock |
201
|
|
|
|
|
|
|
# | /home/johnh/BIN/DB/dbcol absdiff |
202
|
|
|
|
|
|
|
# | dbcolstats absdiff |
203
|
|
|
|
|
|
|
# 0.95 confidence intervals assume normal distribution and small n. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head1 SEE ALSO |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
L<dbmultistats(1)>, handles multiple experiments in a single file. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
L<dblistize(1)>, to pretty-print the output of dbcolstats. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
L<dbcolpercentile(1)>, to compute an even more general version of median/quantiles. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
L<dbcolstatscores(1)>, to compute z-scores or t-scores for each row. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
L<dbrvstatdiff(1)>, to see if two sample populations are statistically different. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
L<Fsdb(3)>. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=head1 BUGS |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
The algorithms used to compute variance have not been |
222
|
|
|
|
|
|
|
audited to check for numerical stability. |
223
|
|
|
|
|
|
|
(See F).) |
224
|
|
|
|
|
|
|
Variance may be incorrect when standard deviation |
225
|
|
|
|
|
|
|
is small relative to the mean. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
The field C<conf_pct> implies percentage, but it's actually |
228
|
|
|
|
|
|
|
reported as a fraction (0.95 means 95%). |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Because of limits of floating point, statistics on numbers of |
231
|
|
|
|
|
|
|
widely different scales may be incorrect. |
232
|
|
|
|
|
|
|
See the test cases F for examples. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=head1 CLASS FUNCTIONS |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=cut |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
@ISA = qw(Fsdb::Filter); |
240
|
|
|
|
|
|
|
($VERSION) = 2.0; |
241
|
|
|
|
|
|
|
|
242
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
243
|
1
|
|
|
1
|
|
5
|
use Pod::Usage; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
72
|
|
244
|
|
|
|
|
|
|
|
245
|
1
|
|
|
1
|
|
6
|
use Fsdb::IO::Reader; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
17
|
|
246
|
1
|
|
|
1
|
|
4
|
use Fsdb::IO::Writer; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
14
|
|
247
|
1
|
|
|
1
|
|
4
|
use Fsdb::Filter; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
248
|
1
|
|
|
1
|
|
5
|
use Fsdb::Filter::dbpipeline qw(dbpipeline_sink dbsort); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
249
|
1
|
|
|
1
|
|
4
|
use Fsdb::Support qw($is_numeric_regexp); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
69
|
|
250
|
1
|
|
|
1
|
|
5
|
use Fsdb::Support::TDistribution qw(t_distribution); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
42
|
|
251
|
1
|
|
|
1
|
|
5
|
use Fsdb::Support::NamedTmpfile; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1700
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=head2 new |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
$filter = new Fsdb::Filter::dbcolstats(@arguments); |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
Create a new dbcolstats object, taking command-line arguments. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=cut |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
# Constructor.  Lets the parent class build the object from the caller's
# arguments, re-blesses it into the requested class, installs this
# filter's defaults, parses the same arguments as command-line options,
# and finally runs the parent's post-construction hook.
sub new($@) {
    my $class = shift;

    my $self = $class->SUPER::new(@_);
    bless($self, $class);

    $self->set_defaults();
    $self->parse_options(@_);
    $self->SUPER::post_new();

    return $self;
}
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head2 set_defaults |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
$filter->set_defaults(); |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Internal: set up defaults. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Install default option values on the object: first the parent class
# defaults, then this filter's own settings, then the default temporary
# directory (via set_default_tmpdir).
sub set_defaults($) {
    my $self = shift;

    $self->SUPER::set_defaults();

    # Option slot => initial value, applied in order.
    my @initial_settings = (
        _target_column       => undef,
        _confidence_fraction => 0.95,
        _format              => "%.5g",
        _quantile            => undef,
        _median              => undef,    # special case: renames the output field
        _sample              => 1,
        _pre_sorted          => 0,
        _include_non_numeric => undef,
        _fscode              => undef,
        _max_parallelism     => undef,
        _key_column          => undef,
        _output_on_no_input  => undef,
    );
    while (my ($slot, $value) = splice(@initial_settings, 0, 2)) {
        $self->{$slot} = $value;
    }

    $self->set_default_tmpdir;
}
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=head2 parse_options |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$filter->parse_options(@ARGV); |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Internal: parse command-line arguments. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# Parse command-line style arguments into the object's option slots.
# Invalid options end in pod2usage(2).  Whatever get_options leaves in
# the argument list is handed to parse_target_column.
sub parse_options($@) {
    my ($self, @argv) = @_;

    # Option specification => destination (slot reference or handler).
    my @option_table = (
        'help|?' => sub { pod2usage(1); },
        'man' => sub { pod2usage(-verbose => 2); },
        'a|include-non-numeric!' => \$self->{_include_non_numeric},
        'autorun!' => \$self->{_autorun},
        'close!' => \$self->{_close},
        'c|confidence=f' => \$self->{_confidence_fraction},
        'd|debug+' => \$self->{_debug},
        'f|format=s' => \$self->{_format},
        'F|fs|cs|fieldseparator|columnseparator=s' => \$self->{_fscode},
        'i|input=s' => sub { $self->parse_io_option('input', @_); },
        'j|parallelism=i' => \$self->{_max_parallelism},
        'k|key=s' => \$self->{_key_column},
        'log!' => \$self->{_logprog},
        'm|median!' => \$self->{_median},
        'o|output=s' => sub { $self->parse_io_option('output', @_); },
        'output-on-no-input!' => \$self->{_output_on_no_input},
        'q|quantile=i' => \$self->{_quantile},
        's|sample!' => \$self->{_sample},
        'S|pre-sorted+' => \$self->{_pre_sorted},
        'T|tmpdir|tempdir=s' => \$self->{_tmpdir},
        'saveoutput=s' => \$self->{_save_output},
    );

    my $parsed_ok = $self->get_options(\@argv, @option_table);
    pod2usage(2) unless $parsed_ok;

    $self->parse_target_column(\@argv);
}
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=head2 setup |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
$filter->setup(); |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
Internal: setup, parse headers. |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=cut |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
# Prepare the filter for running:
#   * require a target column (else print usage and exit);
#   * open the input and resolve the target (and optional key) column
#     names to column indexes, failing if either is absent;
#   * build the output schema (the fixed statistics columns, plus
#     "median" and q1..q(N-1) when requested, plus the key column first
#     when grouping) and open the output;
#   * when median/quantiles are requested, allocate a temporary file and
#     a writer for the saved data, routed through dbsort unless the
#     caller declared the data pre-sorted.
sub setup($) {
    my($self) = @_;

    # The visible use-list of this file does not import Carp, so load it
    # here and call croak fully qualified; this works whether or not
    # croak happens to be available in this package.
    require Carp;

    pod2usage(2) if (!defined($self->{_target_column}));

    print STDERR "dbcolstats: pre-input setup\n" if ($self->{_debug} > 2);
    $self->finish_io_option('input', -comment_handler => $self->create_delay_comments_sub);
    print STDERR "dbcolstats: post-input setup\n" if ($self->{_debug} > 2);

    $self->{_target_coli} = $self->{_in}->col_to_i($self->{_target_column});
    Carp::croak($self->{_prog} . ": target column " . $self->{_target_column} . " is not in input stream.\n")
        if (!defined($self->{_target_coli}));

    $self->{_key_coli} = undef;
    if (defined($self->{_key_column})) {
        $self->{_key_coli} = $self->{_in}->col_to_i($self->{_key_column});
        Carp::croak($self->{_prog} . ": key column " . $self->{_key_column} . " is not in input stream.\n")
            if (!defined($self->{_key_coli}));
    };

    my $read_fastpath_sub = $self->{_in}->fastpath_sub();
    $self->{_read_fastpath_sub} = $read_fastpath_sub;

    # Output schema.
    my(@headers) = (qw(mean stddev pct_rsd conf_range conf_low conf_high
                       conf_pct sum sum_squared min max n));
    push(@headers, "median") if ($self->{_median});
    if ($self->{_quantile}) {
        foreach (1..($self->{_quantile}-1)) {
            push(@headers, "q$_");
        };
    };
    unshift(@headers, $self->{_key_column}) if (defined($self->{_key_column}));

    print STDERR "dbcolstats: pre-output setup\n" if ($self->{_debug} > 2);
    my @output_options = (-cols => \@headers);
    unshift (@output_options, -fscode => $self->{_fscode})
        if (defined($self->{_fscode}));
    $self->finish_io_option('output', @output_options);
    print STDERR "dbcolstats: post-output setup\n" if ($self->{_debug} > 2);

    if ($self->{_quantile} || $self->{_median}) {
        Carp::croak($self->{_prog} . ": cannot currently do median or quantile with a key column\n")
            if (defined($self->{_key_column}));
        $self->{_save_out_filename} = Fsdb::Support::NamedTmpfile::alloc($self->{_tmpdir});
        # sorting needed?
        my $save_out;
        my(@writer_args) = (-cols => [qw(data)]);
        print STDERR "dbcolstats: pre-saveoutput setup\n" if ($self->{_debug} > 2);
        if (!$self->{_pre_sorted}) {
            # yes: feed the saved values through dbsort into the temp file
            my $sorter_fred;
            my(@dbsort_args) = qw(-n data);
            push (@dbsort_args, '--parallelism', $self->{_max_parallelism})
                if (defined($self->{_max_parallelism}));
            print STDERR "dbcolstats: doing sorter thread\n" if ($self->{_debug} > 2);
            ($save_out, $sorter_fred) = dbpipeline_sink(\@writer_args,
                '--output' => $self->{_save_out_filename},
                dbsort(@dbsort_args));
            # kept so run_one_key can join the sorter before re-reading
            $self->{_sorter_fred} = $sorter_fred;
        } else {
            # no, just write it ourselves
            $save_out = Fsdb::IO::Writer->new('-file' => $self->{_save_out_filename}, @writer_args);
        };
        $self->{_save_out} = $save_out;
        print STDERR "dbcolstats: post-saveoutput setup\n" if ($self->{_debug} > 2);
    } else {
        print STDERR "dbcolstats: no saveoutput needed\n" if ($self->{_debug} > 2);
        $self->{_save_out} = undef;
    };
}
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=head2 _round_up |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
$i = _round_up($x); |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
Internal: Round up to the next integer. |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=cut |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
# Return the smallest integer that is not less than the argument.
# int() truncates toward zero, so only a value strictly greater than
# its truncation needs to be bumped by one.
sub _round_up($) {
    my $value = shift;
    my $truncated = int($value);

    return $truncated + 1 if ($value > $truncated);
    return $truncated;
}
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=head2 _compute_quantile |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
($median, $quantile_aref) = _compute_quantile($n, $mean); |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Internal: Compute quantile from the saved data. |
432
|
|
|
|
|
|
|
Not generalizable. |
433
|
|
|
|
|
|
|
We assume the saved output is closed before we enter. |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=cut |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
# Compute the median and the quantile boundaries from the saved data.
#
# Arguments: the filter object, the number of values ($n), and their
# mean ($mean).
# Returns: an empty list when neither median nor quantiles were
# requested; otherwise ($median, \@q).  On the normal path @q is
# 1-based (element 0 is a filler) so $q[$k] is the k-th boundary.
# For $n <= 1 every entry, and the median, is simply $mean.
#
# The saved-data file must be closed (and sorted) before this is
# called.  Dies if the file cannot be re-read or if the values turn
# out not to be in ascending order.
sub _compute_quantile($$$) {
    my ($self, $n, $mean) = @_;

    return if (!($self->{_quantile} || $self->{_median}));

    # Median alone is the N=2 quantile.  A zero quantile count together
    # with --median is treated as "no quantiles requested", which avoids
    # a division by zero below.
    my $effective_quantile = $self->{_quantile} || 2;

    my $median;
    my @q;
    if ($n <= 1) {
        $median = $mean;
        push(@q, ($mean) x $effective_quantile);
        return ($median, \@q);
    };

    my $save_in = Fsdb::IO::Reader->new(-file => $self->{_save_out_filename});
    $save_in->error && die $self->{_prog} . ": re-read error " . $save_in->error;

    # To handle the ugly case of having more ntiles than
    # data, we detect it and replicate the data until we have more
    # replicated_data than ntiles.
    my($replicate_data) = ($n >= $effective_quantile+1) ? 1 : _round_up(($effective_quantile+1.0)/$n);
    my($replicated_n) = $n * $replicate_data;

    # Also note that the array of quantiles and the number of
    # data elements read are both 1-based and not 0-based like
    # most perl stuff.  This is to make the math easier.
    my $median_i = _round_up($replicated_n / 2);
    my $ntile_frac = ($replicated_n + 0.0) / ($effective_quantile + 0.0);
    my($x, $last_x, $next_q_i);
    @q = (0);   # note that q is primed with 0 (to fill that zero element)
    my($replicates_left) = 0;
    my($i);   # note that i counts from 1!
    for ($i = 1; $#q+1 < $effective_quantile; $i++) {
        if (--$replicates_left <= 0) {
            # current value is used up: fetch the next one from the file
            my $fref = $save_in->read_rowobj;
            die "internal error re-reading data\n" if (ref($fref) ne 'ARRAY');
            $x = $fref->[0];
            $replicates_left = $replicate_data;
            # Verify sorted order (in case the user lied to us
            # about pre-sorting).
            if (defined($last_x) && $x < $last_x) {
                # _pre_sorted set means the user vouched for the order with -S;
                # otherwise the data came through our own dbsort.
                my($info) = ($self->{_pre_sorted} ? " (user specified -S for pre-sorted data but it is unsorted)" : " (internal error in dbsort)");
                die $self->{_prog} . ": cannot process data that is out of order between $last_x and $x $info.\n";
            };
            $last_x = $x;
        };
        if ($i == $median_i) { $median = $x; };
        # position (1-based) at which the next quantile boundary falls
        $next_q_i = (_round_up($ntile_frac * ($#q + 1.0) )) if (!defined($next_q_i));
        # print "d: q=$#q nq=$next_q_i i=$i\n";
        if ($i == $next_q_i) { push(@q, $x); $next_q_i = undef; };
    };
    return ($median, \@q);
};
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head2 run_one_key |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
$filter->run_one_key(); |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
Internal: run over each row, for a given key. |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=cut |
500
|
|
|
|
|
|
|
# Consume input rows for one key group (or for the whole input when no
# key column is in use), accumulate count / sum / sum of squares /
# min / max for the target column, and write one row of statistics.
#
# In key mode, the first row whose key differs from the current group
# ends the loop; its key and value are parked in $self->{_holdover_key}
# and $self->{_holdover_data} so the next call can start from them.
#
# The per-row loop is assembled as a string and eval'ed so that the
# column indexes and the enabled options are baked in as constants.
# The generated code refers to the lexicals declared below by name, so
# those names must not be changed independently of the code strings.
sub run_one_key($) {
    my($self) = @_;

    print STDERR "dbcolstats: starting run\n" if ($self->{_debug} > 2);

    # xxx: should eval all this to factor out constants from runtime
    my($xf) = $self->{_target_coli};
    my($key_column) = $self->{_key_column};

    my($n) = 0;
    my($sx) = 0;
    my($sxx) = 0;
    my $min;
    my $max;
    my $key;
    # pick up the row (if any) that ended the previous key group
    my $last_key = $self->{_holdover_key};
    my $holdover_data = $self->{_holdover_data};
    $self->{_holdover_key} = $self->{_holdover_data} = undef;

    my $fref;
    my $x;

    {
        my $save_out = $self->{_save_out};
        my $read_fastpath_sub = $self->{_read_fastpath_sub};

        my $code = q'
            while (1) {
                if (defined($holdover_data)) {
                    $x = $holdover_data;  # and key was set earlier
                    $holdover_data = undef;
                } else {
                    $fref = &{$read_fastpath_sub}();
                    last if (!defined($fref));
                    $x = $fref->[' . $xf . q'];
            ';
        if (defined($self->{_key_column})) {
            $code .= q'
                    $key = $fref->[' . $self->{_key_coli} . '];
                    if (!defined($last_key)) {
                        $last_key = $key;
                    } elsif ($key ne $last_key) {
                        $self->{_holdover_key} = $key;
                        $self->{_holdover_data} = $x;
                        last;
                    };
            ';
        };
        $code .= q'
                };
            ';

        # unless -a was given, skip values that do not look numeric
        $code .= 'next if ($x !~ /' . $is_numeric_regexp . "/);\n"
            if (!$self->{_include_non_numeric});
        $code .= q'
                $x += 0.0;  # force numeric
                $n++;
                $sx += $x;
                $sxx += $x * $x;
            ';
        $code .= 'print STDERR "dbcolstats: save-out write\n";' . "\n" if ($self->{_debug} > 2);

        if ($self->{_quantile} || $self->{_median}) {
            # note that as of perl-5.14 we must force numeric or perl truncates floats to ints :-(
            $code .= q'
                my(@row);
                $row[0] = $x + 0;  # force numeric, as guaranteed by above
                $save_out->write_rowobj(\@row);
            ';
        };
        $code .= 'print STDERR "dbcolstats: post save-out write\n";' . "\n" if ($self->{_debug} > 2);
        $code .= q'
                if (!defined($min)) {
                    $min = $max = $x;
                } else {
                    $min = $x if ($x < $min);
                    $max = $x if ($x > $max);
                };
            };';

        # run it
        print STDERR "dbcolstats: eval'ing code\n" if ($self->{_debug});
        # Debug dump goes to STDERR like every other debug message, so it
        # cannot end up mixed into data written on standard output.
        print STDERR $code if ($self->{_debug});
        eval $code;
        $@ and die $self->{_prog} . ": internal error in eval.: $@\n";

        # clean up
        if ($self->{_quantile} || $self->{_median}) {
            print STDERR "dbcolstats: closing save-out\n" if ($self->{_debug} > 2);
            $self->{_save_out}->close;
            print STDERR "dbcolstats: post closing save-out\n" if ($self->{_debug} > 2);
        };
    }

    #
    # Make sure we cleaned up before we do any computation.
    #
    if (defined($self->{_sorter_fred})) {
        # let sorting finish
        print STDERR "dbcolstats: join on sorter thread\n" if ($self->{_debug} > 2);
        $self->{_sorter_fred}->join();
        $self->{_sorter_fred} = undef;
        print STDERR "dbcolstats: post join on sorter thread\n" if ($self->{_debug} > 2);
    };

    #
    # Compute stats.
    #
    my $mean = ($n == 0 ? "-" : $sx / $n);
    # stddev = s, not s^2, approximates omega
    # Check for special cases:
    #   $n <= 1 => divide by zero
    #   all same data value => can sometimes get very small or negative
    #       stddev (due to rounding error)
    # for these cases, $stddev = 0
    my $stddev;
    if ($n == 0) {
        $stddev = "-";
    } elsif ($n <= 1 || $max == $min) {
        $stddev = 0;
    } else {
        my $variance = ($sxx - $n * $mean * $mean) / ($n - ($self->{_sample} ? 1 : 0));
        # Rounding error can push the variance slightly below zero even
        # when the values are not all identical; sqrt() would die on
        # that, so report 0 instead.
        $stddev = ($variance > 0) ? sqrt($variance) : 0;
    };
    my $pct_rsd;
    if ($stddev eq '-' || $mean eq '-' || $mean == 0) {
        $pct_rsd = "-";
    } else {
        $pct_rsd = ($stddev / $mean) * 100;
    };
    #
    # Confidence intervals from "Probability and Statistics for Engineers",
    # Second Edition, 1986, Scheaffer and McClave, p. 242.
    #
    my $conf_half;
    if ($n <= 1) {
        $conf_half = "-";
    } else {
        my $conf_alpha = (1.0 - $self->{_confidence_fraction}) / 2.0;
        $conf_half = t_distribution($n - 1, $conf_alpha) * $stddev / sqrt($n);
    };
    my $conf_low = ($conf_half eq '-' ? '-' : $mean - $conf_half);
    my $conf_high = ($conf_half eq '-' ? '-' : $mean + $conf_half);

    #
    # Compute median/quantile.
    #
    my($median, $q_aref) = $self->_compute_quantile($n, $mean);

    #
    # Output the results.
    #
    # xxx: bug work-around: the +0s on conf_pct, min, max are
    # because perl-5.14.2-191.fc16.x86_64
    # truncates the floating-point portion of these values otherwise.
    #
    my %out_hash = (
        mean => $self->numeric_formatting($mean),
        stddev => $self->numeric_formatting($stddev),
        pct_rsd => $self->numeric_formatting($pct_rsd),
        conf_range => $self->numeric_formatting($conf_half),
        conf_low => $self->numeric_formatting($conf_low),
        conf_high => $self->numeric_formatting($conf_high),
        conf_pct => $self->{_confidence_fraction} + 0,
        sum => $self->numeric_formatting($sx),
        sum_squared => $self->numeric_formatting($sxx),
        min => (!defined($min) || $min eq '-' ? $min : $min + 0),
        max => (!defined($max) || $max eq '-' ? $max : $max + 0),
        n => $n,
    );
    # my $bug_workaround = "xxx: conf_pct : $out_hash{conf_pct}\n";
    $out_hash{median} = $median if ($self->{_median});
    if ($self->{_quantile}) {
        foreach (1..($self->{_quantile}-1)) {
            $out_hash{"q$_"} = $q_aref->[$_];
        };
    };
    if (defined($key_column)) {
        $out_hash{$key_column} = $last_key;
    };

    # A row is written for any non-empty group; an empty input produces
    # a row only when --output-on-no-input was given.
    if ($n > 0 || ($n == 0 && $self->{_output_on_no_input})) {
        $self->{_out}->write_row_from_href(\%out_hash);
    };
}
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=head2 run |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
$filter->run(); |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
Internal: run over each row, for one or many keys. |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
=cut |
691
|
|
|
|
|
|
|
# Process the whole input: clear any held-over row state, then call
# run_one_key repeatedly.  run_one_key leaves $self->{_holdover_key}
# defined when it stopped at the first row of another key group, so
# the loop continues until it comes back undefined.
sub run($) {
    my($self) = @_;

    # Both holdover slots must start empty.  (The second slot was
    # previously misspelled, which left _holdover_data untouched.)
    $self->{_holdover_key} = $self->{_holdover_data} = undef;

    do {
        $self->run_one_key();
    } while (defined($self->{_holdover_key}));
}
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=head1 AUTHOR and COPYRIGHT |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
Copyright (C) 1991-2015 by John Heidemann |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
This program is distributed under terms of the GNU general |
705
|
|
|
|
|
|
|
public license, version 2. See the file COPYING |
706
|
|
|
|
|
|
|
with the distribution for details. |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=cut |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
1; |