File Coverage

blib/lib/Math/LiveStats.pm
Criterion Covered Total %
statement 15 194 7.7
branch 1 84 1.1
condition 0 25 0.0
subroutine 3 14 21.4
pod 9 9 100.0
total 28 326 8.5


line stmt bran cond sub pod time code
1             package Math::LiveStats;
2              
3 1     1   114519 use strict;
  1         2  
  1         44  
4 1     1   4 use warnings;
  1         3  
  1         2049  
5              
6             # perl -MPod::Markdown -e 'Pod::Markdown->new->filter(@ARGV)' lib/Math/LiveStats.pm > README.md
7              
8             =head1 NAME
9              
10             Math::LiveStats - Pure perl module to make mean, standard deviation, vwap, and p-values available for one or more window sizes in streaming data
11              
12             =head1 SYNOPSIS
13              
14              
15             #!/usr/bin/perl -w
16            
17             use Math::LiveStats;
18            
19             # Create a new Math::LiveStats object with window sizes of 60 and 300 seconds
20             my $stats = Math::LiveStats->new(60, 300); # doesn't have to be "time" or "seconds" - could be any series base you want
21            
22             # Add time-series data points (timestamp, value, volume) # use volume=0 if you don't use/need vwap
23             $stats->add(1000, 50, 5);
24             $stats->add(1060, 55, 10);
25             $stats->add(1120, 53, 5);
26            
27             # Get mean and standard deviation for a window size
28             my $mean_60 = $stats->mean(60);
29             my $stddev_60 = $stats->stddev(60); # of the mean
30             my $vwap_60 = $stats->vwap(60);
31             my $vwapdev_60 = $stats->vwapdev(60); # stddev of the vwap
32            
33             # Get the p-value for a window size
34             my $pvalue_60 = $stats->pvalue(60);
35            
36             # Get the number of entries in a window
37             my $n_60 = $stats->n(60);
38            
39             # Recalculate statistics to reduce accumulated errors
40             $stats->recalc(60);
41              
42             =head1 CLI one-liner example
43              
44             cat data | perl -MMath::LiveStats -ne 'BEGIN{$s=Math::LiveStats->new(20);} chomp;($t,$p,$v)=split(/,/); $s->add($t,$p,$v); print "$t,$p,$v,",$s->n(20),",",$s->mean(20),",",$s->stddev(20),",",$s->vwap(20),",",$s->vwapdev(20),"\n"'
45              
46             =head1 DESCRIPTION
47              
48             Math::LiveStats provides live statistical calculations (mean, standard deviation, p-value,
49             volume-weighted-average-price and stddev vwap) over multiple window sizes for streaming
50             data. It uses West's algorithm for efficient updates and supports synthetic boundary
51             entries to maintain consistent results.
52              
53             Stats are computed based on data that exists inside the given window size, plus possibly
54             one (at most) synthetic entry: when old data shuffles out of the window, if there's no
55             data exactly on the oldest boundary of the window, one synthetic value is assumed to be
56             there, which is linearly-interpolated from the entries that appeared logically either side.
57              
58             =head1 METHODS
59              
60              
61             =cut
62              
63             require Exporter;
64              
65             our @ISA = qw(Exporter);
66             our($VERSION)='1.02';
67             our($UntarError) = '';
68              
69             our %EXPORT_TAGS = ( 'all' => [ qw( ) ] );
70              
71             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
72              
73             our @EXPORT = qw( );
74              
75              
76              
77             =head2 new(@window_sizes)
78              
79             Creates a new Math::LiveStats object with the specified window sizes.
80              
81             =cut
82              
83             sub new {
84 5     5 1 135247 my ($class, @window_sizes) = @_;
85 5 50       9 die "At least one window size must be provided" unless @window_sizes;
86              
87             # Ensure window sizes are positive integers and sort them
88 5         7 @window_sizes = sort { $a <=> $b } grep { $_ > 0 } @window_sizes;
  6         10  
  9         17  
89              
90 5         10 my $self = {
91             window_sizes => \@window_sizes,
92             data => [],
93             stats => {},
94             };
95              
96             # Initialize stats for each window size
97 5         10 foreach my $window (@window_sizes) {
98 9         41 $self->{stats}{$window} = {
99             n => 0,
100             mean => 0,
101             M2 => 0,
102             cpv => 0, # Cumulative_Price_Volume
103             cv => 0, # Cumulative_Volume
104             vM2 => 0, # M2 of the vwap
105             vmean => 0, # for vwapdev
106             synthetic => undef, # To store synthetic entry if needed
107             start_index => 0,
108             };
109             }
110              
111 5         11 return bless $self, $class;
112             } # new
113              
114              
115             =head2 add($timestamp, $value [,$volume])
116              
117             Adds a new data point to the time-series and updates statistics.
118              
119             =cut
120              
121             sub add {
122 0     0 1   my ($self, $timestamp, $value, $volume) = @_;
123              
124 0 0 0       die "series key (e.g. timestamp) and value must be defined" unless defined $timestamp && defined $value;
125 0 0 0       die "duplicated $timestamp" if @{ $self->{data} } && $self->{data}[-1]{timestamp}==$timestamp;
  0            
126              
127 0           my $largest_window = $self->{window_sizes}[-1];
128 0           my $window_start = $timestamp - $largest_window;
129 0           my $inserted_synthetic=0;
130 0 0         $volume=0 unless($volume);
131              
132              
133             # Actually append the new data point to the end of our array
134 0           push @{ $self->{data} }, { timestamp => $timestamp, value => $value, volume => $volume };
  0            
135              
136             # Update stats for our largest_window with the new data point
137 0           $self->_add_point({ timestamp => $timestamp, value => $value, volume => $volume}, $largest_window);
138              
139             # de-accumulate now-old data from non-largest window sizes
140 0           foreach my $window (@{ $self->{window_sizes} }[0 .. $#{ $self->{window_sizes} } - 1]) { # do all, except the last (i.e. not the $largest_window)
  0            
  0            
141 0           my $stats = $self->{stats}{$window};
142              
143             # Remove previous synthetic point if it exists (adding new data always means that any synthetic must be removed)
144 0 0         if ($stats->{synthetic}) {
145 0           my $synthetic_point = $stats->{synthetic};
146 0           $self->_remove_point($synthetic_point, $window);
147 0           $stats->{synthetic} = undef;
148             }
149              
150 0           my $window_start = $timestamp - $window; # Caution; this "$window_start" is for the smaller window, not the outer-scope $window_start which is for the $largest_window
151 0   0       while (@{ $self->{data} } && $self->{data}[$stats->{start_index}]{timestamp} < $window_start) {
  0            
152 0           $self->_remove_point($self->{data}[$stats->{start_index}], $window); # de-accumulate this old data
153 0           $stats->{start_index}++;
154             }
155             }
156              
157              
158             # Remove (both de-accumulate, as well as physically remove from the start of the array) data points outside the largest window size
159 0           my $last_removed_point;
160 0           my $removed_count = 0; # Keep track of the number of removed points
161              
162 0   0       while (@{ $self->{data} } && $self->{data}[0]{timestamp} < $window_start) {
  0            
163 0           $last_removed_point = shift @{ $self->{data} };
  0            
164 0           $self->_remove_point($last_removed_point, $largest_window);
165 0           $removed_count++;
166             }
167              
168              
169              
170             # Check if a synthetic entry is needed to be physically inserted into the data as the start of the largest window
171 0           my $oldest_point = $self->{data}[0];
172 0 0         if ($oldest_point->{timestamp} > $window_start) {
173              
174             # Need to insert synthetic entry at window_start
175 0           my $synthetic_timestamp = $window_start;
176              
177             # Determine value for synthetic point
178 0           my($synthetic_value, $synthetic_volume);
179              
180 0 0         if ($last_removed_point) {
181             # Interpolate between last_removed_point and oldest_point
182 0           ($synthetic_value, $synthetic_volume) = $self->_interpolate( $last_removed_point, $oldest_point, $synthetic_timestamp );
183             } else {
184             # Initial add, use value of the oldest_point
185 0           $synthetic_value = $oldest_point->{value};
186 0           $synthetic_volume = $oldest_point->{volume};
187             }
188              
189             # Create synthetic point
190 0           my $synthetic_point = {
191             timestamp => $synthetic_timestamp,
192             value => $synthetic_value,
193             volume => $synthetic_volume,
194             # is this needed? synthetic => 1, # Mark as synthetic
195             };
196              
197             # Insert synthetic point at the beginning of data list
198 0           unshift @{ $self->{data} }, $synthetic_point;
  0            
199 0           $removed_count--;
200              
201             # Update stats with the synthetic point
202 0           $self->_add_point($synthetic_point, $largest_window);
203             }
204              
205              
206             # Now accumulate the new point for the other window sizes, and work out their synthetic entries as well if required
207 0           foreach my $window (@{ $self->{window_sizes} }[0 .. $#{ $self->{window_sizes} } - 1]) { # all except largest_window
  0            
  0            
208 0           my $window_start = $timestamp - $window;
209 0           my $stats = $self->{stats}{$window};
210              
211             # Update stats with the new data point for this window
212 0           $self->_add_point({ timestamp => $timestamp, value => $value, volume => $volume }, $window);
213              
214 0 0         if($removed_count!=0) { # might be negative if we already physically inserted a synthetic point
215             # Decrement start_index by the number of removed elements
216 0           $stats->{start_index} -= $removed_count;
217             # Ensure start_index doesn't go below zero
218 0 0         $stats->{start_index} = 0 if $stats->{start_index} < 0;
219             }
220              
221             # clear dust if our window has only 1 entry now
222 0 0         $self->recalc($window) if($stats->{n}==1);
223              
224              
225             # Check if a synthetic entry is needed at the start of this window
226 0   0       my $oldest_in_window = $self->{data}[ $stats->{start_index} ] || undef;
227              
228 0 0 0       if (!$oldest_in_window || $oldest_in_window->{timestamp} > $window_start) { # needs synthetic
229             # Need to insert synthetic point
230 0           my $synthetic_timestamp = $window_start;
231 0           my($synthetic_value, $synthetic_volume);
232              
233 0           my $before_index = $stats->{start_index} - 1;
234 0 0         my $before = $before_index >= 0 ? $self->{data}[ $before_index ] : undef;
235 0           my $after = $oldest_in_window;
236              
237 0 0 0       if ($before && $after) {
    0          
238             # Interpolate between before and after
239 0           ($synthetic_value, $synthetic_volume) = $self->_interpolate($before, $after, $synthetic_timestamp);
240             } elsif ($after) {
241             # Use the value of the after point
242 0           $synthetic_value = $after->{value};
243 0           $synthetic_volume = $after->{volume};
244             } else {
245             # Use the current value (since there's no data in window)
246 0           $synthetic_value = $value;
247 0           $synthetic_volume = $volume;
248             }
249              
250             # Create synthetic point
251 0           my $synthetic_point = {
252             timestamp => $synthetic_timestamp,
253             value => $synthetic_value,
254             volume => $synthetic_volume,
255             # not used: synthetic => 1,
256             };
257              
258             # Update stats with the synthetic point
259 0           $self->_add_point($synthetic_point, $window);
260              
261             # Store synthetic point in stats
262 0           $stats->{synthetic} = $synthetic_point;
263             }
264              
265             }
266              
267             } # add
268              
269              
270             =head2 mean($window_size)
271              
272             Returns the mean for the specified window size.
273              
274             =cut
275              
276             sub mean {
277 0     0 1   my ($self, $window) = @_;
278 0 0         die "Window size must be specified" unless defined $window;
279 0 0         die "Invalid window size" unless exists $self->{stats}{$window};
280              
281 0           return $self->{stats}{$window}{mean};
282             }
283              
284             =head2 stddev($window_size)
285              
286             Returns the standard deviation of the values for the specified window size.
287              
288             =cut
289              
290             sub stddev {
291 0     0 1   my ($self, $window) = @_;
292 0 0         die "Window size must be specified" unless defined $window;
293 0 0         die "Invalid window size" unless exists $self->{stats}{$window};
294              
295 0           my $n = $self->{stats}{$window}{n};
296 0           my $M2 = $self->{stats}{$window}{M2};
297 0 0         my $variance = $n > 1 ? $M2 / ($n - 1) : 0;
298 0 0         return $variance<0? 0: sqrt($variance);
299             }
300              
301             =head2 pvalue($window_size)
302              
303             Calculates the p-value based on the standard deviation for the specified window size.
304              
305             =cut
306              
307             sub pvalue {
308 0     0 1   my ($self, $window) = @_;
309 0           my $stddev = $self->stddev($window);
310              
311             # Ensure standard deviation is defined
312 0 0         return undef unless defined $stddev;
313              
314             # Absolute value of z-score
315 0           my $z = abs($stddev);
316              
317             # Constants for the approximation
318 0           my $b1 = 0.319381530;
319 0           my $b2 = -0.356563782;
320 0           my $b3 = 1.781477937;
321 0           my $b4 = -1.821255978;
322 0           my $b5 = 1.330274429;
323 0           my $p = 0.2316419;
324 0           my $c = 0.39894228;
325              
326             # Compute t
327 0           my $t = 1 / (1 + $p * $z);
328              
329             # Compute the standard normal probability density function (PDF)
330 0           my $pdf = $c * exp(-0.5 * $z * $z);
331              
332             # Compute the cumulative distribution function (CDF) approximation
333 0           my $cdf = 1 - $pdf * (
334             $b1 * $t +
335             $b2 * $t**2 +
336             $b3 * $t**3 +
337             $b4 * $t**4 +
338             $b5 * $t**5
339             );
340              
341             # Two-tailed p-value
342 0           my $pvalue = 2 * (1 - $cdf);
343              
344             # Ensure p-value is between 0 and 1
345 0 0         $pvalue = 1 if $pvalue > 1;
346 0 0         $pvalue = 0 if $pvalue < 0;
347              
348 0           return $pvalue;
349             }
350              
351              
352             =head2 n($window_size)
353              
354             Returns the number of entries in the specified window size.
355              
356             =cut
357              
358             sub n {
359 0     0 1   my ($self, $window) = @_;
360 0 0 0       die "Invalid window size" unless defined $window && exists $self->{stats}{$window};
361 0           return $self->{stats}{$window}{n};
362             }
363              
364              
365             =head2 vwap($window_size)
366              
367             Returns the volume-weighted average price for the specified window size.
368              
369             =cut
370              
371             sub vwap {
372 0     0 1   my ($self, $window) = @_;
373 0 0         die "Window size must be specified" unless defined $window;
374 0 0         die "Invalid window size" unless exists $self->{stats}{$window};
375              
376 0 0         return $self->{stats}{$window}{cv} ? $self->{stats}{$window}{cpv}/$self->{stats}{$window}{cv} : undef;
377             } # vwap
378              
379              
380             =head2 vwapdev($window_size)
381              
382             Returns the standard deviation of the vwap for the specified window size.
383              
384             =cut
385              
386             sub vwapdev {
387 0     0 1   my ($self, $window) = @_;
388 0 0         die "Window size must be specified" unless defined $window;
389 0 0         die "Invalid window size" unless exists $self->{stats}{$window};
390              
391 0           my $cv = $self->{stats}{$window}{cv};
392 0           my $vM2 = $self->{stats}{$window}{vM2};
393 0 0         my $variance = $cv > 0 ? $vM2 / $cv : 0;
394 0 0         return $variance < 0 ? 0 : sqrt($variance);
395             } # vwapdev
396              
397             =head2 recalc($window_size)
398              
399             Recalculates the running statistics for the given window to reduce accumulated numerical errors.
400              
401             =cut
402              
403             sub recalc {
404 0     0 1   my ($self,$window) = @_;
405              
406 0           my $data = $self->{data};
407              
408             # Reset stats for given window size
409 0           my $stats = $self->{stats}{$window};
410 0           $stats->{n} = 0;
411 0           $stats->{mean} = 0;
412 0           $stats->{M2} = 0;
413 0           $stats->{cpv} = 0;
414 0           $stats->{cv} = 0;
415 0           $stats->{vM2} = 0;
416 0           $stats->{vmean} = 0;
417             # Retain existing synthetic entry if any
418             # $stats->{synthetic} remains unchanged
419             # Retain existing start_index so we can avoid having to search for the starting data
420             # $stats->{start_index} = 0; # Note that $stats->{start_index} is always 0 for our largest window size.
421              
422 0           my $window_start = $data->[-1]{timestamp} - $window;
423              
424             # Add synthetic point to stats if it exists
425 0 0         if ($stats->{synthetic}) {
426 0           my $synthetic_point = $stats->{synthetic};
427 0           $self->_add_point($synthetic_point, $window);
428             }
429              
430             # Add data points within the window to stats
431 0           for (my $i = $stats->{start_index}; $i < @$data; $i++) {
432 0           my $point = $data->[$i];
433 0           $self->_add_point($point, $window);
434             }
435             } # recalc
436              
437              
438             # Internal method to add a data point to stats for a specific window
439             sub _add_point {
440 0     0     my ($self, $point, $window) = @_;
441              
442 0           my $stats = $self->{stats}{$window};
443 0 0         my $w = $point->{volume} ? $point->{volume} : ($stats->{n} ? $stats->{cv} / $stats->{n} : 0);
    0          
444 0           $stats->{n}++;
445              
446             # Update mean and M2 as before
447 0           my $delta = $point->{value} - $stats->{mean};
448 0           $stats->{mean} += $delta / $stats->{n};
449 0           my $delta2 = $point->{value} - $stats->{mean};
450 0           $stats->{M2} += $delta * $delta2;
451              
452             # Update cumulative price*volume and cumulative volume
453 0           $stats->{cpv} += $point->{value} * $w;
454 0           $stats->{cv} += $w;
455              
456             # Update weighted mean (vmean) and weighted M2 (vM2)
457 0           my $sumw_prev = $stats->{cv} - $w;
458 0 0         if ($sumw_prev > 0) {
459 0           my $delta_w = $point->{value} - $stats->{vmean};
460 0           $stats->{vmean} += ($w / $stats->{cv}) * $delta_w;
461 0           $stats->{vM2} += $w * $delta_w * ($point->{value} - $stats->{vmean});
462             } else {
463             # First data point
464 0           $stats->{vmean} = $point->{value};
465 0           $stats->{vM2} = 0;
466             }
467             } # _add_point
468              
469              
470             # Internal method to remove a data point from stats for a specific window
471             sub _remove_point {
472 0     0     my ($self, $point, $window) = @_;
473              
474 0           my $stats = $self->{stats}{$window};
475 0 0         my $w = $point->{volume} ? $point->{volume} : ($stats->{n} ? $stats->{cv} / $stats->{n} : 0);
    0          
476 0           $stats->{n}--;
477 0 0         $stats->{n} = 0 if $stats->{n} < 0;
478              
479             # Update mean and M2
480 0           my $delta = $point->{value} - $stats->{mean};
481 0   0       $stats->{mean} -= $delta / ($stats->{n} || 1);
482 0           my $delta2 = $point->{value} - $stats->{mean};
483 0           $stats->{M2} -= $delta * $delta2;
484 0 0         $stats->{M2} = 0 if $stats->{M2} < 0; # Ensure M2 is not negative due to floating-point errors
485              
486             # Update cumulative price*volume and cumulative volume
487 0           $stats->{cpv} -= $point->{value} * $w;
488 0           $stats->{cv} -= $w;
489 0 0         $stats->{cv} = 0 if $stats->{cv} < 0;
490              
491             # Update weighted mean (vmean) and weighted M2 (vM2)
492 0           my $sumw_prev = $stats->{cv} + $w;
493 0 0         if ($stats->{cv} > 0) {
494 0           my $delta_w = $point->{value} - $stats->{vmean};
495 0           $stats->{vmean} -= ($w / $stats->{cv}) * $delta_w;
496 0           $stats->{vM2} -= $w * $delta_w * ($point->{value} - $stats->{vmean});
497 0 0         $stats->{vM2} = 0 if $stats->{vM2} < 0;
498             } else {
499             # No data points left
500 0           $stats->{vmean} = 0;
501 0           $stats->{vM2} = 0;
502             }
503             } # _remove_point
504              
505              
506             # Internal method to interpolate synthetic value
507             sub _interpolate {
508 0     0     my ($self, $before, $after, $time) = @_;
509              
510 0           my $t0 = $before->{timestamp};
511 0           my $t1 = $after->{timestamp};
512 0           my $p0 = $before->{value};
513 0           my $p1 = $after->{value};
514 0           my $v0 = $before->{volume};
515 0           my $v1 = $after->{volume};
516              
517 0           my $slopep = ($p1 - $p0) / ($t1 - $t0);
518 0           my $slopev = ($v1 - $v0) / ($t1 - $t0);
519 0           return ($p0 + $slopep * ($time - $t0), $v0 + $slopev * ($time - $t0));
520             } # _interpolate
521              
522              
523              
524             1; # End of Math::LiveStats
525              
526              
527             __END__