File Coverage

blib/lib/Date/RetentionPolicy.pm

Criterion	Covered	Total	%
statement	64	117	54.7
branch	21	46	45.6
condition	16	23	69.5
subroutine	10	12	83.3
pod	2	3	66.6
total	113	201	56.2

line	stmt	bran	cond	sub	pod	time	code
1							package Date::RetentionPolicy;
2							$Date::RetentionPolicy::VERSION = '0.01';
3	3			3		3343	use Moo;
	3					29733
	3					13
4	3			3		3821	use Scalar::Util 'looks_like_number';
	3					6
	3					130
5	3			3		2338	use DateTime;
	3					1283521
	3					123
6	3			3		1681	use DateTime::Format::Flexible;
	3					365781
	3					35
7
8							# ABSTRACT: Prune a list of dates down to the ones you want to keep
9
10
11							has retain => ( is => 'rw', required => 1 );
12							has time_zone => ( is => 'rw', default => sub { 'floating' } );
13							has reach_factor => ( is => 'rw', default => sub { .5 } );
14							has reference_date => ( is => 'rw' );
15							has auto_sync => ( is => 'rw' );
16
17							sub reference_date_or_default {
18	341			341	1	829	my $self= shift;
19							# Use override, else 'now' rounded up to next day boundary of timezone
20	341					1076	my $start= $self->reference_date;
21	341	50				1008	return $start->clone if ref $start;
22	341	50				1852	return $self->_coerce_date($start) if defined $start;
23	0					0	return DateTime->now(time_zone => $self->time_zone)
24							->add(days => 1, seconds => -1)->truncate(to => 'day');
25	0					0	return $start;
26							}
27
28
29							sub prune {
30	341			341	0	487977	my ($self, $list)= @_;
31	341					1431	my $processed= $self->_sort_and_mark_retention($list);
32							# Divide the elements into two lists. Make a set of which indexes
33							# we're keeping, then iterate the original list to preserve the caller's
34							# list order.
35	341					976	my (@retain, @prune);
36	341					22948	my %keep= map +($_->[1] => 1), grep $_->[2], @$processed;
37	21765	100				43224	push @{ $keep{$_}? \@retain : \@prune }, $list->[$_]
38	341					2950	for 0..$#$list;
39	341					1963	@$list= @retain;
40	341					6533	return \@prune;
41							}
42
43							sub _sort_and_mark_retention {
44	341			341		1007	my ($self, $list, $trace)= @_;
45							# Each list element needs to be a date object, (but preserve the original)
46							# and the list needs to be sorted in cronological order.
47	341					2726	my @sorted= sort { $a->[0] <=> $b->[0] }
	21425					32266
48							# tuple of [ Epoch, ListIndex, KeepBoolean ].
49							# A hash would be more readable but there could be a lot of these.
50							map [ $self->_coerce_to_epoch($list->[$_]), $_, 0 ],
51							0..$#$list;
52							# Never prune things newer than the reference date
53	341					2575	my $ref_date= $self->reference_date_or_default;
54	341		66			2396	for (my $i= $#sorted; $i >= 0 && $sorted[$i][0] > $ref_date->epoch; --$i) {
55	4636					31457	$sorted[$i][2]= 1;
56							}
57							# Set the boolean to true for each element that a rule wants to keep
58							$self->_mark_for_retention($ref_date, $_, \@sorted, $trace)
59	341					2292	for @{ $self->retain };
	341					1966
60	341					5912	return \@sorted;
61							}
62
63							sub _mark_for_retention {
64	1019			1019		11982	my ($self, $reference_date, $rule, $list, $trace)= @_;
65	1019					2291	my ($interval, $history, $reach_factor)= @{$rule}{'interval','history','reach_factor'};
	1019					3443
66	1019	50				4161	$reach_factor= $self->reach_factor unless defined $reach_factor;
67	1019					3841	my $next_date= $reference_date->clone->subtract(%$history)->add(%$interval);
68	1019					2105921	my $epoch= $next_date->epoch;
69	1019					8584	my $search_idx= 0;
70	1019					3500	my $next_epoch= $next_date->add(%$interval)->epoch;
71	1019					969377	my $radius= -($epoch - $next_epoch) * $reach_factor;
72	1019					2322	my $drift= 0; # only used for auto_sync
73	1019					1623	my $rule_key;
74
75							# The epoch variables track the current date interval, and the _idx
76							# variables track our position in the list.
77	1019		66			2745	while ($epoch-abs($drift) <= $reference_date->epoch && $search_idx < @$list) {
78	24256					197099	my $best;
79	24256		100			99736	for (my $i= $search_idx; $i < @$list and $list->[$i][0] < $epoch+$drift+$radius; ++$i) {
80	102115	100	100			288804	if ($list->[$i][0] >= $epoch+$drift-$radius
			100
81							and (!defined $best or abs($list->[$i][0] - ($epoch+$drift)) < abs($list->[$best][0] - ($epoch+$drift)))
82							) {
83	35970					52045	$best= $i;
84							}
85							# update the start_idx for next interval iteration
86	102115	100				374452	$search_idx= $i+1 if $list->[$i][0] < $next_epoch-$radius*2;
87							}
88	24256	100				49643	if (defined $best) {
89	20160					35213	$list->[$best][2]= 1; # mark as a keeper
90							# If option enabled, drift toward the time we found, so that gap between next
91							# is closer to $interval
92	20160	100				66169	$drift= $list->[$best][0] - $epoch
93							if $self->auto_sync;
94							}
95	24256	50				48262	if ($trace) {
96	0	0				0	$rule_key= join ',', map "$_=$interval->{$_}", keys %$interval
97							unless defined $rule_key;
98	0	0				0	if (!$trace->{$rule_key}) {
99	0					0	$trace->{$rule_key}{idx}= scalar keys %$trace;
100	0					0	$trace->{$rule_key}{radius}= $radius;
101	0					0	$trace->{$rule_key}{name}= $rule_key;
102							}
103	0					0	push @{$trace->{$rule_key}{interval}}, { epoch => $epoch, best => $best, drift => $drift };
	0					0
104							}
105	24256					34273	$epoch= $next_epoch;
106	24256					74411	$next_epoch= $next_date->add(%$interval)->epoch;
107
108							# if auto_sync enabled, cause drift to decay back toward 0
109	24256	100				23132595	$drift= int($drift * 7/8)
110							if $drift;
111							}
112							}
113
114
115							sub visualize {
116	0			0	1	0	my ($self, $list)= @_;
117	0					0	my $trace= {};
118	0					0	my $processed= $self->_sort_and_mark_retention($list, $trace);
119	0					0	$processed->[$_][1]= $_ for 0..$#$processed; # change indexes to index within processed list
120	0					0	my @claimed;
121	0					0	my @things= @$processed;
122	0					0	my @columns;
123							my %rule_to_col;
124							# Convert each trace to a similar arrayref structure as the processed items, for sorting
125	0					0	for my $rule_trace (sort { $a->{idx} <=> $b->{idx} } values %$trace) {
	0					0
126	0					0	push @columns, { name => $rule_trace->{name}, width => 20 };
127	0					0	$rule_to_col{$rule_trace->{name}}= $#columns;
128	0					0	for (@{ $rule_trace->{interval} }) {
	0					0
129	0	0				0	push @{$claimed[$_->{best}]}, $rule_trace->{name} if defined $_->{best};
	0					0
130							push @things,
131							[ $_->{epoch} + $_->{drift} + $rule_trace->{radius}, 'ival-newest', $rule_trace->{name} ],
132							[ $_->{epoch} + $_->{drift}, 'ival', $rule_trace->{name} ],
133	0					0	[ $_->{epoch} + $_->{drift} - $rule_trace->{radius}, 'ival-oldest', $rule_trace->{name} ];
134							}
135							}
136	0					0	push @columns, { name => 'timestamp', width => 20 };
137	0					0	@things= sort { $a->[0] <=> $b->[0] } @things;
	0					0
138
139							# Walk from oldest to newest, displaying timestamps alongside the epock interval points
140	0					0	my $cur_time= 0;
141	0					0	my @in_interval= ( 0 ) x @columns;
142	0					0	my @row= map $_->{name}, @columns;
143	0					0	my $format= join(' ', map '%-'.$_->{width}.'s', @columns)."\n";
144	0					0	my $out= '';
145							my $emit= sub {
146							# if in_interval, display a vertical bar as a graphic
147	0			0		0	for (0..$#in_interval) {
148	0	0	0			0	$row[$_] \|\|= '\|' if $in_interval[$_];
149							}
150	0					0	$out .= sprintf $format, @row;
151	0					0	@row= ('') x @columns;
152	0					0	};
153	0					0	for (@things) {
154	0	0				0	$emit->() if $cur_time != $_->[0];
155	0					0	$cur_time= $_->[0];
156	0	0				0	if ($_->[1] eq 'ival') {
		0
		0
157	0					0	$row[ $rule_to_col{ $_->[2] } ]= $self->_coerce_date($_->[0]);
158							} elsif ($_->[1] eq 'ival-newest') {
159	0					0	$row[ $rule_to_col{ $_->[2] } ]= '---';
160	0					0	--$in_interval[ $rule_to_col{ $_->[2] } ];
161							} elsif ($_->[1] eq 'ival-oldest') {
162	0					0	$row[ $rule_to_col{ $_->[2] } ]= '---';
163	0					0	++$in_interval[ $rule_to_col{ $_->[2] } ];
164							} else {
165	0	0				0	$row[-1]= $self->_coerce_date($_->[0]).($_->[2]? ' +':' x');
166	0	0				0	if ($claimed[$_->[1]]) {
167	0					0	$row[-1] .= ' '.join ', ', @{ $claimed[$_->[1]] };
	0					0
168							}
169							}
170							}
171	0					0	$emit->();
172	0					0	return $out;
173							}
174
175							sub _coerce_date {
176	1801			1801		3394	my ($self, $thing)= @_;
177	1801	100	33			10033	my $date= ref $thing && ref($thing)->can('set_time_zone')? $_->clone
		50
178							: looks_like_number($thing)? DateTime->from_epoch(epoch => $thing)
179							: DateTime::Format::Flexible->parse_datetime($thing);
180	1801					10330693	$date->set_time_zone($self->time_zone);
181	1801					77681	return $date;
182							}
183
184							sub _coerce_to_epoch {
185	21765			21765		50909	my ($self, $thing)= @_;
186	21765	100	66			88729	return $thing if !ref $thing && looks_like_number($thing);
187	1460					3395	return $self->_coerce_date($thing)->epoch;
188							}
189
190							1;
191
192							__END__
193
194							=pod
195
196							=encoding UTF-8
197
198							=head1 NAME
199
200							Date::RetentionPolicy - Prune a list of dates down to the ones you want to keep
201
202							=head1 VERSION
203
204							version 0.01
205
206							=head1 SYNOPSIS
207
208							my $rp= Date::RetentionPolicy->new(
209							retain => [
210							{ interval => { hours => 6 }, history => { months => 3 } },
211							{ interval => { days => 1 }, history => { months => 6 } },
212							{ interval => { days => 7 }, history => { months => 9 } },
213							]
214							);
215
216							my $dates= [ '2018-01-01 03:23:00', '2018-01-01 09:45:00', ... ];
217							my $pruned= $rp->prune($dates);
218							for (@$pruned) {
219							# delete the backup dated $_
220							...
221							}
222
223							=head1 DESCRIPTION
224
225							Often when making backups of a thing, you want to have more frequent snapshots
226							for recent dates, but don't need that frequency further back in time, and want
227							to delete some of the older ones to save space.
228
229							The problem of deciding which snapshots to delete is non-trivial because
230							backups often don't complete on a timely schedule (despite being started on
231							a schedule) or have discontinuities from production mishaps, and it would be
232							bad if your script wiped out the only backup in an interval just because it
233							didn't look like one of the "main" timestamps. Also it would be bad if the
234							jitter from the time zone or time of day that you run the pruning process
235							caused the script to round differently and delete the backups it had
236							previously decided to keep.
237
238							This module uses an algorithm where you first define the intervals which
239							should retain a backup, then assign the existing timestamps to those intervals
240							(possibly reaching across the interval boundary a bit in order to preserve
241							a nearby timestamp; see L<reach_factor>) thus making an intelligent decision
242							about which timestamps to keep.
243
244							=head1 DATES
245
246							This module currently depends on DateTime, but I'm happy to accept patches
247							to allow it to work with other Date classes.
248
249							=head1 ATTRIBUTES
250
251							=head2 retain
252
253							An arrayref of specifications for what to preserve. Each element should be a
254							hashref containing C<history> and C<interval>. C<history> specifies how far
255							backward from L</reference_date> to apply the intervals, and C<interval>
256							specifies the time difference between the backups that need preserved.
257
258							As an example, consider
259
260							retain => [
261							{ interval => { days => 1 }, history => { days => 20 } },
262							{ interval => { hours => 1 }, history => { hours => 48 } },
263							]
264
265							This will attempt to preserve timestamps near the marks of L</reference_date>,
266							an hour before that, an hour before that, and so on for the past 48 hours.
267							It will also attempt to preserve L</reference_date>, a day before that, a day
268							before that, and so on for the past 20 days.
269
270							There is another setting called L</reach_factor> that determines how far from
271							the desired timestamp the algorithm will look for something to preserve. The
272							default C<reach_factor> of C<0.5> means that it will scan from half an interval
273							back in time until half an interval forward in time looking for the closest
274							timestamp to preserve. In some cases, you may want a narrower or wider search
275							distance, and you can set C<reach_factor> accordingly. You can also supply it
276							as another hash key for a retain rule for per-rule customization.
277
278							retain => [
279							{ interval => { days => 1 }, history => { days => 20 }, reach_factor => .75 }
280							]
281
282							=head2 time_zone
283
284							When date strings are involved, parse them as this time zone before converting
285							to an epoch value used in the calculations. The default is C<'floating'>.
286
287							=head2 reach_factor
288
289							The multiplier for how far to look in each direction from an interval point.
290							See discussion in L</retain>.
291
292							=head2 reference_date
293
294							The end-point from which all intervals will be calculated. There is no
295							default, to allow L</reference_date_or_default> to always pick up the current
296							time when called.
297
298							=head2 reference_date_or_default
299
300							Read-only. Return (a clone of) L</reference_date>, or if it isn't set, return
301							the current date in the designated L</time_zone> rounded up to the next day
302							boundary.
303
304							=head2 auto_sync
305
306							While walking backward through time intervals looking for backups, adjust the
307							interval endpoint to be closer to whatever match it found. This might allow
308							the algorithm to essentially adjust the C<reference_date> to match whatever
309							schedule your backups are running on. This is not enabled by default.
310
311							=head1 METHODS
312
313							=head1 prune
314
315							my $pruned_arrayref= $self->prune( \@times );
316
317							C<@times> may be an array of epoch numbers, DateTime objects, or date strings
318							in any format recognized by L<DateTime::Format::Flexible>. Epochs are
319							currently the most efficient type of argument since that's what the algorithm
320							operates on.
321
322							=head2 visualize
323
324							print $rp->visualize( \@list );
325
326							This method takes a list of timestamps, sorts and marks them for retention,
327							and then returns printable text showing the retention intervals and which
328							increment it decided to keep. The text is simple ascii-art, and requires
329							a monospace font to display correctly.
330
331							=head1 AUTHOR
332
333							Michael Conrad <mconrad@intellitree.com>
334
335							=head1 COPYRIGHT AND LICENSE
336
337							This software is copyright (c) 2018 by IntelliTree Solutions llc.
338
339							This is free software; you can redistribute it and/or modify it under
340							the same terms as the Perl 5 programming language system itself.
341
342							=cut