File Coverage

lib/Algorithm/CurveFit/Simple.pm

Criterion	Covered	Total	%
statement	196	208	94.2
branch	61	78	78.2
condition	21	40	52.5
subroutine	16	16	100.0
pod	1	1	100.0
total	295	343	86.0

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::CurveFit::Simple;
2
3							# ABSTRACT: Convenience wrapper around Algorithm::CurveFit.
4
5							our $VERSION = '1.03'; # VERSION 1.03
6
7	5			5		562395	use strict;
	5					21
	5					111
8	5			5		20	use warnings;
	5					8
	5					120
9	5			5		1876	use Algorithm::CurveFit;
	5					541876
	5					205
10	5			5		38	use Time::HiRes;
	5					8
	5					42
11	5			5		406	use JSON::PP;
	5					8
	5					423
12
13							our %STATS_H; # side-products of fit() stored here for profiling purposes
14
15							BEGIN {
16	5			5		24	require Exporter;
17	5					9	our $VERSION = '1.03';
18	5					90	our @ISA = qw(Exporter);
19	5					8866	our @EXPORT_OK = qw(fit %STATS_H);
20							}
21
22							# fit() - only public function for this distribution
23							# Given at least parameter "xy", generate a best-fit curve within a time limit.
24							# Output: max deviation, avg deviation, implementation source string (perl or C, for now).
25							# Optional parameters and their defaults:
26							# terms => 3 # number of terms in formula, max is 10
27							# time_limit => 3 # number of seconds to try for better fit
28							# inv => 1 # invert sense of curve-fit, from x->y to y->x
29							# impl_lang => 'perl' # programming language used for output implementation: perl, c
30							# impl_name => 'x2y' # name given to output implementation function
31							sub fit {
32	1			1	1	79	my %p = @_;
33
34	1					13	my $formula = _init_formula(%p);
35	1					5	my ($xdata, $ydata) = _init_data(%p);
36	1					4	my $parameters = _init_parameters($xdata, $ydata, %p);
37
38	1					1	my $iter_mode = 'time';
39	1					2	my $time_limit = 3; # sane default?
40	1	50				2	$time_limit = 0.01 if ($time_limit < 0.01);
41	1					2	my $n_iter;
42	1	50				3	if (defined($p{iterations})) {
43	0					0	$iter_mode = 'iter';
44	0		0			0	$n_iter = $p{iterations} \|\| 10000;
45							} else {
46	1		33			4	$time_limit = $p{time_limit} // $time_limit;
47	1					2	$n_iter = 10000 * $time_limit; # will use this to figure out how long it -really- takes.
48							}
49
50	1					2	my ($n_sec, $params_ar_ar);
51	1	50				3	if ($iter_mode eq 'time') {
52	1					4	($n_sec, $params_ar_ar) = _try_fit($formula, $parameters, $xdata, $ydata, $n_iter, $p{fitter_class});
53	1					4	$STATS_H{iter_mode} = $iter_mode;
54	1					3	$STATS_H{fit_calib_iter} = $n_iter;
55	1					2	$STATS_H{fit_calib_time} = $n_sec;
56	1					2	$STATS_H{fit_calib_parar} = $params_ar_ar;
57	1					3	$n_iter = int(($time_limit / $n_sec) * $n_iter + 1);
58							}
59
60	1					5	($n_sec, $params_ar_ar) = _try_fit($formula, $parameters, $xdata, $ydata, $n_iter, $p{fitter_class});
61	1					4	$STATS_H{fit_iter} = $n_iter;
62	1					3	$STATS_H{fit_time} = $n_sec;
63	1					2	$STATS_H{fit_parar} = $params_ar_ar;
64
65	1					5	my $coderef = _implement_formula($params_ar_ar, "coderef", "", $xdata, \%p);
66	1					4	my ($max_dev, $avg_dev) = _calculate_deviation($coderef, $xdata, $ydata);
67	1		50			4	my $impl_lang = $p{impl_lang} // 'perl';
68	1					3	$impl_lang = lc($impl_lang);
69	1	50				3	my $impl_name = $p{inv} ? "y2x" : "x2y";
70	1		33			6	$impl_name = $p{impl_name} // $impl_name;
71	1					2	my $impl = $coderef;
72	1	50				5	$impl = _implement_formula($params_ar_ar, $impl_lang, $impl_name, $xdata, \%p) unless($impl_lang eq 'coderef');
73	1					17	return ($max_dev, $avg_dev, $impl);
74							}
75
76							# ($n_sec, $params_ar_ar) = _try_fit($formula, $parameters, $xdata, $ydata, $n_iter, $p{fitter_class});
77							sub _try_fit {
78	2			2		9	my ($formula, $parameters, $xdata, $ydata, $n_iter, $fitter_class) = @_;
79	2		50			18	$fitter_class //= "Algorithm::CurveFit";
80	2					4	my $params_ar_ar = [map {[@$_]} @$parameters]; # making a copy because curve_fit() is destructive
	8					15
81	2					6	my $tm0 = Time::HiRes::time();
82	2					16	my $res = $fitter_class->curve_fit(
83							formula => $formula,
84							params => $params_ar_ar,
85							variable => 'x',
86							xdata => $xdata,
87							ydata => $ydata,
88							maximum_iterations => $n_iter
89							);
90	2					203406	my $tm_elapsed = Time::HiRes::time() - $tm0;
91	2					9	return ($tm_elapsed, $params_ar_ar);
92							}
93
94							sub _init_formula {
95	11			11		10092	my %p = @_;
96	11					17	my $formula = 'k + ax + bx^2 + c*x^3'; # sane'ish default
97	11		100			41	my $terms = $p{terms} // 3;
98	11	50				20	die "maximum of 10 terms\n" if ($terms > 10);
99	11	100				20	if ($terms != 3) {
100	8					10	$formula = 'k';
101	8					14	for (my $i = 1; $i <= $terms; $i++) {
102	42					44	my $fact = chr(ord('a') + $i - 1);
103	42					79	$formula .= " + $fact * x^$i";
104							}
105							}
106	11					48	return $formula;
107							}
108
109							# ($xdata, $ydata) = _init_data(%p);
110							sub _init_data {
111	15			15		10628	my %p = @_;
112	15					24	my ($xdata, $ydata);
113	15	100	100			66	if (defined($p{xydata})) {
		100
114	7					9	my $xy = $p{xydata};
115	7	50	33			54	unless (
			33
			33
116							ref($xy) eq 'ARRAY'
117							&& @$xy >= 2
118							&& ref($xy->[0]) eq 'ARRAY'
119							&& ref($xy->[1]) eq 'ARRAY'
120							) {
121	0					0	die "xydata must be either an arrayref of [x, y] data point arrayrefs or an arrayref [[x0, x1, ... xN], [y0, y1, ... yN]]\n";
122							}
123	7	100	100			17	if (@$xy == 2 && @{$xy->[0]} > 2) {
	3					7
124							# user has provided [[x, ..], [y, ..]]
125	2					2	$xdata = $xy->[0];
126	2					3	$ydata = $xy->[1];
127							} else {
128							# user has provided [[x, y], [x, y], ..]
129	5	100				6	die "pairwise xydata must have two data points per element\n" unless(@{$xy->[0]} == 2);
	5					16
130	4					6	$xdata = [map {$_->[0]} @{$xy}];
	21					26
	4					7
131	4					6	$ydata = [map {$_->[1]} @{$xy}];
	21					21
	4					5
132							}
133							}
134							elsif (defined($p{xdata}) && defined($p{ydata})) {
135	5					8	$xdata = $p{xdata};
136	5					4	$ydata = $p{ydata};
137							}
138							else {
139	3					14	die "must provide at least xydata or both xdata and ydata\n";
140							}
141	11	50	33			40	die "xdata and ydata must both be arrayref\n" unless (ref($xdata) eq "ARRAY" && ref($ydata) eq "ARRAY");
142	11	100				31	die "xdata and ydata must have the same number of elements\n" unless (@$xdata == @$ydata);
143	9	100				17	die "must have more than one data-point\n" unless (@$xdata > 1);
144	8	100				15	if ($p{inv}) {
145	3					5	$STATS_H{xdata} = $ydata;
146	3					4	$STATS_H{ydata} = $xdata;
147	3					18	return ($ydata, $xdata);
148							}
149	5					8	$STATS_H{xdata} = $xdata;
150	5					7	$STATS_H{ydata} = $ydata;
151	5					16	return ($xdata, $ydata);
152							}
153
154							# $parameters = _init_parameters($xdata, $ydata, %p);
155							sub _init_parameters {
156	3			3		1207	my ($xdata, $ydata, %p) = @_;
157	3					5	my $k = 0;
158	3					4	my $n_points = @$xdata;
159	3					6	foreach my $v (@$ydata) { $k += $v; }
	18					18
160	3					5	$k /= $n_points;
161							# zzapp -- implement any precision hints here.
162	3					9	my @params = (['k', $k, 0.0000001]);
163	3		100			11	my $terms = $p{terms} // 3;
164	3					8	push @params, map {[chr(ord('a')+$_-1), 0.5, 0.0000001]} (1..$terms);
	8					18
165	3					10	return \@params;
166							}
167
168							# $impl = _implement_formula($params_ar_ar, $impl_lang, $impl_name, $xdata, \%p) unless($impl_lang eq 'coderef');
169							sub _implement_formula {
170	10			10		4609	my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
171	10	100				28	return _implement_formula_as_coderef(@_) if ($impl_lang eq 'coderef');
172							# return _implement_formula_as_python(@_) if ($impl_lang eq 'python'); # zzapp
173	8	100				18	return _implement_formula_as_C(@_) if ($impl_lang eq 'c');
174							# return _implement_formula_as_R(@_) if ($impl_lang eq 'r'); # zzapp
175							# return _implement_formula_as_MATLAB(@_) if ($impl_lang eq 'matlab'); # zzapp
176	5					14	return _implement_formula_as_perl(@_);
177							}
178
179							sub _implement_formula_as_coderef {
180	2			2		6	my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
181	2					4	my $k_ar = $params_ar_ar->[0];
182	2					15	my $formula = sprintf("%f", $k_ar->[1]);
183	2					6	for (my $i = 1; defined($params_ar_ar->[$i]); $i++) {
184	6					9	my $fact = $params_ar_ar->[$i]->[1];
185	6	100				14	my $pow = ($i == 1) ? "" : "**$i";
186	6					26	$formula .= sprintf(' + %f * $x%s', $fact, $pow);
187							}
188	2					4	$STATS_H{impl_formula} = $formula;
189	2					5	my $bounder = '';
190	2	50				8	if ($opt_hr->{bounds_check}) {
191	0					0	my ($high_x, $low_x) = ($xdata->[0], $xdata->[0]);
192	0					0	foreach my $x (@$xdata) {
193	0	0				0	$high_x = $x if ($high_x < $x);
194	0	0				0	$low_x = $x if ($low_x > $x);
195							}
196	0					0	$bounder = 'die "x out of bounds (high)" if ($x > '.$high_x.'); die "x out of bounds (low)" if ($x < '.$low_x.');';
197							}
198	2					3	my $rounder = '';
199	2	50				6	$rounder = '$y = int($y + 0.5);' if ($opt_hr->{round_result});
200	2					8	my $src = 'sub { my($x) = @_; '.$bounder.' my $y = '.$formula.'; '.$rounder.' return $y; }';
201	2					4	$STATS_H{impl_source} = $src;
202	2					3	$STATS_H{impl_exception} = '';
203	2					195	my $coderef = eval($src);
204	2	50				8	$STATS_H{impl_exception} = $@ unless(defined($coderef));
205	2					7	return $coderef;
206							}
207
208							sub _implement_formula_as_perl {
209	5			5		8	my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
210	5					9	my $k_ar = $params_ar_ar->[0];
211	5					31	my $formula = sprintf("%.11f", $k_ar->[1]);
212	5					14	for (my $i = 1; defined($params_ar_ar->[$i]); $i++) {
213	15					17	my $fact = $params_ar_ar->[$i]->[1];
214	15	100				31	my $pow = ($i == 1) ? "" : "**$i";
215	15					54	$formula .= sprintf(' + %.11f * $x%s', $fact, $pow);
216							}
217	5					18	$STATS_H{impl_formula} = $formula;
218	5					6	my $bounder = '';
219	5	100				12	if ($opt_hr->{bounds_check}) {
220	1					3	my ($high_x, $low_x) = ($xdata->[0], $xdata->[0]);
221	1					2	foreach my $x (@$xdata) {
222	7	100				9	$high_x = $x if ($high_x < $x);
223	7	100				10	$low_x = $x if ($low_x > $x);
224							}
225	1					7	$bounder = sprintf(' die "x out of bounds (high)" if ($x > %.11f);'."\n", $high_x) .
226							sprintf(' die "x out of bounds (low)" if ($x < %.11f);'."\n", $low_x);
227							}
228	5					8	my $rounder = '';
229	5	100				14	$rounder = ' $y = int($y + 0.5);'."\n" if ($opt_hr->{round_result});
230	5					21	my $src = join("\n",(
231							"sub $impl_name {",
232							' my($x) = @_;',
233							$bounder,
234							' my $y = '.$formula.';',
235							$rounder,
236							' return $y;',
237							'}'
238							));
239	5					8	$STATS_H{impl_source} = $src;
240	5					7	$STATS_H{impl_exception} = '';
241	5					14	return $src;
242							}
243
244							sub _implement_formula_as_C {
245	3			3		18	my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
246	3					5	my $k_ar = $params_ar_ar->[0];
247	3					3	my $src = "";
248	3	100	66			10	$src .= "#include \n" if ($opt_hr->{round_result} && !$opt_hr->{suppress_includes});
249	3					6	$src .= "double $impl_name(double x) {\n";
250	3					16	$src .= sprintf(" double y = %.11f;\n", $k_ar->[1]);
251	3					5	$src .= " double xx = x;\n"; # eliminating pow() calls, which gcc doesn't seem willing to optimize completely away
252
253	3	100				7	if ($opt_hr->{bounds_check}) {
254	1					2	my ($high_x, $low_x) = ($xdata->[0], $xdata->[0]);
255	1					3	foreach my $x (@$xdata) {
256	7	100				11	$high_x = $x if ($high_x < $x);
257	7	100				8	$low_x = $x if ($low_x > $x);
258							}
259							# zzapp -- this is kludgy. better way to signal bounds violation?
260	1					8	$src .= sprintf(" if (x > %.11f) return -1.0;\n", $high_x) .
261							sprintf(" if (x < %.11f) return -1.0;\n", $low_x);
262							}
263
264	3					4	my $formula = "";
265	3					8	for (my $i = 1; defined($params_ar_ar->[$i]); $i++) {
266	9					10	my $fact = $params_ar_ar->[$i]->[1];
267	9					21	$formula .= sprintf(" y += %.11f * xx;\n", $fact);
268	9	100				23	$formula .= " xx *= x;\n" if(defined($params_ar_ar->[$i+1]));
269							}
270	3					4	$STATS_H{impl_formula} = $formula; # zzapp -- not clean!
271
272	3					5	$src .= $formula;
273	3	100				7	$src .= " y = round(y);\n" if ($opt_hr->{round_result});
274	3					4	$src .= " return y;\n}\n";
275	3					3	$STATS_H{impl_source} = $src;
276	3					4	$STATS_H{impl_exception} = '';
277	3					8	return $src;
278							}
279
280							# ($max_dev, $avg_dev) = _calculate_deviation($coderef, $xdata, $ydata);
281							sub _calculate_deviation {
282	1			1		2	my ($coderef, $xdata, $ydata) = @_;
283	1					2	my $max_off = 1.0;
284	1					2	my $max_off_datum = 0.0;
285	1					2	my $tot_off = 0.0;
286	1					8	for (my $i = 0; defined($xdata->[$i]); $i++) {
287	10					9	my $x = $xdata->[$i];
288	10					8	my $y = eval { $coderef->($x); };
	10					142
289	10	50				39	unless(defined($y)) {
290	0					0	$STATS_H{deviation_exception} = $@;
291	0					0	$STATS_H{deviation_exception_datum} = $x;
292	0					0	die "caught exception calculating deviations";
293							}
294
295	10					14	my $observed_y = $ydata->[$i];
296	10	50	33			24	if ($observed_y && $y) {
297	10	100				17	my $deviation = $y > $observed_y ? $y / $observed_y : $observed_y / $y;
298	10					11	my $dev_mag = abs($deviation - 1.0);
299	10					11	my $max_mag = abs($max_off - 1.0);
300							# print "x=$x\ty=$y\toy=$observed_y\tdev_mag=$dev_mag\tmax_mag=$max_mag\n";
301	10	100				15	($max_off, $max_off_datum) = ($deviation, $x) if ($dev_mag > $max_mag);
302	10					19	$tot_off += $deviation;
303							} else {
304	0					0	$tot_off += 1.0;
305							}
306							}
307	1					3	$STATS_H{deviation_max_offset_datum} = $max_off_datum;
308	1					4	return ($max_off, $tot_off / @$xdata);
309							}
310
311
312							1;
313
314							=head1 NAME
315
316							Algorithm::CurveFit::Simple - Convenience wrapper around Algorithm::CurveFit
317
318							=head1 SYNOPSIS
319
320							use Algorithm::CurveFit::Simple qw(fit);
321
322							my ($max_dev, $avg_dev, $src) = fit(xdata => \@xdata, ydata => \@ydata, ..options..);
323
324							# Alternatively pass xdata and ydata together:
325							my ($max_dev, $avg_dev, $src) = fit(xydata => [\@xdata, \@ydata], ..options..);
326
327							# Alternatively pass data as array of [x,y] pairs:
328							my ($max_dev, $avg_dev, $src) = fit(xydata => [[1, 2], [2, 5], [3, 10]], ..options..);
329
330							=head1 DESCRIPTION
331
332							This is a convenience wrapper around L. Given a body of (x, y) data points, it will generate a polynomial formula f(x) = y which fits that data.
333
334							Its main differences from L are:
335
336							=over 4
337
338							=item * It synthesizes the initial formula for you,
339
340							=item * It allows for a time limit on the curve-fit instead of an iteration count,
341
342							=item * It implements the formula as source code (or as a perl coderef, if you want to use the formula immediately in your program).
343
344							=back
345
346							Additionally it returns a maximum deviation and average deviation of the formula vs the xydata, which is more useful (to me, at least) than L's square residual output. Closer to 1.0 indicates a better fit. Play with C #> until these deviations are as close to 1.0 as possible, and beware overfitting.
347
348							=head1 SUBROUTINES
349
350							There is only one public subroutine, C. It B be given either C or C and C parameters. All other paramters are optional.
351
352							It returns three values: A maximum deviation, the average deviation and the formula implementation.
353
354							=head2 Options
355
356							=over 4
357
358							=item C \@xdata, ydata =E \@ydata)>
359
360							The data points the formula will fit. Same as L parameters of the same name.
361
362							=item C [[1, 2, 3, 4], [10, 17, 26, 37]])>
363
364							=item C [[1, 10], [2, 17], [3, 26], [4, 37]])>
365
366							A more convenient way to provide data points. C will try to detect how the data points are organized -- list of x and list of y, or list of [x,y].
367
368							=item C 3)>
369
370							Sets the order of the polynomial, which will be of the form C. The default is 3 and the limit is 10.
371
372							There is no need to specify initial C. It will be calculated from C.
373
374							=item C 3)>
375
376							If a time limit is given (in seconds), C will spend no more than that long trying to fit the data. It may return in much less time. The default is 3.
377
378							=item C 10000)>
379
380							If an iteration count is given, C will ignore any time limit and iterate up to C times trying to fit the curve. Same as L parameter of the same name.
381
382							=item C 1)>
383
384							Setting C inverts the sense of the fit. Instead of C the formula will fit C.
385
386							=item C "perl")>
387
388							Sets the programming language in which the formula will be implemented. Currently supported languages are C<"C">, C<"coderef"> and the default, C<"perl">.
389
390							When C "coderef"> is specified, a code reference is returned instead which may be used immediately by your perl script:
391
392							my($max_dev, $avg_dev, $x2y) = fit(xydata => \@xy, impl_lang => "coderef");
393
394							my $y = $x2y->(42);
395
396							More implementation languages will be supported in the future.
397
398							=item C "x2y")>
399
400							Sets the name of the function implementing the formula. The default is C<"x2y">. Has no effect when used with C "coderef")>.
401
402							my($max_dev, $avg_dev, $src) = fit(xydata => \@xy, impl_name => "converto");
403
404							print "$src\n";
405
406							sub converto {
407							my($x) = @_;
408							my $y = -5340.93059104837 + 249.23009968947 * $x + -3.87745746448 * $x*2 + 0.02114780993 $x**3;
409							return $y;
410							}
411
412							=item C 1)>
413
414							When set, the implementation will include logic for checking whether the input is out-of-bounds, per the highest and lowest x points in the data used to fit the formula. For implementation languages which support exceptions, an exception will be thrown. For others (like C), C<-1.0> will be returned to indicate the error.
415
416							For instance, if the highest x in C<$xydata> is 83.0 and the lowest x is 60.0:
417
418							my($max_dev, $avg_dev, $src) = fit(xydata => \@xy, bounds_check => 1);
419
420							print "$src\n";
421
422							sub x2y {
423							my($x) = @_;
424							die "x out of bounds (high)" if ($x > 83.80000000000);
425							die "x out of bounds (low)" if ($x < 60.80000000000);
426							my $y = -5340.93059104837 + 249.23009968947 * $x + -3.87745746448 * $x*2 + 0.02114780993 $x**3;
427							return $y;
428							}
429
430							=item C 1)>
431
432							When set, the implementation will round the output to the nearest whole number. When the implementation language is C<"C"> this adds an C<#include Emath.hE> directive to the source code, which will have to be compiled against libm -- see C.
433
434							my($max_dev, $avg_dev, $src) = fit(xydata => \@xy, round_result => 1);
435
436							print "$src\n";
437
438							sub x2y {
439							my($x) = @_;
440							my $y = -5340.93059104837 + 249.23009968947 * $x + -3.87745746448 * $x*2 + 0.02114780993 $x**3;
441							$y = int($y + 0.5);
442							return $y;
443							}
444
445							=item C 1)>
446
447							When set and C "C">, any C<#include> directives which the implementation might need will be suppressed.
448
449							=back
450
451							=head1 VARIABLES
452
453							The class variable C<%STATS_H> contains various intermediate values which might be helpful. For instance, C<$STATS_H{deviation_max_offset_datum}> contains the x data point which corresponds to the maximum deviation returned.
454
455							The contents of C<%STATS_H> is subject to change and might not be fully documented in future versions. The current fields are:
456
457							=over 4
458
459							=item C: The x data point corresponding with returned maximum deviation.
460
461							=item C: Arrayref of formula parameters as returned by L after a short fitting attempt used for timing calibration.
462
463							=item C: The number of seconds L spent in the calibration run.
464
465							=item C: The iterations parameter passed to L.
466
467							=item C: Arrayref of formula parameters as returned by L.
468
469							=item C: The number of seconds L actually spent fitting the formula.
470
471							=item C: The exception thrown when the implementation was used to calculate the deviations, or the empty string if none.
472
473							=item C: The formula part of the implementation.
474
475							=item C: The implementation source string.
476
477							=item C: One of C<"time"> or C<"iter">, indicating whether a time limit was used or an iteration count.
478
479							=item C: Arrayref of x data points as passed to L.
480
481							=item C: Arrayref of y data points as passed to L.
482
483							=back
484
485							=head1 CAVEATS
486
487							=over 4
488
489							=item * Only simple polynomial functions are supported. Sometimes you need something else. Use L for such cases.
490
491							=item * If C is very large, iterating over it to calculate deviances can take more time than permitted by C.
492
493							=item * The dangers of overfitting are real! L
494
495							=item * Using too many terms can dramatically reduce the accuracy of the fitted formula.
496
497							=item * Sometimes calling L with a ten-term polynomial causes it to hang.
498
499							=back
500
501							=head1 TO DO
502
503							=over 4
504
505							=item * Support more programming languages for formula implementation: R, MATLAB, python
506
507							=item * Calculate the actual term sigfigs and set precision appropriately in the formula implementation instead of just "%.11f".
508
509							=item * Support trying a range of terms and returning whatever gives the best fit.
510
511							=item * Support piecewise output formulas.
512
513							=item * Work around L's occasional hang problem when using ten-term polynomials.
514
515							=back
516
517							=head1 SEE ALSO
518
519							L
520
521							L
522
523							=cut