File Coverage

levmar-2.6/lm_core.c

Criterion	Covered	Total	%
statement	497	598	83.1
branch	161	204	78.9
condition			n/a
subroutine			n/a
pod			n/a
total	658	802	82.0

line	stmt	bran	code
1			/////////////////////////////////////////////////////////////////////////////////
2			//
3			// Levenberg - Marquardt non-linear minimization algorithm
4			// Copyright (C) 2004 Manolis Lourakis (lourakis at ics forth gr)
5			// Institute of Computer Science, Foundation for Research & Technology - Hellas
6			// Heraklion, Crete, Greece.
7			//
8			// This program is free software; you can redistribute it and/or modify
9			// it under the terms of the GNU General Public License as published by
10			// the Free Software Foundation; either version 2 of the License, or
11			// (at your option) any later version.
12			//
13			// This program is distributed in the hope that it will be useful,
14			// but WITHOUT ANY WARRANTY; without even the implied warranty of
15			// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16			// GNU General Public License for more details.
17			//
18			/////////////////////////////////////////////////////////////////////////////////
19
20			#ifndef LM_REAL // not included by lm.c
21			#error This file should not be compiled directly!
22			#endif
23
24
25			/* precision-specific definitions */
26			#define LEVMAR_DER LM_ADD_PREFIX(levmar_der)
27			#define LEVMAR_DIF LM_ADD_PREFIX(levmar_dif)
28			#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
29			#define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx)
30			#define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult)
31			#define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy)
32			#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
33
34			#ifdef HAVE_LAPACK
35			#define AX_EQ_B_LU LM_ADD_PREFIX(Ax_eq_b_LU)
36			#define AX_EQ_B_CHOL LM_ADD_PREFIX(Ax_eq_b_Chol)
37			#define AX_EQ_B_QR LM_ADD_PREFIX(Ax_eq_b_QR)
38			#define AX_EQ_B_QRLS LM_ADD_PREFIX(Ax_eq_b_QRLS)
39			#define AX_EQ_B_SVD LM_ADD_PREFIX(Ax_eq_b_SVD)
40			#define AX_EQ_B_BK LM_ADD_PREFIX(Ax_eq_b_BK)
41			#else
42			#define AX_EQ_B_LU LM_ADD_PREFIX(Ax_eq_b_LU_noLapack)
43			#endif /* HAVE_LAPACK */
44
45			#ifdef HAVE_PLASMA
46			#define AX_EQ_B_PLASMA_CHOL LM_ADD_PREFIX(Ax_eq_b_PLASMA_Chol)
47			#endif
48
49			/*
50			* This function seeks the parameter vector p that best describes the measurements vector x.
51			* More precisely, given a vector function func : R^m --> R^n with n>=m,
52			* it finds p s.t. func(p) ~= x, i.e. the squared second order (i.e. L2) norm of
53			* e=x-func(p) is minimized.
54			*
55			* This function requires an analytic Jacobian. In case the latter is unavailable,
56			* use LEVMAR_DIF() below
57			*
58			* Returns the number of iterations (>=0) if successful, LM_ERROR if failed
59			*
60			* For more details, see K. Madsen, H.B. Nielsen and O. Tingleff's lecture notes on
61			* non-linear least squares at http://www.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf
62			*/
63
64	125		int LEVMAR_DER(
	119
	6
65			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
66			void (jacf)(LM_REAL p, LM_REAL j, int m, int n, void adata), /* function to evaluate the Jacobian \part x / \part p */
67			LM_REAL p, / I/O: initial parameter estimates. On output has the estimated solution */
68			LM_REAL x, / I: measurement vector. NULL implies a zero vector */
69			int m, /* I: parameter vector dimension (i.e. #unknowns) */
70			int n, /* I: measurement vector dimension */
71			int itmax, /* I: maximum number of iterations */
72			LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
73			* stopping thresholds for \|\|J^T e\|\|_inf, \|\|Dp\|\|_2 and \|\|e\|\|_2. Set to NULL for defaults to be used
74			*/
75			LM_REAL info[LM_INFO_SZ],
76			/* O: information regarding the minimization. Set to NULL if don't care
77			* info[0]= \|\|e\|\|_2 at initial p.
78			* info[1-4]=[ \|\|e\|\|_2, \|\|J^T e\|\|_inf, \|\|Dp\|\|_2, mu/max[J^T J]_ii ], all computed at estimated p.
79			* info[5]= # iterations,
80			* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
81			* 2 - stopped by small Dp
82			* 3 - stopped by itmax
83			* 4 - singular matrix. Restart from current p with increased mu
84			* 5 - no further error reduction is possible. Restart with increased mu
85			* 6 - stopped by small \|\|e\|\|_2
86			* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
87			* info[7]= # function evaluations
88			* info[8]= # Jacobian evaluations
89			* info[9]= # linear systems solved, i.e. # attempts for reducing error
90			*/
91			LM_REAL work, / working memory at least LM_DER_WORKSZ() reals large, allocated if NULL */
92			LM_REAL covar, / O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
93			void adata) / pointer to possibly additional data, passed uninterpreted to func & jacf.
94			* Set to NULL if not needed
95			*/
96			{
97			register int i, j, k, l;
98	125		int worksz, freework=0, issolved;
	119
	6
99			/* temp work arrays */
100			LM_REAL e, / nx1 */
101			hx, / \hat{x}_i, nx1 */
102			jacTe, / J^T e_i mx1 */
103			jac, / nxm */
104			jacTjac, / mxm */
105			Dp, / mx1 */
106			diag_jacTjac, / diagonal of J^T J, mx1 */
107			pDp; / p + Dp, mx1 */
108
109			register LM_REAL mu, /* damping constant */
110			tmp; /* mainly used in matrix & vector multiplications */
111			LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* \|\|e(p)\|\|_2, \|\|J^T e\|\|_inf, \|\|e(p+Dp)\|\|_2 */
112	125		LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
	119
	6
113			LM_REAL tau, eps1, eps2, eps2_sq, eps3;
114			LM_REAL init_p_eL2;
115	125		int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
	119
	6
116	125		const int nm=n*m;
	119
	6
117	125		int (linsolver)(LM_REAL A, LM_REAL B, LM_REAL x, int m)=NULL;
	119
	6
118
119	125		mu=jacTe_inf=0.0; /* -Wall */
	119
	6
120
121	125	50	if(n
	119	50
	6
122	0		fprintf(stderr, LCAT(LEVMAR_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m);
	0
	0
123	0		return LM_ERROR;
	0
	0
124			}
125
126	125	50	if(!jacf){
	119	50
	6
127	0		fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_DER)
	0
	0
128			RCAT("().\nIf no such function is available, use ", LEVMAR_DIF) RCAT("() rather than ", LEVMAR_DER) "()\n");
129	0		return LM_ERROR;
	0
	0
130			}
131
132	125	50	if(opts){
	119	50
	6
133	125		tau=opts[0];
	119
	6
134	125		eps1=opts[1];
	119
	6
135	125		eps2=opts[2];
	119
	6
136	125		eps2_sq=opts[2]*opts[2];
	119
	6
137	125		eps3=opts[3];
	119
	6
138			}
139			else{ // use default values
140	0		tau=LM_CNST(LM_INIT_MU);
	0
	0
141	0		eps1=LM_CNST(LM_STOP_THRESH);
	0
	0
142	0		eps2=LM_CNST(LM_STOP_THRESH);
	0
	0
143	0		eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH);
	0
	0
144	0		eps3=LM_CNST(LM_STOP_THRESH);
	0
	0
145			}
146
147	125	50	if(!work){
	119	50
	6
148	0		worksz=LM_DER_WORKSZ(m, n); //2n+4m + nm + mm;
	0
	0
149	0		work=(LM_REAL )malloc(workszsizeof(LM_REAL)); /* allocate a big chunk in one step */
	0
	0
150	0	0	if(!work){
	0	0
	0
151	0		fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n"));
	0
	0
152	0		return LM_ERROR;
	0
	0
153			}
154	0		freework=1;
	0
	0
155			}
156
157			/* set up work arrays */
158	125		e=work;
	119
	6
159	125		hx=e + n;
	119
	6
160	125		jacTe=hx + n;
	119
	6
161	125		jac=jacTe + m;
	119
	6
162	125		jacTjac=jac + nm;
	119
	6
163	125		Dp=jacTjac + m*m;
	119
	6
164	125		diag_jacTjac=Dp + m;
	119
	6
165	125		pDp=diag_jacTjac + m;
	119
	6
166
167			/* compute e=x - f(p) and its L2 norm */
168	125		(*func)(p, hx, m, n, adata); nfev=1;
	119
	6
169			/* ### e=x-hx, p_eL2=\|\|e\|\| */
170			#if 1
171	125		p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n);
	119
	6
172			#else
173			for(i=0, p_eL2=0.0; i
174			e[i]=tmp=x[i]-hx[i];
175			p_eL2+=tmp*tmp;
176			}
177			#endif
178	125		init_p_eL2=p_eL2;
	119
	6
179	125	50	if(!LM_FINITE(p_eL2)) stop=7;
	119	50
	6
180
181	31002	100	for(k=0; k
	15968	100
	15034	100
		100
182			/* Note that p and e have been updated at a previous iteration */
183
184	30977	100	if(p_eL2<=eps3){ /* error is small */
	15948	100
	15029
185	100		stop=6;
	99
	1
186	100		break;
	99
	1
187			}
188
189			/* Compute the Jacobian J at p, J^T J, J^T e, \|\|J^T e\|\|_inf and \|\|p\|\|^2.
190			* Since J^T J is symmetric, its computation can be sped up by computing
191			* only its upper triangular part and copying it to the lower part
192			*/
193
194	30877		(*jacf)(p, jac, m, n, adata); ++njev;
	15849
	15028
195
196			/* J^T J, J^T e */
197	30877	100	if(nm<__BLOCKSZ__SQ){ // this is a small problem
	15849	50
	15028
198			/* J^TJ_ij = \sum_l J^T_il J_lj = \sum_l J_li * J_lj.
199			* Thus, the product J^T J can be computed using an outer loop for
200			* l that adds J_li*J_lj to each element ij of the result. Note that
201			* with this scheme, the accesses to J and JtJ are always along rows,
201			* therefore induces fewer cache misses compared to the straightforward
203			* algorithm for computing the product (i.e., l loop is innermost one).
204			* A similar scheme applies to the computation of J^T e.
205			* However, for large minimization problems (i.e., involving a large number
206			* of unknowns and measurements) for which J/J^T J rows are too large to
206			* fit in the L1 cache, even this scheme incurs many cache misses. In
208			* such cases, a cache-efficient blocking scheme is preferable.
209			*
210			* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
211			* performance problem.
212			*
213			* Note that the non-blocking algorithm is faster on small
214			* problems since in this case it avoids the overheads of blocking.
215			*/
216
217			/* looping downwards saves a few computations */
218			register int l;
219			register LM_REAL alpha, jaclm, jacTjacim;
220
221	151340	100	for(i=m*m; i-->0; )
	76200	100
	75140
222	121072		jacTjac[i]=0.0;
	60960
	60112
223	90804	100	for(i=m; i-->0; )
	45720	100
	45084
224	60536		jacTe[i]=0.0;
	30480
	30056
225
226	109502	100	for(l=n; l-->0; ){
	63808	100
	45694
227	79234		jaclm=jac+l*m;
	48568
	30666
228	237702	100	for(i=m; i-->0; ){
	145704	100
	91998
229	158468		jacTjacim=jacTjac+i*m;
	97136
	61332
230	158468		alpha=jaclm[i]; //jac[l*m+i];
	97136
	61332
231	396170	100	for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
	242840	100
	153330
232	237702		jacTjacim[j]+=jaclm[j]alpha; //jacTjac[im+j]+=jac[lm+j]alpha
	145704
	91998
233
234			/* J^T e */
235	158468		jacTe[i]+=alpha*e[l];
	97136
	61332
236			}
237			}
238
239	90804	100	for(i=m; i-->0; ) /* copy to upper part */
	45720	100
	45084
240	90804	100	for(j=i+1; j
	45720	100
	45084
241	30268		jacTjac[im+j]=jacTjac[jm+i];
	15240
	15028
242
243			}
244			else{ // this is a large problem
245			/* Cache efficient computation of J^T J based on blocking
246			*/
247	609		LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m);
	609
	0
248
249			/* cache efficient computation of J^T e */
250	1887	100	for(i=0; i
	1887	0
	0
251	1278		jacTe[i]=0.0;
	1278
	0
252
253	1473609	100	for(i=0; i
	1473609	0
	0
254			register LM_REAL *jacrow;
255
256	4479000	100	for(l=0, jacrow=jac+i*m, tmp=e[i]; l
	4479000	0
	0
257	3006000		jacTe[l]+=jacrow[l]*tmp;
	3006000
	0
258			}
259			}
260
261			/* Compute \|\|J^T e\|\|_inf and \|\|p\|\|^2 */
262	92691	100	for(i=0, p_L2=jacTe_inf=0.0; i
	47607	100
	45084
263	61814	100	if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
	31758	100
	30056	100
		100
264
265	61814		diag_jacTjac[i]=jacTjac[im+i]; / save diagonal entries so that augmentation can be later canceled */
	31758
	30056
266	61814		p_L2+=p[i]*p[i];
	31758
	30056
267			}
268			//p_L2=sqrt(p_L2);
269
270			#if 0
271			if(!(k%100)){
272			printf("Current estimate: ");
273			for(i=0; i
274			printf("%.9g ", p[i]);
275			printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
276			}
277			#endif
278
279			/* check for convergence */
280	30877	50	if((jacTe_inf <= eps1)){
	15849	50
	15028
281	0		Dp_L2=0.0; /* no increment for p in this case */
	0
	0
282	0		stop=1;
	0
	0
283	0		break;
	0
	0
284			}
285
286			/* compute initial damping factor */
287	30877	100	if(k==0){
	15849	100
	15028
288	387	100	for(i=0, tmp=LM_REAL_MIN; i
	369	100
	18
289	262	100	if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
	250	100
	12
290	125		mu=tau*tmp;
	119
	6
291			}
292
293			/* determine increment using adaptive damping */
294			while(1){
295			/* augment normal equations */
296	121563	100	for(i=0; i
	62745	100
	58818
297	81062		jacTjac[i*m+i]+=mu;
	41850
	39212
298
299			/* solve augmented equations */
300			#ifdef HAVE_LAPACK
301			/* 7 alternatives are available: LU, Cholesky + Cholesky with PLASMA, LDLt, 2 variants of QR decomposition and SVD.
302			* For matrices with dimensions of at least a few hundreds, the PLASMA implementation of Cholesky is the fastest.
303			* From the serial solvers, Cholesky is the fastest but might occasionally be inapplicable due to numerical round-off;
304			* QR is slower but more robust; SVD is the slowest but most robust; LU is quite robust but
305			* slower than LDLt; LDLt offers a good tradeoff between robustness and speed
306			*/
307
308			issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK;
309			//issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
310			//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
311			#ifdef HAVE_PLASMA
312			//issolved=AX_EQ_B_PLASMA_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_PLASMA_CHOL;
313			#endif
314			//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
315			//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int ()(LM_REAL A, LM_REAL B, LM_REAL x, int m))AX_EQ_B_QRLS;
316			//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
317
318			#else
319			/* use the LU included with levmar */
320	40501		issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
	20895
	19606
321			#endif /* HAVE_LAPACK */
322
323	40501	50	if(issolved){
	20895	50
	19606
324			/* compute p's new estimate and \|\|Dp\|\|^2 */
325	121563	100	for(i=0, Dp_L2=0.0; i
	62745	100
	58818
326	81062		pDp[i]=p[i] + (tmp=Dp[i]);
	41850
	39212
327	81062		Dp_L2+=tmp*tmp;
	41850
	39212
328			}
329			//Dp_L2=sqrt(Dp_L2);
330
331	40501	100	if(Dp_L2<=eps2_sqp_L2){ / relative change in p is small, stop */
	20895	100
	19606
332			//if(Dp_L2<=eps2(p_L2 + eps2)){ / relative change in p is small, stop */
333	6		stop=2;
	4
	2
334	6		break;
	4
	2
335			}
336
337	40495	50	if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)LM_CNST(EPSILON))){ / almost singular */
	20891	50
	19604
338			//if(Dp_L2>=(p_L2+eps2)/LM_CNST(EPSILON)){ /* almost singular */
339	0		stop=4;
	0
	0
340	0		break;
	0
	0
341			}
342
343	40495		(func)(pDp, hx, m, n, adata); ++nfev; / evaluate function at p + Dp */
	20891
	19604
344			/* compute \|\|e(pDp)\|\|_2 */
345			/* ### hx=x-hx, pDp_eL2=\|\|hx\|\| */
346			#if 1
347	40495		pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n);
	20891
	19604
348			#else
349			for(i=0, pDp_eL2=0.0; i
350			hx[i]=tmp=x[i]-hx[i];
351			pDp_eL2+=tmp*tmp;
352			}
353			#endif
354	40495	100	if(!LM_FINITE(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
	20891	50
	19604
355			* This check makes sure that the inner loop does not run indefinitely.
356			* Thanks to Steve Danauskas for reporting such cases
357			*/
358	12		stop=7;
	12
	0
359	12		break;
	12
	0
360			}
361
362	121509	100	for(i=0, dL=0.0; i
	62697	100
	58812
363	81026		dL+=Dp[i](muDp[i]+jacTe[i]);
	41818
	39208
364
365	40483		dF=p_eL2-pDp_eL2;
	20879
	19604
366
367	40483	50	if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
	20879	100
	19604	50
		100
368	30859		tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0));
	15833
	15026
369	30859		tmp=LM_CNST(1.0)-tmptmptmp;
	15833
	15026
370	30859	100	mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) );
	15833	100
	15026
371	30859		nu=2;
	15833
	15026
372
373	92637	100	for(i=0 ; i
	47559	100
	45078
374	61778		p[i]=pDp[i];
	31726
	30052
375
376	1571075	100	for(i=0; i
	1525389	100
	45686
377	1540216		e[i]=hx[i];
	1509556
	30660
378	30859		p_eL2=pDp_eL2;
	15833
	15026
379	30859		break;
	15833
	15026
380			}
381			}
382
383			/* if this point is reached, either the linear system could not be solved or
384			* the error did not reduce; in any case, the increment must be rejected
385			*/
386
387	9624		mu*=nu;
	5046
	4578
388	9624		nu2=nu<<1; // 2*nu;
	5046
	4578
389	9624	50	if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
	5046	50
	4578
390	0		stop=5;
	0
	0
391	0		break;
	0
	0
392			}
393	9624		nu=nu2;
	5046
	4578
394
395	28872	100	for(i=0; i
	15138	100
	13734
396	19248		jacTjac[i*m+i]=diag_jacTjac[i];
	10092
	9156
397			} /* inner loop */
398			}
399
400	125	100	if(k>=itmax) stop=3;
	119	100
	6
401
402	387	100	for(i=0; i
	369	100
	18
403	262		jacTjac[i*m+i]=diag_jacTjac[i];
	250
	12
404
405	125	50	if(info){
	119	50
	6
406	125		info[0]=init_p_eL2;
	119
	6
407	125		info[1]=p_eL2;
	119
	6
408	125		info[2]=jacTe_inf;
	119
	6
409	125		info[3]=Dp_L2;
	119
	6
410	387	100	for(i=0, tmp=LM_REAL_MIN; i
	369	100
	18
411	262	100	if(tmp
	250	100
	12
412	125		info[4]=mu/tmp;
	119
	6
413	125		info[5]=(LM_REAL)k;
	119
	6
414	125		info[6]=(LM_REAL)stop;
	119
	6
415	125		info[7]=(LM_REAL)nfev;
	119
	6
416	125		info[8]=(LM_REAL)njev;
	119
	6
417	125		info[9]=(LM_REAL)nlss;
	119
	6
418			}
419
420			/* covariance matrix */
421	125	50	if(covar){
	119	50
	6
422	125		LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
	119
	6
423			}
424
425	125	50	if(freework) free(work);
	119	50
	6
426
427			#ifdef LINSOLVERS_RETAIN_MEMORY
428	125	50	if(linsolver) (*linsolver)(NULL, NULL, NULL, 0);
	119	50
	6
429			#endif
430
431	125	50	return (stop!=4 && stop!=7)? k : LM_ERROR;
	119	100
	6	50
		50
432			}
433
434
435			/* Secant version of the LEVMAR_DER() function above: the Jacobian is approximated with
436			* the aid of finite differences (forward or central, see the comment for the opts argument)
437			*/
438	18		int LEVMAR_DIF(
439			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
440			LM_REAL p, / I/O: initial parameter estimates. On output has the estimated solution */
441			LM_REAL x, / I: measurement vector. NULL implies a zero vector */
442			int m, /* I: parameter vector dimension (i.e. #unknowns) */
443			int n, /* I: measurement vector dimension */
444			int itmax, /* I: maximum number of iterations */
445			LM_REAL opts[5], /* I: opts[0-4] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the
446			* scale factor for initial \mu, stopping thresholds for \|\|J^T e\|\|_inf, \|\|Dp\|\|_2 and \|\|e\|\|_2 and
447			* the step used in difference approximation to the Jacobian. Set to NULL for defaults to be used.
448			* If \delta<0, the Jacobian is approximated with central differences which are more accurate
449			* (but slower!) compared to the forward differences employed by default.
450			*/
451			LM_REAL info[LM_INFO_SZ],
452			/* O: information regarding the minimization. Set to NULL if don't care
453			* info[0]= \|\|e\|\|_2 at initial p.
454			* info[1-4]=[ \|\|e\|\|_2, \|\|J^T e\|\|_inf, \|\|Dp\|\|_2, mu/max[J^T J]_ii ], all computed at estimated p.
455			* info[5]= # iterations,
456			* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
457			* 2 - stopped by small Dp
458			* 3 - stopped by itmax
459			* 4 - singular matrix. Restart from current p with increased mu
460			* 5 - no further error reduction is possible. Restart with increased mu
461			* 6 - stopped by small \|\|e\|\|_2
462			* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
463			* info[7]= # function evaluations
464			* info[8]= # Jacobian evaluations
465			* info[9]= # linear systems solved, i.e. # attempts for reducing error
466			*/
467			LM_REAL work, / working memory at least LM_DIF_WORKSZ() reals large, allocated if NULL */
468			LM_REAL covar, / O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
469			void adata) / pointer to possibly additional data, passed uninterpreted to func.
470			* Set to NULL if not needed
471			*/
472			{
473			register int i, j, k, l;
474	18		int worksz, freework=0, issolved;
475			/* temp work arrays */
476			LM_REAL e, / nx1 */
477			hx, / \hat{x}_i, nx1 */
478			jacTe, / J^T e_i mx1 */
479			jac, / nxm */
480			jacTjac, / mxm */
481			Dp, / mx1 */
482			diag_jacTjac, / diagonal of J^T J, mx1 */
483			pDp, / p + Dp, mx1 */
484			wrk, / nx1 */
485			wrk2; / nx1, used only for holding a temporary e vector and when differentiating with central differences */
486
487	18		int using_ffdif=1;
488
489			register LM_REAL mu, /* damping constant */
490			tmp; /* mainly used in matrix & vector multiplications */
491			LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* \|\|e(p)\|\|_2, \|\|J^T e\|\|_inf, \|\|e(p+Dp)\|\|_2 */
492	18		LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
493			LM_REAL tau, eps1, eps2, eps2_sq, eps3, delta;
494			LM_REAL init_p_eL2;
495	18		int nu, nu2, stop=0, nfev, njap=0, nlss=0, K=(m>=10)? m: 10, updjac, updp=1, newjac;
496	18		const int nm=n*m;
497	18		int (linsolver)(LM_REAL A, LM_REAL B, LM_REAL x, int m)=NULL;
498
499	18		mu=jacTe_inf=p_L2=0.0; /* -Wall */
500	18		updjac=newjac=0; /* -Wall */
501
502	18		if(n
503	0		fprintf(stderr, LCAT(LEVMAR_DIF, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m);
504	0		return LM_ERROR;
505			}
506
507	18		if(opts){
508	18		tau=opts[0];
509	18		eps1=opts[1];
510	18		eps2=opts[2];
511	18		eps2_sq=opts[2]*opts[2];
512	18		eps3=opts[3];
513	18		delta=opts[4];
514	18		if(delta<0.0){
515	0		delta=-delta; /* make positive */
516	0		using_ffdif=0; /* use central differencing */
517			}
518			}
519			else{ // use default values
520	0		tau=LM_CNST(LM_INIT_MU);
521	0		eps1=LM_CNST(LM_STOP_THRESH);
522	0		eps2=LM_CNST(LM_STOP_THRESH);
523	0		eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH);
524	0		eps3=LM_CNST(LM_STOP_THRESH);
525	0		delta=LM_CNST(LM_DIFF_DELTA);
526			}
527
528	18		if(!work){
529	0		worksz=LM_DIF_WORKSZ(m, n); //4n+4m + nm + mm;
530	0		work=(LM_REAL )malloc(workszsizeof(LM_REAL)); /* allocate a big chunk in one step */
531	0		if(!work){
532	0		fprintf(stderr, LCAT(LEVMAR_DIF, "(): memory allocation request failed\n"));
533	0		return LM_ERROR;
534			}
535	0		freework=1;
536			}
537
538			/* set up work arrays */
539	18		e=work;
540	18		hx=e + n;
541	18		jacTe=hx + n;
542	18		jac=jacTe + m;
543	18		jacTjac=jac + nm;
544	18		Dp=jacTjac + m*m;
545	18		diag_jacTjac=Dp + m;
546	18		pDp=diag_jacTjac + m;
547	18		wrk=pDp + m;
548	18		wrk2=wrk + n;
549
550			/* compute e=x - f(p) and its L2 norm */
551	18		(*func)(p, hx, m, n, adata); nfev=1;
552			/* ### e=x-hx, p_eL2=\|\|e\|\| */
553			#if 1
554	18		p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n);
555			#else
556			for(i=0, p_eL2=0.0; i
557			e[i]=tmp=x[i]-hx[i];
558			p_eL2+=tmp*tmp;
559			}
560			#endif
561	18		init_p_eL2=p_eL2;
562	18		if(!LM_FINITE(p_eL2)) stop=7;
563
564	18		nu=20; /* force computation of J */
565
566	1332		for(k=0; k
567			/* Note that p and e have been updated at a previous iteration */
568
569	1332		if(p_eL2<=eps3){ /* error is small */
570	13		stop=6;
571	13		break;
572			}
573
574			/* Compute the Jacobian J at p, J^T J, J^T e, \|\|J^T e\|\|_inf and \|\|p\|\|^2.
575			* The symmetry of J^T J is again exploited for speed
576			*/
577
578	1319		if((updp && nu>16) \|\| updjac==K){ /* compute difference approximation to J */
579	137		if(using_ffdif){ /* use forward differences */
580	137		LEVMAR_FDIF_FORW_JAC_APPROX(func, p, hx, wrk, delta, jac, m, n, adata);
581	137		++njap; nfev+=m;
582			}
583			else{ /* use central differences */
584	0		LEVMAR_FDIF_CENT_JAC_APPROX(func, p, wrk, wrk2, delta, jac, m, n, adata);
585	0		++njap; nfev+=2*m;
586			}
587	137		nu=2; updjac=0; updp=0; newjac=1;
588			}
589
590	1319		if(newjac){ /* Jacobian has changed, recompute J^T J, J^t e, etc */
591	1262		newjac=0;
592
593			/* J^T J, J^T e */
594	1262		if(nm<=__BLOCKSZ__SQ){ // this is a small problem
595			/* J^TJ_ij = \sum_l J^T_il J_lj = \sum_l J_li * J_lj.
596			* Thus, the product J^T J can be computed using an outer loop for
597			* l that adds J_li*J_lj to each element ij of the result. Note that
598			* with this scheme, the accesses to J and JtJ are always along rows,
598			* therefore induces fewer cache misses compared to the straightforward
600			* algorithm for computing the product (i.e., l loop is innermost one).
601			* A similar scheme applies to the computation of J^T e.
602			* However, for large minimization problems (i.e., involving a large number
603			* of unknowns and measurements) for which J/J^T J rows are too large to
603			* fit in the L1 cache, even this scheme incurs many cache misses. In
605			* such cases, a cache-efficient blocking scheme is preferable.
606			*
607			* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
608			* performance problem.
609			*
610			* Note that the non-blocking algorithm is faster on small
611			* problems since in this case it avoids the overheads of blocking.
612			*/
613			register int l;
614			register LM_REAL alpha, jaclm, jacTjacim;
615
616			/* looping downwards saves a few computations */
617	5745		for(i=m*m; i-->0; )
618	4596		jacTjac[i]=0.0;
619	3447		for(i=m; i-->0; )
620	2298		jacTe[i]=0.0;
621
622	3568		for(l=n; l-->0; ){
623	2419		jaclm=jac+l*m;
624	7257		for(i=m; i-->0; ){
625	4838		jacTjacim=jacTjac+i*m;
626	4838		alpha=jaclm[i]; //jac[l*m+i];
627	12095		for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
628	7257		jacTjacim[j]+=jaclm[j]alpha; //jacTjac[im+j]+=jac[lm+j]alpha
629
630			/* J^T e */
631	4838		jacTe[i]+=alpha*e[l];
632			}
633			}
634
635	3447		for(i=m; i-->0; ) /* copy to upper part */
636	3447		for(j=i+1; j
637	1149		jacTjac[im+j]=jacTjac[jm+i];
638			}
639			else{ // this is a large problem
640			/* Cache efficient computation of J^T J based on blocking
641			*/
642	113		LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m);
643
644			/* cache efficient computation of J^T e */
645	389		for(i=0; i
646	276		jacTe[i]=0.0;
647
648	919339		for(i=0; i
649			register LM_REAL *jacrow;
650
651	3046904		for(l=0, jacrow=jac+i*m, tmp=e[i]; l
652	2127678		jacTe[l]+=jacrow[l]*tmp;
653			}
654			}
655
656			/* Compute \|\|J^T e\|\|_inf and \|\|p\|\|^2 */
657	3836		for(i=0, p_L2=jacTe_inf=0.0; i
658	2574		if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
659
660	2574		diag_jacTjac[i]=jacTjac[im+i]; / save diagonal entries so that augmentation can be later canceled */
661	2574		p_L2+=p[i]*p[i];
662			}
663			//p_L2=sqrt(p_L2);
664			}
665
666			#if 0
667			if(!(k%100)){
668			printf("Current estimate: ");
669			for(i=0; i
670			printf("%.9g ", p[i]);
671			printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
672			}
673			#endif
674
675			/* check for convergence */
676	1319		if((jacTe_inf <= eps1)){
677	0		Dp_L2=0.0; /* no increment for p in this case */
678	0		stop=1;
679	0		break;
680			}
681
682			/* compute initial damping factor */
683	1319		if(k==0){
684	59		for(i=0, tmp=LM_REAL_MIN; i
685	41		if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
686	18		mu=tau*tmp;
687			}
688
689			/* determine increment using adaptive damping */
690
691			/* augment normal equations */
692	4007		for(i=0; i
693	2688		jacTjac[i*m+i]+=mu;
694
695			/* solve augmented equations */
696			#ifdef HAVE_LAPACK
697			/* 7 alternatives are available: LU, Cholesky + Cholesky with PLASMA, LDLt, 2 variants of QR decomposition and SVD.
698			* For matrices with dimensions of at least a few hundreds, the PLASMA implementation of Cholesky is the fastest.
699			* From the serial solvers, Cholesky is the fastest but might occasionally be inapplicable due to numerical round-off;
700			* QR is slower but more robust; SVD is the slowest but most robust; LU is quite robust but
701			* slower than LDLt; LDLt offers a good tradeoff between robustness and speed
702			*/
703
704			issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK;
705			//issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
706			//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
707			#ifdef HAVE_PLASMA
708			//issolved=AX_EQ_B_PLASMA_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_PLASMA_CHOL;
709			#endif
710			//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
711			//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int ()(LM_REAL A, LM_REAL B, LM_REAL x, int m))AX_EQ_B_QRLS;
712			//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
713			#else
714			/* use the LU included with levmar */
715	1319		issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
716			#endif /* HAVE_LAPACK */
717
718	1319		if(issolved){
719			/* compute p's new estimate and \|\|Dp\|\|^2 */
720	4007		for(i=0, Dp_L2=0.0; i
721	2688		pDp[i]=p[i] + (tmp=Dp[i]);
722	2688		Dp_L2+=tmp*tmp;
723			}
724			//Dp_L2=sqrt(Dp_L2);
725
726	1319		if(Dp_L2<=eps2_sqp_L2){ / relative change in p is small, stop */
727			//if(Dp_L2<=eps2(p_L2 + eps2)){ / relative change in p is small, stop */
728	5		stop=2;
729	5		break;
730			}
731
732	1314		if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)LM_CNST(EPSILON))){ / almost singular */
733			//if(Dp_L2>=(p_L2+eps2)/LM_CNST(EPSILON)){ /* almost singular */
734	0		stop=4;
735	0		break;
736			}
737
738	1314		(func)(pDp, wrk, m, n, adata); ++nfev; / evaluate function at p + Dp */
739			/* compute \|\|e(pDp)\|\|_2 */
740			/* ### wrk2=x-wrk, pDp_eL2=\|\|wrk2\|\| */
741			#if 1
742	1314		pDp_eL2=LEVMAR_L2NRMXMY(wrk2, x, wrk, n);
743			#else
744			for(i=0, pDp_eL2=0.0; i
745			wrk2[i]=tmp=x[i]-wrk[i];
746			pDp_eL2+=tmp*tmp;
747			}
748			#endif
749	1314		if(!LM_FINITE(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
750			* This check makes sure that the loop terminates early in the case
751			* of invalid input. Thanks to Steve Danauskas for suggesting it
752			*/
753
754	0		stop=7;
755	0		break;
756			}
757
758	1314		dF=p_eL2-pDp_eL2;
759	1314		if(updp \|\| dF>0){ /* update jac */
760	922887		for(i=0; i
761	3054116		for(l=0, tmp=0.0; l
762	2132486		tmp+=jac[im+l]Dp[l]; /* (J * Dp)[i] */
763	921630		tmp=(wrk[i] - hx[i] - tmp)/Dp_L2; /* (f(p+dp)[i] - f(p)[i] - (J * Dp)[i])/(dp^Tdp) /
764	3054116		for(j=0; j
765	2132486		jac[im+j]+=tmpDp[j];
766			}
767	1257		++updjac;
768	1257		newjac=1;
769			}
770
771	3992		for(i=0, dL=0.0; i
772	2678		dL+=Dp[i](muDp[i]+jacTe[i]);
773
774	1314		if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
775	882		tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0));
776	882		tmp=LM_CNST(1.0)-tmptmptmp;
777	882		mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) );
778	882		nu=2;
779
780	2696		for(i=0 ; i
781	1814		p[i]=pDp[i];
782
783	821747		for(i=0; i
784	820865		e[i]=wrk2[i]; //x[i]-wrk[i];
785	820865		hx[i]=wrk[i];
786			}
787	882		p_eL2=pDp_eL2;
788	882		updp=1;
789	882		continue;
790			}
791			}
792
793			/* if this point is reached, either the linear system could not be solved or
794			* the error did not reduce; in any case, the increment must be rejected
795			*/
796
797	432		mu*=nu;
798	432		nu2=nu<<1; // 2*nu;
799	432		if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
800	0		stop=5;
801	0		break;
802			}
803	432		nu=nu2;
804
805	1296		for(i=0; i
806	864		jacTjac[i*m+i]=diag_jacTjac[i];
807			}
808
809	18		if(k>=itmax) stop=3;
810
811	59		for(i=0; i
812	41		jacTjac[i*m+i]=diag_jacTjac[i];
813
814	18		if(info){
815	18		info[0]=init_p_eL2;
816	18		info[1]=p_eL2;
817	18		info[2]=jacTe_inf;
818	18		info[3]=Dp_L2;
819	59		for(i=0, tmp=LM_REAL_MIN; i
820	41		if(tmp
821	18		info[4]=mu/tmp;
822	18		info[5]=(LM_REAL)k;
823	18		info[6]=(LM_REAL)stop;
824	18		info[7]=(LM_REAL)nfev;
825	18		info[8]=(LM_REAL)njap;
826	18		info[9]=(LM_REAL)nlss;
827			}
828
829			/* covariance matrix */
830	18		if(covar){
831	18		LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
832			}
833
834
835	18		if(freework) free(work);
836
837			#ifdef LINSOLVERS_RETAIN_MEMORY
838	18		if(linsolver) (*linsolver)(NULL, NULL, NULL, 0);
839			#endif
840
841	18		return (stop!=4 && stop!=7)? k : LM_ERROR;
842			}
843
844			/* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */
845			#undef LEVMAR_DER
846			#undef LEVMAR_DIF
847			#undef LEVMAR_FDIF_FORW_JAC_APPROX
848			#undef LEVMAR_FDIF_CENT_JAC_APPROX
849			#undef LEVMAR_COVAR
850			#undef LEVMAR_TRANS_MAT_MAT_MULT
851			#undef LEVMAR_L2NRMXMY
852			#undef AX_EQ_B_LU
853			#undef AX_EQ_B_CHOL
854			#undef AX_EQ_B_PLASMA_CHOL
855			#undef AX_EQ_B_QR
856			#undef AX_EQ_B_QRLS
857			#undef AX_EQ_B_SVD
858			#undef AX_EQ_B_BK