File Coverage

levmar-2.6/lmbc_core.c

Criterion	Covered	Total	%
statement	779	1187	65.6
branch	333	680	48.9
condition			n/a
subroutine			n/a
pod			n/a
total	1112	1867	59.5

line	stmt	bran	code
1			/////////////////////////////////////////////////////////////////////////////////
2			//
3			// Levenberg - Marquardt non-linear minimization algorithm
4			// Copyright (C) 2004-05 Manolis Lourakis (lourakis at ics forth gr)
5			// Institute of Computer Science, Foundation for Research & Technology - Hellas
6			// Heraklion, Crete, Greece.
7			//
8			// This program is free software; you can redistribute it and/or modify
9			// it under the terms of the GNU General Public License as published by
10			// the Free Software Foundation; either version 2 of the License, or
11			// (at your option) any later version.
12			//
13			// This program is distributed in the hope that it will be useful,
14			// but WITHOUT ANY WARRANTY; without even the implied warranty of
15			// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16			// GNU General Public License for more details.
17			//
18			/////////////////////////////////////////////////////////////////////////////////
19
20			#ifndef LM_REAL // not included by lmbc.c
21			#error This file should not be compiled directly!
22			#endif
23
24
25			/* precision-specific definitions */
26			#define FUNC_STATE LM_ADD_PREFIX(func_state)
27			#define LNSRCH LM_ADD_PREFIX(lnsrch)
28			#define BOXPROJECT LM_ADD_PREFIX(boxProject)
29			#define BOXSCALE LM_ADD_PREFIX(boxScale)
30			#define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check)
31			#define VECNORM LM_ADD_PREFIX(vecnorm)
32			#define LEVMAR_BC_DER LM_ADD_PREFIX(levmar_bc_der)
33			#define LEVMAR_BC_DIF LM_ADD_PREFIX(levmar_bc_dif)
34			#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
35			#define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx)
36			#define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult)
37			#define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy)
38			#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
39			#define LMBC_DIF_DATA LM_ADD_PREFIX(lmbc_dif_data)
40			#define LMBC_DIF_FUNC LM_ADD_PREFIX(lmbc_dif_func)
41			#define LMBC_DIF_JACF LM_ADD_PREFIX(lmbc_dif_jacf)
42
43			#ifdef HAVE_LAPACK
44			#define AX_EQ_B_LU LM_ADD_PREFIX(Ax_eq_b_LU)
45			#define AX_EQ_B_CHOL LM_ADD_PREFIX(Ax_eq_b_Chol)
46			#define AX_EQ_B_QR LM_ADD_PREFIX(Ax_eq_b_QR)
47			#define AX_EQ_B_QRLS LM_ADD_PREFIX(Ax_eq_b_QRLS)
48			#define AX_EQ_B_SVD LM_ADD_PREFIX(Ax_eq_b_SVD)
49			#define AX_EQ_B_BK LM_ADD_PREFIX(Ax_eq_b_BK)
50			#else
51			#define AX_EQ_B_LU LM_ADD_PREFIX(Ax_eq_b_LU_noLapack)
52			#endif /* HAVE_LAPACK */
53
54			#ifdef HAVE_PLASMA
55			#define AX_EQ_B_PLASMA_CHOL LM_ADD_PREFIX(Ax_eq_b_PLASMA_Chol)
56			#endif
57
58			/* find the median of 3 numbers */
59			#define __MEDIAN3(a, b, c) ( ((a) >= (b))?\
60			( ((c) >= (a))? (a) : ( ((c) <= (b))? (b) : (c) ) ) : \
61			( ((c) >= (b))? (b) : ( ((c) <= (a))? (a) : (c) ) ) )
62
63			/* Projections to feasible set \Omega: P_{\Omega}(y) := arg min { \|\|x - y\|\| : x \in \Omega}, y \in R^m */
64
65			/* project vector p to a box shaped feasible set. p is a mx1 vector.
66			* Either lb, ub can be NULL. If not NULL, they are mx1 vectors
67			*/
68	190812		static void BOXPROJECT(LM_REAL p, LM_REAL lb, LM_REAL *ub, int m)
	138319
	52493
69			{
70			register int i;
71
72	190812	50	if(!lb){ /* no lower bounds */
	138319	50
	52493
73	0	0	if(!ub) /* no upper bounds */
	0	0
	0
74	0		return;
	0
	0
75			else{ /* upper bounds only */
76	0	0	for(i=m; i-->0; )
	0	0
	0
77	0	0	if(p[i]>ub[i]) p[i]=ub[i];
	0	0
	0
78			}
79			}
80			else
81	190812	50	if(!ub){ /* lower bounds only */
	138319	50
	52493
82	0	0	for(i=m; i-->0; )
	0	0
	0
83	0	0	if(p[i]
	0	0
	0
84			}
85			else /* box bounds */
86	954046	100	for(i=m; i-->0; )
	691581	100
	262465
87	763234	100	p[i]=__MEDIAN3(lb[i], p[i], ub[i]);
	553262	50
	209972	0
		100
		50
		50
		0
		0
		100
		50
88			}
89			#undef __MEDIAN3
90
91			/* pointwise scaling of bounds with the mx1 vector scl. If div=1 scaling is by 1./scl.
92			* Either lb, ub can be NULL. If not NULL, they are mx1 vectors
93			*/
94	0		static void BOXSCALE(LM_REAL lb, LM_REAL ub, LM_REAL *scl, int m, int div)
	0
	0
95			{
96			register int i;
97
98	0	0	if(!lb){ /* no lower bounds */
	0	0
	0
99	0	0	if(!ub) /* no upper bounds */
	0	0
	0
100	0		return;
	0
	0
101			else{ /* upper bounds only */
102	0	0	if(div){
	0	0
	0
103	0	0	for(i=m; i-->0; )
	0	0
	0
104	0	0	if(ub[i]!=LM_REAL_MAX)
	0	0
	0
105	0		ub[i]=ub[i]/scl[i];
	0
	0
106			}else{
107	0	0	for(i=m; i-->0; )
	0	0
	0
108	0	0	if(ub[i]!=LM_REAL_MAX)
	0	0
	0
109	0		ub[i]=ub[i]*scl[i];
	0
	0
110			}
111			}
112			}
113			else
114	0	0	if(!ub){ /* lower bounds only */
	0	0
	0
115	0	0	if(div){
	0	0
	0
116	0	0	for(i=m; i-->0; )
	0	0
	0
117	0	0	if(lb[i]!=LM_REAL_MIN)
	0	0
	0
118	0		lb[i]=lb[i]/scl[i];
	0
	0
119			}else{
120	0	0	for(i=m; i-->0; )
	0	0
	0
121	0	0	if(lb[i]!=LM_REAL_MIN)
	0	0
	0
122	0		lb[i]=lb[i]*scl[i];
	0
	0
123			}
124			}
125			else{ /* box bounds */
126	0	0	if(div){
	0	0
	0
127	0	0	for(i=m; i-->0; ){
	0	0
	0
128	0	0	if(ub[i]!=LM_REAL_MAX)
	0	0
	0
129	0		ub[i]=ub[i]/scl[i];
	0
	0
130	0	0	if(lb[i]!=LM_REAL_MIN)
	0	0
	0
131	0		lb[i]=lb[i]/scl[i];
	0
	0
132			}
133			}else{
134	0	0	for(i=m; i-->0; ){
	0	0
	0
135	0	0	if(ub[i]!=LM_REAL_MAX)
	0	0
	0
136	0		ub[i]=ub[i]*scl[i];
	0
	0
137	0	0	if(lb[i]!=LM_REAL_MIN)
	0	0
	0
138	0		lb[i]=lb[i]*scl[i];
	0
	0
139			}
140			}
141			}
142			}
143
144			/* compute the norm of a vector in a manner that avoids overflows
145			*/
146	0		static LM_REAL VECNORM(LM_REAL *x, int n)
	0
	0
147			{
148			#ifdef HAVE_LAPACK
149			#define NRM2 LM_MK_BLAS_NAME(nrm2)
150			extern LM_REAL NRM2(int n, LM_REAL dx, int *incx);
151			int one=1;
152
153			return NRM2(&n, x, &one);
154			#undef NRM2
155			#else // no LAPACK, use the simple method described by Blue in TOMS78
156			register int i;
157			LM_REAL max, sum, tmp;
158
159	0	0	for(i=n, max=0.0; i-->0; )
	0	0
	0
160	0	0	if(x[i]>max) max=x[i];
	0	0
	0
161	0	0	else if(x[i]<-max) max=-x[i];
	0	0
	0
162
163	0	0	for(i=n, sum=0.0; i-->0; ){
	0	0
	0
164	0		tmp=x[i]/max;
	0
	0
165	0		sum+=tmp*tmp;
	0
	0
166			}
167
168	0		return max*(LM_REAL)sqrt(sum);
	0
	0
169			#endif /* HAVE_LAPACK */
170			}
171
172			struct FUNC_STATE{
173			int n, *nfev;
174			LM_REAL hx, x;
175			LM_REAL lb, ub;
176			void *adata;
177			};
178
179			static void
180	2757		LNSRCH(int m, LM_REAL x, LM_REAL f, LM_REAL g, LM_REAL p, LM_REAL alpha, LM_REAL xpls,
	2534
	223
181			LM_REAL ffpls, void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata), struct FUNC_STATE state,
182			int mxtake, int iretcd, LM_REAL stepmx, LM_REAL steptl, LM_REAL *sx)
183			{
184			/* Find a next newton iterate by backtracking line search.
185			* Specifically, finds a \lambda such that for a fixed alpha<0.5 (usually 1e-4),
186			* f(x + \lambdap) <= f(x) + alpha \lambda * g^T*p
187			*
188			* Translated (with a few changes) from Schnabel, Koontz & Weiss uncmin.f, v1.3
189			* Main changes include the addition of box projection and modification of the scaling
190			* logic since uncmin.f operates in the original (unscaled) variable space.
191
192			* PARAMETERS :
193
194			* m --> dimension of problem (i.e. number of variables)
195			* x(m) --> old iterate: x[k-1]
196			* f --> function value at old iterate, f(x)
197			* g(m) --> gradient at old iterate, g(x), or approximate
198			* p(m) --> non-zero newton step
199			* alpha --> fixed constant < 0.5 for line search (see above)
200			* xpls(m) <-- new iterate x[k]
201			* ffpls <-- function value at new iterate, f(xpls)
202			* func --> name of subroutine to evaluate function
203			* state <--> information other than x and m that func requires.
204			* state is not modified in xlnsrch (but can be modified by func).
205			* iretcd <-- return code
206			* mxtake <-- boolean flag indicating step of maximum length used
207			* stepmx --> maximum allowable step size
208			* steptl --> relative step size at which successive iterates
209			* considered close enough to terminate algorithm
210			* sx(m) --> diagonal scaling matrix for x, can be NULL
211
212			* internal variables
213
214			* sln newton length
215			* rln relative length of newton step
216			*/
217
218			register int i, j;
219	2757		int firstback = 1;
	2534
	223
220			LM_REAL disc;
221			LM_REAL a3, b;
222			LM_REAL t1, t2, t3, lambda, tlmbda, rmnlmb;
223			LM_REAL scl, rln, sln, slp;
224			LM_REAL tmp1, tmp2;
225	2757		LM_REAL fpls, pfpls = 0., plmbda = 0.; /* -Wall */
	2534
	223
226
227	2757		f*=LM_CNST(0.5);
	2534
	223
228	2757		*mxtake = 0;
	2534
	223
229	2757		*iretcd = 2;
	2534
	223
230	2757		tmp1 = 0.;
	2534
	223
231	13785	100	for (i = m; i-- > 0; )
	12670	100
	1115
232	11028		tmp1 += p[i] * p[i];
	10136
	892
233	2757		sln = (LM_REAL)sqrt(tmp1);
	2534
	223
234	2757	50	if (sln > stepmx) {
	2534	50
	223
235			/* newton step longer than maximum allowed */
236	0		scl = stepmx / sln;
	0
	0
237	0	0	for (i = m; i-- > 0; ) /* p * scl */
	0	0
	0
238	0		p[i]*=scl;
	0
	0
239	0		sln = stepmx;
	0
	0
240			}
241	13785	100	for (i = m, slp = rln = 0.; i-- > 0; ){
	12670	100
	1115
242	11028		slp+=g[i]p[i]; / g^T * p */
	10136
	892
243
244	11028	50	tmp1 = (FABS(x[i])>=LM_CNST(1.))? FABS(x[i]) : LM_CNST(1.);
	10136	50
	892	0
		50
		50
		0
245	11028	100	tmp2 = FABS(p[i])/tmp1;
	10136	100
	892
246	11028	100	if(rln < tmp2) rln = tmp2;
	10136	100
	892
247			}
248	2757		rmnlmb = steptl / rln;
	2534
	223
249	2757		lambda = LM_CNST(1.0);
	2534
	223
250
251			/* check if new iterate satisfactory. generate new lambda if necessary. */
252
253	13336	50	for(j = _LSITMAX_; j-- > 0; ) {
	12915	50
	421
254	66680	100	for (i = m; i-- > 0; )
	64575	100
	2105
255	53344		xpls[i] = x[i] + lambda * p[i];
	51660
	1684
256	13336		BOXPROJECT(xpls, state->lb, state->ub, m); /* project to feasible set */
	12915
	421
257
258			/* evaluate function at new point */
259	13336	50	if(!sx){
	12915	50
	421
260	13336		(func)(xpls, state->hx, m, state->n, state->adata); ++((state->nfev));
	12915
	421
261			}
262			else{
263	0	0	for (i = m; i-- > 0; ) xpls[i] *= sx[i];
	0	0
	0
264	0		(func)(xpls, state->hx, m, state->n, state->adata); ++((state->nfev));
	0
	0
265	0	0	for (i = m; i-- > 0; ) xpls[i] /= sx[i];
	0	0
	0
266			}
267			/* ### state->hx=state->x-state->hx, tmp1=\|\|state->hx\|\| */
268			#if 1
269	13336		tmp1=LEVMAR_L2NRMXMY(state->hx, state->x, state->hx, state->n);
	12915
	421
270			#else
271			for(i=0, tmp1=0.0; in; ++i){
272			state->hx[i]=tmp2=state->x[i]-state->hx[i];
273			tmp1+=tmp2*tmp2;
274			}
275			#endif
276	13336		fpls=LM_CNST(0.5)tmp1; ffpls=tmp1;
	12915
	421
277
278	13336	100	if (fpls <= f + slp * alpha * lambda) { /* solution found */
	12915	100
	421
279	2139		*iretcd = 0;
	2109
	30
280	2139	50	if (lambda == LM_CNST(1.) && sln > stepmx * LM_CNST(.99)) *mxtake = 1;
	2109	0
	30	50
		0
281	2139		return;
	2109
	30
282			}
283
284			/* else : solution not (yet) found */
285
286			/* First find a point with a finite value */
287
288	11197	100	if (lambda < rmnlmb) {
	10806	100
	391
289			/* no satisfactory xpls found sufficiently distinct from x */
290
291	618		*iretcd = 1;
	425
	193
292	618		return;
	425
	193
293			}
294			else { /* calculate new lambda */
295
296			/* modifications to cover non-finite values */
297	10579	50	if (!LM_FINITE(fpls)) {
	10381	50
	198
298	0		lambda *= LM_CNST(0.1);
	0
	0
299	0		firstback = 1;
	0
	0
300			}
301			else {
302	10579	100	if (firstback) { /* first backtrack: quadratic fit */
	10381	50
	198
303	2732		tlmbda = -lambda * slp / ((fpls - f - slp) * LM_CNST(2.));
	2534
	198
304	2732		firstback = 0;
	2534
	198
305			}
306			else { /* all subsequent backtracks: cubic fit */
307	7847		t1 = fpls - f - lambda * slp;
	7847
	0
308	7847		t2 = pfpls - f - plmbda * slp;
	7847
	0
309	7847		t3 = LM_CNST(1.) / (lambda - plmbda);
	7847
	0
310	7847		a3 = LM_CNST(3.) * t3 * (t1 / (lambda * lambda)
	7847
	0
311	7847		- t2 / (plmbda * plmbda));
	7847
	0
312	7847		b = t3 * (t2 * lambda / (plmbda * plmbda)
	7847
	0
313	7847		- t1 * plmbda / (lambda * lambda));
	7847
	0
314	7847		disc = b * b - a3 * slp;
	7847
	0
315	7847	100	if (disc > b * b)
	7847	0
	0
316			/* only one positive critical point, must be minimum */
317	8	50	tlmbda = (-b + ((a3 < 0)? -(LM_REAL)sqrt(disc): (LM_REAL)sqrt(disc))) /a3;
	8	0
	0
318			else
319			/* both critical points positive, first is minimum */
320	7839	50	tlmbda = (-b + ((a3 < 0)? (LM_REAL)sqrt(disc): -(LM_REAL)sqrt(disc))) /a3;
	7839	0
	0
321
322	7847	50	if (tlmbda > lambda * LM_CNST(.5))
	7847	0
	0
323	0		tlmbda = lambda * LM_CNST(.5);
	0
	0
324			}
325	10579		plmbda = lambda;
	10381
	198
326	10579		pfpls = fpls;
	10381
	198
327	10579	100	if (tlmbda < lambda * LM_CNST(.1))
	10381	100
	198
328	9011		lambda *= LM_CNST(.1);
	8828
	183
329			else
330	1568		lambda = tlmbda;
	1553
	15
331			}
332			}
333			}
334			/* this point is reached when the iterations limit is exceeded */
335	0		iretcd = 1; / failed */
	0
	0
336	0		return;
	0
	0
337			} /* LNSRCH */
338
339			/*
340			* This function seeks the parameter vector p that best describes the measurements
341			* vector x under box constraints.
342			* More precisely, given a vector function func : R^m --> R^n with n>=m,
343			* it finds p s.t. func(p) ~= x, i.e. the squared second order (i.e. L2) norm of
344			* e=x-func(p) is minimized under the constraints lb[i]<=p[i]<=ub[i].
345			* If no lower bound constraint applies for p[i], use -DBL_MAX/-FLT_MAX for lb[i];
346			* If no upper bound constraint applies for p[i], use DBL_MAX/FLT_MAX for ub[i].
347			*
348			* This function requires an analytic Jacobian. In case the latter is unavailable,
349			* use LEVMAR_BC_DIF() bellow
350			*
351			* Returns the number of iterations (>=0) if successful, LM_ERROR if failed
352			*
353			* For details, see C. Kanzow, N. Yamashita and M. Fukushima: "Levenberg-Marquardt
354			* methods for constrained nonlinear equations with strong local convergence properties",
355			* Journal of Computational and Applied Mathematics 172, 2004, pp. 375-397.
356			* Also, see K. Madsen, H.B. Nielsen and O. Tingleff's lecture notes on
357			* unconstrained Levenberg-Marquardt at http://www.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf
358			*
359			* The algorithm implemented by this function employs projected gradient steps. Since steepest descent
360			* is very sensitive to poor scaling, diagonal scaling has been implemented through the dscl argument:
361			* Instead of minimizing f(p) for p, f(Dq) is minimized for q=D^-1p, D being a diagonal scaling
362			* matrix whose diagonal equals dscl (see Nocedal-Wright p.27). dscl should contain "typical" magnitudes
363			* for the parameters p. A NULL value for dscl implies no scaling. i.e. D=I.
364			* To account for scaling, the code divides the starting point and box bounds pointwise by dscl. Moreover,
365			* before calling func and jacf the scaling has to be undone (by multiplying), as should be done with
366			* the final point. Note also that jac_q=jac_p*D, where jac_q, jac_p are the jacobians w.r.t. q & p, resp.
367			*/
368
369	17		int LEVMAR_BC_DER(
	12
	5
370			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
371			void (jacf)(LM_REAL p, LM_REAL j, int m, int n, void adata), /* function to evaluate the Jacobian \part x / \part p */
372			LM_REAL p, / I/O: initial parameter estimates. On output has the estimated solution */
373			LM_REAL x, / I: measurement vector. NULL implies a zero vector */
374			int m, /* I: parameter vector dimension (i.e. #unknowns) */
375			int n, /* I: measurement vector dimension */
376			LM_REAL lb, / I: vector of lower bounds. If NULL, no lower bounds apply */
377			LM_REAL ub, / I: vector of upper bounds. If NULL, no upper bounds apply */
378			LM_REAL dscl, / I: diagonal scaling constants. NULL implies no scaling */
379			int itmax, /* I: maximum number of iterations */
380			LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
381			* stopping thresholds for \|\|J^T e\|\|_inf, \|\|Dp\|\|_2 and \|\|e\|\|_2. Set to NULL for defaults to be used.
382			* Note that \|\|J^T e\|\|_inf is computed on free (not equal to lb[i] or ub[i]) variables only.
383			*/
384			LM_REAL info[LM_INFO_SZ],
385			/* O: information regarding the minimization. Set to NULL if don't care
386			* info[0]= \|\|e\|\|_2 at initial p.
387			* info[1-4]=[ \|\|e\|\|_2, \|\|J^T e\|\|_inf, \|\|Dp\|\|_2, mu/max[J^T J]_ii ], all computed at estimated p.
388			* info[5]= # iterations,
389			* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
390			* 2 - stopped by small Dp
391			* 3 - stopped by itmax
392			* 4 - singular matrix. Restart from current p with increased mu
393			* 5 - no further error reduction is possible. Restart with increased mu
394			* 6 - stopped by small \|\|e\|\|_2
395			* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
396			* info[7]= # function evaluations
397			* info[8]= # Jacobian evaluations
398			* info[9]= # linear systems solved, i.e. # attempts for reducing error
399			*/
400			LM_REAL work, / working memory at least LM_BC_DER_WORKSZ() reals large, allocated if NULL */
401			LM_REAL covar, / O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
402			void adata) / pointer to possibly additional data, passed uninterpreted to func & jacf.
403			* Set to NULL if not needed
404			*/
405			{
406			register int i, j, k, l;
407	17		int worksz, freework=0, issolved;
	12
	5
408			/* temp work arrays */
409			LM_REAL e, / nx1 */
410			hx, / \hat{x}_i, nx1 */
411			jacTe, / J^T e_i mx1 */
412			jac, / nxm */
413			jacTjac, / mxm */
414			Dp, / mx1 */
415			diag_jacTjac, / diagonal of J^T J, mx1 */
416			pDp, / p + Dp, mx1 */
417	17		sp_pDp=NULL; / dsclp or dsclpDp, mx1 */
	12
	5
418
419			register LM_REAL mu, /* damping constant */
420			tmp; /* mainly used in matrix & vector multiplications */
421			LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* \|\|e(p)\|\|_2, \|\|J^T e\|\|_inf, \|\|e(p+Dp)\|\|_2 */
422	17		LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
	12
	5
423			LM_REAL tau, eps1, eps2, eps2_sq, eps3;
424			LM_REAL init_p_eL2;
425	17		int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
	12
	5
426	17		const int nm=n*m;
	12
	5
427
428			/* variables for constrained LM */
429			struct FUNC_STATE fstate;
430	17		LM_REAL alpha=LM_CNST(1e-4), beta=LM_CNST(0.9), gamma=LM_CNST(0.99995), rho=LM_CNST(1e-8);
	12
	5
431			LM_REAL t, t0, jacTeDp;
432	17		LM_REAL tmin=LM_CNST(1e-12), tming=LM_CNST(1e-18); /* minimum step length for LS and PG steps */
	12
	5
433	17		const LM_REAL tini=LM_CNST(1.0); /* initial step length for LS and PG steps */
	12
	5
434	17		int nLMsteps=0, nLSsteps=0, nPGsteps=0, gprevtaken=0;
	12
	5
435			int numactive;
436	17		int (linsolver)(LM_REAL A, LM_REAL B, LM_REAL x, int m)=NULL;
	12
	5
437
438	17		mu=jacTe_inf=t=0.0; tmin=tmin; /* -Wall */
	12
	5
439
440	17	50	if(n
	12	50
	5
441	0		fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m);
	0
	0
442	0		return LM_ERROR;
	0
	0
443			}
444
445	17	50	if(!jacf){
	12	50
	5
446	0		fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_BC_DER)
	0
	0
447			RCAT("().\nIf no such function is available, use ", LEVMAR_BC_DIF) RCAT("() rather than ", LEVMAR_BC_DER) "()\n");
448	0		return LM_ERROR;
	0
	0
449			}
450
451	17	50	if(!LEVMAR_BOX_CHECK(lb, ub, m)){
	12	50
	5
452	0		fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): at least one lower bound exceeds the upper one\n"));
	0
	0
453	0		return LM_ERROR;
	0
	0
454			}
455
456	17	50	if(dscl){ /* check that scaling consts are valid */
	12	50
	5
457	0	0	for(i=m; i-->0; )
	0	0
	0
458	0	0	if(dscl[i]<=0.0){
	0	0
	0
459	0		fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): scaling constants should be positive (scale %d: %g <= 0)\n"), i, dscl[i]);
	0
	0
460	0		return LM_ERROR;
	0
	0
461			}
462
463	0		sp_pDp=(LM_REAL )malloc(msizeof(LM_REAL));
	0
	0
464	0	0	if(!sp_pDp){
	0	0
	0
465	0		fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n"));
	0
	0
466	0		return LM_ERROR;
	0
	0
467			}
468			}
469
470	17	50	if(opts){
	12	50
	5
471	17		tau=opts[0];
	12
	5
472	17		eps1=opts[1];
	12
	5
473	17		eps2=opts[2];
	12
	5
474	17		eps2_sq=opts[2]*opts[2];
	12
	5
475	17		eps3=opts[3];
	12
	5
476			}
477			else{ // use default values
478	0		tau=LM_CNST(LM_INIT_MU);
	0
	0
479	0		eps1=LM_CNST(LM_STOP_THRESH);
	0
	0
480	0		eps2=LM_CNST(LM_STOP_THRESH);
	0
	0
481	0		eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH);
	0
	0
482	0		eps3=LM_CNST(LM_STOP_THRESH);
	0
	0
483			}
484
485	17	50	if(!work){
	12	50
	5
486	0		worksz=LM_BC_DER_WORKSZ(m, n); //2n+4m + nm + mm;
	0
	0
487	0		work=(LM_REAL )malloc(workszsizeof(LM_REAL)); /* allocate a big chunk in one step */
	0
	0
488	0	0	if(!work){
	0	0
	0
489	0		fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n"));
	0
	0
490	0		return LM_ERROR;
	0
	0
491			}
492	0		freework=1;
	0
	0
493			}
494
495			/* set up work arrays */
496	17		e=work;
	12
	5
497	17		hx=e + n;
	12
	5
498	17		jacTe=hx + n;
	12
	5
499	17		jac=jacTe + m;
	12
	5
500	17		jacTjac=jac + nm;
	12
	5
501	17		Dp=jacTjac + m*m;
	12
	5
502	17		diag_jacTjac=Dp + m;
	12
	5
503	17		pDp=diag_jacTjac + m;
	12
	5
504
505	17		fstate.n=n;
	12
	5
506	17		fstate.hx=hx;
	12
	5
507	17		fstate.x=x;
	12
	5
508	17		fstate.lb=lb;
	12
	5
509	17		fstate.ub=ub;
	12
	5
510	17		fstate.adata=adata;
	12
	5
511	17		fstate.nfev=&nfev;
	12
	5
512
513			/* see if starting point is within the feasible set */
514	82	100	for(i=0; i
	57	100
	25
515	65		pDp[i]=p[i];
	45
	20
516	17		BOXPROJECT(p, lb, ub, m); /* project to feasible set */
	12
	5
517	82	100	for(i=0; i
	57	100
	25
518	65	50	if(pDp[i]!=p[i])
	45	50
	20
519	0		fprintf(stderr, RCAT("Warning: component %d of starting point not feasible in ", LEVMAR_BC_DER) "()! [%g projected to %g]\n",
	0
	0
520	0		i, pDp[i], p[i]);
	0
	0
521
522			/* compute e=x - f(p) and its L2 norm */
523	17		(*func)(p, hx, m, n, adata); nfev=1;
	12
	5
524			/* ### e=x-hx, p_eL2=\|\|e\|\| */
525			#if 1
526	17		p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n);
	12
	5
527			#else
528			for(i=0, p_eL2=0.0; i
529			e[i]=tmp=x[i]-hx[i];
530			p_eL2+=tmp*tmp;
531			}
532			#endif
533	17		init_p_eL2=p_eL2;
	12
	5
534	17	50	if(!LM_FINITE(p_eL2)) stop=7;
	12	50
	5
535
536	17	50	if(dscl){
	12	50
	5
537			/* scale starting point and constraints */
538	0	0	for(i=m; i-->0; ) p[i]/=dscl[i];
	0	0
	0
539	0		BOXSCALE(lb, ub, dscl, m, 1);
	0
	0
540			}
541
542	60500	100	for(k=0; k
	45023	100
	15477	100
		100
543			/* Note that p and e have been updated at a previous iteration */
544
545	60485	100	if(p_eL2<=eps3){ /* error is small */
	45013	50
	15472
546	2		stop=6;
	2
	0
547	2		break;
	2
	0
548			}
549
550			/* Compute the Jacobian J at p, J^T J, J^T e, \|\|J^T e\|\|_inf and \|\|p\|\|^2.
551			* Since J^T J is symmetric, its computation can be sped up by computing
552			* only its upper triangular part and copying it to the lower part
553			*/
554
555	60483	50	if(!dscl){
	45011	50
	15472
556	60483		(*jacf)(p, jac, m, n, adata); ++njev;
	45011
	15472
557			}
558			else{
559	0	0	for(i=m; i-->0; ) sp_pDp[i]=p[i]*dscl[i];
	0	0
	0
560	0		(*jacf)(sp_pDp, jac, m, n, adata); ++njev;
	0
	0
561
562			/* compute jacD /
563	0	0	for(i=n; i-->0; ){
	0	0
	0
564			register LM_REAL *jacim;
565
566	0		jacim=jac+i*m;
	0
	0
567	0	0	for(j=m; j-->0; )
	0	0
	0
568	0		jacim[j]=dscl[j]; // jac[im+j]*=dscl[j];
	0
	0
569			}
570			}
571
572			/* J^T J, J^T e */
573	60483	100	if(nm<__BLOCKSZ__SQ){ // this is a small problem
	45011	50
	15472
574			/* J^TJ_ij = \sum_l J^T_il J_lj = \sum_l J_li * J_lj.
575			* Thus, the product J^T J can be computed using an outer loop for
576			* l that adds J_li*J_lj to each element ij of the result. Note that
577			* with this scheme, the accesses to J and JtJ are always along rows,
578			* therefore induces less cache misses compared to the straightforward
579			* algorithm for computing the product (i.e., l loop is innermost one).
580			* A similar scheme applies to the computation of J^T e.
581			* However, for large minimization problems (i.e., involving a large number
582			* of unknowns and measurements) for which J/J^T J rows are too large to
583			* fit in the L1 cache, even this scheme incures many cache misses. In
584			* such cases, a cache-efficient blocking scheme is preferable.
585			*
586			* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
587			* performance problem.
588			*
589			* Note that the non-blocking algorithm is faster on small
590			* problems since in this case it avoids the overheads of blocking.
591			*/
592			register LM_REAL alpha, jaclm, jacTjacim;
593
594			/* looping downwards saves a few computations */
595	1028024	100	for(i=m*m; i-->0; )
	765000	100
	263024
596	967552		jacTjac[i]=0.0;
	720000
	247552
597	302360	100	for(i=m; i-->0; )
	225000	100
	77360
598	241888		jacTe[i]=0.0;
	180000
	61888
599
600	302360	100	for(l=n; l-->0; ){
	225000	100
	77360
601	241888		jaclm=jac+l*m;
	180000
	61888
602	1209440	100	for(i=m; i-->0; ){
	900000	100
	309440
603	967552		jacTjacim=jacTjac+i*m;
	720000
	247552
604	967552		alpha=jaclm[i]; //jac[l*m+i];
	720000
	247552
605	3386432	100	for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
	2520000	100
	866432
606	2418880		jacTjacim[j]+=jaclm[j]alpha; //jacTjac[im+j]+=jac[lm+j]alpha
	1800000
	618880
607
608			/* J^T e */
609	967552		jacTe[i]+=alpha*e[l];
	720000
	247552
610			}
611			}
612
613	302360	100	for(i=m; i-->0; ) /* copy to upper part */
	225000	100
	77360
614	604720	100	for(j=i+1; j
	450000	100
	154720
615	362832		jacTjac[im+j]=jacTjac[jm+i];
	270000
	92832
616			}
617			else{ // this is a large problem
618			/* Cache efficient computation of J^T J based on blocking
619			*/
620	11		LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m);
	11
	0
621
622			/* cache efficient computation of J^T e */
623	44	100	for(i=0; i
	44	0
	0
624	33		jacTe[i]=0.0;
	33
	0
625
626	11011	100	for(i=0; i
	11011	0
	0
627			register LM_REAL *jacrow;
628
629	44000	100	for(l=0, jacrow=jac+i*m, tmp=e[i]; l
	44000	0
	0
630	33000		jacTe[l]+=jacrow[l]*tmp;
	33000
	0
631			}
632			}
633
634			/* Compute \|\|J^T e\|\|_inf and \|\|p\|\|^2. Note that \|\|J^T e\|\|_inf
635			* is computed for free (i.e. inactive) variables only.
636			* At a local minimum, if p[i]==ub[i] then g[i]>0;
637			* if p[i]==lb[i] g[i]<0; otherwise g[i]=0
638			*/
639	302404	100	for(i=j=numactive=0, p_L2=jacTe_inf=0.0; i
	225044	100
	77360
640	241921	50	if(ub && p[i]==ub[i]){ ++numactive; if(jacTe[i]>0.0) ++j; }
	180033	100
	61888	100
		50
		100
		50
641	181468	50	else if(lb && p[i]==lb[i]){ ++numactive; if(jacTe[i]<0.0) ++j; }
	135042	50
	46426	0
		50
		50
		0
642	181468	100	else if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
	135042	100
	46426	100
		100
643
644	241921		diag_jacTjac[i]=jacTjac[im+i]; / save diagonal entries so that augmentation can be later canceled */
	180033
	61888
645	241921		p_L2+=p[i]*p[i];
	180033
	61888
646			}
647			//p_L2=sqrt(p_L2);
648
649			#if 0
650			if(!(k%100)){
651			printf("Current estimate: ");
652			for(i=0; i
653			printf("%.9g ", p[i]);
654			printf("-- errors %.9g %0.9g, #active %d [%d]\n", jacTe_inf, p_eL2, numactive, j);
655			}
656			#endif
657
658			/* check for convergence */
659	60483	100	if(j==numactive && (jacTe_inf <= eps1)){
	45011	50
	15472	50
		50
660	0		Dp_L2=0.0; /* no increment for p in this case */
	0
	0
661	0		stop=1;
	0
	0
662	0		break;
	0
	0
663			}
664
665			/* compute initial damping factor */
666	60483	100	if(k==0){
	45011	100
	15472
667	17	50	if(!lb && !ub){ /* no bounds */
	12	0
	5	50
		0
668	0	0	for(i=0, tmp=LM_REAL_MIN; i
	0	0
	0
669	0	0	if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
	0	0
	0
670	0		mu=tau*tmp;
	0
	0
671			}
672			else
673	17		mu=LM_CNST(0.5)taup_eL2; /* use Kanzow's starting mu */
	12
	5
674			}
675
676			/* determine increment using a combination of adaptive damping, line search and projected gradient search */
677			while(1){
678			/* augment normal equations */
679	302404	100	for(i=0; i
	225044	100
	77360
680	241921		jacTjac[i*m+i]+=mu;
	180033
	61888
681
682			/* solve augmented equations */
683			#ifdef HAVE_LAPACK
684			/* 7 alternatives are available: LU, Cholesky + Cholesky with PLASMA, LDLt, 2 variants of QR decomposition and SVD.
685			* For matrices with dimensions of at least a few hundreds, the PLASMA implementation of Cholesky is the fastest.
686			* From the serial solvers, Cholesky is the fastest but might occasionally be inapplicable due to numerical round-off;
687			* QR is slower but more robust; SVD is the slowest but most robust; LU is quite robust but
688			* slower than LDLt; LDLt offers a good tradeoff between robustness and speed
689			*/
690
691			issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK;
692			//issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
693			//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
694			#ifdef HAVE_PLASMA
695			//issolved=AX_EQ_B_PLASMA_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_PLASMA_CHOL;
696			#endif
697			//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
698			//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int ()(LM_REAL A, LM_REAL B, LM_REAL x, int m))AX_EQ_B_QRLS;
699			//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
700
701			#else
702			/* use the LU included with levmar */
703	60483		issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
	45011
	15472
704			#endif /* HAVE_LAPACK */
705
706	60483	50	if(issolved){
	45011	50
	15472
707	302404	100	for(i=0; i
	225044	100
	77360
708	241921		pDp[i]=p[i] + Dp[i];
	180033
	61888
709
710			/* compute p's new estimate and \|\|Dp\|\|^2 */
711	60483		BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */
	45011
	15472
712	302404	100	for(i=0, Dp_L2=0.0; i
	225044	100
	77360
713	241921		Dp[i]=tmp=pDp[i]-p[i];
	180033
	61888
714	241921		Dp_L2+=tmp*tmp;
	180033
	61888
715			}
716			//Dp_L2=sqrt(Dp_L2);
717
718	60483	100	if(Dp_L2<=eps2_sqp_L2){ / relative change in p is small, stop */
	45011	50
	15472
719	1		stop=2;
	1
	0
720	1		break;
	1
	0
721			}
722
723	60482	50	if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)LM_CNST(EPSILON))){ / almost singular */
	45010	50
	15472
724	0		stop=4;
	0
	0
725	0		break;
	0
	0
726			}
727
728	60482	50	if(!dscl){
	45010	50
	15472
729	60482		(func)(pDp, hx, m, n, adata); ++nfev; / evaluate function at p + Dp */
	45010
	15472
730			}
731			else{
732	0	0	for(i=m; i-->0; ) sp_pDp[i]=pDp[i]*dscl[i];
	0	0
	0
733	0		(func)(sp_pDp, hx, m, n, adata); ++nfev; / evaluate function at p + Dp */
	0
	0
734			}
735
736			/* ### hx=x-hx, pDp_eL2=\|\|hx\|\| */
737			#if 1
738	60482		pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n);
	45010
	15472
739			#else
740			for(i=0, pDp_eL2=0.0; i
741			hx[i]=tmp=x[i]-hx[i];
742			pDp_eL2+=tmp*tmp;
743			}
744			#endif
745			/* the following test ensures that the computation of pDp_eL2 has not overflowed.
746			* Such an overflow does no harm here, thus it is not signalled as an error
747			*/
748	60482	50	if(!LM_FINITE(pDp_eL2) && !LM_FINITE(VECNORM(hx, n))){
	45010	0
	15472	50
		0
749	0		stop=7;
	0
	0
750	0		break;
	0
	0
751			}
752
753	60482	100	if(pDp_eL2<=gamma*p_eL2){
	45010	100
	15472
754	180	100	for(i=0, dL=0.0; i
	130	100
	50
755	142		dL+=Dp[i](muDp[i]+jacTe[i]);
	102
	40
756
757			#if 1
758	38	50	if(dL>0.0){
	28	50
	10
759	38		dF=p_eL2-pDp_eL2;
	28
	10
760	38		tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0));
	28
	10
761	38		tmp=LM_CNST(1.0)-tmptmptmp;
	28
	10
762	38	100	mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) );
	28	50
	10
763			}
764			else{
765	0		tmp=LM_CNST(0.1)pDp_eL2; / pDp_eL2 is the new p_eL2 */
	0
	0
766	0	0	mu=(mu>=tmp)? tmp : mu;
	0	0
	0
767			}
768			#else
769
770			tmp=LM_CNST(0.1)pDp_eL2; / pDp_eL2 is the new p_eL2 */
771			mu=(mu>=tmp)? tmp : mu;
772			#endif
773
774	38		nu=2;
	28
	10
775
776	180	100	for(i=0 ; i
	130	100
	50
777	142		p[i]=pDp[i];
	102
	40
778
779	10150	100	for(i=0; i
	10100	100
	50
780	10112		e[i]=hx[i];
	10072
	40
781	38		p_eL2=pDp_eL2;
	28
	10
782	38		++nLMsteps;
	28
	10
783	38		gprevtaken=0;
	28
	10
784	38		break;
	28
	10
785			}
786			/* note that if the LM step is not taken, code falls through to the LM line search below */
787			}
788			else{
789
790			/* the augmented linear system could not be solved, increase mu */
791
792	0		mu*=nu;
	0
	0
793	0		nu2=nu<<1; // 2*nu;
	0
	0
794	0	0	if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
	0	0
	0
795	0		stop=5;
	0
	0
796	0		break;
	0
	0
797			}
798	0		nu=nu2;
	0
	0
799
800	0	0	for(i=0; i
	0	0
	0
801	0		jacTjac[i*m+i]=diag_jacTjac[i];
	0
	0
802
803	0		continue; /* solve again with increased nu */
	0
	0
804			}
805
806			/* if this point is reached, the LM step did not reduce the error;
807			* see if it is a descent direction
808			*/
809
810			/* negate jacTe (i.e. g) & compute g^T * Dp */
811	302220	100	for(i=0, jacTeDp=0.0; i
	224910	100
	77310
812	241776		jacTe[i]=-jacTe[i];
	179928
	61848
813	241776		jacTeDp+=jacTe[i]*Dp[i];
	179928
	61848
814			}
815
816	60444	100	if(jacTeDp<=-rho*pow(Dp_L2, LM_CNST(_POW_)/LM_CNST(2.0))){
	44982	100
	15462
817			/* Dp is a descent direction; do a line search along it */
818			#if 1
819			/* use Schnabel's backtracking line search; it requires fewer "func" evaluations */
820			{
821			int mxtake, iretcd;
822	2757		LM_REAL stepmx, steptl=LM_CNST(1e3)*(LM_REAL)sqrt(LM_REAL_EPSILON);
	2534
	223
823
824	2757	50	tmp=(LM_REAL)sqrt(p_L2); stepmx=LM_CNST(1e3)*( (tmp>=LM_CNST(1.0))? tmp : LM_CNST(1.0) );
	2534	50
	223
825
826	2757		LNSRCH(m, p, p_eL2, jacTe, Dp, alpha, pDp, &pDp_eL2, func, &fstate,
	2534
	223
827			&mxtake, &iretcd, stepmx, steptl, dscl); /* NOTE: LNSRCH() updates hx */
828	2757	100	if(iretcd!=0 \|\| !LM_FINITE(pDp_eL2)) goto gradproj; /* rather inelegant but effective way to handle LNSRCH() failures... */
	2534	50
	223	100
		50
829			}
830			#else
831			/* use the simpler (but slower!) line search described by Kanzow et al */
832			for(t=tini; t>tmin; t*=beta){
833			for(i=0; i
834			pDp[i]=p[i] + t*Dp[i];
835			BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */
836
837			if(!dscl){
838			(func)(pDp, hx, m, n, adata); ++nfev; / evaluate function at p + tDp /
839			}
840			else{
841			for(i=m; i-->0; ) sp_pDp[i]=pDp[i]*dscl[i];
842			(func)(sp_pDp, hx, m, n, adata); ++nfev; / evaluate function at p + tDp /
843			}
844
845			/* compute \|\|e(pDp)\|\|_2 */
846			/* ### hx=x-hx, pDp_eL2=\|\|hx\|\| */
847			#if 1
848			pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n);
849			#else
850			for(i=0, pDp_eL2=0.0; i
851			hx[i]=tmp=x[i]-hx[i];
852			pDp_eL2+=tmp*tmp;
853			}
854			#endif /* \|\|e(pDp)\|\|_2 */
855			if(!LM_FINITE(pDp_eL2)) goto gradproj; /* treat as line search failure */
856
857			//if(LM_CNST(0.5)pDp_eL2<=LM_CNST(0.5)p_eL2 + talphajacTeDp) break;
858			if(pDp_eL2<=p_eL2 + LM_CNST(2.0)talpha*jacTeDp) break;
859			}
860			#endif /* line search alternatives */
861
862	2139		++nLSsteps;
	2109
	30
863	2139		gprevtaken=0;
	2109
	30
864
865			/* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2.
866			* These values are used below to update their corresponding variables
867			*/
868			}
869			else{
870			/* Note that this point can also be reached via a goto when LNSRCH() fails. */
871	57687		gradproj:
	42448
	15239
872
873			/* jacTe has been negated above. Being a descent direction, it is next used
874			* to make a projected gradient step
875			*/
876
877			/* compute \|\|g\|\| */
878	291525	100	for(i=0, tmp=0.0; i
	214365	100
	77160
879	233220		tmp+=jacTe[i]*jacTe[i];
	171492
	61728
880	58305		tmp=(LM_REAL)sqrt(tmp);
	42873
	15432
881	58305		tmp=LM_CNST(100.0)/(LM_CNST(1.0)+tmp);
	42873
	15432
882	58305	50	t0=(tmp<=tini)? tmp : tini; /* guard against poor scaling & large steps; see (3.50) in C.T. Kelley's book */
	42873	50
	15432
883
884			/* if the previous step was along the gradient descent, try to use the t employed in that step */
885	116976	100	for(t=(gprevtaken)? t : t0; t>tming; t*=beta){
	80381	50
	36595	100
		50
886	584880	100	for(i=0; i
	401905	100
	182975
887	467904		pDp[i]=p[i] - t*jacTe[i];
	321524
	146380
888	116976		BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */
	80381
	36595
889	584880	100	for(i=0, Dp_L2=0.0; i
	401905	100
	182975
890	467904		Dp[i]=tmp=pDp[i]-p[i];
	321524
	146380
891	467904		Dp_L2+=tmp*tmp;
	321524
	146380
892			}
893
894	116976	50	if(!dscl){
	80381	50
	36595
895	116976		(func)(pDp, hx, m, n, adata); ++nfev; / evaluate function at p - tg /
	80381
	36595
896			}
897			else{
898	0	0	for(i=m; i-->0; ) sp_pDp[i]=pDp[i]*dscl[i];
	0	0
	0
899	0		(func)(sp_pDp, hx, m, n, adata); ++nfev; / evaluate function at p - tg /
	0
	0
900			}
901
902			/* compute \|\|e(pDp)\|\|_2 */
903			/* ### hx=x-hx, pDp_eL2=\|\|hx\|\| */
904			#if 1
905	116976		pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n);
	80381
	36595
906			#else
907			for(i=0, pDp_eL2=0.0; i
908			hx[i]=tmp=x[i]-hx[i];
909			pDp_eL2+=tmp*tmp;
910			}
911			#endif
912			/* the following test ensures that the computation of pDp_eL2 has not overflowed.
913			* Such an overflow does no harm here, thus it is not signalled as an error
914			*/
915	116976	50	if(!LM_FINITE(pDp_eL2) && !LM_FINITE(VECNORM(hx, n))){
	80381	0
	36595	50
		0
916	0		stop=7;
	0
	0
917	0		goto breaknested;
	0
	0
918			}
919
920			/* compute \|\|g^T * Dp\|\|. Note that if pDp has not been altered by projection
921			* (i.e. BOXPROJECT), jacTeDp=-t*\|\|g\|\|^2
922			*/
923	584880	100	for(i=0, jacTeDp=0.0; i
	401905	100
	182975
924	467904		jacTeDp+=jacTe[i]*Dp[i];
	321524
	146380
925
926	116976	100	if(gprevtaken && pDp_eL2<=p_eL2 + LM_CNST(2.0)LM_CNST(0.99999)jacTeDp){ /* starting t too small */
	80381	100
	36595	100
		100
927	52369		t=t0;
	37465
	14904
928	52369		gprevtaken=0;
	37465
	14904
929	52369		continue;
	37465
	14904
930			}
931			//if(LM_CNST(0.5)pDp_eL2<=LM_CNST(0.5)p_eL2 + alpha*jacTeDp) terminatePGLS;
932	64607	100	if(pDp_eL2<=p_eL2 + LM_CNST(2.0)alphajacTeDp) goto terminatePGLS;
	42916	100
	21691
933
934			//if(pDp_eL2<=p_eL2 - LM_CNST(2.0)alpha/tDp_L2) goto terminatePGLS; // sufficient decrease condition proposed by Kelley in (5.13)
935			}
936
937			/* if this point is reached then the gradient line search has failed */
938	0		gprevtaken=0;
	0
	0
939	0		break;
	0
	0
940
941	58305		terminatePGLS:
	42873
	15432
942
943	58305		++nPGsteps;
	42873
	15432
944	58305		gprevtaken=1;
	42873
	15432
945			/* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2 */
946			}
947
948			/* update using computed values */
949
950	302220	100	for(i=0, Dp_L2=0.0; i
	224910	100
	77310
951	241776		tmp=pDp[i]-p[i];
	179928
	61848
952	241776		Dp_L2+=tmp*tmp;
	179928
	61848
953			}
954			//Dp_L2=sqrt(Dp_L2);
955
956	60444	50	if(Dp_L2<=eps2_sqp_L2){ / relative change in p is small, stop */
	44982	100
	15462
957	2		stop=2;
	0
	2
958	2		break;
	0
	2
959			}
960
961	302210	100	for(i=0 ; i
	224910	100
	77300
962	241768		p[i]=pDp[i];
	179928
	61840
963
964	302210	100	for(i=0; i
	224910	100
	77300
965	241768		e[i]=hx[i];
	179928
	61840
966	60442		p_eL2=pDp_eL2;
	44982
	15460
967	60442		break;
	44982
	15460
968			} /* inner loop */
969			}
970
971	15		breaknested: /* NOTE: this point is also reached via an explicit goto! */
	10
	5
972
973	17	100	if(k>=itmax) stop=3;
	12	100
	5
974
975	82	100	for(i=0; i
	57	100
	25
976	65		jacTjac[i*m+i]=diag_jacTjac[i];
	45
	20
977
978	17	50	if(info){
	12	50
	5
979	17		info[0]=init_p_eL2;
	12
	5
980	17		info[1]=p_eL2;
	12
	5
981	17		info[2]=jacTe_inf;
	12
	5
982	17		info[3]=Dp_L2;
	12
	5
983	82	100	for(i=0, tmp=LM_REAL_MIN; i
	57	100
	25
984	65	100	if(tmp
	45	100
	20
985	17		info[4]=mu/tmp;
	12
	5
986	17		info[5]=(LM_REAL)k;
	12
	5
987	17		info[6]=(LM_REAL)stop;
	12
	5
988	17		info[7]=(LM_REAL)nfev;
	12
	5
989	17		info[8]=(LM_REAL)njev;
	12
	5
990	17		info[9]=(LM_REAL)nlss;
	12
	5
991			}
992
993			/* covariance matrix */
994	17	50	if(covar){
	12	50
	5
995	17		LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
	12
	5
996
997	17	50	if(dscl){ /* correct for the scaling */
	12	50
	5
998	0	0	for(i=m; i-->0; )
	0	0
	0
999	0	0	for(j=m; j-->0; )
	0	0
	0
1000	0		covar[im+j]=(dscl[i]*dscl[j]);
	0
	0
1001			}
1002			}
1003
1004	17	50	if(freework) free(work);
	12	50
	5
1005
1006			#ifdef LINSOLVERS_RETAIN_MEMORY
1007	17	50	if(linsolver) (*linsolver)(NULL, NULL, NULL, 0);
	12	50
	5
1008			#endif
1009
1010			#if 0
1011			printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSsteps, nPGsteps);
1012			#endif
1013
1014	17	50	if(dscl){
	12	50
	5
1015			/* scale final point and constraints */
1016	0	0	for(i=0; i
	0	0
	0
1017	0		BOXSCALE(lb, ub, dscl, m, 0);
	0
	0
1018	0		free(sp_pDp);
	0
	0
1019			}
1020
1021	17	50	return (stop!=4 && stop!=7)? k : LM_ERROR;
	12	50
	5	50
		50
1022			}
1023
1024			/* following struct & LMBC_DIF_XXX functions won't be necessary if a true secant
1025			* version of LEVMAR_BC_DIF() is implemented...
1026			*/
1027			struct LMBC_DIF_DATA{
1028			int ffdif; // nonzero if forward differencing is used
1029			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata);
1030			LM_REAL hx, hxx;
1031			void *adata;
1032			LM_REAL delta;
1033			};
1034
1035	49939		static void LMBC_DIF_FUNC(LM_REAL p, LM_REAL hx, int m, int n, void *data)
	30729
	19210
1036			{
1037	49939		struct LMBC_DIF_DATA dta=(struct LMBC_DIF_DATA )data;
	30729
	19210
1038
1039			/* call user-supplied function passing it the user-supplied data */
1040	49939		(*(dta->func))(p, hx, m, n, dta->adata);
	30729
	19210
1041	49939		}
	30729
	19210
1042
1043	15254		static void LMBC_DIF_JACF(LM_REAL p, LM_REAL jac, int m, int n, void *data)
	10004
	5250
1044			{
1045	15254		struct LMBC_DIF_DATA dta=(struct LMBC_DIF_DATA )data;
	10004
	5250
1046
1047	15254	50	if(dta->ffdif){
	10004	50
	5250
1048			/* evaluate user-supplied function at p */
1049	15254		(*(dta->func))(p, dta->hx, m, n, dta->adata);
	10004
	5250
1050	15254		LEVMAR_FDIF_FORW_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
	10004
	5250
1051			}
1052			else
1053	0		LEVMAR_FDIF_CENT_JAC_APPROX(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
	0
	0
1054	15254		}
	10004
	5250
1055
1056
1057			/* No Jacobian version of the LEVMAR_BC_DER() function above: the Jacobian is approximated with
1058			* the aid of finite differences (forward or central, see the comment for the opts argument)
1059			* Ideally, this function should be implemented with a secant approach. Currently, it just calls
1060			* LEVMAR_BC_DER()
1061			*/
1062	5		int LEVMAR_BC_DIF(
1063			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
1064			LM_REAL p, / I/O: initial parameter estimates. On output has the estimated solution */
1065			LM_REAL x, / I: measurement vector. NULL implies a zero vector */
1066			int m, /* I: parameter vector dimension (i.e. #unknowns) */
1067			int n, /* I: measurement vector dimension */
1068			LM_REAL lb, / I: vector of lower bounds. If NULL, no lower bounds apply */
1069			LM_REAL ub, / I: vector of upper bounds. If NULL, no upper bounds apply */
1070			LM_REAL dscl, / I: diagonal scaling constants. NULL implies no scaling */
1071			int itmax, /* I: maximum number of iterations */
1072			LM_REAL opts[5], /* I: opts[0-4] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the
1073			* scale factor for initial \mu, stopping thresholds for \|\|J^T e\|\|_inf, \|\|Dp\|\|_2 and \|\|e\|\|_2 and
1074			* the step used in difference approximation to the Jacobian. Set to NULL for defaults to be used.
1075			* If \delta<0, the Jacobian is approximated with central differences which are more accurate
1076			* (but slower!) compared to the forward differences employed by default.
1077			*/
1078			LM_REAL info[LM_INFO_SZ],
1079			/* O: information regarding the minimization. Set to NULL if don't care
1080			* info[0]= \|\|e\|\|_2 at initial p.
1081			* info[1-4]=[ \|\|e\|\|_2, \|\|J^T e\|\|_inf, \|\|Dp\|\|_2, mu/max[J^T J]_ii ], all computed at estimated p.
1082			* info[5]= # iterations,
1083			* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
1084			* 2 - stopped by small Dp
1085			* 3 - stopped by itmax
1086			* 4 - singular matrix. Restart from current p with increased mu
1087			* 5 - no further error reduction is possible. Restart with increased mu
1088			* 6 - stopped by small \|\|e\|\|_2
1089			* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
1090			* info[7]= # function evaluations
1091			* info[8]= # Jacobian evaluations
1092			* info[9]= # linear systems solved, i.e. # attempts for reducing error
1093			*/
1094			LM_REAL work, / working memory at least LM_BC_DIF_WORKSZ() reals large, allocated if NULL */
1095			LM_REAL covar, / O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
1096			void adata) / pointer to possibly additional data, passed uninterpreted to func.
1097			* Set to NULL if not needed
1098			*/
1099			{
1100			struct LMBC_DIF_DATA data;
1101			int ret;
1102
1103			//fprintf(stderr, RCAT("\nWarning: current implementation of ", LEVMAR_BC_DIF) "() does not use a secant approach!\n\n");
1104
1105	5		data.ffdif=!opts \|\| opts[4]>=0.0;
1106
1107	5		data.func=func;
1108	5		data.hx=(LM_REAL )malloc(2nsizeof(LM_REAL)); / allocate a big chunk in one step */
1109	5		if(!data.hx){
1110	0		fprintf(stderr, LCAT(LEVMAR_BC_DIF, "(): memory allocation request failed\n"));
1111	0		return LM_ERROR;
1112			}
1113	5		data.hxx=data.hx+n;
1114	5		data.adata=adata;
1115	5		data.delta=(opts)? FABS(opts[4]) : (LM_REAL)LM_DIFF_DELTA;
1116
1117	5		ret=LEVMAR_BC_DER(LMBC_DIF_FUNC, LMBC_DIF_JACF, p, x, m, n, lb, ub, dscl, itmax, opts, info, work, covar, (void *)&data);
1118
1119	5		if(info){ /* correct the number of function calls */
1120	5		if(data.ffdif)
1121	5		info[7]+=info[8](m+1); / each Jacobian evaluation costs m+1 function calls */
1122			else
1123	0		info[7]+=info[8](2m); /* each Jacobian evaluation costs 2m function calls /
1124			}
1125
1126	5		free(data.hx);
1127
1128	5		return ret;
1129			}
1130
1131			/* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */
1132			#undef FUNC_STATE
1133			#undef LNSRCH
1134			#undef BOXPROJECT
1135			#undef BOXSCALE
1136			#undef LEVMAR_BOX_CHECK
1137			#undef VECNORM
1138			#undef LEVMAR_BC_DER
1139			#undef LMBC_DIF_DATA
1140			#undef LMBC_DIF_FUNC
1141			#undef LMBC_DIF_JACF
1142			#undef LEVMAR_BC_DIF
1143			#undef LEVMAR_FDIF_FORW_JAC_APPROX
1144			#undef LEVMAR_FDIF_CENT_JAC_APPROX
1145			#undef LEVMAR_COVAR
1146			#undef LEVMAR_TRANS_MAT_MAT_MULT
1147			#undef LEVMAR_L2NRMXMY
1148			#undef AX_EQ_B_LU
1149			#undef AX_EQ_B_CHOL
1150			#undef AX_EQ_B_PLASMA_CHOL
1151			#undef AX_EQ_B_QR
1152			#undef AX_EQ_B_QRLS
1153			#undef AX_EQ_B_SVD
1154			#undef AX_EQ_B_BK