File Coverage

levmar-2.6/misc_core.c

Criterion	Covered	Total	%
statement	501	668	75.0
branch	233	308	75.6
condition			n/a
subroutine			n/a
pod			n/a
total	734	976	75.2

line	stmt	bran	code
1			/////////////////////////////////////////////////////////////////////////////////
2			//
3			// Levenberg - Marquardt non-linear minimization algorithm
4			// Copyright (C) 2004-05 Manolis Lourakis (lourakis at ics forth gr)
5			// Institute of Computer Science, Foundation for Research & Technology - Hellas
6			// Heraklion, Crete, Greece.
7			//
8			// This program is free software; you can redistribute it and/or modify
9			// it under the terms of the GNU General Public License as published by
10			// the Free Software Foundation; either version 2 of the License, or
11			// (at your option) any later version.
12			//
13			// This program is distributed in the hope that it will be useful,
14			// but WITHOUT ANY WARRANTY; without even the implied warranty of
15			// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16			// GNU General Public License for more details.
17			//
18			/////////////////////////////////////////////////////////////////////////////////
19
20			#ifndef LM_REAL // not included by misc.c
21			#error This file should not be compiled directly!
22			#endif
23
24
25			/* precision-specific definitions */
26			#define LEVMAR_CHKJAC LM_ADD_PREFIX(levmar_chkjac)
27			#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
28			#define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx)
29			#define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult)
30			#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
31			#define LEVMAR_STDDEV LM_ADD_PREFIX(levmar_stddev)
32			#define LEVMAR_CORCOEF LM_ADD_PREFIX(levmar_corcoef)
33			#define LEVMAR_R2 LM_ADD_PREFIX(levmar_R2)
34			#define LEVMAR_BOX_CHECK LM_ADD_PREFIX(levmar_box_check)
35			#define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy)
36
37			#ifdef HAVE_LAPACK
38			#define LEVMAR_PSEUDOINVERSE LM_ADD_PREFIX(levmar_pseudoinverse)
39			static int LEVMAR_PSEUDOINVERSE(LM_REAL A, LM_REAL B, int m);
40
41			#ifdef __cplusplus
42			extern "C" {
43			#endif
44			/* BLAS matrix multiplication, LAPACK SVD & Cholesky routines */
45			#define GEMM LM_MK_BLAS_NAME(gemm)
46			/* C := alphaop( A )op( B ) + betaC /
47			extern void GEMM(char transa, char transb, int m, int n, int *k,
48			LM_REAL alpha, LM_REAL a, int lda, LM_REAL b, int ldb, LM_REAL beta, LM_REAL c, int ldc);
49
50			#define GESVD LM_MK_LAPACK_NAME(gesvd)
51			#define GESDD LM_MK_LAPACK_NAME(gesdd)
52			extern int GESVD(char jobu, char jobvt, int m, int n, LM_REAL a, int lda, LM_REAL s, LM_REAL u, int *ldu,
53			LM_REAL vt, int ldvt, LM_REAL work, int lwork, int *info);
54
55			/* lapack 3.0 new SVD routine, faster than xgesvd() */
56			extern int GESDD(char jobz, int m, int n, LM_REAL a, int lda, LM_REAL s, LM_REAL u, int ldu, LM_REAL vt, int ldvt,
57			LM_REAL work, int lwork, int iwork, int info);
58
59			/* Cholesky decomposition */
60			#define POTF2 LM_MK_LAPACK_NAME(potf2)
61			extern int POTF2(char uplo, int n, LM_REAL a, int lda, int *info);
62			#ifdef __cplusplus
63			}
64			#endif
65
66			#define LEVMAR_CHOLESKY LM_ADD_PREFIX(levmar_chol)
67
68			#else /* !HAVE_LAPACK */
69			#define LEVMAR_LUINVERSE LM_ADD_PREFIX(levmar_LUinverse_noLapack)
70
71			static int LEVMAR_LUINVERSE(LM_REAL A, LM_REAL B, int m);
72			#endif /* HAVE_LAPACK */
73
74			/* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a)
75			* using a block size of bsize. The product is returned in b.
76			* Since a^T a is symmetric, its computation can be sped up by computing only its
77			* upper triangular part and copying it to the lower part.
78			*
79			* More details on blocking can be found at
80			* http://www-2.cs.cmu.edu/afs/cs/academic/class/15213-f02/www/R07/section_a/Recitation07-SectionA.pdf
81			*/
82	733		void LEVMAR_TRANS_MAT_MAT_MULT(LM_REAL a, LM_REAL b, int n, int m)
	720
	13
83			{
84			#ifdef HAVE_LAPACK /* use BLAS matrix multiply */
85
86			LM_REAL alpha=LM_CNST(1.0), beta=LM_CNST(0.0);
87			/* Fool BLAS to compute a^T*a avoiding transposing a: a is equivalent to a^T in column major,
88			* therefore BLAS computes aa^T with a and aa^T in column major, which is equivalent to
89			* computing a^T*a in row major!
90			*/
91			GEMM("N", "T", &m, &m, &n, &alpha, a, &m, a, &m, &beta, b, &m);
92
93			#else /* no LAPACK, use blocking-based multiply */
94
95			register int i, j, k, jj, kk;
96			register LM_REAL sum, bim, akm;
97	733		const int bsize=__BLOCKSZ__;
	720
	13
98
99			#define __MIN__(x, y) (((x)<=(y))? (x) : (y))
100			#define __MAX__(x, y) (((x)>=(y))? (x) : (y))
101
102			/* compute upper triangular part using blocking */
103	1466	100	for(jj=0; jj
	1440	100
	26
104	2320	100	for(i=0; i
	2268	100
	52
105	1587		bim=b+i*m;
	1548
	39
106	4149	100	for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j)
	4032	100
	117
107	2562		bim[j]=0.0; //b[i*m+j]=0.0;
	2484
	78
108			}
109
110	76330	100	for(kk=0; kk
	72170	100
	4160
111	238125	100	for(i=0; i
	221537	100
	16588
112	162528		bim=b+i*m;
	150087
	12441
113	423321	100	for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j){
	385998	100
	37323
114	260793		sum=0.0;
	235911
	24882
115	8551149	100	for(k=kk; k<__MIN__(kk+bsize, n); ++k){
	7730589	100
	820560
116	8290356		akm=a+k*m;
	7494678
	795678
117	8290356		sum+=akm[i]akm[j]; //a[km+i]a[km+j];
	7494678
	795678
118			}
119	260793		bim[j]+=sum; //b[i*m+j]+=sum;
	235911
	24882
120			}
121			}
122			}
123			}
124
125			/* copy upper triangular part to the lower one */
126	2320	100	for(i=0; i
	2268	100
	52
127	2562	100	for(j=0; j
	2484	100
	78
128	975		b[im+j]=b[jm+i];
	936
	39
129
130			#undef __MIN__
131			#undef __MAX__
132
133			#endif /* HAVE_LAPACK */
134	733		}
	720
	13
135
136			/* forward finite difference approximation to the Jacobian of func */
137	15391		void LEVMAR_FDIF_FORW_JAC_APPROX(
	10079
	5312
138			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata),
139			/* function to differentiate */
140			LM_REAL p, / I: current parameter estimate, mx1 */
141			LM_REAL hx, / I: func evaluated at p, i.e. hx=func(p), nx1 */
142			LM_REAL hxx, / W/O: work array for evaluating func(p+delta), nx1 */
143			LM_REAL delta, /* increment for computing the Jacobian */
144			LM_REAL jac, / O: array for storing approximated Jacobian, nxm */
145			int m,
146			int n,
147			void *adata)
148			{
149			register int i, j;
150			LM_REAL tmp;
151			register LM_REAL d;
152
153	76684	100	for(j=0; j
	50246	100
	26438
154			/* determine d=max(1E-04\|p[j]\|, delta), see HZ /
155	61293		d=LM_CNST(1E-04)*p[j]; // force evaluation
	40167
	21126
156	61293	100	d=FABS(d);
	40167	100
	21126
157	61293	100	if(d
	40167	100
	21126
158	150		d=delta;
	74
	76
159
160	61293		tmp=p[j];
	40167
	21126
161	61293		p[j]+=d;
	40167
	21126
162
163	61293		(*func)(p, hxx, m, n, adata);
	40167
	21126
164
165	61293		p[j]=tmp; /* restore */
	40167
	21126
166
167	61293		d=LM_CNST(1.0)/d; /* invert so that divisions can be carried out faster as multiplications */
	40167
	21126
168	609229	100	for(i=0; i
	442645	100
	166584
169	547936		jac[im+j]=(hxx[i]-hx[i])d;
	402478
	145458
170			}
171			}
172	15391		}
	10079
	5312
173
174			/* central finite difference approximation to the Jacobian of func */
175	0		void LEVMAR_FDIF_CENT_JAC_APPROX(
	0
	0
176			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata),
177			/* function to differentiate */
178			LM_REAL p, / I: current parameter estimate, mx1 */
179			LM_REAL hxm, / W/O: work array for evaluating func(p-delta), nx1 */
180			LM_REAL hxp, / W/O: work array for evaluating func(p+delta), nx1 */
181			LM_REAL delta, /* increment for computing the Jacobian */
182			LM_REAL jac, / O: array for storing approximated Jacobian, nxm */
183			int m,
184			int n,
185			void *adata)
186			{
187			register int i, j;
188			LM_REAL tmp;
189			register LM_REAL d;
190
191	0	0	for(j=0; j
	0	0
	0
192			/* determine d=max(1E-04\|p[j]\|, delta), see HZ /
193	0		d=LM_CNST(1E-04)*p[j]; // force evaluation
	0
	0
194	0	0	d=FABS(d);
	0	0
	0
195	0	0	if(d
	0	0
	0
196	0		d=delta;
	0
	0
197
198	0		tmp=p[j];
	0
	0
199	0		p[j]-=d;
	0
	0
200	0		(*func)(p, hxm, m, n, adata);
	0
	0
201
202	0		p[j]=tmp+d;
	0
	0
203	0		(*func)(p, hxp, m, n, adata);
	0
	0
204	0		p[j]=tmp; /* restore */
	0
	0
205
206	0		d=LM_CNST(0.5)/d; /* invert so that divisions can be carried out faster as multiplications */
	0
	0
207	0	0	for(i=0; i
	0	0
	0
208	0		jac[im+j]=(hxp[i]-hxm[i])d;
	0
	0
209			}
210			}
211	0		}
	0
	0
212
213			/*
214			* Check the Jacobian of a n-valued nonlinear function in m variables
215			* evaluated at a point p, for consistency with the function itself.
216			*
217			* Based on fortran77 subroutine CHKDER by
218			* Burton S. Garbow, Kenneth E. Hillstrom, Jorge J. More
219			* Argonne National Laboratory. MINPACK project. March 1980.
220			*
221			*
222			* func points to a function from R^m --> R^n: Given a p in R^m it yields hx in R^n
223			* jacf points to a function implementing the Jacobian of func, whose correctness
224			* is to be tested. Given a p in R^m, jacf computes into the nxm matrix j the
225			* Jacobian of func at p. Note that row i of j corresponds to the gradient of
226			* the i-th component of func, evaluated at p.
227			* p is an input array of length m containing the point of evaluation.
228			* m is the number of variables
229			* n is the number of functions
230			* adata points to possible additional data and is passed uninterpreted
231			* to func, jacf.
232			* err is an array of length n. On output, err contains measures
233			* of correctness of the respective gradients. if there is
234			* no severe loss of significance, then if err[i] is 1.0 the
235			* i-th gradient is correct, while if err[i] is 0.0 the i-th
236			* gradient is incorrect. For values of err between 0.0 and 1.0,
237			* the categorization is less certain. In general, a value of
238			* err[i] greater than 0.5 indicates that the i-th gradient is
239			* probably correct, while a value of err[i] less than 0.5
240			* indicates that the i-th gradient is probably incorrect.
241			*
242			*
243			* The function does not perform reliably if cancellation or
244			* rounding errors cause a severe loss of significance in the
245			* evaluation of a function. therefore, none of the components
246			* of p should be unusually small (in particular, zero) or any
247			* other value which may cause loss of significance.
248			*/
249
250	4		void LEVMAR_CHKJAC(
	2
	2
251			void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata),
252			void (jacf)(LM_REAL p, LM_REAL j, int m, int n, void adata),
253			LM_REAL p, int m, int n, void adata, LM_REAL *err)
254			{
255	4		LM_REAL factor=LM_CNST(100.0);
	2
	2
256	4		LM_REAL one=LM_CNST(1.0);
	2
	2
257	4		LM_REAL zero=LM_CNST(0.0);
	2
	2
258			LM_REAL fvec, fjac, pp, fvecp, *buf;
259
260			register int i, j;
261			LM_REAL eps, epsf, temp, epsmch;
262			LM_REAL epslog;
263	4		int fvec_sz=n, fjac_sz=n*m, pp_sz=m, fvecp_sz=n;
	2
	2
264
265	4		epsmch=LM_REAL_EPSILON;
	2
	2
266	4		eps=(LM_REAL)sqrt(epsmch);
	2
	2
267
268	4		buf=(LM_REAL )malloc((fvec_sz + fjac_sz + pp_sz + fvecp_sz)sizeof(LM_REAL));
	2
	2
269	4	50	if(!buf){
	2	50
	2
270	0		fprintf(stderr, LCAT(LEVMAR_CHKJAC, "(): memory allocation request failed\n"));
	0
	0
271	0		exit(1);
	0
	0
272			}
273	4		fvec=buf;
	2
	2
274	4		fjac=fvec+fvec_sz;
	2
	2
275	4		pp=fjac+fjac_sz;
	2
	2
276	4		fvecp=pp+pp_sz;
	2
	2
277
278			/* compute fvec=func(p) */
279	4		(*func)(p, fvec, m, n, adata);
	2
	2
280
281			/* compute the Jacobian at p */
282	4		(*jacf)(p, fjac, m, n, adata);
	2
	2
283
284			/* compute pp */
285	12	100	for(j=0; j
	6	100
	6
286	8	50	temp=eps*FABS(p[j]);
	4	50
	4
287	8	50	if(temp==zero) temp=eps;
	4	50
	4
288	8		pp[j]=p[j]+temp;
	4
	4
289			}
290
291			/* compute fvecp=func(pp) */
292	4		(*func)(pp, fvecp, m, n, adata);
	2
	2
293
294	4		epsf=factor*epsmch;
	2
	2
295	4		epslog=(LM_REAL)log10(eps);
	2
	2
296
297	44	100	for(i=0; i
	22	100
	22
298	40		err[i]=zero;
	20
	20
299
300	12	100	for(j=0; j
	6	100
	6
301	8	50	temp=FABS(p[j]);
	4	50
	4
302	8	50	if(temp==zero) temp=one;
	4	50
	4
303
304	88	100	for(i=0; i
	44	100
	44
305	80		err[i]+=tempfjac[im+j];
	40
	40
306			}
307
308	44	100	for(i=0; i
	22	100
	22
309	40		temp=one;
	20
	20
310	40	50	if(fvec[i]!=zero && fvecp[i]!=zero && FABS(fvecp[i]-fvec[i])>=epsf*FABS(fvec[i]))
	20	50
	20	100
		50
		50
		50
		50
		100
		50
		50
311	40	100	temp=eps*FABS((fvecp[i]-fvec[i])/eps - err[i])/(FABS(fvec[i])+FABS(fvecp[i]));
	20	50
	20	50
		100
		50
		50
312	40		err[i]=one;
	20
	20
313	40	100	if(temp>epsmch && temp
	20	50
	20	100
		50
314	12		err[i]=((LM_REAL)log10(temp) - epslog)/epslog;
	6
	6
315	40	50	if(temp>=eps) err[i]=zero;
	20	50
	20
316			}
317
318	4		free(buf);
	2
	2
319
320	4		return;
	2
	2
321			}
322
323			#ifdef HAVE_LAPACK
324			/*
325			* This function computes the pseudoinverse of a square matrix A
326			* into B using SVD. A and B can coincide
327			*
328			* The function returns 0 in case of error (e.g. A is singular),
329			* the rank of A if successful
330			*
331			* A, B are mxm
332			*
333			*/
334			static int LEVMAR_PSEUDOINVERSE(LM_REAL A, LM_REAL B, int m)
335			{
336			LM_REAL *buf=NULL;
337			int buf_sz=0;
338			static LM_REAL eps=LM_CNST(-1.0);
339
340			register int i, j;
341			LM_REAL a, u, s, vt, *work;
342			int a_sz, u_sz, s_sz, vt_sz, tot_sz;
343			LM_REAL thresh, one_over_denom;
344			int info, rank, worksz, *iwork, iworksz;
345
346			/* calculate required memory size */
347			worksz=5*m; // min worksize for GESVD
348			//worksz=m(7m+4); // min worksize for GESDD
349			iworksz=8*m;
350			a_sz=m*m;
351			u_sz=mm; s_sz=m; vt_sz=mm;
352
353			tot_sz=(a_sz + u_sz + s_sz + vt_sz + worksz)sizeof(LM_REAL) + iworkszsizeof(int); /* should be arranged in that order for proper doubles alignment */
354
355			buf_sz=tot_sz;
356			buf=(LM_REAL *)malloc(buf_sz);
357			if(!buf){
358			fprintf(stderr, RCAT("memory allocation in ", LEVMAR_PSEUDOINVERSE) "() failed!\n");
359			return 0; /* error */
360			}
361
362			a=buf;
363			u=a+a_sz;
364			s=u+u_sz;
365			vt=s+s_sz;
366			work=vt+vt_sz;
367			iwork=(int *)(work+worksz);
368
369			/* store A (column major!) into a */
370			for(i=0; i
371			for(j=0; j
372			a[i+jm]=A[im+j];
373
374			/* SVD decomposition of A */
375			GESVD("A", "A", (int )&m, (int )&m, a, (int )&m, s, u, (int )&m, vt, (int )&m, work, (int )&worksz, &info);
376			//GESDD("A", (int )&m, (int )&m, a, (int )&m, s, u, (int )&m, vt, (int )&m, work, (int )&worksz, iwork, &info);
377
378			/* error treatment */
379			if(info!=0){
380			if(info<0){
381			fprintf(stderr, RCAT(RCAT(RCAT("LAPACK error: illegal value for argument %d of ", GESVD), "/" GESDD) " in ", LEVMAR_PSEUDOINVERSE) "()\n", -info);
382			}
383			else{
384			fprintf(stderr, RCAT("LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in ", LEVMAR_PSEUDOINVERSE) "() [info=%d]\n", info);
385			}
386			free(buf);
387			return 0;
388			}
389
390			if(eps<0.0){
391			LM_REAL aux;
392
393			/* compute machine epsilon */
394			for(eps=LM_CNST(1.0); aux=eps+LM_CNST(1.0), aux-LM_CNST(1.0)>0.0; eps*=LM_CNST(0.5))
395			;
396			eps*=LM_CNST(2.0);
397			}
398
399			/* compute the pseudoinverse in B */
400			for(i=0; i
401			for(rank=0, thresh=eps*s[0]; rankthresh; rank++){
402			one_over_denom=LM_CNST(1.0)/s[rank];
403
404			for(j=0; j
405			for(i=0; i
406			B[im+j]+=vt[rank+im]u[j+rankm]*one_over_denom;
407			}
408
409			free(buf);
410
411			return rank;
412			}
413			#else // no LAPACK
414
415			/*
416			* This function computes the inverse of A in B. A and B can coincide
417			*
418			* The function employs LAPACK-free LU decomposition of A to solve m linear
419			* systems A*B_i=I_i, where B_i and I_i are the i-th columns of B and I.
420			*
421			* A and B are mxm
422			*
423			* The function returns 0 in case of error, 1 if successful
424			*
425			*/
426	160		static int LEVMAR_LUINVERSE(LM_REAL A, LM_REAL B, int m)
	144
	16
427			{
428	160		void *buf=NULL;
	144
	16
429	160		int buf_sz=0;
	144
	16
430
431			register int i, j, k, l;
432	160		int *idx, maxi=-1, idx_sz, a_sz, x_sz, work_sz, tot_sz;
	144
	16
433			LM_REAL a, x, *work, max, sum, tmp;
434
435			/* calculate required memory size */
436	160		idx_sz=m;
	144
	16
437	160		a_sz=m*m;
	144
	16
438	160		x_sz=m;
	144
	16
439	160		work_sz=m;
	144
	16
440	160		tot_sz=(a_sz + x_sz + work_sz)sizeof(LM_REAL) + idx_szsizeof(int); /* should be arranged in that order for proper doubles alignment */
	144
	16
441
442	160		buf_sz=tot_sz;
	144
	16
443	160		buf=(void *)malloc(tot_sz);
	144
	16
444	160	50	if(!buf){
	144	50
	16
445	0		fprintf(stderr, RCAT("memory allocation in ", LEVMAR_LUINVERSE) "() failed!\n");
	0
	0
446	0		return 0; /* error */
	0
	0
447			}
448
449	160		a=buf;
	144
	16
450	160		x=a+a_sz;
	144
	16
451	160		work=x+x_sz;
	144
	16
452	160		idx=(int *)(work+work_sz);
	144
	16
453
454			/* avoid destroying A by copying it to a */
455	1068	100	for(i=0; i
	923	100
	145
456
457			/* compute the LU decomposition of a row permutation of matrix a; the permutation itself is saved in idx[] */
458	528	100	for(i=0; i
	469	100
	59
459	368		max=0.0;
	325
	43
460	1276	100	for(j=0; j
	1104	100
	172
461	908	100	if((tmp=FABS(a[i*m+j]))>max)
	779	100
	129	100
		100
462	562		max=tmp;
	503
	59
463	368	50	if(max==0.0){
	325	50
	43
464	0		fprintf(stderr, RCAT("Singular matrix A in ", LEVMAR_LUINVERSE) "()!\n");
	0
	0
465	0		free(buf);
	0
	0
466
467	0		return 0;
	0
	0
468			}
469	368		work[i]=LM_CNST(1.0)/max;
	325
	43
470			}
471
472	528	100	for(j=0; j
	469	100
	59
473	638	100	for(i=0; i
	552	100
	86
474	270		sum=a[i*m+j];
	227
	43
475	346	100	for(k=0; k
	282	100
	64
476	76		sum-=a[im+k]a[k*m+j];
	55
	21
477	270		a[i*m+j]=sum;
	227
	43
478			}
479	368		max=0.0;
	325
	43
480	1006	100	for(i=j; i
	877	100
	129
481	638		sum=a[i*m+j];
	552
	86
482	984	100	for(k=0; k
	834	100
	150
483	346		sum-=a[im+k]a[k*m+j];
	282
	64
484	638		a[i*m+j]=sum;
	552
	86
485	638	100	if((tmp=work[i]*FABS(sum))>=max){
	552	100
	86	100
		100
486	462		max=tmp;
	403
	59
487	462		maxi=i;
	403
	59
488			}
489			}
490	368	100	if(j!=maxi){
	325	100
	43
491	325	100	for(k=0; k
	265	100
	60
492	232		tmp=a[maxi*m+k];
	188
	44
493	232		a[maxim+k]=a[jm+k];
	188
	44
494	232		a[j*m+k]=tmp;
	188
	44
495			}
496	93		work[maxi]=work[j];
	77
	16
497			}
498	368		idx[j]=maxi;
	325
	43
499	368	50	if(a[j*m+j]==0.0)
	325	100
	43
500	2		a[j*m+j]=LM_REAL_EPSILON;
	0
	2
501	368	100	if(j!=m-1){
	325	100
	43
502	208		tmp=LM_CNST(1.0)/(a[j*m+j]);
	181
	27
503	478	100	for(i=j+1; i
	408	100
	70
504	270		a[im+j]=tmp;
	227
	43
505			}
506			}
507
508			/* The decomposition has now replaced a. Solve the m linear systems using
509			* forward and back substitution
510			*/
511	528	100	for(l=0; l
	469	100
	59
512	1276	100	for(i=0; i
	1104	100
	172
513	368		x[l]=LM_CNST(1.0);
	325
	43
514
515	1276	100	for(i=k=0; i
	1104	100
	172
516	908		j=idx[i];
	779
	129
517	908		sum=x[j];
	779
	129
518	908		x[j]=x[i];
	779
	129
519	908	100	if(k!=0)
	779	100
	129
520	616	100	for(j=k-1; j
	509	100
	107
521	346		sum-=a[im+j]x[j];
	282
	64
522			else
523	638	100	if(sum!=0.0)
	552	100
	86
524	368		k=i+1;
	325
	43
525	908		x[i]=sum;
	779
	129
526			}
527
528	1276	100	for(i=m-1; i>=0; --i){
	1104	100
	172
529	908		sum=x[i];
	779
	129
530	1676	100	for(j=i+1; j
	1398	100
	278
531	768		sum-=a[im+j]x[j];
	619
	149
532	908		x[i]=sum/a[i*m+i];
	779
	129
533			}
534
535	1276	100	for(i=0; i
	1104	100
	172
536	908		B[i*m+l]=x[i];
	779
	129
537			}
538
539	160		free(buf);
	144
	16
540
541	160		return 1;
	144
	16
542			}
543			#endif /* HAVE_LAPACK */
544
545			/*
546			* This function computes in C the covariance matrix corresponding to a least
547			* squares fit. JtJ is the approximate Hessian at the solution (i.e. J^T*J, where
548			* J is the Jacobian at the solution), sumsq is the sum of squared residuals
549			* (i.e. goodnes of fit) at the solution, m is the number of parameters (variables)
550			* and n the number of observations. JtJ can coincide with C.
551			*
552			* if JtJ is of full rank, C is computed as sumsq/(n-m)*(JtJ)^-1
553			* otherwise and if LAPACK is available, C=sumsq/(n-r)*(JtJ)^+
554			* where r is JtJ's rank and ^+ denotes the pseudoinverse
555			* The diagonal of C is made up from the estimates of the variances
556			* of the estimated regression coefficients.
557			* See the documentation of routine E04YCF from the NAG fortran lib
558			*
559			* The function returns the rank of JtJ if successful, 0 on error
560			*
561			* A and C are mxm
562			*
563			*/
564	160		int LEVMAR_COVAR(LM_REAL JtJ, LM_REAL C, LM_REAL sumsq, int m, int n)
	144
	16
565			{
566			register int i;
567			int rnk;
568			LM_REAL fact;
569
570			#ifdef HAVE_LAPACK
571			rnk=LEVMAR_PSEUDOINVERSE(JtJ, C, m);
572			if(!rnk) return 0;
573			#else
574			#ifdef _MSC_VER
575			#pragma message("LAPACK not available, LU will be used for matrix inversion when computing the covariance; this might be unstable at times")
576			#else
577			#warning LAPACK not available, LU will be used for matrix inversion when computing the covariance; this might be unstable at times
578			#endif // _MSC_VER
579
580	160		rnk=LEVMAR_LUINVERSE(JtJ, C, m);
	144
	16
581	160	50	if(!rnk) return 0;
	144	50
	16
582
583	160		rnk=m; /* assume full rank */
	144
	16
584			#endif /* HAVE_LAPACK */
585
586	160		fact=sumsq/(LM_REAL)(n-rnk);
	144
	16
587	1068	100	for(i=0; i
	923	100
	145
588	908		C[i]*=fact;
	779
	129
589
590	160		return rnk;
	144
	16
591			}
592
593			/* standard deviation of the best-fit parameter i.
594			* covar is the mxm covariance matrix of the best-fit parameters (see also LEVMAR_COVAR()).
595			*
596			* The standard deviation is computed as \sigma_{i} = \sqrt{C_{ii}}
597			*/
598	0		LM_REAL LEVMAR_STDDEV(LM_REAL *covar, int m, int i)
	0
	0
599			{
600	0		return (LM_REAL)sqrt(covar[i*m+i]);
	0
	0
601			}
602
603			/* Pearson's correlation coefficient of the best-fit parameters i and j.
604			* covar is the mxm covariance matrix of the best-fit parameters (see also LEVMAR_COVAR()).
605			*
606			* The coefficient is computed as \rho_{ij} = C_{ij} / sqrt(C_{ii} C_{jj})
607			*/
608	0		LM_REAL LEVMAR_CORCOEF(LM_REAL *covar, int m, int i, int j)
	0
	0
609			{
610	0		return (LM_REAL)(covar[im+j]/sqrt(covar[im+i]covar[jm+j]));
	0
	0
611			}
612
613			/* coefficient of determination.
614			* see http://en.wikipedia.org/wiki/Coefficient_of_determination
615			*/
616	0		LM_REAL LEVMAR_R2(void (func)(LM_REAL p, LM_REAL hx, int m, int n, void adata),
	0
	0
617			LM_REAL p, LM_REAL x, int m, int n, void *adata)
618			{
619			register int i;
620			register LM_REAL tmp;
621			LM_REAL SSerr, // sum of squared errors, i.e. residual sum of squares \sum_i (x_i-hx_i)^2
622			SStot, // \sum_i (x_i-xavg)^2
623			*hx, xavg;
624
625
626	0	0	if((hx=(LM_REAL )malloc(nsizeof(LM_REAL)))==NULL){
	0	0
	0
627	0		fprintf(stderr, RCAT("memory allocation request failed in ", LEVMAR_R2) "()\n");
	0
	0
628	0		exit(1);
	0
	0
629			}
630
631			/* hx=f(p) */
632	0		(*func)(p, hx, m, n, adata);
	0
	0
633
634	0	0	for(i=n, tmp=0.0; i-->0; )
	0	0
	0
635	0		tmp+=x[i];
	0
	0
636	0		xavg=tmp/(LM_REAL)n;
	0
	0
637
638	0	0	if(x)
	0	0
	0
639	0	0	for(i=n, SSerr=SStot=0.0; i-->0; ){
	0	0
	0
640	0		tmp=x[i]-hx[i];
	0
	0
641	0		SSerr+=tmp*tmp;
	0
	0
642
643	0		tmp=x[i]-xavg;
	0
	0
644	0		SStot+=tmp*tmp;
	0
	0
645			}
646			else /* x==0 */
647	0	0	for(i=n, SSerr=SStot=0.0; i-->0; ){
	0	0
	0
648	0		tmp=-hx[i];
	0
	0
649	0		SSerr+=tmp*tmp;
	0
	0
650
651	0		tmp=-xavg;
	0
	0
652	0		SStot+=tmp*tmp;
	0
	0
653			}
654
655	0		free(hx);
	0
	0
656
657	0		return LM_CNST(1.0) - SSerr/SStot;
	0
	0
658			}
659
660			/* check box constraints for consistency */
661	17		int LEVMAR_BOX_CHECK(LM_REAL lb, LM_REAL ub, int m)
	12
	5
662			{
663			register int i;
664
665	17	50	if(!lb \|\| !ub) return 1;
	12	50
	5	50
		50
666
667	82	100	for(i=0; i
	57	100
	25
668	65	50	if(lb[i]>ub[i]) return 0;
	45	50
	20
669
670	17		return 1;
	12
	5
671			}
672
673			#ifdef HAVE_LAPACK
674
675			/* compute the Cholesky decomposition of C in W, s.t. C=W^t W and W is upper triangular */
676			int LEVMAR_CHOLESKY(LM_REAL C, LM_REAL W, int m)
677			{
678			register int i, j;
679			int info;
680
681			/* copy weights array C to W so that LAPACK won't destroy it;
682			* C is assumed symmetric, hence no transposition is needed
683			*/
684			for(i=0, j=m*m; i
685			W[i]=C[i];
686
687			/* Cholesky decomposition */
688			POTF2("L", (int )&m, W, (int )&m, (int *)&info);
689			/* error treatment */
690			if(info!=0){
691			if(info<0){
692			fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2 in %s\n", -info, LCAT(LEVMAR_CHOLESKY, "()"));
693			}
694			else{
695			fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\n%s()\n", info,
696			RCAT("and the Cholesky factorization could not be completed in ", LEVMAR_CHOLESKY));
697			}
698			return LM_ERROR;
699			}
700
701			/* the decomposition is in the lower part of W (in column-major order!).
702			* zeroing the upper part makes it lower triangular which is equivalent to
703			* upper triangular in row-major order
704			*/
705			for(i=0; i
706			for(j=i+1; j
707			W[i+j*m]=0.0;
708
709			return 0;
710			}
711			#endif /* HAVE_LAPACK */
712
713
714			/* Compute e=x-y for two n-vectors x and y and return the squared L2 norm of e.
715			* e can coincide with either x or y; x can be NULL, in which case it is assumed
716			* to be equal to the zero vector.
717			* Uses loop unrolling and blocking to reduce bookkeeping overhead & pipeline
718			* stalls and increase instruction-level parallelism; see http://www.abarnett.demon.co.uk/tutorial.html
719			*/
720
721	232763		LM_REAL LEVMAR_L2NRMXMY(LM_REAL e, LM_REAL x, LM_REAL *y, int n)
722			{
723	232763		const int blocksize=8, bpwr=3; /* 8=2^3 */
724			register int i;
725			int j1, j2, j3, j4, j5, j6, j7;
726			int blockn;
727	232763		register LM_REAL sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0;
728
729			/* n may not be divisible by blocksize,
730			* go as near as we can first, then tidy up.
731			*/
732	232763		blockn = (n>>bpwr)<
733
734			/* unroll the loop in blocks of `blocksize'; looping downwards gains some more speed */
735	232763		if(x){
736	671841		for(i=blockn-1; i>0; i-=blocksize){
737	439078		e[i ]=x[i ]-y[i ]; sum0+=e[i ]*e[i ];
738	439078		j1=i-1; e[j1]=x[j1]-y[j1]; sum1+=e[j1]*e[j1];
739	439078		j2=i-2; e[j2]=x[j2]-y[j2]; sum2+=e[j2]*e[j2];
740	439078		j3=i-3; e[j3]=x[j3]-y[j3]; sum3+=e[j3]*e[j3];
741	439078		j4=i-4; e[j4]=x[j4]-y[j4]; sum0+=e[j4]*e[j4];
742	439078		j5=i-5; e[j5]=x[j5]-y[j5]; sum1+=e[j5]*e[j5];
743	439078		j6=i-6; e[j6]=x[j6]-y[j6]; sum2+=e[j6]*e[j6];
744	439078		j7=i-7; e[j7]=x[j7]-y[j7]; sum3+=e[j7]*e[j7];
745			}
746
747			/*
748			* There may be some left to do.
749			* This could be done as a simple for() loop,
750			* but a switch is faster (and more interesting)
751			*/
752
753	232763		i=blockn;
754	232763		if(i
755			/* Jump into the case at the place that will allow
756			* us to finish off the appropriate number of items.
757			*/
758
759	231645		switch(n - i){
760	0		case 7 : e[i]=x[i]-y[i]; sum0+=e[i]*e[i]; ++i;
761	0		case 6 : e[i]=x[i]-y[i]; sum1+=e[i]*e[i]; ++i;
762	0		case 5 : e[i]=x[i]-y[i]; sum2+=e[i]*e[i]; ++i;
763	191017		case 4 : e[i]=x[i]-y[i]; sum3+=e[i]*e[i]; ++i;
764	191334		case 3 : e[i]=x[i]-y[i]; sum0+=e[i]*e[i]; ++i;
765	231617		case 2 : e[i]=x[i]-y[i]; sum1+=e[i]*e[i]; ++i;
766	231645		case 1 : e[i]=x[i]-y[i]; sum2+=e[i]*e[i]; //++i;
767			}
768			}
769			}
770			else{ /* x==0 */
771	0		for(i=blockn-1; i>0; i-=blocksize){
772	0		e[i ]=-y[i ]; sum0+=e[i ]*e[i ];
773	0		j1=i-1; e[j1]=-y[j1]; sum1+=e[j1]*e[j1];
774	0		j2=i-2; e[j2]=-y[j2]; sum2+=e[j2]*e[j2];
775	0		j3=i-3; e[j3]=-y[j3]; sum3+=e[j3]*e[j3];
776	0		j4=i-4; e[j4]=-y[j4]; sum0+=e[j4]*e[j4];
777	0		j5=i-5; e[j5]=-y[j5]; sum1+=e[j5]*e[j5];
778	0		j6=i-6; e[j6]=-y[j6]; sum2+=e[j6]*e[j6];
779	0		j7=i-7; e[j7]=-y[j7]; sum3+=e[j7]*e[j7];
780			}
781
782			/*
783			* There may be some left to do.
784			* This could be done as a simple for() loop,
785			* but a switch is faster (and more interesting)
786			*/
787
788	0		i=blockn;
789	0		if(i
790			/* Jump into the case at the place that will allow
791			* us to finish off the appropriate number of items.
792			*/
793
794	0		switch(n - i){
795	0		case 7 : e[i]=-y[i]; sum0+=e[i]*e[i]; ++i;
796	0		case 6 : e[i]=-y[i]; sum1+=e[i]*e[i]; ++i;
797	0		case 5 : e[i]=-y[i]; sum2+=e[i]*e[i]; ++i;
798	0		case 4 : e[i]=-y[i]; sum3+=e[i]*e[i]; ++i;
799	0		case 3 : e[i]=-y[i]; sum0+=e[i]*e[i]; ++i;
800	0		case 2 : e[i]=-y[i]; sum1+=e[i]*e[i]; ++i;
801	0		case 1 : e[i]=-y[i]; sum2+=e[i]*e[i]; //++i;
802			}
803			}
804			}
805
806	232763		return sum0+sum1+sum2+sum3;
807			}
808
809			/* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */
810			#undef POTF2
811			#undef GESVD
812			#undef GESDD
813			#undef GEMM
814			#undef LEVMAR_PSEUDOINVERSE
815			#undef LEVMAR_LUINVERSE
816			#undef LEVMAR_BOX_CHECK
817			#undef LEVMAR_CHOLESKY
818			#undef LEVMAR_COVAR
819			#undef LEVMAR_STDDEV
820			#undef LEVMAR_CORCOEF
821			#undef LEVMAR_R2
822			#undef LEVMAR_CHKJAC
823			#undef LEVMAR_FDIF_FORW_JAC_APPROX
824			#undef LEVMAR_FDIF_CENT_JAC_APPROX
825			#undef LEVMAR_TRANS_MAT_MAT_MULT
826			#undef LEVMAR_L2NRMXMY