Actual source code: fnutil.c
slepc-3.9.1 2018-05-02
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2018, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Utility subroutines common to several impls
12: */
14: #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15: #include <slepcblaslapack.h>
17: /*
18: Compute the square root of an upper quasi-triangular matrix T,
19: using Higham's algorithm (LAA 88, 1987). T is overwritten with sqrtm(T).
20: */
21: PetscErrorCode SlepcMatDenseSqrt(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld)
22: {
23: #if defined(SLEPC_MISSING_LAPACK_TRSYL)
25: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"TRSYL - Lapack routine unavailable");
26: #else
27: PetscScalar one=1.0,mone=-1.0;
28: PetscReal scal;
29: PetscBLASInt i,j,si,sj,r,ione=1,info;
30: #if !defined(PETSC_USE_COMPLEX)
31: PetscReal alpha,theta,mu,mu2;
32: #endif
35: for (j=0;j<n;j++) {
36: #if defined(PETSC_USE_COMPLEX)
37: sj = 1;
38: T[j+j*ld] = PetscSqrtScalar(T[j+j*ld]);
39: #else
40: sj = (j==n-1 || T[j+1+j*ld] == 0.0)? 1: 2;
41: if (sj==1) {
42: if (T[j+j*ld]<0.0) SETERRQ(PETSC_COMM_SELF,1,"Matrix has a real negative eigenvalue, no real primary square root exists");
43: T[j+j*ld] = PetscSqrtReal(T[j+j*ld]);
44: } else {
45: /* square root of 2x2 block */
46: theta = (T[j+j*ld]+T[j+1+(j+1)*ld])/2.0;
47: mu = (T[j+j*ld]-T[j+1+(j+1)*ld])/2.0;
48: mu2 = -mu*mu-T[j+1+j*ld]*T[j+(j+1)*ld];
49: mu = PetscSqrtReal(mu2);
50: if (theta>0.0) alpha = PetscSqrtReal((theta+PetscSqrtReal(theta*theta+mu2))/2.0);
51: else alpha = mu/PetscSqrtReal(2.0*(-theta+PetscSqrtReal(theta*theta+mu2)));
52: T[j+j*ld] /= 2.0*alpha;
53: T[j+1+(j+1)*ld] /= 2.0*alpha;
54: T[j+(j+1)*ld] /= 2.0*alpha;
55: T[j+1+j*ld] /= 2.0*alpha;
56: T[j+j*ld] += alpha-theta/(2.0*alpha);
57: T[j+1+(j+1)*ld] += alpha-theta/(2.0*alpha);
58: }
59: #endif
60: for (i=j-1;i>=0;i--) {
61: #if defined(PETSC_USE_COMPLEX)
62: si = 1;
63: #else
64: si = (i==0 || T[i+(i-1)*ld] == 0.0)? 1: 2;
65: if (si==2) i--;
66: #endif
67: /* solve Sylvester equation of order si x sj */
68: r = j-i-si;
69: if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&si,&sj,&r,&mone,T+i+(i+si)*ld,&ld,T+i+si+j*ld,&ld,&one,T+i+j*ld,&ld));
70: PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,&si,&sj,T+i+i*ld,&ld,T+j+j*ld,&ld,T+i+j*ld,&ld,&scal,&info));
71: SlepcCheckLapackInfo("trsyl",info);
72: if (scal!=1.0) SETERRQ1(PETSC_COMM_SELF,1,"Current implementation cannot handle scale factor %g",scal);
73: }
74: if (sj==2) j++;
75: }
76: return(0);
77: #endif
78: }
80: #define BLOCKSIZE 64
82: /*
83: Schur method for the square root of an upper quasi-triangular matrix T.
84: T is overwritten with sqrtm(T).
85: If firstonly then only the first column of T will contain relevant values.
86: */
87: PetscErrorCode SlepcSqrtmSchur(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool firstonly)
88: {
89: #if defined(SLEPC_MISSING_LAPACK_GEES) || defined(SLEPC_MISSING_LAPACK_TRSYL)
91: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEES/TRSYL - Lapack routines are unavailable");
92: #else
94: PetscBLASInt i,j,k,r,ione=1,sdim,lwork,*s,*p,info,bs=BLOCKSIZE;
95: PetscScalar *wr,*W,*Q,*work,one=1.0,zero=0.0,mone=-1.0;
96: PetscInt m,nblk;
97: PetscReal scal;
98: #if defined(PETSC_USE_COMPLEX)
99: PetscReal *rwork;
100: #else
101: PetscReal *wi;
102: #endif
105: m = n;
106: nblk = (m+bs-1)/bs;
107: lwork = 5*n;
108: k = firstonly? 1: n;
110: /* compute Schur decomposition A*Q = Q*T */
111: #if !defined(PETSC_USE_COMPLEX)
112: PetscMalloc7(m,&wr,m,&wi,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
113: PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,wi,Q,&ld,work,&lwork,NULL,&info));
114: #else
115: PetscMalloc7(m,&wr,m,&rwork,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
116: PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,Q,&ld,work,&lwork,rwork,NULL,&info));
117: #endif
118: SlepcCheckLapackInfo("gees",info);
120: /* determine block sizes and positions, to avoid cutting 2x2 blocks */
121: j = 0;
122: p[j] = 0;
123: do {
124: s[j] = PetscMin(bs,n-p[j]);
125: #if !defined(PETSC_USE_COMPLEX)
126: if (p[j]+s[j]!=n && T[p[j]+s[j]+(p[j]+s[j]-1)*ld]!=0.0) s[j]++;
127: #endif
128: if (p[j]+s[j]==n) break;
129: j++;
130: p[j] = p[j-1]+s[j-1];
131: } while (1);
132: nblk = j+1;
134: for (j=0;j<nblk;j++) {
135: /* evaluate f(T_jj) */
136: SlepcMatDenseSqrt(s[j],T+p[j]+p[j]*ld,ld);
137: for (i=j-1;i>=0;i--) {
138: /* solve Sylvester equation for block (i,j) */
139: r = p[j]-p[i]-s[i];
140: if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",s+i,s+j,&r,&mone,T+p[i]+(p[i]+s[i])*ld,&ld,T+p[i]+s[i]+p[j]*ld,&ld,&one,T+p[i]+p[j]*ld,&ld));
141: PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,s+i,s+j,T+p[i]+p[i]*ld,&ld,T+p[j]+p[j]*ld,&ld,T+p[i]+p[j]*ld,&ld,&scal,&info));
142: SlepcCheckLapackInfo("trsyl",info);
143: if (scal!=1.0) SETERRQ1(PETSC_COMM_SELF,1,"Current implementation cannot handle scale factor %g",scal);
144: }
145: }
147: /* backtransform B = Q*T*Q' */
148: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","C",&n,&k,&n,&one,T,&ld,Q,&ld,&zero,W,&ld));
149: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&k,&n,&one,Q,&ld,W,&ld,&zero,T,&ld));
151: /* flop count: Schur decomposition, triangular square root, and backtransform */
152: PetscLogFlops(25.0*n*n*n+n*n*n/3.0+4.0*n*n*k);
154: #if !defined(PETSC_USE_COMPLEX)
155: PetscFree7(wr,wi,W,Q,work,s,p);
156: #else
157: PetscFree7(wr,rwork,W,Q,work,s,p);
158: #endif
159: return(0);
160: #endif
161: }
163: #define DBMAXIT 25
165: /*
166: Computes the principal square root of the matrix T using the product form
167: of the Denman-Beavers iteration.
168: T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
169: */
170: PetscErrorCode SlepcSqrtmDenmanBeavers(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
171: {
172: #if defined(PETSC_MISSING_LAPACK_GETRF) || defined(PETSC_MISSING_LAPACK_GETRI)
174: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GETRF/GETRI - Lapack routine is unavailable");
175: #else
176: PetscScalar *Told,*M=NULL,*invM,*work,work1,prod,alpha;
177: PetscScalar szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sp25=0.25;
178: PetscReal tol,Mres,detM,g,reldiff,fnormdiff,fnormT,rwork[1];
179: PetscBLASInt N,i,it,*piv=NULL,info,query=-1,lwork;
180: const PetscBLASInt one=1;
181: PetscBool converged=PETSC_FALSE,scale=PETSC_FALSE;
182: PetscErrorCode ierr;
183: unsigned int ftz;
186: N = n*n;
187: tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
188: SlepcSetFlushToZero(&ftz);
190: /* query work size */
191: PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M,&ld,piv,&work1,&query,&info));
192: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
193: PetscMalloc5(lwork,&work,n,&piv,n*n,&Told,n*n,&M,n*n,&invM);
194: PetscMemcpy(M,T,n*n*sizeof(PetscScalar));
196: if (inv) { /* start recurrence with I instead of A */
197: PetscMemzero(T,n*n*sizeof(PetscScalar));
198: for (i=0;i<n;i++) T[i+i*ld] += 1.0;
199: }
201: for (it=0;it<DBMAXIT && !converged;it++) {
203: if (scale) { /* g = (abs(det(M)))^(-1/(2*n)) */
204: PetscMemcpy(invM,M,n*n*sizeof(PetscScalar));
205: PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
206: SlepcCheckLapackInfo("getrf",info);
207: prod = invM[0];
208: for(i=1;i<n;i++) prod *= invM[i+i*ld];
209: detM = PetscAbsScalar(prod);
210: g = PetscPowReal(detM,-1.0/(2.0*n));
211: alpha = g;
212: PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,T,&one));
213: alpha = g*g;
214: PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,M,&one));
215: PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n);
216: }
218: PetscMemcpy(Told,T,n*n*sizeof(PetscScalar));
219: PetscMemcpy(invM,M,n*n*sizeof(PetscScalar));
221: PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
222: SlepcCheckLapackInfo("getrf",info);
223: PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,invM,&ld,piv,work,&lwork,&info));
224: SlepcCheckLapackInfo("getri",info);
225: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);
227: for (i=0;i<n;i++) invM[i+i*ld] += 1.0;
228: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Told,&ld,invM,&ld,&szero,T,&ld));
229: for (i=0;i<n;i++) invM[i+i*ld] -= 1.0;
231: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&sone,invM,&one,M,&one));
232: PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sp25,M,&one));
233: for (i=0;i<n;i++) M[i+i*ld] -= 0.5;
234: PetscLogFlops(2.0*n*n*n+2.0*n*n);
236: Mres = LAPACKlange_("F",&n,&n,M,&n,rwork);
237: for (i=0;i<n;i++) M[i+i*ld] += 1.0;
239: if (scale) {
240: /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */
241: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,T,&one,Told,&one));
242: fnormdiff = LAPACKlange_("F",&n,&n,Told,&n,rwork);
243: fnormT = LAPACKlange_("F",&n,&n,T,&n,rwork);
244: PetscLogFlops(7.0*n*n);
245: reldiff = fnormdiff/fnormT;
246: PetscInfo4(NULL,"it: %D reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g);
247: if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch off scaling */
248: }
250: if (Mres<=tol) converged = PETSC_TRUE;
251: }
253: if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",DBMAXIT);
254: PetscFree5(work,piv,Told,M,invM);
255: SlepcResetFlushToZero(&ftz);
256: return(0);
257: #endif
258: }
260: #define NSMAXIT 50
262: /*
263: Computes the principal square root of the matrix A using the Newton-Schulz iteration.
264: T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
265: */
266: PetscErrorCode SlepcSqrtmNewtonSchulz(PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
267: {
268: PetscScalar *Y=A,*Yold,*Z,*Zold,*M,alpha,sqrtnrm;
269: PetscScalar szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
270: PetscReal tol,Yres,nrm,rwork[1];
271: PetscBLASInt i,it,N;
272: const PetscBLASInt one=1;
273: PetscBool converged=PETSC_FALSE;
274: PetscErrorCode ierr;
275: unsigned int ftz;
278: N = n*n;
279: tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
280: SlepcSetFlushToZero(&ftz);
282: PetscMalloc4(N,&Yold,N,&Z,N,&Zold,N,&M);
284: /* scale A so that ||I-A|| < 1 */
285: PetscMemcpy(Z,A,N*sizeof(PetscScalar));
286: for (i=0;i<n;i++) Z[i+i*ld] -= 1.0;
287: nrm = LAPACKlange_("fro",&n,&n,Z,&n,rwork);
288: sqrtnrm = PetscSqrtReal(nrm);
289: alpha = 1.0/nrm;
290: PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,A,&one));
291: tol *= nrm;
292: PetscInfo2(NULL,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
293: PetscLogFlops(2.0*n*n);
295: /* Z = I */
296: PetscMemzero(Z,N*sizeof(PetscScalar));
297: for (i=0;i<n;i++) Z[i+i*ld] = 1.0;
299: for (it=0;it<NSMAXIT && !converged;it++) {
300: /* Yold = Y, Zold = Z */
301: PetscMemcpy(Yold,Y,N*sizeof(PetscScalar));
302: PetscMemcpy(Zold,Z,N*sizeof(PetscScalar));
304: /* M = (3*I-Zold*Yold) */
305: PetscMemzero(M,N*sizeof(PetscScalar));
306: for (i=0;i<n;i++) M[i+i*ld] = sthree;
307: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&smone,Zold,&ld,Yold,&ld,&sone,M,&ld));
309: /* Y = (1/2)*Yold*M, Z = (1/2)*M*Zold */
310: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Yold,&ld,M,&ld,&szero,Y,&ld));
311: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,M,&ld,Zold,&ld,&szero,Z,&ld));
313: /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
314: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,Y,&one,Yold,&one));
315: Yres = LAPACKlange_("fro",&n,&n,Yold,&n,rwork);
316: PetscIsNanReal(Yres);
317: if (Yres<=tol) converged = PETSC_TRUE;
318: PetscInfo2(NULL,"it: %D res: %g\n",it,(double)Yres);
320: PetscLogFlops(6.0*n*n*n+2.0*n*n);
321: }
323: if (Yres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",NSMAXIT);
325: /* undo scaling */
326: if (inv) {
327: PetscMemcpy(A,Z,N*sizeof(PetscScalar));
328: sqrtnrm = 1.0/sqrtnrm;
329: PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sqrtnrm,A,&one));
330: } else PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sqrtnrm,A,&one));
332: PetscFree4(Yold,Z,Zold,M);
333: SlepcResetFlushToZero(&ftz);
334: return(0);
335: }