Actual source code: agmresorthog.c

petsc-3.12.2 2019-11-22
#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 *  This file implements the RODDEC algorithm: it orthogonalizes a set of vectors distributed across several processes, which are organized in a virtual ring.
 *  Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 *  Initial author: R. B. SIDJE
 *  Modified: G.-A. Atenekeng-Kahou, 2008
 *  Modified: D. NUENTSA WAKAM, 2011
 */
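/*
 * A minimal usage sketch showing where these routines fit: selecting the AGMRES
 * Krylov method, whose basis vectors get orthogonalized by the RODDEC routines
 * below. ExampleUseAGMRES is a hypothetical helper (not part of the solver);
 * A, b and x are assumed to be an existing Mat and Vecs, and error checking is omitted.
 */
static PetscErrorCode ExampleUseAGMRES(MPI_Comm comm, Mat A, Vec b, Vec x)
{
  KSP ksp;
  KSPCreate(comm, &ksp);        /* create the Krylov solver context */
  KSPSetOperators(ksp, A, A);   /* operator and preconditioning matrix */
  KSPSetType(ksp, KSPAGMRES);   /* equivalent to -ksp_type agmres */
  KSPSetFromOptions(ksp);
  KSPSolve(ksp, b, x);
  KSPDestroy(&ksp);
  return(0);
}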


/*
 * Organize the processes that own the vectors on a virtual ring
 * (an equivalent modular formulation is sketched after KSPAGMRESRoddecInitNeighboor below).
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal*, PetscReal*, PetscReal*, PetscInt);

PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm       comm;
  KSP_AGMRES     *agmres = (KSP_AGMRES*)(ksp->data);
  PetscMPIInt    First, Last, rank, size;

  PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm);
  MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm);

  if ((rank != First) && (rank != Last)) {
    /* interior process: both neighbours are adjacent ranks */
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  } else if (rank == Last) {
    /* last process: wrap around to First on the right */
    agmres->Ileft  = rank - 1;
    agmres->Iright = First;
  } else {
    /* rank == First: wrap around to Last on the left */
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  return(0);
}
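/*
 * A minimal sketch of the same ring layout written with modular arithmetic,
 * assuming the communicator ranks are 0..size-1 (so First == 0 and Last == size-1).
 * RingNeighbours is a hypothetical helper shown only to make the left/right
 * convention above explicit; it is not used by the solver.
 */
static void RingNeighbours(PetscMPIInt rank, PetscMPIInt size, PetscMPIInt *left, PetscMPIInt *right)
{
  *left  = (rank + size - 1) % size; /* predecessor on the ring (Last when rank == First) */
  *right = (rank + 1) % size;        /* successor on the ring (First when rank == Last)   */
}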

static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  return(0);
}
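/*
 * A small sketch of how the compact storage above is intended to be used: compute
 * the rotation that annihilates b against a, keep only the encoded value r, and
 * later rebuild (c, s) from r alone (the rebuilt pair may differ from the original
 * by a sign, which the callers absorb). DemoGivensStorage is a hypothetical helper,
 * not used by the solver.
 */
static void DemoGivensStorage(PetscReal a, PetscReal b)
{
  PetscReal c = a, s = b, r;

  KSPAGMRESRoddecGivens(&c, &s, &r, 1); /* build (c,s) from (a,b) and encode them in r */
  KSPAGMRESRoddecGivens(&c, &s, &r, 0); /* later: reconstruct (c,s) from r only        */
  (void)c; (void)s;                     /* c*c + s*s == 1 up to rounding               */
}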

/*
 * Compute the QR factorization of the Krylov basis vectors
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output:
 *  - agmres->Qloc : product of elementary reflectors for the QR of the (local part of the) vectors
 *  - agmres->sgn  : sign of the rotations
 *  - agmres->tloc : scalar factors of the elementary reflectors
 *  (the reflector convention is sketched after this function)
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES     *agmres = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc   = agmres->Qloc;
  PetscScalar    *sgn    = agmres->sgn;
  PetscScalar    *tloc   = agmres->tloc;
  PetscReal      *wbufptr = agmres->wbufptr;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First;
  PetscMPIInt    Last     = agmres->Last;
  PetscBLASInt   pas, len, bnloc, bpos;
  PetscInt       nloc, d, i, j, k;
  PetscInt       pos;
  PetscReal      c, s, rho, Ajj, val, tt, old;
  PetscScalar    *col;
  MPI_Status     status;
  PetscBLASInt   N = MAXKSPSIZE + 1;

  PetscObjectGetComm((PetscObject)ksp,&comm);
  PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);
  PetscArrayzero(agmres->Rloc, N*N);
  /* check input arguments */
  if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscBLASIntCast(nloc,&bnloc);
  if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    VecGetArray(VEC_V(j), &col);
    PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    VecRestoreArray(VEC_V(j), &col);
  }
  /* Each process performs a local QR on its own block */
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas);
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len            = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /* annihilate the undesirable part of Rloc, diagonal by diagonal */
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
    } else {
      MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d*nloc];
      KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   = c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] = s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
      }
      /* zero out the d-th diagonal of Rloc */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j*nloc+i-1];
        s = Qloc[j*nloc+i];
        KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   = s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }

  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscBLASIntCast(pos,&bpos);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal",BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD : should not be needed
   */
  MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);
  PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);
  return(0);
}
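/*
 * A minimal sketch of the elementary-reflector convention used in the local QR
 * above, assuming real scalars: with v(0) = 1, the remaining entries of the column
 * scaled by 1/(Ajj - rho), and tau = (Ajj - rho)/rho, the update
 * x <- x + tau * (v^T x) * v sends the original column to (rho, 0, ..., 0).
 * ApplyReflector is a hypothetical helper, not used by the solver.
 */
static void ApplyReflector(PetscInt n, const PetscReal *v, PetscReal tau, PetscReal *x)
{
  PetscReal dot = 0.0;
  PetscInt  i;

  for (i = 0; i < n; i++) dot += v[i] * x[i];       /* v^T x           */
  for (i = 0; i < n; i++) x[i] += tau * dot * v[i]; /* rank-one update */
}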


/*
 *  Computes Out <-- Q * In where Q is the orthogonal matrix from AGMRESRoddec
 *  Input:
 *   - Qloc, sgn, tloc, nvec (see AGMRESRoddec above)
 *   - In : input vector (size nvec)
 *  Output:
 *   - Out : PETSc vector (distributed as the basis vectors)
 *  (a sanity-check sketch follows this function)
 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES     *agmres  = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc    = agmres->Qloc;
  PetscScalar    *sgn     = agmres->sgn;
  PetscScalar    *tloc    = agmres->tloc;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First, Last = agmres->Last;
  PetscMPIInt    Iright   = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar    *y, *zloc;
  PetscInt       nloc, d, len, i, j;
  PetscBLASInt   bnvec, pas, blen;
  PetscInt       dpt;
  PetscReal      c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status     status;

  PetscBLASIntCast(nvec,&bnvec);
  PetscObjectGetComm((PetscObject)ksp,&comm);
  pas  = 1;
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscMalloc1(nvec, &y);
  PetscArraycpy(y, In, nvec);
  VecGetArray(Out, &zloc);

  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i         = j - d;
          KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] = c * zp + s * zq;
          zloc[i]   = -s * zp + c * zq;
        }
        KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        } else {
          MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        }
      }
    }
  }
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len       = nloc - j;
      PetscBLASIntCast(len,&blen);
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  VecRestoreArray(Out, &zloc);
  PetscFree(y);
  return(0);
}
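/*
 * A minimal sanity-check sketch for the routine above, assuming KSPAGMRESRoddec has
 * already been run on this ksp and that qi, qj are work Vecs laid out like the
 * Krylov basis vectors: applying Q to the canonical coefficient vectors e_i and e_j
 * should give vectors whose inner product is close to delta_ij.
 * CheckRodvecOrthogonality is a hypothetical helper, not part of the solver.
 */
static PetscErrorCode CheckRodvecOrthogonality(KSP ksp, PetscInt nvec, PetscInt i, PetscInt j, Vec qi, Vec qj, PetscReal *dot)
{
  PetscScalar *ei, *ej, val;

  PetscCalloc1(nvec, &ei); /* zero-initialized coefficient vectors */
  PetscCalloc1(nvec, &ej);
  ei[i] = 1.0;
  ej[j] = 1.0;
  KSPAGMRESRodvec(ksp, nvec, ei, qi); /* qi = Q * e_i */
  KSPAGMRESRodvec(ksp, nvec, ej, qj); /* qj = Q * e_j */
  VecDot(qi, qj, &val);               /* ~1 if i == j, ~0 otherwise */
  *dot = PetscRealPart(val);
  PetscFree(ei);
  PetscFree(ej);
  return(0);
}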