Actual source code: mpimatmatmult.c

petsc-3.4.2 2013-07-02
  2: /*
  3:   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
  4:           C = A * B
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
  7: #include <../src/mat/utils/freespace.h>
  8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  9: #include <petscbt.h>
 10: #include <../src/mat/impls/dense/mpi/mpidense.h>
 11: #include <petsc-private/vecimpl.h>

 15: PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
 16: {

 20:   if (scall == MAT_INITIAL_MATRIX) {
 21:     PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
 22:     MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
 23:     PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
 24:   }
 25:   PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
 26:   (*(*C)->ops->matmultnumeric)(A,B,*C);
 27:   PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
 28:   return(0);
 29: }
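
The driver above runs the symbolic phase only when scall is MAT_INITIAL_MATRIX and runs the numeric phase on every call, so the product can be recomputed cheaply after the values (but not the nonzero patterns) of A or B change. A minimal caller-side sketch, not part of this file; the helper name and the use of PETSC_DEFAULT for fill are illustrative:

/* Illustrative sketch only (error checking omitted, matching this listing) */
#include <petscmat.h>

PetscErrorCode ExampleMatMatMult(Mat A,Mat B)
{
  Mat C;

  /* first call: symbolic + numeric product, C is created */
  MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);

  /* ... update values of A and/or B, keeping their nonzero patterns ... */

  /* later calls: reuse the symbolic data attached to C, numeric phase only */
  MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);

  MatDestroy(&C);
  return(0);
}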

 33: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
 34: {
 36:   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
 37:   Mat_PtAPMPI    *ptap = a->ptap;

 40:   PetscFree2(ptap->startsj_s,ptap->startsj_r);
 41:   PetscFree(ptap->bufa);
 42:   MatDestroy(&ptap->P_loc);
 43:   MatDestroy(&ptap->P_oth);
 44:   MatDestroy(&ptap->Pt);
 45:   PetscFree(ptap->api);
 46:   PetscFree(ptap->apj);
 47:   PetscFree(ptap->apa);
 48:   ptap->destroy(A);
 49:   PetscFree(ptap);
 50:   return(0);
 51: }

 55: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
 56: {
 58:   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
 59:   Mat_PtAPMPI    *ptap = a->ptap;

 62:   (*ptap->duplicate)(A,op,M);

 64:   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
 65:   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
 66:   return(0);
 67: }

 71: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
 72: {
 74:   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
 75:   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
 76:   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
 77:   PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
 78:   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
 79:   Mat_SeqAIJ     *p_loc,*p_oth;
 80:   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
 81:   PetscScalar    *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
 82:   PetscInt       cm   =C->rmap->n,anz,pnz;
 83:   Mat_PtAPMPI    *ptap=c->ptap;
 84:   PetscInt       *api,*apj,*apJ,i,j,k,row;
 85:   PetscInt       cstart=C->cmap->rstart;
 86:   PetscInt       cdnz,conz,k0,k1;

 89:   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
 90:   /*-----------------------------------------------------*/
 91:   /* update numerical values of P_oth and P_loc */
 92:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
 93:   MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);

 95:   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
 96:   /*----------------------------------------------------------*/
 97:   /* get data from symbolic products */
 98:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
 99:   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
100:   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
101:   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

103:   /* get apa for storing dense row A[i,:]*P */
104:   apa = ptap->apa;

106:   api = ptap->api;
107:   apj = ptap->apj;
108:   for (i=0; i<cm; i++) {
109:     /* diagonal portion of A */
110:     anz = adi[i+1] - adi[i];
111:     adj = ad->j + adi[i];
112:     ada = ad->a + adi[i];
113:     for (j=0; j<anz; j++) {
114:       row = adj[j];
115:       pnz = pi_loc[row+1] - pi_loc[row];
116:       pj  = pj_loc + pi_loc[row];
117:       pa  = pa_loc + pi_loc[row];

119:       /* perform dense axpy */
120:       valtmp = ada[j];
121:       for (k=0; k<pnz; k++) {
122:         apa[pj[k]] += valtmp*pa[k];
123:       }
124:       PetscLogFlops(2.0*pnz);
125:     }

127:     /* off-diagonal portion of A */
128:     anz = aoi[i+1] - aoi[i];
129:     aoj = ao->j + aoi[i];
130:     aoa = ao->a + aoi[i];
131:     for (j=0; j<anz; j++) {
132:       row = aoj[j];
133:       pnz = pi_oth[row+1] - pi_oth[row];
134:       pj  = pj_oth + pi_oth[row];
135:       pa  = pa_oth + pi_oth[row];

137:       /* perform dense axpy */
138:       valtmp = aoa[j];
139:       for (k=0; k<pnz; k++) {
140:         apa[pj[k]] += valtmp*pa[k];
141:       }
142:       PetscLogFlops(2.0*pnz);
143:     }

145:     /* set values in C */
146:     apJ  = apj + api[i];
147:     cdnz = cd->i[i+1] - cd->i[i];
148:     conz = co->i[i+1] - co->i[i];

150:     /* 1st off-diagonal part of C */
151:     ca = coa + co->i[i];
152:     k  = 0;
153:     for (k0=0; k0<conz; k0++) {
154:       if (apJ[k] >= cstart) break;
155:       ca[k0]      = apa[apJ[k]];
156:       apa[apJ[k]] = 0.0;
157:       k++;
158:     }

160:     /* diagonal part of C */
161:     ca = cda + cd->i[i];
162:     for (k1=0; k1<cdnz; k1++) {
163:       ca[k1]      = apa[apJ[k]];
164:       apa[apJ[k]] = 0.0;
165:       k++;
166:     }

168:     /* 2nd off-diagonal part of C */
169:     ca = coa + co->i[i];
170:     for (; k0<conz; k0++) {
171:       ca[k0]      = apa[apJ[k]];
172:       apa[apJ[k]] = 0.0;
173:       k++;
174:     }
175:   }
176:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
177:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
178:   return(0);
179: }
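
The loop body above is the classical row-by-row sparse product with a dense accumulator: each nonzero of A[i,:] scales a row of P_loc or P_oth into the dense work array apa, and apa is then gathered back out in C's own column order and reset. A stripped-down sketch of that kernel on plain CSR arrays, without the diagonal/off-diagonal splitting (illustrative only; the array names are assumptions):

/* Sketch: one row of C = A*B with CSR inputs and a dense accumulator.
   ai,aj,aa and bi,bj,ba are standard CSR arrays; apa has length equal to the
   number of columns of B and must be zero on entry; ci,cj give C's pattern. */
static void SparseGemmRowDense(int i,const int *ai,const int *aj,const double *aa,
                               const int *bi,const int *bj,const double *ba,
                               double *apa,const int *ci,const int *cj,double *ca)
{
  int j,k;
  for (j=ai[i]; j<ai[i+1]; j++) {            /* for each nonzero A[i,row] */
    int    row = aj[j];
    double v   = aa[j];
    for (k=bi[row]; k<bi[row+1]; k++) apa[bj[k]] += v*ba[k];   /* dense axpy */
  }
  for (k=ci[i]; k<ci[i+1]; k++) {            /* gather into C and reset apa */
    ca[k]      = apa[cj[k]];
    apa[cj[k]] = 0.0;
  }
}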

183: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
184: {
185:   PetscErrorCode     ierr;
186:   MPI_Comm           comm;
187:   Mat                Cmpi;
188:   Mat_PtAPMPI        *ptap;
189:   PetscFreeSpaceList free_space=NULL,current_space=NULL;
190:   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
191:   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
192:   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
193:   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
194:   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
195:   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
196:   PetscBT            lnkbt;
197:   PetscScalar        *apa;
198:   PetscReal          afill;
199:   PetscBool          scalable=PETSC_TRUE;
200:   PetscInt           nlnk_max,armax,prmax;

203:   PetscObjectGetComm((PetscObject)A,&comm);
204:   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
205:     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
206:   }

208:   PetscObjectOptionsBegin((PetscObject)A);
209:   PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,NULL);
210:   if (scalable) {
211:     MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);
212:     return(0);
213:   }
214:   PetscOptionsEnd();

216:   /* create struct Mat_PtAPMPI and attach it to C later */
217:   PetscNew(Mat_PtAPMPI,&ptap);

219:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
220:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);

222:   /* get P_loc by taking all local rows of P */
223:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

225:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
226:   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
227:   pi_loc = p_loc->i; pj_loc = p_loc->j;
228:   pi_oth = p_oth->i; pj_oth = p_oth->j;

230:   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
231:   /*-------------------------------------------------------------------*/
232:   PetscMalloc((am+2)*sizeof(PetscInt),&api);
233:   ptap->api = api;
234:   api[0]    = 0;

236:   /* create and initialize a linked list */
237:   armax    = ad->rmax+ao->rmax;
238:   prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
239:   nlnk_max = armax*prmax;
240:   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
241:   PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);

243:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
244:   PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);

246:   current_space = free_space;

248:   MatPreallocateInitialize(comm,am,pn,dnz,onz);
249:   for (i=0; i<am; i++) {
250:     /* diagonal portion of A */
251:     nzi = adi[i+1] - adi[i];
252:     for (j=0; j<nzi; j++) {
253:       row  = *adj++;
254:       pnz  = pi_loc[row+1] - pi_loc[row];
255:       Jptr = pj_loc + pi_loc[row];
256:       /* add non-zero cols of P into the sorted linked list lnk */
257:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
258:     }
259:     /* off-diagonal portion of A */
260:     nzi = aoi[i+1] - aoi[i];
261:     for (j=0; j<nzi; j++) {
262:       row  = *aoj++;
263:       pnz  = pi_oth[row+1] - pi_oth[row];
264:       Jptr = pj_oth + pi_oth[row];
265:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
266:     }

268:     apnz     = lnk[0];
269:     api[i+1] = api[i] + apnz;

271:     /* if free space is not available, double the total space in the list */
272:     if (current_space->local_remaining<apnz) {
273:       PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
274:       nspacedouble++;
275:     }

277:     /* Copy data into free space, then initialize lnk */
278:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
279:     MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);

281:     current_space->array           += apnz;
282:     current_space->local_used      += apnz;
283:     current_space->local_remaining -= apnz;
284:   }

286:   /* Allocate space for apj, initialize apj, and */
287:   /* destroy list of free space and other temporary array(s) */
288:   PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);
289:   apj  = ptap->apj;
290:   PetscFreeSpaceContiguous(&free_space,ptap->apj);
291:   PetscLLDestroy(lnk,lnkbt);

293:   /* malloc apa to store dense row A[i,:]*P */
294:   PetscMalloc(pN*sizeof(PetscScalar),&apa);
295:   PetscMemzero(apa,pN*sizeof(PetscScalar));

297:   ptap->apa = apa;

299:   /* create and assemble symbolic parallel matrix Cmpi */
300:   /*----------------------------------------------------*/
301:   MatCreate(comm,&Cmpi);
302:   MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
303:   MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);

305:   MatSetType(Cmpi,MATMPIAIJ);
306:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
307:   MatPreallocateFinalize(dnz,onz);
308:   for (i=0; i<am; i++) {
309:     row  = i + rstart;
310:     apnz = api[i+1] - api[i];
311:     MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
312:     apj += apnz;
313:   }
314:   MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
315:   MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);

317:   ptap->destroy        = Cmpi->ops->destroy;
318:   ptap->duplicate      = Cmpi->ops->duplicate;
319:   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
320:   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;

322:   /* attach the supporting struct to Cmpi for reuse */
323:   c       = (Mat_MPIAIJ*)Cmpi->data;
324:   c->ptap = ptap;

326:   *C = Cmpi;

328:   /* set MatInfo */
329:   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
330:   if (afill < 1.0) afill = 1.0;
331:   Cmpi->info.mallocs           = nspacedouble;
332:   Cmpi->info.fill_ratio_given  = fill;
333:   Cmpi->info.fill_ratio_needed = afill;

335: #if defined(PETSC_USE_INFO)
336:   if (api[am]) {
337:     PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
338:     PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
339:   } else {
340:     PetscInfo(Cmpi,"Empty matrix product\n");
341:   }
342: #endif
343:   return(0);
344: }
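
The symbolic phase above computes, for each row of A, the union of the column patterns of the selected rows of P_loc and P_oth; PetscLLCondensedAddSorted/Clean keep that union as a sorted condensed linked list so each row costs O(apnz) rather than O(pN). A small sketch of the same per-row pattern union using a plain marker array instead of the linked list (illustrative only, not the PETSc data structure; the result is left unsorted here):

/* Sketch: number of nonzeros in row i of C = A*B from CSR patterns only.
   marked[] has length ncolB and must be zero on entry; it is reset before return. */
static int SymbolicRowNnz(int i,const int *ai,const int *aj,
                          const int *bi,const int *bj,int *marked,int *cols)
{
  int j,k,cnz = 0;
  for (j=ai[i]; j<ai[i+1]; j++) {
    int row = aj[j];
    for (k=bi[row]; k<bi[row+1]; k++) {
      if (!marked[bj[k]]) { marked[bj[k]] = 1; cols[cnz++] = bj[k]; }
    }
  }
  for (j=0; j<cnz; j++) marked[cols[j]] = 0;   /* reset for the next row */
  return cnz;                                  /* api[i+1] = api[i] + cnz */
}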

348: PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
349: {

353:   if (scall == MAT_INITIAL_MATRIX) {
354:     PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
355:     MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
356:     PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
357:   }
358:   PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
359:   MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
360:   PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
361:   return(0);
362: }
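
A hypothetical caller-side sketch for this AIJ-times-dense path: it builds a compatible MATMPIDENSE B for an existing MPIAIJ matrix A and calls MatMatMult, which dispatches to the routines above. The helper name and ncols are assumptions; error checking is omitted to match the listing:

#include <petscmat.h>

PetscErrorCode ExampleAIJTimesDense(Mat A,PetscInt ncols)
{
  Mat      B,C;
  PetscInt am,an,AM,AN;

  MatGetLocalSize(A,&am,&an);
  MatGetSize(A,&AM,&AN);

  /* B's row layout must match A's column layout */
  MatCreate(PetscObjectComm((PetscObject)A),&B);
  MatSetSizes(B,an,PETSC_DECIDE,AN,ncols);
  MatSetType(B,MATMPIDENSE);
  MatMPIDenseSetPreallocation(B,NULL);
  /* ... fill B with MatSetValues() ... */
  MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

  MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);

  MatDestroy(&C);
  MatDestroy(&B);
  return(0);
}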

364: typedef struct {
365:   Mat         workB;
366:   PetscScalar *rvalues,*svalues;
367:   MPI_Request *rwaits,*swaits;
368: } MPIAIJ_MPIDense;

372: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
373: {
374:   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
375:   PetscErrorCode  ierr;

378:   MatDestroy(&contents->workB);
379:   PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
380:   PetscFree(contents);
381:   return(0);
382: }

386: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
387: {
388:   PetscErrorCode         ierr;
389:   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
390:   PetscInt               nz   = aij->B->cmap->n;
391:   PetscContainer         container;
392:   MPIAIJ_MPIDense        *contents;
393:   VecScatter             ctx   = aij->Mvctx;
394:   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
395:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
396:   PetscInt               m     = A->rmap->n,n=B->cmap->n;

399:   MatCreate(PetscObjectComm((PetscObject)B),C);
400:   MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
401:   MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);
402:   MatSetType(*C,MATMPIDENSE);
403:   MatMPIDenseSetPreallocation(*C,NULL);
404:   MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
405:   MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);

407:   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

409:   PetscNew(MPIAIJ_MPIDense,&contents);
410:   /* Create work matrix used to store off processor rows of B needed for local product */
411:   MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
412:   /* Create work arrays needed */
413:   PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
414:                       B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
415:                       from->n,MPI_Request,&contents->rwaits,
416:                       to->n,MPI_Request,&contents->swaits);

418:   PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
419:   PetscContainerSetPointer(container,contents);
420:   PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
421:   PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
422:   PetscContainerDestroy(&container);
423:   return(0);
424: }
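
The communication workspace is attached to *C through a PetscContainer keyed by the string "workB", so MatMPIDenseScatter() below can retrieve it with PetscObjectQuery() and it is freed automatically when C is destroyed. A generic sketch of that compose/query pattern, with a hypothetical context type and key name and error checking omitted:

#include <petscmat.h>

typedef struct { PetscScalar *work; } MyCtx;     /* hypothetical context */

PetscErrorCode MyCtxDestroy(void *ptr)
{
  MyCtx *ctx = (MyCtx*)ptr;
  PetscFree(ctx->work);
  PetscFree(ctx);
  return(0);
}

PetscErrorCode AttachAndQueryCtx(Mat C)
{
  PetscContainer container;
  MyCtx          *ctx,*found;

  PetscNew(MyCtx,&ctx);
  PetscMalloc(16*sizeof(PetscScalar),&ctx->work);

  PetscContainerCreate(PetscObjectComm((PetscObject)C),&container);
  PetscContainerSetPointer(container,ctx);
  PetscContainerSetUserDestroy(container,MyCtxDestroy);
  PetscObjectCompose((PetscObject)C,"myctx",(PetscObject)container);
  PetscContainerDestroy(&container);             /* C now holds the only reference */

  PetscObjectQuery((PetscObject)C,"myctx",(PetscObject*)&container);
  PetscContainerGetPointer(container,(void**)&found);   /* found == ctx */
  return(0);
}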

428: /*
429:     Performs an efficient scatter on the rows of B needed by this process; this is
430:     a modification of the VecScatterBegin_() routines.
431: */
432: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
433: {
434:   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
435:   PetscErrorCode         ierr;
436:   PetscScalar            *b,*w,*svalues,*rvalues;
437:   VecScatter             ctx   = aij->Mvctx;
438:   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
439:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
440:   PetscInt               i,j,k;
441:   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
442:   PetscMPIInt            *sprocs,*rprocs,nrecvs;
443:   MPI_Request            *swaits,*rwaits;
444:   MPI_Comm               comm;
445:   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
446:   MPI_Status             status;
447:   MPIAIJ_MPIDense        *contents;
448:   PetscContainer         container;
449:   Mat                    workB;

452:   PetscObjectGetComm((PetscObject)A,&comm);
453:   PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
454:   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
455:   PetscContainerGetPointer(container,(void**)&contents);

457:   workB = *outworkB = contents->workB;
458:   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of columns of aij->B %D not equal to number of rows of workB %D",nrows,workB->rmap->n);
459:   sindices = to->indices;
460:   sstarts  = to->starts;
461:   sprocs   = to->procs;
462:   swaits   = contents->swaits;
463:   svalues  = contents->svalues;

465:   rindices = from->indices;
466:   rstarts  = from->starts;
467:   rprocs   = from->procs;
468:   rwaits   = contents->rwaits;
469:   rvalues  = contents->rvalues;

471:   MatDenseGetArray(B,&b);
472:   MatDenseGetArray(workB,&w);

474:   for (i=0; i<from->n; i++) {
475:     MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
476:   }

478:   for (i=0; i<to->n; i++) {
479:     /* pack a message at a time */
480:     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
481:       for (k=0; k<ncols; k++) {
482:         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
483:       }
484:     }
485:     MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
486:   }

488:   nrecvs = from->n;
489:   while (nrecvs) {
490:     MPI_Waitany(from->n,rwaits,&imdex,&status);
491:     nrecvs--;
492:     /* unpack a message at a time */
493:     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
494:       for (k=0; k<ncols; k++) {
495:         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
496:       }
497:     }
498:   }
499:   if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}

501:   MatDenseRestoreArray(B,&b);
502:   MatDenseRestoreArray(workB,&w);
503:   MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
504:   MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
505:   return(0);
506: }
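
The scatter above is the standard nonblocking exchange: post all receives first, pack and post the sends, then drain the receives with MPI_Waitany and unpack each message as it completes, and finally wait on the sends. A self-contained toy version of that pattern, with one value per neighbor and hypothetical argument names:

#include <mpi.h>
#include <stdlib.h>

void ExchangeWithNeighbors(MPI_Comm comm,int nneigh,const int *neigh,
                           double *sendval,double *recvval)
{
  MPI_Request *rwaits,*swaits;
  MPI_Status  status;
  int         i,idx,tag = 0,nleft = nneigh;

  rwaits = (MPI_Request*)malloc(nneigh*sizeof(MPI_Request));
  swaits = (MPI_Request*)malloc(nneigh*sizeof(MPI_Request));

  for (i=0; i<nneigh; i++) MPI_Irecv(recvval+i,1,MPI_DOUBLE,neigh[i],tag,comm,rwaits+i);
  for (i=0; i<nneigh; i++) MPI_Isend(sendval+i,1,MPI_DOUBLE,neigh[i],tag,comm,swaits+i);

  while (nleft) {                                /* unpack in completion order */
    MPI_Waitany(nneigh,rwaits,&idx,&status);
    nleft--;
    /* recvval[idx] now holds the value from rank neigh[idx] */
  }
  if (nneigh) MPI_Waitall(nneigh,swaits,MPI_STATUSES_IGNORE);
  free(rwaits); free(swaits);
}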
507: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

511: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
512: {
514:   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
515:   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
516:   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
517:   Mat            workB;

520:   /* diagonal block of A times all local rows of B*/
521:   MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);

523:   /* get off processor parts of B needed to complete the product */
524:   MatMPIDenseScatter(A,B,C,&workB);

526:   /* off-diagonal block of A times nonlocal rows of B */
527:   MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
528:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
529:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
530:   return(0);
531: }

535: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
536: {
538:   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
539:   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
540:   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
541:   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
542:   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
543:   Mat_SeqAIJ     *p_loc,*p_oth;
544:   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
545:   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
546:   PetscInt       cm          = C->rmap->n,anz,pnz;
547:   Mat_PtAPMPI    *ptap       = c->ptap;
548:   PetscScalar    *apa_sparse = ptap->apa;
549:   PetscInt       *api,*apj,*apJ,i,j,k,row;
550:   PetscInt       cstart = C->cmap->rstart;
551:   PetscInt       cdnz,conz,k0,k1,nextp;

554:   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
555:   /*-----------------------------------------------------*/
556:   /* update numerical values of P_oth and P_loc */
557:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
558:   MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);

560:   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
561:   /*----------------------------------------------------------*/
562:   /* get data from symbolic products */
563:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
564:   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
565:   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
566:   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

568:   api = ptap->api;
569:   apj = ptap->apj;
570:   for (i=0; i<cm; i++) {
571:     apJ = apj + api[i];

573:     /* diagonal portion of A */
574:     anz = adi[i+1] - adi[i];
575:     adj = ad->j + adi[i];
576:     ada = ad->a + adi[i];
577:     for (j=0; j<anz; j++) {
578:       row = adj[j];
579:       pnz = pi_loc[row+1] - pi_loc[row];
580:       pj  = pj_loc + pi_loc[row];
581:       pa  = pa_loc + pi_loc[row];
582:       /* perform sparse axpy */
583:       valtmp = ada[j];
584:       nextp  = 0;
585:       for (k=0; nextp<pnz; k++) {
586:         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
587:           apa_sparse[k] += valtmp*pa[nextp++];
588:         }
589:       }
590:       PetscLogFlops(2.0*pnz);
591:     }

593:     /* off-diagonal portion of A */
594:     anz = aoi[i+1] - aoi[i];
595:     aoj = ao->j + aoi[i];
596:     aoa = ao->a + aoi[i];
597:     for (j=0; j<anz; j++) {
598:       row = aoj[j];
599:       pnz = pi_oth[row+1] - pi_oth[row];
600:       pj  = pj_oth + pi_oth[row];
601:       pa  = pa_oth + pi_oth[row];
602:       /* perform sparse axpy */
603:       valtmp = aoa[j];
604:       nextp  = 0;
605:       for (k=0; nextp<pnz; k++) {
606:         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
607:           apa_sparse[k] += valtmp*pa[nextp++];
608:         }
609:       }
610:       PetscLogFlops(2.0*pnz);
611:     }

613:     /* set values in C */
614:     cdnz = cd->i[i+1] - cd->i[i];
615:     conz = co->i[i+1] - co->i[i];

617:     /* 1st off-diagonal part of C */
618:     ca = coa + co->i[i];
619:     k  = 0;
620:     for (k0=0; k0<conz; k0++) {
621:       if (apJ[k] >= cstart) break;
622:       ca[k0]        = apa_sparse[k];
623:       apa_sparse[k] = 0.0;
624:       k++;
625:     }

627:     /* diagonal part of C */
628:     ca = cda + cd->i[i];
629:     for (k1=0; k1<cdnz; k1++) {
630:       ca[k1]        = apa_sparse[k];
631:       apa_sparse[k] = 0.0;
632:       k++;
633:     }

635:     /* 2nd off-diagonal part of C */
636:     ca = coa + co->i[i];
637:     for (; k0<conz; k0++) {
638:       ca[k0]        = apa_sparse[k];
639:       apa_sparse[k] = 0.0;
640:       k++;
641:     }
642:   }
643:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
644:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
645:   return(0);
646: }
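
The sparse axpy above exploits the fact that both apJ (the symbolic column pattern of row i of AP) and pj (one row of P) are sorted, and that every column of pj also appears in apJ, so a single forward scan merges each P row into the compressed accumulator apa_sparse. A standalone sketch of that merge (illustrative only):

/* Sketch: accumulate v*prow into a compressed row accumulator.
   apJ[0..apnz) and pj[0..pnz) are both sorted; every entry of pj is
   guaranteed (by the symbolic phase) to appear in apJ. */
static void SparseAxpy(double v,int pnz,const int *pj,const double *pa,
                       int apnz,const int *apJ,double *apa)
{
  int k = 0,nextp;
  for (nextp=0; nextp<pnz; nextp++) {
    while (k < apnz && apJ[k] != pj[nextp]) k++;   /* advance to matching column */
    apa[k] += v*pa[nextp];
  }
}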

648: /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */
651: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
652: {
653:   PetscErrorCode     ierr;
654:   MPI_Comm           comm;
655:   Mat                Cmpi;
656:   Mat_PtAPMPI        *ptap;
657:   PetscFreeSpaceList free_space = NULL,current_space=NULL;
658:   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
659:   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
660:   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
661:   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
662:   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
663:   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
664:   PetscInt           nlnk_max,armax,prmax;
665:   PetscReal          afill;
666:   PetscScalar        *apa;

669:   PetscObjectGetComm((PetscObject)A,&comm);
670:   /* create struct Mat_PtAPMPI and attach it to C later */
671:   PetscNew(Mat_PtAPMPI,&ptap);

673:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
674:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);

676:   /* get P_loc by taking all local rows of P */
677:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

679:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
680:   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
681:   pi_loc = p_loc->i; pj_loc = p_loc->j;
682:   pi_oth = p_oth->i; pj_oth = p_oth->j;

684:   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
685:   /*-------------------------------------------------------------------*/
686:   PetscMalloc((am+2)*sizeof(PetscInt),&api);
687:   ptap->api = api;
688:   api[0]    = 0;

690:   /* create and initialize a linked list */
691:   armax    = ad->rmax+ao->rmax;
692:   prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
693:   nlnk_max = armax*prmax;
694:   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
695:   PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);

697:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
698:   PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);

700:   current_space = free_space;

702:   MatPreallocateInitialize(comm,am,pn,dnz,onz);
703:   for (i=0; i<am; i++) {
704:     /* diagonal portion of A */
705:     nzi = adi[i+1] - adi[i];
706:     for (j=0; j<nzi; j++) {
707:       row  = *adj++;
708:       pnz  = pi_loc[row+1] - pi_loc[row];
709:       Jptr = pj_loc + pi_loc[row];
710:       /* add non-zero cols of P into the sorted linked list lnk */
711:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
712:     }
713:     /* off-diagonal portion of A */
714:     nzi = aoi[i+1] - aoi[i];
715:     for (j=0; j<nzi; j++) {
716:       row  = *aoj++;
717:       pnz  = pi_oth[row+1] - pi_oth[row];
718:       Jptr = pj_oth + pi_oth[row];
719:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
720:     }

722:     apnz     = *lnk;
723:     api[i+1] = api[i] + apnz;
724:     if (apnz > apnz_max) apnz_max = apnz;

726:     /* if free space is not available, double the total space in the list */
727:     if (current_space->local_remaining<apnz) {
728:       PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
729:       nspacedouble++;
730:     }

732:     /* Copy data into free space, then initialize lnk */
733:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
734:     MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);

736:     current_space->array           += apnz;
737:     current_space->local_used      += apnz;
738:     current_space->local_remaining -= apnz;
739:   }

741:   /* Allocate space for apj, initialize apj, and */
742:   /* destroy list of free space and other temporary array(s) */
743:   PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);
744:   apj  = ptap->apj;
745:   PetscFreeSpaceContiguous(&free_space,ptap->apj);
746:   PetscLLCondensedDestroy_Scalable(lnk);

748:   /* create and assemble symbolic parallel matrix Cmpi */
749:   /*----------------------------------------------------*/
750:   MatCreate(comm,&Cmpi);
751:   MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
752:   MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
753:   MatSetType(Cmpi,MATMPIAIJ);
754:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
755:   MatPreallocateFinalize(dnz,onz);

757:   /* malloc apa for assembling Cmpi */
758:   PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);
759:   PetscMemzero(apa,apnz_max*sizeof(PetscScalar));

761:   ptap->apa = apa;
762:   for (i=0; i<am; i++) {
763:     row  = i + rstart;
764:     apnz = api[i+1] - api[i];
765:     MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
766:     apj += apnz;
767:   }
768:   MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
769:   MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);

771:   ptap->destroy             = Cmpi->ops->destroy;
772:   ptap->duplicate           = Cmpi->ops->duplicate;
773:   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
774:   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
775:   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

777:   /* attach the supporting struct to Cmpi for reuse */
778:   c       = (Mat_MPIAIJ*)Cmpi->data;
779:   c->ptap = ptap;

781:   *C = Cmpi;

783:   /* set MatInfo */
784:   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
785:   if (afill < 1.0) afill = 1.0;
786:   Cmpi->info.mallocs           = nspacedouble;
787:   Cmpi->info.fill_ratio_given  = fill;
788:   Cmpi->info.fill_ratio_needed = afill;

790: #if defined(PETSC_USE_INFO)
791:   if (api[am]) {
792:     PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
793:     PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
794:   } else {
795:     PetscInfo(Cmpi,"Empty matrix product\n");
796:   }
797: #endif
798:   return(0);
799: }

801: /*-------------------------------------------------------------------------*/
804: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
805: {
807:   PetscBool      scalable=PETSC_TRUE,viamatmatmult=PETSC_FALSE;

810:   PetscOptionsBool("-mattransposematmult_viamatmatmult","Use R=Pt and C=R*A","",viamatmatmult,&viamatmatmult,NULL);
811:   if (viamatmatmult) {
812:     Mat         Pt;
813:     Mat_PtAPMPI *ptap;
814:     Mat_MPIAIJ  *c;
815:     if (scall == MAT_INITIAL_MATRIX) {
816:       MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
817:       MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);

819:       c        = (Mat_MPIAIJ*)(*C)->data;
820:       ptap     = c->ptap;
821:       ptap->Pt = Pt;
822:     } else if (scall == MAT_REUSE_MATRIX) {
823:       c    = (Mat_MPIAIJ*)(*C)->data;
824:       ptap = c->ptap;
825:       Pt   = ptap->Pt;
826:       MatTranspose(P,scall,&Pt);
827:       MatMatMult(Pt,A,scall,fill,C);
828:     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not supported");
829:     return(0);
830:   }

832:   if (scall == MAT_INITIAL_MATRIX) {
833:     PetscObjectOptionsBegin((PetscObject)A);
834:     PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,NULL);
835:     if  (scalable) {
836:       MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);
837:     } else {
838:       MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
839:     }
840:     PetscOptionsEnd();
841:   }
842:   (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
843:   return(0);
844: }
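
A hypothetical caller-side sketch for this path; at run time the algorithm can be switched with the options read above, -mattransposematmult_viamatmatmult and -mattransposematmult_scalable. The helper name and the PETSC_DEFAULT fill are illustrative:

#include <petscmat.h>

PetscErrorCode ExampleTransposeMatMult(Mat P,Mat A)
{
  Mat C;

  /* C = P^T * A; symbolic + numeric on the first call */
  MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);

  /* ... update values of P and/or A, keeping their nonzero patterns ... */
  MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);

  MatDestroy(&C);
  return(0);
}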

848: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
849: {
850:   PetscErrorCode      ierr;
851:   Mat_Merge_SeqsToMPI *merge;
852:   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
853:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
854:   Mat_PtAPMPI         *ptap;
855:   PetscInt            *adj,*aJ;
856:   PetscInt            i,j,k,anz,pnz,row,*cj;
857:   MatScalar           *ada,*aval,*ca,valtmp;
858:   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
859:   MPI_Comm            comm;
860:   PetscMPIInt         size,rank,taga,*len_s;
861:   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
862:   PetscInt            **buf_ri,**buf_rj;
863:   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
864:   MPI_Request         *s_waits,*r_waits;
865:   MPI_Status          *status;
866:   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
867:   PetscInt            *ai,*aj,*coi,*coj;
868:   PetscInt            *poJ,*pdJ;
869:   Mat                 A_loc;
870:   Mat_SeqAIJ          *a_loc;

873:   PetscObjectGetComm((PetscObject)C,&comm);
874:   MPI_Comm_size(comm,&size);
875:   MPI_Comm_rank(comm,&rank);

877:   ptap  = c->ptap;
878:   merge = ptap->merge;

880:   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
881:   /*--------------------------------------------------------------*/
882:   /* get data from symbolic products */
883:   coi  = merge->coi; coj = merge->coj;
884:   PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
885:   PetscMemzero(coa,coi[pon]*sizeof(MatScalar));

887:   bi     = merge->bi; bj = merge->bj;
888:   owners = merge->rowmap->range;
889:   PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
890:   PetscMemzero(ba,bi[cm]*sizeof(MatScalar));

892:   /* get A_loc by taking all local rows of A */
893:   A_loc = ptap->A_loc;
894:   MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
895:   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
896:   ai    = a_loc->i;
897:   aj    = a_loc->j;

899:   PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval); /* non-scalable!!! */
900:   PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));

902:   for (i=0; i<am; i++) {
903:     /* 2-a) put A[i,:] to dense array aval */
904:     anz = ai[i+1] - ai[i];
905:     adj = aj + ai[i];
906:     ada = a_loc->a + ai[i];
907:     for (j=0; j<anz; j++) {
908:       aval[adj[j]] = ada[j];
909:     }

911:     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
912:     /*--------------------------------------------------------------*/
913:     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
914:     pnz = po->i[i+1] - po->i[i];
915:     poJ = po->j + po->i[i];
916:     pA  = po->a + po->i[i];
917:     for (j=0; j<pnz; j++) {
918:       row = poJ[j];
919:       cnz = coi[row+1] - coi[row];
920:       cj  = coj + coi[row];
921:       ca  = coa + coi[row];
922:       /* perform dense axpy */
923:       valtmp = pA[j];
924:       for (k=0; k<cnz; k++) {
925:         ca[k] += valtmp*aval[cj[k]];
926:       }
927:       PetscLogFlops(2.0*cnz);
928:     }

930:     /* put the value into Cd (diagonal part) */
931:     pnz = pd->i[i+1] - pd->i[i];
932:     pdJ = pd->j + pd->i[i];
933:     pA  = pd->a + pd->i[i];
934:     for (j=0; j<pnz; j++) {
935:       row = pdJ[j];
936:       cnz = bi[row+1] - bi[row];
937:       cj  = bj + bi[row];
938:       ca  = ba + bi[row];
939:       /* perform dense axpy */
940:       valtmp = pA[j];
941:       for (k=0; k<cnz; k++) {
942:         ca[k] += valtmp*aval[cj[k]];
943:       }
944:       PetscLogFlops(2.0*cnz);
945:     }

947:     /* zero the current row of Pt*A */
948:     aJ = aj + ai[i];
949:     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
950:   }

952:   /* 3) send and recv matrix values coa */
953:   /*------------------------------------*/
954:   buf_ri = merge->buf_ri;
955:   buf_rj = merge->buf_rj;
956:   len_s  = merge->len_s;
957:   PetscCommGetNewTag(comm,&taga);
958:   PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);

960:   PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);
961:   for (proc=0,k=0; proc<size; proc++) {
962:     if (!len_s[proc]) continue;
963:     i    = merge->owners_co[proc];
964:     MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
965:     k++;
966:   }
967:   if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
968:   if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}

970:   PetscFree2(s_waits,status);
971:   PetscFree(r_waits);
972:   PetscFree(coa);

974:   /* 4) insert local Cseq and received values into Cmpi */
975:   /*----------------------------------------------------*/
976:   PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
977:   for (k=0; k<merge->nrecv; k++) {
978:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
979:     nrows       = *(buf_ri_k[k]);
980:     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
981:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
982:   }

984:   for (i=0; i<cm; i++) {
985:     row  = owners[rank] + i; /* global row index of C_seq */
986:     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
987:     ba_i = ba + bi[i];
988:     bnz  = bi[i+1] - bi[i];
989:     /* add received vals into ba */
990:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
991:       /* i-th row */
992:       if (i == *nextrow[k]) {
993:         cnz    = *(nextci[k]+1) - *nextci[k];
994:         cj     = buf_rj[k] + *(nextci[k]);
995:         ca     = abuf_r[k] + *(nextci[k]);
996:         nextcj = 0;
997:         for (j=0; nextcj<cnz; j++) {
998:           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
999:             ba_i[j] += ca[nextcj++];
1000:           }
1001:         }
1002:         nextrow[k]++; nextci[k]++;
1003:         PetscLogFlops(2.0*cnz);
1004:       }
1005:     }
1006:     MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1007:   }
1008:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1009:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1011:   PetscFree(ba);
1012:   PetscFree(abuf_r[0]);
1013:   PetscFree(abuf_r);
1014:   PetscFree3(buf_ri_k,nextrow,nextci);
1015:   PetscFree(aval);
1016:   return(0);
1017: }
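
The numeric kernel above forms C = P_loc^T*A_loc one row of A at a time: row i of A is scattered into the dense array aval, and each nonzero P[i,j] then adds P[i,j]*A[i,:] into row j of C (split into the part kept locally and the part sent to other processes). A compact sketch of that outer-product update on plain CSR data, without the diagonal/off-diagonal split (illustrative only; the array names are assumptions):

/* Sketch: C += P[i,:]^T * A[i,:] using a dense scatter of A's row.
   aval has length equal to the number of columns of A and must be zero on
   entry; ci,cj,ca hold C's (already computed) pattern and values in CSR form. */
static void OuterProductRow(int i,const int *ai,const int *aj,const double *aa,
                            const int *pi,const int *pj,const double *pa,
                            const int *ci,const int *cj,double *ca,double *aval)
{
  int j,k;
  for (j=ai[i]; j<ai[i+1]; j++) aval[aj[j]] = aa[j];     /* scatter A[i,:] */
  for (j=pi[i]; j<pi[i+1]; j++) {                        /* for each P[i,row] */
    int    row = pj[j];
    double v   = pa[j];
    for (k=ci[row]; k<ci[row+1]; k++) ca[k] += v*aval[cj[k]];  /* dense axpy */
  }
  for (j=ai[i]; j<ai[i+1]; j++) aval[aj[j]] = 0.0;       /* reset for next row */
}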

1019: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1022: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1023: {
1024:   PetscErrorCode      ierr;
1025:   Mat                 Cmpi,A_loc,POt,PDt;
1026:   Mat_PtAPMPI         *ptap;
1027:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1028:   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1029:   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1030:   PetscInt            nnz;
1031:   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1032:   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1033:   PetscBT             lnkbt;
1034:   MPI_Comm            comm;
1035:   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1036:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1037:   PetscInt            len,proc,*dnz,*onz,*owners;
1038:   PetscInt            nzi,*bi,*bj;
1039:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1040:   MPI_Request         *swaits,*rwaits;
1041:   MPI_Status          *sstatus,rstatus;
1042:   Mat_Merge_SeqsToMPI *merge;
1043:   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1044:   PetscReal           afill  =1.0,afill_tmp;
1045:   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1046:   PetscScalar         *vals;
1047:   Mat_SeqAIJ          *a_loc, *pdt,*pot;

1050:   PetscObjectGetComm((PetscObject)A,&comm);
1051:   /* check if matrix local sizes are compatible */
1052:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1053:     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1054:   }

1056:   MPI_Comm_size(comm,&size);
1057:   MPI_Comm_rank(comm,&rank);

1059:   /* create struct Mat_PtAPMPI and attach it to C later */
1060:   PetscNew(Mat_PtAPMPI,&ptap);

1062:   /* get A_loc by taking all local rows of A */
1063:   MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);

1065:   ptap->A_loc = A_loc;

1067:   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1068:   ai    = a_loc->i;
1069:   aj    = a_loc->j;

1071:   /* determine symbolic Co=(p->B)^T*A - send to others */
1072:   /*----------------------------------------------------*/
1073:   MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1074:   pdt  = (Mat_SeqAIJ*)PDt->data;
1075:   pdti = pdt->i; pdtj = pdt->j;

1077:   MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1078:   pot  = (Mat_SeqAIJ*)POt->data;
1079:   poti = pot->i; potj = pot->j;

1081:   /* then, compute symbolic Co = (p->B)^T*A */
1082:   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1083:   PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
1084:   coi[0] = 0;

1086:   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1087:   nnz           = fill*(poti[pon] + ai[am]);
1088:   PetscFreeSpaceGet(nnz,&free_space);
1089:   current_space = free_space;

1091:   /* create and initialize a linked list */
1092:   i     = PetscMax(pdt->rmax,pot->rmax);
1093:   Crmax = i*a_loc->rmax*size;
1094:   if (!Crmax || Crmax > aN) Crmax = aN;
1095:   PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);

1097:   for (i=0; i<pon; i++) {
1098:     pnz = poti[i+1] - poti[i];
1099:     ptJ = potj + poti[i];
1100:     for (j=0; j<pnz; j++) {
1101:       row  = ptJ[j]; /* row of A_loc == col of Pot */
1102:       anz  = ai[row+1] - ai[row];
1103:       Jptr = aj + ai[row];
1104:       /* add non-zero cols of AP into the sorted linked list lnk */
1105:       PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1106:     }
1107:     nnz = lnk[0];

1109:     /* If free space is not available, double the total space in the list */
1110:     if (current_space->local_remaining<nnz) {
1111:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1112:       nspacedouble++;
1113:     }

1115:     /* Copy data into free space, and zero out dense rows */
1116:     PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);

1118:     current_space->array           += nnz;
1119:     current_space->local_used      += nnz;
1120:     current_space->local_remaining -= nnz;

1122:     coi[i+1] = coi[i] + nnz;
1123:   }

1125:   PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
1126:   PetscFreeSpaceContiguous(&free_space,coj);

1128:   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1129:   if (afill_tmp > afill) afill = afill_tmp;

1131:   /* send j-array (coj) of Co to other processors */
1132:   /*----------------------------------------------*/
1133:   /* determine row ownership */
1134:   PetscNew(Mat_Merge_SeqsToMPI,&merge);
1135:   PetscLayoutCreate(comm,&merge->rowmap);

1137:   merge->rowmap->n  = pn;
1138:   merge->rowmap->bs = 1;

1140:   PetscLayoutSetUp(merge->rowmap);
1141:   owners = merge->rowmap->range;

1143:   /* determine the number of messages to send, their lengths */
1144:   PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
1145:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1146:   PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);

1148:   len_s        = merge->len_s;
1149:   merge->nsend = 0;

1151:   PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
1152:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));

1154:   proc = 0;
1155:   for (i=0; i<pon; i++) {
1156:     while (prmap[i] >= owners[proc+1]) proc++;
1157:     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1158:     len_s[proc] += coi[i+1] - coi[i];
1159:   }

1161:   len          = 0; /* max length of buf_si[] */
1162:   owners_co[0] = 0;
1163:   for (proc=0; proc<size; proc++) {
1164:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1165:     if (len_si[proc]) {
1166:       merge->nsend++;
1167:       len_si[proc] = 2*(len_si[proc] + 1);
1168:       len         += len_si[proc];
1169:     }
1170:   }

1172:   /* determine the number and length of messages to receive for coi and coj  */
1173:   PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1174:   PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);

1176:   /* post the Irecv and Isend of coj */
1177:   PetscCommGetNewTag(comm,&tagj);
1178:   PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1179:   PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
1180:   for (proc=0, k=0; proc<size; proc++) {
1181:     if (!len_s[proc]) continue;
1182:     i    = owners_co[proc];
1183:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1184:     k++;
1185:   }

1187:   /* receives and sends of coj are complete */
1188:   PetscMalloc(size*sizeof(MPI_Status),&sstatus);
1189:   for (i=0; i<merge->nrecv; i++) {
1190:     PetscMPIInt icompleted;
1191:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1192:   }
1193:   PetscFree(rwaits);
1194:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}

1196:   /* send and recv coi */
1197:   /*-------------------*/
1198:   PetscCommGetNewTag(comm,&tagi);
1199:   PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1200:   PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
1201:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1202:   for (proc=0,k=0; proc<size; proc++) {
1203:     if (!len_s[proc]) continue;
1204:     /* form outgoing message for i-structure:
1205:          buf_si[0]:                 nrows to be sent
1206:                [1:nrows]:           row index (global)
1207:                [nrows+1:2*nrows+1]: i-structure index
1208:     */
1209:     /*-------------------------------------------*/
1210:     nrows       = len_si[proc]/2 - 1;
1211:     buf_si_i    = buf_si + nrows+1;
1212:     buf_si[0]   = nrows;
1213:     buf_si_i[0] = 0;
1214:     nrows       = 0;
1215:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1216:       nzi               = coi[i+1] - coi[i];
1217:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1218:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1219:       nrows++;
1220:     }
1221:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1222:     k++;
1223:     buf_si += len_si[proc];
1224:   }
1225:   i = merge->nrecv;
1226:   while (i--) {
1227:     PetscMPIInt icompleted;
1228:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1229:   }
1230:   PetscFree(rwaits);
1231:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1232:   PetscFree(len_si);
1233:   PetscFree(len_ri);
1234:   PetscFree(swaits);
1235:   PetscFree(sstatus);
1236:   PetscFree(buf_s);

1238:   /* compute the local portion of C (mpi mat) */
1239:   /*------------------------------------------*/
1240:   /* allocate bi array and free space for accumulating nonzero column info */
1241:   PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
1242:   bi[0] = 0;

1244:   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1245:   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1246:   PetscFreeSpaceGet(nnz,&free_space);
1247:   current_space = free_space;

1249:   PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1250:   for (k=0; k<merge->nrecv; k++) {
1251:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1252:     nrows       = *buf_ri_k[k];
1253:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1254:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1255:   }

1257:   MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1258:   rmax = 0;
1259:   for (i=0; i<pn; i++) {
1260:     /* add pdt[i,:]*AP into lnk */
1261:     pnz = pdti[i+1] - pdti[i];
1262:     ptJ = pdtj + pdti[i];
1263:     for (j=0; j<pnz; j++) {
1264:       row  = ptJ[j];  /* row of AP == col of Pt */
1265:       anz  = ai[row+1] - ai[row];
1266:       Jptr = aj + ai[row];
1267:       /* add non-zero cols of AP into the sorted linked list lnk */
1268:       PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1269:     }

1271:     /* add received col data into lnk */
1272:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1273:       if (i == *nextrow[k]) { /* i-th row */
1274:         nzi  = *(nextci[k]+1) - *nextci[k];
1275:         Jptr = buf_rj[k] + *nextci[k];
1276:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1277:         nextrow[k]++; nextci[k]++;
1278:       }
1279:     }
1280:     nnz = lnk[0];

1282:     /* if free space is not available, make more free space */
1283:     if (current_space->local_remaining<nnz) {
1284:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1285:       nspacedouble++;
1286:     }
1287:     /* copy data into free space, then initialize lnk */
1288:     PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1289:     MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);

1291:     current_space->array           += nnz;
1292:     current_space->local_used      += nnz;
1293:     current_space->local_remaining -= nnz;

1295:     bi[i+1] = bi[i] + nnz;
1296:     if (nnz > rmax) rmax = nnz;
1297:   }
1298:   PetscFree3(buf_ri_k,nextrow,nextci);

1300:   PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
1301:   PetscFreeSpaceContiguous(&free_space,bj);

1303:   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1304:   if (afill_tmp > afill) afill = afill_tmp;
1305:   PetscLLCondensedDestroy(lnk,lnkbt);
1306:   MatDestroy(&POt);
1307:   MatDestroy(&PDt);

1309:   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
1310:   /*----------------------------------------------------------------------------------*/
1311:   PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);
1312:   PetscMemzero(vals,rmax*sizeof(PetscScalar));

1314:   MatCreate(comm,&Cmpi);
1315:   MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1316:   MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1317:   MatSetType(Cmpi,MATMPIAIJ);
1318:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1319:   MatPreallocateFinalize(dnz,onz);
1320:   MatSetBlockSize(Cmpi,1);
1321:   for (i=0; i<pn; i++) {
1322:     row  = i + rstart;
1323:     nnz  = bi[i+1] - bi[i];
1324:     Jptr = bj + bi[i];
1325:     MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1326:   }
1327:   MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1328:   MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1329:   PetscFree(vals);

1331:   merge->bi        = bi;
1332:   merge->bj        = bj;
1333:   merge->coi       = coi;
1334:   merge->coj       = coj;
1335:   merge->buf_ri    = buf_ri;
1336:   merge->buf_rj    = buf_rj;
1337:   merge->owners_co = owners_co;
1338:   merge->destroy   = Cmpi->ops->destroy;
1339:   merge->duplicate = Cmpi->ops->duplicate;

1341:   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1342:   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

1344:   /* attach the supporting struct to Cmpi for reuse */
1345:   c           = (Mat_MPIAIJ*)Cmpi->data;
1346:   c->ptap     = ptap;
1347:   ptap->api   = NULL;
1348:   ptap->apj   = NULL;
1349:   ptap->merge = merge;
1350:   ptap->rmax  = rmax;

1352:   *C = Cmpi;
1353: #if defined(PETSC_USE_INFO)
1354:   if (bi[pn] != 0) {
1355:     PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
1356:     PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
1357:   } else {
1358:     PetscInfo(Cmpi,"Empty matrix product\n");
1359:   }
1360: #endif
1361:   return(0);
1362: }
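
The i-structure messages assembled above use a flat integer layout: buf_si[0] holds the number of rows, the next nrows entries hold the local row indices, and the final nrows+1 entries hold a running offset into the accompanying column/value message. A hypothetical pack helper illustrating that layout:

/* Sketch: pack an i-structure message for nrows rows.
   rows[]  : local row indices to send
   rownz[] : number of nonzeros in each of those rows
   buf     : output buffer of length 2*nrows+2
   Layout  : buf[0]=nrows, buf[1..nrows]=row indices,
             buf[nrows+1..2*nrows+1]=running offsets into the j/value message. */
static void PackIStructure(int nrows,const int *rows,const int *rownz,int *buf)
{
  int i,*offs = buf + nrows + 1;
  buf[0]  = nrows;
  offs[0] = 0;
  for (i=0; i<nrows; i++) {
    buf[i+1]  = rows[i];
    offs[i+1] = offs[i] + rownz[i];
  }
}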

1366: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
1367: {
1368:   PetscErrorCode      ierr;
1369:   Mat_Merge_SeqsToMPI *merge;
1370:   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1371:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1372:   Mat_PtAPMPI         *ptap;
1373:   PetscInt            *adj;
1374:   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1375:   MatScalar           *ada,*ca,valtmp;
1376:   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1377:   MPI_Comm            comm;
1378:   PetscMPIInt         size,rank,taga,*len_s;
1379:   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1380:   PetscInt            **buf_ri,**buf_rj;
1381:   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1382:   MPI_Request         *s_waits,*r_waits;
1383:   MPI_Status          *status;
1384:   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1385:   PetscInt            *ai,*aj,*coi,*coj;
1386:   PetscInt            *poJ,*pdJ;
1387:   Mat                 A_loc;
1388:   Mat_SeqAIJ          *a_loc;

1391:   PetscObjectGetComm((PetscObject)C,&comm);
1392:   MPI_Comm_size(comm,&size);
1393:   MPI_Comm_rank(comm,&rank);

1395:   ptap  = c->ptap;
1396:   merge = ptap->merge;

1398:   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1399:   /*------------------------------------------*/
1400:   /* get data from symbolic products */
1401:   coi    = merge->coi; coj = merge->coj;
1402:   PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
1403:   PetscMemzero(coa,coi[pon]*sizeof(MatScalar));
1404:   bi     = merge->bi; bj = merge->bj;
1405:   owners = merge->rowmap->range;
1406:   PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
1407:   PetscMemzero(ba,bi[cm]*sizeof(MatScalar));

1409:   /* get A_loc by taking all local rows of A */
1410:   A_loc = ptap->A_loc;
1411:   MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1412:   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1413:   ai    = a_loc->i;
1414:   aj    = a_loc->j;

1416:   for (i=0; i<am; i++) {
1417:     anz = ai[i+1] - ai[i];
1418:     adj = aj + ai[i];
1419:     ada = a_loc->a + ai[i];

1421:     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1422:     /*-------------------------------------------------------------*/
1423:     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1424:     pnz = po->i[i+1] - po->i[i];
1425:     poJ = po->j + po->i[i];
1426:     pA  = po->a + po->i[i];
1427:     for (j=0; j<pnz; j++) {
1428:       row = poJ[j];
1429:       cj  = coj + coi[row];
1430:       ca  = coa + coi[row];
1431:       /* perform sparse axpy */
1432:       nexta  = 0;
1433:       valtmp = pA[j];
1434:       for (k=0; nexta<anz; k++) {
1435:         if (cj[k] == adj[nexta]) {
1436:           ca[k] += valtmp*ada[nexta];
1437:           nexta++;
1438:         }
1439:       }
1440:       PetscLogFlops(2.0*anz);
1441:     }

1443:     /* put the value into Cd (diagonal part) */
1444:     pnz = pd->i[i+1] - pd->i[i];
1445:     pdJ = pd->j + pd->i[i];
1446:     pA  = pd->a + pd->i[i];
1447:     for (j=0; j<pnz; j++) {
1448:       row = pdJ[j];
1449:       cj  = bj + bi[row];
1450:       ca  = ba + bi[row];
1451:       /* perform sparse axpy */
1452:       nexta  = 0;
1453:       valtmp = pA[j];
1454:       for (k=0; nexta<anz; k++) {
1455:         if (cj[k] == adj[nexta]) {
1456:           ca[k] += valtmp*ada[nexta];
1457:           nexta++;
1458:         }
1459:       }
1460:       PetscLogFlops(2.0*anz);
1461:     }
1462:   }

1464:   /* 3) send and recv matrix values coa */
1465:   /*------------------------------------*/
1466:   buf_ri = merge->buf_ri;
1467:   buf_rj = merge->buf_rj;
1468:   len_s  = merge->len_s;
1469:   PetscCommGetNewTag(comm,&taga);
1470:   PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);

1472:   PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);
1473:   for (proc=0,k=0; proc<size; proc++) {
1474:     if (!len_s[proc]) continue;
1475:     i    = merge->owners_co[proc];
1476:     MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1477:     k++;
1478:   }
1479:   if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1480:   if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}

1482:   PetscFree2(s_waits,status);
1483:   PetscFree(r_waits);
1484:   PetscFree(coa);

1486:   /* 4) insert local Cseq and received values into Cmpi */
1487:   /*----------------------------------------------------*/
1488:   PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1489:   for (k=0; k<merge->nrecv; k++) {
1490:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1491:     nrows       = *(buf_ri_k[k]);
1492:     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1493:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1494:   }

1496:   for (i=0; i<cm; i++) {
1497:     row  = owners[rank] + i; /* global row index of C_seq */
1498:     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1499:     ba_i = ba + bi[i];
1500:     bnz  = bi[i+1] - bi[i];
1501:     /* add received vals into ba */
1502:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1503:       /* i-th row */
1504:       if (i == *nextrow[k]) {
1505:         cnz    = *(nextci[k]+1) - *nextci[k];
1506:         cj     = buf_rj[k] + *(nextci[k]);
1507:         ca     = abuf_r[k] + *(nextci[k]);
1508:         nextcj = 0;
1509:         for (j=0; nextcj<cnz; j++) {
1510:           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1511:             ba_i[j] += ca[nextcj++];
1512:           }
1513:         }
1514:         nextrow[k]++; nextci[k]++;
1515:         PetscLogFlops(2.0*cnz);
1516:       }
1517:     }
1518:     MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1519:   }
1520:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1521:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1523:   PetscFree(ba);
1524:   PetscFree(abuf_r[0]);
1525:   PetscFree(abuf_r);
1526:   PetscFree3(buf_ri_k,nextrow,nextci);
1527:   return(0);
1528: }
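/*
   Usage sketch (illustrative only; the fill value and variable names are assumptions, not taken
   from this file): the numeric routine above is normally reached through the public
   MatTransposeMatMult() interface rather than called directly.  For assembled MPIAIJ matrices
   P and A with identical row layouts, a typical calling sequence is

       Mat C;
       MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,&C);   symbolic + numeric phases, C = P^T*A
       ....  change the numerical values of P and/or A, keeping their nonzero patterns  ....
       MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,2.0,&C);     numeric phase only, reuses the attached ptap/merge data
       MatDestroy(&C);

   where the fill argument (2.0 here) is an estimate of nnz(C)/(nnz(P)+nnz(A)).
*/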

1530: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1533: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
1534: {
1535:   PetscErrorCode      ierr;
1536:   Mat                 Cmpi,A_loc,POt,PDt;
1537:   Mat_PtAPMPI         *ptap;
1538:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1539:   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1540:   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1541:   PetscInt            nnz;
1542:   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1543:   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1544:   MPI_Comm            comm;
1545:   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1546:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1547:   PetscInt            len,proc,*dnz,*onz,*owners;
1548:   PetscInt            nzi,*bi,*bj;
1549:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1550:   MPI_Request         *swaits,*rwaits;
1551:   MPI_Status          *sstatus,rstatus;
1552:   Mat_Merge_SeqsToMPI *merge;
1553:   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1554:   PetscReal           afill  =1.0,afill_tmp;
1555:   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1556:   PetscScalar         *vals;
1557:   Mat_SeqAIJ          *a_loc, *pdt,*pot;

1560:   PetscObjectGetComm((PetscObject)A,&comm);
1561:   /* check if matrix local sizes are compatible */
1562:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1563:     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1564:   }

1566:   MPI_Comm_size(comm,&size);
1567:   MPI_Comm_rank(comm,&rank);

1569:   /* create struct Mat_PtAPMPI and attach it to C later */
1570:   PetscNew(Mat_PtAPMPI,&ptap);

1572:   /* get A_loc by taking all local rows of A */
1573:   MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);

1575:   ptap->A_loc = A_loc;
1576:   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1577:   ai          = a_loc->i;
1578:   aj          = a_loc->j;

1580:   /* determine symbolic Co=(p->B)^T*A - send to others */
1581:   /*----------------------------------------------------*/
1582:   MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1583:   pdt  = (Mat_SeqAIJ*)PDt->data;
1584:   pdti = pdt->i; pdtj = pdt->j;

1586:   MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1587:   pot  = (Mat_SeqAIJ*)POt->data;
1588:   poti = pot->i; potj = pot->j;

1590:   /* then, compute symbolic Co = (p->B)^T*A */
1591:   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1592:                          >= (num of nonzero rows of C_seq) - pn */
1593:   PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
1594:   coi[0] = 0;

1596:   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1597:   nnz           = fill*(poti[pon] + ai[am]);
1598:   PetscFreeSpaceGet(nnz,&free_space);
1599:   current_space = free_space;

1601:   /* create and initialize a linked list */
1602:   i     = PetscMax(pdt->rmax,pot->rmax);
1603:   Crmax = i*a_loc->rmax*size; /* non-scalable! */
1604:   if (!Crmax || Crmax > aN) Crmax = aN;
1605:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
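  /* Crmax bounds the number of nonzeros in any row of C: each row of C merges at most
     max-row-nnz(P^T) rows of A_loc per process, each with at most a_loc->rmax entries,
     hence the factor of size; it is capped at aN, the global number of columns of C.
     An overestimate here should only cost memory for the condensed linked list. */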

1607:   for (i=0; i<pon; i++) {
1608:     pnz = poti[i+1] - poti[i];
1609:     ptJ = potj + poti[i];
1610:     for (j=0; j<pnz; j++) {
1611:       row  = ptJ[j]; /* row of A_loc == col of Pot */
1612:       anz  = ai[row+1] - ai[row];
1613:       Jptr = aj + ai[row];
1614:       /* add non-zero cols of AP into the sorted linked list lnk */
1615:       PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1616:     }
1617:     nnz = lnk[0];

1619:     /* If free space is not available, double the total space in the list */
1620:     if (current_space->local_remaining<nnz) {
1621:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1622:       nspacedouble++;
1623:     }

1625:     /* Copy data into free space, then reinitialize the linked list lnk */
1626:     PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);

1628:     current_space->array           += nnz;
1629:     current_space->local_used      += nnz;
1630:     current_space->local_remaining -= nnz;

1632:     coi[i+1] = coi[i] + nnz;
1633:   }

1635:   PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
1636:   PetscFreeSpaceContiguous(&free_space,coj);
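  /* coi/coj now hold the CSR structure of Co = (p->B)^T * A_loc: coi[i+1]-coi[i] nonzeros
     in row i (a row owned by another process), with sorted global column indices in coj
     gathered from the chained free-space segments. */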

1638:   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1639:   if (afill_tmp > afill) afill = afill_tmp;

1641:   /* send j-array (coj) of Co to other processors */
1642:   /*----------------------------------------------*/
1643:   /* determine row ownership */
1644:   PetscNew(Mat_Merge_SeqsToMPI,&merge);
1645:   PetscLayoutCreate(comm,&merge->rowmap);

1647:   merge->rowmap->n  = pn;
1648:   merge->rowmap->bs = 1;

1650:   PetscLayoutSetUp(merge->rowmap);
1651:   owners = merge->rowmap->range;

1653:   /* determine the number of messages to send, their lengths */
1654:   PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
1655:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1656:   PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);

1658:   len_s        = merge->len_s;
1659:   merge->nsend = 0;

1661:   PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
1662:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));

1664:   proc = 0;
1665:   for (i=0; i<pon; i++) {
1666:     while (prmap[i] >= owners[proc+1]) proc++;
1667:     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1668:     len_s[proc] += coi[i+1] - coi[i];
1669:   }

1671:   len          = 0; /* total length of buf_s[], the concatenation of all outgoing buf_si messages */
1672:   owners_co[0] = 0;
1673:   for (proc=0; proc<size; proc++) {
1674:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1675:     if (len_si[proc]) {
1676:       merge->nsend++;
1677:       len_si[proc] = 2*(len_si[proc] + 1);
1678:       len         += len_si[proc];
1679:     }
1680:   }
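  /* Each i-structure message to [proc] carries 1 count + nrows row indices + (nrows+1)
     prefix offsets, i.e. 2*(nrows+1) integers in total, which is why len_si[proc] is
     rescaled to 2*(len_si[proc]+1) above; len accumulates the size of the send buffer buf_s. */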

1682:   /* determine the number and length of messages to receive for coi and coj  */
1683:   PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1684:   PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);

1686:   /* post the Irecv and Isend of coj */
1687:   PetscCommGetNewTag(comm,&tagj);
1688:   PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1689:   PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
1690:   for (proc=0, k=0; proc<size; proc++) {
1691:     if (!len_s[proc]) continue;
1692:     i    = owners_co[proc];
1693:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1694:     k++;
1695:   }

1697:   /* receives and sends of coj are complete */
1698:   PetscMalloc(size*sizeof(MPI_Status),&sstatus);
1699:   for (i=0; i<merge->nrecv; i++) {
1700:     PetscMPIInt icompleted;
1701:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1702:   }
1703:   PetscFree(rwaits);
1704:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}

1706:   /* send and recv coi */
1707:   /*-------------------*/
1708:   PetscCommGetNewTag(comm,&tagi);
1709:   PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1710:   PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
1711:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1712:   for (proc=0,k=0; proc<size; proc++) {
1713:     if (!len_s[proc]) continue;
1714:     /* form outgoing message for i-structure:
1715:          buf_si[0]:                 nrows to be sent
1716:                [1:nrows]:           row index (global)
1717:                [nrows+1:2*nrows+1]: i-structure index
1718:     */
1719:     /*-------------------------------------------*/
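    /* Example: if 2 rows of Co, with 5 and 2 nonzeros respectively, go to [proc], then
       len_si[proc] = 2*(2+1) = 6 and the message is packed as buf_si = {2, r0, r1, 0, 5, 7},
       where r0,r1 are the row indices local to the receiving process and {0,5,7} are the
       prefix offsets into the matching coj data (and, in the numeric phase, coa). */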
1720:     nrows       = len_si[proc]/2 - 1;
1721:     buf_si_i    = buf_si + nrows+1;
1722:     buf_si[0]   = nrows;
1723:     buf_si_i[0] = 0;
1724:     nrows       = 0;
1725:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1726:       nzi               = coi[i+1] - coi[i];
1727:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1728:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1729:       nrows++;
1730:     }
1731:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1732:     k++;
1733:     buf_si += len_si[proc];
1734:   }
1735:   i = merge->nrecv;
1736:   while (i--) {
1737:     PetscMPIInt icompleted;
1738:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1739:   }
1740:   PetscFree(rwaits);
1741:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1742:   PetscFree(len_si);
1743:   PetscFree(len_ri);
1744:   PetscFree(swaits);
1745:   PetscFree(sstatus);
1746:   PetscFree(buf_s);

1748:   /* compute the local portion of C (mpi mat) */
1749:   /*------------------------------------------*/
1750:   /* allocate bi array and free space for accumulating nonzero column info */
1751:   PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
1752:   bi[0] = 0;

1754:   /* set initial free space to be fill*(nnz(P_loc) + nnz(A_loc)) */
1755:   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1756:   PetscFreeSpaceGet(nnz,&free_space);
1757:   current_space = free_space;

1759:   PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1760:   for (k=0; k<merge->nrecv; k++) {
1761:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1762:     nrows       = *buf_ri_k[k];
1763:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1764:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure  */
1765:   }

1767:   MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1768:   rmax = 0;
1769:   for (i=0; i<pn; i++) {
1770:     /* add pdt[i,:]*AP into lnk */
1771:     pnz = pdti[i+1] - pdti[i];
1772:     ptJ = pdtj + pdti[i];
1773:     for (j=0; j<pnz; j++) {
1774:       row  = ptJ[j];  /* row of AP == col of Pt */
1775:       anz  = ai[row+1] - ai[row];
1776:       Jptr = aj + ai[row];
1777:       /* add non-zero cols of AP into the sorted linked list lnk */
1778:       PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1779:     }

1781:     /* add received col data into lnk */
1782:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1783:       if (i == *nextrow[k]) { /* i-th row */
1784:         nzi  = *(nextci[k]+1) - *nextci[k];
1785:         Jptr = buf_rj[k] + *nextci[k];
1786:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1787:         nextrow[k]++; nextci[k]++;
1788:       }
1789:     }
1790:     nnz = lnk[0];

1792:     /* if free space is not available, make more free space */
1793:     if (current_space->local_remaining<nnz) {
1794:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1795:       nspacedouble++;
1796:     }
1797:     /* copy data into free space, then initialize lnk */
1798:     PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1799:     MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);

1801:     current_space->array           += nnz;
1802:     current_space->local_used      += nnz;
1803:     current_space->local_remaining -= nnz;

1805:     bi[i+1] = bi[i] + nnz;
1806:     if (nnz > rmax) rmax = nnz;
1807:   }
1808:   PetscFree3(buf_ri_k,nextrow,nextci);

1810:   PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
1811:   PetscFreeSpaceContiguous(&free_space,bj);
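  /* bi/bj now hold the CSR structure of the locally owned rows of C = P^T*A, merging the
     local product Pd^T*A_loc with the received off-process contributions; dnz/onz hold the
     per-row diagonal/off-diagonal counts used by MatMPIAIJSetPreallocation() below. */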
1812:   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1813:   if (afill_tmp > afill) afill = afill_tmp;
1814:   PetscLLCondensedDestroy_Scalable(lnk);
1815:   MatDestroy(&POt);
1816:   MatDestroy(&PDt);

1818:   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
1819:   /*----------------------------------------------------------------------------------*/
1820:   PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);
1821:   PetscMemzero(vals,rmax*sizeof(PetscScalar));

1823:   MatCreate(comm,&Cmpi);
1824:   MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1825:   MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1826:   MatSetType(Cmpi,MATMPIAIJ);
1827:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1828:   MatPreallocateFinalize(dnz,onz);
1829:   MatSetBlockSize(Cmpi,1);
1830:   for (i=0; i<pn; i++) {
1831:     row  = i + rstart;
1832:     nnz  = bi[i+1] - bi[i];
1833:     Jptr = bj + bi[i];
1834:     MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1835:   }
1836:   MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1837:   MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1838:   PetscFree(vals);
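  /* The nonzero pattern of Cmpi is inserted here with explicit zeros (vals) so that the
     parallel structure and preallocation are fixed during the symbolic phase; the numeric
     routine can then presumably set values without further pattern changes or mallocs. */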

1840:   merge->bi        = bi;
1841:   merge->bj        = bj;
1842:   merge->coi       = coi;
1843:   merge->coj       = coj;
1844:   merge->buf_ri    = buf_ri;
1845:   merge->buf_rj    = buf_rj;
1846:   merge->owners_co = owners_co;
1847:   merge->destroy   = Cmpi->ops->destroy;
1848:   merge->duplicate = Cmpi->ops->duplicate;

1850:   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
1851:   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

1853:   /* attach the supporting struct to Cmpi for reuse */
1854:   c = (Mat_MPIAIJ*)Cmpi->data;

1856:   c->ptap     = ptap;
1857:   ptap->api   = NULL;
1858:   ptap->apj   = NULL;
1859:   ptap->merge = merge;
1860:   ptap->rmax  = rmax;
1861:   ptap->apa   = NULL;

1863:   *C = Cmpi;
1864: #if defined(PETSC_USE_INFO)
1865:   if (bi[pn] != 0) {
1866:     PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
1867:     PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
1868:   } else {
1869:     PetscInfo(Cmpi,"Empty matrix product\n");
1870:   }
1871: #endif
1872:   return(0);
1873: }