Actual source code: mpiptap.c

petsc-3.8.3 2017-12-09

  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */

  7:  #include <../src/mat/impls/aij/seq/aij.h>
  8:  #include <../src/mat/utils/freespace.h>
  9:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10:  #include <petscbt.h>
 11:  #include <petsctime.h>

 13: /* #define PTAP_PROFILE */

 15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 16: {
 17:   PetscErrorCode    ierr;
 18:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
 19:   Mat_PtAPMPI       *ptap=a->ptap;
 20:   PetscBool         iascii;
 21:   PetscViewerFormat format;

 24:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 32:       }
 33:     }
 34:   }
 35:   (ptap->view)(A,viewer);
 36:   return(0);
 37: }

 39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
 40: {
 42:   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
 43:   Mat_PtAPMPI    *ptap=a->ptap;

 46:   if (ptap) {
 47:     Mat_Merge_SeqsToMPI *merge=ptap->merge;
 48:     PetscFree2(ptap->startsj_s,ptap->startsj_r);
 49:     PetscFree(ptap->bufa);
 50:     MatDestroy(&ptap->P_loc);
 51:     MatDestroy(&ptap->P_oth);
 52:     MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 53:     MatDestroy(&ptap->Rd);
 54:     MatDestroy(&ptap->Ro);
 55:     if (ptap->AP_loc) { /* used by alg_rap */
 56:       Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 57:       PetscFree(ap->i);
 58:       PetscFree2(ap->j,ap->a);
 59:       MatDestroy(&ptap->AP_loc);
 60:     } else { /* used by alg_ptap */
 61:       PetscFree(ptap->api);
 62:       PetscFree(ptap->apj);
 63:     }
 64:     MatDestroy(&ptap->C_loc);
 65:     MatDestroy(&ptap->C_oth);
 66:     if (ptap->apa) {PetscFree(ptap->apa);}

 68:     if (merge) { /* used by alg_ptap */
 69:       PetscFree(merge->id_r);
 70:       PetscFree(merge->len_s);
 71:       PetscFree(merge->len_r);
 72:       PetscFree(merge->bi);
 73:       PetscFree(merge->bj);
 74:       PetscFree(merge->buf_ri[0]);
 75:       PetscFree(merge->buf_ri);
 76:       PetscFree(merge->buf_rj[0]);
 77:       PetscFree(merge->buf_rj);
 78:       PetscFree(merge->coi);
 79:       PetscFree(merge->coj);
 80:       PetscFree(merge->owners_co);
 81:       PetscLayoutDestroy(&merge->rowmap);
 82:       PetscFree(ptap->merge);
 83:     }

 85:     ptap->destroy(A);
 86:     PetscFree(ptap);
 87:   }
 88:   return(0);
 89: }

 91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
 92: {
 94:   Mat_MPIAIJ     *a     = (Mat_MPIAIJ*)A->data;
 95:   Mat_PtAPMPI    *ptap  = a->ptap;

 98:   (*ptap->duplicate)(A,op,M);
 99:   (*M)->ops->destroy   = ptap->destroy;
100:   (*M)->ops->duplicate = ptap->duplicate;
101:   (*M)->ops->view      = ptap->view;
102:   return(0);
103: }

105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108:   PetscBool      rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */
109:   MPI_Comm       comm;

112:   /* check if matrix local sizes are compatible */
113:   PetscObjectGetComm((PetscObject)A,&comm);
114:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
115:   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

117:   PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);
118:   if (scall == MAT_INITIAL_MATRIX) {
119:     PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
120:     if (rap) { /* do R=P^T locally, then C=R*A*P */
121:       MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
122:     } else {       /* do P^T*A*P */
123:       MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);
124:     }
125:     PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
126:   }
127:   PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
128:   (*(*C)->ops->ptapnumeric)(A,P,*C);
129:   PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
130:   return(0);
131: }
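
/*
   A minimal usage sketch, not part of this file: it shows how the dispatcher above is
   typically reached through the public MatPtAP() interface, first forming C with
   MAT_INITIAL_MATRIX and later refreshing its values with MAT_REUSE_MATRIX.  A and P
   are assumed to be assembled MPIAIJ matrices, the fill estimate 2.0 is only a guess,
   and the helper name FormGalerkinProduct is hypothetical.  The run-time option
   -matptap_via scalable|nonscalable (or hypre, when available) selects among the
   symbolic routines defined below.
*/
static PetscErrorCode FormGalerkinProduct(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr); /* symbolic + numeric */
  /* ... A changes values but keeps its nonzero pattern ... */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,C);CHKERRQ(ierr);   /* numeric phase only */
  return 0;
}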

133: #if defined(PETSC_HAVE_HYPRE)
134: PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
135: #endif

137: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
138: {
139:   PetscErrorCode    ierr;
140:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
141:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
142:   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
143:   Mat_PtAPMPI       *ptap = c->ptap;
144:   Mat               AP_loc,C_loc,C_oth;
145:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
146:   PetscScalar       *apa;
147:   const PetscInt    *cols;
148:   const PetscScalar *vals;

151:   MatZeroEntries(C);

153:   /* 1) get R = Pd^T,Ro = Po^T */
154:   if (ptap->reuse == MAT_REUSE_MATRIX) {
155:     MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
156:     MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
157:   }

159:   /* 2) get AP_loc */
160:   AP_loc = ptap->AP_loc;
161:   ap = (Mat_SeqAIJ*)AP_loc->data;

163:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
164:   /*-----------------------------------------------------*/
165:   if (ptap->reuse == MAT_REUSE_MATRIX) {
 166:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
167:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
168:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
169:   }

171:   /* 2-2) compute numeric A_loc*P - dominating part */
172:   /* ---------------------------------------------- */
173:   /* get data from symbolic products */
174:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
175:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
176:   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

178:   api   = ap->i;
179:   apj   = ap->j;
180:   for (i=0; i<am; i++) {
 181:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
182:     apnz = api[i+1] - api[i];
183:     apa = ap->a + api[i];
184:     PetscMemzero(apa,sizeof(PetscScalar)*apnz);
185:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
186:     PetscLogFlops(2.0*apnz);
187:   }

189:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
190:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
191:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

193:   C_loc = ptap->C_loc;
194:   C_oth = ptap->C_oth;

 196:   /* add C_loc and Co to C */
197:   MatGetOwnershipRange(C,&rstart,&rend);

199:   /* C_loc -> C */
200:   cm    = C_loc->rmap->N;
201:   c_seq = (Mat_SeqAIJ*)C_loc->data;
202:   cols = c_seq->j;
203:   vals = c_seq->a;
204:   for (i=0; i<cm; i++) {
205:     ncols = c_seq->i[i+1] - c_seq->i[i];
206:     row = rstart + i;
207:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
208:     cols += ncols; vals += ncols;
209:   }

211:   /* Co -> C, off-processor part */
212:   cm = C_oth->rmap->N;
213:   c_seq = (Mat_SeqAIJ*)C_oth->data;
214:   cols = c_seq->j;
215:   vals = c_seq->a;
216:   for (i=0; i<cm; i++) {
217:     ncols = c_seq->i[i+1] - c_seq->i[i];
218:     row = p->garray[i];
219:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
220:     cols += ncols; vals += ncols;
221:   }
222:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
223:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

225:   ptap->reuse = MAT_REUSE_MATRIX;
226:   return(0);
227: }
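
/*
   A minimal sketch of the accumulation pattern used in the numeric routine above:
   the rows of a sequential product (C_loc or C_oth) are added into the parallel C
   with ADD_VALUES, where growmap[] supplies the global row for each local row
   (rstart+i for C_loc, p->garray[i] for C_oth).  The helper name AddSeqAIJIntoMPIAIJ
   and its growmap argument are assumptions made for illustration only.
*/
static PetscErrorCode AddSeqAIJIntoMPIAIJ(Mat Cseq,const PetscInt *growmap,Mat C)
{
  PetscErrorCode    ierr;
  Mat_SeqAIJ        *cs   = (Mat_SeqAIJ*)Cseq->data;
  const PetscInt    *cols = cs->j;
  const PetscScalar *vals = cs->a;
  PetscInt          i,ncols,row,m = Cseq->rmap->N;

  for (i=0; i<m; i++) {
    ncols = cs->i[i+1] - cs->i[i];
    row   = growmap[i];      /* global row index of C */
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  return 0;
}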

229: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
230: {
231:   PetscErrorCode      ierr;
232:   Mat_PtAPMPI         *ptap;
233:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
234:   MPI_Comm            comm;
235:   PetscMPIInt         size,rank;
236:   Mat                 Cmpi,P_loc,P_oth;
237:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
238:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
239:   PetscInt            *lnk,i,k,pnz,row,nsend;
240:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
241:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
242:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
243:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
244:   MPI_Request         *swaits,*rwaits;
245:   MPI_Status          *sstatus,rstatus;
246:   PetscLayout         rowmap;
247:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
248:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
249:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
250:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
251:   PetscScalar         *apv;
252:   PetscTable          ta;
253: #if defined(PETSC_USE_INFO)
254:   PetscReal           apfill;
255: #endif

258:   PetscObjectGetComm((PetscObject)A,&comm);
259:   MPI_Comm_size(comm,&size);
260:   MPI_Comm_rank(comm,&rank);

262:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

264:   /* create symbolic parallel matrix Cmpi */
265:   MatCreate(comm,&Cmpi);
266:   MatSetType(Cmpi,MATMPIAIJ);

 268:   /* create struct Mat_PtAPMPI and attach it to C later */
269:   PetscNew(&ptap);
270:   ptap->reuse = MAT_INITIAL_MATRIX;
271:   ptap->algType = 0;

273:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
274:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
275:   /* get P_loc by taking all local rows of P */
276:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

278:   ptap->P_loc = P_loc;
279:   ptap->P_oth = P_oth;

281:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
282:   /* --------------------------------- */
283:   MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
284:   MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

286:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
287:   /* ----------------------------------------------------------------- */
288:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
289:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

291:   /* create and initialize a linked list */
 292:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
293:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
294:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
295:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

297:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

299:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
300:   if (ao) {
301:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
302:   } else {
303:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
304:   }
305:   current_space = free_space;
306:   nspacedouble  = 0;

308:   PetscMalloc1(am+1,&api);
309:   api[0] = 0;
310:   for (i=0; i<am; i++) {
311:     /* diagonal portion: Ad[i,:]*P */
312:     ai = ad->i; pi = p_loc->i;
313:     nzi = ai[i+1] - ai[i];
314:     aj  = ad->j + ai[i];
315:     for (j=0; j<nzi; j++) {
316:       row  = aj[j];
317:       pnz  = pi[row+1] - pi[row];
318:       Jptr = p_loc->j + pi[row];
319:       /* add non-zero cols of P into the sorted linked list lnk */
320:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
321:     }
322:     /* off-diagonal portion: Ao[i,:]*P */
323:     if (ao) {
324:       ai = ao->i; pi = p_oth->i;
325:       nzi = ai[i+1] - ai[i];
326:       aj  = ao->j + ai[i];
327:       for (j=0; j<nzi; j++) {
328:         row  = aj[j];
329:         pnz  = pi[row+1] - pi[row];
330:         Jptr = p_oth->j + pi[row];
331:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
332:       }
333:     }
334:     apnz     = lnk[0];
335:     api[i+1] = api[i] + apnz;

337:     /* if free space is not available, double the total space in the list */
338:     if (current_space->local_remaining<apnz) {
339:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
340:       nspacedouble++;
341:     }

343:     /* Copy data into free space, then initialize lnk */
344:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

346:     current_space->array           += apnz;
347:     current_space->local_used      += apnz;
348:     current_space->local_remaining -= apnz;
349:   }
350:   /* Allocate space for apj and apv, initialize apj, and */
351:   /* destroy list of free space and other temporary array(s) */
352:   PetscMalloc2(api[am],&apj,api[am],&apv);
353:   PetscFreeSpaceContiguous(&free_space,apj);
354:   PetscLLCondensedDestroy_Scalable(lnk);

356:   /* Create AP_loc for reuse */
357:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

359: #if defined(PETSC_USE_INFO)
360:   if (ao) {
361:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
362:   } else {
363:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
364:   }
365:   ptap->AP_loc->info.mallocs           = nspacedouble;
366:   ptap->AP_loc->info.fill_ratio_given  = fill;
367:   ptap->AP_loc->info.fill_ratio_needed = apfill;

369:   if (api[am]) {
370:     PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
 371:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
372:   } else {
373:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
374:   }
375: #endif

377:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
378:   /* ------------------------------------ */
379:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

381:   /* (3) send coj of C_oth to other processors  */
382:   /* ------------------------------------------ */
383:   /* determine row ownership */
384:   PetscLayoutCreate(comm,&rowmap);
385:   rowmap->n  = pn;
386:   rowmap->bs = 1;
387:   PetscLayoutSetUp(rowmap);
388:   owners = rowmap->range;

390:   /* determine the number of messages to send, their lengths */
391:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
392:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
393:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

395:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
396:   coi   = c_oth->i; coj = c_oth->j;
397:   con   = ptap->C_oth->rmap->n;
398:   proc  = 0;
399:   for (i=0; i<con; i++) {
400:     while (prmap[i] >= owners[proc+1]) proc++;
401:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
402:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
403:   }

405:   len          = 0; /* max length of buf_si[], see (4) */
406:   owners_co[0] = 0;
407:   nsend        = 0;
408:   for (proc=0; proc<size; proc++) {
409:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
410:     if (len_s[proc]) {
411:       nsend++;
412:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
413:       len         += len_si[proc];
414:     }
415:   }

417:   /* determine the number and length of messages to receive for coi and coj  */
418:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
419:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

421:   /* post the Irecv and Isend of coj */
422:   PetscCommGetNewTag(comm,&tagj);
423:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
424:   PetscMalloc1(nsend+1,&swaits);
425:   for (proc=0, k=0; proc<size; proc++) {
426:     if (!len_s[proc]) continue;
427:     i    = owners_co[proc];
428:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
429:     k++;
430:   }

432:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
433:   /* ---------------------------------------- */
434:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
435:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

437:   /* receives coj are complete */
438:   for (i=0; i<nrecv; i++) {
439:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
440:   }
441:   PetscFree(rwaits);
442:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

444:   /* add received column indices into ta to update Crmax */
445:   for (k=0; k<nrecv; k++) {/* k-th received message */
446:     Jptr = buf_rj[k];
447:     for (j=0; j<len_r[k]; j++) {
448:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
449:     }
450:   }
451:   PetscTableGetCount(ta,&Crmax);
452:   PetscTableDestroy(&ta);

454:   /* (4) send and recv coi */
455:   /*-----------------------*/
456:   PetscCommGetNewTag(comm,&tagi);
457:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
458:   PetscMalloc1(len+1,&buf_s);
459:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
460:   for (proc=0,k=0; proc<size; proc++) {
461:     if (!len_s[proc]) continue;
462:     /* form outgoing message for i-structure:
463:          buf_si[0]:                 nrows to be sent
464:                [1:nrows]:           row index (global)
465:                [nrows+1:2*nrows+1]: i-structure index
466:     */
467:     /*-------------------------------------------*/
468:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
469:     buf_si_i    = buf_si + nrows+1;
470:     buf_si[0]   = nrows;
471:     buf_si_i[0] = 0;
472:     nrows       = 0;
473:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
474:       nzi = coi[i+1] - coi[i];
475:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
476:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
477:       nrows++;
478:     }
479:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
480:     k++;
481:     buf_si += len_si[proc];
482:   }
483:   for (i=0; i<nrecv; i++) {
484:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
485:   }
486:   PetscFree(rwaits);
487:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

489:   PetscFree4(len_s,len_si,sstatus,owners_co);
490:   PetscFree(len_ri);
491:   PetscFree(swaits);
492:   PetscFree(buf_s);

494:   /* (5) compute the local portion of Cmpi      */
495:   /* ------------------------------------------ */
 496:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
497:   PetscFreeSpaceGet(Crmax,&free_space);
498:   current_space = free_space;

500:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
501:   for (k=0; k<nrecv; k++) {
502:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
503:     nrows       = *buf_ri_k[k];
504:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
 505:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
506:   }

508:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
509:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
510:   for (i=0; i<pn; i++) {
511:     /* add C_loc into Cmpi */
512:     nzi  = c_loc->i[i+1] - c_loc->i[i];
513:     Jptr = c_loc->j + c_loc->i[i];
514:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

516:     /* add received col data into lnk */
517:     for (k=0; k<nrecv; k++) { /* k-th received message */
518:       if (i == *nextrow[k]) { /* i-th row */
519:         nzi  = *(nextci[k]+1) - *nextci[k];
520:         Jptr = buf_rj[k] + *nextci[k];
521:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
522:         nextrow[k]++; nextci[k]++;
523:       }
524:     }
525:     nzi = lnk[0];

527:     /* copy data into free space, then initialize lnk */
528:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
529:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
530:   }
531:   PetscFree3(buf_ri_k,nextrow,nextci);
532:   PetscLLCondensedDestroy_Scalable(lnk);
533:   PetscFreeSpaceDestroy(free_space);

535:   /* local sizes and preallocation */
536:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
537:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
538:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
539:   MatPreallocateFinalize(dnz,onz);

541:   /* members in merge */
542:   PetscFree(id_r);
543:   PetscFree(len_r);
544:   PetscFree(buf_ri[0]);
545:   PetscFree(buf_ri);
546:   PetscFree(buf_rj[0]);
547:   PetscFree(buf_rj);
548:   PetscLayoutDestroy(&rowmap);

550:   /* attach the supporting struct to Cmpi for reuse */
551:   c = (Mat_MPIAIJ*)Cmpi->data;
552:   c->ptap         = ptap;
553:   ptap->duplicate = Cmpi->ops->duplicate;
554:   ptap->destroy   = Cmpi->ops->destroy;
555:   ptap->view      = Cmpi->ops->view;

557:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
558:   Cmpi->assembled        = PETSC_FALSE;
559:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
560:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
561:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
562:   *C                     = Cmpi;
563:   return(0);
564: }
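
/*
   A minimal sketch of the per-row operation that PetscLLCondensedAddSorted_Scalable()
   and PetscLLCondensedClean_Scalable() perform in the symbolic loops above: forming
   the sorted union of sorted column-index lists (the rows of P_loc/P_oth selected by
   one row of A), with duplicates counted once.  MergeSortedUnion is a hypothetical
   helper that merges two such lists; the real code keeps a condensed linked list so
   that many lists can be merged without repeated copying.
*/
static PetscInt MergeSortedUnion(PetscInt na,const PetscInt *a,PetscInt nb,const PetscInt *b,PetscInt *out)
{
  PetscInt i = 0,j = 0,n = 0;

  while (i < na && j < nb) {
    if (a[i] < b[j])      out[n++] = a[i++];
    else if (a[i] > b[j]) out[n++] = b[j++];
    else                  {out[n++] = a[i++]; j++;}  /* shared column kept once */
  }
  while (i < na) out[n++] = a[i++];
  while (j < nb) out[n++] = b[j++];
  return n;                                          /* row length, e.g. apnz */
}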

566: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
567: {
568:   PetscErrorCode      ierr;
569:   Mat_PtAPMPI         *ptap;
570:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
571:   MPI_Comm            comm;
572:   PetscMPIInt         size,rank;
573:   Mat                 Cmpi;
574:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
575:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
576:   PetscInt            *lnk,i,k,pnz,row,nsend;
577:   PetscBT             lnkbt;
578:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
579:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
580:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
581:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
582:   MPI_Request         *swaits,*rwaits;
583:   MPI_Status          *sstatus,rstatus;
584:   PetscLayout         rowmap;
585:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
586:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
587:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
588:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
589:   PetscScalar         *apv;
590:   PetscTable          ta;
591: #if defined(PETSC_HAVE_HYPRE)
592:   const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
593:   PetscInt            nalg = 3;
594: #else
595:   const char          *algTypes[2] = {"scalable","nonscalable"};
596:   PetscInt            nalg = 2;
597: #endif
598:   PetscInt            alg = 1; /* set default algorithm */
599: #if defined(PETSC_USE_INFO)
600:   PetscReal           apfill;
601: #endif
602: #if defined(PTAP_PROFILE)
603:   PetscLogDouble      t0,t1,t11,t12,t2,t3,t4;
604: #endif
605:   PetscBool           flg;

608:   PetscObjectGetComm((PetscObject)A,&comm);

610:   /* pick an algorithm */
611:   PetscObjectOptionsBegin((PetscObject)A);
612:   PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
613:   PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,&flg);
614:   PetscOptionsEnd();

616:   if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
617:     MatInfo     Ainfo,Pinfo;
618:     PetscInt    nz_local;
619:     PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;

621:     MatGetInfo(A,MAT_LOCAL,&Ainfo);
622:     MatGetInfo(P,MAT_LOCAL,&Pinfo);
623:     nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

625:     if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
626:     MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);

628:     if (alg_scalable) {
 629:       alg  = 0; /* the scalable algorithm may be ~50% slower than the nonscalable algorithm */
630:     }
631:   }

633:   if (alg == 0) {
634:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
635:     (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
636:     return(0);

638: #if defined(PETSC_HAVE_HYPRE)
639:   } else if (alg == 2) {
640:     /* Use boomerAMGBuildCoarseOperator */
641:     MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
642:     return(0);
643: #endif
644:   }

646: #if defined(PTAP_PROFILE)
647:   PetscTime(&t0);
648: #endif

650:   MPI_Comm_size(comm,&size);
651:   MPI_Comm_rank(comm,&rank);

653:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

655:   /* create symbolic parallel matrix Cmpi */
656:   MatCreate(comm,&Cmpi);
657:   MatSetType(Cmpi,MATMPIAIJ);

659:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
660:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

 662:   /* create struct Mat_PtAPMPI and attach it to C later */
663:   PetscNew(&ptap);
664:   ptap->reuse = MAT_INITIAL_MATRIX;
665:   ptap->algType = alg;

667:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
668:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
669:   /* get P_loc by taking all local rows of P */
670:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

672:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
673:   /* --------------------------------- */
674:   MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
675:   MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
676: #if defined(PTAP_PROFILE)
677:   PetscTime(&t11);
678: #endif

680:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
681:   /* ----------------------------------------------------------------- */
682:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
683:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

685:   /* create and initialize a linked list */
 686:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
687:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
688:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
689:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
690:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

692:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);

694:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
695:   if (ao) {
696:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
697:   } else {
698:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
699:   }
700:   current_space = free_space;
701:   nspacedouble  = 0;

703:   PetscMalloc1(am+1,&api);
704:   api[0] = 0;
705:   for (i=0; i<am; i++) {
706:     /* diagonal portion: Ad[i,:]*P */
707:     ai = ad->i; pi = p_loc->i;
708:     nzi = ai[i+1] - ai[i];
709:     aj  = ad->j + ai[i];
710:     for (j=0; j<nzi; j++) {
711:       row  = aj[j];
712:       pnz  = pi[row+1] - pi[row];
713:       Jptr = p_loc->j + pi[row];
714:       /* add non-zero cols of P into the sorted linked list lnk */
715:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
716:     }
717:     /* off-diagonal portion: Ao[i,:]*P */
718:     if (ao) {
719:       ai = ao->i; pi = p_oth->i;
720:       nzi = ai[i+1] - ai[i];
721:       aj  = ao->j + ai[i];
722:       for (j=0; j<nzi; j++) {
723:         row  = aj[j];
724:         pnz  = pi[row+1] - pi[row];
725:         Jptr = p_oth->j + pi[row];
726:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
727:       }
728:     }
729:     apnz     = lnk[0];
730:     api[i+1] = api[i] + apnz;
731:     if (ap_rmax < apnz) ap_rmax = apnz;

733:     /* if free space is not available, double the total space in the list */
734:     if (current_space->local_remaining<apnz) {
735:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
736:       nspacedouble++;
737:     }

739:     /* Copy data into free space, then initialize lnk */
740:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

742:     current_space->array           += apnz;
743:     current_space->local_used      += apnz;
744:     current_space->local_remaining -= apnz;
745:   }
746:   /* Allocate space for apj and apv, initialize apj, and */
747:   /* destroy list of free space and other temporary array(s) */
748:   PetscMalloc2(api[am],&apj,api[am],&apv);
749:   PetscFreeSpaceContiguous(&free_space,apj);
750:   PetscLLDestroy(lnk,lnkbt);

752:   /* Create AP_loc for reuse */
753:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

755: #if defined(PETSC_USE_INFO)
756:   if (ao) {
757:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
758:   } else {
759:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
760:   }
761:   ptap->AP_loc->info.mallocs           = nspacedouble;
762:   ptap->AP_loc->info.fill_ratio_given  = fill;
763:   ptap->AP_loc->info.fill_ratio_needed = apfill;

765:   if (api[am]) {
766:     PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
 767:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
768:   } else {
769:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
770:   }
771: #endif

773: #if defined(PTAP_PROFILE)
774:   PetscTime(&t12);
775: #endif

777:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
778:   /* ------------------------------------ */
779:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
780: #if defined(PTAP_PROFILE)
781:   PetscTime(&t1);
782: #endif

784:   /* (3) send coj of C_oth to other processors  */
785:   /* ------------------------------------------ */
786:   /* determine row ownership */
787:   PetscLayoutCreate(comm,&rowmap);
788:   rowmap->n  = pn;
789:   rowmap->bs = 1;
790:   PetscLayoutSetUp(rowmap);
791:   owners = rowmap->range;

793:   /* determine the number of messages to send, their lengths */
794:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
795:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
796:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

798:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
799:   coi   = c_oth->i; coj = c_oth->j;
800:   con   = ptap->C_oth->rmap->n;
801:   proc  = 0;
802:   for (i=0; i<con; i++) {
803:     while (prmap[i] >= owners[proc+1]) proc++;
804:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
805:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
806:   }

808:   len          = 0; /* max length of buf_si[], see (4) */
809:   owners_co[0] = 0;
810:   nsend        = 0;
811:   for (proc=0; proc<size; proc++) {
812:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
813:     if (len_s[proc]) {
814:       nsend++;
815:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
816:       len         += len_si[proc];
817:     }
818:   }

820:   /* determine the number and length of messages to receive for coi and coj  */
821:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
822:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

824:   /* post the Irecv and Isend of coj */
825:   PetscCommGetNewTag(comm,&tagj);
826:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
827:   PetscMalloc1(nsend+1,&swaits);
828:   for (proc=0, k=0; proc<size; proc++) {
829:     if (!len_s[proc]) continue;
830:     i    = owners_co[proc];
831:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
832:     k++;
833:   }

835:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
836:   /* ---------------------------------------- */
837:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
838:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

840:   /* receives coj are complete */
841:   for (i=0; i<nrecv; i++) {
842:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
843:   }
844:   PetscFree(rwaits);
845:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

847:   /* add received column indices into ta to update Crmax */
848:   for (k=0; k<nrecv; k++) {/* k-th received message */
849:     Jptr = buf_rj[k];
850:     for (j=0; j<len_r[k]; j++) {
851:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
852:     }
853:   }
854:   PetscTableGetCount(ta,&Crmax);
855:   PetscTableDestroy(&ta);

857:   /* (4) send and recv coi */
858:   /*-----------------------*/
859:   PetscCommGetNewTag(comm,&tagi);
860:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
861:   PetscMalloc1(len+1,&buf_s);
862:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
863:   for (proc=0,k=0; proc<size; proc++) {
864:     if (!len_s[proc]) continue;
865:     /* form outgoing message for i-structure:
866:          buf_si[0]:                 nrows to be sent
867:                [1:nrows]:           row index (global)
868:                [nrows+1:2*nrows+1]: i-structure index
869:     */
870:     /*-------------------------------------------*/
871:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
872:     buf_si_i    = buf_si + nrows+1;
873:     buf_si[0]   = nrows;
874:     buf_si_i[0] = 0;
875:     nrows       = 0;
876:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
877:       nzi = coi[i+1] - coi[i];
878:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
879:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
880:       nrows++;
881:     }
882:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
883:     k++;
884:     buf_si += len_si[proc];
885:   }
886:   for (i=0; i<nrecv; i++) {
887:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
888:   }
889:   PetscFree(rwaits);
890:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

892:   PetscFree4(len_s,len_si,sstatus,owners_co);
893:   PetscFree(len_ri);
894:   PetscFree(swaits);
895:   PetscFree(buf_s);
896: #if defined(PTAP_PROFILE)
897:   PetscTime(&t2);
898: #endif
899:   /* (5) compute the local portion of Cmpi      */
900:   /* ------------------------------------------ */
 901:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
902:   PetscFreeSpaceGet(Crmax,&free_space);
903:   current_space = free_space;

905:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
906:   for (k=0; k<nrecv; k++) {
907:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
908:     nrows       = *buf_ri_k[k];
909:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
 910:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
911:   }

913:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
914:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
915:   for (i=0; i<pn; i++) {
916:     /* add C_loc into Cmpi */
917:     nzi  = c_loc->i[i+1] - c_loc->i[i];
918:     Jptr = c_loc->j + c_loc->i[i];
919:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

921:     /* add received col data into lnk */
922:     for (k=0; k<nrecv; k++) { /* k-th received message */
923:       if (i == *nextrow[k]) { /* i-th row */
924:         nzi  = *(nextci[k]+1) - *nextci[k];
925:         Jptr = buf_rj[k] + *nextci[k];
926:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
927:         nextrow[k]++; nextci[k]++;
928:       }
929:     }
930:     nzi = lnk[0];

932:     /* copy data into free space, then initialize lnk */
933:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
934:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
935:   }
936:   PetscFree3(buf_ri_k,nextrow,nextci);
937:   PetscLLDestroy(lnk,lnkbt);
938:   PetscFreeSpaceDestroy(free_space);
939: #if defined(PTAP_PROFILE)
940:   PetscTime(&t3);
941: #endif

943:   /* local sizes and preallocation */
944:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
945:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
946:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
947:   MatPreallocateFinalize(dnz,onz);

949:   /* members in merge */
950:   PetscFree(id_r);
951:   PetscFree(len_r);
952:   PetscFree(buf_ri[0]);
953:   PetscFree(buf_ri);
954:   PetscFree(buf_rj[0]);
955:   PetscFree(buf_rj);
956:   PetscLayoutDestroy(&rowmap);

958:   /* attach the supporting struct to Cmpi for reuse */
959:   c = (Mat_MPIAIJ*)Cmpi->data;
960:   c->ptap         = ptap;
961:   ptap->duplicate = Cmpi->ops->duplicate;
962:   ptap->destroy   = Cmpi->ops->destroy;
963:   ptap->view      = Cmpi->ops->view;

965:   if (alg == 1) {
966:     PetscCalloc1(pN,&ptap->apa);
967:   }

969:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
970:   Cmpi->assembled        = PETSC_FALSE;
971:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
972:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
973:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
974:   *C                     = Cmpi;

976: #if defined(PTAP_PROFILE)
977:   PetscTime(&t4);
978:   if (rank == 1) {
 979:     printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);
980:   }
981: #endif
982:   return(0);
983: }
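
/*
   A minimal sketch of decoding one received i-structure message, whose layout is
   documented in step (4) above: entry 0 holds nrows, entries 1..nrows hold local row
   indices on the receiver, and entries nrows+1..2*nrows+1 hold CSR-style offsets into
   the matching column message buf_rj[k].  DumpRecvdIStructure is a hypothetical
   helper used only to make that layout explicit.
*/
static PetscErrorCode DumpRecvdIStructure(const PetscInt *buf_si,const PetscInt *buf_sj)
{
  PetscErrorCode ierr;
  PetscInt       nrows = buf_si[0];
  const PetscInt *rows = buf_si + 1;           /* local row indices */
  const PetscInt *ci   = buf_si + nrows + 1;   /* ci[r+1]-ci[r] = nnz of row r */
  PetscInt       r,c;

  for (r=0; r<nrows; r++) {
    ierr = PetscPrintf(PETSC_COMM_SELF,"row %D:",rows[r]);CHKERRQ(ierr);
    for (c=ci[r]; c<ci[r+1]; c++) {
      ierr = PetscPrintf(PETSC_COMM_SELF," %D",buf_sj[c]);CHKERRQ(ierr);
    }
    ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
  }
  return 0;
}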

985: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
986: {
987:   PetscErrorCode    ierr;
988:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
989:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
990:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
991:   Mat_PtAPMPI       *ptap = c->ptap;
992:   Mat               AP_loc,C_loc,C_oth;
993:   PetscInt          i,rstart,rend,cm,ncols,row;
994:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
995:   PetscScalar       *apa;
996:   const PetscInt    *cols;
997:   const PetscScalar *vals;
998: #if defined(PTAP_PROFILE)
999:   PetscMPIInt       rank;
1000:   MPI_Comm          comm;
1001:   PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
1002: #endif

1005:   MatZeroEntries(C);

1007:   /* 1) get R = Pd^T,Ro = Po^T */
1008: #if defined(PTAP_PROFILE)
1009:   PetscTime(&t0);
1010: #endif
1011:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1012:     MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1013:     MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1014:   }
1015: #if defined(PTAP_PROFILE)
1016:   PetscTime(&t1);
1017:   eR = t1 - t0;
1018: #endif

1020:   /* 2) get AP_loc */
1021:   AP_loc = ptap->AP_loc;
1022:   ap = (Mat_SeqAIJ*)AP_loc->data;

1024:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1025:   /*-----------------------------------------------------*/
1026:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1027:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1028:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1029:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1030:   }

1032:   /* 2-2) compute numeric A_loc*P - dominating part */
1033:   /* ---------------------------------------------- */
1034:   /* get data from symbolic products */
1035:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1036:   if (ptap->P_oth) {
1037:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1038:   }
1039:   apa   = ptap->apa;
1040:   api   = ap->i;
1041:   apj   = ap->j;
1042:   for (i=0; i<am; i++) {
1043:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1044:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1045:     apnz = api[i+1] - api[i];
1046:     for (j=0; j<apnz; j++) {
1047:       col = apj[j+api[i]];
1048:       ap->a[j+ap->i[i]] = apa[col];
1049:       apa[col] = 0.0;
1050:     }
1051:     PetscLogFlops(2.0*apnz);
1052:   }
1053: #if defined(PTAP_PROFILE)
1054:   PetscTime(&t2);
1055:   eAP = t2 - t1;
1056: #endif

1058:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1059:   ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1060:   ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1061:   C_loc = ptap->C_loc;
1062:   C_oth = ptap->C_oth;
1063: #if defined(PTAP_PROFILE)
1064:   PetscTime(&t3);
1065:   eCseq = t3 - t2;
1066: #endif

1068:   /* add C_loc and Co to C */
1069:   MatGetOwnershipRange(C,&rstart,&rend);

1071:   /* C_loc -> C */
1072:   cm    = C_loc->rmap->N;
1073:   c_seq = (Mat_SeqAIJ*)C_loc->data;
1074:   cols = c_seq->j;
1075:   vals = c_seq->a;
1076:   for (i=0; i<cm; i++) {
1077:     ncols = c_seq->i[i+1] - c_seq->i[i];
1078:     row = rstart + i;
1079:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1080:     cols += ncols; vals += ncols;
1081:   }
1082: 
1083:   /* Co -> C, off-processor part */
1084:   cm = C_oth->rmap->N;
1085:   c_seq = (Mat_SeqAIJ*)C_oth->data;
1086:   cols = c_seq->j;
1087:   vals = c_seq->a;
1088:   for (i=0; i<cm; i++) {
1089:     ncols = c_seq->i[i+1] - c_seq->i[i];
1090:     row = p->garray[i];
1091:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1092:     cols += ncols; vals += ncols;
1093:   }
1094:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1095:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1096: #if defined(PTAP_PROFILE)
1097:   PetscTime(&t4);
1098:   eCmpi = t4 - t3;

1100:   PetscObjectGetComm((PetscObject)C,&comm);
1101:   MPI_Comm_rank(comm,&rank);
1102:   if (rank==1) {
1103:     PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);
1104:   }
1105: #endif
1106:   ptap->reuse = MAT_REUSE_MATRIX;
1107:   return(0);
1108: }
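
/*
   A minimal sketch of the dense-accumulator ("dense axpy") pattern used in the row
   loop of the numeric routine above: values are accumulated into a dense work array
   apa of length pN, then only the columns recorded in the symbolic pattern apj are
   gathered into CSR storage and reset to zero, so apa never has to be cleared in
   full.  GatherDenseRow is a hypothetical helper name.
*/
static void GatherDenseRow(PetscInt apnz,const PetscInt *apj,PetscScalar *apa,PetscScalar *rowvals)
{
  PetscInt j,col;

  for (j=0; j<apnz; j++) {
    col        = apj[j];    /* column known from the symbolic phase */
    rowvals[j] = apa[col];  /* gather accumulated value into CSR values */
    apa[col]   = 0.0;       /* reset only the touched entries */
  }
}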

1110: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C)
1111: {
1112:   PetscErrorCode      ierr;
1113:   Mat                 Cmpi;
1114:   Mat_PtAPMPI         *ptap;
1115:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1116:   Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1117:   Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1118:   Mat_SeqAIJ          *p_loc,*p_oth;
1119:   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
1120:   PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
1121:   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1122:   PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
1123:   PetscBT             lnkbt;
1124:   MPI_Comm            comm;
1125:   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
1126:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1127:   PetscInt            len,proc,*dnz,*onz,*owners;
1128:   PetscInt            nzi,*pti,*ptj;
1129:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1130:   MPI_Request         *swaits,*rwaits;
1131:   MPI_Status          *sstatus,rstatus;
1132:   Mat_Merge_SeqsToMPI *merge;
1133:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
1134:   PetscReal           afill=1.0,afill_tmp;

1137:   PetscObjectGetComm((PetscObject)A,&comm);
1138:   MPI_Comm_size(comm,&size);
1139:   MPI_Comm_rank(comm,&rank);

1141:   /* create struct Mat_PtAPMPI and attach it to C later */
1142:   PetscNew(&ptap);
1143:   PetscNew(&merge);
1144:   ptap->merge = merge;
1145:   ptap->reuse = MAT_INITIAL_MATRIX;

1147:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1148:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);

1150:   /* get P_loc by taking all local rows of P */
1151:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

1153:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1154:   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1155:   pi_loc = p_loc->i; pj_loc = p_loc->j;
1156:   pi_oth = p_oth->i; pj_oth = p_oth->j;

1158:   /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
1159:   /*--------------------------------------------------------------------------*/
1160:   PetscMalloc1(am+1,&api);
1161:   api[0] = 0;

1163:   /* create and initialize a linked list */
1164:   PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);

1166:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */
1167:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
1168:   current_space = free_space;

1170:   for (i=0; i<am; i++) {
1171:     /* diagonal portion of A */
1172:     nzi = adi[i+1] - adi[i];
1173:     aj  = ad->j + adi[i];
1174:     for (j=0; j<nzi; j++) {
1175:       row  = aj[j];
1176:       pnz  = pi_loc[row+1] - pi_loc[row];
1177:       Jptr = pj_loc + pi_loc[row];
1178:       /* add non-zero cols of P into the sorted linked list lnk */
1179:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1180:     }
1181:     /* off-diagonal portion of A */
1182:     nzi = aoi[i+1] - aoi[i];
1183:     aj  = ao->j + aoi[i];
1184:     for (j=0; j<nzi; j++) {
1185:       row  = aj[j];
1186:       pnz  = pi_oth[row+1] - pi_oth[row];
1187:       Jptr = pj_oth + pi_oth[row];
1188:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1189:     }
1190:     apnz     = lnk[0];
1191:     api[i+1] = api[i] + apnz;
1192:     if (ap_rmax < apnz) ap_rmax = apnz;

1194:     /* if free space is not available, double the total space in the list */
1195:     if (current_space->local_remaining<apnz) {
1196:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1197:       nspacedouble++;
1198:     }

1200:     /* Copy data into free space, then initialize lnk */
1201:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

1203:     current_space->array           += apnz;
1204:     current_space->local_used      += apnz;
1205:     current_space->local_remaining -= apnz;
1206:   }

1208:   /* Allocate space for apj, initialize apj, and */
1209:   /* destroy list of free space and other temporary array(s) */
1210:   PetscMalloc1(api[am]+1,&apj);
1211:   PetscFreeSpaceContiguous(&free_space,apj);
1212:   afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
1213:   if (afill_tmp > afill) afill = afill_tmp;

1215:   /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
1216:   /*-----------------------------------------------------------------*/
1217:   MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);

1219:   /* then, compute symbolic Co = (p->B)^T*AP */
1220:   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1221:                          >= (num of nonzero rows of C_seq) - pn */
1222:   PetscMalloc1(pon+1,&coi);
1223:   coi[0] = 0;

1225:   /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
1226:   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am]));
1227:   PetscFreeSpaceGet(nnz,&free_space);
1228:   current_space = free_space;

1230:   for (i=0; i<pon; i++) {
1231:     pnz = poti[i+1] - poti[i];
1232:     ptJ = potj + poti[i];
1233:     for (j=0; j<pnz; j++) {
1234:       row  = ptJ[j]; /* row of AP == col of Pot */
1235:       apnz = api[row+1] - api[row];
1236:       Jptr = apj + api[row];
1237:       /* add non-zero cols of AP into the sorted linked list lnk */
1238:       PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1239:     }
1240:     nnz = lnk[0];

1242:     /* If free space is not available, double the total space in the list */
1243:     if (current_space->local_remaining<nnz) {
1244:       PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1245:       nspacedouble++;
1246:     }

1248:     /* Copy data into free space, and zero out the dense rows */
1249:     PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);

1251:     current_space->array           += nnz;
1252:     current_space->local_used      += nnz;
1253:     current_space->local_remaining -= nnz;

1255:     coi[i+1] = coi[i] + nnz;
1256:   }
1257: 
1258:   PetscMalloc1(coi[pon],&coj);
1259:   PetscFreeSpaceContiguous(&free_space,coj);
1260:   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
1261:   if (afill_tmp > afill) afill = afill_tmp;
1262:   MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);

1264:   /* (3) send j-array (coj) of Co to other processors */
1265:   /*--------------------------------------------------*/
1266:   PetscCalloc1(size,&merge->len_s);
1267:   len_s        = merge->len_s;
1268:   merge->nsend = 0;


1271:   /* determine row ownership */
1272:   PetscLayoutCreate(comm,&merge->rowmap);
1273:   merge->rowmap->n  = pn;
1274:   merge->rowmap->bs = 1;

1276:   PetscLayoutSetUp(merge->rowmap);
1277:   owners = merge->rowmap->range;

1279:   /* determine the number of messages to send, their lengths */
1280:   PetscMalloc2(size,&len_si,size,&sstatus);
1281:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1282:   PetscMalloc1(size+2,&owners_co);

1284:   proc = 0;
1285:   for (i=0; i<pon; i++) {
1286:     while (prmap[i] >= owners[proc+1]) proc++;
1287:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1288:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1289:   }

1291:   len          = 0; /* max length of buf_si[], see (4) */
1292:   owners_co[0] = 0;
1293:   for (proc=0; proc<size; proc++) {
1294:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1295:     if (len_s[proc]) {
1296:       merge->nsend++;
1297:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1298:       len         += len_si[proc];
1299:     }
1300:   }

1302:   /* determine the number and length of messages to receive for coi and coj  */
1303:   PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1304:   PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);

1306:   /* post the Irecv and Isend of coj */
1307:   PetscCommGetNewTag(comm,&tagj);
1308:   PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1309:   PetscMalloc1(merge->nsend+1,&swaits);
1310:   for (proc=0, k=0; proc<size; proc++) {
1311:     if (!len_s[proc]) continue;
1312:     i    = owners_co[proc];
1313:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1314:     k++;
1315:   }

1317:   /* receives and sends of coj are complete */
1318:   for (i=0; i<merge->nrecv; i++) {
1319:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1320:   }
1321:   PetscFree(rwaits);
1322:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}

1324:   /* (4) send and recv coi */
1325:   /*-----------------------*/
1326:   PetscCommGetNewTag(comm,&tagi);
1327:   PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1328:   PetscMalloc1(len+1,&buf_s);
1329:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1330:   for (proc=0,k=0; proc<size; proc++) {
1331:     if (!len_s[proc]) continue;
1332:     /* form outgoing message for i-structure:
1333:          buf_si[0]:                 nrows to be sent
1334:                [1:nrows]:           row index (global)
1335:                [nrows+1:2*nrows+1]: i-structure index
1336:     */
1337:     /*-------------------------------------------*/
1338:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1339:     buf_si_i    = buf_si + nrows+1;
1340:     buf_si[0]   = nrows;
1341:     buf_si_i[0] = 0;
1342:     nrows       = 0;
1343:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1344:       nzi = coi[i+1] - coi[i];
1345:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1346:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1347:       nrows++;
1348:     }
1349:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1350:     k++;
1351:     buf_si += len_si[proc];
1352:   }
1353:   i = merge->nrecv;
1354:   while (i--) {
1355:     MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1356:   }
1357:   PetscFree(rwaits);
1358:   if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}

1360:   PetscFree2(len_si,sstatus);
1361:   PetscFree(len_ri);
1362:   PetscFree(swaits);
1363:   PetscFree(buf_s);

1365:   /* (5) compute the local portion of C (mpi mat) */
1366:   /*----------------------------------------------*/
1367:   MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);

1369:   /* allocate pti array and free space for accumulating nonzero column info */
1370:   PetscMalloc1(pn+1,&pti);
1371:   pti[0] = 0;

1373:   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1374:   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am]));
1375:   PetscFreeSpaceGet(nnz,&free_space);
1376:   current_space = free_space;

1378:   PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1379:   for (k=0; k<merge->nrecv; k++) {
1380:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1381:     nrows       = *buf_ri_k[k];
1382:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1383:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1384:   }
1385:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1386: 
1387:   for (i=0; i<pn; i++) {
1388:     /* add pdt[i,:]*AP into lnk */
1389:     pnz = pdti[i+1] - pdti[i];
1390:     ptJ = pdtj + pdti[i];
1391:     for (j=0; j<pnz; j++) {
1392:       row  = ptJ[j];  /* row of AP == col of Pt */
1393:       apnz = api[row+1] - api[row];
1394:       Jptr = apj + api[row];
1395:       /* add non-zero cols of AP into the sorted linked list lnk */
1396:       PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1397:     }

1399:     /* add received col data into lnk */
1400:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1401:       if (i == *nextrow[k]) { /* i-th row */
1402:         nzi  = *(nextci[k]+1) - *nextci[k];
1403:         Jptr = buf_rj[k] + *nextci[k];
1404:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1405:         nextrow[k]++; nextci[k]++;
1406:       }
1407:     }
1408:     nnz = lnk[0];

1410:     /* if free space is not available, make more free space */
1411:     if (current_space->local_remaining<nnz) {
1412:       PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1413:       nspacedouble++;
1414:     }
1415:     /* copy data into free space, then initialize lnk */
1416:     PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);
1417:     MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);

1419:     current_space->array           += nnz;
1420:     current_space->local_used      += nnz;
1421:     current_space->local_remaining -= nnz;

1423:     pti[i+1] = pti[i] + nnz;
1424:   }
1425:   MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
1426:   PetscFree3(buf_ri_k,nextrow,nextci);

1428:   PetscMalloc1(pti[pn]+1,&ptj);
1429:   PetscFreeSpaceContiguous(&free_space,ptj);
1430:   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
1431:   if (afill_tmp > afill) afill = afill_tmp;
1432:   PetscLLDestroy(lnk,lnkbt);

1434:   /* (6) create symbolic parallel matrix Cmpi */
1435:   /*------------------------------------------*/
1436:   MatCreate(comm,&Cmpi);
1437:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1438:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
1439:   MatSetType(Cmpi,MATMPIAIJ);
1440:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1441:   MatPreallocateFinalize(dnz,onz);
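  /* dnz/onz, accumulated by MatPreallocateSet() in the row loop above, supplied the per-row counts
     of nonzeros in the diagonal and off-diagonal blocks of Cmpi; MatPreallocateFinalize() frees them */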

1443:   merge->bi        = pti;      /* Cseq->i */
1444:   merge->bj        = ptj;      /* Cseq->j */
1445:   merge->coi       = coi;      /* Co->i   */
1446:   merge->coj       = coj;      /* Co->j   */
1447:   merge->buf_ri    = buf_ri;
1448:   merge->buf_rj    = buf_rj;
1449:   merge->owners_co = owners_co;
1450:   merge->destroy   = Cmpi->ops->destroy;

1452:   /* attach the supporting struct to Cmpi for reuse */
1453:   c           = (Mat_MPIAIJ*)Cmpi->data;
1454:   c->ptap     = ptap;
1455:   ptap->api   = api;
1456:   ptap->apj   = apj;
1457:   ptap->duplicate = Cmpi->ops->duplicate;
1458:   ptap->destroy   = Cmpi->ops->destroy;

1460:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1461:   Cmpi->assembled        = PETSC_FALSE;
1462:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1463:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
1464:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap;
1465:   *C                     = Cmpi;

1467:   /* flag 'scalable' determines which implementation is used:
1468:        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
1469:        1: do sparse axpy in MatPtAPNumeric() - may be slower, but uses only a sparse array apa */
1470:   /* default is the nonscalable (dense axpy) implementation */
1471:   ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */

1473:   PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);
1474:   if (!ptap->scalable) {  /* Do dense axpy */
1475:     PetscCalloc1(pN,&ptap->apa);
1476:   } else {
1477:     PetscCalloc1(ap_rmax+1,&ptap->apa);
1478:   }
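  /* the choice above can be made at run time through the -matptap_scalable option read a few
     lines earlier; for example (illustrative command line, executable name and process count
     are assumptions):
         mpiexec -n 4 ./app -matptap_scalable
     selects the sparse-axpy (scalable) path at the cost of some speed */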

1480: #if defined(PETSC_USE_INFO)
1481:   if (pti[pn] != 0) {
1482:     PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1483:     PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);
1484:   } else {
1485:     PetscInfo(Cmpi,"Empty matrix product\n");
1486:   }
1487: #endif
1488:   return(0);
1489: }
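/* Illustrative only: a minimal sketch of the application-level calling sequence that reaches the
   symbolic routine above and the numeric routine below.  The function name, the fill estimate 2.0
   and the assumption that A keeps its nonzero pattern between calls are made up for this sketch;
   it is not part of the library implementation. */
#if 0
static PetscErrorCode ExamplePtAPUsage(Mat A,Mat P)
{
  Mat C;

  /* first call: symbolic (pattern + preallocation) and numeric product C = P^T*A*P */
  MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);
  /* ... update the numerical values of A without changing its nonzero pattern ... */
  /* later calls: numeric phase only, reusing the Mat_PtAPMPI data attached to C */
  MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);
  MatDestroy(&C);
  return(0);
}
#endif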

1491: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
1492: {
1493:   PetscErrorCode      ierr;
1494:   Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1495:   Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1496:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1497:   Mat_SeqAIJ          *p_loc,*p_oth;
1498:   Mat_PtAPMPI         *ptap;
1499:   Mat_Merge_SeqsToMPI *merge;
1500:   PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1501:   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1502:   PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
1503:   MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1504:   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1505:   MPI_Comm            comm;
1506:   PetscMPIInt         size,rank,taga,*len_s;
1507:   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1508:   PetscInt            **buf_ri,**buf_rj;
1509:   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1510:   MPI_Request         *s_waits,*r_waits;
1511:   MPI_Status          *status;
1512:   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1513:   PetscInt            *api,*apj,*coi,*coj;
1514:   PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1515:   PetscBool           scalable;
1516: #if defined(PTAP_PROFILE)
1517:   PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1518: #endif

1521:   PetscObjectGetComm((PetscObject)C,&comm);
1522: #if defined(PTAP_PROFILE)
1523:   PetscTime(&t0);
1524: #endif
1525:   MPI_Comm_size(comm,&size);
1526:   MPI_Comm_rank(comm,&rank);

1528:   ptap = c->ptap;
1529:   if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1530:   merge    = ptap->merge;
1531:   apa      = ptap->apa;
1532:   scalable = ptap->scalable;

1534:   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1535:   /*-----------------------------------------------------*/
1536:   if (ptap->reuse == MAT_INITIAL_MATRIX) {
1537:     /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1538:     ptap->reuse = MAT_REUSE_MATRIX;
1539:   } else { /* update numerical values of P_oth and P_loc */
1540:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1541:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1542:   }
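  /* first numeric call: P_loc/P_oth built by the symbolic phase are used as-is; ptap->reuse is
     flipped to MAT_REUSE_MATRIX so that subsequent calls only refresh their numerical values */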
1543: #if defined(PTAP_PROFILE)
1544:   PetscTime(&t1);
1545:   eP = t1-t0;
1546: #endif
1547:   /*
1548:   printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
1549:          a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
1550:          ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
1551:          ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
1552:    */

1554:   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
1555:   /*--------------------------------------------------------------*/
1556:   /* get data from symbolic products */
1557:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1558:   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1559:   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
1560:   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

1562:   coi  = merge->coi; coj = merge->coj;
1563:   PetscCalloc1(coi[pon]+1,&coa);

1565:   bi     = merge->bi; bj = merge->bj;
1566:   owners = merge->rowmap->range;
1567:   PetscCalloc1(bi[cm]+1,&ba);  /* ba: Cseq->a */

1569:   api = ptap->api; apj = ptap->apj;

1571:   if (!scalable) { /* Do dense axpy on apa (length pN, stores A[i,:]*P) - nonscalable, but faster (can take about 1/3 of the time of the scalable path) */
1572:     PetscInfo(C,"Using non-scalable dense axpy\n");
1573: #if defined(PTAP_PROFILE)
1574:     PetscTime(&t1);
1575: #endif
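    /* apa is a dense work array of length pN (global number of columns of P), so entries of
       A[i,:]*P can be accumulated by global column index and read back through apJ */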
1576:     for (i=0; i<am; i++) {
1577: #if defined(PTAP_PROFILE)
1578:       PetscTime(&t2_0);
1579: #endif
1580:       /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1581:       /*------------------------------------------------------------*/
1582:       apJ = apj + api[i];

1584:       /* diagonal portion of A */
1585:       anz = adi[i+1] - adi[i];
1586:       adj = ad->j + adi[i];
1587:       ada = ad->a + adi[i];
1588:       for (j=0; j<anz; j++) {
1589:         row = adj[j];
1590:         pnz = pi_loc[row+1] - pi_loc[row];
1591:         pj  = pj_loc + pi_loc[row];
1592:         pa  = pa_loc + pi_loc[row];

1594:         /* perform dense axpy */
1595:         valtmp = ada[j];
1596:         for (k=0; k<pnz; k++) {
1597:           apa[pj[k]] += valtmp*pa[k];
1598:         }
1599:         PetscLogFlops(2.0*pnz);
1600:       }

1602:       /* off-diagonal portion of A */
1603:       anz = aoi[i+1] - aoi[i];
1604:       aoj = ao->j + aoi[i];
1605:       aoa = ao->a + aoi[i];
1606:       for (j=0; j<anz; j++) {
1607:         row = aoj[j];
1608:         pnz = pi_oth[row+1] - pi_oth[row];
1609:         pj  = pj_oth + pi_oth[row];
1610:         pa  = pa_oth + pi_oth[row];

1612:         /* perform dense axpy */
1613:         valtmp = aoa[j];
1614:         for (k=0; k<pnz; k++) {
1615:           apa[pj[k]] += valtmp*pa[k];
1616:         }
1617:         PetscLogFlops(2.0*pnz);
1618:       }
1619: #if defined(PTAP_PROFILE)
1620:       PetscTime(&t2_1);
1621:       et2_AP += t2_1 - t2_0;
1622: #endif

1624:       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1625:       /*--------------------------------------------------------------*/
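      /* column i of P (= row i of P^T) scales row i of AP: contributions from the off-diagonal
         block po land in Co (rows of C owned by other processes, exchanged in step 3), while
         contributions from the diagonal block pd land in the local array ba (Cd) */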
1626:       apnz = api[i+1] - api[i];
1627:       /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
1628:       pnz = po->i[i+1] - po->i[i];
1629:       poJ = po->j + po->i[i];
1630:       pA  = po->a + po->i[i];
1631:       for (j=0; j<pnz; j++) {
1632:         row = poJ[j];
1633:         cnz = coi[row+1] - coi[row];
1634:         cj  = coj + coi[row];
1635:         ca  = coa + coi[row];
1636:         /* perform dense axpy */
1637:         valtmp = pA[j];
1638:         for (k=0; k<cnz; k++) {
1639:           ca[k] += valtmp*apa[cj[k]];
1640:         }
1641:         PetscLogFlops(2.0*cnz);
1642:       }
1643:       /* put the value into Cd (diagonal part) */
1644:       pnz = pd->i[i+1] - pd->i[i];
1645:       pdJ = pd->j + pd->i[i];
1646:       pA  = pd->a + pd->i[i];
1647:       for (j=0; j<pnz; j++) {
1648:         row = pdJ[j];
1649:         cnz = bi[row+1] - bi[row];
1650:         cj  = bj + bi[row];
1651:         ca  = ba + bi[row];
1652:         /* perform dense axpy */
1653:         valtmp = pA[j];
1654:         for (k=0; k<cnz; k++) {
1655:           ca[k] += valtmp*apa[cj[k]];
1656:         }
1657:         PetscLogFlops(2.0*cnz);
1658:       }
1659: 
1660:       /* zero the current row of A*P */
1661:       for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1662: #if defined(PTAP_PROFILE)
1663:       PetscTime(&t2_2);
1664:       ePtAP += t2_2 - t2_1;
1665: #endif
1666:     }
1667:   } else { /* Do sparse axpy on apa (length ap_rmax, stores A[i,:]*P) - scalable, but slower */
1668:     PetscInfo(C,"Using scalable sparse axpy\n");
1669:     /*-----------------------------------------------------------------------------------------*/
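    /* here apa holds only the nonzeros of the current row of A*P (at most ap_rmax of them),
       indexed by position within the row; the merge loops below assume the column indices in
       apJ, pj and cj are sorted in increasing order, as produced by the symbolic phase */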
1670:     pA=pa_loc;
1671:     for (i=0; i<am; i++) {
1672: #if defined(PTAP_PROFILE)
1673:       PetscTime(&t2_0);
1674: #endif
1675:       /* form i-th sparse row of A*P */
1676:       apnz = api[i+1] - api[i];
1677:       apJ  = apj + api[i];
1678:       /* diagonal portion of A */
1679:       anz = adi[i+1] - adi[i];
1680:       adj = ad->j + adi[i];
1681:       ada = ad->a + adi[i];
1682:       for (j=0; j<anz; j++) {
1683:         row    = adj[j];
1684:         pnz    = pi_loc[row+1] - pi_loc[row];
1685:         pj     = pj_loc + pi_loc[row];
1686:         pa     = pa_loc + pi_loc[row];
1687:         valtmp = ada[j];
1688:         nextp  = 0;
1689:         for (k=0; nextp<pnz; k++) {
1690:           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1691:             apa[k] += valtmp*pa[nextp++];
1692:           }
1693:         }
1694:         PetscLogFlops(2.0*pnz);
1695:       }
1696:       /* off-diagonal portion of A */
1697:       anz = aoi[i+1] - aoi[i];
1698:       aoj = ao->j + aoi[i];
1699:       aoa = ao->a + aoi[i];
1700:       for (j=0; j<anz; j++) {
1701:         row    = aoj[j];
1702:         pnz    = pi_oth[row+1] - pi_oth[row];
1703:         pj     = pj_oth + pi_oth[row];
1704:         pa     = pa_oth + pi_oth[row];
1705:         valtmp = aoa[j];
1706:         nextp  = 0;
1707:         for (k=0; nextp<pnz; k++) {
1708:           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1709:             apa[k] += valtmp*pa[nextp++];
1710:           }
1711:         }
1712:         PetscLogFlops(2.0*pnz);
1713:       }
1714: #if defined(PTAP_PROFILE)
1715:       PetscTime(&t2_1);
1716:       et2_AP += t2_1 - t2_0;
1717: #endif

1719:       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1720:       /*--------------------------------------------------------------*/
1721:       pnz = pi_loc[i+1] - pi_loc[i];
1722:       pJ  = pj_loc + pi_loc[i];
1723:       for (j=0; j<pnz; j++) {
1724:         nextap = 0;
1725:         row    = pJ[j]; /* global index */
1726:         if (row < pcstart || row >=pcend) { /* put the value into Co */
1727:           row = *poJ;
1728:           cj  = coj + coi[row];
1729:           ca  = coa + coi[row]; poJ++;
1730:         } else {                            /* put the value into Cd */
1731:           row = *pdJ;
1732:           cj  = bj + bi[row];
1733:           ca  = ba + bi[row]; pdJ++;
1734:         }
1735:         valtmp = pA[j];
1736:         for (k=0; nextap<apnz; k++) {
1737:           if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1738:         }
1739:         PetscLogFlops(2.0*apnz);
1740:       }
1741:       pA += pnz;
1742:       /* zero the current row info for A*P */
1743:       PetscMemzero(apa,apnz*sizeof(MatScalar));
1744: #if defined(PTAP_PROFILE)
1745:       PetscTime(&t2_2);
1746:       ePtAP += t2_2 - t2_1;
1747: #endif
1748:     }
1749:   }
1750: #if defined(PTAP_PROFILE)
1751:   PetscTime(&t2);
1752: #endif

1754:   /* 3) send and recv matrix values coa */
1755:   /*------------------------------------*/
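  /* each process posts receives for the remotely computed rows of C it owns (lengths merge->len_r)
     and sends its Co values: owners_co[proc] is the first Co row destined for process proc, so the
     contiguous block of len_s[proc] values starting at coa+coi[owners_co[proc]] goes in one message */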
1756:   buf_ri = merge->buf_ri;
1757:   buf_rj = merge->buf_rj;
1758:   len_s  = merge->len_s;
1759:   PetscCommGetNewTag(comm,&taga);
1760:   PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);

1762:   PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1763:   for (proc=0,k=0; proc<size; proc++) {
1764:     if (!len_s[proc]) continue;
1765:     i    = merge->owners_co[proc];
1766:     MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1767:     k++;
1768:   }
1769:   if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1770:   if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}

1772:   PetscFree2(s_waits,status);
1773:   PetscFree(r_waits);
1774:   PetscFree(coa);
1775: #if defined(PTAP_PROFILE)
1776:   PetscTime(&t3);
1777: #endif

1779:   /* 4) insert local Cseq and received values into Cmpi */
1780:   /*------------------------------------------------------*/
1781:   PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1782:   for (k=0; k<merge->nrecv; k++) {
1783:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1784:     nrows       = *(buf_ri_k[k]);
1785:     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1786:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1787:   }
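  /* merge: for each local row of C, values received from other processes are added into ba_i by
     matching their (sorted) column indices against bj_i; the completed row is then inserted into C
     with MatSetValues() */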

1789:   for (i=0; i<cm; i++) {
1790:     row  = owners[rank] + i; /* global row index of C_seq */
1791:     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1792:     ba_i = ba + bi[i];
1793:     bnz  = bi[i+1] - bi[i];
1794:     /* add received vals into ba */
1795:     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1796:       /* i-th row */
1797:       if (i == *nextrow[k]) {
1798:         cnz    = *(nextci[k]+1) - *nextci[k];
1799:         cj     = buf_rj[k] + *(nextci[k]);
1800:         ca     = abuf_r[k] + *(nextci[k]);
1801:         nextcj = 0;
1802:         for (j=0; nextcj<cnz; j++) {
1803:           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1804:             ba_i[j] += ca[nextcj++];
1805:           }
1806:         }
1807:         nextrow[k]++; nextci[k]++;
1808:         PetscLogFlops(2.0*cnz);
1809:       }
1810:     }
1811:     MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1812:   }
1813:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1814:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1816:   PetscFree(ba);
1817:   PetscFree(abuf_r[0]);
1818:   PetscFree(abuf_r);
1819:   PetscFree3(buf_ri_k,nextrow,nextci);
1820: #if defined(PTAP_PROFILE)
1821:   PetscTime(&t4);
1822:   if (rank==1) {
1823:     PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);
1824:   }
1825: #endif
1826:   return(0);
1827: }