Actual source code: mpiptap.c
petsc-3.8.3 2017-12-09
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
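/*
   Overview of the routines below:

     MatPtAP_MPIAIJ_MPIAIJ()                           - driver: runs the symbolic stage for MAT_INITIAL_MATRIX, then the numeric stage
     MatPtAPSymbolic/Numeric_MPIAIJ_MPIAIJ_scalable()  - memory use per row bounded by Crmax (no arrays of length P->cmap->N in the numeric stage)
     MatPtAPSymbolic/Numeric_MPIAIJ_MPIAIJ()           - same product, but uses a dense row accumulator apa of length pN (faster, nonscalable)
     MatPtAPSymbolic/Numeric_MPIAIJ_MPIAIJ_ptap()      - alternative path selected with -matrap 0: forms P^T*(A*P) and merges the sequential pieces into the MPI matrix

   The first two paths form R = P^T locally (Rd = Pd^T, Ro = Po^T), compute AP_loc = A_loc*P,
   then C_loc = Rd*AP_loc and C_oth = Ro*AP_loc, and assemble C with MatSetValues(...,ADD_VALUES).
*/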
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
13: /* #define PTAP_PROFILE */
15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
16: {
17: PetscErrorCode ierr;
18: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
19: Mat_PtAPMPI *ptap=a->ptap;
20: PetscBool iascii;
21: PetscViewerFormat format;
24: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
25: if (iascii) {
26: PetscViewerGetFormat(viewer,&format);
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
30: } else if (ptap->algType == 1) {
31: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
32: }
33: }
34: }
35: (ptap->view)(A,viewer);
36: return(0);
37: }
39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
40: {
42: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
43: Mat_PtAPMPI *ptap=a->ptap;
46: if (ptap) {
47: Mat_Merge_SeqsToMPI *merge=ptap->merge;
48: PetscFree2(ptap->startsj_s,ptap->startsj_r);
49: PetscFree(ptap->bufa);
50: MatDestroy(&ptap->P_loc);
51: MatDestroy(&ptap->P_oth);
52: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
53: MatDestroy(&ptap->Rd);
54: MatDestroy(&ptap->Ro);
55: if (ptap->AP_loc) { /* used by alg_rap */
56: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
57: PetscFree(ap->i);
58: PetscFree2(ap->j,ap->a);
59: MatDestroy(&ptap->AP_loc);
60: } else { /* used by alg_ptap */
61: PetscFree(ptap->api);
62: PetscFree(ptap->apj);
63: }
64: MatDestroy(&ptap->C_loc);
65: MatDestroy(&ptap->C_oth);
66: if (ptap->apa) {PetscFree(ptap->apa);}
68: if (merge) { /* used by alg_ptap */
69: PetscFree(merge->id_r);
70: PetscFree(merge->len_s);
71: PetscFree(merge->len_r);
72: PetscFree(merge->bi);
73: PetscFree(merge->bj);
74: PetscFree(merge->buf_ri[0]);
75: PetscFree(merge->buf_ri);
76: PetscFree(merge->buf_rj[0]);
77: PetscFree(merge->buf_rj);
78: PetscFree(merge->coi);
79: PetscFree(merge->coj);
80: PetscFree(merge->owners_co);
81: PetscLayoutDestroy(&merge->rowmap);
82: PetscFree(ptap->merge);
83: }
85: ptap->destroy(A);
86: PetscFree(ptap);
87: }
88: return(0);
89: }
91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
92: {
94: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
95: Mat_PtAPMPI *ptap = a->ptap;
98: (*ptap->duplicate)(A,op,M);
99: (*M)->ops->destroy = ptap->destroy;
100: (*M)->ops->duplicate = ptap->duplicate;
101: (*M)->ops->view = ptap->view;
102: return(0);
103: }
105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108: PetscBool rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */
109: MPI_Comm comm;
112: /* check if matrix local sizes are compatible */
113: PetscObjectGetComm((PetscObject)A,&comm);
114: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
115: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
117: PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);
118: if (scall == MAT_INITIAL_MATRIX) {
119: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
120: if (rap) { /* do R=P^T locally, then C=R*A*P */
121: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
122: } else { /* do P^T*A*P */
123: MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);
124: }
125: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
126: }
127: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
128: (*(*C)->ops->ptapnumeric)(A,P,*C);
129: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
130: return(0);
131: }
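/*
   A minimal user-level sketch of how this driver is reached (assuming A and P are assembled
   MATMPIAIJ matrices with compatible layouts; 2.0 is an illustrative fill estimate). The
   algorithm can be selected at run time with -matptap_via <scalable|nonscalable|hypre>
   (hypre only when PETSc is configured with hypre):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);     symbolic + numeric stages
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);       numeric stage only, reusing c->ptap after the values of A change
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/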
133: #if defined(PETSC_HAVE_HYPRE)
134: PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
135: #endif
137: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
138: {
139: PetscErrorCode ierr;
140: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
141: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
142: Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq;
143: Mat_PtAPMPI *ptap = c->ptap;
144: Mat AP_loc,C_loc,C_oth;
145: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
146: PetscScalar *apa;
147: const PetscInt *cols;
148: const PetscScalar *vals;
151: MatZeroEntries(C);
153: /* 1) get R = Pd^T,Ro = Po^T */
154: if (ptap->reuse == MAT_REUSE_MATRIX) {
155: MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
156: MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
157: }
159: /* 2) get AP_loc */
160: AP_loc = ptap->AP_loc;
161: ap = (Mat_SeqAIJ*)AP_loc->data;
163: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
164: /*-----------------------------------------------------*/
165: if (ptap->reuse == MAT_REUSE_MATRIX) {
166: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
167: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
168: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
169: }
171: /* 2-2) compute numeric A_loc*P - dominating part */
172: /* ---------------------------------------------- */
173: /* get data from symbolic products */
174: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
175: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
176: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
178: api = ap->i;
179: apj = ap->j;
180: for (i=0; i<am; i++) {
181: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
182: apnz = api[i+1] - api[i];
183: apa = ap->a + api[i];
184: PetscMemzero(apa,sizeof(PetscScalar)*apnz);
185: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
186: PetscLogFlops(2.0*apnz);
187: }
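/*
   At this point ap->a holds the values of AP_loc = A_loc*P: AProw_scalable() accumulates
   Ad[i,:]*P_loc + Ao[i,:]*P_oth directly into the (api,apj,apa) slots preallocated by the
   symbolic stage, so no dense work array of length pN is needed in this routine.
*/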
189: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
190: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
191: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
193: C_loc = ptap->C_loc;
194: C_oth = ptap->C_oth;
196: /* add C_loc and Co to C */
197: MatGetOwnershipRange(C,&rstart,&rend);
199: /* C_loc -> C */
200: cm = C_loc->rmap->N;
201: c_seq = (Mat_SeqAIJ*)C_loc->data;
202: cols = c_seq->j;
203: vals = c_seq->a;
204: for (i=0; i<cm; i++) {
205: ncols = c_seq->i[i+1] - c_seq->i[i];
206: row = rstart + i;
207: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
208: cols += ncols; vals += ncols;
209: }
211: /* Co -> C, off-processor part */
212: cm = C_oth->rmap->N;
213: c_seq = (Mat_SeqAIJ*)C_oth->data;
214: cols = c_seq->j;
215: vals = c_seq->a;
216: for (i=0; i<cm; i++) {
217: ncols = c_seq->i[i+1] - c_seq->i[i];
218: row = p->garray[i];
219: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
220: cols += ncols; vals += ncols;
221: }
222: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
223: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
225: ptap->reuse = MAT_REUSE_MATRIX;
226: return(0);
227: }
229: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
230: {
231: PetscErrorCode ierr;
232: Mat_PtAPMPI *ptap;
233: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
234: MPI_Comm comm;
235: PetscMPIInt size,rank;
236: Mat Cmpi,P_loc,P_oth;
237: PetscFreeSpaceList free_space=NULL,current_space=NULL;
238: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
239: PetscInt *lnk,i,k,pnz,row,nsend;
240: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
241: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
242: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
243: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
244: MPI_Request *swaits,*rwaits;
245: MPI_Status *sstatus,rstatus;
246: PetscLayout rowmap;
247: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
248: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
249: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
250: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
251: PetscScalar *apv;
252: PetscTable ta;
253: #if defined(PETSC_USE_INFO)
254: PetscReal apfill;
255: #endif
258: PetscObjectGetComm((PetscObject)A,&comm);
259: MPI_Comm_size(comm,&size);
260: MPI_Comm_rank(comm,&rank);
262: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
264: /* create symbolic parallel matrix Cmpi */
265: MatCreate(comm,&Cmpi);
266: MatSetType(Cmpi,MATMPIAIJ);
268: /* create struct Mat_PtAPMPI and attach it to C later */
269: PetscNew(&ptap);
270: ptap->reuse = MAT_INITIAL_MATRIX;
271: ptap->algType = 0;
273: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
274: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
275: /* get P_loc by taking all local rows of P */
276: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
278: ptap->P_loc = P_loc;
279: ptap->P_oth = P_oth;
281: /* (0) compute Rd = Pd^T, Ro = Po^T */
282: /* --------------------------------- */
283: MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
284: MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
286: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
287: /* ----------------------------------------------------------------- */
288: p_loc = (Mat_SeqAIJ*)P_loc->data;
289: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
291: /* create and initialize a linked list */
292: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
293: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
294: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
295: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
297: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
299: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
300: if (ao) {
301: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
302: } else {
303: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
304: }
305: current_space = free_space;
306: nspacedouble = 0;
308: PetscMalloc1(am+1,&api);
309: api[0] = 0;
310: for (i=0; i<am; i++) {
311: /* diagonal portion: Ad[i,:]*P */
312: ai = ad->i; pi = p_loc->i;
313: nzi = ai[i+1] - ai[i];
314: aj = ad->j + ai[i];
315: for (j=0; j<nzi; j++) {
316: row = aj[j];
317: pnz = pi[row+1] - pi[row];
318: Jptr = p_loc->j + pi[row];
319: /* add non-zero cols of P into the sorted linked list lnk */
320: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
321: }
322: /* off-diagonal portion: Ao[i,:]*P */
323: if (ao) {
324: ai = ao->i; pi = p_oth->i;
325: nzi = ai[i+1] - ai[i];
326: aj = ao->j + ai[i];
327: for (j=0; j<nzi; j++) {
328: row = aj[j];
329: pnz = pi[row+1] - pi[row];
330: Jptr = p_oth->j + pi[row];
331: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
332: }
333: }
334: apnz = lnk[0];
335: api[i+1] = api[i] + apnz;
337: /* if free space is not available, double the total space in the list */
338: if (current_space->local_remaining<apnz) {
339: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
340: nspacedouble++;
341: }
343: /* Copy data into free space, then initialize lnk */
344: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
346: current_space->array += apnz;
347: current_space->local_used += apnz;
348: current_space->local_remaining -= apnz;
349: }
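/*
   Each iteration above merges the sorted column lists of the P rows selected by the nonzeros
   of A[i,:] into the condensed linked list lnk; lnk[0] returns the number of distinct columns,
   which becomes apnz for row i of AP_loc. For example, merging P rows with columns {0,3,7}
   and {3,5} yields apnz = 4 and columns {0,3,5,7}. The _Scalable list is sized by Crmax rather
   than by pN, which is what keeps the memory footprint of this symbolic stage bounded.
*/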
350: /* Allocate space for apj and apv, initialize apj, and */
351: /* destroy list of free space and other temporary array(s) */
352: PetscMalloc2(api[am],&apj,api[am],&apv);
353: PetscFreeSpaceContiguous(&free_space,apj);
354: PetscLLCondensedDestroy_Scalable(lnk);
356: /* Create AP_loc for reuse */
357: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
359: #if defined(PETSC_USE_INFO)
360: if (ao) {
361: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
362: } else {
363: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
364: }
365: ptap->AP_loc->info.mallocs = nspacedouble;
366: ptap->AP_loc->info.fill_ratio_given = fill;
367: ptap->AP_loc->info.fill_ratio_needed = apfill;
369: if (api[am]) {
370: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
371: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
372: } else {
373: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
374: }
375: #endif
377: /* (2-1) compute symbolic Co = Ro*AP_loc */
378: /* ------------------------------------ */
379: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
381: /* (3) send coj of C_oth to other processors */
382: /* ------------------------------------------ */
383: /* determine row ownership */
384: PetscLayoutCreate(comm,&rowmap);
385: rowmap->n = pn;
386: rowmap->bs = 1;
387: PetscLayoutSetUp(rowmap);
388: owners = rowmap->range;
390: /* determine the number of messages to send, their lengths */
391: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
392: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
393: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
395: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
396: coi = c_oth->i; coj = c_oth->j;
397: con = ptap->C_oth->rmap->n;
398: proc = 0;
399: for (i=0; i<con; i++) {
400: while (prmap[i] >= owners[proc+1]) proc++;
401: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
402: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
403: }
405: len = 0; /* max length of buf_si[], see (4) */
406: owners_co[0] = 0;
407: nsend = 0;
408: for (proc=0; proc<size; proc++) {
409: owners_co[proc+1] = owners_co[proc] + len_si[proc];
410: if (len_s[proc]) {
411: nsend++;
412: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
413: len += len_si[proc];
414: }
415: }
417: /* determine the number and length of messages to receive for coi and coj */
418: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
419: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
421: /* post the Irecv and Isend of coj */
422: PetscCommGetNewTag(comm,&tagj);
423: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
424: PetscMalloc1(nsend+1,&swaits);
425: for (proc=0, k=0; proc<size; proc++) {
426: if (!len_s[proc]) continue;
427: i = owners_co[proc];
428: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
429: k++;
430: }
432: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
433: /* ---------------------------------------- */
434: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
435: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
437: /* receives coj are complete */
438: for (i=0; i<nrecv; i++) {
439: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
440: }
441: PetscFree(rwaits);
442: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
444: /* add received column indices into ta to update Crmax */
445: for (k=0; k<nrecv; k++) {/* k-th received message */
446: Jptr = buf_rj[k];
447: for (j=0; j<len_r[k]; j++) {
448: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
449: }
450: }
451: PetscTableGetCount(ta,&Crmax);
452: PetscTableDestroy(&ta);
454: /* (4) send and recv coi */
455: /*-----------------------*/
456: PetscCommGetNewTag(comm,&tagi);
457: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
458: PetscMalloc1(len+1,&buf_s);
459: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
460: for (proc=0,k=0; proc<size; proc++) {
461: if (!len_s[proc]) continue;
462: /* form outgoing message for i-structure:
463: buf_si[0]: nrows to be sent
464: [1:nrows]: row index (global)
465: [nrows+1:2*nrows+1]: i-structure index
466: */
467: /*-------------------------------------------*/
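/*
   Example of this layout with hypothetical numbers: if two rows of Co with local indices 4 and 9
   and nonzero counts 3 and 5 are sent to [proc], then len_si[proc] = 2*(2+1) = 6 and
     buf_si = [2, 4, 9, 0, 3, 8]
   i.e. buf_si[0] = nrows, buf_si[1..nrows] = local row indices, and the trailing nrows+1 entries
   are cumulative nonzero offsets into the matching coj message.
*/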
468: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
469: buf_si_i = buf_si + nrows+1;
470: buf_si[0] = nrows;
471: buf_si_i[0] = 0;
472: nrows = 0;
473: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
474: nzi = coi[i+1] - coi[i];
475: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
476: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
477: nrows++;
478: }
479: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
480: k++;
481: buf_si += len_si[proc];
482: }
483: for (i=0; i<nrecv; i++) {
484: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
485: }
486: PetscFree(rwaits);
487: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
489: PetscFree4(len_s,len_si,sstatus,owners_co);
490: PetscFree(len_ri);
491: PetscFree(swaits);
492: PetscFree(buf_s);
494: /* (5) compute the local portion of Cmpi */
495: /* ------------------------------------------ */
496: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
497: PetscFreeSpaceGet(Crmax,&free_space);
498: current_space = free_space;
500: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
501: for (k=0; k<nrecv; k++) {
502: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
503: nrows = *buf_ri_k[k];
504: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
505: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
506: }
508: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
509: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
510: for (i=0; i<pn; i++) {
511: /* add C_loc into Cmpi */
512: nzi = c_loc->i[i+1] - c_loc->i[i];
513: Jptr = c_loc->j + c_loc->i[i];
514: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
516: /* add received col data into lnk */
517: for (k=0; k<nrecv; k++) { /* k-th received message */
518: if (i == *nextrow[k]) { /* i-th row */
519: nzi = *(nextci[k]+1) - *nextci[k];
520: Jptr = buf_rj[k] + *nextci[k];
521: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
522: nextrow[k]++; nextci[k]++;
523: }
524: }
525: nzi = lnk[0];
527: /* copy data into free space, then initialize lnk */
528: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
529: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
530: }
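/*
   The loop above computes, for each local row i of C = P^T*A*P, the union of the column indices
   of C_loc[i,:] (the Rd*AP_loc part computed here) and of every received C_oth row that another
   process computed for global row i+owners[rank] (buf_ri/buf_rj). The resulting counts feed
   MatPreallocateSet(), so Cmpi is preallocated exactly and the numeric stage can add values with
   ADD_VALUES without further mallocs.
*/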
531: PetscFree3(buf_ri_k,nextrow,nextci);
532: PetscLLCondensedDestroy_Scalable(lnk);
533: PetscFreeSpaceDestroy(free_space);
535: /* local sizes and preallocation */
536: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
537: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
538: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
539: MatPreallocateFinalize(dnz,onz);
541: /* members in merge */
542: PetscFree(id_r);
543: PetscFree(len_r);
544: PetscFree(buf_ri[0]);
545: PetscFree(buf_ri);
546: PetscFree(buf_rj[0]);
547: PetscFree(buf_rj);
548: PetscLayoutDestroy(&rowmap);
550: /* attach the supporting struct to Cmpi for reuse */
551: c = (Mat_MPIAIJ*)Cmpi->data;
552: c->ptap = ptap;
553: ptap->duplicate = Cmpi->ops->duplicate;
554: ptap->destroy = Cmpi->ops->destroy;
555: ptap->view = Cmpi->ops->view;
557: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
558: Cmpi->assembled = PETSC_FALSE;
559: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
560: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
561: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
562: *C = Cmpi;
563: return(0);
564: }
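/*
   The nonscalable variant below follows the same outline. It differs in the linked-list flavour
   (PetscLLCondensedCreate() with a PetscBT of length pN instead of the _Scalable list sized by
   Crmax), in allocating the dense accumulator ptap->apa of length pN for the numeric stage, and
   in performing the run-time algorithm selection via -matptap_via.
*/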
566: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
567: {
568: PetscErrorCode ierr;
569: Mat_PtAPMPI *ptap;
570: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
571: MPI_Comm comm;
572: PetscMPIInt size,rank;
573: Mat Cmpi;
574: PetscFreeSpaceList free_space=NULL,current_space=NULL;
575: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
576: PetscInt *lnk,i,k,pnz,row,nsend;
577: PetscBT lnkbt;
578: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
579: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
580: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
581: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
582: MPI_Request *swaits,*rwaits;
583: MPI_Status *sstatus,rstatus;
584: PetscLayout rowmap;
585: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
586: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
587: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
588: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
589: PetscScalar *apv;
590: PetscTable ta;
591: #if defined(PETSC_HAVE_HYPRE)
592: const char *algTypes[3] = {"scalable","nonscalable","hypre"};
593: PetscInt nalg = 3;
594: #else
595: const char *algTypes[2] = {"scalable","nonscalable"};
596: PetscInt nalg = 2;
597: #endif
598: PetscInt alg = 1; /* set default algorithm */
599: #if defined(PETSC_USE_INFO)
600: PetscReal apfill;
601: #endif
602: #if defined(PTAP_PROFILE)
603: PetscLogDouble t0,t1,t11,t12,t2,t3,t4;
604: #endif
605: PetscBool flg;
608: PetscObjectGetComm((PetscObject)A,&comm);
610: /* pick an algorithm */
611: PetscObjectOptionsBegin((PetscObject)A);
612: PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
613: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,&flg);
614: PetscOptionsEnd();
616: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
617: MatInfo Ainfo,Pinfo;
618: PetscInt nz_local;
619: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
621: MatGetInfo(A,MAT_LOCAL,&Ainfo);
622: MatGetInfo(P,MAT_LOCAL,&Pinfo);
623: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
625: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
626: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
628: if (alg_scalable) {
629: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable algorithm */
630: }
631: }
633: if (alg == 0) {
634: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
635: (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
636: return(0);
638: #if defined(PETSC_HAVE_HYPRE)
639: } else if (alg == 2) {
640: /* Use boomerAMGBuildCoarseOperator */
641: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
642: return(0);
643: #endif
644: }
646: #if defined(PTAP_PROFILE)
647: PetscTime(&t0);
648: #endif
650: MPI_Comm_size(comm,&size);
651: MPI_Comm_rank(comm,&rank);
653: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
655: /* create symbolic parallel matrix Cmpi */
656: MatCreate(comm,&Cmpi);
657: MatSetType(Cmpi,MATMPIAIJ);
659: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
660: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
662: /* create struct Mat_PtAPMPI and attach it to C later */
663: PetscNew(&ptap);
664: ptap->reuse = MAT_INITIAL_MATRIX;
665: ptap->algType = alg;
667: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
668: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
669: /* get P_loc by taking all local rows of P */
670: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
672: /* (0) compute Rd = Pd^T, Ro = Po^T */
673: /* --------------------------------- */
674: MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
675: MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
676: #if defined(PTAP_PROFILE)
677: PetscTime(&t11);
678: #endif
680: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
681: /* ----------------------------------------------------------------- */
682: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
683: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
685: /* create and initialize a linked list */
686: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
687: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
688: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
689: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
690: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
692: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
694: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
695: if (ao) {
696: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
697: } else {
698: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
699: }
700: current_space = free_space;
701: nspacedouble = 0;
703: PetscMalloc1(am+1,&api);
704: api[0] = 0;
705: for (i=0; i<am; i++) {
706: /* diagonal portion: Ad[i,:]*P */
707: ai = ad->i; pi = p_loc->i;
708: nzi = ai[i+1] - ai[i];
709: aj = ad->j + ai[i];
710: for (j=0; j<nzi; j++) {
711: row = aj[j];
712: pnz = pi[row+1] - pi[row];
713: Jptr = p_loc->j + pi[row];
714: /* add non-zero cols of P into the sorted linked list lnk */
715: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
716: }
717: /* off-diagonal portion: Ao[i,:]*P */
718: if (ao) {
719: ai = ao->i; pi = p_oth->i;
720: nzi = ai[i+1] - ai[i];
721: aj = ao->j + ai[i];
722: for (j=0; j<nzi; j++) {
723: row = aj[j];
724: pnz = pi[row+1] - pi[row];
725: Jptr = p_oth->j + pi[row];
726: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
727: }
728: }
729: apnz = lnk[0];
730: api[i+1] = api[i] + apnz;
731: if (ap_rmax < apnz) ap_rmax = apnz;
733: /* if free space is not available, double the total space in the list */
734: if (current_space->local_remaining<apnz) {
735: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
736: nspacedouble++;
737: }
739: /* Copy data into free space, then initialize lnk */
740: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
742: current_space->array += apnz;
743: current_space->local_used += apnz;
744: current_space->local_remaining -= apnz;
745: }
746: /* Allocate space for apj and apv, initialize apj, and */
747: /* destroy list of free space and other temporary array(s) */
748: PetscMalloc2(api[am],&apj,api[am],&apv);
749: PetscFreeSpaceContiguous(&free_space,apj);
750: PetscLLDestroy(lnk,lnkbt);
752: /* Create AP_loc for reuse */
753: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
755: #if defined(PETSC_USE_INFO)
756: if (ao) {
757: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
758: } else {
759: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
760: }
761: ptap->AP_loc->info.mallocs = nspacedouble;
762: ptap->AP_loc->info.fill_ratio_given = fill;
763: ptap->AP_loc->info.fill_ratio_needed = apfill;
765: if (api[am]) {
766: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
767: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
768: } else {
769: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
770: }
771: #endif
773: #if defined(PTAP_PROFILE)
774: PetscTime(&t12);
775: #endif
777: /* (2-1) compute symbolic Co = Ro*AP_loc */
778: /* ------------------------------------ */
779: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
780: #if defined(PTAP_PROFILE)
781: PetscTime(&t1);
782: #endif
784: /* (3) send coj of C_oth to other processors */
785: /* ------------------------------------------ */
786: /* determine row ownership */
787: PetscLayoutCreate(comm,&rowmap);
788: rowmap->n = pn;
789: rowmap->bs = 1;
790: PetscLayoutSetUp(rowmap);
791: owners = rowmap->range;
793: /* determine the number of messages to send, their lengths */
794: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
795: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
796: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
798: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
799: coi = c_oth->i; coj = c_oth->j;
800: con = ptap->C_oth->rmap->n;
801: proc = 0;
802: for (i=0; i<con; i++) {
803: while (prmap[i] >= owners[proc+1]) proc++;
804: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
805: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
806: }
808: len = 0; /* max length of buf_si[], see (4) */
809: owners_co[0] = 0;
810: nsend = 0;
811: for (proc=0; proc<size; proc++) {
812: owners_co[proc+1] = owners_co[proc] + len_si[proc];
813: if (len_s[proc]) {
814: nsend++;
815: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
816: len += len_si[proc];
817: }
818: }
820: /* determine the number and length of messages to receive for coi and coj */
821: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
822: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
824: /* post the Irecv and Isend of coj */
825: PetscCommGetNewTag(comm,&tagj);
826: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
827: PetscMalloc1(nsend+1,&swaits);
828: for (proc=0, k=0; proc<size; proc++) {
829: if (!len_s[proc]) continue;
830: i = owners_co[proc];
831: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
832: k++;
833: }
835: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
836: /* ---------------------------------------- */
837: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
838: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
840: /* receives coj are complete */
841: for (i=0; i<nrecv; i++) {
842: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
843: }
844: PetscFree(rwaits);
845: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
847: /* add received column indices into ta to update Crmax */
848: for (k=0; k<nrecv; k++) {/* k-th received message */
849: Jptr = buf_rj[k];
850: for (j=0; j<len_r[k]; j++) {
851: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
852: }
853: }
854: PetscTableGetCount(ta,&Crmax);
855: PetscTableDestroy(&ta);
857: /* (4) send and recv coi */
858: /*-----------------------*/
859: PetscCommGetNewTag(comm,&tagi);
860: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
861: PetscMalloc1(len+1,&buf_s);
862: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
863: for (proc=0,k=0; proc<size; proc++) {
864: if (!len_s[proc]) continue;
865: /* form outgoing message for i-structure:
866: buf_si[0]: nrows to be sent
867: [1:nrows]: row index (global)
868: [nrows+1:2*nrows+1]: i-structure index
869: */
870: /*-------------------------------------------*/
871: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
872: buf_si_i = buf_si + nrows+1;
873: buf_si[0] = nrows;
874: buf_si_i[0] = 0;
875: nrows = 0;
876: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
877: nzi = coi[i+1] - coi[i];
878: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
879: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
880: nrows++;
881: }
882: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
883: k++;
884: buf_si += len_si[proc];
885: }
886: for (i=0; i<nrecv; i++) {
887: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
888: }
889: PetscFree(rwaits);
890: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
892: PetscFree4(len_s,len_si,sstatus,owners_co);
893: PetscFree(len_ri);
894: PetscFree(swaits);
895: PetscFree(buf_s);
896: #if defined(PTAP_PROFILE)
897: PetscTime(&t2);
898: #endif
899: /* (5) compute the local portion of Cmpi */
900: /* ------------------------------------------ */
901: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
902: PetscFreeSpaceGet(Crmax,&free_space);
903: current_space = free_space;
905: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
906: for (k=0; k<nrecv; k++) {
907: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
908: nrows = *buf_ri_k[k];
909: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
910: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
911: }
913: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
914: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
915: for (i=0; i<pn; i++) {
916: /* add C_loc into Cmpi */
917: nzi = c_loc->i[i+1] - c_loc->i[i];
918: Jptr = c_loc->j + c_loc->i[i];
919: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
921: /* add received col data into lnk */
922: for (k=0; k<nrecv; k++) { /* k-th received message */
923: if (i == *nextrow[k]) { /* i-th row */
924: nzi = *(nextci[k]+1) - *nextci[k];
925: Jptr = buf_rj[k] + *nextci[k];
926: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
927: nextrow[k]++; nextci[k]++;
928: }
929: }
930: nzi = lnk[0];
932: /* copy data into free space, then initialize lnk */
933: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
934: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
935: }
936: PetscFree3(buf_ri_k,nextrow,nextci);
937: PetscLLDestroy(lnk,lnkbt);
938: PetscFreeSpaceDestroy(free_space);
939: #if defined(PTAP_PROFILE)
940: PetscTime(&t3);
941: #endif
943: /* local sizes and preallocation */
944: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
945: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
946: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
947: MatPreallocateFinalize(dnz,onz);
949: /* members in merge */
950: PetscFree(id_r);
951: PetscFree(len_r);
952: PetscFree(buf_ri[0]);
953: PetscFree(buf_ri);
954: PetscFree(buf_rj[0]);
955: PetscFree(buf_rj);
956: PetscLayoutDestroy(&rowmap);
958: /* attach the supporting struct to Cmpi for reuse */
959: c = (Mat_MPIAIJ*)Cmpi->data;
960: c->ptap = ptap;
961: ptap->duplicate = Cmpi->ops->duplicate;
962: ptap->destroy = Cmpi->ops->destroy;
963: ptap->view = Cmpi->ops->view;
965: if (alg == 1) {
966: PetscCalloc1(pN,&ptap->apa);
967: }
969: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
970: Cmpi->assembled = PETSC_FALSE;
971: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
972: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
973: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
974: *C = Cmpi;
976: #if defined(PTAP_PROFILE)
977: PetscTime(&t4);
978: if (rank == 1) {
979: printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);
980: }
981: #endif
982: return(0);
983: }
985: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
986: {
987: PetscErrorCode ierr;
988: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
989: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
990: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
991: Mat_PtAPMPI *ptap = c->ptap;
992: Mat AP_loc,C_loc,C_oth;
993: PetscInt i,rstart,rend,cm,ncols,row;
994: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
995: PetscScalar *apa;
996: const PetscInt *cols;
997: const PetscScalar *vals;
998: #if defined(PTAP_PROFILE)
999: PetscMPIInt rank;
1000: MPI_Comm comm;
1001: PetscLogDouble t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
1002: #endif
1005: MatZeroEntries(C);
1007: /* 1) get R = Pd^T,Ro = Po^T */
1008: #if defined(PTAP_PROFILE)
1009: PetscTime(&t0);
1010: #endif
1011: if (ptap->reuse == MAT_REUSE_MATRIX) {
1012: MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1013: MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1014: }
1015: #if defined(PTAP_PROFILE)
1016: PetscTime(&t1);
1017: eR = t1 - t0;
1018: #endif
1020: /* 2) get AP_loc */
1021: AP_loc = ptap->AP_loc;
1022: ap = (Mat_SeqAIJ*)AP_loc->data;
1024: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1025: /*-----------------------------------------------------*/
1026: if (ptap->reuse == MAT_REUSE_MATRIX) {
1027: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1028: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1029: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1030: }
1032: /* 2-2) compute numeric A_loc*P - dominating part */
1033: /* ---------------------------------------------- */
1034: /* get data from symbolic products */
1035: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1036: if (ptap->P_oth) {
1037: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1038: }
1039: apa = ptap->apa;
1040: api = ap->i;
1041: apj = ap->j;
1042: for (i=0; i<am; i++) {
1043: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1044: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1045: apnz = api[i+1] - api[i];
1046: for (j=0; j<apnz; j++) {
1047: col = apj[j+api[i]];
1048: ap->a[j+ap->i[i]] = apa[col];
1049: apa[col] = 0.0;
1050: }
1051: PetscLogFlops(2.0*apnz);
1052: }
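/*
   Here apa is a dense work array of length pN: AProw_nonscalable() scatters
   Ad[i,:]*P_loc + Ao[i,:]*P_oth into it by global column, and the loop above gathers the apnz
   entries listed in apj back into ap->a, zeroing each slot so apa is clean for the next row.
*/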
1053: #if defined(PTAP_PROFILE)
1054: PetscTime(&t2);
1055: eAP = t2 - t1;
1056: #endif
1058: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1059: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1060: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1061: C_loc = ptap->C_loc;
1062: C_oth = ptap->C_oth;
1063: #if defined(PTAP_PROFILE)
1064: PetscTime(&t3);
1065: eCseq = t3 - t2;
1066: #endif
1068: /* add C_loc and Co to C */
1069: MatGetOwnershipRange(C,&rstart,&rend);
1071: /* C_loc -> C */
1072: cm = C_loc->rmap->N;
1073: c_seq = (Mat_SeqAIJ*)C_loc->data;
1074: cols = c_seq->j;
1075: vals = c_seq->a;
1076: for (i=0; i<cm; i++) {
1077: ncols = c_seq->i[i+1] - c_seq->i[i];
1078: row = rstart + i;
1079: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1080: cols += ncols; vals += ncols;
1081: }
1082:
1083: /* Co -> C, off-processor part */
1084: cm = C_oth->rmap->N;
1085: c_seq = (Mat_SeqAIJ*)C_oth->data;
1086: cols = c_seq->j;
1087: vals = c_seq->a;
1088: for (i=0; i<cm; i++) {
1089: ncols = c_seq->i[i+1] - c_seq->i[i];
1090: row = p->garray[i];
1091: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1092: cols += ncols; vals += ncols;
1093: }
1094: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1095: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1096: #if defined(PTAP_PROFILE)
1097: PetscTime(&t4);
1098: eCmpi = t4 - t3;
1100: PetscObjectGetComm((PetscObject)C,&comm);
1101: MPI_Comm_rank(comm,&rank);
1102: if (rank==1) {
1103: PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);
1104: }
1105: #endif
1106: ptap->reuse = MAT_REUSE_MATRIX;
1107: return(0);
1108: }
1110: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C)
1111: {
1112: PetscErrorCode ierr;
1113: Mat Cmpi;
1114: Mat_PtAPMPI *ptap;
1115: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1116: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1117: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1118: Mat_SeqAIJ *p_loc,*p_oth;
1119: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
1120: PetscInt *adi=ad->i,*aj,*aoi=ao->i,nnz;
1121: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1122: PetscInt am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
1123: PetscBT lnkbt;
1124: MPI_Comm comm;
1125: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
1126: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1127: PetscInt len,proc,*dnz,*onz,*owners;
1128: PetscInt nzi,*pti,*ptj;
1129: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1130: MPI_Request *swaits,*rwaits;
1131: MPI_Status *sstatus,rstatus;
1132: Mat_Merge_SeqsToMPI *merge;
1133: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
1134: PetscReal afill=1.0,afill_tmp;
1137: PetscObjectGetComm((PetscObject)A,&comm);
1138: MPI_Comm_size(comm,&size);
1139: MPI_Comm_rank(comm,&rank);
1141: /* create struct Mat_PtAPMPI and attach it to C later */
1142: PetscNew(&ptap);
1143: PetscNew(&merge);
1144: ptap->merge = merge;
1145: ptap->reuse = MAT_INITIAL_MATRIX;
1147: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1148: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1150: /* get P_loc by taking all local rows of P */
1151: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1153: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1154: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1155: pi_loc = p_loc->i; pj_loc = p_loc->j;
1156: pi_oth = p_oth->i; pj_oth = p_oth->j;
1158: /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
1159: /*--------------------------------------------------------------------------*/
1160: PetscMalloc1(am+1,&api);
1161: api[0] = 0;
1163: /* create and initialize a linked list */
1164: PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);
1166: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */
1167: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
1168: current_space = free_space;
1170: for (i=0; i<am; i++) {
1171: /* diagonal portion of A */
1172: nzi = adi[i+1] - adi[i];
1173: aj = ad->j + adi[i];
1174: for (j=0; j<nzi; j++) {
1175: row = aj[j];
1176: pnz = pi_loc[row+1] - pi_loc[row];
1177: Jptr = pj_loc + pi_loc[row];
1178: /* add non-zero cols of P into the sorted linked list lnk */
1179: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1180: }
1181: /* off-diagonal portion of A */
1182: nzi = aoi[i+1] - aoi[i];
1183: aj = ao->j + aoi[i];
1184: for (j=0; j<nzi; j++) {
1185: row = aj[j];
1186: pnz = pi_oth[row+1] - pi_oth[row];
1187: Jptr = pj_oth + pi_oth[row];
1188: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1189: }
1190: apnz = lnk[0];
1191: api[i+1] = api[i] + apnz;
1192: if (ap_rmax < apnz) ap_rmax = apnz;
1194: /* if free space is not available, double the total space in the list */
1195: if (current_space->local_remaining<apnz) {
1196: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1197: nspacedouble++;
1198: }
1200: /* Copy data into free space, then initialize lnk */
1201: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1203: current_space->array += apnz;
1204: current_space->local_used += apnz;
1205: current_space->local_remaining -= apnz;
1206: }
1208: /* Allocate space for apj, initialize apj, and */
1209: /* destroy list of free space and other temporary array(s) */
1210: PetscMalloc1(api[am]+1,&apj);
1211: PetscFreeSpaceContiguous(&free_space,apj);
1212: afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
1213: if (afill_tmp > afill) afill = afill_tmp;
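/*
   Unlike the two paths above, this one keeps only the i/j structure (api,apj) of A_loc*P;
   no value array apv is allocated here, and the numeric routine recomputes each AP row on
   the fly into the work array ptap->apa.
*/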
1215: /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
1216: /*-----------------------------------------------------------------*/
1217: MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);
1219: /* then, compute symbolic Co = (p->B)^T*AP */
1220: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1221: >= (num of nonzero rows of C_seq) - pn */
1222: PetscMalloc1(pon+1,&coi);
1223: coi[0] = 0;
1225: /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
1226: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am]));
1227: PetscFreeSpaceGet(nnz,&free_space);
1228: current_space = free_space;
1230: for (i=0; i<pon; i++) {
1231: pnz = poti[i+1] - poti[i];
1232: ptJ = potj + poti[i];
1233: for (j=0; j<pnz; j++) {
1234: row = ptJ[j]; /* row of AP == col of Pot */
1235: apnz = api[row+1] - api[row];
1236: Jptr = apj + api[row];
1237: /* add non-zero cols of AP into the sorted linked list lnk */
1238: PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1239: }
1240: nnz = lnk[0];
1242: /* If free space is not available, double the total space in the list */
1243: if (current_space->local_remaining<nnz) {
1244: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1245: nspacedouble++;
1246: }
1248: /* Copy data into free space, and zero out dense rows */
1249: PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);
1251: current_space->array += nnz;
1252: current_space->local_used += nnz;
1253: current_space->local_remaining -= nnz;
1255: coi[i+1] = coi[i] + nnz;
1256: }
1257:
1258: PetscMalloc1(coi[pon],&coj);
1259: PetscFreeSpaceContiguous(&free_space,coj);
1260: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
1261: if (afill_tmp > afill) afill = afill_tmp;
1262: MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);
1264: /* (3) send j-array (coj) of Co to other processors */
1265: /*--------------------------------------------------*/
1266: PetscCalloc1(size,&merge->len_s);
1267: len_s = merge->len_s;
1268: merge->nsend = 0;
1271: /* determine row ownership */
1272: PetscLayoutCreate(comm,&merge->rowmap);
1273: merge->rowmap->n = pn;
1274: merge->rowmap->bs = 1;
1276: PetscLayoutSetUp(merge->rowmap);
1277: owners = merge->rowmap->range;
1279: /* determine the number of messages to send, their lengths */
1280: PetscMalloc2(size,&len_si,size,&sstatus);
1281: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1282: PetscMalloc1(size+2,&owners_co);
1284: proc = 0;
1285: for (i=0; i<pon; i++) {
1286: while (prmap[i] >= owners[proc+1]) proc++;
1287: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1288: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1289: }
1291: len = 0; /* max length of buf_si[], see (4) */
1292: owners_co[0] = 0;
1293: for (proc=0; proc<size; proc++) {
1294: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1295: if (len_s[proc]) {
1296: merge->nsend++;
1297: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1298: len += len_si[proc];
1299: }
1300: }
1302: /* determine the number and length of messages to receive for coi and coj */
1303: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1304: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1306: /* post the Irecv and Isend of coj */
1307: PetscCommGetNewTag(comm,&tagj);
1308: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1309: PetscMalloc1(merge->nsend+1,&swaits);
1310: for (proc=0, k=0; proc<size; proc++) {
1311: if (!len_s[proc]) continue;
1312: i = owners_co[proc];
1313: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1314: k++;
1315: }
1317: /* receives and sends of coj are complete */
1318: for (i=0; i<merge->nrecv; i++) {
1319: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1320: }
1321: PetscFree(rwaits);
1322: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1324: /* (4) send and recv coi */
1325: /*-----------------------*/
1326: PetscCommGetNewTag(comm,&tagi);
1327: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1328: PetscMalloc1(len+1,&buf_s);
1329: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1330: for (proc=0,k=0; proc<size; proc++) {
1331: if (!len_s[proc]) continue;
1332: /* form outgoing message for i-structure:
1333: buf_si[0]: nrows to be sent
1334: [1:nrows]: row index (global)
1335: [nrows+1:2*nrows+1]: i-structure index
1336: */
1337: /*-------------------------------------------*/
1338: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1339: buf_si_i = buf_si + nrows+1;
1340: buf_si[0] = nrows;
1341: buf_si_i[0] = 0;
1342: nrows = 0;
1343: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1344: nzi = coi[i+1] - coi[i];
1345: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1346: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1347: nrows++;
1348: }
1349: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1350: k++;
1351: buf_si += len_si[proc];
1352: }
1353: i = merge->nrecv;
1354: while (i--) {
1355: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1356: }
1357: PetscFree(rwaits);
1358: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1360: PetscFree2(len_si,sstatus);
1361: PetscFree(len_ri);
1362: PetscFree(swaits);
1363: PetscFree(buf_s);
1365: /* (5) compute the local portion of C (mpi mat) */
1366: /*----------------------------------------------*/
1367: MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
1369: /* allocate pti array and free space for accumulating nonzero column info */
1370: PetscMalloc1(pn+1,&pti);
1371: pti[0] = 0;
1373: /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1374: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am]));
1375: PetscFreeSpaceGet(nnz,&free_space);
1376: current_space = free_space;
1378: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1379: for (k=0; k<merge->nrecv; k++) {
1380: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1381: nrows = *buf_ri_k[k];
1382: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1383: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1384: }
1385: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1386:
1387: for (i=0; i<pn; i++) {
1388: /* add pdt[i,:]*AP into lnk */
1389: pnz = pdti[i+1] - pdti[i];
1390: ptJ = pdtj + pdti[i];
1391: for (j=0; j<pnz; j++) {
1392: row = ptJ[j]; /* row of AP == col of Pt */
1393: apnz = api[row+1] - api[row];
1394: Jptr = apj + api[row];
1395: /* add non-zero cols of AP into the sorted linked list lnk */
1396: PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);
1397: }
1399: /* add received col data into lnk */
1400: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1401: if (i == *nextrow[k]) { /* i-th row */
1402: nzi = *(nextci[k]+1) - *nextci[k];
1403: Jptr = buf_rj[k] + *nextci[k];
1404: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1405: nextrow[k]++; nextci[k]++;
1406: }
1407: }
1408: nnz = lnk[0];
1410: /* if free space is not available, make more free space */
1411: if (current_space->local_remaining<nnz) {
1412: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1413: nspacedouble++;
1414: }
1415: /* copy data into free space, then initialize lnk */
1416: PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);
1417: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1419: current_space->array += nnz;
1420: current_space->local_used += nnz;
1421: current_space->local_remaining -= nnz;
1423: pti[i+1] = pti[i] + nnz;
1424: }
1425: MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
1426: PetscFree3(buf_ri_k,nextrow,nextci);
1428: PetscMalloc1(pti[pn]+1,&ptj);
1429: PetscFreeSpaceContiguous(&free_space,ptj);
1430: afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
1431: if (afill_tmp > afill) afill = afill_tmp;
1432: PetscLLDestroy(lnk,lnkbt);
1434: /* (6) create symbolic parallel matrix Cmpi */
1435: /*------------------------------------------*/
1436: MatCreate(comm,&Cmpi);
1437: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1438: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
1439: MatSetType(Cmpi,MATMPIAIJ);
1440: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1441: MatPreallocateFinalize(dnz,onz);
1443: merge->bi = pti; /* Cseq->i */
1444: merge->bj = ptj; /* Cseq->j */
1445: merge->coi = coi; /* Co->i */
1446: merge->coj = coj; /* Co->j */
1447: merge->buf_ri = buf_ri;
1448: merge->buf_rj = buf_rj;
1449: merge->owners_co = owners_co;
1450: merge->destroy = Cmpi->ops->destroy;
1452: /* attach the supporting struct to Cmpi for reuse */
1453: c = (Mat_MPIAIJ*)Cmpi->data;
1454: c->ptap = ptap;
1455: ptap->api = api;
1456: ptap->apj = apj;
1457: ptap->duplicate = Cmpi->ops->duplicate;
1458: ptap->destroy = Cmpi->ops->destroy;
1460: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1461: Cmpi->assembled = PETSC_FALSE;
1462: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1463: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1464: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap;
1465: *C = Cmpi;
1467: /* flag 'scalable' determines which implementation is used:
1468: 0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
1469: 1: do sparse axpy in MatPtAPNumeric() - might be slower, uses a sparse array apa */
1470: /* set default scalable */
1471: ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */
1473: PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);
1474: if (!ptap->scalable) { /* Do dense axpy */
1475: PetscCalloc1(pN,&ptap->apa);
1476: } else {
1477: PetscCalloc1(ap_rmax+1,&ptap->apa);
1478: }
1480: #if defined(PETSC_USE_INFO)
1481: if (pti[pn] != 0) {
1482: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1483: PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);
1484: } else {
1485: PetscInfo(Cmpi,"Empty matrix product\n");
1486: }
1487: #endif
1488: return(0);
1489: }
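/*
   Summary of the alg_ptap path set up above: the product is formed as C_seq = P_loc^T*(A_loc*P),
   split into a diagonal piece Pd^T*AP (rows owned here) and an off-processor piece Co = Po^T*AP
   whose rows are shipped to their owners; the Mat_Merge_SeqsToMPI struct keeps the i/j structure
   and the communication buffers so that the numeric routine below only has to recompute values
   and repeat the reduction.
*/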
1491: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
1492: {
1493: PetscErrorCode ierr;
1494: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1495: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1496: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1497: Mat_SeqAIJ *p_loc,*p_oth;
1498: Mat_PtAPMPI *ptap;
1499: Mat_Merge_SeqsToMPI *merge;
1500: PetscInt *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1501: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1502: PetscInt i,j,k,anz,pnz,apnz,nextap,row,*cj;
1503: MatScalar *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1504: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1505: MPI_Comm comm;
1506: PetscMPIInt size,rank,taga,*len_s;
1507: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1508: PetscInt **buf_ri,**buf_rj;
1509: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1510: MPI_Request *s_waits,*r_waits;
1511: MPI_Status *status;
1512: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1513: PetscInt *api,*apj,*coi,*coj;
1514: PetscInt *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1515: PetscBool scalable;
1516: #if defined(PTAP_PROFILE)
1517: PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1518: #endif
1521: PetscObjectGetComm((PetscObject)C,&comm);
1522: #if defined(PTAP_PROFILE)
1523: PetscTime(&t0);
1524: #endif
1525: MPI_Comm_size(comm,&size);
1526: MPI_Comm_rank(comm,&rank);
1528: ptap = c->ptap;
1529: if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1530: merge = ptap->merge;
1531: apa = ptap->apa;
1532: scalable = ptap->scalable;
1534: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1535: /*-----------------------------------------------------*/
1536: if (ptap->reuse == MAT_INITIAL_MATRIX) {
1537: /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1538: ptap->reuse = MAT_REUSE_MATRIX;
1539: } else { /* update numerical values of P_oth and P_loc */
1540: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1541: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1542: }
1543: #if defined(PTAP_PROFILE)
1544: PetscTime(&t1);
1545: eP = t1-t0;
1546: #endif
1547: /*
1548: printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
1549: a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
1550: ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
1551: ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
1552: */
1554: /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
1555: /*--------------------------------------------------------------*/
1556: /* get data from symbolic products */
1557: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1558: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1559: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
1560: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1562: coi = merge->coi; coj = merge->coj;
1563: PetscCalloc1(coi[pon]+1,&coa);
1565: bi = merge->bi; bj = merge->bj;
1566: owners = merge->rowmap->range;
1567: PetscCalloc1(bi[cm]+1,&ba); /* ba: Cseq->a */
1569: api = ptap->api; apj = ptap->apj;
1571: if (!scalable) { /* dense axpy on apa (of length pN, stores A[i,:]*P) - nonscalable, but faster (can take roughly 1/3 of the scalable time) */
1572: PetscInfo(C,"Using non-scalable dense axpy\n");
1573: #if defined(PTAP_PROFILE)
1574: PetscTime(&t1);
1575: #endif
1576: for (i=0; i<am; i++) {
1577: #if defined(PTAP_PROFILE)
1578: PetscTime(&t2_0);
1579: #endif
1580: /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1581: /*------------------------------------------------------------*/
1582: apJ = apj + api[i];
1584: /* diagonal portion of A */
1585: anz = adi[i+1] - adi[i];
1586: adj = ad->j + adi[i];
1587: ada = ad->a + adi[i];
1588: for (j=0; j<anz; j++) {
1589: row = adj[j];
1590: pnz = pi_loc[row+1] - pi_loc[row];
1591: pj = pj_loc + pi_loc[row];
1592: pa = pa_loc + pi_loc[row];
1594: /* perform dense axpy */
1595: valtmp = ada[j];
1596: for (k=0; k<pnz; k++) {
1597: apa[pj[k]] += valtmp*pa[k];
1598: }
1599: PetscLogFlops(2.0*pnz);
1600: }
1602: /* off-diagonal portion of A */
1603: anz = aoi[i+1] - aoi[i];
1604: aoj = ao->j + aoi[i];
1605: aoa = ao->a + aoi[i];
1606: for (j=0; j<anz; j++) {
1607: row = aoj[j];
1608: pnz = pi_oth[row+1] - pi_oth[row];
1609: pj = pj_oth + pi_oth[row];
1610: pa = pa_oth + pi_oth[row];
1612: /* perform dense axpy */
1613: valtmp = aoa[j];
1614: for (k=0; k<pnz; k++) {
1615: apa[pj[k]] += valtmp*pa[k];
1616: }
1617: PetscLogFlops(2.0*pnz);
1618: }
1619: #if defined(PTAP_PROFILE)
1620: PetscTime(&t2_1);
1621: et2_AP += t2_1 - t2_0;
1622: #endif
1624: /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1625: /*--------------------------------------------------------------*/
1626: apnz = api[i+1] - api[i];
1627: /* put the values into Co=(p->B)^T*AP (off-diagonal part, sent to other processes) */
1628: pnz = po->i[i+1] - po->i[i];
1629: poJ = po->j + po->i[i];
1630: pA = po->a + po->i[i];
1631: for (j=0; j<pnz; j++) {
1632: row = poJ[j];
1633: cnz = coi[row+1] - coi[row];
1634: cj = coj + coi[row];
1635: ca = coa + coi[row];
1636: /* perform dense axpy */
1637: valtmp = pA[j];
1638: for (k=0; k<cnz; k++) {
1639: ca[k] += valtmp*apa[cj[k]];
1640: }
1641: PetscLogFlops(2.0*cnz);
1642: }
1643: /* put the value into Cd (diagonal part) */
1644: pnz = pd->i[i+1] - pd->i[i];
1645: pdJ = pd->j + pd->i[i];
1646: pA = pd->a + pd->i[i];
1647: for (j=0; j<pnz; j++) {
1648: row = pdJ[j];
1649: cnz = bi[row+1] - bi[row];
1650: cj = bj + bi[row];
1651: ca = ba + bi[row];
1652: /* perform dense axpy */
1653: valtmp = pA[j];
1654: for (k=0; k<cnz; k++) {
1655: ca[k] += valtmp*apa[cj[k]];
1656: }
1657: PetscLogFlops(2.0*cnz);
1658: }
1660: /* zero the current row of A*P */
1661: for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1662: #if defined(PTAP_PROFILE)
1663: PetscTime(&t2_2);
1664: ePtAP += t2_2 - t2_1;
1665: #endif
1666: }
1667: } else { /* sparse axpy on apa (of length ap_rmax, stores the compressed row A[i,:]*P) - scalable, but slower */
1668: PetscInfo(C,"Using scalable sparse axpy\n");
1669: /*-----------------------------------------------------------------------------------------*/
1670: pA=pa_loc;
1671: for (i=0; i<am; i++) {
1672: #if defined(PTAP_PROFILE)
1673: PetscTime(&t2_0);
1674: #endif
1675: /* form i-th sparse row of A*P */
1676: apnz = api[i+1] - api[i];
1677: apJ = apj + api[i];
1678: /* diagonal portion of A */
1679: anz = adi[i+1] - adi[i];
1680: adj = ad->j + adi[i];
1681: ada = ad->a + adi[i];
1682: for (j=0; j<anz; j++) {
1683: row = adj[j];
1684: pnz = pi_loc[row+1] - pi_loc[row];
1685: pj = pj_loc + pi_loc[row];
1686: pa = pa_loc + pi_loc[row];
1687: valtmp = ada[j];
1688: nextp = 0;
1689: for (k=0; nextp<pnz; k++) {
1690: if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1691: apa[k] += valtmp*pa[nextp++];
1692: }
1693: }
1694: PetscLogFlops(2.0*pnz);
1695: }
1696: /* off-diagonal portion of A */
1697: anz = aoi[i+1] - aoi[i];
1698: aoj = ao->j + aoi[i];
1699: aoa = ao->a + aoi[i];
1700: for (j=0; j<anz; j++) {
1701: row = aoj[j];
1702: pnz = pi_oth[row+1] - pi_oth[row];
1703: pj = pj_oth + pi_oth[row];
1704: pa = pa_oth + pi_oth[row];
1705: valtmp = aoa[j];
1706: nextp = 0;
1707: for (k=0; nextp<pnz; k++) {
1708: if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1709: apa[k] += valtmp*pa[nextp++];
1710: }
1711: }
1712: PetscLogFlops(2.0*pnz);
1713: }
1714: #if defined(PTAP_PROFILE)
1715: PetscTime(&t2_1);
1716: et2_AP += t2_1 - t2_0;
1717: #endif
1719: /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1720: /*--------------------------------------------------------------*/
1721: pnz = pi_loc[i+1] - pi_loc[i];
1722: pJ = pj_loc + pi_loc[i];
1723: for (j=0; j<pnz; j++) {
1724: nextap = 0;
1725: row = pJ[j]; /* global index */
1726: if (row < pcstart || row >= pcend) { /* put the value into Co */
1727: row = *poJ;
1728: cj = coj + coi[row];
1729: ca = coa + coi[row]; poJ++;
1730: } else { /* put the value into Cd */
1731: row = *pdJ;
1732: cj = bj + bi[row];
1733: ca = ba + bi[row]; pdJ++;
1734: }
1735: valtmp = pA[j];
1736: for (k=0; nextap<apnz; k++) {
1737: if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1738: }
1739: PetscLogFlops(2.0*apnz);
1740: }
1741: pA += pnz;
1742: /* zero the current compressed row of A*P */
1743: PetscMemzero(apa,apnz*sizeof(MatScalar));
1744: #if defined(PTAP_PROFILE)
1745: PetscTime(&t2_2);
1746: ePtAP += t2_2 - t2_1;
1747: #endif
1748: }
1749: }
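/*
   Illustrative sketch, not part of the original source, comparing the two branches
   above: in the nonscalable branch apa[] has length pN and is indexed directly by
   the column indices of P, so each contribution is a dense scatter-add; in the
   scalable branch apa[] holds only the compressed row of A*P and contributions are
   merged against the symbolic column pattern apJ[] with a two-pointer scan.  In
   isolation that merge kernel looks roughly like the following (SparseAXPY and
   alpha are illustrative names; the bound k<apnz is implicit in the original code
   because apJ is a superset of pj):

     static void SparseAXPY(PetscInt apnz,const PetscInt *apJ,MatScalar *apa,
                            PetscInt pnz,const PetscInt *pj,const MatScalar *pa,
                            MatScalar alpha)
     {
       PetscInt k,nextp = 0;
       for (k=0; k<apnz && nextp<pnz; k++) {
         if (apJ[k] == pj[nextp]) apa[k] += alpha*pa[nextp++];  // matching column: accumulate
       }
     }
*/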
1750: #if defined(PTAP_PROFILE)
1751: PetscTime(&t2);
1752: #endif
1754: /* 3) send and recv matrix values coa */
1755: /*------------------------------------*/
1756: buf_ri = merge->buf_ri;
1757: buf_rj = merge->buf_rj;
1758: len_s = merge->len_s;
1759: PetscCommGetNewTag(comm,&taga);
1760: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1762: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1763: for (proc=0,k=0; proc<size; proc++) {
1764: if (!len_s[proc]) continue;
1765: i = merge->owners_co[proc];
1766: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1767: k++;
1768: }
1769: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1770: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
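/* the send Waitall must complete before the send buffer coa is freed below; the receive Waitall must complete before the receive buffers abuf_r are read in step 4 */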
1772: PetscFree2(s_waits,status);
1773: PetscFree(r_waits);
1774: PetscFree(coa);
1775: #if defined(PTAP_PROFILE)
1776: PetscTime(&t3);
1777: #endif
1779: /* 4) insert local Cseq and received values into Cmpi */
1780: /*------------------------------------------------------*/
1781: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1782: for (k=0; k<merge->nrecv; k++) {
1783: buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
1784: nrows = *(buf_ri_k[k]);
1785: nextrow[k] = buf_ri_k[k]+1; /* next row number in the k-th received i-structure */
1786: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure within the k-th received i-structure */
1787: }
1789: for (i=0; i<cm; i++) {
1790: row = owners[rank] + i; /* global row index of C_seq */
1791: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1792: ba_i = ba + bi[i];
1793: bnz = bi[i+1] - bi[i];
1794: /* add received vals into ba */
1795: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1796: /* i-th row */
1797: if (i == *nextrow[k]) {
1798: cnz = *(nextci[k]+1) - *nextci[k];
1799: cj = buf_rj[k] + *(nextci[k]);
1800: ca = abuf_r[k] + *(nextci[k]);
1801: nextcj = 0;
1802: for (j=0; nextcj<cnz; j++) {
1803: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1804: ba_i[j] += ca[nextcj++];
1805: }
1806: }
1807: nextrow[k]++; nextci[k]++;
1808: PetscLogFlops(2.0*cnz);
1809: }
1810: }
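/* row is locally owned and all off-process contributions have already been summed into ba_i above, so INSERT_VALUES is sufficient here */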
1811: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1812: }
1813: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1814: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1816: PetscFree(ba);
1817: PetscFree(abuf_r[0]);
1818: PetscFree(abuf_r);
1819: PetscFree3(buf_ri_k,nextrow,nextci);
1820: #if defined(PTAP_PROFILE)
1821: PetscTime(&t4);
1822: if (rank==1) {
1823: PetscPrintf(MPI_COMM_SELF," [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);
1824: }
1825: #endif
1826: return(0);
1827: }
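/*
   Illustrative sketch, not part of the original source: this numeric routine is
   reached again on the reuse path of MatPtAP().  A typical calling sequence
   (fill is whatever estimate the caller chooses) is

     MatPtAP(A,P,MAT_INITIAL_MATRIX,fill,&C);   // symbolic + numeric, builds the ptap data attached to C
     // ... update the numerical values of A and/or P, keeping their nonzero patterns ...
     MatPtAP(A,P,MAT_REUSE_MATRIX,fill,&C);     // numeric phase only; the branch at the top of this
                                                // function refreshes the values of P_oth and P_loc
     MatDestroy(&C);
*/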