Actual source code: mpiptap.c
petsc-3.12.0 2019-09-29
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
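/*
   A minimal usage sketch (assumes A and P are already assembled MPIAIJ matrices
   and ierr is a declared PetscErrorCode; the fill value 2.0 is only an
   illustrative estimate of nnz(C)/(nnz(A)+nnz(P))). The implementation used
   below is selected at run time with
   -matptap_via <scalable|nonscalable|allatonce|allatonce_merged> (plus hypre
   when PETSc is configured with hypre):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);
     ierr = MatDestroy(&C);CHKERRQ(ierr);

   The first call performs the symbolic and numeric phases; the second reuses the
   symbolic data attached to C.
*/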
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
17: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
18: {
19: PetscErrorCode ierr;
20: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
21: Mat_APMPI *ptap=a->ap;
22: PetscBool iascii;
23: PetscViewerFormat format;
26: if (!ptap) {
27: /* hack: MatDuplicate() copies oldmat->ops->view to newmat, which is a base Mat class with a NULL ptap! */
28: A->ops->view = MatView_MPIAIJ;
29: (A->ops->view)(A,viewer);
30: return(0);
31: }
33: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
34: if (iascii) {
35: PetscViewerGetFormat(viewer,&format);
36: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
37: if (ptap->algType == 0) {
38: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
39: } else if (ptap->algType == 1) {
40: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
41: } else if (ptap->algType == 2) {
42: PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
43: } else if (ptap->algType == 3) {
44: PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
45: }
46: }
47: }
48: (ptap->view)(A,viewer);
49: return(0);
50: }
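/*
   Frees the Mat_APMPI scratch data (P_loc, P_oth, Rd, Ro, AP_loc, C_loc, C_oth,
   the PetscSF, and the merge struct) that is normally kept attached to C so that
   the numeric product can be recomputed cheaply when A or P changes; after this
   call C can no longer be updated with MatPtAP(...,MAT_REUSE_MATRIX,...).
*/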
52: PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
53: {
54: PetscErrorCode ierr;
55: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
56: Mat_APMPI *ptap=a->ap;
57: Mat_Merge_SeqsToMPI *merge;
60: if (!ptap) return(0);
62: PetscFree2(ptap->startsj_s,ptap->startsj_r);
63: PetscFree(ptap->bufa);
64: MatDestroy(&ptap->P_loc);
65: MatDestroy(&ptap->P_oth);
66: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
67: MatDestroy(&ptap->Rd);
68: MatDestroy(&ptap->Ro);
69: if (ptap->AP_loc) { /* used by alg_rap */
70: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
71: PetscFree(ap->i);
72: PetscFree2(ap->j,ap->a);
73: MatDestroy(&ptap->AP_loc);
74: } else { /* used by alg_ptap */
75: PetscFree(ptap->api);
76: PetscFree(ptap->apj);
77: }
78: MatDestroy(&ptap->C_loc);
79: MatDestroy(&ptap->C_oth);
80: if (ptap->apa) {PetscFree(ptap->apa);}
82: MatDestroy(&ptap->Pt);
84: merge=ptap->merge;
85: if (merge) { /* used by alg_ptap */
86: PetscFree(merge->id_r);
87: PetscFree(merge->len_s);
88: PetscFree(merge->len_r);
89: PetscFree(merge->bi);
90: PetscFree(merge->bj);
91: PetscFree(merge->buf_ri[0]);
92: PetscFree(merge->buf_ri);
93: PetscFree(merge->buf_rj[0]);
94: PetscFree(merge->buf_rj);
95: PetscFree(merge->coi);
96: PetscFree(merge->coj);
97: PetscFree(merge->owners_co);
98: PetscLayoutDestroy(&merge->rowmap);
99: PetscFree(ptap->merge);
100: }
101: ISLocalToGlobalMappingDestroy(&ptap->ltog);
103: PetscSFDestroy(&ptap->sf);
104: PetscFree(ptap->c_othi);
105: PetscFree(ptap->c_rmti);
106: return(0);
107: }
109: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
110: {
112: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
113: Mat_APMPI *ptap=a->ap;
116: (*A->ops->freeintermediatedatastructures)(A);
117: (*ptap->destroy)(A); /* MatDestroy_MPIAIJ(A) */
118: PetscFree(ptap);
119: return(0);
120: }
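/*
   Driver for C = P^T*A*P with MPIAIJ A and P: it picks an algorithm (default is
   the nonscalable alg=1, overridable with -matptap_via), runs the matching
   symbolic phase when scall == MAT_INITIAL_MATRIX, and always finishes with the
   numeric phase through C->ops->ptapnumeric. When no algorithm was requested and
   P has more than 100000 global columns, it switches to the scalable algorithm
   if pN > fill*nz_local on any process.
*/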
122: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
123: {
125: PetscBool flg;
126: MPI_Comm comm;
127: #if !defined(PETSC_HAVE_HYPRE)
128: const char *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
129: PetscInt nalg=4;
130: #else
131: const char *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
132: PetscInt nalg=5;
133: #endif
134: PetscInt pN=P->cmap->N,alg=1; /* set default algorithm */
137: /* check if matrix local sizes are compatible */
138: PetscObjectGetComm((PetscObject)A,&comm);
139: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
140: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
142: if (scall == MAT_INITIAL_MATRIX) {
143: /* pick an algorithm */
144: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
145: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
146: PetscOptionsEnd();
148: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
149: MatInfo Ainfo,Pinfo;
150: PetscInt nz_local;
151: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
153: MatGetInfo(A,MAT_LOCAL,&Ainfo);
154: MatGetInfo(P,MAT_LOCAL,&Pinfo);
155: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
157: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
158: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
160: if (alg_scalable) {
161: alg = 0; /* the scalable algorithm may be ~50% slower than the nonscalable algorithm */
162: }
163: }
165: switch (alg) {
166: case 1:
167: /* do R=P^T locally, then C=R*A*P -- nonscalable */
168: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
169: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
170: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
171: break;
172: case 2:
173: /* compute C=P^T*A*P allatonce */
174: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
175: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
176: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
177: break;
178: case 3:
179: /* compute C=P^T*A*P allatonce, merged version */
180: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
181: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
182: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
183: break;
184: #if defined(PETSC_HAVE_HYPRE)
185: case 4:
186: /* Use boomerAMGBuildCoarseOperator */
187: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
188: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
189: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
190: break;
191: #endif
192: default:
193: /* do R=P^T locally, then C=R*A*P -- scalable */
194: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
195: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
196: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
197: break;
198: }
200: if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
201: Mat_MPIAIJ *c = (Mat_MPIAIJ*)(*C)->data;
202: Mat_APMPI *ap = c->ap;
203: PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");
204: ap->freestruct = PETSC_FALSE;
205: PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
206: PetscOptionsEnd();
207: }
208: }
210: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
211: (*(*C)->ops->ptapnumeric)(A,P,*C);
212: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
213: return(0);
214: }
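/*
   Numeric phase of the scalable algorithm: with R = P^T split into Rd (diagonal
   block) and Ro (off-diagonal block), it recomputes AP_loc = A_loc*P row by row,
   forms C_loc = Rd*AP_loc and C_oth = Ro*AP_loc with the scalable sequential
   kernels, and adds both pieces into C with MatSetValues(...,ADD_VALUES)
   followed by a final assembly.
*/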
216: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
217: {
218: PetscErrorCode ierr;
219: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
220: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
221: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
222: Mat_APMPI *ptap = c->ap;
223: Mat AP_loc,C_loc,C_oth;
224: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
225: PetscScalar *apa;
226: const PetscInt *cols;
227: const PetscScalar *vals;
230: if (!ptap->AP_loc) {
231: MPI_Comm comm;
232: PetscObjectGetComm((PetscObject)C,&comm);
233: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
234: }
236: MatZeroEntries(C);
238: /* 1) get R = Pd^T,Ro = Po^T */
239: if (ptap->reuse == MAT_REUSE_MATRIX) {
240: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
241: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
242: }
244: /* 2) get AP_loc */
245: AP_loc = ptap->AP_loc;
246: ap = (Mat_SeqAIJ*)AP_loc->data;
248: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
249: /*-----------------------------------------------------*/
250: if (ptap->reuse == MAT_REUSE_MATRIX) {
251: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
252: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
253: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
254: }
256: /* 2-2) compute numeric A_loc*P - dominating part */
257: /* ---------------------------------------------- */
258: /* get data from symbolic products */
259: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
260: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
262: api = ap->i;
263: apj = ap->j;
264: ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
265: for (i=0; i<am; i++) {
266: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
267: apnz = api[i+1] - api[i];
268: apa = ap->a + api[i];
269: PetscArrayzero(apa,apnz);
270: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
271: }
272: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);
273: if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);
275: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
276: /* Always use the scalable version since we are in the MPI scalable code path */
277: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
278: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
280: C_loc = ptap->C_loc;
281: C_oth = ptap->C_oth;
283: /* add C_loc and C_oth to C */
284: MatGetOwnershipRange(C,&rstart,&rend);
286: /* C_loc -> C */
287: cm = C_loc->rmap->N;
288: c_seq = (Mat_SeqAIJ*)C_loc->data;
289: cols = c_seq->j;
290: vals = c_seq->a;
291: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);
293: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
294: /* when there are no off-processor parts. */
295: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
296: /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
297: /* a table, and other, more complex stuff has to be done. */
298: if (C->assembled) {
299: C->was_assembled = PETSC_TRUE;
300: C->assembled = PETSC_FALSE;
301: }
302: if (C->was_assembled) {
303: for (i=0; i<cm; i++) {
304: ncols = c_seq->i[i+1] - c_seq->i[i];
305: row = rstart + i;
306: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
307: cols += ncols; vals += ncols;
308: }
309: } else {
310: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
311: }
312: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);
313: if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
315: /* Co -> C, off-processor part */
316: cm = C_oth->rmap->N;
317: c_seq = (Mat_SeqAIJ*)C_oth->data;
318: cols = c_seq->j;
319: vals = c_seq->a;
320: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
321: for (i=0; i<cm; i++) {
322: ncols = c_seq->i[i+1] - c_seq->i[i];
323: row = p->garray[i];
324: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
325: cols += ncols; vals += ncols;
326: }
327: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
328: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
330: ptap->reuse = MAT_REUSE_MATRIX;
332: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
333: if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);
335: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated again from A and P */
336: if (ptap->freestruct) {
337: MatFreeIntermediateDataStructures(C);
338: }
339: return(0);
340: }
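/*
   Symbolic phase of the scalable algorithm:
     (0) Rd = Pd^T, Ro = Po^T
     (1) symbolic AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth built with a condensed linked list
     (2) symbolic C_oth = Ro*AP_loc and C_loc = Rd*AP_loc
     (3)-(4) send the i- and j-structure of C_oth to the processes owning those rows
     (5) merge C_loc with the received rows to preallocate the parallel matrix Cmpi
   The resulting Mat_APMPI struct is attached to Cmpi for reuse by the numeric phase.
*/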
342: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
343: {
344: PetscErrorCode ierr;
345: Mat_APMPI *ptap;
346: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
347: MPI_Comm comm;
348: PetscMPIInt size,rank;
349: Mat Cmpi,P_loc,P_oth;
350: PetscFreeSpaceList free_space=NULL,current_space=NULL;
351: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
352: PetscInt *lnk,i,k,pnz,row,nsend;
353: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
354: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
355: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
356: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
357: MPI_Request *swaits,*rwaits;
358: MPI_Status *sstatus,rstatus;
359: PetscLayout rowmap;
360: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
361: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
362: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
363: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
364: PetscScalar *apv;
365: PetscTable ta;
366: MatType mtype;
367: const char *prefix;
368: #if defined(PETSC_USE_INFO)
369: PetscReal apfill;
370: #endif
373: PetscObjectGetComm((PetscObject)A,&comm);
374: MPI_Comm_size(comm,&size);
375: MPI_Comm_rank(comm,&rank);
377: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
379: /* create symbolic parallel matrix Cmpi */
380: MatCreate(comm,&Cmpi);
381: MatGetType(A,&mtype);
382: MatSetType(Cmpi,mtype);
384: /* create struct Mat_APMPI and attach it to C later */
385: PetscNew(&ptap);
386: ptap->reuse = MAT_INITIAL_MATRIX;
387: ptap->algType = 0;
389: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
390: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
391: /* get P_loc by taking all local rows of P */
392: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
394: ptap->P_loc = P_loc;
395: ptap->P_oth = P_oth;
397: /* (0) compute Rd = Pd^T, Ro = Po^T */
398: /* --------------------------------- */
399: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
400: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
402: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
403: /* ----------------------------------------------------------------- */
404: p_loc = (Mat_SeqAIJ*)P_loc->data;
405: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
407: /* create and initialize a linked list */
408: PetscTableCreate(pn,pN,&ta); /* used to compute AP_loc and Cmpi */
409: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
410: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
411: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
413: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
415: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
416: if (ao) {
417: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
418: } else {
419: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
420: }
421: current_space = free_space;
422: nspacedouble = 0;
424: PetscMalloc1(am+1,&api);
425: api[0] = 0;
426: for (i=0; i<am; i++) {
427: /* diagonal portion: Ad[i,:]*P */
428: ai = ad->i; pi = p_loc->i;
429: nzi = ai[i+1] - ai[i];
430: aj = ad->j + ai[i];
431: for (j=0; j<nzi; j++) {
432: row = aj[j];
433: pnz = pi[row+1] - pi[row];
434: Jptr = p_loc->j + pi[row];
435: /* add non-zero cols of P into the sorted linked list lnk */
436: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
437: }
438: /* off-diagonal portion: Ao[i,:]*P */
439: if (ao) {
440: ai = ao->i; pi = p_oth->i;
441: nzi = ai[i+1] - ai[i];
442: aj = ao->j + ai[i];
443: for (j=0; j<nzi; j++) {
444: row = aj[j];
445: pnz = pi[row+1] - pi[row];
446: Jptr = p_oth->j + pi[row];
447: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
448: }
449: }
450: apnz = lnk[0];
451: api[i+1] = api[i] + apnz;
453: /* if free space is not available, double the total space in the list */
454: if (current_space->local_remaining<apnz) {
455: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
456: nspacedouble++;
457: }
459: /* Copy data into free space, then initialize lnk */
460: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
462: current_space->array += apnz;
463: current_space->local_used += apnz;
464: current_space->local_remaining -= apnz;
465: }
466: /* Allocate space for apj and apv, initialize apj, and */
467: /* destroy list of free space and other temporary array(s) */
468: PetscCalloc2(api[am],&apj,api[am],&apv);
469: PetscFreeSpaceContiguous(&free_space,apj);
470: PetscLLCondensedDestroy_Scalable(lnk);
472: /* Create AP_loc for reuse */
473: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
474: MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);
476: #if defined(PETSC_USE_INFO)
477: if (ao) {
478: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
479: } else {
480: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
481: }
482: ptap->AP_loc->info.mallocs = nspacedouble;
483: ptap->AP_loc->info.fill_ratio_given = fill;
484: ptap->AP_loc->info.fill_ratio_needed = apfill;
486: if (api[am]) {
487: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
488: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
489: } else {
490: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
491: }
492: #endif
494: /* (2-1) compute symbolic Co = Ro*AP_loc */
495: /* ------------------------------------ */
496: MatGetOptionsPrefix(A,&prefix);
497: MatSetOptionsPrefix(ptap->Ro,prefix);
498: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
499: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
501: /* (3) send coj of C_oth to other processors */
502: /* ------------------------------------------ */
503: /* determine row ownership */
504: PetscLayoutCreate(comm,&rowmap);
505: rowmap->n = pn;
506: rowmap->bs = 1;
507: PetscLayoutSetUp(rowmap);
508: owners = rowmap->range;
510: /* determine the number of messages to send and their lengths */
511: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
512: PetscArrayzero(len_s,size);
513: PetscArrayzero(len_si,size);
515: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
516: coi = c_oth->i; coj = c_oth->j;
517: con = ptap->C_oth->rmap->n;
518: proc = 0;
519: ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
520: for (i=0; i<con; i++) {
521: while (prmap[i] >= owners[proc+1]) proc++;
522: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
523: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
524: }
526: len = 0; /* max length of buf_si[], see (4) */
527: owners_co[0] = 0;
528: nsend = 0;
529: for (proc=0; proc<size; proc++) {
530: owners_co[proc+1] = owners_co[proc] + len_si[proc];
531: if (len_s[proc]) {
532: nsend++;
533: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
534: len += len_si[proc];
535: }
536: }
538: /* determine the number and length of messages to receive for coi and coj */
539: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
540: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
542: /* post the Irecv and Isend of coj */
543: PetscCommGetNewTag(comm,&tagj);
544: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
545: PetscMalloc1(nsend+1,&swaits);
546: for (proc=0, k=0; proc<size; proc++) {
547: if (!len_s[proc]) continue;
548: i = owners_co[proc];
549: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
550: k++;
551: }
553: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
554: /* ---------------------------------------- */
555: MatSetOptionsPrefix(ptap->Rd,prefix);
556: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
557: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
558: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
559: ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);
561: /* receives of coj are complete */
562: for (i=0; i<nrecv; i++) {
563: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
564: }
565: PetscFree(rwaits);
566: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
568: /* add received column indices into ta to update Crmax */
569: for (k=0; k<nrecv; k++) {/* k-th received message */
570: Jptr = buf_rj[k];
571: for (j=0; j<len_r[k]; j++) {
572: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
573: }
574: }
575: PetscTableGetCount(ta,&Crmax);
576: PetscTableDestroy(&ta);
578: /* (4) send and recv coi */
579: /*-----------------------*/
580: PetscCommGetNewTag(comm,&tagi);
581: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
582: PetscMalloc1(len+1,&buf_s);
583: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
584: for (proc=0,k=0; proc<size; proc++) {
585: if (!len_s[proc]) continue;
586: /* form outgoing message for i-structure:
587: buf_si[0]: nrows to be sent
588: [1:nrows]: row index (global)
589: [nrows+1:2*nrows+1]: i-structure index
590: */
591: /*-------------------------------------------*/
592: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
593: buf_si_i = buf_si + nrows+1;
594: buf_si[0] = nrows;
595: buf_si_i[0] = 0;
596: nrows = 0;
597: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
598: nzi = coi[i+1] - coi[i];
599: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
600: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
601: nrows++;
602: }
603: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
604: k++;
605: buf_si += len_si[proc];
606: }
607: for (i=0; i<nrecv; i++) {
608: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
609: }
610: PetscFree(rwaits);
611: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
613: PetscFree4(len_s,len_si,sstatus,owners_co);
614: PetscFree(len_ri);
615: PetscFree(swaits);
616: PetscFree(buf_s);
618: /* (5) compute the local portion of Cmpi */
619: /* ------------------------------------------ */
620: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
621: PetscFreeSpaceGet(Crmax,&free_space);
622: current_space = free_space;
624: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
625: for (k=0; k<nrecv; k++) {
626: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
627: nrows = *buf_ri_k[k];
628: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
629: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
630: }
632: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
633: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
634: for (i=0; i<pn; i++) {
635: /* add C_loc into Cmpi */
636: nzi = c_loc->i[i+1] - c_loc->i[i];
637: Jptr = c_loc->j + c_loc->i[i];
638: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
640: /* add received col data into lnk */
641: for (k=0; k<nrecv; k++) { /* k-th received message */
642: if (i == *nextrow[k]) { /* i-th row */
643: nzi = *(nextci[k]+1) - *nextci[k];
644: Jptr = buf_rj[k] + *nextci[k];
645: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
646: nextrow[k]++; nextci[k]++;
647: }
648: }
649: nzi = lnk[0];
651: /* copy data into free space, then initialize lnk */
652: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
653: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
654: }
655: PetscFree3(buf_ri_k,nextrow,nextci);
656: PetscLLCondensedDestroy_Scalable(lnk);
657: PetscFreeSpaceDestroy(free_space);
659: /* local sizes and preallocation */
660: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
661: if (P->cmap->bs > 0) {
662: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
663: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
664: }
665: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
666: MatPreallocateFinalize(dnz,onz);
668: /* members in merge */
669: PetscFree(id_r);
670: PetscFree(len_r);
671: PetscFree(buf_ri[0]);
672: PetscFree(buf_ri);
673: PetscFree(buf_rj[0]);
674: PetscFree(buf_rj);
675: PetscLayoutDestroy(&rowmap);
677: /* attach the supporting struct to Cmpi for reuse */
678: c = (Mat_MPIAIJ*)Cmpi->data;
679: c->ap = ptap;
680: ptap->duplicate = Cmpi->ops->duplicate;
681: ptap->destroy = Cmpi->ops->destroy;
682: ptap->view = Cmpi->ops->view;
684: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
685: Cmpi->assembled = PETSC_FALSE;
686: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
687: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
688: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
689: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
690: *C = Cmpi;
692: nout = 0;
693: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
694: if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
695: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);
696: if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);
698: return(0);
699: }
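/*
   Helper for the all-at-once algorithms: for row i of A it computes the sparsity
   pattern of row i of AP = A*P and inserts the resulting global column indices
   into the hash sets dht (columns owned by this process) and oht (off-process
   columns). The index arithmetic with dof (row/dof plus the offset row%dof)
   lets a P assembled for a single field act on dof interleaved fields.
*/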
701: PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
702: {
703: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
704: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
705: PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
706: PetscInt pcstart,pcend,column,offset;
707: PetscErrorCode ierr;
710: pcstart = P->cmap->rstart;
711: pcstart *= dof;
712: pcend = P->cmap->rend;
713: pcend *= dof;
714: /* diagonal portion: Ad[i,:]*P */
715: ai = ad->i;
716: nzi = ai[i+1] - ai[i];
717: aj = ad->j + ai[i];
718: for (j=0; j<nzi; j++) {
719: row = aj[j];
720: offset = row%dof;
721: row /= dof;
722: nzpi = pd->i[row+1] - pd->i[row];
723: pj = pd->j + pd->i[row];
724: for (k=0; k<nzpi; k++) {
725: PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
726: }
727: }
728: /* off-diagonal portion: Ad[i,:]*Po */
729: for (j=0; j<nzi; j++) {
730: row = aj[j];
731: offset = row%dof;
732: row /= dof;
733: nzpi = po->i[row+1] - po->i[row];
734: pj = po->j + po->i[row];
735: for (k=0; k<nzpi; k++) {
736: PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
737: }
738: }
740: /* off diagonal part: Ao[i, :]*P_oth */
741: if (ao) {
742: ai = ao->i;
743: pi = p_oth->i;
744: nzi = ai[i+1] - ai[i];
745: aj = ao->j + ai[i];
746: for (j=0; j<nzi; j++) {
747: row = aj[j];
748: offset = a->garray[row]%dof;
749: row = map[row];
750: pnz = pi[row+1] - pi[row];
751: p_othcols = p_oth->j + pi[row];
752: for (col=0; col<pnz; col++) {
753: column = p_othcols[col] * dof + offset;
754: if (column>=pcstart && column<pcend) {
755: PetscHSetIAdd(dht,column);
756: } else {
757: PetscHSetIAdd(oht,column);
758: }
759: }
760: }
761: } /* end if (ao) */
762: return(0);
763: }
765: PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
766: {
767: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
768: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
769: PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
770: PetscScalar ra,*aa,*pa;
771: PetscErrorCode ierr;
774: pcstart = P->cmap->rstart;
775: pcstart *= dof;
777: /* diagonal portion: Ad[i,:]*P */
778: ai = ad->i;
779: nzi = ai[i+1] - ai[i];
780: aj = ad->j + ai[i];
781: aa = ad->a + ai[i];
782: for (j=0; j<nzi; j++) {
783: ra = aa[j];
784: row = aj[j];
785: offset = row%dof;
786: row /= dof;
787: nzpi = pd->i[row+1] - pd->i[row];
788: pj = pd->j + pd->i[row];
789: pa = pd->a + pd->i[row];
790: for (k=0; k<nzpi; k++) {
791: PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
792: }
793: PetscLogFlops(2.0*nzpi);
794: }
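/* off-diagonal portion: Ad[i,:]*Po */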
795: for (j=0; j<nzi; j++) {
796: ra = aa[j];
797: row = aj[j];
798: offset = row%dof;
799: row /= dof;
800: nzpi = po->i[row+1] - po->i[row];
801: pj = po->j + po->i[row];
802: pa = po->a + po->i[row];
803: for (k=0; k<nzpi; k++) {
804: PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
805: }
806: PetscLogFlops(2.0*nzpi);
807: }
809: /* off diagonal part: Ao[i, :]*P_oth */
810: if (ao) {
811: ai = ao->i;
812: pi = p_oth->i;
813: nzi = ai[i+1] - ai[i];
814: aj = ao->j + ai[i];
815: aa = ao->a + ai[i];
816: for (j=0; j<nzi; j++) {
817: row = aj[j];
818: offset = a->garray[row]%dof;
819: row = map[row];
820: ra = aa[j];
821: pnz = pi[row+1] - pi[row];
822: p_othcols = p_oth->j + pi[row];
823: pa = p_oth->a + pi[row];
824: for (col=0; col<pnz; col++) {
825: PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
826: }
827: PetscLogFlops(2.0*pnz);
828: }
829: } /* end if (ao) */
831: return(0);
832: }
834: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
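/*
   Numeric phase of the all-at-once algorithm: for each local row of A it builds
   row i of AP in a hash map keyed by global column, multiplies it by the
   corresponding rows of Po and Pd, accumulates the Po contributions into the
   send buffers c_rmtj/c_rmta (kept sorted via PetscFindInt and insertion), ships
   them to the owning processes with PetscSFReduce (a one-to-one map, hence
   MPIU_REPLACE), and adds both the local and the received contributions into C
   with MatSetValues(...,ADD_VALUES).
*/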
836: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
837: {
838: PetscErrorCode ierr;
839: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
840: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
841: Mat_APMPI *ptap = c->ap;
842: PetscHMapIV hmap;
843: PetscInt i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
844: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
845: PetscInt offset,ii,pocol;
846: const PetscInt *mappingindices;
847: IS map;
848: MPI_Comm comm;
851: PetscObjectGetComm((PetscObject)A,&comm);
852: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
854: MatZeroEntries(C);
856: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
857: /*-----------------------------------------------------*/
858: if (ptap->reuse == MAT_REUSE_MATRIX) {
859: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
860: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
861: }
862: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
864: MatGetLocalSize(p->B,NULL,&pon);
865: pon *= dof;
866: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
867: MatGetLocalSize(A,&am,NULL);
868: cmaxr = 0;
869: for (i=0; i<pon; i++) {
870: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
871: }
872: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
873: PetscHMapIVCreate(&hmap);
874: PetscHMapIVResize(hmap,cmaxr);
875: ISGetIndices(map,&mappingindices);
876: for (i=0; i<am && pon; i++) {
877: PetscHMapIVClear(hmap);
878: offset = i%dof;
879: ii = i/dof;
880: nzi = po->i[ii+1] - po->i[ii];
881: if (!nzi) continue;
882: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
883: voff = 0;
884: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
885: if (!voff) continue;
887: /* Form C(ii, :) */
888: poj = po->j + po->i[ii];
889: poa = po->a + po->i[ii];
890: for (j=0; j<nzi; j++) {
891: pocol = poj[j]*dof+offset;
892: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
893: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
894: for (jj=0; jj<voff; jj++) {
895: apvaluestmp[jj] = apvalues[jj]*poa[j];
896: /* If the row is empty */
897: if (!c_rmtc[pocol]) {
898: c_rmtjj[jj] = apindices[jj];
899: c_rmtaa[jj] = apvaluestmp[jj];
900: c_rmtc[pocol]++;
901: } else {
902: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
903: if (loc>=0){ /* hit */
904: c_rmtaa[loc] += apvaluestmp[jj];
905: PetscLogFlops(1.0);
906: } else { /* new element */
907: loc = -(loc+1);
908: /* Move data backward */
909: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
910: c_rmtjj[kk] = c_rmtjj[kk-1];
911: c_rmtaa[kk] = c_rmtaa[kk-1];
912: }/* End kk */
913: c_rmtjj[loc] = apindices[jj];
914: c_rmtaa[loc] = apvaluestmp[jj];
915: c_rmtc[pocol]++;
916: }
917: }
918: PetscLogFlops(voff);
919: } /* End jj */
920: } /* End j */
921: } /* End i */
923: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
925: MatGetLocalSize(P,NULL,&pn);
926: pn *= dof;
927: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
929: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
930: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
931: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
932: pcstart = pcstart*dof;
933: pcend = pcend*dof;
934: cd = (Mat_SeqAIJ*)(c->A)->data;
935: co = (Mat_SeqAIJ*)(c->B)->data;
937: cmaxr = 0;
938: for (i=0; i<pn; i++) {
939: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
940: }
941: PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
942: PetscHMapIVCreate(&hmap);
943: PetscHMapIVResize(hmap,cmaxr);
944: for (i=0; i<am && pn; i++) {
945: PetscHMapIVClear(hmap);
946: offset = i%dof;
947: ii = i/dof;
948: nzi = pd->i[ii+1] - pd->i[ii];
949: if (!nzi) continue;
950: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
951: voff = 0;
952: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
953: if (!voff) continue;
954: /* Form C(ii, :) */
955: pdj = pd->j + pd->i[ii];
956: pda = pd->a + pd->i[ii];
957: for (j=0; j<nzi; j++) {
958: row = pcstart + pdj[j] * dof + offset;
959: for (jj=0; jj<voff; jj++) {
960: apvaluestmp[jj] = apvalues[jj]*pda[j];
961: }
962: PetscLogFlops(voff);
963: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
964: }
965: }
966: ISRestoreIndices(map,&mappingindices);
967: MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
968: PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
969: PetscHMapIVDestroy(&hmap);
970: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
971: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
972: PetscFree2(c_rmtj,c_rmta);
974: /* Add contributions from remote */
975: for (i = 0; i < pn; i++) {
976: row = i + pcstart;
977: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
978: }
979: PetscFree2(c_othj,c_otha);
981: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
982: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
984: ptap->reuse = MAT_REUSE_MATRIX;
986: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated again from A and P */
987: if (ptap->freestruct) {
988: MatFreeIntermediateDataStructures(C);
989: }
990: return(0);
991: }
993: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
994: {
995: PetscErrorCode ierr;
999: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
1000: return(0);
1001: }
1003: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
1004: {
1005: PetscErrorCode ierr;
1006: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1007: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
1008: Mat_APMPI *ptap = c->ap;
1009: PetscHMapIV hmap;
1010: PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
1011: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
1012: PetscInt offset,ii,pocol;
1013: const PetscInt *mappingindices;
1014: IS map;
1015: MPI_Comm comm;
1018: PetscObjectGetComm((PetscObject)A,&comm);
1019: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1021: MatZeroEntries(C);
1023: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1024: /*-----------------------------------------------------*/
1025: if (ptap->reuse == MAT_REUSE_MATRIX) {
1026: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1027: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
1028: }
1029: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1030: MatGetLocalSize(p->B,NULL,&pon);
1031: pon *= dof;
1032: MatGetLocalSize(P,NULL,&pn);
1033: pn *= dof;
1035: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
1036: MatGetLocalSize(A,&am,NULL);
1037: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1038: pcstart *= dof;
1039: pcend *= dof;
1040: cmaxr = 0;
1041: for (i=0; i<pon; i++) {
1042: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
1043: }
1044: cd = (Mat_SeqAIJ*)(c->A)->data;
1045: co = (Mat_SeqAIJ*)(c->B)->data;
1046: for (i=0; i<pn; i++) {
1047: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
1048: }
1049: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
1050: PetscHMapIVCreate(&hmap);
1051: PetscHMapIVResize(hmap,cmaxr);
1052: ISGetIndices(map,&mappingindices);
1053: for (i=0; i<am && (pon || pn); i++) {
1054: PetscHMapIVClear(hmap);
1055: offset = i%dof;
1056: ii = i/dof;
1057: nzi = po->i[ii+1] - po->i[ii];
1058: dnzi = pd->i[ii+1] - pd->i[ii];
1059: if (!nzi && !dnzi) continue;
1060: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
1061: voff = 0;
1062: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
1063: if (!voff) continue;
1065: /* Form remote C(ii, :) */
1066: poj = po->j + po->i[ii];
1067: poa = po->a + po->i[ii];
1068: for (j=0; j<nzi; j++) {
1069: pocol = poj[j]*dof+offset;
1070: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
1071: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
1072: for (jj=0; jj<voff; jj++) {
1073: apvaluestmp[jj] = apvalues[jj]*poa[j];
1074: /* If the row is empty */
1075: if (!c_rmtc[pocol]) {
1076: c_rmtjj[jj] = apindices[jj];
1077: c_rmtaa[jj] = apvaluestmp[jj];
1078: c_rmtc[pocol]++;
1079: } else {
1080: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
1081: if (loc>=0){ /* hit */
1082: c_rmtaa[loc] += apvaluestmp[jj];
1083: PetscLogFlops(1.0);
1084: } else { /* new element */
1085: loc = -(loc+1);
1086: /* Move data backward */
1087: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
1088: c_rmtjj[kk] = c_rmtjj[kk-1];
1089: c_rmtaa[kk] = c_rmtaa[kk-1];
1090: }/* End kk */
1091: c_rmtjj[loc] = apindices[jj];
1092: c_rmtaa[loc] = apvaluestmp[jj];
1093: c_rmtc[pocol]++;
1094: }
1095: }
1096: } /* End jj */
1097: PetscLogFlops(voff);
1098: } /* End j */
1100: /* Form local C(ii, :) */
1101: pdj = pd->j + pd->i[ii];
1102: pda = pd->a + pd->i[ii];
1103: for (j=0; j<dnzi; j++) {
1104: row = pcstart + pdj[j] * dof + offset;
1105: for (jj=0; jj<voff; jj++) {
1106: apvaluestmp[jj] = apvalues[jj]*pda[j];
1107: }/* End kk */
1108: PetscLogFlops(voff);
1109: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
1110: }/* End j */
1111: } /* End i */
1113: ISRestoreIndices(map,&mappingindices);
1114: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
1115: PetscHMapIVDestroy(&hmap);
1116: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
1118: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1119: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1120: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1121: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1122: PetscFree2(c_rmtj,c_rmta);
1124: /* Add contributions from remote */
1125: for (i = 0; i < pn; i++) {
1126: row = i + pcstart;
1127: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
1128: }
1129: PetscFree2(c_othj,c_otha);
1131: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1132: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1134: ptap->reuse = MAT_REUSE_MATRIX;
1136: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated again from A and P */
1137: if (ptap->freestruct) {
1138: MatFreeIntermediateDataStructures(C);
1139: }
1140: return(0);
1141: }
1143: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1144: {
1145: PetscErrorCode ierr;
1149: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1150: return(0);
1151: }
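/*
   Symbolic phase of the all-at-once algorithm: hash sets collect the column
   pattern of each row of C. Contributions destined for rows of C owned by other
   processes (those coming through Po) are counted in c_rmti/c_rmtc, their column
   indices are shipped to the owners with a PetscSF, and the received indices are
   merged with the local pattern to compute the diagonal/off-diagonal
   preallocation (dnz/onz) of Cmpi. The numeric kernel ("overlapping" or
   "merged") is chosen with -matptap_allatonce_via.
*/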
1153: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
1154: {
1155: Mat_APMPI *ptap;
1156: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1157: MPI_Comm comm;
1158: Mat Cmpi;
1159: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1160: MatType mtype;
1161: PetscSF sf;
1162: PetscSFNode *iremote;
1163: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1164: const PetscInt *rootdegrees;
1165: PetscHSetI ht,oht,*hta,*hto;
1166: PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1167: PetscInt owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1168: PetscInt nalg=2,alg=0,offset,ii;
1169: const PetscInt *mappingindices;
1170: PetscBool flg;
1171: const char *algTypes[2] = {"overlapping","merged"};
1172: IS map;
1173: PetscErrorCode ierr;
1176: PetscObjectGetComm((PetscObject)A,&comm);
1178: /* Create symbolic parallel matrix Cmpi */
1179: MatGetLocalSize(P,NULL,&pn);
1180: pn *= dof;
1181: MatCreate(comm,&Cmpi);
1182: MatGetType(A,&mtype);
1183: MatSetType(Cmpi,mtype);
1184: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1186: PetscNew(&ptap);
1187: ptap->reuse = MAT_INITIAL_MATRIX;
1188: ptap->algType = 2;
1190: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1191: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1192: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1193: /* This equals the number of off-diagonal columns of P */
1194: MatGetLocalSize(p->B,NULL,&pon);
1195: pon *= dof;
1196: /* offsets */
1197: PetscMalloc1(pon+1,&ptap->c_rmti);
1198: /* The number of columns we will send to remote ranks */
1199: PetscMalloc1(pon,&c_rmtc);
1200: PetscMalloc1(pon,&hta);
1201: for (i=0; i<pon; i++) {
1202: PetscHSetICreate(&hta[i]);
1203: }
1204: MatGetLocalSize(A,&am,NULL);
1205: MatGetOwnershipRange(A,&arstart,&arend);
1206: /* Create hash table to merge all columns for C(i, :) */
1207: PetscHSetICreate(&ht);
1209: ISGetIndices(map,&mappingindices);
1210: ptap->c_rmti[0] = 0;
1211: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1212: for (i=0; i<am && pon; i++) {
1213: /* Form one row of AP */
1214: PetscHSetIClear(ht);
1215: offset = i%dof;
1216: ii = i/dof;
1217: /* If the off diag is empty, we should not do any calculation */
1218: nzi = po->i[ii+1] - po->i[ii];
1219: if (!nzi) continue;
1221: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1222: PetscHSetIGetSize(ht,&htsize);
1223: /* If AP is empty, just continue */
1224: if (!htsize) continue;
1225: /* Form C(ii, :) */
1226: poj = po->j + po->i[ii];
1227: for (j=0; j<nzi; j++) {
1228: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1229: }
1230: }
1232: for (i=0; i<pon; i++) {
1233: PetscHSetIGetSize(hta[i],&htsize);
1234: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1235: c_rmtc[i] = htsize;
1236: }
1238: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1240: for (i=0; i<pon; i++) {
1241: off = 0;
1242: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1243: PetscHSetIDestroy(&hta[i]);
1244: }
1245: PetscFree(hta);
1247: PetscMalloc1(pon,&iremote);
1248: for (i=0; i<pon; i++) {
1249: owner = 0; lidx = 0;
1250: offset = i%dof;
1251: ii = i/dof;
1252: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1253: iremote[i].index = lidx*dof + offset;
1254: iremote[i].rank = owner;
1255: }
1257: PetscSFCreate(comm,&sf);
1258: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1259: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1260: PetscSFSetRankOrder(sf,PETSC_TRUE);
1261: PetscSFSetFromOptions(sf);
1262: PetscSFSetUp(sf);
1263: /* How many neighbors have contributions to my rows? */
1264: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1265: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1266: rootspacesize = 0;
1267: for (i = 0; i < pn; i++) {
1268: rootspacesize += rootdegrees[i];
1269: }
1270: PetscMalloc1(rootspacesize,&rootspace);
1271: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1272: /* Get information from the leaves:
1273: * the number of columns other processes contribute to my rows
1274: */
1275: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1276: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1277: PetscFree(c_rmtc);
1278: PetscCalloc1(pn+1,&ptap->c_othi);
1279: /* The number of columns received for each row */
1280: ptap->c_othi[0] = 0;
1281: rootspacesize = 0;
1282: rootspaceoffsets[0] = 0;
1283: for (i = 0; i < pn; i++) {
1284: rcvncols = 0;
1285: for (j = 0; j<rootdegrees[i]; j++) {
1286: rcvncols += rootspace[rootspacesize];
1287: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1288: rootspacesize++;
1289: }
1290: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1291: }
1292: PetscFree(rootspace);
1294: PetscMalloc1(pon,&c_rmtoffsets);
1295: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1296: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1297: PetscSFDestroy(&sf);
1298: PetscFree(rootspaceoffsets);
1300: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1301: nleaves = 0;
1302: for (i = 0; i<pon; i++) {
1303: owner = 0;
1304: ii = i/dof;
1305: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1306: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1307: for (j=0; j<sendncols; j++) {
1308: iremote[nleaves].rank = owner;
1309: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1310: }
1311: }
1312: PetscFree(c_rmtoffsets);
1313: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1315: PetscSFCreate(comm,&ptap->sf);
1316: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1317: PetscSFSetFromOptions(ptap->sf);
1318: /* One to one map */
1319: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1321: PetscMalloc2(pn,&dnz,pn,&onz);
1322: PetscHSetICreate(&oht);
1323: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1324: pcstart *= dof;
1325: pcend *= dof;
1326: PetscMalloc2(pn,&hta,pn,&hto);
1327: for (i=0; i<pn; i++) {
1328: PetscHSetICreate(&hta[i]);
1329: PetscHSetICreate(&hto[i]);
1330: }
1331: /* Work on local part */
1332: /* 4) Pass 1: Estimate memory for C_loc */
1333: for (i=0; i<am && pn; i++) {
1334: PetscHSetIClear(ht);
1335: PetscHSetIClear(oht);
1336: offset = i%dof;
1337: ii = i/dof;
1338: nzi = pd->i[ii+1] - pd->i[ii];
1339: if (!nzi) continue;
1341: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1342: PetscHSetIGetSize(ht,&htsize);
1343: PetscHSetIGetSize(oht,&htosize);
1344: if (!(htsize+htosize)) continue;
1345: /* Form C(ii, :) */
1346: pdj = pd->j + pd->i[ii];
1347: for (j=0; j<nzi; j++) {
1348: PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1349: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1350: }
1351: }
1353: ISRestoreIndices(map,&mappingindices);
1355: PetscHSetIDestroy(&ht);
1356: PetscHSetIDestroy(&oht);
1358: /* Get remote data */
1359: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1360: PetscFree(c_rmtj);
1362: for (i = 0; i < pn; i++) {
1363: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1364: rdj = c_othj + ptap->c_othi[i];
1365: for (j = 0; j < nzi; j++) {
1366: col = rdj[j];
1367: /* diag part */
1368: if (col>=pcstart && col<pcend) {
1369: PetscHSetIAdd(hta[i],col);
1370: } else { /* off diag */
1371: PetscHSetIAdd(hto[i],col);
1372: }
1373: }
1374: PetscHSetIGetSize(hta[i],&htsize);
1375: dnz[i] = htsize;
1376: PetscHSetIDestroy(&hta[i]);
1377: PetscHSetIGetSize(hto[i],&htsize);
1378: onz[i] = htsize;
1379: PetscHSetIDestroy(&hto[i]);
1380: }
1382: PetscFree2(hta,hto);
1383: PetscFree(c_othj);
1385: /* local sizes and preallocation */
1386: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1387: MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1388: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1389: MatSetUp(Cmpi);
1390: PetscFree2(dnz,onz);
1392: /* attach the supporting struct to Cmpi for reuse */
1393: c = (Mat_MPIAIJ*)Cmpi->data;
1394: c->ap = ptap;
1395: ptap->duplicate = Cmpi->ops->duplicate;
1396: ptap->destroy = Cmpi->ops->destroy;
1397: ptap->view = Cmpi->ops->view;
1399: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1400: Cmpi->assembled = PETSC_FALSE;
1401: /* pick an algorithm */
1402: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1403: alg = 0;
1404: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1405: PetscOptionsEnd();
1406: switch (alg) {
1407: case 0:
1408: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1409: break;
1410: case 1:
1411: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1412: break;
1413: default:
1414: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
1415: }
1416: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1417: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1418: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1419: *C = Cmpi;
1420: return(0);
1421: }
1423: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
1424: {
1425: PetscErrorCode ierr;
1429: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1430: return(0);
1431: }
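/*
   The merged variant differs from the overlapping one above only in that each
   sweep over the rows of A handles the diagonal and off-diagonal parts of P^T in
   a single pass (one hash-set/hash-map fill per row) instead of two separate
   loops; the PetscSF communication pattern is the same.
*/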
1433: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
1434: {
1435: Mat_APMPI *ptap;
1436: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1437: MPI_Comm comm;
1438: Mat Cmpi;
1439: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1440: MatType mtype;
1441: PetscSF sf;
1442: PetscSFNode *iremote;
1443: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1444: const PetscInt *rootdegrees;
1445: PetscHSetI ht,oht,*hta,*hto,*htd;
1446: PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1447: PetscInt owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1448: PetscInt nalg=2,alg=0,offset,ii;
1449: PetscBool flg;
1450: const char *algTypes[2] = {"merged","overlapping"};
1451: const PetscInt *mappingindices;
1452: IS map;
1453: PetscErrorCode ierr;
1456: PetscObjectGetComm((PetscObject)A,&comm);
1458: /* Create symbolic parallel matrix Cmpi */
1459: MatGetLocalSize(P,NULL,&pn);
1460: pn *= dof;
1461: MatCreate(comm,&Cmpi);
1462: MatGetType(A,&mtype);
1463: MatSetType(Cmpi,mtype);
1464: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1466: PetscNew(&ptap);
1467: ptap->reuse = MAT_INITIAL_MATRIX;
1468: ptap->algType = 3;
1470: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1471: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1472: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1474: /* This equals the number of off-diagonal columns of P */
1475: MatGetLocalSize(p->B,NULL,&pon);
1476: pon *= dof;
1477: /* offsets */
1478: PetscMalloc1(pon+1,&ptap->c_rmti);
1479: /* The number of columns we will send to remote ranks */
1480: PetscMalloc1(pon,&c_rmtc);
1481: PetscMalloc1(pon,&hta);
1482: for (i=0; i<pon; i++) {
1483: PetscHSetICreate(&hta[i]);
1484: }
1485: MatGetLocalSize(A,&am,NULL);
1486: MatGetOwnershipRange(A,&arstart,&arend);
1487: /* Create hash table to merge all columns for C(i, :) */
1488: PetscHSetICreate(&ht);
1489: PetscHSetICreate(&oht);
1490: PetscMalloc2(pn,&htd,pn,&hto);
1491: for (i=0; i<pn; i++) {
1492: PetscHSetICreate(&htd[i]);
1493: PetscHSetICreate(&hto[i]);
1494: }
1496: ISGetIndices(map,&mappingindices);
1497: ptap->c_rmti[0] = 0;
1498: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1499: for (i=0; i<am && (pon || pn); i++) {
1500: /* Form one row of AP */
1501: PetscHSetIClear(ht);
1502: PetscHSetIClear(oht);
1503: offset = i%dof;
1504: ii = i/dof;
1505: /* If the off diag is empty, we should not do any calculation */
1506: nzi = po->i[ii+1] - po->i[ii];
1507: dnzi = pd->i[ii+1] - pd->i[ii];
1508: if (!nzi && !dnzi) continue;
1510: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1511: PetscHSetIGetSize(ht,&htsize);
1512: PetscHSetIGetSize(oht,&htosize);
1513: /* If AP is empty, just continue */
1514: if (!(htsize+htosize)) continue;
1516: /* Form remote C(ii, :) */
1517: poj = po->j + po->i[ii];
1518: for (j=0; j<nzi; j++) {
1519: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1520: PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1521: }
1523: /* Form local C(ii, :) */
1524: pdj = pd->j + pd->i[ii];
1525: for (j=0; j<dnzi; j++) {
1526: PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1527: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1528: }
1529: }
1531: ISRestoreIndices(map,&mappingindices);
1533: PetscHSetIDestroy(&ht);
1534: PetscHSetIDestroy(&oht);
1536: for (i=0; i<pon; i++) {
1537: PetscHSetIGetSize(hta[i],&htsize);
1538: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1539: c_rmtc[i] = htsize;
1540: }
1542: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1544: for (i=0; i<pon; i++) {
1545: off = 0;
1546: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1547: PetscHSetIDestroy(&hta[i]);
1548: }
1549: PetscFree(hta);
1551: PetscMalloc1(pon,&iremote);
1552: for (i=0; i<pon; i++) {
1553: owner = 0; lidx = 0;
1554: offset = i%dof;
1555: ii = i/dof;
1556: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1557: iremote[i].index = lidx*dof+offset;
1558: iremote[i].rank = owner;
1559: }
1561: PetscSFCreate(comm,&sf);
1562: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1563: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1564: PetscSFSetRankOrder(sf,PETSC_TRUE);
1565: PetscSFSetFromOptions(sf);
1566: PetscSFSetUp(sf);
1567: /* How many neighbors have contributions to my rows? */
1568: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1569: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1570: rootspacesize = 0;
1571: for (i = 0; i < pn; i++) {
1572: rootspacesize += rootdegrees[i];
1573: }
1574: PetscMalloc1(rootspacesize,&rootspace);
1575: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1576: /* Get information from the leaves:
1577: * the number of columns other processes contribute to my rows
1578: */
1579: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1580: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1581: PetscFree(c_rmtc);
1582: PetscMalloc1(pn+1,&ptap->c_othi);
1583: /* The number of columns received for each row */
1584: ptap->c_othi[0] = 0;
1585: rootspacesize = 0;
1586: rootspaceoffsets[0] = 0;
1587: for (i = 0; i < pn; i++) {
1588: rcvncols = 0;
1589: for (j = 0; j<rootdegrees[i]; j++) {
1590: rcvncols += rootspace[rootspacesize];
1591: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1592: rootspacesize++;
1593: }
1594: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1595: }
1596: PetscFree(rootspace);
1598: PetscMalloc1(pon,&c_rmtoffsets);
1599: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1600: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1601: PetscSFDestroy(&sf);
1602: PetscFree(rootspaceoffsets);
1604: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1605: nleaves = 0;
1606: for (i = 0; i<pon; i++) {
1607: owner = 0;
1608: ii = i/dof;
1609: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1610: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1611: for (j=0; j<sendncols; j++) {
1612: iremote[nleaves].rank = owner;
1613: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1614: }
1615: }
1616: PetscFree(c_rmtoffsets);
1617: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1619: PetscSFCreate(comm,&ptap->sf);
1620: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1621: PetscSFSetFromOptions(ptap->sf);
1622: /* One-to-one map: each leaf targets a distinct root entry, so the MPIU_REPLACE reduce simply moves the data */
1623: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1624: /* Get remote data */
1625: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1626: PetscFree(c_rmtj);
1627: PetscMalloc2(pn,&dnz,pn,&onz);
1628: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1629: pcstart *= dof;
1630: pcend *= dof;
1631: for (i = 0; i < pn; i++) {
1632: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1633: rdj = c_othj + ptap->c_othi[i];
1634: for (j = 0; j < nzi; j++) {
1635: col = rdj[j];
1636: /* diag part */
1637: if (col>=pcstart && col<pcend) {
1638: PetscHSetIAdd(htd[i],col);
1639: } else { /* off diag */
1640: PetscHSetIAdd(hto[i],col);
1641: }
1642: }
1643: PetscHSetIGetSize(htd[i],&htsize);
1644: dnz[i] = htsize;
1645: PetscHSetIDestroy(&htd[i]);
1646: PetscHSetIGetSize(hto[i],&htsize);
1647: onz[i] = htsize;
1648: PetscHSetIDestroy(&hto[i]);
1649: }
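/* dnz[i]/onz[i] now hold the exact diagonal-/off-diagonal-block nonzero counts of local row i of C:
 * the local contributions were accumulated into htd[]/hto[] during Pass 1 and the remote ones were
 * merged in from c_othj above, so MatMPIAIJSetPreallocation() below can allocate the exact pattern. */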
1651: PetscFree2(htd,hto);
1652: PetscFree(c_othj);
1654: /* local sizes and preallocation */
1655: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1656: MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1657: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1658: PetscFree2(dnz,onz);
1660: /* attach the supporting struct to Cmpi for reuse */
1661: c = (Mat_MPIAIJ*)Cmpi->data;
1662: c->ap = ptap;
1663: ptap->duplicate = Cmpi->ops->duplicate;
1664: ptap->destroy = Cmpi->ops->destroy;
1665: ptap->view = Cmpi->ops->view;
1667: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1668: Cmpi->assembled = PETSC_FALSE;
1669: /* pick an algorithm */
1670: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1671: alg = 0;
1672: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1673: PetscOptionsEnd();
1674: switch (alg) {
1675: case 0:
1676: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1677: break;
1678: case 1:
1679: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1680: break;
1681: default:
1682: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1683: }
1684: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1685: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1686: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1687: *C = Cmpi;
1688: return(0);
1689: }
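/* A minimal usage sketch of how these routines are typically reached. The driver code below is an
 * assumption for illustration, not part of this file; the -matptap_via option values are those used
 * elsewhere in PETSc 3.12:
 *
 *   Mat A,P,C;
 *   // ... assemble A and P as MATMPIAIJ ...
 *   MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   // symbolic + numeric C = P^T*A*P
 *   // ... change the values (not the pattern) of A ...
 *   MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);     // reuse the stored symbolic data
 *
 * Selecting the all-at-once family is typically done with -matptap_via allatonce or
 * -matptap_via allatonce_merged, while -matptap_allatonce_via (handled above) chooses between the
 * merged and non-merged numeric kernels. */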
1691: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
1692: {
1693: PetscErrorCode ierr;
1697: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1698: return(0);
1699: }
1701: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1702: {
1703: PetscErrorCode ierr;
1704: Mat_APMPI *ptap;
1705: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1706: MPI_Comm comm;
1707: PetscMPIInt size,rank;
1708: Mat Cmpi;
1709: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1710: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1711: PetscInt *lnk,i,k,pnz,row,nsend;
1712: PetscBT lnkbt;
1713: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1714: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1715: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1716: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1717: MPI_Request *swaits,*rwaits;
1718: MPI_Status *sstatus,rstatus;
1719: PetscLayout rowmap;
1720: PetscInt *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
1721: PetscMPIInt *len_r,*id_r; /* arrays of length comm->size; lengths and source ranks of the messages to be received */
1722: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1723: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1724: PetscScalar *apv;
1725: PetscTable ta;
1726: MatType mtype;
1727: const char *prefix;
1728: #if defined(PETSC_USE_INFO)
1729: PetscReal apfill;
1730: #endif
1733: PetscObjectGetComm((PetscObject)A,&comm);
1734: MPI_Comm_size(comm,&size);
1735: MPI_Comm_rank(comm,&rank);
1737: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1739: /* create symbolic parallel matrix Cmpi */
1740: MatCreate(comm,&Cmpi);
1741: MatGetType(A,&mtype);
1742: MatSetType(Cmpi,mtype);
1744: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1745: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1747: /* create struct Mat_APMPI and attach it to C later */
1748: PetscNew(&ptap);
1749: ptap->reuse = MAT_INITIAL_MATRIX;
1750: ptap->algType = 1;
1752: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1753: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1754: /* get P_loc by taking all local rows of P */
1755: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1757: /* (0) compute Rd = Pd^T, Ro = Po^T */
1758: /* --------------------------------- */
1759: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1760: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
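/* With Rd = Pd^T and Ro = Po^T the product is split as
 *   C_loc = Rd*AP_loc  (contributions to rows of C owned by this process) and
 *   C_oth = Ro*AP_loc  (contributions to rows owned by other processes, indexed by p->garray),
 * which is how steps (2-1), (2-2) and the numeric phase below are organized. */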
1762: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1763: /* ----------------------------------------------------------------- */
1764: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1765: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1767: /* create and initialize a linked list */
1768: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
1769: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1770: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1771: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1772: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1774: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1776: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1777: if (ao) {
1778: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1779: } else {
1780: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1781: }
1782: current_space = free_space;
1783: nspacedouble = 0;
1785: PetscMalloc1(am+1,&api);
1786: api[0] = 0;
1787: for (i=0; i<am; i++) {
1788: /* diagonal portion: Ad[i,:]*P */
1789: ai = ad->i; pi = p_loc->i;
1790: nzi = ai[i+1] - ai[i];
1791: aj = ad->j + ai[i];
1792: for (j=0; j<nzi; j++) {
1793: row = aj[j];
1794: pnz = pi[row+1] - pi[row];
1795: Jptr = p_loc->j + pi[row];
1796: /* add non-zero cols of P into the sorted linked list lnk */
1797: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1798: }
1799: /* off-diagonal portion: Ao[i,:]*P */
1800: if (ao) {
1801: ai = ao->i; pi = p_oth->i;
1802: nzi = ai[i+1] - ai[i];
1803: aj = ao->j + ai[i];
1804: for (j=0; j<nzi; j++) {
1805: row = aj[j];
1806: pnz = pi[row+1] - pi[row];
1807: Jptr = p_oth->j + pi[row];
1808: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1809: }
1810: }
1811: apnz = lnk[0];
1812: api[i+1] = api[i] + apnz;
1813: if (ap_rmax < apnz) ap_rmax = apnz;
1815: /* if free space is not available, double the total space in the list */
1816: if (current_space->local_remaining<apnz) {
1817: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1818: nspacedouble++;
1819: }
1821: /* Copy data into free space, then initialize lnk */
1822: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1824: current_space->array += apnz;
1825: current_space->local_used += apnz;
1826: current_space->local_remaining -= apnz;
1827: }
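/* The symbolic pattern of row i of AP is the sorted union of the column patterns of the rows of
 * P_loc/P_oth selected by the nonzeros of Ad[i,:] and Ao[i,:]; api[] is the resulting CSR row
 * pointer and the merged column indices accumulate in the free-space list, to be compacted into
 * apj below. */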
1828: /* Allocate space for apj and apv, initialize apj, and */
1829: /* destroy list of free space and other temporary array(s) */
1830: PetscMalloc2(api[am],&apj,api[am],&apv);
1831: PetscFreeSpaceContiguous(&free_space,apj);
1832: PetscLLDestroy(lnk,lnkbt);
1834: /* Create AP_loc for reuse */
1835: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
1837: #if defined(PETSC_USE_INFO)
1838: if (ao) {
1839: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1840: } else {
1841: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1842: }
1843: ptap->AP_loc->info.mallocs = nspacedouble;
1844: ptap->AP_loc->info.fill_ratio_given = fill;
1845: ptap->AP_loc->info.fill_ratio_needed = apfill;
1847: if (api[am]) {
1848: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1849: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
1850: } else {
1851: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
1852: }
1853: #endif
1855: /* (2-1) compute symbolic Co = Ro*AP_loc */
1856: /* ------------------------------------ */
1857: MatGetOptionsPrefix(A,&prefix);
1858: MatSetOptionsPrefix(ptap->Ro,prefix);
1859: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1860: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
1862: /* (3) send coj of C_oth to other processors */
1863: /* ------------------------------------------ */
1864: /* determine row ownership */
1865: PetscLayoutCreate(comm,&rowmap);
1866: rowmap->n = pn;
1867: rowmap->bs = 1;
1868: PetscLayoutSetUp(rowmap);
1869: owners = rowmap->range;
1871: /* determine the number of messages to send, their lengths */
1872: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1873: PetscArrayzero(len_s,size);
1874: PetscArrayzero(len_si,size);
1876: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1877: coi = c_oth->i; coj = c_oth->j;
1878: con = ptap->C_oth->rmap->n;
1879: proc = 0;
1880: for (i=0; i<con; i++) {
1881: while (prmap[i] >= owners[proc+1]) proc++;
1882: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1883: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1884: }
1886: len = 0; /* max length of buf_si[], see (4) */
1887: owners_co[0] = 0;
1888: nsend = 0;
1889: for (proc=0; proc<size; proc++) {
1890: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1891: if (len_s[proc]) {
1892: nsend++;
1893: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1894: len += len_si[proc];
1895: }
1896: }
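/* len_s[proc] counts the coj entries headed to [proc] and len_si[proc] the length of the matching
 * i-structure message (2*(nrows+1) entries, see (4)); the two PetscGather calls below let every
 * process learn how many messages it will receive, and their lengths, without an all-to-all
 * exchange of the data itself. */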
1898: /* determine the number and length of messages to receive for coi and coj */
1899: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1900: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
1902: /* post the Irecv and Isend of coj */
1903: PetscCommGetNewTag(comm,&tagj);
1904: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1905: PetscMalloc1(nsend+1,&swaits);
1906: for (proc=0, k=0; proc<size; proc++) {
1907: if (!len_s[proc]) continue;
1908: i = owners_co[proc];
1909: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1910: k++;
1911: }
1913: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1914: /* ---------------------------------------- */
1915: MatSetOptionsPrefix(ptap->Rd,prefix);
1916: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1917: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
1918: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1920: /* wait until the receives of coj are complete */
1921: for (i=0; i<nrecv; i++) {
1922: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1923: }
1924: PetscFree(rwaits);
1925: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1927: /* add received column indices into ta to update Crmax */
1928: for (k=0; k<nrecv; k++) {/* k-th received message */
1929: Jptr = buf_rj[k];
1930: for (j=0; j<len_r[k]; j++) {
1931: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1932: }
1933: }
1934: PetscTableGetCount(ta,&Crmax);
1935: PetscTableDestroy(&ta);
1937: /* (4) send and recv coi */
1938: /*-----------------------*/
1939: PetscCommGetNewTag(comm,&tagi);
1940: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1941: PetscMalloc1(len+1,&buf_s);
1942: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1943: for (proc=0,k=0; proc<size; proc++) {
1944: if (!len_s[proc]) continue;
1945: /* form outgoing message for i-structure:
1946: buf_si[0]: nrows to be sent
1947: [1:nrows]: row index (global)
1948: [nrows+1:2*nrows+1]: i-structure index
1949: */
1950: /*-------------------------------------------*/
1951: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1952: buf_si_i = buf_si + nrows+1;
1953: buf_si[0] = nrows;
1954: buf_si_i[0] = 0;
1955: nrows = 0;
1956: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1957: nzi = coi[i+1] - coi[i];
1958: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1959: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1960: nrows++;
1961: }
1962: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1963: k++;
1964: buf_si += len_si[proc];
1965: }
1966: for (i=0; i<nrecv; i++) {
1967: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1968: }
1969: PetscFree(rwaits);
1970: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1972: PetscFree4(len_s,len_si,sstatus,owners_co);
1973: PetscFree(len_ri);
1974: PetscFree(swaits);
1975: PetscFree(buf_s);
1977: /* (5) compute the local portion of Cmpi */
1978: /* ------------------------------------------ */
1979: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1980: PetscFreeSpaceGet(Crmax,&free_space);
1981: current_space = free_space;
1983: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1984: for (k=0; k<nrecv; k++) {
1985: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1986: nrows = *buf_ri_k[k];
1987: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1988: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1989: }
1991: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1992: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1993: for (i=0; i<pn; i++) {
1994: /* add C_loc into Cmpi */
1995: nzi = c_loc->i[i+1] - c_loc->i[i];
1996: Jptr = c_loc->j + c_loc->i[i];
1997: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1999: /* add received col data into lnk */
2000: for (k=0; k<nrecv; k++) { /* k-th received message */
2001: if (i == *nextrow[k]) { /* i-th row */
2002: nzi = *(nextci[k]+1) - *nextci[k];
2003: Jptr = buf_rj[k] + *nextci[k];
2004: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
2005: nextrow[k]++; nextci[k]++;
2006: }
2007: }
2008: nzi = lnk[0];
2010: /* copy data into free space, then initialize lnk */
2011: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
2012: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
2013: }
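/* Each locally owned row of C is the sorted merge of the corresponding row of C_loc with the
 * matching rows of the received C_oth contributions (buf_ri/buf_rj); MatPreallocateSet records the
 * resulting diagonal/off-diagonal counts in dnz/onz. */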
2014: PetscFree3(buf_ri_k,nextrow,nextci);
2015: PetscLLDestroy(lnk,lnkbt);
2016: PetscFreeSpaceDestroy(free_space);
2018: /* local sizes and preallocation */
2019: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
2020: if (P->cmap->bs > 0) {
2021: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
2022: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
2023: }
2024: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
2025: MatPreallocateFinalize(dnz,onz);
2027: /* members in merge */
2028: PetscFree(id_r);
2029: PetscFree(len_r);
2030: PetscFree(buf_ri[0]);
2031: PetscFree(buf_ri);
2032: PetscFree(buf_rj[0]);
2033: PetscFree(buf_rj);
2034: PetscLayoutDestroy(&rowmap);
2036: /* attach the supporting struct to Cmpi for reuse */
2037: c = (Mat_MPIAIJ*)Cmpi->data;
2038: c->ap = ptap;
2039: ptap->duplicate = Cmpi->ops->duplicate;
2040: ptap->destroy = Cmpi->ops->destroy;
2041: ptap->view = Cmpi->ops->view;
2042: PetscCalloc1(pN,&ptap->apa);
2044: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
2045: Cmpi->assembled = PETSC_FALSE;
2046: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
2047: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
2048: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
2049: *C = Cmpi;
2050: return(0);
2051: }
2053: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
2054: {
2055: PetscErrorCode ierr;
2056: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
2057: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2058: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
2059: Mat_APMPI *ptap = c->ap;
2060: Mat AP_loc,C_loc,C_oth;
2061: PetscInt i,rstart,rend,cm,ncols,row;
2062: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
2063: PetscScalar *apa;
2064: const PetscInt *cols;
2065: const PetscScalar *vals;
2068: if (!ptap->AP_loc) {
2069: MPI_Comm comm;
2070: PetscObjectGetComm((PetscObject)C,&comm);
2071: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
2072: }
2074: MatZeroEntries(C);
2075: /* 1) get R = Pd^T,Ro = Po^T */
2076: if (ptap->reuse == MAT_REUSE_MATRIX) {
2077: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
2078: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
2079: }
2081: /* 2) get AP_loc */
2082: AP_loc = ptap->AP_loc;
2083: ap = (Mat_SeqAIJ*)AP_loc->data;
2085: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
2086: /*-----------------------------------------------------*/
2087: if (ptap->reuse == MAT_REUSE_MATRIX) {
2088: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2089: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
2090: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
2091: }
2093: /* 2-2) compute numeric A_loc*P - dominating part */
2094: /* ---------------------------------------------- */
2095: /* get data from symbolic products */
2096: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2097: if (ptap->P_oth) {
2098: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2099: }
2100: apa = ptap->apa;
2101: api = ap->i;
2102: apj = ap->j;
2103: for (i=0; i<am; i++) {
2104: /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
2105: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2106: apnz = api[i+1] - api[i];
2107: for (j=0; j<apnz; j++) {
2108: col = apj[j+api[i]];
2109: ap->a[j+ap->i[i]] = apa[col];
2110: apa[col] = 0.0;
2111: }
2112: }
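/* AProw_nonscalable() scatters A[i,:]*P into the dense work array apa of length pN (the reason this
 * variant is labeled nonscalable); the symbolic pattern apj then gathers the nonzeros back into
 * AP_loc and resets the touched entries of apa to zero for the next row. */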
2114: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2115: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
2116: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
2117: C_loc = ptap->C_loc;
2118: C_oth = ptap->C_oth;
2120: /* add C_loc and C_oth to C */
2121: MatGetOwnershipRange(C,&rstart,&rend);
2123: /* C_loc -> C */
2124: cm = C_loc->rmap->N;
2125: c_seq = (Mat_SeqAIJ*)C_loc->data;
2126: cols = c_seq->j;
2127: vals = c_seq->a;
2130: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2131: /* there are no off-processor parts. */
2132: /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2133: /* no longer holds. In that case the more general MatSetValues_MPIAIJ() must be used, which looks up each column index in */
2134: /* a table and performs the additional bookkeeping that entails. */
2135: if (C->assembled) {
2136: C->was_assembled = PETSC_TRUE;
2137: C->assembled = PETSC_FALSE;
2138: }
2139: if (C->was_assembled) {
2140: for (i=0; i<cm; i++) {
2141: ncols = c_seq->i[i+1] - c_seq->i[i];
2142: row = rstart + i;
2143: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
2144: cols += ncols; vals += ncols;
2145: }
2146: } else {
2147: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
2148: }
2150: /* Co -> C, off-processor part */
2151: cm = C_oth->rmap->N;
2152: c_seq = (Mat_SeqAIJ*)C_oth->data;
2153: cols = c_seq->j;
2154: vals = c_seq->a;
2155: for (i=0; i<cm; i++) {
2156: ncols = c_seq->i[i+1] - c_seq->i[i];
2157: row = p->garray[i];
2158: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2159: cols += ncols; vals += ncols;
2160: }
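/* Rows of C_oth belong to other processes: p->garray[i] is the global row index of C that the i-th
 * off-diagonal column of P corresponds to, so MatSetValues() with ADD_VALUES stashes these values
 * and the assembly below communicates them to their owners. */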
2162: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2163: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2165: ptap->reuse = MAT_REUSE_MATRIX;
2167: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
2168: if (ptap->freestruct) {
2169: MatFreeIntermediateDataStructures(C);
2170: }
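/* ptap->freestruct is set via -mat_freeintermediatedatastructures (see the error message at the top
 * of this routine). Once the struct is freed, a later call with MAT_REUSE_MATRIX fails with that
 * "PtAP cannot be reused" error, since AP_loc and the other intermediate products are gone. */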
2171: return(0);
2172: }