/* Actual source code: mpimatmatmult.c (petsc-3.6.4, 2016-04-12) */
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
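/*
   Illustrative usage sketch (the matrix names A, B, C are placeholders, not taken from this file):
   callers normally go through the public MatMatMult() interface, which dispatches to the
   MPIAIJ routines defined below.

     Mat            A,B,C;
     PetscErrorCode ierr;
     ... create and assemble A and B as MATMPIAIJ ...
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ... change the numerical values of A or B, keeping their nonzero patterns ...
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/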
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc/private/vecimpl.h>
15: PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
18: const char *algTypes[2] = {"scalable","nonscalable"};
19: PetscInt alg=0; /* set default algorithm */
22: if (scall == MAT_INITIAL_MATRIX) {
23: PetscObjectOptionsBegin((PetscObject)A);
24: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);
25: PetscOptionsEnd();
27: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
28: switch (alg) {
29: case 1:
30: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
31: break;
32: default:
33: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
34: break;
35: }
36: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
37: }
38: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
39: (*(*C)->ops->matmultnumeric)(A,B,*C);
40: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
41: return(0);
42: }
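/*
   Note: the symbolic algorithm used above is chosen at runtime through the option
   registered with PetscOptionsEList(), e.g.
     -matmatmult_via scalable      (default; sparse accumulator, MatMatMultSymbolic_MPIAIJ_MPIAIJ)
     -matmatmult_via nonscalable   (dense accumulator of length P->cmap->N, MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable)
*/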
46: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
47: {
49: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
50: Mat_PtAPMPI *ptap = a->ptap;
53: PetscFree2(ptap->startsj_s,ptap->startsj_r);
54: PetscFree(ptap->bufa);
55: MatDestroy(&ptap->P_loc);
56: MatDestroy(&ptap->P_oth);
57: MatDestroy(&ptap->Pt);
58: PetscFree(ptap->api);
59: PetscFree(ptap->apj);
60: PetscFree(ptap->apa);
61: ptap->destroy(A);
62: PetscFree(ptap);
63: return(0);
64: }
68: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
69: {
71: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
72: Mat_PtAPMPI *ptap = a->ptap;
75: (*ptap->duplicate)(A,op,M);
77: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
78: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
79: return(0);
80: }
84: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
85: {
87: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
88: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
89: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
90: PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj;
91: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
92: Mat_SeqAIJ *p_loc,*p_oth;
93: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
94: PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
95: PetscInt cm =C->rmap->n,anz,pnz;
96: Mat_PtAPMPI *ptap=c->ptap;
97: PetscInt *api,*apj,*apJ,i,j,k,row;
98: PetscInt cstart=C->cmap->rstart;
99: PetscInt cdnz,conz,k0,k1;
100: MPI_Comm comm;
101: PetscMPIInt size;
104: PetscObjectGetComm((PetscObject)A,&comm);
105: MPI_Comm_size(comm,&size);
107: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
108: /*-----------------------------------------------------*/
109: /* update numerical values of P_oth and P_loc */
110: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
111: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
113: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
114: /*----------------------------------------------------------*/
115: /* get data from symbolic products */
116: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
117: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
118: if (size >1) {
119: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
120: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
121: } else {
122: pi_oth=NULL; pj_oth=NULL; pa_oth=NULL;
123: }
125: /* get apa for storing dense row A[i,:]*P */
126: apa = ptap->apa;
128: api = ptap->api;
129: apj = ptap->apj;
130: for (i=0; i<cm; i++) {
131: /* diagonal portion of A */
132: anz = adi[i+1] - adi[i];
133: adj = ad->j + adi[i];
134: ada = ad->a + adi[i];
135: for (j=0; j<anz; j++) {
136: row = adj[j];
137: pnz = pi_loc[row+1] - pi_loc[row];
138: pj = pj_loc + pi_loc[row];
139: pa = pa_loc + pi_loc[row];
141: /* perform dense axpy */
142: valtmp = ada[j];
143: for (k=0; k<pnz; k++) {
144: apa[pj[k]] += valtmp*pa[k];
145: }
146: PetscLogFlops(2.0*pnz);
147: }
149: /* off-diagonal portion of A */
150: anz = aoi[i+1] - aoi[i];
151: aoj = ao->j + aoi[i];
152: aoa = ao->a + aoi[i];
153: for (j=0; j<anz; j++) {
154: row = aoj[j];
155: pnz = pi_oth[row+1] - pi_oth[row];
156: pj = pj_oth + pi_oth[row];
157: pa = pa_oth + pi_oth[row];
159: /* perform dense axpy */
160: valtmp = aoa[j];
161: for (k=0; k<pnz; k++) {
162: apa[pj[k]] += valtmp*pa[k];
163: }
164: PetscLogFlops(2.0*pnz);
165: }
167: /* set values in C */
168: apJ = apj + api[i];
169: cdnz = cd->i[i+1] - cd->i[i];
170: conz = co->i[i+1] - co->i[i];
172:     /* 1st off-diagonal part of C */
173: ca = coa + co->i[i];
174: k = 0;
175: for (k0=0; k0<conz; k0++) {
176: if (apJ[k] >= cstart) break;
177: ca[k0] = apa[apJ[k]];
178: apa[apJ[k]] = 0.0;
179: k++;
180: }
182: /* diagonal part of C */
183: ca = cda + cd->i[i];
184: for (k1=0; k1<cdnz; k1++) {
185: ca[k1] = apa[apJ[k]];
186: apa[apJ[k]] = 0.0;
187: k++;
188: }
190:     /* 2nd off-diagonal part of C */
191: ca = coa + co->i[i];
192: for (; k0<conz; k0++) {
193: ca[k0] = apa[apJ[k]];
194: apa[apJ[k]] = 0.0;
195: k++;
196: }
197: }
198: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
199: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
200: return(0);
201: }
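/*
   Nonscalable symbolic phase: the row structure of C = A*P is built with a condensed
   linked list plus a PetscBT bitmap spanning all P->cmap->N global columns, and the
   dense accumulator apa reused by the numeric phase is allocated here.
*/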
205: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
206: {
207: PetscErrorCode ierr;
208: MPI_Comm comm;
209: PetscMPIInt size;
210: Mat Cmpi;
211: Mat_PtAPMPI *ptap;
212: PetscFreeSpaceList free_space=NULL,current_space=NULL;
213: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
214: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
215: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
216: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
217: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
218: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
219: PetscBT lnkbt;
220: PetscScalar *apa;
221: PetscReal afill;
222: PetscInt nlnk_max,armax,prmax;
225: PetscObjectGetComm((PetscObject)A,&comm);
226: MPI_Comm_size(comm,&size);
228: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
229: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
230: }
231:
232:   /* create struct Mat_PtAPMPI and attach it to C later */
233: PetscNew(&ptap);
235: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
236: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
238: /* get P_loc by taking all local rows of P */
239: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
241: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
242: pi_loc = p_loc->i; pj_loc = p_loc->j;
243: if (size > 1) {
244: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
245: pi_oth = p_oth->i; pj_oth = p_oth->j;
246: } else {
247: p_oth = NULL;
248: pi_oth = NULL; pj_oth = NULL;
249: }
251: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
252: /*-------------------------------------------------------------------*/
253: PetscMalloc1(am+2,&api);
254: ptap->api = api;
255: api[0] = 0;
257: /* create and initialize a linked list */
258: armax = ad->rmax+ao->rmax;
259: if (size >1) {
260: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
261: } else {
262: prmax = p_loc->rmax;
263: }
264: nlnk_max = armax*prmax;
265: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
266: PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);
268: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
269: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
271: current_space = free_space;
273: MatPreallocateInitialize(comm,am,pn,dnz,onz);
274: for (i=0; i<am; i++) {
275: /* diagonal portion of A */
276: nzi = adi[i+1] - adi[i];
277: for (j=0; j<nzi; j++) {
278: row = *adj++;
279: pnz = pi_loc[row+1] - pi_loc[row];
280: Jptr = pj_loc + pi_loc[row];
281: /* add non-zero cols of P into the sorted linked list lnk */
282: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
283: }
284: /* off-diagonal portion of A */
285: nzi = aoi[i+1] - aoi[i];
286: for (j=0; j<nzi; j++) {
287: row = *aoj++;
288: pnz = pi_oth[row+1] - pi_oth[row];
289: Jptr = pj_oth + pi_oth[row];
290: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
291: }
293: apnz = lnk[0];
294: api[i+1] = api[i] + apnz;
296: /* if free space is not available, double the total space in the list */
297: if (current_space->local_remaining<apnz) {
298:       PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
299: nspacedouble++;
300: }
302: /* Copy data into free space, then initialize lnk */
303: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
304: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
306: current_space->array += apnz;
307: current_space->local_used += apnz;
308: current_space->local_remaining -= apnz;
309: }
311: /* Allocate space for apj, initialize apj, and */
312: /* destroy list of free space and other temporary array(s) */
313: PetscMalloc1(api[am]+1,&ptap->apj);
314: apj = ptap->apj;
315: PetscFreeSpaceContiguous(&free_space,ptap->apj);
316: PetscLLDestroy(lnk,lnkbt);
318: /* malloc apa to store dense row A[i,:]*P */
319: PetscCalloc1(pN,&apa);
321: ptap->apa = apa;
323: /* create and assemble symbolic parallel matrix Cmpi */
324: /*----------------------------------------------------*/
325: MatCreate(comm,&Cmpi);
326: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
327: MatSetBlockSizesFromMats(Cmpi,A,P);
329: MatSetType(Cmpi,MATMPIAIJ);
330: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
331: MatPreallocateFinalize(dnz,onz);
332: for (i=0; i<am; i++) {
333: row = i + rstart;
334: apnz = api[i+1] - api[i];
335: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
336: apj += apnz;
337: }
338: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
339: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
341: ptap->destroy = Cmpi->ops->destroy;
342: ptap->duplicate = Cmpi->ops->duplicate;
343: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
344: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
345: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
347: /* attach the supporting struct to Cmpi for reuse */
348: c = (Mat_MPIAIJ*)Cmpi->data;
349: c->ptap = ptap;
351: *C = Cmpi;
353: /* set MatInfo */
354: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
355: if (afill < 1.0) afill = 1.0;
356: Cmpi->info.mallocs = nspacedouble;
357: Cmpi->info.fill_ratio_given = fill;
358: Cmpi->info.fill_ratio_needed = afill;
360: #if defined(PETSC_USE_INFO)
361: if (api[am]) {
362: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
363:     PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
364: } else {
365: PetscInfo(Cmpi,"Empty matrix product\n");
366: }
367: #endif
368: return(0);
369: }
373: PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
374: {
378: if (scall == MAT_INITIAL_MATRIX) {
379: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
380: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
381: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
382: }
383: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
384: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
385: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
386: return(0);
387: }
389: typedef struct {
390: Mat workB;
391: PetscScalar *rvalues,*svalues;
392: MPI_Request *rwaits,*swaits;
393: } MPIAIJ_MPIDense;
397: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
398: {
399: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
400: PetscErrorCode ierr;
403: MatDestroy(&contents->workB);
404: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
405: PetscFree(contents);
406: return(0);
407: }
411: /*
412: This is a "dummy function" that handles the case where matrix C was created as a dense matrix
413:    directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option.
415:    It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C.
416: */
417: PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
418: {
419: PetscErrorCode ierr;
420: PetscBool flg;
421: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
422: PetscInt nz = aij->B->cmap->n;
423: PetscContainer container;
424: MPIAIJ_MPIDense *contents;
425: VecScatter ctx = aij->Mvctx;
426: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
427: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
430: PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);
431: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
433:   /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
434: PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);
435: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
437: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
439: PetscNew(&contents);
440: /* Create work matrix used to store off processor rows of B needed for local product */
441: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
442: /* Create work arrays needed */
443: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
444: B->cmap->N*to->starts[to->n],&contents->svalues,
445: from->n,&contents->rwaits,
446: to->n,&contents->swaits);
448: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
449: PetscContainerSetPointer(container,contents);
450: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
451: PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);
452: PetscContainerDestroy(&container);
454: (*C->ops->matmultnumeric)(A,B,C);
455: return(0);
456: }
460: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
461: {
462: PetscErrorCode ierr;
463: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
464: PetscInt nz = aij->B->cmap->n;
465: PetscContainer container;
466: MPIAIJ_MPIDense *contents;
467: VecScatter ctx = aij->Mvctx;
468: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
469: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
470: PetscInt m = A->rmap->n,n=B->cmap->n;
473: MatCreate(PetscObjectComm((PetscObject)B),C);
474: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
475: MatSetBlockSizesFromMats(*C,A,B);
476: MatSetType(*C,MATMPIDENSE);
477: MatMPIDenseSetPreallocation(*C,NULL);
478: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
479: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
481: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
483: PetscNew(&contents);
484: /* Create work matrix used to store off processor rows of B needed for local product */
485: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
486: /* Create work arrays needed */
487: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
488: B->cmap->N*to->starts[to->n],&contents->svalues,
489: from->n,&contents->rwaits,
490: to->n,&contents->swaits);
492: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
493: PetscContainerSetPointer(container,contents);
494: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
495: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
496: PetscContainerDestroy(&container);
497: return(0);
498: }
502: /*
503: Performs an efficient scatter on the rows of B needed by this process; this is
504: a modification of the VecScatterBegin_() routines.
505: */
506: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
507: {
508: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
509: PetscErrorCode ierr;
510: PetscScalar *b,*w,*svalues,*rvalues;
511: VecScatter ctx = aij->Mvctx;
512: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
513: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
514: PetscInt i,j,k;
515: PetscInt *sindices,*sstarts,*rindices,*rstarts;
516: PetscMPIInt *sprocs,*rprocs,nrecvs;
517: MPI_Request *swaits,*rwaits;
518: MPI_Comm comm;
519: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
520: MPI_Status status;
521: MPIAIJ_MPIDense *contents;
522: PetscContainer container;
523: Mat workB;
526: PetscObjectGetComm((PetscObject)A,&comm);
527: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
528: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
529: PetscContainerGetPointer(container,(void**)&contents);
531: workB = *outworkB = contents->workB;
532:   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
533: sindices = to->indices;
534: sstarts = to->starts;
535: sprocs = to->procs;
536: swaits = contents->swaits;
537: svalues = contents->svalues;
539: rindices = from->indices;
540: rstarts = from->starts;
541: rprocs = from->procs;
542: rwaits = contents->rwaits;
543: rvalues = contents->rvalues;
545: MatDenseGetArray(B,&b);
546: MatDenseGetArray(workB,&w);
548: for (i=0; i<from->n; i++) {
549: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
550: }
552: for (i=0; i<to->n; i++) {
553: /* pack a message at a time */
554: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
555: for (k=0; k<ncols; k++) {
556: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
557: }
558: }
559: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
560: }
562: nrecvs = from->n;
563: while (nrecvs) {
564: MPI_Waitany(from->n,rwaits,&imdex,&status);
565: nrecvs--;
566: /* unpack a message at a time */
567: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
568: for (k=0; k<ncols; k++) {
569: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
570: }
571: }
572: }
573: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
575: MatDenseRestoreArray(B,&b);
576: MatDenseRestoreArray(workB,&w);
577: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
578: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
579: return(0);
580: }
581: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
585: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
586: {
588: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
589: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
590: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
591: Mat workB;
594: /* diagonal block of A times all local rows of B*/
595: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
597: /* get off processor parts of B needed to complete the product */
598: MatMPIDenseScatter(A,B,C,&workB);
600: /* off-diagonal block of A times nonlocal rows of B */
601: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
602: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
603: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
604: return(0);
605: }
609: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
610: {
612: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
613: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
614: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
615: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
616: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
617: Mat_SeqAIJ *p_loc,*p_oth;
618: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
619: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
620: PetscInt cm = C->rmap->n,anz,pnz;
621: Mat_PtAPMPI *ptap = c->ptap;
622: PetscScalar *apa_sparse = ptap->apa;
623: PetscInt *api,*apj,*apJ,i,j,k,row;
624: PetscInt cstart = C->cmap->rstart;
625: PetscInt cdnz,conz,k0,k1,nextp;
626: MPI_Comm comm;
627: PetscMPIInt size;
630: PetscObjectGetComm((PetscObject)A,&comm);
631: MPI_Comm_size(comm,&size);
633: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
634: /*-----------------------------------------------------*/
635: /* update numerical values of P_oth and P_loc */
636: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
637: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
639: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
640: /*----------------------------------------------------------*/
641: /* get data from symbolic products */
642: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
643: pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
644: if (size >1) {
645: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
646: pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
647: } else {
648: p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
649: }
651: api = ptap->api;
652: apj = ptap->apj;
653: for (i=0; i<cm; i++) {
654: apJ = apj + api[i];
656: /* diagonal portion of A */
657: anz = adi[i+1] - adi[i];
658: adj = ad->j + adi[i];
659: ada = ad->a + adi[i];
660: for (j=0; j<anz; j++) {
661: row = adj[j];
662: pnz = pi_loc[row+1] - pi_loc[row];
663: pj = pj_loc + pi_loc[row];
664: pa = pa_loc + pi_loc[row];
665: /* perform sparse axpy */
666: valtmp = ada[j];
667: nextp = 0;
668: for (k=0; nextp<pnz; k++) {
669: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
670: apa_sparse[k] += valtmp*pa[nextp++];
671: }
672: }
673: PetscLogFlops(2.0*pnz);
674: }
676: /* off-diagonal portion of A */
677: anz = aoi[i+1] - aoi[i];
678: aoj = ao->j + aoi[i];
679: aoa = ao->a + aoi[i];
680: for (j=0; j<anz; j++) {
681: row = aoj[j];
682: pnz = pi_oth[row+1] - pi_oth[row];
683: pj = pj_oth + pi_oth[row];
684: pa = pa_oth + pi_oth[row];
685: /* perform sparse axpy */
686: valtmp = aoa[j];
687: nextp = 0;
688: for (k=0; nextp<pnz; k++) {
689: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
690: apa_sparse[k] += valtmp*pa[nextp++];
691: }
692: }
693: PetscLogFlops(2.0*pnz);
694: }
696: /* set values in C */
697: cdnz = cd->i[i+1] - cd->i[i];
698: conz = co->i[i+1] - co->i[i];
700:     /* 1st off-diagonal part of C */
701: ca = coa + co->i[i];
702: k = 0;
703: for (k0=0; k0<conz; k0++) {
704: if (apJ[k] >= cstart) break;
705: ca[k0] = apa_sparse[k];
706: apa_sparse[k] = 0.0;
707: k++;
708: }
710: /* diagonal part of C */
711: ca = cda + cd->i[i];
712: for (k1=0; k1<cdnz; k1++) {
713: ca[k1] = apa_sparse[k];
714: apa_sparse[k] = 0.0;
715: k++;
716: }
718:     /* 2nd off-diagonal part of C */
719: ca = coa + co->i[i];
720: for (; k0<conz; k0++) {
721: ca[k0] = apa_sparse[k];
722: apa_sparse[k] = 0.0;
723: k++;
724: }
725: }
726: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
727: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
728: return(0);
729: }
731: /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable PetscLLCondensed routines to avoid the O(N) memory requirement (N = global number of columns of B) */
734: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
735: {
736: PetscErrorCode ierr;
737: MPI_Comm comm;
738: PetscMPIInt size;
739: Mat Cmpi;
740: Mat_PtAPMPI *ptap;
741: PetscFreeSpaceList free_space = NULL,current_space=NULL;
742: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
743: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
744: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
745: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
746: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
747: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
748: PetscInt nlnk_max,armax,prmax;
749: PetscReal afill;
750: PetscScalar *apa;
753: PetscObjectGetComm((PetscObject)A,&comm);
754: MPI_Comm_size(comm,&size);
756:   /* create struct Mat_PtAPMPI and attach it to C later */
757: PetscNew(&ptap);
759: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
760: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
761:
762: /* get P_loc by taking all local rows of P */
763: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
765: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
766: pi_loc = p_loc->i; pj_loc = p_loc->j;
767: if (size > 1) {
768: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
769: pi_oth = p_oth->i; pj_oth = p_oth->j;
770: } else {
771: p_oth = NULL;
772: pi_oth = NULL; pj_oth = NULL;
773: }
775: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
776: /*-------------------------------------------------------------------*/
777: PetscMalloc1(am+2,&api);
778: ptap->api = api;
779: api[0] = 0;
781: /* create and initialize a linked list */
782: armax = ad->rmax+ao->rmax;
783: if (size >1) {
784: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
785: } else {
786: prmax = p_loc->rmax;
787: }
788: nlnk_max = armax*prmax;
789: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
790: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
792: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
793: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
795: current_space = free_space;
797: MatPreallocateInitialize(comm,am,pn,dnz,onz);
798: for (i=0; i<am; i++) {
799: /* diagonal portion of A */
800: nzi = adi[i+1] - adi[i];
801: for (j=0; j<nzi; j++) {
802: row = *adj++;
803: pnz = pi_loc[row+1] - pi_loc[row];
804: Jptr = pj_loc + pi_loc[row];
805: /* add non-zero cols of P into the sorted linked list lnk */
806: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
807: }
808: /* off-diagonal portion of A */
809: nzi = aoi[i+1] - aoi[i];
810: for (j=0; j<nzi; j++) {
811: row = *aoj++;
812: pnz = pi_oth[row+1] - pi_oth[row];
813: Jptr = pj_oth + pi_oth[row];
814: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
815: }
817: apnz = *lnk;
818: api[i+1] = api[i] + apnz;
819: if (apnz > apnz_max) apnz_max = apnz;
821: /* if free space is not available, double the total space in the list */
822: if (current_space->local_remaining<apnz) {
823:       PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
824: nspacedouble++;
825: }
827: /* Copy data into free space, then initialize lnk */
828: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
829: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
831: current_space->array += apnz;
832: current_space->local_used += apnz;
833: current_space->local_remaining -= apnz;
834: }
836: /* Allocate space for apj, initialize apj, and */
837: /* destroy list of free space and other temporary array(s) */
838: PetscMalloc1(api[am]+1,&ptap->apj);
839: apj = ptap->apj;
840: PetscFreeSpaceContiguous(&free_space,ptap->apj);
841: PetscLLCondensedDestroy_Scalable(lnk);
843: /* create and assemble symbolic parallel matrix Cmpi */
844: /*----------------------------------------------------*/
845: MatCreate(comm,&Cmpi);
846: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
847: MatSetBlockSizesFromMats(Cmpi,A,P);
848: MatSetType(Cmpi,MATMPIAIJ);
849: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
850: MatPreallocateFinalize(dnz,onz);
852:   /* malloc apa for assembling Cmpi */
853: PetscCalloc1(apnz_max,&apa);
855: ptap->apa = apa;
856: for (i=0; i<am; i++) {
857: row = i + rstart;
858: apnz = api[i+1] - api[i];
859: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
860: apj += apnz;
861: }
862: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
863: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
865: ptap->destroy = Cmpi->ops->destroy;
866: ptap->duplicate = Cmpi->ops->duplicate;
867: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
868: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
869: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
871: /* attach the supporting struct to Cmpi for reuse */
872: c = (Mat_MPIAIJ*)Cmpi->data;
873: c->ptap = ptap;
875: *C = Cmpi;
877: /* set MatInfo */
878: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
879: if (afill < 1.0) afill = 1.0;
880: Cmpi->info.mallocs = nspacedouble;
881: Cmpi->info.fill_ratio_given = fill;
882: Cmpi->info.fill_ratio_needed = afill;
884: #if defined(PETSC_USE_INFO)
885: if (api[am]) {
886: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
887:     PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
888: } else {
889: PetscInfo(Cmpi,"Empty matrix product\n");
890: }
891: #endif
892: return(0);
893: }
895: /*-------------------------------------------------------------------------*/
898: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
899: {
901: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
902: PetscInt alg=0; /* set default algorithm */
905: if (scall == MAT_INITIAL_MATRIX) {
906: PetscObjectOptionsBegin((PetscObject)A);
907: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);
908: PetscOptionsEnd();
910: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
911: switch (alg) {
912: case 1:
913: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
914: break;
915: case 2:
916: {
917: Mat Pt;
918: Mat_PtAPMPI *ptap;
919: Mat_MPIAIJ *c;
920: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
921: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
922: c = (Mat_MPIAIJ*)(*C)->data;
923: ptap = c->ptap;
924: ptap->Pt = Pt;
925: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
926: return(0);
927: }
928: break;
929: default:
930: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
931: break;
932: }
933: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
934: }
935: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
936: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
937: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
938: return(0);
939: }
941: /* This routine only works when scall=MAT_REUSE_MATRIX! */
944: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
945: {
947: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
948: Mat_PtAPMPI *ptap= c->ptap;
949: Mat Pt=ptap->Pt;
952: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
953: MatMatMultNumeric(Pt,A,C);
954: return(0);
955: }
957: /* Non-scalable version, use dense axpy */
960: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
961: {
962: PetscErrorCode ierr;
963: Mat_Merge_SeqsToMPI *merge;
964: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
965: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
966: Mat_PtAPMPI *ptap;
967: PetscInt *adj,*aJ;
968: PetscInt i,j,k,anz,pnz,row,*cj;
969: MatScalar *ada,*aval,*ca,valtmp;
970: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
971: MPI_Comm comm;
972: PetscMPIInt size,rank,taga,*len_s;
973: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
974: PetscInt **buf_ri,**buf_rj;
975: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
976: MPI_Request *s_waits,*r_waits;
977: MPI_Status *status;
978: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
979: PetscInt *ai,*aj,*coi,*coj;
980: PetscInt *poJ,*pdJ;
981: Mat A_loc;
982: Mat_SeqAIJ *a_loc;
985: PetscObjectGetComm((PetscObject)C,&comm);
986: MPI_Comm_size(comm,&size);
987: MPI_Comm_rank(comm,&rank);
989: ptap = c->ptap;
990: merge = ptap->merge;
992:   /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
993: /*--------------------------------------------------------------*/
994: /* get data from symbolic products */
995: coi = merge->coi; coj = merge->coj;
996: PetscCalloc1(coi[pon]+1,&coa);
998: bi = merge->bi; bj = merge->bj;
999: owners = merge->rowmap->range;
1000: PetscCalloc1(bi[cm]+1,&ba);
1002: /* get A_loc by taking all local rows of A */
1003: A_loc = ptap->A_loc;
1004: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1005: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1006: ai = a_loc->i;
1007: aj = a_loc->j;
1009: PetscCalloc1(A->cmap->N,&aval); /* non-scalable!!! */
1011: for (i=0; i<am; i++) {
1012: /* 2-a) put A[i,:] to dense array aval */
1013: anz = ai[i+1] - ai[i];
1014: adj = aj + ai[i];
1015: ada = a_loc->a + ai[i];
1016: for (j=0; j<anz; j++) {
1017: aval[adj[j]] = ada[j];
1018: }
1020: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1021: /*--------------------------------------------------------------*/
1022: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1023: pnz = po->i[i+1] - po->i[i];
1024: poJ = po->j + po->i[i];
1025: pA = po->a + po->i[i];
1026: for (j=0; j<pnz; j++) {
1027: row = poJ[j];
1028: cnz = coi[row+1] - coi[row];
1029: cj = coj + coi[row];
1030: ca = coa + coi[row];
1031: /* perform dense axpy */
1032: valtmp = pA[j];
1033: for (k=0; k<cnz; k++) {
1034: ca[k] += valtmp*aval[cj[k]];
1035: }
1036: PetscLogFlops(2.0*cnz);
1037: }
1039: /* put the value into Cd (diagonal part) */
1040: pnz = pd->i[i+1] - pd->i[i];
1041: pdJ = pd->j + pd->i[i];
1042: pA = pd->a + pd->i[i];
1043: for (j=0; j<pnz; j++) {
1044: row = pdJ[j];
1045: cnz = bi[row+1] - bi[row];
1046: cj = bj + bi[row];
1047: ca = ba + bi[row];
1048: /* perform dense axpy */
1049: valtmp = pA[j];
1050: for (k=0; k<cnz; k++) {
1051: ca[k] += valtmp*aval[cj[k]];
1052: }
1053: PetscLogFlops(2.0*cnz);
1054: }
1056: /* zero the current row of Pt*A */
1057: aJ = aj + ai[i];
1058: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1059: }
1061: /* 3) send and recv matrix values coa */
1062: /*------------------------------------*/
1063: buf_ri = merge->buf_ri;
1064: buf_rj = merge->buf_rj;
1065: len_s = merge->len_s;
1066: PetscCommGetNewTag(comm,&taga);
1067: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1069: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1070: for (proc=0,k=0; proc<size; proc++) {
1071: if (!len_s[proc]) continue;
1072: i = merge->owners_co[proc];
1073: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1074: k++;
1075: }
1076: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1077: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1079: PetscFree2(s_waits,status);
1080: PetscFree(r_waits);
1081: PetscFree(coa);
1083: /* 4) insert local Cseq and received values into Cmpi */
1084: /*----------------------------------------------------*/
1085: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1086: for (k=0; k<merge->nrecv; k++) {
1087:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1088:     nrows = *(buf_ri_k[k]);
1089:     nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
1090:     nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1091: }
1093: for (i=0; i<cm; i++) {
1094: row = owners[rank] + i; /* global row index of C_seq */
1095: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1096: ba_i = ba + bi[i];
1097: bnz = bi[i+1] - bi[i];
1098: /* add received vals into ba */
1099: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1100: /* i-th row */
1101: if (i == *nextrow[k]) {
1102: cnz = *(nextci[k]+1) - *nextci[k];
1103: cj = buf_rj[k] + *(nextci[k]);
1104: ca = abuf_r[k] + *(nextci[k]);
1105: nextcj = 0;
1106: for (j=0; nextcj<cnz; j++) {
1107: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1108: ba_i[j] += ca[nextcj++];
1109: }
1110: }
1111: nextrow[k]++; nextci[k]++;
1112: PetscLogFlops(2.0*cnz);
1113: }
1114: }
1115: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1116: }
1117: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1118: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1120: PetscFree(ba);
1121: PetscFree(abuf_r[0]);
1122: PetscFree(abuf_r);
1123: PetscFree3(buf_ri_k,nextrow,nextci);
1124: PetscFree(aval);
1125: return(0);
1126: }
1128: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1129: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1132: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1133: {
1134: PetscErrorCode ierr;
1135: Mat Cmpi,A_loc,POt,PDt;
1136: Mat_PtAPMPI *ptap;
1137: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1138: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1139: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1140: PetscInt nnz;
1141: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1142: PetscInt am=A->rmap->n,pn=P->cmap->n;
1143: PetscBT lnkbt;
1144: MPI_Comm comm;
1145: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1146: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1147: PetscInt len,proc,*dnz,*onz,*owners;
1148: PetscInt nzi,*bi,*bj;
1149: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1150: MPI_Request *swaits,*rwaits;
1151: MPI_Status *sstatus,rstatus;
1152: Mat_Merge_SeqsToMPI *merge;
1153: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1154: PetscReal afill =1.0,afill_tmp;
1155: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1156: PetscScalar *vals;
1157: Mat_SeqAIJ *a_loc, *pdt,*pot;
1160: PetscObjectGetComm((PetscObject)A,&comm);
1161: /* check if matrix local sizes are compatible */
1162: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1163: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1164: }
1166: MPI_Comm_size(comm,&size);
1167: MPI_Comm_rank(comm,&rank);
1169:   /* create struct Mat_PtAPMPI and attach it to C later */
1170: PetscNew(&ptap);
1172: /* get A_loc by taking all local rows of A */
1173: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1175: ptap->A_loc = A_loc;
1177: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1178: ai = a_loc->i;
1179: aj = a_loc->j;
1181: /* determine symbolic Co=(p->B)^T*A - send to others */
1182: /*----------------------------------------------------*/
1183: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1184: pdt = (Mat_SeqAIJ*)PDt->data;
1185: pdti = pdt->i; pdtj = pdt->j;
1187: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1188: pot = (Mat_SeqAIJ*)POt->data;
1189: poti = pot->i; potj = pot->j;
1191: /* then, compute symbolic Co = (p->B)^T*A */
1192: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1193: PetscMalloc1(pon+1,&coi);
1194: coi[0] = 0;
1196: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1197: nnz = fill*(poti[pon] + ai[am]);
1198: PetscFreeSpaceGet(nnz,&free_space);
1199: current_space = free_space;
1201: /* create and initialize a linked list */
1202: i = PetscMax(pdt->rmax,pot->rmax);
1203: Crmax = i*a_loc->rmax*size;
1204: if (!Crmax || Crmax > aN) Crmax = aN;
1205: PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);
1207: for (i=0; i<pon; i++) {
1208: pnz = poti[i+1] - poti[i];
1209: ptJ = potj + poti[i];
1210: for (j=0; j<pnz; j++) {
1211: row = ptJ[j]; /* row of A_loc == col of Pot */
1212: anz = ai[row+1] - ai[row];
1213: Jptr = aj + ai[row];
1214: /* add non-zero cols of AP into the sorted linked list lnk */
1215: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1216: }
1217: nnz = lnk[0];
1219: /* If free space is not available, double the total space in the list */
1220: if (current_space->local_remaining<nnz) {
1221:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1222: nspacedouble++;
1223: }
1225:     /* Copy data into free space, and zero out dense rows */
1226: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1228: current_space->array += nnz;
1229: current_space->local_used += nnz;
1230: current_space->local_remaining -= nnz;
1232: coi[i+1] = coi[i] + nnz;
1233: }
1235: PetscMalloc1(coi[pon]+1,&coj);
1236: PetscFreeSpaceContiguous(&free_space,coj);
1238: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1239: if (afill_tmp > afill) afill = afill_tmp;
1241: /* send j-array (coj) of Co to other processors */
1242: /*----------------------------------------------*/
1243: /* determine row ownership */
1244: PetscNew(&merge);
1245: PetscLayoutCreate(comm,&merge->rowmap);
1247: merge->rowmap->n = pn;
1248: merge->rowmap->bs = 1;
1250: PetscLayoutSetUp(merge->rowmap);
1251: owners = merge->rowmap->range;
1253: /* determine the number of messages to send, their lengths */
1254: PetscCalloc1(size,&len_si);
1255: PetscMalloc1(size,&merge->len_s);
1257: len_s = merge->len_s;
1258: merge->nsend = 0;
1260: PetscMalloc1(size+2,&owners_co);
1261: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1263: proc = 0;
1264: for (i=0; i<pon; i++) {
1265: while (prmap[i] >= owners[proc+1]) proc++;
1266: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1267: len_s[proc] += coi[i+1] - coi[i];
1268: }
1270: len = 0; /* max length of buf_si[] */
1271: owners_co[0] = 0;
1272: for (proc=0; proc<size; proc++) {
1273: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1274: if (len_si[proc]) {
1275: merge->nsend++;
1276: len_si[proc] = 2*(len_si[proc] + 1);
1277: len += len_si[proc];
1278: }
1279: }
1281: /* determine the number and length of messages to receive for coi and coj */
1282: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1283: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1285: /* post the Irecv and Isend of coj */
1286: PetscCommGetNewTag(comm,&tagj);
1287: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1288: PetscMalloc1(merge->nsend+1,&swaits);
1289: for (proc=0, k=0; proc<size; proc++) {
1290: if (!len_s[proc]) continue;
1291: i = owners_co[proc];
1292: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1293: k++;
1294: }
1296: /* receives and sends of coj are complete */
1297: PetscMalloc1(size,&sstatus);
1298: for (i=0; i<merge->nrecv; i++) {
1299: PetscMPIInt icompleted;
1300: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1301: }
1302: PetscFree(rwaits);
1303: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1305: /* send and recv coi */
1306: /*-------------------*/
1307: PetscCommGetNewTag(comm,&tagi);
1308: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1309: PetscMalloc1(len+1,&buf_s);
1310: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1311: for (proc=0,k=0; proc<size; proc++) {
1312: if (!len_s[proc]) continue;
1313: /* form outgoing message for i-structure:
1314: buf_si[0]: nrows to be sent
1315: [1:nrows]: row index (global)
1316: [nrows+1:2*nrows+1]: i-structure index
1317: */
1318: /*-------------------------------------------*/
1319: nrows = len_si[proc]/2 - 1;
1320: buf_si_i = buf_si + nrows+1;
1321: buf_si[0] = nrows;
1322: buf_si_i[0] = 0;
1323: nrows = 0;
1324: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1325: nzi = coi[i+1] - coi[i];
1326: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1327: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1328: nrows++;
1329: }
1330: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1331: k++;
1332: buf_si += len_si[proc];
1333: }
1334: i = merge->nrecv;
1335: while (i--) {
1336: PetscMPIInt icompleted;
1337: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1338: }
1339: PetscFree(rwaits);
1340: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1341: PetscFree(len_si);
1342: PetscFree(len_ri);
1343: PetscFree(swaits);
1344: PetscFree(sstatus);
1345: PetscFree(buf_s);
1347: /* compute the local portion of C (mpi mat) */
1348: /*------------------------------------------*/
1349: /* allocate bi array and free space for accumulating nonzero column info */
1350: PetscMalloc1(pn+1,&bi);
1351: bi[0] = 0;
1353: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1354: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1355: PetscFreeSpaceGet(nnz,&free_space);
1356: current_space = free_space;
1358: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1359: for (k=0; k<merge->nrecv; k++) {
1360:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1361:     nrows = *buf_ri_k[k];
1362:     nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1363:     nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1364: }
1366: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1367: rmax = 0;
1368: for (i=0; i<pn; i++) {
1369: /* add pdt[i,:]*AP into lnk */
1370: pnz = pdti[i+1] - pdti[i];
1371: ptJ = pdtj + pdti[i];
1372: for (j=0; j<pnz; j++) {
1373: row = ptJ[j]; /* row of AP == col of Pt */
1374: anz = ai[row+1] - ai[row];
1375: Jptr = aj + ai[row];
1376: /* add non-zero cols of AP into the sorted linked list lnk */
1377: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1378: }
1380: /* add received col data into lnk */
1381: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1382: if (i == *nextrow[k]) { /* i-th row */
1383: nzi = *(nextci[k]+1) - *nextci[k];
1384: Jptr = buf_rj[k] + *nextci[k];
1385: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1386: nextrow[k]++; nextci[k]++;
1387: }
1388: }
1389: nnz = lnk[0];
1391: /* if free space is not available, make more free space */
1392: if (current_space->local_remaining<nnz) {
1393:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1394: nspacedouble++;
1395: }
1396: /* copy data into free space, then initialize lnk */
1397: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1398: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1400: current_space->array += nnz;
1401: current_space->local_used += nnz;
1402: current_space->local_remaining -= nnz;
1404: bi[i+1] = bi[i] + nnz;
1405: if (nnz > rmax) rmax = nnz;
1406: }
1407: PetscFree3(buf_ri_k,nextrow,nextci);
1409: PetscMalloc1(bi[pn]+1,&bj);
1410: PetscFreeSpaceContiguous(&free_space,bj);
1412: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1413: if (afill_tmp > afill) afill = afill_tmp;
1414: PetscLLCondensedDestroy(lnk,lnkbt);
1415: MatDestroy(&POt);
1416: MatDestroy(&PDt);
1418: /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
1419: /*----------------------------------------------------------------------------------*/
1420: PetscCalloc1(rmax+1,&vals);
1422: MatCreate(comm,&Cmpi);
1423: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1424: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1425: MatSetType(Cmpi,MATMPIAIJ);
1426: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1427: MatPreallocateFinalize(dnz,onz);
1428: MatSetBlockSize(Cmpi,1);
1429: for (i=0; i<pn; i++) {
1430: row = i + rstart;
1431: nnz = bi[i+1] - bi[i];
1432: Jptr = bj + bi[i];
1433: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1434: }
1435: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1436: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1437: PetscFree(vals);
1439: merge->bi = bi;
1440: merge->bj = bj;
1441: merge->coi = coi;
1442: merge->coj = coj;
1443: merge->buf_ri = buf_ri;
1444: merge->buf_rj = buf_rj;
1445: merge->owners_co = owners_co;
1446: merge->destroy = Cmpi->ops->destroy;
1447: merge->duplicate = Cmpi->ops->duplicate;
1449: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1450: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1451: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1453: /* attach the supporting struct to Cmpi for reuse */
1454: c = (Mat_MPIAIJ*)Cmpi->data;
1455: c->ptap = ptap;
1456: ptap->api = NULL;
1457: ptap->apj = NULL;
1458: ptap->merge = merge;
1459: ptap->rmax = rmax;
1461: *C = Cmpi;
1462: #if defined(PETSC_USE_INFO)
1463: if (bi[pn] != 0) {
1464: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1465: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1466: } else {
1467: PetscInfo(Cmpi,"Empty matrix product\n");
1468: }
1469: #endif
1470: return(0);
1471: }
1475: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1476: {
1477: PetscErrorCode ierr;
1478: Mat_Merge_SeqsToMPI *merge;
1479: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1480: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1481: Mat_PtAPMPI *ptap;
1482: PetscInt *adj;
1483: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1484: MatScalar *ada,*ca,valtmp;
1485: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1486: MPI_Comm comm;
1487: PetscMPIInt size,rank,taga,*len_s;
1488: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1489: PetscInt **buf_ri,**buf_rj;
1490: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1491: MPI_Request *s_waits,*r_waits;
1492: MPI_Status *status;
1493: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1494: PetscInt *ai,*aj,*coi,*coj;
1495: PetscInt *poJ,*pdJ;
1496: Mat A_loc;
1497: Mat_SeqAIJ *a_loc;
1500: PetscObjectGetComm((PetscObject)C,&comm);
1501: MPI_Comm_size(comm,&size);
1502: MPI_Comm_rank(comm,&rank);
1504: ptap = c->ptap;
1505: merge = ptap->merge;
1507: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1508: /*------------------------------------------*/
1509: /* get data from symbolic products */
1510: coi = merge->coi; coj = merge->coj;
1511: PetscCalloc1(coi[pon]+1,&coa);
1512: bi = merge->bi; bj = merge->bj;
1513: owners = merge->rowmap->range;
1514: PetscCalloc1(bi[cm]+1,&ba);
1516: /* get A_loc by taking all local rows of A */
1517: A_loc = ptap->A_loc;
1518: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1519: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1520: ai = a_loc->i;
1521: aj = a_loc->j;
1523: for (i=0; i<am; i++) {
1524: anz = ai[i+1] - ai[i];
1525: adj = aj + ai[i];
1526: ada = a_loc->a + ai[i];
1528: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1529: /*-------------------------------------------------------------*/
1530: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1531: pnz = po->i[i+1] - po->i[i];
1532: poJ = po->j + po->i[i];
1533: pA = po->a + po->i[i];
1534: for (j=0; j<pnz; j++) {
1535: row = poJ[j];
1536: cj = coj + coi[row];
1537: ca = coa + coi[row];
1538: /* perform sparse axpy */
1539: nexta = 0;
1540: valtmp = pA[j];
1541: for (k=0; nexta<anz; k++) {
1542: if (cj[k] == adj[nexta]) {
1543: ca[k] += valtmp*ada[nexta];
1544: nexta++;
1545: }
1546: }
1547: PetscLogFlops(2.0*anz);
1548: }
1550: /* put the value into Cd (diagonal part) */
1551: pnz = pd->i[i+1] - pd->i[i];
1552: pdJ = pd->j + pd->i[i];
1553: pA = pd->a + pd->i[i];
1554: for (j=0; j<pnz; j++) {
1555: row = pdJ[j];
1556: cj = bj + bi[row];
1557: ca = ba + bi[row];
1558: /* perform sparse axpy */
1559: nexta = 0;
1560: valtmp = pA[j];
1561: for (k=0; nexta<anz; k++) {
1562: if (cj[k] == adj[nexta]) {
1563: ca[k] += valtmp*ada[nexta];
1564: nexta++;
1565: }
1566: }
1567: PetscLogFlops(2.0*anz);
1568: }
1569: }
1571: /* 3) send and recv matrix values coa */
1572: /*------------------------------------*/
1573: buf_ri = merge->buf_ri;
1574: buf_rj = merge->buf_rj;
1575: len_s = merge->len_s;
1576: PetscCommGetNewTag(comm,&taga);
1577: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1579: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1580: for (proc=0,k=0; proc<size; proc++) {
1581: if (!len_s[proc]) continue;
1582: i = merge->owners_co[proc];
1583: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1584: k++;
1585: }
1586: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1587: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1589: PetscFree2(s_waits,status);
1590: PetscFree(r_waits);
1591: PetscFree(coa);
1593: /* 4) insert local Cseq and received values into Cmpi */
1594: /*----------------------------------------------------*/
1595: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1596: for (k=0; k<merge->nrecv; k++) {
1597:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1598:     nrows = *(buf_ri_k[k]);
1599:     nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
1600:     nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1601: }
1603: for (i=0; i<cm; i++) {
1604: row = owners[rank] + i; /* global row index of C_seq */
1605: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1606: ba_i = ba + bi[i];
1607: bnz = bi[i+1] - bi[i];
1608: /* add received vals into ba */
1609: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1610: /* i-th row */
1611: if (i == *nextrow[k]) {
1612: cnz = *(nextci[k]+1) - *nextci[k];
1613: cj = buf_rj[k] + *(nextci[k]);
1614: ca = abuf_r[k] + *(nextci[k]);
1615: nextcj = 0;
1616: for (j=0; nextcj<cnz; j++) {
1617: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1618: ba_i[j] += ca[nextcj++];
1619: }
1620: }
1621: nextrow[k]++; nextci[k]++;
1622: PetscLogFlops(2.0*cnz);
1623: }
1624: }
1625: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1626: }
1627: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1628: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1630: PetscFree(ba);
1631: PetscFree(abuf_r[0]);
1632: PetscFree(abuf_r);
1633: PetscFree3(buf_ri_k,nextrow,nextci);
1634: return(0);
1635: }
1637: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1638: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1639:    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
1642: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1643: {
1644: PetscErrorCode ierr;
1645: Mat Cmpi,A_loc,POt,PDt;
1646: Mat_PtAPMPI *ptap;
1647: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1648: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1649: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1650: PetscInt nnz;
1651: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1652: PetscInt am =A->rmap->n,pn=P->cmap->n;
1653: MPI_Comm comm;
1654: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1655: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1656: PetscInt len,proc,*dnz,*onz,*owners;
1657: PetscInt nzi,*bi,*bj;
1658: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1659: MPI_Request *swaits,*rwaits;
1660: MPI_Status *sstatus,rstatus;
1661: Mat_Merge_SeqsToMPI *merge;
1662: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1663: PetscReal afill =1.0,afill_tmp;
1664: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1665: PetscScalar *vals;
1666: Mat_SeqAIJ *a_loc, *pdt,*pot;
1669: PetscObjectGetComm((PetscObject)A,&comm);
1670: /* check if matrix local sizes are compatible */
1671: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1672: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1673: }
1675: MPI_Comm_size(comm,&size);
1676: MPI_Comm_rank(comm,&rank);
1678:   /* create struct Mat_PtAPMPI and attach it to C later */
1679: PetscNew(&ptap);
1681: /* get A_loc by taking all local rows of A */
1682: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1684: ptap->A_loc = A_loc;
1685: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1686: ai = a_loc->i;
1687: aj = a_loc->j;
1689: /* determine symbolic Co=(p->B)^T*A - send to others */
1690: /*----------------------------------------------------*/
1691: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1692: pdt = (Mat_SeqAIJ*)PDt->data;
1693: pdti = pdt->i; pdtj = pdt->j;
1695: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1696: pot = (Mat_SeqAIJ*)POt->data;
1697: poti = pot->i; potj = pot->j;
1699: /* then, compute symbolic Co = (p->B)^T*A */
1700: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1701: >= (num of nonzero rows of C_seq) - pn */
1702: PetscMalloc1(pon+1,&coi);
1703: coi[0] = 0;
1705: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1706: nnz = fill*(poti[pon] + ai[am]);
1707: PetscFreeSpaceGet(nnz,&free_space);
1708: current_space = free_space;
1710: /* create and initialize a linked list */
1711: i = PetscMax(pdt->rmax,pot->rmax);
1712: Crmax = i*a_loc->rmax*size; /* non-scalable! */
1713: if (!Crmax || Crmax > aN) Crmax = aN;
1714: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
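/* illustration: if row i of Pot has nonzeros in columns {0,3}, and rows 0 and 3 of A_loc
   have column indices {1,4} and {2,4}, the linked list accumulates the sorted union
   {1,2,4}, so row i of Co gets nnz = lnk[0] = 3 entries */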
1716: for (i=0; i<pon; i++) {
1717: pnz = poti[i+1] - poti[i];
1718: ptJ = potj + poti[i];
1719: for (j=0; j<pnz; j++) {
1720: row = ptJ[j]; /* row of A_loc == col of Pot */
1721: anz = ai[row+1] - ai[row];
1722: Jptr = aj + ai[row];
1723: /* add non-zero cols of A_loc into the sorted linked list lnk */
1724: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1725: }
1726: nnz = lnk[0];
1728: /* If free space is not available, double the total space in the list */
1729: if (current_space->local_remaining<nnz) {
1730: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1731: nspacedouble++;
1732: }
1734: /* copy data into free space, then initialize lnk */
1735: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1737: current_space->array += nnz;
1738: current_space->local_used += nnz;
1739: current_space->local_remaining -= nnz;
1741: coi[i+1] = coi[i] + nnz;
1742: }
1744: PetscMalloc1(coi[pon]+1,&coj);
1745: PetscFreeSpaceContiguous(&free_space,coj);
1747: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1748: if (afill_tmp > afill) afill = afill_tmp;
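/* afill tracks the largest fill ratio actually needed (nonzeros produced over input
   nonzeros); it is compared with the user-supplied fill in the PetscInfo report at the
   end of this routine */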
1750: /* send j-array (coj) of Co to other processors */
1751: /*----------------------------------------------*/
1752: /* determine row ownership */
1753: PetscNew(&merge);
1754: PetscLayoutCreate(comm,&merge->rowmap);
1756: merge->rowmap->n = pn;
1757: merge->rowmap->bs = 1;
1759: PetscLayoutSetUp(merge->rowmap);
1760: owners = merge->rowmap->range;
1762: /* determine the number of messages to send, their lengths */
1763: PetscCalloc1(size,&len_si);
1764: PetscMalloc1(size,&merge->len_s);
1766: len_s = merge->len_s;
1767: merge->nsend = 0;
1769: PetscMalloc1(size+2,&owners_co);
1770: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1772: proc = 0;
1773: for (i=0; i<pon; i++) {
1774: while (prmap[i] >= owners[proc+1]) proc++;
1775: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1776: len_s[proc] += coi[i+1] - coi[i];
1777: }
1779: len = 0; /* total length of buf_s[], the concatenation of all outgoing i-structure messages */
1780: owners_co[0] = 0;
1781: for (proc=0; proc<size; proc++) {
1782: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1783: if (len_si[proc]) {
1784: merge->nsend++;
1785: len_si[proc] = 2*(len_si[proc] + 1);
1786: len += len_si[proc];
1787: }
1788: }
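/* after this loop len_si[proc] = 2*(nrows sent to proc)+2, the full length of the
   i-structure message for proc, while len_s[proc] counts the column indices (j-structure)
   going to proc */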
1790: /* determine the number and length of messages to receive for coi and coj */
1791: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1792: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1794: /* post the Irecv and Isend of coj */
1795: PetscCommGetNewTag(comm,&tagj);
1796: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1797: PetscMalloc1(merge->nsend+1,&swaits);
1798: for (proc=0, k=0; proc<size; proc++) {
1799: if (!len_s[proc]) continue;
1800: i = owners_co[proc];
1801: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1802: k++;
1803: }
1805: /* receives and sends of coj are complete */
1806: PetscMalloc1(size,&sstatus);
1807: for (i=0; i<merge->nrecv; i++) {
1808: PetscMPIInt icompleted;
1809: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1810: }
1811: PetscFree(rwaits);
1812: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1814: /* send and recv coi */
1815: /*-------------------*/
1816: PetscCommGetNewTag(comm,&tagi);
1817: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1818: PetscMalloc1(len+1,&buf_s);
1819: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1820: for (proc=0,k=0; proc<size; proc++) {
1821: if (!len_s[proc]) continue;
1822: /* form outgoing message for i-structure:
1823: buf_si[0]: nrows to be sent
1824: [1:nrows]: row index (local to the receiving process)
1825: [nrows+1:2*nrows+1]: i-structure index
1826: */
1827: /*-------------------------------------------*/
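/* worked example (for illustration only): sending two rows with local indices 3 and 7
   carrying 4 and 2 nonzeros gives buf_si = [2, 3, 7, 0, 4, 6]; nrows in slot 0, the row
   indices next, then the prefix sums of the row lengths, for len_si[proc] = 2*(2+1) = 6 */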
1828: nrows = len_si[proc]/2 - 1;
1829: buf_si_i = buf_si + nrows+1;
1830: buf_si[0] = nrows;
1831: buf_si_i[0] = 0;
1832: nrows = 0;
1833: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1834: nzi = coi[i+1] - coi[i];
1835: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1836: buf_si[nrows+1] = prmap[i] - owners[proc]; /* local row index */
1837: nrows++;
1838: }
1839: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1840: k++;
1841: buf_si += len_si[proc];
1842: }
1843: i = merge->nrecv;
1844: while (i--) {
1845: PetscMPIInt icompleted;
1846: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1847: }
1848: PetscFree(rwaits);
1849: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1850: PetscFree(len_si);
1851: PetscFree(len_ri);
1852: PetscFree(swaits);
1853: PetscFree(sstatus);
1854: PetscFree(buf_s);
1856: /* compute the local portion of C (mpi mat) */
1857: /*------------------------------------------*/
1858: /* allocate bi array and free space for accumulating nonzero column info */
1859: PetscMalloc1(pn+1,&bi);
1860: bi[0] = 0;
1862: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1863: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1864: PetscFreeSpaceGet(nnz,&free_space);
1865: current_space = free_space;
1867: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1868: for (k=0; k<merge->nrecv; k++) {
1869: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1870: nrows = *buf_ri_k[k];
1871: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1872: nextci[k] = buf_ri_k[k] + (nrows + 1); /* cursor into the i-structure (row offsets) of the k-th received message */
1873: }
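/* buf_ri_k[k] points at the start of the k-th received i-structure (nrows, then nrows
   local row indices, then nrows+1 prefix offsets); nextrow[k] and nextci[k] are cursors
   into it that advance in lock-step as rows i are processed in increasing order below */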
1875: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1876: rmax = 0;
1877: for (i=0; i<pn; i++) {
1878: /* add pdt[i,:]*A_loc into lnk */
1879: pnz = pdti[i+1] - pdti[i];
1880: ptJ = pdtj + pdti[i];
1881: for (j=0; j<pnz; j++) {
1882: row = ptJ[j]; /* row of A_loc == col of Pdt */
1883: anz = ai[row+1] - ai[row];
1884: Jptr = aj + ai[row];
1885: /* add non-zero cols of A_loc into the sorted linked list lnk */
1886: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1887: }
1889: /* add received col data into lnk */
1890: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1891: if (i == *nextrow[k]) { /* i-th row */
1892: nzi = *(nextci[k]+1) - *nextci[k];
1893: Jptr = buf_rj[k] + *nextci[k];
1894: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1895: nextrow[k]++; nextci[k]++;
1896: }
1897: }
1898: nnz = lnk[0];
1900: /* if free space is not available, make more free space */
1901: if (current_space->local_remaining<nnz) {
1902: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1903: nspacedouble++;
1904: }
1905: /* copy data into free space, then initialize lnk */
1906: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1907: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1909: current_space->array += nnz;
1910: current_space->local_used += nnz;
1911: current_space->local_remaining -= nnz;
1913: bi[i+1] = bi[i] + nnz;
1914: if (nnz > rmax) rmax = nnz;
1915: }
1916: PetscFree3(buf_ri_k,nextrow,nextci);
1918: PetscMalloc1(bi[pn]+1,&bj);
1919: PetscFreeSpaceContiguous(&free_space,bj);
1920: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1921: if (afill_tmp > afill) afill = afill_tmp;
1922: PetscLLCondensedDestroy_Scalable(lnk);
1923: MatDestroy(&POt);
1924: MatDestroy(&PDt);
1926: /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1927: /*----------------------------------------------------------------------------------*/
1928: PetscCalloc1(rmax+1,&vals);
1930: MatCreate(comm,&Cmpi);
1931: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1932: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1933: MatSetType(Cmpi,MATMPIAIJ);
1934: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1935: MatPreallocateFinalize(dnz,onz);
1936: MatSetBlockSize(Cmpi,1);
1937: for (i=0; i<pn; i++) {
1938: row = i + rstart;
1939: nnz = bi[i+1] - bi[i];
1940: Jptr = bj + bi[i];
1941: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1942: }
1943: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1944: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1945: PetscFree(vals);
1947: merge->bi = bi;
1948: merge->bj = bj;
1949: merge->coi = coi;
1950: merge->coj = coj;
1951: merge->buf_ri = buf_ri;
1952: merge->buf_rj = buf_rj;
1953: merge->owners_co = owners_co;
1954: merge->destroy = Cmpi->ops->destroy;
1955: merge->duplicate = Cmpi->ops->duplicate;
1957: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1958: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1959: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1961: /* attach the supporting struct to Cmpi for reuse */
1962: c = (Mat_MPIAIJ*)Cmpi->data;
1964: c->ptap = ptap;
1965: ptap->api = NULL;
1966: ptap->apj = NULL;
1967: ptap->merge = merge;
1968: ptap->rmax = rmax;
1969: ptap->apa = NULL;
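/* on subsequent products with MAT_REUSE_MATRIX the numeric routine installed above
   (MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ) retrieves this ptap/merge data from c->ptap,
   and MatDestroy_MPIAIJ_PtAP frees it when C is destroyed */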
1971: *C = Cmpi;
1972: #if defined(PETSC_USE_INFO)
1973: if (bi[pn] != 0) {
1974: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1975: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1976: } else {
1977: PetscInfo(Cmpi,"Empty matrix product\n");
1978: }
1979: #endif
1980: return(0);
1981: }