Actual source code: mpiptap.c
petsc-3.10.2 2018-10-09
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
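/*
  Illustrative usage sketch (not part of this source file): these routines implement the
  MPIAIJ back end of the public MatPtAP() interface.  A caller with assembled MPIAIJ
  matrices A and P of compatible layout might do, assuming a fill estimate of 2.0:

      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);
      ierr = MatDestroy(&C);CHKERRQ(ierr);

  The first call performs the symbolic and numeric product; the second reuses the symbolic
  data after the entries of A change.  The algorithm used below can be selected at runtime
  with -matptap_via scalable|nonscalable (see MatPtAP_MPIAIJ_MPIAIJ).
*/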
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
13: /* #define PTAP_PROFILE */
15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
16: {
17: PetscErrorCode ierr;
18: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
19: Mat_PtAPMPI *ptap=a->ptap;
20: PetscBool iascii;
21: PetscViewerFormat format;
24: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
25: if (iascii) {
26: PetscViewerGetFormat(viewer,&format);
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
30: } else if (ptap->algType == 1) {
31: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
32: }
33: }
34: }
35: (ptap->view)(A,viewer);
36: return(0);
37: }
39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
40: {
42: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
43: Mat_PtAPMPI *ptap=a->ptap;
46: if (ptap) {
47: Mat_Merge_SeqsToMPI *merge=ptap->merge;
48: PetscFree2(ptap->startsj_s,ptap->startsj_r);
49: PetscFree(ptap->bufa);
50: MatDestroy(&ptap->P_loc);
51: MatDestroy(&ptap->P_oth);
52: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
53: MatDestroy(&ptap->Rd);
54: MatDestroy(&ptap->Ro);
55: if (ptap->AP_loc) { /* used by alg_rap */
56: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
57: PetscFree(ap->i);
58: PetscFree2(ap->j,ap->a);
59: MatDestroy(&ptap->AP_loc);
60: } else { /* used by alg_ptap */
61: PetscFree(ptap->api);
62: PetscFree(ptap->apj);
63: }
64: MatDestroy(&ptap->C_loc);
65: MatDestroy(&ptap->C_oth);
66: if (ptap->apa) {PetscFree(ptap->apa);}
68: if (merge) { /* used by alg_ptap */
69: PetscFree(merge->id_r);
70: PetscFree(merge->len_s);
71: PetscFree(merge->len_r);
72: PetscFree(merge->bi);
73: PetscFree(merge->bj);
74: PetscFree(merge->buf_ri[0]);
75: PetscFree(merge->buf_ri);
76: PetscFree(merge->buf_rj[0]);
77: PetscFree(merge->buf_rj);
78: PetscFree(merge->coi);
79: PetscFree(merge->coj);
80: PetscFree(merge->owners_co);
81: PetscLayoutDestroy(&merge->rowmap);
82: PetscFree(ptap->merge);
83: }
85: ptap->destroy(A);
86: PetscFree(ptap);
87: }
88: return(0);
89: }
91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
92: {
94: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
95: Mat_PtAPMPI *ptap = a->ptap;
98: (*ptap->duplicate)(A,op,M);
99: (*M)->ops->destroy = ptap->destroy;
100: (*M)->ops->duplicate = ptap->duplicate;
101: (*M)->ops->view = ptap->view;
102: return(0);
103: }
105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108: PetscBool flg;
109: MPI_Comm comm;
110: #if !defined(PETSC_HAVE_HYPRE)
111: const char *algTypes[2] = {"scalable","nonscalable"};
112: PetscInt nalg=2;
113: #else
114: const char *algTypes[3] = {"scalable","nonscalable","hypre"};
115: PetscInt nalg=3;
116: #endif
117: PetscInt pN=P->cmap->N,alg=1; /* set default algorithm */
120: /* check if matrix local sizes are compatible */
121: PetscObjectGetComm((PetscObject)A,&comm);
122: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
123: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
125: if (scall == MAT_INITIAL_MATRIX) {
126: /* pick an algorithm */
127: PetscObjectOptionsBegin((PetscObject)A);
128: PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
129: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
130: PetscOptionsEnd();
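/* For example (illustrative): running with "-matptap_via scalable" sets alg = 0 and, because
   flg is then true, skips the heuristic below; without the option the default remains alg = 1
   (nonscalable) unless the heuristic overrides it. */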
132: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
133: MatInfo Ainfo,Pinfo;
134: PetscInt nz_local;
135: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
137: MatGetInfo(A,MAT_LOCAL,&Ainfo);
138: MatGetInfo(P,MAT_LOCAL,&Pinfo);
139: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
141: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
142: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
144: if (alg_scalable) {
145: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
146: }
147: }
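/* Worked example of the heuristic above (illustrative numbers): with fill = 2.0, a local
   nz_local = 40000 and pN = 120000, pN > fill*nz_local holds, so alg_scalable_loc becomes
   PETSC_TRUE on that process; the MPI_LOR reduction then switches every process to the
   scalable algorithm (alg = 0). */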
149: switch (alg) {
150: case 1:
151: /* do R=P^T locally, then C=R*A*P */
152: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
153: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
154: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
155: break;
156: #if defined(PETSC_HAVE_HYPRE)
157: case 2:
158: /* Use boomerAMGBuildCoarseOperator */
159: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
160: return(0);
161: break;
162: #endif
163: default:
164: /* do R=P^T locally, then C=R*A*P */
165: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
166: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
167: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
168: break;
169: }
170: }
171: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
172: (*(*C)->ops->ptapnumeric)(A,P,*C);
173: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
174: return(0);
175: }
177: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
178: {
179: PetscErrorCode ierr;
180: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
181: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
182: Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq;
183: Mat_PtAPMPI *ptap = c->ptap;
184: Mat AP_loc,C_loc,C_oth;
185: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
186: PetscScalar *apa;
187: const PetscInt *cols;
188: const PetscScalar *vals;
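/* Outline of this numeric phase: with Rd = Pd^T and Ro = Po^T from the symbolic phase,
   C = P^T*A*P is assembled as
       AP_loc = Ad*P_loc + Ao*P_oth   (computed row by row below)
       C_loc  = Rd*AP_loc,  C_oth = Ro*AP_loc
   and C_loc/C_oth are then added into the parallel matrix C. */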
191: MatZeroEntries(C);
193: /* 1) get R = Pd^T,Ro = Po^T */
194: if (ptap->reuse == MAT_REUSE_MATRIX) {
195: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
196: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
197: }
199: /* 2) get AP_loc */
200: AP_loc = ptap->AP_loc;
201: ap = (Mat_SeqAIJ*)AP_loc->data;
203: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
204: /*-----------------------------------------------------*/
205: if (ptap->reuse == MAT_REUSE_MATRIX) {
206: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
207: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
208: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
209: }
211: /* 2-2) compute numeric A_loc*P - dominating part */
212: /* ---------------------------------------------- */
213: /* get data from symbolic products */
214: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
215: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
216: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
218: api = ap->i;
219: apj = ap->j;
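/* Each AP row is accumulated directly into the apnz-long slice of ap->a that the symbolic
   phase reserved for it, so the work space per row is proportional to that row's nonzeros
   rather than to the global column dimension pN (contrast with the nonscalable version). */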
220: for (i=0; i<am; i++) {
221: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
222: apnz = api[i+1] - api[i];
223: apa = ap->a + api[i];
224: PetscMemzero(apa,sizeof(PetscScalar)*apnz);
225: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
226: PetscLogFlops(2.0*apnz);
227: }
229: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
230: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
231: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
233: C_loc = ptap->C_loc;
234: C_oth = ptap->C_oth;
236: /* add C_loc and C_oth to C */
237: MatGetOwnershipRange(C,&rstart,&rend);
239: /* C_loc -> C */
240: cm = C_loc->rmap->N;
241: c_seq = (Mat_SeqAIJ*)C_loc->data;
242: cols = c_seq->j;
243: vals = c_seq->a;
245: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
246: /* when there are no off-processor parts. */
247: /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
248: /* no longer holds, so the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
249: /* a table and additional bookkeeping is required. */
250: if (C->assembled) {
251: C->was_assembled = PETSC_TRUE;
252: C->assembled = PETSC_FALSE;
253: }
254: if (C->was_assembled) {
255: for (i=0; i<cm; i++) {
256: ncols = c_seq->i[i+1] - c_seq->i[i];
257: row = rstart + i;
258: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
259: cols += ncols; vals += ncols;
260: }
261: } else {
262: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
263: }
265: /* Co -> C, off-processor part */
266: cm = C_oth->rmap->N;
267: c_seq = (Mat_SeqAIJ*)C_oth->data;
268: cols = c_seq->j;
269: vals = c_seq->a;
270: for (i=0; i<cm; i++) {
271: ncols = c_seq->i[i+1] - c_seq->i[i];
272: row = p->garray[i];
273: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
274: cols += ncols; vals += ncols;
275: }
276: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
277: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
279: ptap->reuse = MAT_REUSE_MATRIX;
280: return(0);
281: }
283: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
284: {
285: PetscErrorCode ierr;
286: Mat_PtAPMPI *ptap;
287: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
288: MPI_Comm comm;
289: PetscMPIInt size,rank;
290: Mat Cmpi,P_loc,P_oth;
291: PetscFreeSpaceList free_space=NULL,current_space=NULL;
292: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
293: PetscInt *lnk,i,k,pnz,row,nsend;
294: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
295: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
296: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
297: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
298: MPI_Request *swaits,*rwaits;
299: MPI_Status *sstatus,rstatus;
300: PetscLayout rowmap;
301: PetscInt *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
302: PetscMPIInt *len_r,*id_r; /* arrays of length comm->size, storing the lengths and ids of the send/recv messages */
303: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
304: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
305: PetscScalar *apv;
306: PetscTable ta;
307: #if defined(PETSC_USE_INFO)
308: PetscReal apfill;
309: #endif
312: PetscObjectGetComm((PetscObject)A,&comm);
313: MPI_Comm_size(comm,&size);
314: MPI_Comm_rank(comm,&rank);
316: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
318: /* create symbolic parallel matrix Cmpi */
319: MatCreate(comm,&Cmpi);
320: MatSetType(Cmpi,MATMPIAIJ);
322: /* create struct Mat_PtAPMPI and attach it to C later */
323: PetscNew(&ptap);
324: ptap->reuse = MAT_INITIAL_MATRIX;
325: ptap->algType = 0;
327: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
328: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
329: /* get P_loc by taking all local rows of P */
330: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
332: ptap->P_loc = P_loc;
333: ptap->P_oth = P_oth;
335: /* (0) compute Rd = Pd^T, Ro = Po^T */
336: /* --------------------------------- */
337: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
338: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
340: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
341: /* ----------------------------------------------------------------- */
342: p_loc = (Mat_SeqAIJ*)P_loc->data;
343: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
345: /* create and initialize a linked list */
346: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
347: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
348: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
349: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
351: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
353: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
354: if (ao) {
355: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
356: } else {
357: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
358: }
359: current_space = free_space;
360: nspacedouble = 0;
362: PetscMalloc1(am+1,&api);
363: api[0] = 0;
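/* The row pattern of AP_loc is built one row at a time: the column indices of every P row
   hit by A[i,:] are merged through the condensed sorted linked list lnk.  For instance
   (illustrative), merging P rows with columns {1,3} and {2,3} yields the AP row {1,2,3},
   i.e. apnz = 3. */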
364: for (i=0; i<am; i++) {
365: /* diagonal portion: Ad[i,:]*P */
366: ai = ad->i; pi = p_loc->i;
367: nzi = ai[i+1] - ai[i];
368: aj = ad->j + ai[i];
369: for (j=0; j<nzi; j++) {
370: row = aj[j];
371: pnz = pi[row+1] - pi[row];
372: Jptr = p_loc->j + pi[row];
373: /* add non-zero cols of P into the sorted linked list lnk */
374: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
375: }
376: /* off-diagonal portion: Ao[i,:]*P */
377: if (ao) {
378: ai = ao->i; pi = p_oth->i;
379: nzi = ai[i+1] - ai[i];
380: aj = ao->j + ai[i];
381: for (j=0; j<nzi; j++) {
382: row = aj[j];
383: pnz = pi[row+1] - pi[row];
384: Jptr = p_oth->j + pi[row];
385: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
386: }
387: }
388: apnz = lnk[0];
389: api[i+1] = api[i] + apnz;
391: /* if free space is not available, double the total space in the list */
392: if (current_space->local_remaining<apnz) {
393: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
394: nspacedouble++;
395: }
397: /* Copy data into free space, then initialize lnk */
398: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
400: current_space->array += apnz;
401: current_space->local_used += apnz;
402: current_space->local_remaining -= apnz;
403: }
404: /* Allocate space for apj and apv, initialize apj, and */
405: /* destroy list of free space and other temporary array(s) */
406: PetscMalloc2(api[am],&apj,api[am],&apv);
407: PetscFreeSpaceContiguous(&free_space,apj);
408: PetscLLCondensedDestroy_Scalable(lnk);
410: /* Create AP_loc for reuse */
411: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
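/* Note: MatCreateSeqAIJWithArrays() does not copy api/apj/apv, so AP_loc does not own them;
   MatDestroy_MPIAIJ_PtAP() above therefore frees ap->i with PetscFree() and ap->j/ap->a with
   PetscFree2(), matching the PetscMalloc1()/PetscMalloc2() calls here. */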
413: #if defined(PETSC_USE_INFO)
414: if (ao) {
415: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
416: } else {
417: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
418: }
419: ptap->AP_loc->info.mallocs = nspacedouble;
420: ptap->AP_loc->info.fill_ratio_given = fill;
421: ptap->AP_loc->info.fill_ratio_needed = apfill;
423: if (api[am]) {
424: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
425: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
426: } else {
427: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
428: }
429: #endif
431: /* (2-1) compute symbolic Co = Ro*AP_loc */
432: /* ------------------------------------ */
433: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
435: /* (3) send coj of C_oth to other processors */
436: /* ------------------------------------------ */
437: /* determine row ownership */
438: PetscLayoutCreate(comm,&rowmap);
439: rowmap->n = pn;
440: rowmap->bs = 1;
441: PetscLayoutSetUp(rowmap);
442: owners = rowmap->range;
444: /* determine the number of messages to send, their lengths */
445: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
446: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
447: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
449: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
450: coi = c_oth->i; coj = c_oth->j;
451: con = ptap->C_oth->rmap->n;
452: proc = 0;
453: for (i=0; i<con; i++) {
454: while (prmap[i] >= owners[proc+1]) proc++;
455: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
456: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
457: }
459: len = 0; /* max length of buf_si[], see (4) */
460: owners_co[0] = 0;
461: nsend = 0;
462: for (proc=0; proc<size; proc++) {
463: owners_co[proc+1] = owners_co[proc] + len_si[proc];
464: if (len_s[proc]) {
465: nsend++;
466: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
467: len += len_si[proc];
468: }
469: }
471: /* determine the number and length of messages to receive for coi and coj */
472: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
473: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
475: /* post the Irecv and Isend of coj */
476: PetscCommGetNewTag(comm,&tagj);
477: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
478: PetscMalloc1(nsend+1,&swaits);
479: for (proc=0, k=0; proc<size; proc++) {
480: if (!len_s[proc]) continue;
481: i = owners_co[proc];
482: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
483: k++;
484: }
486: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
487: /* ---------------------------------------- */
488: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
489: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
491: /* receives of coj are complete */
492: for (i=0; i<nrecv; i++) {
493: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
494: }
495: PetscFree(rwaits);
496: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
498: /* add received column indices into ta to update Crmax */
499: for (k=0; k<nrecv; k++) {/* k-th received message */
500: Jptr = buf_rj[k];
501: for (j=0; j<len_r[k]; j++) {
502: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
503: }
504: }
505: PetscTableGetCount(ta,&Crmax);
506: PetscTableDestroy(&ta);
508: /* (4) send and recv coi */
509: /*-----------------------*/
510: PetscCommGetNewTag(comm,&tagi);
511: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
512: PetscMalloc1(len+1,&buf_s);
513: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
514: for (proc=0,k=0; proc<size; proc++) {
515: if (!len_s[proc]) continue;
516: /* form outgoing message for i-structure:
517: buf_si[0]: nrows to be sent
518: [1:nrows]: row index (global)
519: [nrows+1:2*nrows+1]: i-structure index
520: */
521: /*-------------------------------------------*/
522: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
523: buf_si_i = buf_si + nrows+1;
524: buf_si[0] = nrows;
525: buf_si_i[0] = 0;
526: nrows = 0;
527: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
528: nzi = coi[i+1] - coi[i];
529: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
530: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
531: nrows++;
532: }
533: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
534: k++;
535: buf_si += len_si[proc];
536: }
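/* Illustrative layout of one outgoing buf_si message: if two rows of Co with local row
   indices 3 and 7, carrying 4 and 2 nonzeros respectively, go to [proc], then
   len_si[proc] = 6 and buf_si = {2, 3, 7, 0, 4, 6}: the row count, the row indices, and
   the cumulative i-structure. */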
537: for (i=0; i<nrecv; i++) {
538: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
539: }
540: PetscFree(rwaits);
541: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
543: PetscFree4(len_s,len_si,sstatus,owners_co);
544: PetscFree(len_ri);
545: PetscFree(swaits);
546: PetscFree(buf_s);
548: /* (5) compute the local portion of Cmpi */
549: /* ------------------------------------------ */
550: /* set initial free space to be Crmax, sufficient for holding the nonzeros of each row of Cmpi */
551: PetscFreeSpaceGet(Crmax,&free_space);
552: current_space = free_space;
554: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
555: for (k=0; k<nrecv; k++) {
556: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
557: nrows = *buf_ri_k[k];
558: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
559: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
560: }
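/* For each locally owned row of Cmpi, the column patterns of C_loc and of the received Co
   rows are merged in lnk; MatPreallocateSet() then splits the merged row into dnz[] (columns
   inside the diagonal block) and onz[] (off-diagonal columns) for MatMPIAIJSetPreallocation()
   below. */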
562: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
563: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
564: for (i=0; i<pn; i++) {
565: /* add C_loc into Cmpi */
566: nzi = c_loc->i[i+1] - c_loc->i[i];
567: Jptr = c_loc->j + c_loc->i[i];
568: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
570: /* add received col data into lnk */
571: for (k=0; k<nrecv; k++) { /* k-th received message */
572: if (i == *nextrow[k]) { /* i-th row */
573: nzi = *(nextci[k]+1) - *nextci[k];
574: Jptr = buf_rj[k] + *nextci[k];
575: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
576: nextrow[k]++; nextci[k]++;
577: }
578: }
579: nzi = lnk[0];
581: /* copy data into free space, then initialize lnk */
582: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
583: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
584: }
585: PetscFree3(buf_ri_k,nextrow,nextci);
586: PetscLLCondensedDestroy_Scalable(lnk);
587: PetscFreeSpaceDestroy(free_space);
589: /* local sizes and preallocation */
590: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
591: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
592: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
593: MatPreallocateFinalize(dnz,onz);
595: /* members in merge */
596: PetscFree(id_r);
597: PetscFree(len_r);
598: PetscFree(buf_ri[0]);
599: PetscFree(buf_ri);
600: PetscFree(buf_rj[0]);
601: PetscFree(buf_rj);
602: PetscLayoutDestroy(&rowmap);
604: /* attach the supporting struct to Cmpi for reuse */
605: c = (Mat_MPIAIJ*)Cmpi->data;
606: c->ptap = ptap;
607: ptap->duplicate = Cmpi->ops->duplicate;
608: ptap->destroy = Cmpi->ops->destroy;
609: ptap->view = Cmpi->ops->view;
611: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
612: Cmpi->assembled = PETSC_FALSE;
613: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
614: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
615: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
616: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
617: *C = Cmpi;
618: return(0);
619: }
621: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
622: {
623: PetscErrorCode ierr;
624: Mat_PtAPMPI *ptap;
625: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
626: MPI_Comm comm;
627: PetscMPIInt size,rank;
628: Mat Cmpi;
629: PetscFreeSpaceList free_space=NULL,current_space=NULL;
630: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
631: PetscInt *lnk,i,k,pnz,row,nsend;
632: PetscBT lnkbt;
633: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
634: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
635: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
636: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
637: MPI_Request *swaits,*rwaits;
638: MPI_Status *sstatus,rstatus;
639: PetscLayout rowmap;
640: PetscInt *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
641: PetscMPIInt *len_r,*id_r; /* arrays of length comm->size, storing the lengths and ids of the send/recv messages */
642: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
643: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
644: PetscScalar *apv;
645: PetscTable ta;
646: #if defined(PETSC_USE_INFO)
647: PetscReal apfill;
648: #endif
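/* This variant differs from the scalable one mainly in its work structures: the linked list
   is backed by a PetscBT bitmap over all pN global columns (lnkbt), and a dense accumulator
   ptap->apa of length pN is allocated for the numeric phase, so per-process memory grows with
   the global column dimension of P - hence "nonscalable". */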
651: PetscObjectGetComm((PetscObject)A,&comm);
652: MPI_Comm_size(comm,&size);
653: MPI_Comm_rank(comm,&rank);
655: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
657: /* create symbolic parallel matrix Cmpi */
658: MatCreate(comm,&Cmpi);
659: MatSetType(Cmpi,MATMPIAIJ);
661: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
662: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
664: /* create struct Mat_PtAPMPI and attach it to C later */
665: PetscNew(&ptap);
666: ptap->reuse = MAT_INITIAL_MATRIX;
667: ptap->algType = 1;
669: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
670: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
671: /* get P_loc by taking all local rows of P */
672: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
674: /* (0) compute Rd = Pd^T, Ro = Po^T */
675: /* --------------------------------- */
676: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
677: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
679: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
680: /* ----------------------------------------------------------------- */
681: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
682: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
684: /* create and initialize a linked list */
685: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
686: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
687: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
688: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
689: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
691: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
693: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
694: if (ao) {
695: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
696: } else {
697: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
698: }
699: current_space = free_space;
700: nspacedouble = 0;
702: PetscMalloc1(am+1,&api);
703: api[0] = 0;
704: for (i=0; i<am; i++) {
705: /* diagonal portion: Ad[i,:]*P */
706: ai = ad->i; pi = p_loc->i;
707: nzi = ai[i+1] - ai[i];
708: aj = ad->j + ai[i];
709: for (j=0; j<nzi; j++) {
710: row = aj[j];
711: pnz = pi[row+1] - pi[row];
712: Jptr = p_loc->j + pi[row];
713: /* add non-zero cols of P into the sorted linked list lnk */
714: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
715: }
716: /* off-diagonal portion: Ao[i,:]*P */
717: if (ao) {
718: ai = ao->i; pi = p_oth->i;
719: nzi = ai[i+1] - ai[i];
720: aj = ao->j + ai[i];
721: for (j=0; j<nzi; j++) {
722: row = aj[j];
723: pnz = pi[row+1] - pi[row];
724: Jptr = p_oth->j + pi[row];
725: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
726: }
727: }
728: apnz = lnk[0];
729: api[i+1] = api[i] + apnz;
730: if (ap_rmax < apnz) ap_rmax = apnz;
732: /* if free space is not available, double the total space in the list */
733: if (current_space->local_remaining<apnz) {
734: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
735: nspacedouble++;
736: }
738: /* Copy data into free space, then initialize lnk */
739: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
741: current_space->array += apnz;
742: current_space->local_used += apnz;
743: current_space->local_remaining -= apnz;
744: }
745: /* Allocate space for apj and apv, initialize apj, and */
746: /* destroy list of free space and other temporary array(s) */
747: PetscMalloc2(api[am],&apj,api[am],&apv);
748: PetscFreeSpaceContiguous(&free_space,apj);
749: PetscLLDestroy(lnk,lnkbt);
751: /* Create AP_loc for reuse */
752: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
754: #if defined(PETSC_USE_INFO)
755: if (ao) {
756: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
757: } else {
758: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
759: }
760: ptap->AP_loc->info.mallocs = nspacedouble;
761: ptap->AP_loc->info.fill_ratio_given = fill;
762: ptap->AP_loc->info.fill_ratio_needed = apfill;
764: if (api[am]) {
765: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
766: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
767: } else {
768: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
769: }
770: #endif
772: /* (2-1) compute symbolic Co = Ro*AP_loc */
773: /* ------------------------------------ */
774: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
776: /* (3) send coj of C_oth to other processors */
777: /* ------------------------------------------ */
778: /* determine row ownership */
779: PetscLayoutCreate(comm,&rowmap);
780: rowmap->n = pn;
781: rowmap->bs = 1;
782: PetscLayoutSetUp(rowmap);
783: owners = rowmap->range;
785: /* determine the number of messages to send, their lengths */
786: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
787: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
788: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
790: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
791: coi = c_oth->i; coj = c_oth->j;
792: con = ptap->C_oth->rmap->n;
793: proc = 0;
794: for (i=0; i<con; i++) {
795: while (prmap[i] >= owners[proc+1]) proc++;
796: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
797: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
798: }
800: len = 0; /* max length of buf_si[], see (4) */
801: owners_co[0] = 0;
802: nsend = 0;
803: for (proc=0; proc<size; proc++) {
804: owners_co[proc+1] = owners_co[proc] + len_si[proc];
805: if (len_s[proc]) {
806: nsend++;
807: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
808: len += len_si[proc];
809: }
810: }
812: /* determine the number and length of messages to receive for coi and coj */
813: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
814: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
816: /* post the Irecv and Isend of coj */
817: PetscCommGetNewTag(comm,&tagj);
818: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
819: PetscMalloc1(nsend+1,&swaits);
820: for (proc=0, k=0; proc<size; proc++) {
821: if (!len_s[proc]) continue;
822: i = owners_co[proc];
823: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
824: k++;
825: }
827: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
828: /* ---------------------------------------- */
829: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
830: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
832: /* receives of coj are complete */
833: for (i=0; i<nrecv; i++) {
834: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
835: }
836: PetscFree(rwaits);
837: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
839: /* add received column indices into ta to update Crmax */
840: for (k=0; k<nrecv; k++) {/* k-th received message */
841: Jptr = buf_rj[k];
842: for (j=0; j<len_r[k]; j++) {
843: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
844: }
845: }
846: PetscTableGetCount(ta,&Crmax);
847: PetscTableDestroy(&ta);
849: /* (4) send and recv coi */
850: /*-----------------------*/
851: PetscCommGetNewTag(comm,&tagi);
852: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
853: PetscMalloc1(len+1,&buf_s);
854: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
855: for (proc=0,k=0; proc<size; proc++) {
856: if (!len_s[proc]) continue;
857: /* form outgoing message for i-structure:
858: buf_si[0]: nrows to be sent
859: [1:nrows]: row index (global)
860: [nrows+1:2*nrows+1]: i-structure index
861: */
862: /*-------------------------------------------*/
863: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
864: buf_si_i = buf_si + nrows+1;
865: buf_si[0] = nrows;
866: buf_si_i[0] = 0;
867: nrows = 0;
868: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
869: nzi = coi[i+1] - coi[i];
870: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
871: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
872: nrows++;
873: }
874: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
875: k++;
876: buf_si += len_si[proc];
877: }
878: for (i=0; i<nrecv; i++) {
879: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
880: }
881: PetscFree(rwaits);
882: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
884: PetscFree4(len_s,len_si,sstatus,owners_co);
885: PetscFree(len_ri);
886: PetscFree(swaits);
887: PetscFree(buf_s);
889: /* (5) compute the local portion of Cmpi */
890: /* ------------------------------------------ */
891: /* set initial free space to be Crmax, sufficient for holding the nonzeros of each row of Cmpi */
892: PetscFreeSpaceGet(Crmax,&free_space);
893: current_space = free_space;
895: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
896: for (k=0; k<nrecv; k++) {
897: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
898: nrows = *buf_ri_k[k];
899: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
900: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
901: }
903: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
904: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
905: for (i=0; i<pn; i++) {
906: /* add C_loc into Cmpi */
907: nzi = c_loc->i[i+1] - c_loc->i[i];
908: Jptr = c_loc->j + c_loc->i[i];
909: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
911: /* add received col data into lnk */
912: for (k=0; k<nrecv; k++) { /* k-th received message */
913: if (i == *nextrow[k]) { /* i-th row */
914: nzi = *(nextci[k]+1) - *nextci[k];
915: Jptr = buf_rj[k] + *nextci[k];
916: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
917: nextrow[k]++; nextci[k]++;
918: }
919: }
920: nzi = lnk[0];
922: /* copy data into free space, then initialize lnk */
923: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
924: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
925: }
926: PetscFree3(buf_ri_k,nextrow,nextci);
927: PetscLLDestroy(lnk,lnkbt);
928: PetscFreeSpaceDestroy(free_space);
930: /* local sizes and preallocation */
931: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
932: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
933: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
934: MatPreallocateFinalize(dnz,onz);
936: /* members in merge */
937: PetscFree(id_r);
938: PetscFree(len_r);
939: PetscFree(buf_ri[0]);
940: PetscFree(buf_ri);
941: PetscFree(buf_rj[0]);
942: PetscFree(buf_rj);
943: PetscLayoutDestroy(&rowmap);
945: /* attach the supporting struct to Cmpi for reuse */
946: c = (Mat_MPIAIJ*)Cmpi->data;
947: c->ptap = ptap;
948: ptap->duplicate = Cmpi->ops->duplicate;
949: ptap->destroy = Cmpi->ops->destroy;
950: ptap->view = Cmpi->ops->view;
951: PetscCalloc1(pN,&ptap->apa);
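/* apa is the dense, pN-long work row used by MatPtAPNumeric_MPIAIJ_MPIAIJ(); it is freed in
   MatDestroy_MPIAIJ_PtAP(). */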
953: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
954: Cmpi->assembled = PETSC_FALSE;
955: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
956: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
957: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
958: *C = Cmpi;
959: return(0);
960: }
963: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
964: {
965: PetscErrorCode ierr;
966: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
967: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
968: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
969: Mat_PtAPMPI *ptap = c->ptap;
970: Mat AP_loc,C_loc,C_oth;
971: PetscInt i,rstart,rend,cm,ncols,row;
972: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
973: PetscScalar *apa;
974: const PetscInt *cols;
975: const PetscScalar *vals;
978: MatZeroEntries(C);
979: /* 1) get R = Pd^T,Ro = Po^T */
980: if (ptap->reuse == MAT_REUSE_MATRIX) {
981: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
982: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
983: }
985: /* 2) get AP_loc */
986: AP_loc = ptap->AP_loc;
987: ap = (Mat_SeqAIJ*)AP_loc->data;
989: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
990: /*-----------------------------------------------------*/
991: if (ptap->reuse == MAT_REUSE_MATRIX) {
992: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
993: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
994: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
995: }
997: /* 2-2) compute numeric A_loc*P - dominating part */
998: /* ---------------------------------------------- */
999: /* get data from symbolic products */
1000: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1001: if (ptap->P_oth) {
1002: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1003: }
1004: apa = ptap->apa;
1005: api = ap->i;
1006: apj = ap->j;
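/* Here each AP row is gathered in the dense work array apa (length pN): AProw_nonscalable()
   scatters A[i,:]*P into apa by global column, and the loop below compresses the apnz nonzero
   entries back into ap->a using the column indices apj from the symbolic phase, zeroing apa
   as it goes so it can be reused for the next row. */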
1007: for (i=0; i<am; i++) {
1008: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1009: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1010: apnz = api[i+1] - api[i];
1011: for (j=0; j<apnz; j++) {
1012: col = apj[j+api[i]];
1013: ap->a[j+ap->i[i]] = apa[col];
1014: apa[col] = 0.0;
1015: }
1016: PetscLogFlops(2.0*apnz);
1017: }
1019: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1020: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1021: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1022: C_loc = ptap->C_loc;
1023: C_oth = ptap->C_oth;
1025: /* add C_loc and C_oth to C */
1026: MatGetOwnershipRange(C,&rstart,&rend);
1028: /* C_loc -> C */
1029: cm = C_loc->rmap->N;
1030: c_seq = (Mat_SeqAIJ*)C_loc->data;
1031: cols = c_seq->j;
1032: vals = c_seq->a;
1035: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1036: /* when there are no off-processor parts. */
1037: /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1038: /* no longer holds, so the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1039: /* a table and additional bookkeeping is required. */
1040: if (C->assembled) {
1041: C->was_assembled = PETSC_TRUE;
1042: C->assembled = PETSC_FALSE;
1043: }
1044: if (C->was_assembled) {
1045: for (i=0; i<cm; i++) {
1046: ncols = c_seq->i[i+1] - c_seq->i[i];
1047: row = rstart + i;
1048: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
1049: cols += ncols; vals += ncols;
1050: }
1051: } else {
1052: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
1053: }
1055: /* Co -> C, off-processor part */
1056: cm = C_oth->rmap->N;
1057: c_seq = (Mat_SeqAIJ*)C_oth->data;
1058: cols = c_seq->j;
1059: vals = c_seq->a;
1060: for (i=0; i<cm; i++) {
1061: ncols = c_seq->i[i+1] - c_seq->i[i];
1062: row = p->garray[i];
1063: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1064: cols += ncols; vals += ncols;
1065: }
1067: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1068: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1070: ptap->reuse = MAT_REUSE_MATRIX;
1071: return(0);
1072: }