Actual source code: mpiov.c

petsc-3.9.2 2018-05-20
  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:   and to find the submatrices that are shared across processors.
  4: */
  5:  #include <../src/mat/impls/aij/seq/aij.h>
  6:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7:  #include <petscbt.h>
  8:  #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


 22: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 23: {
 25:   PetscInt       i;

 28:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 29:   for (i=0; i<ov; ++i) {
 30:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 31:   }
 32:   return(0);
 33: }

 35: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
 36: {
 38:   PetscInt       i;

 41:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 42:   for (i=0; i<ov; ++i) {
 43:     MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
 44:   }
 45:   return(0);
 46: }
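
/*
   A minimal usage sketch (assuming a MATMPIAIJ matrix C and an index set is0 built elsewhere,
   for example a PCASM subdomain): both drivers above are normally reached through the public
   MatIncreaseOverlap() interface, which dispatches to the MPIAIJ implementation, so growing a
   single index set by two layers of overlap looks roughly like

       MatIncreaseOverlap(C,1,&is0,2);    (runs MatIncreaseOverlap_MPIAIJ_Once() twice)
*/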


 49: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
 50: {
 52:   MPI_Comm       comm;
 53:   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
 54:   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
 55:   PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
 56:   const PetscInt *indices_i,**indices;
 57:   PetscLayout    rmap;
 58:   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
 59:   PetscSF        sf;
 60:   PetscSFNode    *remote;

 63:   PetscObjectGetComm((PetscObject)mat,&comm);
 64:   MPI_Comm_rank(comm,&rank);
 65:   MPI_Comm_size(comm,&size);
 66:   /* get row map to determine where rows should be going */
 67:   MatGetLayouts(mat,&rmap,NULL);
 68:   /* retrieve IS data and put it all together so that we
 69:    * can optimize communication
 70:    */
 71:   PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
 72:   for (i=0,tlength=0; i<nidx; i++){
 73:     ISGetLocalSize(is[i],&length[i]);
 74:     tlength += length[i];
 75:     ISGetIndices(is[i],&indices[i]);
 76:   }
 77:   /* find these rows on remote processors */
 78:   PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
 79:   PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
 80:   nrrows = 0;
 81:   for (i=0; i<nidx; i++){
 82:     length_i     = length[i];
 83:     indices_i    = indices[i];
 84:     for (j=0; j<length_i; j++){
 85:       owner = -1;
 86:       PetscLayoutFindOwner(rmap,indices_i[j],&owner);
 87:       /* remote processors */
 88:       if (owner != rank){
 89:         tosizes_temp[owner]++; /* number of rows to owner */
 90:         rrow_ranks[nrrows]  = owner; /* processor */
 91:         rrow_isids[nrrows]   = i; /* is id */
 92:         remoterows[nrrows++] = indices_i[j]; /* row */
 93:       }
 94:     }
 95:     ISRestoreIndices(is[i],&indices[i]);
 96:   }
 97:   PetscFree2(*(PetscInt***)&indices,length);
 98:   /* test if we need to exchange messages;
 99:    * generally speaking, we do not need to exchange
100:    * data when overlap is 1
101:    */
102:   MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
103:   /* we do not have any messages;
104:    * this usually corresponds to overlap 1
105:    */
106:   if (!reducednrrows){
107:     PetscFree3(toranks,tosizes,tosizes_temp);
108:     PetscFree3(remoterows,rrow_ranks,rrow_isids);
109:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
110:     return(0);
111:   }
112:   nto = 0;
113:   /* send sizes and ranks for building a two-sided communication */
114:   for (i=0; i<size; i++){
115:    if (tosizes_temp[i]){
116:      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
117:      tosizes_temp[i] = nto; /* a map from processor to index */
118:      toranks[nto++]  = i; /* processor */
119:    }
120:   }
121:   PetscCalloc1(nto+1,&toffsets);
122:   for (i=0; i<nto; i++){
123:     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
124:     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125:   }
126:   /* send information to other processors */
127:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
128:   nrecvrows = 0;
129:   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130:   PetscMalloc1(nrecvrows,&remote);
131:   nrecvrows = 0;
132:   for (i=0; i<nfrom; i++){
133:     for (j=0; j<fromsizes[2*i]; j++){
134:       remote[nrecvrows].rank    = fromranks[i];
135:       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136:     }
137:   }
138:   PetscSFCreate(comm,&sf);
139:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
140:   /* use two-sided communication by default since Open MPI has bugs in its one-sided support */
141:   PetscSFSetType(sf,PETSCSFBASIC);
142:   PetscSFSetFromOptions(sf);
143:   /* message pair <no of is, row>  */
144:   PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
145:   for (i=0; i<nrrows; i++){
146:     owner = rrow_ranks[i]; /* processor */
147:     j     = tosizes_temp[owner]; /* index */
148:     todata[toffsets[j]++] = rrow_isids[i];
149:     todata[toffsets[j]++] = remoterows[i];
150:   }
151:   PetscFree3(toranks,tosizes,tosizes_temp);
152:   PetscFree3(remoterows,rrow_ranks,rrow_isids);
153:   PetscFree(toffsets);
154:   PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);
155:   PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);
156:   PetscSFDestroy(&sf);
157:   /* send the rows that belong to remote processors so that we can get the overlapping data back */
158:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
159:   PetscFree2(todata,fromdata);
160:   PetscFree(fromsizes);
161:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
162:   PetscFree(fromranks);
163:   nrecvrows = 0;
164:   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165:   PetscCalloc1(nrecvrows,&todata);
166:   PetscMalloc1(nrecvrows,&remote);
167:   nrecvrows = 0;
168:   for (i=0; i<nto; i++){
169:     for (j=0; j<tosizes[2*i]; j++){
170:       remote[nrecvrows].rank    = toranks[i];
171:       remote[nrecvrows++].index = tosizes[2*i+1]+j;
172:     }
173:   }
174:   PetscSFCreate(comm,&sf);
175:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
176:   /* use two-sided communication by default since Open MPI has bugs in its one-sided support */
177:   PetscSFSetType(sf,PETSCSFBASIC);
178:   PetscSFSetFromOptions(sf);
179:   /* overlap communication and computation */
180:   PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);
181:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
182:   PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);
183:   PetscSFDestroy(&sf);
184:   PetscFree2(sbdata,sbsizes);
185:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
186:   PetscFree(toranks);
187:   PetscFree(tosizes);
188:   PetscFree(todata);
189:   return(0);
190: }
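
/*
   A condensed sketch of the communication pattern used above (names simplified): the
   destination ranks and message sizes are discovered with PetscCommBuildTwoSided(), and the
   payload itself is moved with a PetscSF whose leaves are the entries this rank receives and
   whose remote indices point into the sender's packed buffer:

       PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
       PetscSFCreate(comm,&sf);
       PetscSFSetGraph(sf,nleaves,nleaves,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
       PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata);
       PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata);
       PetscSFDestroy(&sf);
*/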

192: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193: {
194:   PetscInt         *isz,isz_i,i,j,is_id, data_size;
195:   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
196:   const PetscInt   *indices_i_temp;
197:   PetscErrorCode    ierr;

200:   max_lsize = 0;
201:   PetscMalloc1(nidx,&isz);
202:   for (i=0; i<nidx; i++){
203:     ISGetLocalSize(is[i],&lsize);
204:     max_lsize = lsize>max_lsize ? lsize:max_lsize;
205:     isz[i]    = lsize;
206:   }
207:   PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);
208:   for (i=0; i<nidx; i++){
209:     ISGetIndices(is[i],&indices_i_temp);
210:     PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);
211:     ISRestoreIndices(is[i],&indices_i_temp);
212:     ISDestroy(&is[i]);
213:   }
214:   /* retrieve information to get row id and its overlap */
215:   for (i=0; i<nrecvs; ){
216:     is_id      = recvdata[i++];
217:     data_size  = recvdata[i++];
218:     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
219:     isz_i      = isz[is_id];
220:     for (j=0; j< data_size; j++){
221:       col = recvdata[i++];
222:       indices_i[isz_i++] = col;
223:     }
224:     isz[is_id] = isz_i;
225:   }
226:   /* remove duplicate entries */
227:   for (i=0; i<nidx; i++){
228:     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
229:     isz_i      = isz[i];
230:     PetscSortRemoveDupsInt(&isz_i,indices_i);
231:     ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
232:   }
233:   PetscFree(isz);
234:   PetscFree(indices_temp);
235:   return(0);
236: }
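
/*
   Shape of the recvdata stream consumed above (hypothetical values for illustration): a
   sequence of <is id, count, count column indices> blocks, e.g.

       recvdata = { 0, 3, 10, 12, 15,   2, 2, 4, 7 }

   appends columns 10,12,15 to is[0] and columns 4,7 to is[2], after which duplicates are
   removed and the index sets are recreated.
*/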

238: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
239: {
240:   PetscLayout       rmap,cmap;
241:   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
242:   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
243:   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
244:   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
245:   Mat               amat,bmat;
246:   PetscMPIInt       rank;
247:   PetscBool         done;
248:   MPI_Comm          comm;
249:   PetscErrorCode    ierr;

252:   PetscObjectGetComm((PetscObject)mat,&comm);
253:   MPI_Comm_rank(comm,&rank);
254:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
255:   /* Even if the mat is symmetric, we still treat it as if it were not */
256:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
257:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
258:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
259:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
260:   /* total number of nonzero values is used to estimate the memory usage in the next step */
261:   tnz  = ai[an]+bi[bn];
262:   MatGetLayouts(mat,&rmap,&cmap);
263:   PetscLayoutGetRange(rmap,&rstart,NULL);
264:   PetscLayoutGetRange(cmap,&cstart,NULL);
265:   /* to find the longest message */
266:   max_fszs = 0;
267:   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
268:   /* is there a better way to estimate the number of nonzeros in the mat? */
269:   PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
270:   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
271:   rows_pos  = 0;
272:   totalrows = 0;
273:   for (i=0; i<nfrom; i++){
274:     PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
275:     /* group data together */
276:     for (j=0; j<fromsizes[2*i]; j+=2){
277:       is_id                       = fromrows[rows_pos++];/* no of is */
278:       rows_i                      = rows_data[is_id];
279:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
280:     }
281:     /* estimate a space to avoid multiple allocations  */
282:     for (j=0; j<nidx; j++){
283:       indvc_ij = 0;
284:       rows_i   = rows_data[j];
285:       for (l=0; l<rows_pos_i[j]; l++){
286:         row    = rows_i[l]-rstart;
287:         start  = ai[row];
288:         end    = ai[row+1];
289:         for (k=start; k<end; k++){ /* Amat */
290:           col = aj[k] + cstart;
291:           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
292:         }
293:         start = bi[row];
294:         end   = bi[row+1];
295:         for (k=start; k<end; k++) { /* Bmat */
296:           col = gcols[bj[k]];
297:           indices_tmp[indvc_ij++] = col;
298:         }
299:       }
300:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
301:       indv_counts[i*nidx+j] = indvc_ij;
302:       totalrows            += indvc_ij;
303:     }
304:   }
305:   /* message triple <no of is, number of rows, rows> */
306:   PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
307:   totalrows = 0;
308:   rows_pos  = 0;
309:   /* repeat the grouping pass above, this time packing the data */
310:   for (i=0;i<nfrom;i++){
311:     PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
312:     for (j=0; j<fromsizes[2*i]; j+=2){
313:       is_id                       = fromrows[rows_pos++];
314:       rows_i                      = rows_data[is_id];
315:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
316:     }
317:     /* add data  */
318:     for (j=0; j<nidx; j++){
319:       if (!indv_counts[i*nidx+j]) continue;
320:       indvc_ij = 0;
321:       sbdata[totalrows++] = j;
322:       sbdata[totalrows++] = indv_counts[i*nidx+j];
323:       sbsizes[2*i]       += 2;
324:       rows_i              = rows_data[j];
325:       for (l=0; l<rows_pos_i[j]; l++){
326:         row   = rows_i[l]-rstart;
327:         start = ai[row];
328:         end   = ai[row+1];
329:         for (k=start; k<end; k++){ /* Amat */
330:           col = aj[k] + cstart;
331:           indices_tmp[indvc_ij++] = col;
332:         }
333:         start = bi[row];
334:         end   = bi[row+1];
335:         for (k=start; k<end; k++) { /* Bmat */
336:           col = gcols[bj[k]];
337:           indices_tmp[indvc_ij++] = col;
338:         }
339:       }
340:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
341:       sbsizes[2*i]  += indvc_ij;
342:       PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);
343:       totalrows += indvc_ij;
344:     }
345:   }
346:   PetscCalloc1(nfrom+1,&offsets);
347:   for (i=0; i<nfrom; i++){
348:     offsets[i+1]   = offsets[i] + sbsizes[2*i];
349:     sbsizes[2*i+1] = offsets[i];
350:   }
351:   PetscFree(offsets);
352:   if (sbrowsizes) *sbrowsizes = sbsizes;
353:   if (sbrows) *sbrows = sbdata;
354:   PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
355:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
356:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
357:   return(0);
358: }
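
/*
   Shape of the reply assembled above (hypothetical values for illustration): for each
   requesting rank the data is a sequence of <is id, number of columns, columns> triples, e.g.

       sbdata = { 1, 4, 3, 8, 9, 20,   0, 2, 5, 6 }

   meaning is[1] gains the four columns 3,8,9,20 and is[0] gains the two columns 5,6, while
   sbsizes[2*i] records the total length sent to rank i and sbsizes[2*i+1] its offset.
*/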

360: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
361: {
362:   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
363:   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
364:   PetscInt          lsize,lsize_tmp,owner;
365:   PetscMPIInt       rank;
366:   Mat               amat,bmat;
367:   PetscBool         done;
368:   PetscLayout       cmap,rmap;
369:   MPI_Comm          comm;
370:   PetscErrorCode    ierr;

373:   PetscObjectGetComm((PetscObject)mat,&comm);
374:   MPI_Comm_rank(comm,&rank);
375:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
376:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
377:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
378:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
379:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
380:   /* is this a safe way to compute the number of nonzero values? */
381:   tnz  = ai[an]+bi[bn];
382:   MatGetLayouts(mat,&rmap,&cmap);
383:   PetscLayoutGetRange(rmap,&rstart,NULL);
384:   PetscLayoutGetRange(cmap,&cstart,NULL);
385:   /* this estimates memory better than the old implementation,
386:    * which used the global size of the matrix
387:    */
388:   PetscMalloc1(tnz,&indices_temp);
389:   for (i=0; i<nidx; i++) {
390:     ISGetLocalSize(is[i],&lsize);
391:     ISGetIndices(is[i],&indices);
392:     lsize_tmp = 0;
393:     for (j=0; j<lsize; j++) {
394:       owner = -1;
395:       row   = indices[j];
396:       PetscLayoutFindOwner(rmap,row,&owner);
397:       if (owner != rank) continue;
398:       /* local number */
399:       row  -= rstart;
400:       start = ai[row];
401:       end   = ai[row+1];
402:       for (k=start; k<end; k++) { /* Amat */
403:         col = aj[k] + cstart;
404:         indices_temp[lsize_tmp++] = col;
405:       }
406:       start = bi[row];
407:       end   = bi[row+1];
408:       for (k=start; k<end; k++) { /* Bmat */
409:         col = gcols[bj[k]];
410:         indices_temp[lsize_tmp++] = col;
411:       }
412:     }
413:    ISRestoreIndices(is[i],&indices);
414:    ISDestroy(&is[i]);
415:    PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
416:    ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
417:   }
418:   PetscFree(indices_temp);
419:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
420:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
421:   return(0);
422: }


425: /*
426:   Sample message format:
427:   If a processor A wants processor B to process some elements corresponding
428:   to index sets is[1],is[5]
429:   mesg [0] = 2   (no of index sets in the mesg)
430:   -----------
431:   mesg [1] = 1 => is[1]
432:   mesg [2] = sizeof(is[1]);
433:   -----------
434:   mesg [3] = 5  => is[5]
435:   mesg [4] = sizeof(is[5]);
436:   -----------
437:   mesg [5]
438:   mesg [n]  data(is[1])
439:   -----------
440:   mesg[n+1]
441:   mesg[m]  data(is[5])
442:   -----------

444:   Notes:
445:   nrqs - no of requests sent (or to be sent out)
446:   nrqr - no of requests received (which have to be, or which have been, processed)
447: */
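
/*
   A concrete instance of the format above (hypothetical values): if A asks B to process two
   rows for is[1] and three rows for is[5], the message is

       mesg = { 2,  1, 2,  5, 3,  r10, r11,  r50, r51, r52 }

   where r10,r11 are the rows for is[1] and r50,r51,r52 the rows for is[5].
*/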
448: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
449: {
450:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
451:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
452:   const PetscInt **idx,*idx_i;
453:   PetscInt       *n,**data,len;
454: #if defined(PETSC_USE_CTABLE)
455:   PetscTable     *table_data,table_data_i;
456:   PetscInt       *tdata,tcount,tcount_max;
457: #else
458:   PetscInt       *data_i,*d_p;
459: #endif
461:   PetscMPIInt    size,rank,tag1,tag2;
462:   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
463:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
464:   PetscBT        *table;
465:   MPI_Comm       comm;
466:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
467:   MPI_Status     *s_status,*recv_status;
468:   char           *t_p;

471:   PetscObjectGetComm((PetscObject)C,&comm);
472:   size = c->size;
473:   rank = c->rank;
474:   M    = C->rmap->N;

476:   PetscObjectGetNewTag((PetscObject)C,&tag1);
477:   PetscObjectGetNewTag((PetscObject)C,&tag2);

479:   PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);

481:   for (i=0; i<imax; i++) {
482:     ISGetIndices(is[i],&idx[i]);
483:     ISGetLocalSize(is[i],&n[i]);
484:   }

486:   /* evaluate communication - mesg to who, length of mesg, and buffer space
487:      required. Based on this, buffers are allocated, and data copied into them  */
488:   PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
489:   PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
490:   PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
491:   PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
492:   for (i=0; i<imax; i++) {
493:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
494:     idx_i = idx[i];
495:     len   = n[i];
496:     for (j=0; j<len; j++) {
497:       row = idx_i[j];
498:       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
499:       PetscLayoutFindOwner(C->rmap,row,&proc);
500:       w4[proc]++;
501:     }
502:     for (j=0; j<size; j++) {
503:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
504:     }
505:   }

507:   nrqs     = 0;              /* no of outgoing messages */
508:   msz      = 0;              /* total mesg length (for all procs) */
509:   w1[rank] = 0;              /* no mesg sent to itself */
510:   w3[rank] = 0;
511:   for (i=0; i<size; i++) {
512:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
513:   }
514:   /* pa - is list of processors to communicate with */
515:   PetscMalloc1(nrqs+1,&pa);
516:   for (i=0,j=0; i<size; i++) {
517:     if (w1[i]) {pa[j] = i; j++;}
518:   }

520:   /* Each message would have a header = 1 + 2*(no of IS) + data */
521:   for (i=0; i<nrqs; i++) {
522:     j      = pa[i];
523:     w1[j] += w2[j] + 2*w3[j];
524:     msz   += w1[j];
525:   }

527:   /* Determine the number of messages to expect, their lengths, and their from-ids */
528:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
529:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

531:   /* Now post the Irecvs corresponding to these messages */
532:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

534:   /* Allocate Memory for outgoing messages */
535:   PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
536:   PetscMemzero(outdat,size*sizeof(PetscInt*));
537:   PetscMemzero(ptr,size*sizeof(PetscInt*));

539:   {
540:     PetscInt *iptr = tmp,ict  = 0;
541:     for (i=0; i<nrqs; i++) {
542:       j         = pa[i];
543:       iptr     +=  ict;
544:       outdat[j] = iptr;
545:       ict       = w1[j];
546:     }
547:   }

549:   /* Form the outgoing messages */
550:   /* plug in the headers */
551:   for (i=0; i<nrqs; i++) {
552:     j            = pa[i];
553:     outdat[j][0] = 0;
554:     PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
555:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
556:   }

558:   /* Memory for doing local proc's work */
559:   {
560:     PetscInt M_BPB_imax = 0;
561: #if defined(PETSC_USE_CTABLE)
562:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
563:     PetscMalloc1(imax,&table_data);
564:     for (i=0; i<imax; i++) {
565:       PetscTableCreate(n[i]+1,M+1,&table_data[i]);
566:     }
567:     PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
568:     for (i=0; i<imax; i++) {
569:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
570:     }
571: #else
572:     PetscInt Mimax = 0;
573:     PetscIntMultError(M,imax, &Mimax);
574:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
575:     PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
576:     for (i=0; i<imax; i++) {
577:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578:       data[i]  = d_p + M*i;
579:     }
580: #endif
581:   }

583:   /* Parse the IS and update local tables and the outgoing buf with the data */
584:   {
585:     PetscInt n_i,isz_i,*outdat_j,ctr_j;
586:     PetscBT  table_i;

588:     for (i=0; i<imax; i++) {
589:       PetscMemzero(ctr,size*sizeof(PetscInt));
590:       n_i     = n[i];
591:       table_i = table[i];
592:       idx_i   = idx[i];
593: #if defined(PETSC_USE_CTABLE)
594:       table_data_i = table_data[i];
595: #else
596:       data_i  = data[i];
597: #endif
598:       isz_i   = isz[i];
599:       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
600:         row  = idx_i[j];
601:         PetscLayoutFindOwner(C->rmap,row,&proc);
602:         if (proc != rank) { /* copy to the outgoing buffer */
603:           ctr[proc]++;
604:           *ptr[proc] = row;
605:           ptr[proc]++;
606:         } else if (!PetscBTLookupSet(table_i,row)) {
607: #if defined(PETSC_USE_CTABLE)
608:           PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
609: #else
610:           data_i[isz_i] = row; /* Update the local table */
611: #endif
612:           isz_i++;
613:         }
614:       }
615:       /* Update the headers for the current IS */
616:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
617:         if ((ctr_j = ctr[j])) {
618:           outdat_j        = outdat[j];
619:           k               = ++outdat_j[0];
620:           outdat_j[2*k]   = ctr_j;
621:           outdat_j[2*k-1] = i;
622:         }
623:       }
624:       isz[i] = isz_i;
625:     }
626:   }

628:   /*  Now  post the sends */
629:   PetscMalloc1(nrqs+1,&s_waits1);
630:   for (i=0; i<nrqs; ++i) {
631:     j    = pa[i];
632:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
633:   }

635:   /* No longer need the original indices */
636:   for (i=0; i<imax; ++i) {
637:     ISRestoreIndices(is[i],idx+i);
638:   }
639:   PetscFree2(*(PetscInt***)&idx,n);

641:   for (i=0; i<imax; ++i) {
642:     ISDestroy(&is[i]);
643:   }

645:   /* Do Local work */
646: #if defined(PETSC_USE_CTABLE)
647:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
648: #else
649:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
650: #endif

652:   /* Receive messages */
653:   PetscMalloc1(nrqr+1,&recv_status);
654:   if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}

656:   PetscMalloc1(nrqs+1,&s_status);
657:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}

659:   /* Phase 1 sends are complete - deallocate buffers */
660:   PetscFree4(outdat,ptr,tmp,ctr);
661:   PetscFree4(w1,w2,w3,w4);

663:   PetscMalloc1(nrqr+1,&xdata);
664:   PetscMalloc1(nrqr+1,&isz1);
665:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
666:   PetscFree(rbuf[0]);
667:   PetscFree(rbuf);


670:   /* Send the data back */
671:   /* Do a global reduction to know the buffer space req for incoming messages */
672:   {
673:     PetscMPIInt *rw1;

675:     PetscCalloc1(size,&rw1);

677:     for (i=0; i<nrqr; ++i) {
678:       proc = recv_status[i].MPI_SOURCE;

680:       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
681:       rw1[proc] = isz1[i];
682:     }
683:     PetscFree(onodes1);
684:     PetscFree(olengths1);

686:     /* Determine the number of messages to expect, their lengths, and their from-ids */
687:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
688:     PetscFree(rw1);
689:   }
690:   /* Now post the Irecvs corresponding to these messages */
691:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

693:   /* Now  post the sends */
694:   PetscMalloc1(nrqr+1,&s_waits2);
695:   for (i=0; i<nrqr; ++i) {
696:     j    = recv_status[i].MPI_SOURCE;
697:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
698:   }

700:   /* receive work done on other processors */
701:   {
702:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
703:     PetscMPIInt idex;
704:     PetscBT     table_i;
705:     MPI_Status  *status2;

707:     PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
708:     for (i=0; i<nrqs; ++i) {
709:       MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
710:       /* Process the message */
711:       rbuf2_i = rbuf2[idex];
712:       ct1     = 2*rbuf2_i[0]+1;
713:       jmax    = rbuf2[idex][0];
714:       for (j=1; j<=jmax; j++) {
715:         max     = rbuf2_i[2*j];
716:         is_no   = rbuf2_i[2*j-1];
717:         isz_i   = isz[is_no];
718:         table_i = table[is_no];
719: #if defined(PETSC_USE_CTABLE)
720:         table_data_i = table_data[is_no];
721: #else
722:         data_i  = data[is_no];
723: #endif
724:         for (k=0; k<max; k++,ct1++) {
725:           row = rbuf2_i[ct1];
726:           if (!PetscBTLookupSet(table_i,row)) {
727: #if defined(PETSC_USE_CTABLE)
728:             PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
729: #else
730:             data_i[isz_i] = row;
731: #endif
732:             isz_i++;
733:           }
734:         }
735:         isz[is_no] = isz_i;
736:       }
737:     }

739:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
740:     PetscFree(status2);
741:   }

743: #if defined(PETSC_USE_CTABLE)
744:   tcount_max = 0;
745:   for (i=0; i<imax; ++i) {
746:     table_data_i = table_data[i];
747:     PetscTableGetCount(table_data_i,&tcount);
748:     if (tcount_max < tcount) tcount_max = tcount;
749:   }
750:   PetscMalloc1(tcount_max+1,&tdata);
751: #endif

753:   for (i=0; i<imax; ++i) {
754: #if defined(PETSC_USE_CTABLE)
755:     PetscTablePosition tpos;
756:     table_data_i = table_data[i];

758:     PetscTableGetHeadPosition(table_data_i,&tpos);
759:     while (tpos) {
760:       PetscTableGetNext(table_data_i,&tpos,&k,&j);
761:       tdata[--j] = --k;
762:     }
763:     ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);
764: #else
765:     ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
766: #endif
767:   }

769:   PetscFree(onodes2);
770:   PetscFree(olengths2);

772:   PetscFree(pa);
773:   PetscFree(rbuf2[0]);
774:   PetscFree(rbuf2);
775:   PetscFree(s_waits1);
776:   PetscFree(r_waits1);
777:   PetscFree(s_waits2);
778:   PetscFree(r_waits2);
779:   PetscFree(s_status);
780:   PetscFree(recv_status);
781:   PetscFree(xdata[0]);
782:   PetscFree(xdata);
783:   PetscFree(isz1);
784: #if defined(PETSC_USE_CTABLE)
785:   for (i=0; i<imax; i++) {
786:     PetscTableDestroy((PetscTable*)&table_data[i]);
787:   }
788:   PetscFree(table_data);
789:   PetscFree(tdata);
790:   PetscFree4(table,data,isz,t_p);
791: #else
792:   PetscFree5(table,data,isz,d_p,t_p);
793: #endif
794:   return(0);
795: }

797: /*
798:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once(), to do
799:        the work on the local processor.

801:      Inputs:
802:       C      - MAT_MPIAIJ;
803:       imax - total no of index sets processed at a time;
804:       table  - an array of char - size = m bits.

806:      Output:
807:       isz    - array containing the count of the solution elements corresponding
808:                to each index set;
809:       data or table_data  - pointer to the solutions
810: */
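
/*
   The mark-and-append idiom used below (shown here as a one-line sketch): PetscBTLookupSet()
   returns the previous state of the bit and sets it, so each candidate column is recorded at
   most once:

       if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
*/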
811: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812: {
813:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814:   Mat        A  = c->A,B = c->B;
815:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
817:   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
818:   PetscBT    table_i;
819: #if defined(PETSC_USE_CTABLE)
820:   PetscTable         table_data_i;
821:   PetscErrorCode     ierr;
822:   PetscTablePosition tpos;
823:   PetscInt           tcount,*tdata;
824: #else
825:   PetscInt           *data_i;
826: #endif

829:   rstart = C->rmap->rstart;
830:   cstart = C->cmap->rstart;
831:   ai     = a->i;
832:   aj     = a->j;
833:   bi     = b->i;
834:   bj     = b->j;
835:   garray = c->garray;

837:   for (i=0; i<imax; i++) {
838: #if defined(PETSC_USE_CTABLE)
839:     /* copy existing entries of table_data_i into tdata[] */
840:     table_data_i = table_data[i];
841:     PetscTableGetCount(table_data_i,&tcount);
842:     if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

844:     PetscMalloc1(tcount,&tdata);
845:     PetscTableGetHeadPosition(table_data_i,&tpos);
846:     while (tpos) {
847:       PetscTableGetNext(table_data_i,&tpos,&row,&j);
848:       tdata[--j] = --row;
849:       if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850:     }
851: #else
852:     data_i  = data[i];
853: #endif
854:     table_i = table[i];
855:     isz_i   = isz[i];
856:     max     = isz[i];

858:     for (j=0; j<max; j++) {
859: #if defined(PETSC_USE_CTABLE)
860:       row   = tdata[j] - rstart;
861: #else
862:       row   = data_i[j] - rstart;
863: #endif
864:       start = ai[row];
865:       end   = ai[row+1];
866:       for (k=start; k<end; k++) { /* Amat */
867:         val = aj[k] + cstart;
868:         if (!PetscBTLookupSet(table_i,val)) {
869: #if defined(PETSC_USE_CTABLE)
870:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
871: #else
872:           data_i[isz_i] = val;
873: #endif
874:           isz_i++;
875:         }
876:       }
877:       start = bi[row];
878:       end   = bi[row+1];
879:       for (k=start; k<end; k++) { /* Bmat */
880:         val = garray[bj[k]];
881:         if (!PetscBTLookupSet(table_i,val)) {
882: #if defined(PETSC_USE_CTABLE)
883:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
884: #else
885:           data_i[isz_i] = val;
886: #endif
887:           isz_i++;
888:         }
889:       }
890:     }
891:     isz[i] = isz_i;

893: #if defined(PETSC_USE_CTABLE)
894:     PetscFree(tdata);
895: #endif
896:   }
897:   return(0);
898: }

900: /*
901:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
902:          and return the output

904:          Input:
905:            C    - the matrix
906:            nrqr - no of messages being processed.
907:            rbuf - an array of pointers to the received requests

909:          Output:
910:            xdata - array of messages to be sent back
911:            isz1  - size of each message

913:   For better efficiency perhaps we should malloc each xdata[i] separately;
914: then, if a realloc is required, we need only copy the data for that one row
915: rather than all previous rows, as it is now where a single large chunk of
916: memory is used.

918: */
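
/*
   Layout of the output buffers (a summary of the code below): xdata[0] is a single allocation
   of mem_estimate integers that is grown by 1.5x whenever it fills; xdata[i+1] = xdata[i] +
   isz1[i], so the replies to all nrqr requests sit back to back and each can be sent with one
   pointer and the length isz1[i].
*/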
919: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920: {
921:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
922:   Mat            A  = c->A,B = c->B;
923:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
925:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928:   PetscInt       *rbuf_i,kmax,rbuf_0;
929:   PetscBT        xtable;

932:   m      = C->rmap->N;
933:   rstart = C->rmap->rstart;
934:   cstart = C->cmap->rstart;
935:   ai     = a->i;
936:   aj     = a->j;
937:   bi     = b->i;
938:   bj     = b->j;
939:   garray = c->garray;


942:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943:     rbuf_i =  rbuf[i];
944:     rbuf_0 =  rbuf_i[0];
945:     ct    += rbuf_0;
946:     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947:   }

949:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950:   else max1 = 1;
951:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952:   PetscMalloc1(mem_estimate,&xdata[0]);
953:   ++no_malloc;
954:   PetscBTCreate(m,&xtable);
955:   PetscMemzero(isz1,nrqr*sizeof(PetscInt));

957:   ct3 = 0;
958:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
959:     rbuf_i =  rbuf[i];
960:     rbuf_0 =  rbuf_i[0];
961:     ct1    =  2*rbuf_0+1;
962:     ct2    =  ct1;
963:     ct3   += ct1;
964:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965:       PetscBTMemzero(m,xtable);
966:       oct2 = ct2;
967:       kmax = rbuf_i[2*j];
968:       for (k=0; k<kmax; k++,ct1++) {
969:         row = rbuf_i[ct1];
970:         if (!PetscBTLookupSet(xtable,row)) {
971:           if (!(ct3 < mem_estimate)) {
972:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973:             PetscMalloc1(new_estimate,&tmp);
974:             PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
975:             PetscFree(xdata[0]);
976:             xdata[0]     = tmp;
977:             mem_estimate = new_estimate; ++no_malloc;
978:             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979:           }
980:           xdata[i][ct2++] = row;
981:           ct3++;
982:         }
983:       }
984:       for (k=oct2,max2=ct2; k<max2; k++) {
985:         row   = xdata[i][k] - rstart;
986:         start = ai[row];
987:         end   = ai[row+1];
988:         for (l=start; l<end; l++) {
989:           val = aj[l] + cstart;
990:           if (!PetscBTLookupSet(xtable,val)) {
991:             if (!(ct3 < mem_estimate)) {
992:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993:               PetscMalloc1(new_estimate,&tmp);
994:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
995:               PetscFree(xdata[0]);
996:               xdata[0]     = tmp;
997:               mem_estimate = new_estimate; ++no_malloc;
998:               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999:             }
1000:             xdata[i][ct2++] = val;
1001:             ct3++;
1002:           }
1003:         }
1004:         start = bi[row];
1005:         end   = bi[row+1];
1006:         for (l=start; l<end; l++) {
1007:           val = garray[bj[l]];
1008:           if (!PetscBTLookupSet(xtable,val)) {
1009:             if (!(ct3 < mem_estimate)) {
1010:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011:               PetscMalloc1(new_estimate,&tmp);
1012:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
1013:               PetscFree(xdata[0]);
1014:               xdata[0]     = tmp;
1015:               mem_estimate = new_estimate; ++no_malloc;
1016:               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017:             }
1018:             xdata[i][ct2++] = val;
1019:             ct3++;
1020:           }
1021:         }
1022:       }
1023:       /* Update the header*/
1024:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025:       xdata[i][2*j-1] = rbuf_i[2*j-1];
1026:     }
1027:     xdata[i][0] = rbuf_0;
1028:     xdata[i+1]  = xdata[i] + ct2;
1029:     isz1[i]     = ct2; /* size of each message */
1030:   }
1031:   PetscBTDestroy(&xtable);
1032:   PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1033:   return(0);
1034: }
1035: /* -------------------------------------------------------------------------*/
1036: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037: /*
1038:     Every processor gets the entire matrix
1039: */
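
/*
   The routine below proceeds in up to three MPI_Allgatherv() passes: one for the per-row
   nonzero counts, one for the column indices, and, when flag is MAT_GET_VALUES, one for the
   numerical values, so that every rank ends up with the entire matrix in a sequential AIJ.
*/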
1040: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1041: {
1042:   Mat            B;
1043:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1044:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046:   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
1047:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1048:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1049:   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

1052:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1053:   MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);

1055:   if (scall == MAT_INITIAL_MATRIX) {
1056:     /* ----------------------------------------------------------------
1057:          Tell every processor the number of nonzeros per row
1058:     */
1059:     PetscMalloc1(A->rmap->N,&lens);
1060:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1061:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1062:     }
1063:     PetscMalloc2(size,&recvcounts,size,&displs);
1064:     for (i=0; i<size; i++) {
1065:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1066:       displs[i]     = A->rmap->range[i];
1067:     }
1068: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1069:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1070: #else
1071:     sendcount = A->rmap->rend - A->rmap->rstart;
1072:     MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1073: #endif
1074:     /* ---------------------------------------------------------------
1075:          Create the sequential matrix of the same type as the local block diagonal
1076:     */
1077:     MatCreate(PETSC_COMM_SELF,&B);
1078:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1079:     MatSetBlockSizesFromMats(B,A,A);
1080:     MatSetType(B,((PetscObject)a->A)->type_name);
1081:     MatSeqAIJSetPreallocation(B,0,lens);
1082:     PetscCalloc1(2,Bin);
1083:     **Bin = B;
1084:     b     = (Mat_SeqAIJ*)B->data;

1086:     /*--------------------------------------------------------------------
1087:        Copy my part of matrix column indices over
1088:     */
1089:     sendcount  = ad->nz + bd->nz;
1090:     jsendbuf   = b->j + b->i[rstarts[rank]];
1091:     a_jsendbuf = ad->j;
1092:     b_jsendbuf = bd->j;
1093:     n          = A->rmap->rend - A->rmap->rstart;
1094:     cnt        = 0;
1095:     for (i=0; i<n; i++) {

1097:       /* put in lower diagonal portion */
1098:       m = bd->i[i+1] - bd->i[i];
1099:       while (m > 0) {
1100:         /* is it above diagonal (in bd (compressed) numbering) */
1101:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1102:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1103:         m--;
1104:       }

1106:       /* put in diagonal portion */
1107:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1108:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1109:       }

1111:       /* put in upper diagonal portion */
1112:       while (m-- > 0) {
1113:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1114:       }
1115:     }
1116:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1118:     /*--------------------------------------------------------------------
1119:        Gather all column indices to all processors
1120:     */
1121:     for (i=0; i<size; i++) {
1122:       recvcounts[i] = 0;
1123:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1124:         recvcounts[i] += lens[j];
1125:       }
1126:     }
1127:     displs[0] = 0;
1128:     for (i=1; i<size; i++) {
1129:       displs[i] = displs[i-1] + recvcounts[i-1];
1130:     }
1131: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1132:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1133: #else
1134:     MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1135: #endif
1136:     /*--------------------------------------------------------------------
1137:         Assemble the matrix into usable form (note numerical values not yet set)
1138:     */
1139:     /* set the b->ilen (length of each row) values */
1140:     PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
1141:     /* set the b->i indices */
1142:     b->i[0] = 0;
1143:     for (i=1; i<=A->rmap->N; i++) {
1144:       b->i[i] = b->i[i-1] + lens[i-1];
1145:     }
1146:     PetscFree(lens);
1147:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1148:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

1150:   } else {
1151:     B = **Bin;
1152:     b = (Mat_SeqAIJ*)B->data;
1153:   }

1155:   /*--------------------------------------------------------------------
1156:        Copy my part of matrix numerical values into the values location
1157:   */
1158:   if (flag == MAT_GET_VALUES) {
1159:     sendcount = ad->nz + bd->nz;
1160:     sendbuf   = b->a + b->i[rstarts[rank]];
1161:     a_sendbuf = ad->a;
1162:     b_sendbuf = bd->a;
1163:     b_sendj   = bd->j;
1164:     n         = A->rmap->rend - A->rmap->rstart;
1165:     cnt       = 0;
1166:     for (i=0; i<n; i++) {

1168:       /* put in lower diagonal portion */
1169:       m = bd->i[i+1] - bd->i[i];
1170:       while (m > 0) {
1171:         /* is it above diagonal (in bd (compressed) numbering) */
1172:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1173:         sendbuf[cnt++] = *b_sendbuf++;
1174:         m--;
1175:         b_sendj++;
1176:       }

1178:       /* put in diagonal portion */
1179:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1180:         sendbuf[cnt++] = *a_sendbuf++;
1181:       }

1183:       /* put in upper diagonal portion */
1184:       while (m-- > 0) {
1185:         sendbuf[cnt++] = *b_sendbuf++;
1186:         b_sendj++;
1187:       }
1188:     }
1189:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1191:     /* -----------------------------------------------------------------
1192:        Gather all numerical values to all processors
1193:     */
1194:     if (!recvcounts) {
1195:       PetscMalloc2(size,&recvcounts,size,&displs);
1196:     }
1197:     for (i=0; i<size; i++) {
1198:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1199:     }
1200:     displs[0] = 0;
1201:     for (i=1; i<size; i++) {
1202:       displs[i] = displs[i-1] + recvcounts[i-1];
1203:     }
1204:     recvbuf = b->a;
1205: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1206:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1207: #else
1208:     MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1209: #endif
1210:   }  /* end of (flag == MAT_GET_VALUES) */
1211:   PetscFree2(recvcounts,displs);

1213:   if (A->symmetric) {
1214:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1215:   } else if (A->hermitian) {
1216:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1217:   } else if (A->structurally_symmetric) {
1218:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1219:   }
1220:   return(0);
1221: }

1223: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1224: {
1225:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1226:   Mat            submat,A = c->A,B = c->B;
1227:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1228:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1229:   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1230:   const PetscInt *icol,*irow;
1231:   PetscInt       nrow,ncol,start;
1233:   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1234:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1235:   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1236:   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1237:   PetscInt       *lens,rmax,ncols,*cols,Crow;
1238: #if defined(PETSC_USE_CTABLE)
1239:   PetscTable     cmap,rmap;
1240:   PetscInt       *cmap_loc,*rmap_loc;
1241: #else
1242:   PetscInt       *cmap,*rmap;
1243: #endif
1244:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1245:   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1246:   PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1247:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1248:   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1249:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1250:   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1251:   MPI_Comm       comm;
1252:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1253:   PetscMPIInt    *onodes1,*olengths1,idex,end;
1254:   Mat_SubSppt    *smatis1;
1255:   PetscBool      isrowsorted,iscolsorted;

1258:   if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");

1260:   PetscObjectGetComm((PetscObject)C,&comm);
1261:   size = c->size;
1262:   rank = c->rank;

1264:   ISSorted(iscol[0],&iscolsorted);
1265:   ISSorted(isrow[0],&isrowsorted);
1266:   ISGetIndices(isrow[0],&irow);
1267:   ISGetLocalSize(isrow[0],&nrow);
1268:   if (allcolumns) {
1269:     icol = NULL;
1270:     ncol = C->cmap->N;
1271:   } else {
1272:     ISGetIndices(iscol[0],&icol);
1273:     ISGetLocalSize(iscol[0],&ncol);
1274:   }

1276:   if (scall == MAT_INITIAL_MATRIX) {
1277:     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

1279:     /* Get some new tags to keep the communication clean */
1280:     tag1 = ((PetscObject)C)->tag;
1281:     PetscObjectGetNewTag((PetscObject)C,&tag2);
1282:     PetscObjectGetNewTag((PetscObject)C,&tag3);

1284:     /* evaluate communication - mesg to who, length of mesg, and buffer space
1285:      required. Based on this, buffers are allocated, and data copied into them */
1286:     PetscCalloc2(size,&w1,size,&w2);
1287:     PetscMalloc1(nrow,&row2proc);

1289:     /* w1[proc] = num of rows owned by proc -- to be requested */
1290:     proc = 0;
1291:     nrqs = 0; /* num of outgoing messages */
1292:     for (j=0; j<nrow; j++) {
1293:       row  = irow[j];
1294:       if (!isrowsorted) proc = 0;
1295:       while (row >= C->rmap->range[proc+1]) proc++;
1296:       w1[proc]++;
1297:       row2proc[j] = proc; /* map row index to proc */

1299:       if (proc != rank && !w2[proc]) {
1300:         w2[proc] = 1; nrqs++;
1301:       }
1302:     }
1303:     w1[rank] = 0;  /* rows owned by self will not be requested */

1305:     PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1306:     for (proc=0,j=0; proc<size; proc++) {
1307:       if (w1[proc]) { pa[j++] = proc;}
1308:     }

1310:     /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1311:     msz = 0;              /* total mesg length (for all procs) */
1312:     for (i=0; i<nrqs; i++) {
1313:       proc      = pa[i];
1314:       w1[proc] += 3;
1315:       msz      += w1[proc];
1316:     }
1317:     PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

1319:     /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
1320:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1321:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);

1323:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1324:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1325:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

1327:     /* Now post the Irecvs corresponding to these messages */
1328:     PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

1330:     PetscFree(onodes1);
1331:     PetscFree(olengths1);

1333:     /* Allocate Memory for outgoing messages */
1334:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1335:     PetscMemzero(sbuf1,size*sizeof(PetscInt*));
1336:     PetscMemzero(ptr,size*sizeof(PetscInt*));

1338:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1339:     iptr = tmp;
1340:     for (i=0; i<nrqs; i++) {
1341:       proc        = pa[i];
1342:       sbuf1[proc] = iptr;
1343:       iptr       += w1[proc];
1344:     }

1346:     /* Form the outgoing messages */
1347:     /* Initialize the header space */
1348:     for (i=0; i<nrqs; i++) {
1349:       proc      = pa[i];
1350:       PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));
1351:       ptr[proc] = sbuf1[proc] + 3;
1352:     }

1354:     /* Parse the isrow and copy data into outbuf */
1355:     PetscMemzero(ctr,size*sizeof(PetscInt));
1356:     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1357:       proc = row2proc[j];
1358:       if (proc != rank) { /* copy to the outgoing buf*/
1359:         *ptr[proc] = irow[j];
1360:         ctr[proc]++; ptr[proc]++;
1361:       }
1362:     }

1364:     /* Update the headers for the current IS */
1365:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1366:       if ((ctr_j = ctr[j])) {
1367:         sbuf1_j        = sbuf1[j];
1368:         k              = ++sbuf1_j[0];
1369:         sbuf1_j[2*k]   = ctr_j;
1370:         sbuf1_j[2*k-1] = 0;
1371:       }
1372:     }

1374:     /* Now post the sends */
1375:     PetscMalloc1(nrqs+1,&s_waits1);
1376:     for (i=0; i<nrqs; ++i) {
1377:       proc = pa[i];
1378:       MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1379:     }

1381:     /* Post Receives to capture the buffer size */
1382:     PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);
1383:     PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);

1385:     rbuf2[0] = tmp + msz;
1386:     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

1388:     for (i=0; i<nrqs; ++i) {
1389:       proc = pa[i];
1390:       MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1391:     }

1393:     PetscFree2(w1,w2);

1395:     /* Send to other procs the buf size they should allocate */
1396:     /* Receive messages*/
1397:     PetscMalloc1(nrqr+1,&r_status1);
1398:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);

1400:     MPI_Waitall(nrqr,r_waits1,r_status1);
1401:     for (i=0; i<nrqr; ++i) {
1402:       req_size[i] = 0;
1403:       rbuf1_i        = rbuf1[i];
1404:       start          = 2*rbuf1_i[0] + 1;
1405:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
1406:       PetscMalloc1(end+1,&sbuf2[i]);
1407:       sbuf2_i        = sbuf2[i];
1408:       for (j=start; j<end; j++) {
1409:         k            = rbuf1_i[j] - rstart;
1410:         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1411:         sbuf2_i[j]   = ncols;
1412:         req_size[i] += ncols;
1413:       }
1414:       req_source1[i] = r_status1[i].MPI_SOURCE;

1416:       /* form the header */
1417:       sbuf2_i[0] = req_size[i];
1418:       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

1420:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1421:     }

1423:     PetscFree(r_status1);
1424:     PetscFree(r_waits1);

1426:     /* rbuf2 is received, Post recv column indices a->j */
1427:     MPI_Waitall(nrqs,r_waits2,r_status2);

1429:     PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);
1430:     for (i=0; i<nrqs; ++i) {
1431:       PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
1432:       req_source2[i] = r_status2[i].MPI_SOURCE;
1433:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1434:     }

1436:     /* Wait on sends1 and sends2 */
1437:     PetscMalloc1(nrqs+1,&s_status1);
1438:     MPI_Waitall(nrqs,s_waits1,s_status1);
1439:     PetscFree(s_waits1);
1440:     PetscFree(s_status1);

1442:     MPI_Waitall(nrqr,s_waits2,s_status2);
1443:     PetscFree4(r_status2,s_waits2,r_waits2,s_status2);

1445:     /* Now allocate sending buffers for a->j, and send them off */
1446:     PetscMalloc1(nrqr+1,&sbuf_aj);
1447:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1448:     PetscMalloc1(j+1,&sbuf_aj[0]);
1449:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

1451:     for (i=0; i<nrqr; i++) { /* for each requested message */
1452:       rbuf1_i   = rbuf1[i];
1453:       sbuf_aj_i = sbuf_aj[i];
1454:       ct1       = 2*rbuf1_i[0] + 1;
1455:       ct2       = 0;
1456:       /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */

1458:       kmax = rbuf1[i][2];
1459:       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1460:         row    = rbuf1_i[ct1] - rstart;
1461:         nzA    = ai[row+1] - ai[row];
1462:         nzB    = bi[row+1] - bi[row];
1463:         ncols  = nzA + nzB;
1464:         cworkA = aj + ai[row]; cworkB = bj + bi[row];

1466:         /* load the column indices for this row into cols*/
1467:         cols = sbuf_aj_i + ct2;

1469:         lwrite = 0;
1470:         for (l=0; l<nzB; l++) {
1471:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472:         }
1473:         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474:         for (l=0; l<nzB; l++) {
1475:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476:         }

1478:         ct2 += ncols;
1479:       }
1480:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1481:     }

1483:     /* create column map (cmap): global col of C -> local col of submat */
1484: #if defined(PETSC_USE_CTABLE)
1485:     if (!allcolumns) {
1486:       PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);
1487:       PetscCalloc1(C->cmap->n,&cmap_loc);
1488:       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1489:         if (icol[j] >= cstart && icol[j] <cend) {
1490:           cmap_loc[icol[j] - cstart] = j+1;
1491:         } else { /* use PetscTable for non-local col indices */
1492:           PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1493:         }
1494:       }
1495:     } else {
1496:       cmap     = NULL;
1497:       cmap_loc = NULL;
1498:     }
1499:     PetscCalloc1(C->rmap->n,&rmap_loc);
1500: #else
1501:     if (!allcolumns) {
1502:       PetscCalloc1(C->cmap->N,&cmap);
1503:       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1504:     } else {
1505:       cmap = NULL;
1506:     }
1507: #endif
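    /* Convention note (editorial): cmap/cmap_loc store j+1 rather than j, so that a lookup
       returning 0 means "column not selected by iscol"; stored values are decremented with
       --tcol before they are used as submat column indices below. */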

1509:     /* Create lens for MatSeqAIJSetPreallocation() */
1510:     PetscCalloc1(nrow,&lens);

1512:     /* Compute lens from local part of C */
1513:     for (j=0; j<nrow; j++) {
1514:       row  = irow[j];
1515:       proc = row2proc[j];
1516:       if (proc == rank) {
1517:         /* diagonal part A = c->A */
1518:         ncols = ai[row-rstart+1] - ai[row-rstart];
1519:         cols  = aj + ai[row-rstart];
1520:         if (!allcolumns) {
1521:           for (k=0; k<ncols; k++) {
1522: #if defined(PETSC_USE_CTABLE)
1523:             tcol = cmap_loc[cols[k]];
1524: #else
1525:             tcol = cmap[cols[k]+cstart];
1526: #endif
1527:             if (tcol) lens[j]++;
1528:           }
1529:         } else { /* allcolumns */
1530:           lens[j] = ncols;
1531:         }

1533:         /* off-diagonal part B = c->B */
1534:         ncols = bi[row-rstart+1] - bi[row-rstart];
1535:         cols  = bj + bi[row-rstart];
1536:         if (!allcolumns) {
1537:           for (k=0; k<ncols; k++) {
1538: #if defined(PETSC_USE_CTABLE)
1539:             PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1540: #else
1541:             tcol = cmap[bmap[cols[k]]];
1542: #endif
1543:             if (tcol) lens[j]++;
1544:           }
1545:         } else { /* allcolumns */
1546:           lens[j] += ncols;
1547:         }
1548:       }
1549:     }

1551:     /* Create row map (rmap): global row of C -> local row of submat */
1552: #if defined(PETSC_USE_CTABLE)
1553:     PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);
1554:     for (j=0; j<nrow; j++) {
1555:       row  = irow[j];
1556:       proc = row2proc[j];
1557:       if (proc == rank) { /* a local row */
1558:         rmap_loc[row - rstart] = j;
1559:       } else {
1560:         PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1561:       }
1562:     }
1563: #else
1564:     PetscCalloc1(C->rmap->N,&rmap);
1565:     for (j=0; j<nrow; j++) {
1566:       rmap[irow[j]] = j;
1567:     }
1568: #endif

1570:     /* Update lens from offproc data */
1571:     /* recv a->j is done */
1572:     MPI_Waitall(nrqs,r_waits3,r_status3);
1573:     for (i=0; i<nrqs; i++) {
1574:       proc    = pa[i];
1575:       sbuf1_i = sbuf1[proc];
1576:       /* jmax    = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1577:       ct1     = 2 + 1;
1578:       ct2     = 0;
1579:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1580:       rbuf3_i = rbuf3[i]; /* received C->j */

1582:       /* is_no  = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1583:       max1   = sbuf1_i[2];
1584:       for (k=0; k<max1; k++,ct1++) {
1585: #if defined(PETSC_USE_CTABLE)
1586:         PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1587:         row--;
1588:         if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1589: #else
1590:         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1591: #endif
1592:         /* Now, store row index of submat in sbuf1_i[ct1] */
1593:         sbuf1_i[ct1] = row;

1595:         nnz = rbuf2_i[ct1];
1596:         if (!allcolumns) {
1597:           for (l=0; l<nnz; l++,ct2++) {
1598: #if defined(PETSC_USE_CTABLE)
1599:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1600:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1601:             } else {
1602:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1603:             }
1604: #else
1605:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1606: #endif
1607:             if (tcol) lens[row]++;
1608:           }
1609:         } else { /* allcolumns */
1610:           lens[row] += nnz;
1611:         }
1612:       }
1613:     }
1614:     MPI_Waitall(nrqr,s_waits3,s_status3);
1615:     PetscFree4(r_waits3,s_waits3,r_status3,s_status3);

1617:     /* Create the submatrices */
1618:     MatCreate(PETSC_COMM_SELF,&submat);
1619:     MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);

1621:     ISGetBlockSize(isrow[0],&i);
1622:     ISGetBlockSize(iscol[0],&j);
1623:     MatSetBlockSizes(submat,i,j);
1624:     MatSetType(submat,((PetscObject)A)->type_name);
1625:     MatSeqAIJSetPreallocation(submat,0,lens);

1627:     /* create struct Mat_SubSppt and attach it to submat */
1628:     PetscNew(&smatis1);
1629:     subc = (Mat_SeqAIJ*)submat->data;
1630:     subc->submatis1 = smatis1;

1632:     smatis1->id          = 0;
1633:     smatis1->nrqs        = nrqs;
1634:     smatis1->nrqr        = nrqr;
1635:     smatis1->rbuf1       = rbuf1;
1636:     smatis1->rbuf2       = rbuf2;
1637:     smatis1->rbuf3       = rbuf3;
1638:     smatis1->sbuf2       = sbuf2;
1639:     smatis1->req_source2 = req_source2;

1641:     smatis1->sbuf1       = sbuf1;
1642:     smatis1->ptr         = ptr;
1643:     smatis1->tmp         = tmp;
1644:     smatis1->ctr         = ctr;

1646:     smatis1->pa           = pa;
1647:     smatis1->req_size     = req_size;
1648:     smatis1->req_source1  = req_source1;

1650:     smatis1->allcolumns  = allcolumns;
1651:     smatis1->singleis    = PETSC_TRUE;
1652:     smatis1->row2proc    = row2proc;
1653:     smatis1->rmap        = rmap;
1654:     smatis1->cmap        = cmap;
1655: #if defined(PETSC_USE_CTABLE)
1656:     smatis1->rmap_loc    = rmap_loc;
1657:     smatis1->cmap_loc    = cmap_loc;
1658: #endif

1660:     smatis1->destroy     = submat->ops->destroy;
1661:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1662:     submat->factortype   = C->factortype;

1664:     /* compute rmax */
1665:     rmax = 0;
1666:     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);

1668:   } else { /* scall == MAT_REUSE_MATRIX */
1669:     submat = submats[0];
1670:     if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

1672:     subc    = (Mat_SeqAIJ*)submat->data;
1673:     rmax    = subc->rmax;
1674:     smatis1 = subc->submatis1;
1675:     nrqs        = smatis1->nrqs;
1676:     nrqr        = smatis1->nrqr;
1677:     rbuf1       = smatis1->rbuf1;
1678:     rbuf2       = smatis1->rbuf2;
1679:     rbuf3       = smatis1->rbuf3;
1680:     req_source2 = smatis1->req_source2;

1682:     sbuf1     = smatis1->sbuf1;
1683:     sbuf2     = smatis1->sbuf2;
1684:     ptr       = smatis1->ptr;
1685:     tmp       = smatis1->tmp;
1686:     ctr       = smatis1->ctr;

1688:     pa         = smatis1->pa;
1689:     req_size   = smatis1->req_size;
1690:     req_source1 = smatis1->req_source1;

1692:     allcolumns = smatis1->allcolumns;
1693:     row2proc   = smatis1->row2proc;
1694:     rmap       = smatis1->rmap;
1695:     cmap       = smatis1->cmap;
1696: #if defined(PETSC_USE_CTABLE)
1697:     rmap_loc   = smatis1->rmap_loc;
1698:     cmap_loc   = smatis1->cmap_loc;
1699: #endif
1700:   }

1702:   /* Post recv matrix values */
1703:   PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);
1704:   PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);
1705:   PetscObjectGetNewTag((PetscObject)C,&tag4);
1706:   for (i=0; i<nrqs; ++i) {
1707:     PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
1708:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1709:   }

1711:   /* Allocate sending buffers for a->a, and send them off */
1712:   PetscMalloc1(nrqr+1,&sbuf_aa);
1713:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1714:   PetscMalloc1(j+1,&sbuf_aa[0]);
1715:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
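  /* Editorial note: sbuf_aa[0] owns a single contiguous allocation of sum(req_size[i]) scalars;
     each sbuf_aa[i] points into it, offset by the preceding req_size entries, which is why only
     sbuf_aa[0] (and the pointer array sbuf_aa itself) is freed at the end of this routine. */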

1717:   for (i=0; i<nrqr; i++) {
1718:     rbuf1_i   = rbuf1[i];
1719:     sbuf_aa_i = sbuf_aa[i];
1720:     ct1       = 2*rbuf1_i[0]+1;
1721:     ct2       = 0;
1722:     /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

1724:     kmax = rbuf1_i[2];
1725:     for (k=0; k<kmax; k++,ct1++) {
1726:       row = rbuf1_i[ct1] - rstart;
1727:       nzA = ai[row+1] - ai[row];
1728:       nzB = bi[row+1] - bi[row];
1729:       ncols  = nzA + nzB;
1730:       cworkB = bj + bi[row];
1731:       vworkA = a_a + ai[row];
1732:       vworkB = b_a + bi[row];

1734:       /* load the column values for this row into vals*/
1735:       vals = sbuf_aa_i + ct2;

1737:       lwrite = 0;
1738:       for (l=0; l<nzB; l++) {
1739:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1740:       }
1741:       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1742:       for (l=0; l<nzB; l++) {
1743:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1744:       }

1746:       ct2 += ncols;
1747:     }
1748:     MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1749:   }

1751:   /* Assemble submat */
1752:   /* First assemble the local rows */
1753:   for (j=0; j<nrow; j++) {
1754:     row  = irow[j];
1755:     proc = row2proc[j];
1756:     if (proc == rank) {
1757:       Crow = row - rstart;  /* local row index of C */
1758: #if defined(PETSC_USE_CTABLE)
1759:       row = rmap_loc[Crow]; /* row index of submat */
1760: #else
1761:       row = rmap[row];
1762: #endif

1764:       if (allcolumns) {
1765:         /* diagonal part A = c->A */
1766:         ncols = ai[Crow+1] - ai[Crow];
1767:         cols  = aj + ai[Crow];
1768:         vals  = a->a + ai[Crow];
1769:         i     = 0;
1770:         for (k=0; k<ncols; k++) {
1771:           subcols[i]   = cols[k] + cstart;
1772:           subvals[i++] = vals[k];
1773:         }

1775:         /* off-diagonal part B = c->B */
1776:         ncols = bi[Crow+1] - bi[Crow];
1777:         cols  = bj + bi[Crow];
1778:         vals  = b->a + bi[Crow];
1779:         for (k=0; k<ncols; k++) {
1780:           subcols[i]   = bmap[cols[k]];
1781:           subvals[i++] = vals[k];
1782:         }

1784:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);

1786:       } else { /* !allcolumns */
1787: #if defined(PETSC_USE_CTABLE)
1788:         /* diagonal part A = c->A */
1789:         ncols = ai[Crow+1] - ai[Crow];
1790:         cols  = aj + ai[Crow];
1791:         vals  = a->a + ai[Crow];
1792:         i     = 0;
1793:         for (k=0; k<ncols; k++) {
1794:           tcol = cmap_loc[cols[k]];
1795:           if (tcol) {
1796:             subcols[i]   = --tcol;
1797:             subvals[i++] = vals[k];
1798:           }
1799:         }

1801:         /* off-diagonal part B = c->B */
1802:         ncols = bi[Crow+1] - bi[Crow];
1803:         cols  = bj + bi[Crow];
1804:         vals  = b->a + bi[Crow];
1805:         for (k=0; k<ncols; k++) {
1806:           PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1807:           if (tcol) {
1808:             subcols[i]   = --tcol;
1809:             subvals[i++] = vals[k];
1810:           }
1811:         }
1812: #else
1813:         /* diagonal part A = c->A */
1814:         ncols = ai[Crow+1] - ai[Crow];
1815:         cols  = aj + ai[Crow];
1816:         vals  = a->a + ai[Crow];
1817:         i     = 0;
1818:         for (k=0; k<ncols; k++) {
1819:           tcol = cmap[cols[k]+cstart];
1820:           if (tcol) {
1821:             subcols[i]   = --tcol;
1822:             subvals[i++] = vals[k];
1823:           }
1824:         }

1826:         /* off-diagonal part B = c->B */
1827:         ncols = bi[Crow+1] - bi[Crow];
1828:         cols  = bj + bi[Crow];
1829:         vals  = b->a + bi[Crow];
1830:         for (k=0; k<ncols; k++) {
1831:           tcol = cmap[bmap[cols[k]]];
1832:           if (tcol) {
1833:             subcols[i]   = --tcol;
1834:             subvals[i++] = vals[k];
1835:           }
1836:         }
1837: #endif
1838:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1839:       }
1840:     }
1841:   }

1843:   /* Now assemble the off-proc rows */
1844:   for (i=0; i<nrqs; i++) { /* for each requested message */
1845:     /* recv values from other processes */
1846:     MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1847:     proc    = pa[idex];
1848:     sbuf1_i = sbuf1[proc];
1849:     /* jmax    = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1850:     ct1     = 2 + 1;
1851:     ct2     = 0; /* count of received C->j */
1852:     ct3     = 0; /* count of received C->j that will be inserted into submat */
1853:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1854:     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1855:     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

1857:     /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1858:     max1 = sbuf1_i[2];             /* num of rows */
1859:     for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1860:       row = sbuf1_i[ct1]; /* row index of submat */
1861:       if (!allcolumns) {
1862:         idex = 0;
1863:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1864:           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1865:           for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1866: #if defined(PETSC_USE_CTABLE)
1867:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1868:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1869:             } else {
1870:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1871:             }
1872: #else
1873:             tcol = cmap[rbuf3_i[ct2]];
1874: #endif
1875:             if (tcol) {
1876:               subcols[idex]   = --tcol; /* may not be sorted */
1877:               subvals[idex++] = rbuf4_i[ct2];

1879:               /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1880:                For reuse, we replace the received C->j with the index that should be inserted into submat */
1881:               rbuf3_i[ct3++] = ct2;
1882:             }
1883:           }
1884:           MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1885:         } else { /* scall == MAT_REUSE_MATRIX */
1886:           submat = submats[0];
1887:           subc   = (Mat_SeqAIJ*)submat->data;

1889:           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1890:           for (l=0; l<nnz; l++) {
1891:             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1892:             subvals[idex++] = rbuf4_i[ct2];
1893:           }

1895:           bj = subc->j + subc->i[row]; /* sorted column indices */
1896:           MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1897:         }
1898:       } else { /* allcolumns */
1899:         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1900:         MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1901:         ct2 += nnz;
1902:       }
1903:     }
1904:   }

1906:   /* sends of a->a are done */
1907:   MPI_Waitall(nrqr,s_waits4,s_status4);
1908:   PetscFree4(r_waits4,s_waits4,r_status4,s_status4);

1910:   MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1911:   MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1912:   submats[0] = submat;

1914:   /* Restore the indices */
1915:   ISRestoreIndices(isrow[0],&irow);
1916:   if (!allcolumns) {
1917:     ISRestoreIndices(iscol[0],&icol);
1918:   }

1920:   /* Destroy allocated memory */
1921:   for (i=0; i<nrqs; ++i) {
1922:     PetscFree(rbuf4[i]); /* each rbuf4[i] was obtained from PetscMalloc1() */
1923:   }
1924:   PetscFree3(rbuf4,subcols,subvals);
1925:   PetscFree(sbuf_aa[0]);
1926:   PetscFree(sbuf_aa);

1928:   if (scall == MAT_INITIAL_MATRIX) {
1929:     PetscFree(lens);
1930:     PetscFree(sbuf_aj[0]);
1931:     PetscFree(sbuf_aj);
1932:   }
1933:   return(0);
1934: }

1936: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1937: {
1939:   PetscInt       ncol;
1940:   PetscBool      colflag,allcolumns=PETSC_FALSE;

1943:   /* Allocate memory to hold all the submatrices */
1944:   if (scall == MAT_INITIAL_MATRIX) {
1945:     PetscCalloc1(2,submat);
1946:   }

1948:   /* Check for special case: each processor requests all columns of the matrix */
1949:   ISIdentity(iscol[0],&colflag);
1950:   ISGetLocalSize(iscol[0],&ncol);
1951:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1953:   MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1954:   return(0);
1955: }

1957: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1958: {
1960:   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1961:   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1962:   Mat_SeqAIJ     *subc;
1963:   Mat_SubSppt    *smat;

1966:   /* Check for special case: each processor has a single IS */
1967:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1968:     MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1969:     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used for a non-single-IS call */
1970:     return(0);
1971:   }

1973:   /* Collect global wantallmatrix and nstages */
1974:   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1975:   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1976:   if (!nmax) nmax = 1;

1978:   if (scall == MAT_INITIAL_MATRIX) {
1979:     /* Collect global wantallmatrix and nstages */
1980:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1981:       ISIdentity(*isrow,&rowflag);
1982:       ISIdentity(*iscol,&colflag);
1983:       ISGetLocalSize(*isrow,&nrow);
1984:       ISGetLocalSize(*iscol,&ncol);
1985:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1986:         wantallmatrix = PETSC_TRUE;

1988:         PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
1989:       }
1990:     }

1992:     /* Determine the number of stages in which the submatrices are extracted.
1993:        Each stage extracts at most nmax submatrices.
1994:        nmax is determined by the matrix column dimension.
1995:        If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1996:     */
1997:     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
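    /* Worked example (editorial; assumes a 4-byte PetscInt): for C->cmap->N = 1,000,000,
       nmax = 20000000/(1000000*4) = 5, so ismax = 12 local index sets give
       nstages = 12/5 + 1 = 3 local stages before the global MPI_MAX reduction below. */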

1999:     in[0] = -1*(PetscInt)wantallmatrix;
2000:     in[1] = nstages;
2001:     MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2002:     wantallmatrix = (PetscBool)(-out[0]);
2003:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

2005:   } else { /* MAT_REUSE_MATRIX */
2006:     if (ismax) {
2007:       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2008:       smat = subc->submatis1;
2009:     } else { /* (*submat)[0] is a dummy matrix */
2010:       smat = (Mat_SubSppt*)(*submat)[0]->data;
2011:     }
2012:     if (!smat) {
2013:       /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2014:       wantallmatrix = PETSC_TRUE;
2015:     } else if (smat->singleis) {
2016:       MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2017:       return(0);
2018:     } else {
2019:       nstages = smat->nstages;
2020:     }
2021:   }

2023:   if (wantallmatrix) {
2024:     MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2025:     return(0);
2026:   }

2028:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2029:   if (scall == MAT_INITIAL_MATRIX) {
2030:     PetscCalloc1(ismax+nstages,submat);
2031:   }

2033:   for (i=0,pos=0; i<nstages; i++) {
2034:     if (pos+nmax <= ismax) max_no = nmax;
2035:     else if (pos == ismax) max_no = 0;
2036:     else                   max_no = ismax-pos;

2038:     MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2039:     if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2040:       smat = (Mat_SubSppt*)(*submat)[pos]->data;
2041:       smat->nstages = nstages;
2042:     }
2043:     pos += max_no;
2044:   }

2046:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2047:     /* save nstages for reuse */
2048:     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2049:     smat = subc->submatis1;
2050:     smat->nstages = nstages;
2051:   }
2052:   return(0);
2053: }
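
/*
   Illustrative usage sketch (editorial comment, not part of this file's build): the routine above
   is normally reached through the public MatCreateSubMatrices() interface. A minimal caller that
   extracts one sequential submatrix per process from an MPIAIJ matrix C might look as follows;
   the index sets chosen here (the locally owned rows and the first 10 global columns) are
   assumptions made purely for illustration.

     IS       isrow,iscol;
     Mat      *submats;
     PetscInt rstart,rend;

     MatGetOwnershipRange(C,&rstart,&rend);
     ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&isrow);
     ISCreateStride(PETSC_COMM_SELF,10,0,1,&iscol);
     MatCreateSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
     .. use submats[0], a sequential AIJ matrix held by each process ..
     MatDestroySubMatrices(1,&submats);
     ISDestroy(&isrow);
     ISDestroy(&iscol);
*/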

2055: /* -------------------------------------------------------------------------*/
2056: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2057: {
2058:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2059:   Mat            A  = c->A;
2060:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2061:   const PetscInt **icol,**irow;
2062:   PetscInt       *nrow,*ncol,start;
2064:   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2065:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2066:   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2067:   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2068:   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2069: #if defined(PETSC_USE_CTABLE)
2070:   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2071: #else
2072:   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2073: #endif
2074:   const PetscInt *irow_i;
2075:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2076:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2077:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2078:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2079:   MPI_Status     *r_status3,*r_status4,*s_status4;
2080:   MPI_Comm       comm;
2081:   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2082:   PetscMPIInt    *onodes1,*olengths1,end;
2083:   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2084:   Mat_SubSppt    *smat_i;
2085:   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2086:   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

2089:   PetscObjectGetComm((PetscObject)C,&comm);
2090:   size = c->size;
2091:   rank = c->rank;

2093:   PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2094:   PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);

2096:   for (i=0; i<ismax; i++) {
2097:     ISSorted(iscol[i],&issorted[i]);
2098:     if (!issorted[i]) iscsorted = issorted[i];

2100:     ISSorted(isrow[i],&issorted[i]);

2102:     ISGetIndices(isrow[i],&irow[i]);
2103:     ISGetLocalSize(isrow[i],&nrow[i]);

2105:     /* Check for special case: allcolumn */
2106:     ISIdentity(iscol[i],&colflag);
2107:     ISGetLocalSize(iscol[i],&ncol[i]);
2108:     if (colflag && ncol[i] == C->cmap->N) {
2109:       allcolumns[i] = PETSC_TRUE;
2110:       icol[i] = NULL;
2111:     } else {
2112:       allcolumns[i] = PETSC_FALSE;
2113:       ISGetIndices(iscol[i],&icol[i]);
2114:     }
2115:   }

2117:   if (scall == MAT_REUSE_MATRIX) {
2118:     /* Assumes new rows are same length as the old rows */
2119:     for (i=0; i<ismax; i++) {
2120:       if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2121:       subc = (Mat_SeqAIJ*)submats[i]->data;
2122:       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

2124:       /* Initialize the matrix as if it were empty */
2125:       PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));

2127:       smat_i   = subc->submatis1;

2129:       nrqs        = smat_i->nrqs;
2130:       nrqr        = smat_i->nrqr;
2131:       rbuf1       = smat_i->rbuf1;
2132:       rbuf2       = smat_i->rbuf2;
2133:       rbuf3       = smat_i->rbuf3;
2134:       req_source2 = smat_i->req_source2;

2136:       sbuf1     = smat_i->sbuf1;
2137:       sbuf2     = smat_i->sbuf2;
2138:       ptr       = smat_i->ptr;
2139:       tmp       = smat_i->tmp;
2140:       ctr       = smat_i->ctr;

2142:       pa          = smat_i->pa;
2143:       req_size    = smat_i->req_size;
2144:       req_source1 = smat_i->req_source1;

2146:       allcolumns[i] = smat_i->allcolumns;
2147:       row2proc[i]   = smat_i->row2proc;
2148:       rmap[i]       = smat_i->rmap;
2149:       cmap[i]       = smat_i->cmap;
2150:     }

2152:     if (!ismax) { /* Get the dummy submatrix and retrieve struct submatis1 */
2153:       if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2154:       smat_i = (Mat_SubSppt*)submats[0]->data;

2156:       nrqs        = smat_i->nrqs;
2157:       nrqr        = smat_i->nrqr;
2158:       rbuf1       = smat_i->rbuf1;
2159:       rbuf2       = smat_i->rbuf2;
2160:       rbuf3       = smat_i->rbuf3;
2161:       req_source2 = smat_i->req_source2;

2163:       sbuf1       = smat_i->sbuf1;
2164:       sbuf2       = smat_i->sbuf2;
2165:       ptr         = smat_i->ptr;
2166:       tmp         = smat_i->tmp;
2167:       ctr         = smat_i->ctr;

2169:       pa          = smat_i->pa;
2170:       req_size    = smat_i->req_size;
2171:       req_source1 = smat_i->req_source1;

2173:       allcolumns[0] = PETSC_FALSE;
2174:     }
2175:   } else { /* scall == MAT_INITIAL_MATRIX */
2176:     /* Get some new tags to keep the communication clean */
2177:     PetscObjectGetNewTag((PetscObject)C,&tag2);
2178:     PetscObjectGetNewTag((PetscObject)C,&tag3);

2180:     /* Evaluate communication: message to whom, length of each message, and buffer space
2181:        required. Based on this, buffers are allocated and data is copied into them */
2182:     PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);   /* mesg size, initialize work vectors */

2184:     for (i=0; i<ismax; i++) {
2185:       jmax   = nrow[i];
2186:       irow_i = irow[i];

2188:       PetscMalloc1(jmax,&row2proc_i);
2189:       row2proc[i] = row2proc_i;

2191:       if (issorted[i]) proc = 0;
2192:       for (j=0; j<jmax; j++) {
2193:         if (!issorted[i]) proc = 0;
2194:         row = irow_i[j];
2195:         while (row >= C->rmap->range[proc+1]) proc++;
2196:         w4[proc]++;
2197:         row2proc_i[j] = proc; /* map row index to proc */
2198:       }
2199:       for (j=0; j<size; j++) {
2200:         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2201:       }
2202:     }

2204:     nrqs     = 0;              /* no of outgoing messages */
2205:     msz      = 0;              /* total mesg length (for all procs) */
2206:     w1[rank] = 0;              /* no mesg sent to self */
2207:     w3[rank] = 0;
2208:     for (i=0; i<size; i++) {
2209:       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2210:     }
2211:     PetscMalloc1(nrqs+1,&pa); /* proc array */
2212:     for (i=0,j=0; i<size; i++) {
2213:       if (w1[i]) { pa[j] = i; j++; }
2214:     }

2216:     /* Each message would have a header = 1 + 2*(no of IS) + data */
2217:     for (i=0; i<nrqs; i++) {
2218:       j      = pa[i];
2219:       w1[j] += w2[j] + 2* w3[j];
2220:       msz   += w1[j];
2221:     }
2222:     PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

2224:     /* Determine the number of messages to expect, their lengths, and their from-ids */
2225:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2226:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

2228:     /* Now post the Irecvs corresponding to these messages */
2229:     tag0 = ((PetscObject)C)->tag;
2230:     PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

2232:     PetscFree(onodes1);
2233:     PetscFree(olengths1);

2235:     /* Allocate Memory for outgoing messages */
2236:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2237:     PetscMemzero(sbuf1,size*sizeof(PetscInt*));
2238:     PetscMemzero(ptr,size*sizeof(PetscInt*));

2240:     {
2241:       PetscInt *iptr = tmp;
2242:       k    = 0;
2243:       for (i=0; i<nrqs; i++) {
2244:         j        = pa[i];
2245:         iptr    += k;
2246:         sbuf1[j] = iptr;
2247:         k        = w1[j];
2248:       }
2249:     }

2251:     /* Form the outgoing messages. Initialize the header space */
2252:     for (i=0; i<nrqs; i++) {
2253:       j           = pa[i];
2254:       sbuf1[j][0] = 0;
2255:       PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
2256:       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2257:     }

2259:     /* Parse the isrow and copy data into outbuf */
2260:     for (i=0; i<ismax; i++) {
2261:       row2proc_i = row2proc[i];
2262:       PetscMemzero(ctr,size*sizeof(PetscInt));
2263:       irow_i = irow[i];
2264:       jmax   = nrow[i];
2265:       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2266:         proc = row2proc_i[j];
2267:         if (proc != rank) { /* copy to the outgoing buf*/
2268:           ctr[proc]++;
2269:           *ptr[proc] = irow_i[j];
2270:           ptr[proc]++;
2271:         }
2272:       }
2273:       /* Update the headers for the current IS */
2274:       for (j=0; j<size; j++) { /* Can Optimise this loop too */
2275:         if ((ctr_j = ctr[j])) {
2276:           sbuf1_j        = sbuf1[j];
2277:           k              = ++sbuf1_j[0];
2278:           sbuf1_j[2*k]   = ctr_j;
2279:           sbuf1_j[2*k-1] = i;
2280:         }
2281:       }
2282:     }
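    /* Sketch of the resulting message layout (editorial, inferred from the packing above):
         sbuf1[j] = { nis, is_a, cnt_a, is_b, cnt_b, ..., <rows of is_a>, <rows of is_b>, ... }
       where nis = w3[j] is the number of index sets with rows owned by proc j, which is why the
       header length 1 + 2*w3[j] was used when the message sizes were computed. */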

2284:     /* Now post the sends */
2285:     PetscMalloc1(nrqs+1,&s_waits1);
2286:     for (i=0; i<nrqs; ++i) {
2287:       j    = pa[i];
2288:       MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2289:     }

2291:     /* Post Receives to capture the buffer size */
2292:     PetscMalloc1(nrqs+1,&r_waits2);
2293:     PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
2294:     rbuf2[0] = tmp + msz;
2295:     for (i=1; i<nrqs; ++i) {
2296:       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2297:     }
2298:     for (i=0; i<nrqs; ++i) {
2299:       j    = pa[i];
2300:       MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2301:     }

2303:     /* Send to other procs the buf size they should allocate */
2304:     /* Receive messages */
2305:     PetscMalloc1(nrqr+1,&s_waits2);
2306:     PetscMalloc1(nrqr+1,&r_status1);
2307:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2308:     {
2309:       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2310:       PetscInt   *sbuf2_i;

2312:       MPI_Waitall(nrqr,r_waits1,r_status1);
2313:       for (i=0; i<nrqr; ++i) {
2314:         req_size[i] = 0;
2315:         rbuf1_i        = rbuf1[i];
2316:         start          = 2*rbuf1_i[0] + 1;
2317:         MPI_Get_count(r_status1+i,MPIU_INT,&end);
2318:         PetscMalloc1(end+1,&sbuf2[i]);
2319:         sbuf2_i        = sbuf2[i];
2320:         for (j=start; j<end; j++) {
2321:           id              = rbuf1_i[j] - rstart;
2322:           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2323:           sbuf2_i[j]      = ncols;
2324:           req_size[i] += ncols;
2325:         }
2326:         req_source1[i] = r_status1[i].MPI_SOURCE;
2327:         /* form the header */
2328:         sbuf2_i[0] = req_size[i];
2329:         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

2331:         MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2332:       }
2333:     }
2334:     PetscFree(r_status1);
2335:     PetscFree(r_waits1);
2336:     PetscFree4(w1,w2,w3,w4);

2338:     /* Receive messages*/
2339:     PetscMalloc1(nrqs+1,&r_waits3);
2340:     PetscMalloc1(nrqs+1,&r_status2);

2342:     MPI_Waitall(nrqs,r_waits2,r_status2);
2343:     for (i=0; i<nrqs; ++i) {
2344:       PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
2345:       req_source2[i] = r_status2[i].MPI_SOURCE;
2346:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2347:     }
2348:     PetscFree(r_status2);
2349:     PetscFree(r_waits2);

2351:     /* Wait on sends1 and sends2 */
2352:     PetscMalloc1(nrqs+1,&s_status1);
2353:     PetscMalloc1(nrqr+1,&s_status2);

2355:     if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
2356:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
2357:     PetscFree(s_status1);
2358:     PetscFree(s_status2);
2359:     PetscFree(s_waits1);
2360:     PetscFree(s_waits2);

2362:     /* Now allocate sending buffers for a->j, and send them off */
2363:     PetscMalloc1(nrqr+1,&sbuf_aj);
2364:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2365:     PetscMalloc1(j+1,&sbuf_aj[0]);
2366:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

2368:     PetscMalloc1(nrqr+1,&s_waits3);
2369:     {
2370:       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2371:       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2372:       PetscInt cend = C->cmap->rend;
2373:       PetscInt *a_j = a->j,*b_j = b->j,ctmp;

2375:       for (i=0; i<nrqr; i++) {
2376:         rbuf1_i   = rbuf1[i];
2377:         sbuf_aj_i = sbuf_aj[i];
2378:         ct1       = 2*rbuf1_i[0] + 1;
2379:         ct2       = 0;
2380:         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2381:           kmax = rbuf1[i][2*j];
2382:           for (k=0; k<kmax; k++,ct1++) {
2383:             row    = rbuf1_i[ct1] - rstart;
2384:             nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2385:             ncols  = nzA + nzB;
2386:             cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

2388:             /* load the column indices for this row into cols */
2389:             cols = sbuf_aj_i + ct2;

2391:             lwrite = 0;
2392:             for (l=0; l<nzB; l++) {
2393:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2394:             }
2395:             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2396:             for (l=0; l<nzB; l++) {
2397:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2398:             }

2400:             ct2 += ncols;
2401:           }
2402:         }
2403:         MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2404:       }
2405:     }
2406:     PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);

2408:     /* create col map: global col of C -> local col of submatrices */
2409:     {
2410:       const PetscInt *icol_i;
2411: #if defined(PETSC_USE_CTABLE)
2412:       for (i=0; i<ismax; i++) {
2413:         if (!allcolumns[i]) {
2414:           PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);

2416:           jmax   = ncol[i];
2417:           icol_i = icol[i];
2418:           cmap_i = cmap[i];
2419:           for (j=0; j<jmax; j++) {
2420:             PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2421:           }
2422:         } else cmap[i] = NULL;
2423:       }
2424: #else
2425:       for (i=0; i<ismax; i++) {
2426:         if (!allcolumns[i]) {
2427:           PetscCalloc1(C->cmap->N,&cmap[i]);
2428:           jmax   = ncol[i];
2429:           icol_i = icol[i];
2430:           cmap_i = cmap[i];
2431:           for (j=0; j<jmax; j++) {
2432:             cmap_i[icol_i[j]] = j+1;
2433:           }
2434:         } else cmap[i] = NULL;
2435:       }
2436: #endif
2437:     }

2439:     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2440:     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2441:     PetscMalloc1(ismax,&lens);

2443:     if (ismax) {
2444:       PetscCalloc1(j,&lens[0]);
2445:     }
2446:     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

2448:     /* Update lens from local data */
2449:     for (i=0; i<ismax; i++) {
2450:       row2proc_i = row2proc[i];
2451:       jmax = nrow[i];
2452:       if (!allcolumns[i]) cmap_i = cmap[i];
2453:       irow_i = irow[i];
2454:       lens_i = lens[i];
2455:       for (j=0; j<jmax; j++) {
2456:         row = irow_i[j];
2457:         proc = row2proc_i[j];
2458:         if (proc == rank) {
2459:           MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
2460:           if (!allcolumns[i]) {
2461:             for (k=0; k<ncols; k++) {
2462: #if defined(PETSC_USE_CTABLE)
2463:               PetscTableFind(cmap_i,cols[k]+1,&tcol);
2464: #else
2465:               tcol = cmap_i[cols[k]];
2466: #endif
2467:               if (tcol) lens_i[j]++;
2468:             }
2469:           } else { /* allcolumns */
2470:             lens_i[j] = ncols;
2471:           }
2472:           MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
2473:         }
2474:       }
2475:     }

2477:     /* Create row map: global row of C -> local row of submatrices */
2478: #if defined(PETSC_USE_CTABLE)
2479:     for (i=0; i<ismax; i++) {
2480:       PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
2481:       irow_i = irow[i];
2482:       jmax   = nrow[i];
2483:       for (j=0; j<jmax; j++) {
2484:         PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2485:       }
2486:     }
2487: #else
2488:     for (i=0; i<ismax; i++) {
2489:       PetscCalloc1(C->rmap->N,&rmap[i]);
2490:       rmap_i = rmap[i];
2491:       irow_i = irow[i];
2492:       jmax   = nrow[i];
2493:       for (j=0; j<jmax; j++) {
2494:         rmap_i[irow_i[j]] = j;
2495:       }
2496:     }
2497: #endif

2499:     /* Update lens from offproc data */
2500:     {
2501:       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

2503:       MPI_Waitall(nrqs,r_waits3,r_status3);
2504:       for (tmp2=0; tmp2<nrqs; tmp2++) {
2505:         sbuf1_i = sbuf1[pa[tmp2]];
2506:         jmax    = sbuf1_i[0];
2507:         ct1     = 2*jmax+1;
2508:         ct2     = 0;
2509:         rbuf2_i = rbuf2[tmp2];
2510:         rbuf3_i = rbuf3[tmp2];
2511:         for (j=1; j<=jmax; j++) {
2512:           is_no  = sbuf1_i[2*j-1];
2513:           max1   = sbuf1_i[2*j];
2514:           lens_i = lens[is_no];
2515:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2516:           rmap_i = rmap[is_no];
2517:           for (k=0; k<max1; k++,ct1++) {
2518: #if defined(PETSC_USE_CTABLE)
2519:             PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2520:             row--;
2521:             if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2522: #else
2523:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new matrix */
2524: #endif
2525:             max2 = rbuf2_i[ct1];
2526:             for (l=0; l<max2; l++,ct2++) {
2527:               if (!allcolumns[is_no]) {
2528: #if defined(PETSC_USE_CTABLE)
2529:                 PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2530: #else
2531:                 tcol = cmap_i[rbuf3_i[ct2]];
2532: #endif
2533:                 if (tcol) lens_i[row]++;
2534:               } else { /* allcolumns */
2535:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2536:               }
2537:             }
2538:           }
2539:         }
2540:       }
2541:     }
2542:     PetscFree(r_waits3);
2543:     if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
2544:     PetscFree2(r_status3,s_status3);
2545:     PetscFree(s_waits3);

2547:     /* Create the submatrices */
2548:     for (i=0; i<ismax; i++) {
2549:       PetscInt    rbs,cbs;

2551:       ISGetBlockSize(isrow[i],&rbs);
2552:       ISGetBlockSize(iscol[i],&cbs);

2554:       MatCreate(PETSC_COMM_SELF,submats+i);
2555:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);

2557:       MatSetBlockSizes(submats[i],rbs,cbs);
2558:       MatSetType(submats[i],((PetscObject)A)->type_name);
2559:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);

2561:       /* create struct Mat_SubSppt and attach it to submat */
2562:       PetscNew(&smat_i);
2563:       subc = (Mat_SeqAIJ*)submats[i]->data;
2564:       subc->submatis1 = smat_i;

2566:       smat_i->destroy          = submats[i]->ops->destroy;
2567:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2568:       submats[i]->factortype   = C->factortype;

2570:       smat_i->id          = i;
2571:       smat_i->nrqs        = nrqs;
2572:       smat_i->nrqr        = nrqr;
2573:       smat_i->rbuf1       = rbuf1;
2574:       smat_i->rbuf2       = rbuf2;
2575:       smat_i->rbuf3       = rbuf3;
2576:       smat_i->sbuf2       = sbuf2;
2577:       smat_i->req_source2 = req_source2;

2579:       smat_i->sbuf1       = sbuf1;
2580:       smat_i->ptr         = ptr;
2581:       smat_i->tmp         = tmp;
2582:       smat_i->ctr         = ctr;

2584:       smat_i->pa           = pa;
2585:       smat_i->req_size     = req_size;
2586:       smat_i->req_source1  = req_source1;

2588:       smat_i->allcolumns  = allcolumns[i];
2589:       smat_i->singleis    = PETSC_FALSE;
2590:       smat_i->row2proc    = row2proc[i];
2591:       smat_i->rmap        = rmap[i];
2592:       smat_i->cmap        = cmap[i];
2593:     }

2595:     if (!ismax) { /* Create a dummy submats[0] to carry the struct needed for reuse */
2596:       MatCreate(PETSC_COMM_SELF,&submats[0]);
2597:       MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2598:       MatSetType(submats[0],MATDUMMY);

2600:       /* create struct Mat_SubSppt and attach it to submat */
2601:       PetscNewLog(submats[0],&smat_i);
2602:       submats[0]->data = (void*)smat_i;

2604:       smat_i->destroy          = submats[0]->ops->destroy;
2605:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2606:       submats[0]->factortype   = C->factortype;

2608:       smat_i->id          = 0;
2609:       smat_i->nrqs        = nrqs;
2610:       smat_i->nrqr        = nrqr;
2611:       smat_i->rbuf1       = rbuf1;
2612:       smat_i->rbuf2       = rbuf2;
2613:       smat_i->rbuf3       = rbuf3;
2614:       smat_i->sbuf2       = sbuf2;
2615:       smat_i->req_source2 = req_source2;

2617:       smat_i->sbuf1       = sbuf1;
2618:       smat_i->ptr         = ptr;
2619:       smat_i->tmp         = tmp;
2620:       smat_i->ctr         = ctr;

2622:       smat_i->pa           = pa;
2623:       smat_i->req_size     = req_size;
2624:       smat_i->req_source1  = req_source1;

2626:       smat_i->allcolumns  = PETSC_FALSE;
2627:       smat_i->singleis    = PETSC_FALSE;
2628:       smat_i->row2proc    = NULL;
2629:       smat_i->rmap        = NULL;
2630:       smat_i->cmap        = NULL;
2631:     }

2633:     if (ismax) {PetscFree(lens[0]);}
2634:     PetscFree(lens);
2635:     PetscFree(sbuf_aj[0]);
2636:     PetscFree(sbuf_aj);

2638:   } /* end of scall == MAT_INITIAL_MATRIX */

2640:   /* Post recv matrix values */
2641:   PetscObjectGetNewTag((PetscObject)C,&tag4);
2642:   PetscMalloc1(nrqs+1,&rbuf4);
2643:   PetscMalloc1(nrqs+1,&r_waits4);
2644:   PetscMalloc1(nrqs+1,&r_status4);
2645:   PetscMalloc1(nrqr+1,&s_status4);
2646:   for (i=0; i<nrqs; ++i) {
2647:     PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
2648:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2649:   }

2651:   /* Allocate sending buffers for a->a, and send them off */
2652:   PetscMalloc1(nrqr+1,&sbuf_aa);
2653:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2654:   PetscMalloc1(j+1,&sbuf_aa[0]);
2655:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

2657:   PetscMalloc1(nrqr+1,&s_waits4);
2658:   {
2659:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2660:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2661:     PetscInt    cend   = C->cmap->rend;
2662:     PetscInt    *b_j   = b->j;
2663:     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

2665:     for (i=0; i<nrqr; i++) {
2666:       rbuf1_i   = rbuf1[i];
2667:       sbuf_aa_i = sbuf_aa[i];
2668:       ct1       = 2*rbuf1_i[0]+1;
2669:       ct2       = 0;
2670:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2671:         kmax = rbuf1_i[2*j];
2672:         for (k=0; k<kmax; k++,ct1++) {
2673:           row    = rbuf1_i[ct1] - rstart;
2674:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
2675:           ncols  = nzA + nzB;
2676:           cworkB = b_j + b_i[row];
2677:           vworkA = a_a + a_i[row];
2678:           vworkB = b_a + b_i[row];

2680:           /* load the column values for this row into vals*/
2681:           vals = sbuf_aa_i+ct2;

2683:           lwrite = 0;
2684:           for (l=0; l<nzB; l++) {
2685:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2686:           }
2687:           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2688:           for (l=0; l<nzB; l++) {
2689:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2690:           }

2692:           ct2 += ncols;
2693:         }
2694:       }
2695:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2696:     }
2697:   }

2699:   /* Assemble the matrices */
2700:   /* First assemble the local rows */
2701:   for (i=0; i<ismax; i++) {
2702:     row2proc_i = row2proc[i];
2703:     subc      = (Mat_SeqAIJ*)submats[i]->data;
2704:     imat_ilen = subc->ilen;
2705:     imat_j    = subc->j;
2706:     imat_i    = subc->i;
2707:     imat_a    = subc->a;

2709:     if (!allcolumns[i]) cmap_i = cmap[i];
2710:     rmap_i = rmap[i];
2711:     irow_i = irow[i];
2712:     jmax   = nrow[i];
2713:     for (j=0; j<jmax; j++) {
2714:       row  = irow_i[j];
2715:       proc = row2proc_i[j];
2716:       if (proc == rank) {
2717:         old_row = row;
2718: #if defined(PETSC_USE_CTABLE)
2719:         PetscTableFind(rmap_i,row+1,&row);
2720:         row--;
2721: #else
2722:         row = rmap_i[row];
2723: #endif
2724:         ilen_row = imat_ilen[row];
2725:         MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2726:         mat_i    = imat_i[row];
2727:         mat_a    = imat_a + mat_i;
2728:         mat_j    = imat_j + mat_i;
2729:         if (!allcolumns[i]) {
2730:           for (k=0; k<ncols; k++) {
2731: #if defined(PETSC_USE_CTABLE)
2732:             PetscTableFind(cmap_i,cols[k]+1,&tcol);
2733: #else
2734:             tcol = cmap_i[cols[k]];
2735: #endif
2736:             if (tcol) {
2737:               *mat_j++ = tcol - 1;
2738:               *mat_a++ = vals[k];
2739:               ilen_row++;
2740:             }
2741:           }
2742:         } else { /* allcolumns */
2743:           for (k=0; k<ncols; k++) {
2744:             *mat_j++ = cols[k];  /* global col index! */
2745:             *mat_a++ = vals[k];
2746:             ilen_row++;
2747:           }
2748:         }
2749:         MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);

2751:         imat_ilen[row] = ilen_row;
2752:       }
2753:     }
2754:   }

2756:   /* Now assemble the off-proc rows */
2757:   MPI_Waitall(nrqs,r_waits4,r_status4);
2758:   for (tmp2=0; tmp2<nrqs; tmp2++) {
2759:     sbuf1_i = sbuf1[pa[tmp2]];
2760:     jmax    = sbuf1_i[0];
2761:     ct1     = 2*jmax + 1;
2762:     ct2     = 0;
2763:     rbuf2_i = rbuf2[tmp2];
2764:     rbuf3_i = rbuf3[tmp2];
2765:     rbuf4_i = rbuf4[tmp2];
2766:     for (j=1; j<=jmax; j++) {
2767:       is_no     = sbuf1_i[2*j-1];
2768:       rmap_i    = rmap[is_no];
2769:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2770:       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2771:       imat_ilen = subc->ilen;
2772:       imat_j    = subc->j;
2773:       imat_i    = subc->i;
2774:       imat_a    = subc->a;
2775:       max1      = sbuf1_i[2*j];
2776:       for (k=0; k<max1; k++,ct1++) {
2777:         row = sbuf1_i[ct1];
2778: #if defined(PETSC_USE_CTABLE)
2779:         PetscTableFind(rmap_i,row+1,&row);
2780:         row--;
2781: #else
2782:         row = rmap_i[row];
2783: #endif
2784:         ilen  = imat_ilen[row];
2785:         mat_i = imat_i[row];
2786:         mat_a = imat_a + mat_i;
2787:         mat_j = imat_j + mat_i;
2788:         max2  = rbuf2_i[ct1];
2789:         if (!allcolumns[is_no]) {
2790:           for (l=0; l<max2; l++,ct2++) {
2791: #if defined(PETSC_USE_CTABLE)
2792:             PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2793: #else
2794:             tcol = cmap_i[rbuf3_i[ct2]];
2795: #endif
2796:             if (tcol) {
2797:               *mat_j++ = tcol - 1;
2798:               *mat_a++ = rbuf4_i[ct2];
2799:               ilen++;
2800:             }
2801:           }
2802:         } else { /* allcolumns */
2803:           for (l=0; l<max2; l++,ct2++) {
2804:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2805:             *mat_a++ = rbuf4_i[ct2];
2806:             ilen++;
2807:           }
2808:         }
2809:         imat_ilen[row] = ilen;
2810:       }
2811:     }
2812:   }

2814:   if (!iscsorted) { /* sort column indices of the rows */
2815:     for (i=0; i<ismax; i++) {
2816:       subc      = (Mat_SeqAIJ*)submats[i]->data;
2817:       imat_j    = subc->j;
2818:       imat_i    = subc->i;
2819:       imat_a    = subc->a;
2820:       imat_ilen = subc->ilen;

2822:       if (allcolumns[i]) continue;
2823:       jmax = nrow[i];
2824:       for (j=0; j<jmax; j++) {
2825:         mat_i = imat_i[j];
2826:         mat_a = imat_a + mat_i;
2827:         mat_j = imat_j + mat_i;
2828:         PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2829:       }
2830:     }
2831:   }

2833:   PetscFree(r_status4);
2834:   PetscFree(r_waits4);
2835:   if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
2836:   PetscFree(s_waits4);
2837:   PetscFree(s_status4);

2839:   /* Restore the indices */
2840:   for (i=0; i<ismax; i++) {
2841:     ISRestoreIndices(isrow[i],irow+i);
2842:     if (!allcolumns[i]) {
2843:       ISRestoreIndices(iscol[i],icol+i);
2844:     }
2845:   }

2847:   for (i=0; i<ismax; i++) {
2848:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2849:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2850:   }

2852:   /* Destroy allocated memory */
2853:   PetscFree(sbuf_aa[0]);
2854:   PetscFree(sbuf_aa);
2855:   PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);

2857:   for (i=0; i<nrqs; ++i) {
2858:     PetscFree(rbuf4[i]);
2859:   }
2860:   PetscFree(rbuf4);

2862:   PetscFree4(row2proc,cmap,rmap,allcolumns);
2863:   return(0);
2864: }

2866: /*
2867:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2868:  Embeddings are assumed to be injections, and the above implies that the range of rowemb is a subset
2869:  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2870:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2871:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2872:  state, and needs to be "assembled" later by compressing B's column space.

2874:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2875:  Following this call, C->A & C->B have been created, even if empty.
2876:  */
2877: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2878: {
2879:   /* If making this function public, change the error returned in this function away from _PLIB. */
2881:   Mat_MPIAIJ     *aij;
2882:   Mat_SeqAIJ     *Baij;
2883:   PetscBool      seqaij,Bdisassembled;
2884:   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2885:   PetscScalar    v;
2886:   const PetscInt *rowindices,*colindices;

2889:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2890:   if (A) {
2891:     PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2892:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2893:     if (rowemb) {
2894:       ISGetLocalSize(rowemb,&m);
2895:       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2896:     } else {
2897:       if (C->rmap->n != A->rmap->n) {
2898:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2899:       }
2900:     }
2901:     if (dcolemb) {
2902:       ISGetLocalSize(dcolemb,&n);
2903:       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2904:     } else {
2905:       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2906:     }
2907:   }
2908:   if (B) {
2909:     PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2910:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2911:     if (rowemb) {
2912:       ISGetLocalSize(rowemb,&m);
2913:       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2914:     } else {
2915:       if (C->rmap->n != B->rmap->n) {
2916:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2917:       }
2918:     }
2919:     if (ocolemb) {
2920:       ISGetLocalSize(ocolemb,&n);
2921:       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2922:     } else {
2923:       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2924:     }
2925:   }

2927:   aij = (Mat_MPIAIJ*)C->data;
2928:   if (!aij->A) {
2929:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2930:     MatCreate(PETSC_COMM_SELF,&aij->A);
2931:     MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2932:     MatSetBlockSizesFromMats(aij->A,C,C);
2933:     MatSetType(aij->A,MATSEQAIJ);
2934:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2935:   }
2936:   if (A) {
2937:     MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2938:   } else {
2939:     MatSetUp(aij->A);
2940:   }
2941:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2942:     /*
2943:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2944:       need to "disassemble" B -- convert it to using C's global indices.
2945:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2947:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2948:       we MatZeroEntries(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2950:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2951:       At least avoid calling MatSetValues() and the implied searches?
2952:     */

2954:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2955: #if defined(PETSC_USE_CTABLE)
2956:       PetscTableDestroy(&aij->colmap);
2957: #else
2958:       PetscFree(aij->colmap);
2959:       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2960:       if (aij->B) {
2961:         PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2962:       }
2963: #endif
2964:       ngcol = 0;
2965:       if (aij->lvec) {
2966:         VecGetSize(aij->lvec,&ngcol);
2967:       }
2968:       if (aij->garray) {
2969:         PetscFree(aij->garray);
2970:         PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2971:       }
2972:       VecDestroy(&aij->lvec);
2973:       VecScatterDestroy(&aij->Mvctx);
2974:     }
2975:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2976:       MatDestroy(&aij->B);
2977:     }
2978:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2979:       MatZeroEntries(aij->B);
2980:     }
2981:   }
2982:   Bdisassembled = PETSC_FALSE;
2983:   if (!aij->B) {
2984:     MatCreate(PETSC_COMM_SELF,&aij->B);
2985:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2986:     MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2987:     MatSetBlockSizesFromMats(aij->B,B,B);
2988:     MatSetType(aij->B,MATSEQAIJ);
2989:     Bdisassembled = PETSC_TRUE;
2990:   }
2991:   if (B) {
2992:     Baij = (Mat_SeqAIJ*)B->data;
2993:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2994:       PetscMalloc1(B->rmap->n,&nz);
2995:       for (i=0; i<B->rmap->n; i++) {
2996:         nz[i] = Baij->i[i+1] - Baij->i[i];
2997:       }
2998:       MatSeqAIJSetPreallocation(aij->B,0,nz);
2999:       PetscFree(nz);
3000:     }

3002:     PetscLayoutGetRange(C->rmap,&rstart,&rend);
3003:     shift = rend-rstart;
3004:     count = 0;
3005:     rowindices = NULL;
3006:     colindices = NULL;
3007:     if (rowemb) {
3008:       ISGetIndices(rowemb,&rowindices);
3009:     }
3010:     if (ocolemb) {
3011:       ISGetIndices(ocolemb,&colindices);
3012:     }
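    /* Editorial note (added commentary): each row index of B is mapped through rowemb and each
       compacted column index through ocolemb, when those embeddings are given.  If aij->B was
       created fresh above (Bdisassembled == PETSC_TRUE), it is addressed by global column indices,
       so compacted off-diag indices at or above rstart are shifted past the local diagonal block
       (shift = rend-rstart) before MatSetValues() is called; otherwise indices are inserted as-is. */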
3013:     for (i=0; i<B->rmap->n; i++) {
3014:       PetscInt row;
3015:       row = i;
3016:       if (rowindices) row = rowindices[i];
3017:       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3018:         col  = Baij->j[count];
3019:         if (colindices) col = colindices[col];
3020:         if (Bdisassembled && col>=rstart) col += shift;
3021:         v    = Baij->a[count];
3022:         MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3023:         ++count;
3024:       }
3025:     }
3026:     /* No assembly for aij->B is necessary. */
3027:     /* FIXME: set aij->B's nonzerostate correctly. */
3028:   } else {
3029:     MatSetUp(aij->B);
3030:   }
3031:   C->preallocated  = PETSC_TRUE;
3032:   C->was_assembled = PETSC_FALSE;
3033:   C->assembled     = PETSC_FALSE;
3034:   /*
3035:      C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3036:      Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3037:   */
3038:   return(0);
3039: }
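/* Editorial sketch (added commentary, not part of the original source): after MatSetSeqMats_MPIAIJ()
   the caller is expected to assemble C, e.g.

     MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

   which invokes MatSetUpMultiply_MPIAIJ() and compresses aij->B into its local (garray-based) form. */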

3041: /*
3042:   B uses local column indices ranging between 0 and N-n; they must be interpreted using garray.
3043:  */
3044: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3045: {
3046:   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;

3051:   /* FIXME: make sure C is assembled */
3052:   *A = aij->A;
3053:   *B = aij->B;
3054:   /* Note that we don't incref *A and *B, so be careful! */
3055:   return(0);
3056: }
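/* Editorial usage sketch (assumed caller code, not part of the original source):

     Mat            A,B;
     const PetscInt *garray = ((Mat_MPIAIJ*)C->data)->garray;
     MatGetSeqMats_MPIAIJ(C,&A,&B);
     ... a local column j of B corresponds to the global column garray[j] of C ...

   The caller must not destroy A or B, since no extra reference is taken. */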

3058: /*
3059:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3060:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3061: */
3062: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3063:                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3064:                                                  PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3065:                                                  PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3066:                                                  PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3067: {
3069:   PetscMPIInt    isize,flag;
3070:   PetscInt       i,ii,cismax,ispar;
3071:   Mat            *A,*B;
3072:   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

3075:   if (!ismax) return(0);

3077:   for (i = 0, cismax = 0; i < ismax; ++i) {
3079:     MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3080:     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3081:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
3082:     if (isize > 1) ++cismax;
3083:   }

3085:   /*
3086:      If cismax is zero on all of C's ranks, then and only then can we use purely sequential matrix extraction.
3087:      ispar is the maximum of cismax over C's comm, so it is nonzero iff at least one rank has a parallel IS.
3088:   */
3089:   MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3090:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3091:     (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3092:     return(0);
3093:   }

3095:   /* if (ispar) */
3096:   /*
3097:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3098:     These are used to extract the off-diag portion of the resulting parallel matrix.
3099:     The row IS for the off-diag portion is the same as for the diag portion,
3100:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3101:   */
3102:   PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3103:   PetscMalloc1(cismax,&ciscol_p);
3104:   for (i = 0, ii = 0; i < ismax; ++i) {
3105:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3106:     if (isize > 1) {
3107:       /*
3108:          TODO: This is the part that's ***NOT SCALABLE***.
3109:          To fix this we need to extract just the indices of C's nonzero columns
3110:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3111:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3112:          to be done without serializing on the IS list, so, most likely, it is best
3113:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3114:       */
3115:       ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3116:       /* Now we have to
3117:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3118:              were sorted on each rank, concatenated they might no longer be sorted;
3119:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3120:              indices in the nondecreasing order to the original index positions.
3121:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3122:       */
3123:       ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3124:       ISSort(ciscol[ii]);
3125:       ++ii;
3126:     }
3127:   }
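  /* Editorial example (added commentary, not part of the original source): on a 2-rank comm, if
     rank 0 stores {1,5} of iscol[i] and rank 1 stores {9,2}, ISGetNonlocalIS() gives rank 0 the
     off-processor part {9,2} as ciscol[ii].  ISSortPermutation() then records the permutation
     {1,0} (sorted position -> original position) in ciscol_p[ii], and ISSort() turns ciscol[ii]
     into {2,9}; the permutation IS is NULL when ciscol[ii] is already strictly increasing. */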
3128:   PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3129:   for (i = 0, ii = 0; i < ismax; ++i) {
3130:     PetscInt       j,issize;
3131:     const PetscInt *indices;

3133:     /*
3134:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3135:      */
3136:     ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3137:     ISSort(isrow[i]);
3138:     ISGetLocalSize(isrow[i],&issize);
3139:     ISGetIndices(isrow[i],&indices);
3140:     for (j = 1; j < issize; ++j) {
3141:       if (indices[j] == indices[j-1]) {
3142:         SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3143:       }
3144:     }
3145:     ISRestoreIndices(isrow[i],&indices);


3148:     ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3149:     ISSort(iscol[i]);
3150:     ISGetLocalSize(iscol[i],&issize);
3151:     ISGetIndices(iscol[i],&indices);
3152:     for (j = 1; j < issize; ++j) {
3153:       if (indices[j-1] == indices[j]) {
3154:         SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3155:       }
3156:     }
3157:     ISRestoreIndices(iscol[i],&indices);
3158:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3159:     if (isize > 1) {
3160:       cisrow[ii] = isrow[i];
3161:       ++ii;
3162:     }
3163:   }
3164:   /*
3165:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3166:     array of sequential matrices underlying the resulting parallel matrices.
3167:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3168:     contain duplicates.

3170:     There are as many diag matrices as there are original index sets. There are only as many parallel
3171:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3173:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3174:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3175:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3176:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3177:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3178:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3179:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
3180:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3181:       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3182:       values into A[i] and B[ii] sitting inside the corresponding submat.
3183:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3184:       A[i], B[ii], AA[i] or BB[ii] matrices.
3185:   */
3186:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3187:   if (scall == MAT_INITIAL_MATRIX) {
3188:     PetscMalloc1(ismax,submat);
3189:   }

3191:   /* Now obtain the sequential A and B submatrices separately. */
3192:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3193:   (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3194:   (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);

3196:   /*
3197:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3198:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3199:     simply into the sequential matrix identical with the corresponding A (if isize == 1).
3200:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3201:     to have the same sparsity pattern.
3202:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3203:     must be constructed for C. This is done by setseqmat(s).
3204:   */
3205:   for (i = 0, ii = 0; i < ismax; ++i) {
3206:     /*
3207:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3208:        That way we can avoid sorting and computing permutations when reusing.
3209:        To this end:
3210:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3211:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3212:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3213:     */
3214:     MatStructure pattern;
3215:     pattern = DIFFERENT_NONZERO_PATTERN;

3217:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3218:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3219:     if (isize > 1) {
3220:       if (scall == MAT_INITIAL_MATRIX) {
3221:         MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3222:         MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3223:         MatSetType((*submat)[i],MATMPIAIJ);
3224:         PetscLayoutSetUp((*submat)[i]->rmap);
3225:         PetscLayoutSetUp((*submat)[i]->cmap);
3226:       }
3227:       /*
3228:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3229:       */
3230:       {
3231:         Mat AA,BB;
3232:         AA = A[i];
3233:         BB = B[ii];
3234:         if (AA || BB) {
3235:           setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3236:           MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3237:           MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3238:         }

3240:         MatDestroy(&AA);
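        /* Editorial note (added commentary): the MatDestroy() above releases this routine's
           reference to A[i] (AA aliases A[i]); BB aliases B[ii], which is released collectively
           by MatDestroySubMatrices() at the end of this routine. */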
3241:       }
3242:       ISDestroy(ciscol+ii);
3243:       ISDestroy(ciscol_p+ii);
3244:       ++ii;
3245:     } else { /* if (isize == 1) */
3246:       if (scall == MAT_REUSE_MATRIX) {
3247:         MatDestroy(&(*submat)[i]);
3248:       }
3249:       if (isrow_p[i] || iscol_p[i]) {
3250:         MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3251:         setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3252:         /* Otherwise A is extracted straight into (*submat)[i]. */
3253:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3254:         MatDestroy(A+i);
3255:       } else (*submat)[i] = A[i];
3256:     }
3257:     ISDestroy(&isrow_p[i]);
3258:     ISDestroy(&iscol_p[i]);
3259:   }
3260:   PetscFree2(cisrow,ciscol);
3261:   PetscFree2(isrow_p,iscol_p);
3262:   PetscFree(ciscol_p);
3263:   PetscFree(A);
3264:   MatDestroySubMatrices(cismax,&B);
3265:   return(0);
3266: }

3268: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3269: {

3273:   MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3274:   return(0);
3275: }
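
/* Editorial usage sketch (assumed user code, not part of the original source): this routine is
   reached through the public MatCreateSubMatricesMPI() interface, e.g.

     IS  isrow,iscol;                   index sets that may live on subcommunicators of C
     Mat *submat;
     MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
     ...
     MatDestroySubMatrices(1,&submat);

   Each submatrix requested with a parallel IS pair is itself an MPIAIJ matrix on that pair's
   communicator, assembled from the sequential diag and off-diag pieces constructed above. */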