Actual source code: sfpack.c

petsc-3.13.0 2020-03-29

  2:  #include <../src/vec/is/sf/impls/basic/sfpack.h>
  3:  #include <../src/vec/is/sf/impls/basic/sfbasic.h>

  5: #if defined(PETSC_HAVE_CUDA)
  6: #include <cuda_runtime.h>
  7: #endif
  8: /*
  9:  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction";
 10:  * therefore we pack data manually. This file defines packing routines for the standard data types.
 11:  */

 13: #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d

 15: /* Operations working like s += t */
 16: #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
 17: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
 18: #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
 19: #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)
 20: /* Modeled on MPI MAXLOC/MINLOC */
 21: #define OP_XLOC(op,s,t) \
 22:   do {                                       \
 23:     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
 24:     else if (!((s).u op (t).u)) s = t;           \
 25:   } while(0)
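
As an illustration, OP_XLOC reproduces the MPI MAXLOC/MINLOC semantics on a value/index
pair: with op being >, the larger value wins and ties keep the smaller index. A minimal
standalone sketch, independent of PETSc (PetscMin is stood in by a local macro):

#include <stdio.h>

typedef struct { int u; int i; } int_int;              /* value u, location i */
#define PetscMin(a,b) (((a)<(b)) ? (a) : (b))          /* stand-in for the PETSc macro */
#define OP_XLOC(op,s,t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t; \
  } while(0)

int main(void)
{
  int_int s = {5,3}, t = {7,1};
  OP_XLOC(>,s,t);                       /* MAXLOC: t.u > s.u, so s becomes t */
  printf("%d at %d\n", s.u, s.i);       /* prints "7 at 1" */
  s.u = 7; s.i = 4;
  OP_XLOC(>,s,t);                       /* tie on u: keep the smaller index */
  printf("%d at %d\n", s.u, s.i);       /* prints "7 at 1" */
  return 0;
}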

 27: /* DEF_PackFunc - macro defining a Pack routine

 29:    Arguments of the macro:
 30:    +Type      Type of the basic data in an entry, e.g., int, PetscInt, PetscReal. It is not the type of an entry.
 31:    .BS        Block size for vectorization. It is a factor of bsz.
 32:    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

 34:    Arguments of the Pack routine:
 35:    +count     Number of indices in idx[].
 36:    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
 37:    .opt       Per-pack optimization plan. NULL means no such plan.
 38:    .idx       Indices of the entries to be packed.
 39:    .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. E.g., if unit is MPI_2INT, then bs=2 and the basic type is int.
 40:    .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
 41:    -packed    Address of the packed data.
 42:  */
 43: #define DEF_PackFunc(Type,BS,EQ) \
 44:   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
 45:   {                                                                                                          \
 47:     const Type     *u = (const Type*)unpacked,*u2;                                                           \
 48:     Type           *p = (Type*)packed,*p2;                                                                   \
 49:     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
 50:     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
 51:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
 53:     if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);} /* indices are contiguous */      \
 54:     else if (opt) { /* has optimizations available */                                                        \
 55:       p2 = p;                                                                                                \
 56:       for (r=0; r<opt->n; r++) {                                                                             \
 57:         u2 = u + opt->start[r]*MBS;                                                                          \
 58:         X  = opt->X[r];                                                                                      \
 59:         Y  = opt->Y[r];                                                                                      \
 60:         for (k=0; k<opt->dz[r]; k++)                                                                         \
 61:           for (j=0; j<opt->dy[r]; j++) {                                                                     \
 62:             PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);                        \
 63:             p2  += opt->dx[r]*MBS;                                                                           \
 64:           }                                                                                                  \
 65:       }                                                                                                      \
 66:     } else {                                                                                                 \
 67:       for (i=0; i<count; i++)                                                                                \
 68:         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
 69:           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
 70:             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
 71:     }                                                                                                        \
 72:     return(0);                                                                                  \
 73:   }
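
To make the expansion concrete, the following standalone sketch mirrors the indexed
(else) branch above for Type=double, BS=2, EQ=0. The name Pack_double_2_0 and the
plain int indices are illustrative only, not part of this file:

#include <stdio.h>

static void Pack_double_2_0(int count, const int *idx, int bs,
                            const double *u, double *p)
{
  const int BS = 2, M = bs/BS, MBS = M*BS;  /* MBS == bs */
  int i, j, k;
  for (i=0; i<count; i++)
    for (j=0; j<M; j++)      /* eliminated by the compiler when M is const 1 */
      for (k=0; k<BS; k++)   /* unrolled or vectorized by the compiler */
        p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];
}

int main(void)
{
  const int    idx[2] = {2, 0};  /* pack entries 2 and 0 */
  const double u[12]  = {0,1,2,3, 10,11,12,13, 20,21,22,23}; /* 3 entries, bs=4 */
  double       p[8];
  int          i;
  Pack_double_2_0(2, idx, 4, u, p);
  for (i=0; i<8; i++) printf("%g ", p[i]);  /* prints "20 21 22 23 0 1 2 3" */
  printf("\n");
  return 0;
}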

 75: /* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
 76:                 and inserts into a sparse array.

 78:    Arguments:
 79:   .Type       Type of the data
 80:   .BS         Block size for vectorization
 81:   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

 83:   Notes:
 84:    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy (PetscArraycpy) in this macro.
 85:  */
 86: #define DEF_UnpackFunc(Type,BS,EQ)               \
 87:   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
 88:   {                                                                                                          \
 90:     Type           *u = (Type*)unpacked,*u2;                                                                 \
 91:     const Type     *p = (const Type*)packed;                                                                 \
 92:     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
 93:     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
 94:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
 96:     if (!idx) {                                                                                              \
 97:       u += start*MBS;                                                                                        \
 98:       if (u != p) {PetscArraycpy(u,p,count*MBS);}                                       \
 99:     } else if (opt) { /* has optimizations available */                                                      \
100:       for (r=0; r<opt->n; r++) {                                                                             \
101:         u2 = u + opt->start[r]*MBS;                                                                          \
102:         X  = opt->X[r];                                                                                      \
103:         Y  = opt->Y[r];                                                                                      \
104:         for (k=0; k<opt->dz[r]; k++)                                                                         \
105:           for (j=0; j<opt->dy[r]; j++) {                                                                     \
106:             PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);                         \
107:             p   += opt->dx[r]*MBS;                                                                           \
108:           }                                                                                                  \
109:       }                                                                                                      \
110:     } else {                                                                                                 \
111:       for (i=0; i<count; i++)                                                                                \
112:         for (j=0; j<M; j++)                                                                                  \
113:           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
114:     }                                                                                                        \
115:     return(0);                                                                                  \
116:   }

118: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

120:    Arguments:
121:   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
122:   .Type       Type of the data
123:   .BS         Block size for vectorization
124:   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
125:   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
126:   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
127:  */
128: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
129:   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
130:   {                                                                                                          \
131:     Type           *u = (Type*)unpacked,*u2;                                                                 \
132:     const Type     *p = (const Type*)packed;                                                                 \
133:     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
134:     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
135:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
137:     if (!idx) {                                                                                              \
138:       u += start*MBS;                                                                                        \
139:       for (i=0; i<count; i++)                                                                                \
140:         for (j=0; j<M; j++)                                                                                  \
141:           for (k=0; k<BS; k++)                                                                               \
142:             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
143:     } else if (opt) { /* idx[] has patterns */                                                               \
144:       for (r=0; r<opt->n; r++) {                                                                             \
145:         u2 = u + opt->start[r]*MBS;                                                                          \
146:         X  = opt->X[r];                                                                                      \
147:         Y  = opt->Y[r];                                                                                      \
148:         for (k=0; k<opt->dz[r]; k++)                                                                         \
149:           for (j=0; j<opt->dy[r]; j++) {                                                                     \
150:             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
151:             p += opt->dx[r]*MBS;                                                                             \
152:           }                                                                                                  \
153:       }                                                                                                      \
154:     } else {                                                                                                 \
155:       for (i=0; i<count; i++)                                                                                \
156:         for (j=0; j<M; j++)                                                                                  \
157:           for (k=0; k<BS; k++)                                                                               \
158:             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
159:     }                                                                                                        \
160:     return(0);                                                                                  \
161:   }
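
For a feel of what the generated routines reduce to, here is a standalone sketch of
the indexed branch of UnpackAndAdd with Type=double and bs=BS=1 (EQ=1, so M=1 and the
inner loops collapse). The name and plain int indices are illustrative:

#include <stdio.h>

/* u[idx[i]] += p[i], i.e., OP_BINARY(+,u[idx[i]],p[i]) with M = BS = 1 */
static void UnpackAndAdd_double_1_1(int count, const int *idx,
                                    double *u, const double *p)
{
  int i;
  for (i=0; i<count; i++) u[idx[i]] += p[i];
}

int main(void)
{
  double       u[4]   = {1, 1, 1, 1};
  const double p[2]   = {10, 20};
  const int    idx[2] = {3, 1};
  UnpackAndAdd_double_1_1(2, idx, u, p);
  printf("%g %g %g %g\n", u[0], u[1], u[2], u[3]);  /* prints "1 21 1 11" */
  return 0;
}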

163: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
164:   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
165:   {                                                                                                          \
166:     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
167:     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
168:     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
169:     const PetscInt MBS = M*BS;                                                                               \
171:     for (i=0; i<count; i++) {                                                                                \
172:       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
173:       l = i*MBS;                                                                                             \
174:       for (j=0; j<M; j++)                                                                                    \
175:         for (k=0; k<BS; k++) {                                                                               \
176:           tmp = u[r+j*BS+k];                                                                                 \
177:           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
178:           p[l+j*BS+k] = tmp;                                                                                 \
179:         }                                                                                                    \
180:     }                                                                                                        \
181:     return(0);                                                                                  \
182:   }
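
FetchAndOp has read-modify-write semantics akin to an atomic fetch-and-add: the packed
buffer receives the old values of the unpacked data, which is then updated with the op.
A scalar standalone sketch (Type=int, bs=BS=1; the name is illustrative):

#include <stdio.h>

static void FetchAndAdd_int_1_1(int count, const int *idx, int *u, int *p)
{
  int i, tmp;
  for (i=0; i<count; i++) {
    tmp       = u[idx[i]];    /* fetch the old value */
    u[idx[i]] = tmp + p[i];   /* apply the op */
    p[i]      = tmp;          /* return the old value in the buffer */
  }
}

int main(void)
{
  int       u[3]   = {100, 200, 300};
  int       p[2]   = {1, 2};
  const int idx[2] = {2, 0};
  FetchAndAdd_int_1_1(2, idx, u, p);
  printf("u: %d %d %d  p: %d %d\n", u[0], u[1], u[2], p[0], p[1]);
  /* prints "u: 102 200 301  p: 300 100" */
  return 0;
}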

184: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
185:   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
186:   {                                                                                                          \
188:     const Type     *u = (const Type*)src;                                                                    \
189:     Type           *v = (Type*)dst;                                                                          \
190:     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
191:     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
192:     const PetscInt MBS = M*BS;                                                                               \
194:     if (!srcIdx) { /* src is contiguous */                                                                   \
195:       u += srcStart*MBS;                                                                                     \
196:       CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);  \
197:     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
198:       u += srcOpt->start[0]*MBS;                                                                             \
199:       v += dstStart*MBS;                                                                                     \
200:       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
201:       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
202:         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
203:           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
204:           v += srcOpt->dx[0]*MBS;                                                                            \
205:         }                                                                                                    \
206:     } else { /* all other cases */                                                                           \
207:       for (i=0; i<count; i++) {                                                                              \
208:         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
209:         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
210:         for (j=0; j<M; j++)                                                                                  \
211:           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
212:       }                                                                                                      \
213:     }                                                                                                        \
214:     return(0);                                                                                  \
215:   }

217: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
218:   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
219:   {                                                                                                          \
220:     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
221:     const Type     *ldata = (const Type*)leafdata;                                                           \
222:     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
223:     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
224:     const PetscInt MBS = M*BS;                                                                               \
226:     for (i=0; i<count; i++) {                                                                                \
227:       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
228:       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
229:       for (j=0; j<M; j++)                                                                                    \
230:         for (k=0; k<BS; k++) {                                                                               \
231:           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
232:           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
233:         }                                                                                                    \
234:     }                                                                                                        \
235:     return(0);                                                                                  \
236:   }

238: /* Pack, Unpack/Fetch ops */
239: #define DEF_Pack(Type,BS,EQ)                                                      \
240:   DEF_PackFunc(Type,BS,EQ)                                                        \
241:   DEF_UnpackFunc(Type,BS,EQ)                                                      \
242:   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
243:   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
244:     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
245:     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
246:     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
247:   }

249: /* Add, Mult ops */
250: #define DEF_Add(Type,BS,EQ)                                                       \
251:   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
252:   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
253:   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
254:   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
255:   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
256:   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
257:   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
258:     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
259:     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
260:     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
261:     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
262:     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
263:     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
264:   }

266: /* Max, Min ops */
267: #define DEF_Cmp(Type,BS,EQ)                                                       \
268:   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
269:   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
270:   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
271:   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
272:   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
273:     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
274:     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
275:     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
276:     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
277:   }

 279: /* Logical ops.
 280:   The op argument of OP_LXOR is unused and should be empty, but we pass || to avoid
 281:   the compilation warning "empty macro arguments are undefined in ISO C90".
 282:  */
283: #define DEF_Log(Type,BS,EQ)                                                       \
284:   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
285:   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
286:   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
287:   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
288:   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
289:   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
290:   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
291:     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
292:     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
293:     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
294:     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
295:     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
296:     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
297:   }
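
A quick standalone check of OP_LXOR: both operands are normalized with !, so any
nonzero value counts as true, and the op argument (here ||) is never used:

#include <stdio.h>

#define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0)

int main(void)
{
  int cases[4][2] = {{0,0},{0,7},{5,0},{5,7}};
  int i, s;
  for (i=0; i<4; i++) {
    s = cases[i][0];
    OP_LXOR(||,s,cases[i][1]);
    printf("%d LXOR %d = %d\n", cases[i][0], cases[i][1], s);  /* 0, 1, 1, 0 */
  }
  return 0;
}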

299: /* Bitwise ops */
300: #define DEF_Bit(Type,BS,EQ)                                                       \
301:   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
302:   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
303:   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
304:   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
305:   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
306:   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
307:   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
308:     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
309:     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
310:     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
311:     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
312:     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
313:     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
314:   }

316: /* Maxloc, Minloc ops */
317: #define DEF_Xloc(Type,BS,EQ)                                                      \
318:   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
319:   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
320:   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
321:   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
322:   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
323:     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
324:     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
325:     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
326:     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
327:   }

329: #define DEF_IntegerType(Type,BS,EQ)                                               \
330:   DEF_Pack(Type,BS,EQ)                                                            \
331:   DEF_Add(Type,BS,EQ)                                                             \
332:   DEF_Cmp(Type,BS,EQ)                                                             \
333:   DEF_Log(Type,BS,EQ)                                                             \
334:   DEF_Bit(Type,BS,EQ)                                                             \
335:   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
336:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
337:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
338:     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
339:     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
340:     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
341:   }

343: #define DEF_RealType(Type,BS,EQ)                                                  \
344:   DEF_Pack(Type,BS,EQ)                                                            \
345:   DEF_Add(Type,BS,EQ)                                                             \
346:   DEF_Cmp(Type,BS,EQ)                                                             \
347:   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
348:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
349:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
350:     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
351:   }

353: #if defined(PETSC_HAVE_COMPLEX)
354: #define DEF_ComplexType(Type,BS,EQ)                                               \
355:   DEF_Pack(Type,BS,EQ)                                                            \
356:   DEF_Add(Type,BS,EQ)                                                             \
357:   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
358:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
359:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
360:   }
361: #endif

363: #define DEF_DumbType(Type,BS,EQ)                                                  \
364:   DEF_Pack(Type,BS,EQ)                                                            \
365:   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
366:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
367:   }

369: /* Maxloc, Minloc */
370: #define DEF_PairType(Type,BS,EQ)                                                  \
371:   DEF_Pack(Type,BS,EQ)                                                            \
372:   DEF_Xloc(Type,BS,EQ)                                                            \
373:   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
374:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
375:     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
376:   }

378: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
379: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
380: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
381: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
382: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
383: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
384: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
385: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

387: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
388: DEF_IntegerType(int,1,1)
389: DEF_IntegerType(int,2,1)
390: DEF_IntegerType(int,4,1)
391: DEF_IntegerType(int,8,1)
392: DEF_IntegerType(int,1,0)
393: DEF_IntegerType(int,2,0)
394: DEF_IntegerType(int,4,0)
395: DEF_IntegerType(int,8,0)
396: #endif

398: /* The typedefs give single-token type names that CPPJoin4 can paste into identifiers; e.g., DEF_IntegerType(SignedChar,2,1) generates Pack_SignedChar_2_1 etc. */
399: typedef signed char SignedChar;
400: DEF_IntegerType(SignedChar,1,1)
401: DEF_IntegerType(SignedChar,2,1)
402: DEF_IntegerType(SignedChar,4,1)
403: DEF_IntegerType(SignedChar,8,1)
404: DEF_IntegerType(SignedChar,1,0)
405: DEF_IntegerType(SignedChar,2,0)
406: DEF_IntegerType(SignedChar,4,0)
407: DEF_IntegerType(SignedChar,8,0)

409: typedef unsigned char UnsignedChar;
410: DEF_IntegerType(UnsignedChar,1,1)
411: DEF_IntegerType(UnsignedChar,2,1)
412: DEF_IntegerType(UnsignedChar,4,1)
413: DEF_IntegerType(UnsignedChar,8,1)
414: DEF_IntegerType(UnsignedChar,1,0)
415: DEF_IntegerType(UnsignedChar,2,0)
416: DEF_IntegerType(UnsignedChar,4,0)
417: DEF_IntegerType(UnsignedChar,8,0)

419: DEF_RealType(PetscReal,1,1)
420: DEF_RealType(PetscReal,2,1)
421: DEF_RealType(PetscReal,4,1)
422: DEF_RealType(PetscReal,8,1)
423: DEF_RealType(PetscReal,1,0)
424: DEF_RealType(PetscReal,2,0)
425: DEF_RealType(PetscReal,4,0)
426: DEF_RealType(PetscReal,8,0)

428: #if defined(PETSC_HAVE_COMPLEX)
429: DEF_ComplexType(PetscComplex,1,1)
430: DEF_ComplexType(PetscComplex,2,1)
431: DEF_ComplexType(PetscComplex,4,1)
432: DEF_ComplexType(PetscComplex,8,1)
433: DEF_ComplexType(PetscComplex,1,0)
434: DEF_ComplexType(PetscComplex,2,0)
435: DEF_ComplexType(PetscComplex,4,0)
436: DEF_ComplexType(PetscComplex,8,0)
437: #endif

439: #define PairType(Type1,Type2) Type1##_##Type2
440: typedef struct {int u; int i;}           PairType(int,int);
441: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
442: DEF_PairType(PairType(int,int),1,1)
443: DEF_PairType(PairType(PetscInt,PetscInt),1,1)

445: /* If we don't know the basic type, we treat it as a stream of chars or ints */
446: DEF_DumbType(char,1,1)
447: DEF_DumbType(char,2,1)
448: DEF_DumbType(char,4,1)
449: DEF_DumbType(char,1,0)
450: DEF_DumbType(char,2,0)
451: DEF_DumbType(char,4,0)

453: typedef int DumbInt; /* A name distinct from the 'int' used above; the typename is pasted into the generated routine names. */
454: DEF_DumbType(DumbInt,1,1)
455: DEF_DumbType(DumbInt,2,1)
456: DEF_DumbType(DumbInt,4,1)
457: DEF_DumbType(DumbInt,8,1)
458: DEF_DumbType(DumbInt,1,0)
459: DEF_DumbType(DumbInt,2,0)
460: DEF_DumbType(DumbInt,4,0)
461: DEF_DumbType(DumbInt,8,0)

463: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
464: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
465: {
466:   int ierr;
467:   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
468:   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
469:   return MPI_SUCCESS;
470: }
471: #endif

473: /*
474:    This routine creates a communication link for the given operation. It first looks in its link cache; if
475:    there is a free & suitable link, it reuses it. Otherwise it creates a new one.

477:    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
478:    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
479:    can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
480:    those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

482:    The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.

484:    In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

486:    The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
487:    need to pack/unpack data.
488: */
489: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
490: {
491:   PetscErrorCode    ierr;
492:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
493:   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
494:   PetscSFLink       *p,link;
495:   PetscSFDirection  direction;
496:   MPI_Request       *reqs = NULL;
497:   PetscBool         match,rootdirect[2],leafdirect[2];
498:   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
499:   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

502:   PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);

504:   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
505:   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
506:     if (sfop == PETSCSF_BCAST) {
507:       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
508:       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
509:     } else if (sfop == PETSCSF_REDUCE) {
510:       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
511:       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
512:     } else { /* PETSCSF_FETCH */
513:       rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
514:       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
515:     }
516:   }

518:   if (sf->use_gpu_aware_mpi) {
519:     rootmtype_mpi = rootmtype;
520:     leafmtype_mpi = leafmtype;
521:   } else {
522:     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
523:   }
524:   /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
525:   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
526:   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

528:   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
529:   nrootreqs = bas->nrootreqs;
530:   nleafreqs = sf->nleafreqs;

532:   /* Look for free links in cache */
533:   for (p=&bas->avail; (link=*p); p=&link->next) {
534:     MPIPetsc_Type_compare(unit,link->unit,&match);
535:     if (match) {
536:       /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches the current data.
537:          If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
538:       */
539:       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
540:         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
541:         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
542:         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
543:       }
544:       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
545:         reqs = link->leafreqs[direction][leafmtype][1];
546:         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
547:         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
548:       }
549:       *p = link->next; /* Remove from available list */
550:       goto found;
551:     }
552:   }

554:   PetscNew(&link);
555:   PetscSFLinkSetUp_Host(sf,link,unit);
556:   PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag); /* One tag per link */

558:   nreqs = (nrootreqs+nleafreqs)*8;
559:   PetscMalloc1(nreqs,&link->reqs);
560:   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

562:   for (i=0; i<2; i++) { /* Two communication directions */
563:     for (j=0; j<2; j++) { /* Two memory types */
564:       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
565:         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
566:         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
567:       }
568:     }
569:   }

571: found:
572:   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {PetscSFLinkSetUp_Device(sf,link,unit);}

574:   /* Allocate buffers along root/leafdata */
575:   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
576:     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
577:     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
578:     if (bas->rootbuflen[i]) {
579:       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
580:         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
581:       } else { /* Have to have a separate rootbuf */
582:         if (!link->rootbuf_alloc[i][rootmtype]) {
583:           PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);
584:         }
585:         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
586:       }
587:     }

589:     if (sf->leafbuflen[i]) {
590:       if (leafdirect[i]) {
591:         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
592:       } else {
593:         if (!link->leafbuf_alloc[i][leafmtype]) {
594:           PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);
595:         }
596:         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
597:       }
598:     }
599:   }

601:   /* Allocate host buffers for staging device data when not using gpu-aware MPI */
602:   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
603:     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
604:       PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
605:     }
606:     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
607:   }
608:   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
609:     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
610:       PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
611:     }
612:     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
613:   }

615:   /* Set the `current` state of the link. It may change between different SF invocations with the same link */
616:   if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
617:     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
618:     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
619:   }

621:   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
622:   link->leafdata = leafdata;
623:   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
624:     link->rootdirect[i] = rootdirect[i];
625:     link->leafdirect[i] = leafdirect[i];
626:   }
627:   link->rootdirect_mpi  = rootdirect_mpi;
628:   link->leafdirect_mpi  = leafdirect_mpi;
629:   link->rootmtype       = rootmtype;
630:   link->leafmtype       = leafmtype;
631:   link->rootmtype_mpi   = rootmtype_mpi;
632:   link->leafmtype_mpi   = leafmtype_mpi;

634:   link->next            = bas->inuse;
635:   bas->inuse            = link;
636:   *mylink               = link;
637:   return(0);
638: }
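
To show where this routine sits, here is a sketch of the begin-phase pattern a caller
such as SFBasic might follow. It is not the actual SFBasic code, and count/start/opt/idx
stand for the remote-root indices (placeholders passed in only to keep the sketch whole):

static PetscErrorCode BcastBegin_Sketch(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx)
{
  PetscSFLink link;

  /* Create or reuse a link; buffers are allocated lazily inside */
  PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,PETSCSF_BCAST,&link);
  /* Pack remote roots with the type-specialized routine installed by PackInit_*,
     unless rootdata was aliased as the buffer (rootdirect) */
  if (!link->rootdirect[PETSCSF_REMOTE]) {
    (*link->h_Pack)(link,count,start,opt,idx,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);
  }
  /* Communication itself is started via PetscSFLinkGetMPIBuffersAndRequests(); see below */
  return(0);
}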

640: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
641:    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
642: */
643: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
644: {
645:   PetscErrorCode       ierr;
646:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
647:   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
648:   const PetscInt       *rootoffset,*leafoffset;
649:   PetscMPIInt          n;
650:   MPI_Aint             disp;
651:   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
652:   MPI_Datatype         unit = link->unit;
653:   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
654:   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

657:   /* Init persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI */
658:   if (sf->persistent) {
659:     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
660:       PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
661:       if (direction == PETSCSF_LEAF2ROOT) {
662:         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
663:           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
664:           PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
665:           MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
666:         }
667:       } else { /* PETSCSF_ROOT2LEAF */
668:         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
669:           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
670:           PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
671:           MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
672:         }
673:       }
674:       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
675:     }

677:     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
678:       PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
679:       if (direction == PETSCSF_LEAF2ROOT) {
680:         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
681:           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
682:           PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
683:           MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
684:         }
685:       } else { /* PETSCSF_ROOT2LEAF */
686:         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
687:           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
688:           PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
689:           MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
690:         }
691:       }
692:       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
693:     }
694:   }
695:   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
696:   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
697:   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
698:   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
699:   return(0);
700: }
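
Typical use of the returned requests, sketched as a fragment (not actual PETSc code;
sf and link come from the surrounding context, and nrootreqs/nleafreqs stand for the
counts bas->nrootreqs and sf->nleafreqs): with persistent requests, a caller starts
them with MPI_Startall and waits on them in the matching end-function.

void        *rootbuf,*leafbuf;
MPI_Request *rootreqs,*leafreqs;

PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
MPI_Startall(nrootreqs,rootreqs);  /* start the persistent sends from rootbuf */
MPI_Startall(nleafreqs,leafreqs);  /* start the persistent recvs into leafbuf */
/* ... later, in the end-function ... */
MPI_Waitall(nrootreqs,rootreqs,MPI_STATUSES_IGNORE);
MPI_Waitall(nleafreqs,leafreqs,MPI_STATUSES_IGNORE);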


703: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
704: {
705:   PetscErrorCode    ierr;
706:   PetscSFLink       link,*p;
707:   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;

710:   /* Look up the link in the in-use list by matching (unit, rootdata, leafdata) */
711:   for (p=&bas->inuse; (link=*p); p=&link->next) {
712:     PetscBool match;
713:     MPIPetsc_Type_compare(unit,link->unit,&match);
714:     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
715:       switch (cmode) {
716:       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
717:       case PETSC_USE_POINTER: break;
718:       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
719:       }
720:       *mylink = link;
721:       return(0);
722:     }
723:   }
724:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
725:   return(0);
726: }

728: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
729: {
730:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;

733:   (*link)->rootdata = NULL;
734:   (*link)->leafdata = NULL;
735:   (*link)->next     = bas->avail;
736:   bas->avail        = *link;
737:   *link             = NULL;
738:   return(0);
739: }
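
And the matching end-phase pattern, sketched as a fragment (not actual PETSc code;
count/start/opt/idx again stand for the remote-leaf indices): look up the in-use link
by (unit,rootdata,leafdata), wait and unpack, then return the link to the avail cache.

PetscSFLink link;

PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link); /* also removes it from inuse */
/* ... MPI_Waitall on the link's requests, then unpack, e.g. ... */
(*link->h_UnpackAndInsert)(link,count,start,opt,idx,leafdata,link->leafbuf[PETSCSF_REMOTE][link->leafmtype]);
PetscSFLinkReclaim(sf,&link); /* resets rootdata/leafdata, chains the link on bas->avail, sets link = NULL */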

741: /* Destroy all links chained in 'avail' */
742: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
743: {
744:   PetscErrorCode    ierr;
745:   PetscSFLink       link = *avail,next;
746:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
747:   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

750:   for (; link; link=next) {
751:     next = link->next;
752:     if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
753:     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
754:       if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
755:     }
756:     PetscFree(link->reqs);
757:     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
758: #if defined(PETSC_HAVE_CUDA)
759:       PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
760:       PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
761:       if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
762: #endif
763:       PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
764:       PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
765:     }
766:     PetscFree(link);
767:   }
768:   *avail = NULL;
769:   return(0);
770: }

772: #if defined(PETSC_USE_DEBUG)
773: /* Error out on unsupported overlapped communications */
774: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
775: {
776:   PetscErrorCode    ierr;
777:   PetscSFLink       link,*p;
778:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
779:   PetscBool         match;

782:   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
783:      the potential overlap, since this process does not participate in the communication and the overlap is harmless.
784:   */
785:   if (rootdata || leafdata) {
786:     for (p=&bas->inuse; (link=*p); p=&link->next) {
787:       MPIPetsc_Type_compare(unit,link->unit,&match);
788:       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
789:     }
790:   }
791:   return(0);
792: }
793: #endif

795: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
796: {
798:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
799:   PetscBool      is2Int,is2PetscInt;
800:   PetscMPIInt    ni,na,nd,combiner;
801: #if defined(PETSC_HAVE_COMPLEX)
802:   PetscInt       nPetscComplex=0;
803: #endif

806:   MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
807:   MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
808:   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
809:   MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
810:   MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
811:   MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
812: #if defined(PETSC_HAVE_COMPLEX)
813:   MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
814: #endif
815:   MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
816:   MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
817:   /* TODO: shall we also handle Fortran MPI_2REAL? */
818:   MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
819:   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
820:   link->bs = 1; /* default */

822:   if (is2Int) {
823:     PackInit_PairType_int_int_1_1(link);
824:     link->bs        = 1;
825:     link->unitbytes = 2*sizeof(int);
826:     link->isbuiltin = PETSC_TRUE; /* unit is MPI builtin */
827:     link->basicunit = MPI_2INT;
828:     link->unit      = MPI_2INT;
829:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
830:     PackInit_PairType_PetscInt_PetscInt_1_1(link);
831:     link->bs        = 1;
832:     link->unitbytes = 2*sizeof(PetscInt);
833:     link->basicunit = MPIU_2INT;
834:     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
835:     link->unit      = MPIU_2INT;
836:   } else if (nPetscReal) {
837:     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
838:     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
839:     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
840:     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
841:     link->bs        = nPetscReal;
842:     link->unitbytes = nPetscReal*sizeof(PetscReal);
843:     link->basicunit = MPIU_REAL;
844:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
845:   } else if (nPetscInt) {
846:     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
847:     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
848:     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
849:     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
850:     link->bs        = nPetscInt;
851:     link->unitbytes = nPetscInt*sizeof(PetscInt);
852:     link->basicunit = MPIU_INT;
853:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
854: #if defined(PETSC_USE_64BIT_INDICES)
855:   } else if (nInt) {
856:     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
857:     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
858:     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
859:     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
860:     link->bs        = nInt;
861:     link->unitbytes = nInt*sizeof(int);
862:     link->basicunit = MPI_INT;
863:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
864: #endif
865:   } else if (nSignedChar) {
866:     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
867:     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
868:     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
869:     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
870:     link->bs        = nSignedChar;
871:     link->unitbytes = nSignedChar*sizeof(SignedChar);
872:     link->basicunit = MPI_SIGNED_CHAR;
873:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
874:   } else if (nUnsignedChar) {
875:     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
876:     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
877:     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
878:     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
879:     link->bs        = nUnsignedChar;
880:     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
881:     link->basicunit = MPI_UNSIGNED_CHAR;
882:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
883: #if defined(PETSC_HAVE_COMPLEX)
884:   } else if (nPetscComplex) {
885:     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
886:     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
887:     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
888:     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
889:     link->bs        = nPetscComplex;
890:     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
891:     link->basicunit = MPIU_COMPLEX;
892:     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
893: #endif
894:   } else {
895:     MPI_Aint lb,nbyte;
896:     MPI_Type_get_extent(unit,&lb,&nbyte);
897:     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
898:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
899:       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
900:       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
901:       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
902:       link->bs        = nbyte;
903:       link->unitbytes = nbyte;
904:       link->basicunit = MPI_BYTE;
905:     } else {
906:       nInt = nbyte / sizeof(int);
907:       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
908:       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
909:       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
910:       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
911:       link->bs        = nInt;
912:       link->unitbytes = nbyte;
913:       link->basicunit = MPI_INT;
914:     }
915:     if (link->isbuiltin) link->unit = unit;
916:   }

918:   if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
919:   return(0);
920: }
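/* A worked example of the dispatch above (hypothetical caller, not part of this file): */
#if 0
{
  MPI_Datatype unit;
  MPI_Type_contiguous(3,MPIU_REAL,&unit); /* unit = 3 PetscReals, so nPetscReal = 3 */
  MPI_Type_commit(&unit);
  /* 3 is not 8, 4 or 2 and not a multiple of them, so the chain falls through to
     PackInit_RealType_PetscReal_1_0 (BS=1, EQ=0): link->bs = 3, link->unitbytes = 3*sizeof(PetscReal).
     Since bs != 1, the unit is not builtin and MPI_Type_dup() above keeps a private copy.
     A unit of 4 PetscReals would instead select PackInit_RealType_PetscReal_4_1, whose
     compile-time BS=4, EQ=1 lets the compiler unroll the per-entry packing loop. */
}
#endif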

922: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
923: {
925:   *UnpackAndOp = NULL;
926:   if (mtype == PETSC_MEMTYPE_HOST) {
927:     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
928:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
929:     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
930:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
931:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
932:     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
933:     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
934:     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
935:     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
936:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
937:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
938:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
939:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
940:   }
941: #if defined(PETSC_HAVE_CUDA)
942:   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
943:     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
944:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
945:     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
946:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
947:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
948:     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
949:     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
950:     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
951:     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
952:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
953:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
954:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
955:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
956:   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
957:     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
958:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
959:     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
960:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
961:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
962:     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
963:     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
964:     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
965:     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
966:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
967:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
968:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
969:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
970:   }
971: #endif
972:   return(0);
973: }
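/* A minimal usage sketch of the lookup above (hypothetical caller; link, count, start, opt, indices,
   rootdata and scope are assumed to come from a set-up link). A NULL result means no kernel matches
   the op (e.g., a user-defined MPI_Op), and callers fall back to MPI_Reduce_local, see
   PetscSFLinkUnpackDataWithMPIReduceLocal() below. */
#if 0
{
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscSFLinkGetUnpackAndOp(link,PETSC_MEMTYPE_HOST,MPI_SUM,PETSC_FALSE,&UnpackAndOp);
  if (UnpackAndOp) {(*UnpackAndOp)(link,count,start,opt,indices,rootdata,link->rootbuf[scope][PETSC_MEMTYPE_HOST]);}
}
#endif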

975: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
976: {
978:   *ScatterAndOp = NULL;
979:   if (mtype == PETSC_MEMTYPE_HOST) {
980:     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
981:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
982:     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
983:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
984:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
985:     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
986:     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
987:     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
988:     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
989:     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
990:     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
991:     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
992:     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
993:   }
994: #if defined(PETSC_HAVE_CUDA)
995:   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
996:     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
997:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
998:     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
999:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1000:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1001:     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1002:     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1003:     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1004:     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1005:     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1006:     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1007:     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1008:     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1009:   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1010:     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1011:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1012:     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1013:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1014:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1015:     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1016:     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1017:     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1018:     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1019:     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1020:     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1021:     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1022:     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1023:   }
1024: #endif
1025:   return(0);
1026: }

1028: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1029: {
1031:   *FetchAndOp = NULL;
1032:   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this MPI_Op in FetchAndOp");
1033:   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1034: #if defined(PETSC_HAVE_CUDA)
1035:   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1036:   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1037: #endif
1038:   return(0);
1039: }

1041: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1042: {
1044:   *FetchAndOpLocal = NULL;
1045:   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this MPI_Op in FetchAndOp");
1046:   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1047: #if defined(PETSC_HAVE_CUDA)
1048:   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1049:   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1050: #endif
1051:   return(0);
1052: }

1054: /*=============================================================================
1055:               A set of helper routines for Pack/Unpack/Scatter on GPUs
1056:  ============================================================================*/
1057: #if defined(PETSC_HAVE_CUDA)
1058: /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1059:    make sure the data is ready for packing.
1060:  */
1061: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1062: {
1064:   if (sf->use_default_stream) return(0);
1065:   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1066:     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1067:   }
1068:   return(0);
1069: }

1071: /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure the root/leaf buffer for the remote is ready for MPI */
1072: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1073: {
1074:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

1077:   /* Do nothing if we use stream-aware MPI or have nothing for the remote */
1078:   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) return(0);
1079:   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
1080:   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1081:     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1082:   }
1083:   return(0);
1084: }
1085: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1086: {
1088:   /* See comments above */
1089:   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) return(0);
1090:   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1091:     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1092:   }
1093:   return(0);
1094: }
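/* The ordering the two routines above enforce, spelled out (an illustrative sketch; LaunchPackKernel
   and req are hypothetical names): */
#if 0
LaunchPackKernel(link->stream,rootdata,link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]); /* async pack on link->stream */
cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); /* what PetscSFLinkSyncStreamAfterPackRootData does */
MPI_Start(&req); /* only now may a non stream-aware MPI safely read the root buffer */
#endif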

1096: /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready for SF callers to use,
1097:    since SF does not know which stream the callers will use.
1098: */
1099: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1100: {
1101:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

1104:   /* Do nothing if we are expected to put rootdata on default stream */
1105:   if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) return(0);
1106:   /* If we have something from local, then we called a scatter kernel (on link->stream) and must sync it;
1107:      if we have something from remote and we called an unpack kernel, we must also sync it.
1108:    */
1109:   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1110:     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1111:   }
1112:   return(0);
1113: }
1114: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1115: {
1117:   /* See comments above */
1118:   if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) return(0);
1119:   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1120:     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1121:   }
1122:   return(0);
1123: }

1125: /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if we do not use gpu-aware MPI, we need
1126:    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after them.
1127: */
1128: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1129: {
1131:   cudaError_t    cerr;
1132:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

1135:   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1136:     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1137:     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1138:     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1139:     if (device2host) {
1140:       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1141:       PetscLogGpuToCpu(count);
1142:     } else {
1143:       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1144:       PetscLogCpuToGpu(count);
1145:     }
1146:   }
1147:   return(0);
1148: }

1150: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1151: {
1153:   cudaError_t    cerr;

1156:   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1157:     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1158:     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1159:     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1160:     if (device2host) {
1161:       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1162:       PetscLogGpuToCpu(count);
1163:     } else {
1164:       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1165:       PetscLogCpuToGpu(count);
1166:     }
1167:   }
1168:   return(0);
1169: }
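/* The staging these two routines produce when MPI is not gpu-aware, spelled out for a send/receive
   pair (an illustrative sketch; req is hypothetical): */
#if 0
PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);  /* D2H: stage device rootbuf in host rootbuf */
MPI_Start(&req);                                                       /* MPI sends from the host buffer */
/* ... on the receiving side, after MPI_Wait() fills the host leafbuf ... */
PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE); /* H2D: copy host leafbuf back to the device */
#endif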
1170: #else

1172: #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1173: #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1174: #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1175: #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1176: #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1177: #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1178: #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0

1180: #endif

1182: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1183: {
1185:   PetscLogDouble flops;
1186:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;


1190:   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1191:     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1192: #if defined(PETSC_HAVE_CUDA)
1193:     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1194: #endif
1195:     {PetscLogFlops(flops);}
1196:   }
1197:   return(0);
1198: }

1200: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1201: {
1202:   PetscLogDouble flops;

1206:   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1207:     flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
1208: #if defined(PETSC_HAVE_CUDA)
1209:     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1210: #endif
1211:     {PetscLogFlops(flops);}
1212:   }
1213:   return(0);
1214: }
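/* A worked example of the accounting above (hypothetical sizes): for a unit of 3 PetscScalars
   (link->bs = 3), op = MPI_SUM and sf->leafbuflen[scope] = 100, unpacking performs 100*3 = 300
   additions, logged as GPU or CPU flops depending on where leafdata lives. MPIU_REPLACE logs
   nothing since inserting does no arithmetic. */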

1216: /* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.
1217:   Input Arguments:
1218:   +sf      - The StarForest
1219:   .link    - The link
1220:   .count   - Number of entries to unpack
1221:   .start   - The first index, significant only when indices is NULL
1222:   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1223:   .buf     - A contiguous buffer to unpack from
1224:   -op      - Operation after unpack

1226:   Output Arguments:
1227:   .data    - The data to unpack to
1228: */
1229: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1230: {
1232: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1233:   {
1235:     PetscInt       i;
1236:     PetscMPIInt    n;
1237:     if (indices) {
1238:       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
1239:          element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
1240:       */
1241:       for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
1242:     } else {
1243:       PetscMPIIntCast(count,&n);
1244:       MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
1245:     }
1246:   }
1247: #else
1248:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1249: #endif
1250:   return(0);
1251: }
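/* An example trigger for the fallback above (hypothetical user code): with a user-defined MPI_Op
   there is no matching UnpackAndOp kernel, so unpacking reduces whole units via MPI_Reduce_local: */
#if 0
MPI_Op myop;
MPI_Op_create(MyUserFunc,1/*commutative*/,&myop);    /* MyUserFunc is a hypothetical user reduction */
PetscSFReduceBegin(sf,unit,leafdata,rootdata,myop);  /* the UnpackAndOp lookup fails ... */
PetscSFReduceEnd(sf,unit,leafdata,rootdata,myop);    /* ... so unpacking goes through this fallback */
#endif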

1253: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1254: {
1256: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1257:   {
1259:     PetscInt       i,disp;
1260:     if (!srcIdx) {
1261:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
1262:     } else {
1263:       for (i=0; i<count; i++) {
1264:         disp = dstIdx? dstIdx[i] : dstStart + i;
1265:         MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1266:       }
1267:     }
1268:   }
1269: #else
1270:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1271: #endif
1272:   return(0);
1273: }

1275: /*=============================================================================
1276:               Pack/Unpack/Fetch/Scatter routines
1277:  ============================================================================*/

1279: /* Pack rootdata to rootbuf
1280:   Input Arguments:
1281:   + sf       - The SF this packing works on.
1282:   . link     - It gives the memtype of the roots and also provides root buffer.
1283:   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1284:   - rootdata - Where to read the roots.

1286:   Notes:
1287:   When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, the root
1288:   data is in a place the underlying MPI can access (whether gpu-aware or not).
1289:  */
1290: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1291: {
1292:   PetscErrorCode   ierr;
1293:   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1294:   const PetscInt   *rootindices = NULL;
1295:   PetscInt         count,start;
1296:   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1297:   PetscMemType     rootmtype = link->rootmtype;
1298:   PetscSFPackOpt   opt = NULL;


1302:   PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1303:   if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1304:   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1305:     PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1306:     PetscSFLinkGetPack(link,rootmtype,&Pack);
1307:     (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1308:   }
1309:   if (scope == PETSCSF_REMOTE) {
1310:     PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1311:     PetscSFLinkSyncStreamAfterPackRootData(sf,link);
1312:   }
1313:   PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1314:   return(0);
1315: }
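/* Where the routine above sits in a communication (an illustrative sketch of a Bcast-like flow;
   the MPI calls are simplified): */
#if 0
PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);      /* rootdata -> rootbuf, ready for MPI */
/* MPI_Startall()/MPI_Waitall() move rootbuf to the remote leafbuf here */
PetscSFLinkUnpackLeafData(sf,link,PETSCSF_REMOTE,leafdata,op); /* leafbuf -> leafdata, defined below */
#endif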

1317: /* Pack leafdata to leafbuf */
1318: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1319: {
1320:   PetscErrorCode   ierr;
1321:   const PetscInt   *leafindices = NULL;
1322:   PetscInt         count,start;
1323:   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1324:   PetscMemType     leafmtype = link->leafmtype;
1325:   PetscSFPackOpt   opt = NULL;

1328:   PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1329:   if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1330:   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1331:     PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1332:     PetscSFLinkGetPack(link,leafmtype,&Pack);
1333:     (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1334:   }
1335:   if (scope == PETSCSF_REMOTE) {
1336:     PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1337:     PetscSFLinkSyncStreamAfterPackLeafData(sf,link);
1338:   }
1339:   PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1340:   return(0);
1341: }

1343: /* Unpack rootbuf to rootdata */
1344: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1345: {
1346:   PetscErrorCode   ierr;
1347:   const PetscInt   *rootindices = NULL;
1348:   PetscInt         count,start;
1349:   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1350:   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1351:   PetscMemType     rootmtype = link->rootmtype;
1352:   PetscSFPackOpt   opt = NULL;

1355:   PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1356:   if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1357:   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1358:     PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1359:     if (UnpackAndOp) {
1360:       PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1361:       (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1362:     } else {
1363:       PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1364:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1365:     }
1366:   }
1367:   if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);}
1368:   PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1369:   PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1370:   return(0);
1371: }

1373: /* Unpack leafbuf to leafdata */
1374: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1375: {
1376:   PetscErrorCode   ierr;
1377:   const PetscInt   *leafindices = NULL;
1378:   PetscInt         count,start;
1379:   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1380:   PetscMemType     leafmtype = link->leafmtype;
1381:   PetscSFPackOpt   opt = NULL;

1384:   PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1385:   if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1386:   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1387:     PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1388:     if (UnpackAndOp) {
1389:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1390:       (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1391:     } else {
1392:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1393:       PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1394:     }
1395:   }
1396:   if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);}
1397:   PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1398:   PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1399:   return(0);
1400: }

1402: /* FetchAndOp rootdata with rootbuf */
1403: PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1404: {
1405:   PetscErrorCode     ierr;
1406:   const PetscInt     *rootindices = NULL;
1407:   PetscInt           count,start;
1408:   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1409:   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1410:   PetscMemType       rootmtype = link->rootmtype;
1411:   PetscSFPackOpt     opt = NULL;

1414:   PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1415:   if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1416:   if (bas->rootbuflen[scope]) {
1417:     /* Do FetchAndOp on rootdata with rootbuf */
1418:     PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);
1419:     PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1420:     (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1421:   }
1422:   if (scope == PETSCSF_REMOTE) {
1423:     PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);
1424:     PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);
1425:   }
1426:   PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1427:   PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1428:   return(0);
1429: }

1431: /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1432: PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1433: {
1434:   PetscErrorCode       ierr;
1435:   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1436:   PetscInt             count,rootstart,leafstart;
1437:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1438:   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1439:   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1440:   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

1443:   if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1444:   if (rootmtype != leafmtype) { /* Uncommon case */
1445:      /* The local communication has to go through pack and unpack */
1446:     PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1447:     PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);
1448:     PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1449:   } else {
1450:     PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);
1451:     if (ScatterAndOp) {
1452:       PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1453:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1454:       (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1455:     } else {
1456:       PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1457:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1458:       PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1459:     }
1460:   }
1461:   return(0);
1462: }
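/* The uncommon cross-memtype path above, spelled out: with rootdata on the device and leafdata on
   the host (or vice versa), no single scatter kernel can touch both, so the local data takes
   pack (root side) -> PetscSFLinkMemcpy (a D2H or H2D copy) -> unpack-with-op (leaf side). */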

1464: /* Reduce leafdata to rootdata locally */
1465: PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1466: {
1467:   PetscErrorCode       ierr;
1468:   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1469:   PetscInt             count,rootstart,leafstart;
1470:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1471:   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1472:   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1473:   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

1476:   if (!sf->leafbuflen[PETSCSF_LOCAL]) return(0);
1477:   if (rootmtype != leafmtype) {
1478:     /* The local communication has to go through pack and unpack */
1479:     PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1480:     PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);
1481:     PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1482:   } else {
1483:     PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);
1484:     if (ScatterAndOp) {
1485:       PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1486:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1487:       (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1488:     } else {
1489:       PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1490:       PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1491:       PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1492:     }
1493:   }
1494:   return(0);
1495: }

1497: /* Fetch rootdata to leafdata and leafupdate locally */
1498: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1499: {
1500:   PetscErrorCode       ierr;
1501:   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1502:   PetscInt             count,rootstart,leafstart;
1503:   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1504:   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1505:   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1506:   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

1509:   if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1510:   if (rootmtype != leafmtype) {
1511:    /* The local communication has to go through pack and unpack */
1512:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1513:   } else {
1514:     PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1515:     PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1516:     PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1517:     (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1518:   }
1519:   return(0);
1520: }

1522: /*
1523:   Create per-rank pack/unpack optimizations based on index patterns

1525:    Input Parameters:
1526:   +  n       - Number of destination ranks
1527:   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1528:   -  idx     - [*]   Array storing indices

1530:    Output Parameters:
1531:   +  out     - Pack optimizations. NULL if the indices permit no optimization.
1532: */
1533: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1534: {
1536:   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1537:   PetscBool      optimizable = PETSC_TRUE;
1538:   PetscSFPackOpt opt;

1541:   PetscMalloc1(1,&opt);
1542:   PetscMalloc1(7*n+2,&opt->array);
1543:   opt->n      = opt->array[0] = n;
1544:   opt->offset = opt->array + 1;
1545:   opt->start  = opt->array + n   + 2;
1546:   opt->dx     = opt->array + 2*n + 2;
1547:   opt->dy     = opt->array + 3*n + 2;
1548:   opt->dz     = opt->array + 4*n + 2;
1549:   opt->X      = opt->array + 5*n + 2;
1550:   opt->Y      = opt->array + 6*n + 2;

1552:   for (r=0; r<n; r++) { /* For each destination rank */
1553:     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1554:     p     = offset[r];
1555:     start = idx[p]; /* First index for this rank */
1556:     p++;

1558:     /* Search in X dimension */
1559:     for (dx=1; dx<m; dx++,p++) {
1560:       if (start+dx != idx[p]) break;
1561:     }

1563:     dydz = m/dx;
1564:     X    = dydz > 1 ? (idx[p]-start) : dx;
1565:     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1566:     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1567:     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1568:       for (i=0; i<dx; i++,p++) {
1569:         if (start+X*dy+i != idx[p]) {
1570:           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1571:           else goto Z_dimension;
1572:         }
1573:       }
1574:     }

1576: Z_dimension:
1577:     dz = m/(dx*dy);
1578:     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1579:     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1580:     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1581:     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1582:       for (j=0; j<dy; j++) {
1583:         for (i=0; i<dx; i++,p++) {
1584:           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1585:         }
1586:       }
1587:     }
1588:     opt->start[r] = start;
1589:     opt->dx[r]    = dx;
1590:     opt->dy[r]    = dy;
1591:     opt->dz[r]    = dz;
1592:     opt->X[r]     = X;
1593:     opt->Y[r]     = Y;
1594:   }

1596: finish:
1597:   /* If not optimizable, free arrays to save memory */
1598:   if (!n || !optimizable) {
1599:     PetscFree(opt->array);
1600:     PetscFree(opt);
1601:     *out = NULL;
1602:   } else {
1603:     opt->offset[0] = 0;
1604:     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1605:     *out = opt;
1606:   }
1607:   return(0);
1608: }
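/* A worked example (hypothetical indices): for one destination rank (n=1) with offset = {0,6} and
   idx = {100,101,110,111,120,121}, the scan above finds dx=2 (100,101 are consecutive), then
   X = idx[2]-start = 10, dy=3 rows at stride X, and dz=1. The indices thus form a 2x3 window of a
   row-major array with row length X=10 starting at entry 100, and packing can use a strided loop
   instead of gathering through idx[]. An index set such as {100,101,103} (m=3 is not a multiple of
   dx=2) is not optimizable, and *out is returned as NULL. */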

1610: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1611: {
1613:   PetscSFPackOpt opt = *out;

1616:   if (opt) {
1617:     if (mtype == PETSC_MEMTYPE_HOST) {PetscFree(opt->array);}
1618: #if defined(PETSC_HAVE_CUDA)
1619:     else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1620: #endif
1621:     PetscFree(opt);
1622:     *out = NULL;
1623:   }
1624:   return(0);
1625: }

1627: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1628: {
1630:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1631:   PetscInt       i,j;

1634:   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1635:   for (i=0; i<2; i++) { /* Set defaults */
1636:     sf->leafstart[i]   = 0;
1637:     sf->leafcontig[i]  = PETSC_TRUE;
1638:     sf->leafdups[i]    = PETSC_FALSE;
1639:     bas->rootstart[i]  = 0;
1640:     bas->rootcontig[i] = PETSC_TRUE;
1641:     bas->rootdups[i]   = PETSC_FALSE;
1642:   }

1644:   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1645:   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

1647:   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1648:   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

1650:   /* Are leaf indices for self and remote contiguous? If so, pack/unpack reduces to a single contiguous copy */
1651:   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1652:     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1653:   }
1654:   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1655:     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1656:   }

1658:   /* If not, see if we can have per-rank optimizations by doing index analysis */
1659:   if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);}
1660:   if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}

1662:   /* Are root indices for self and remote contiguous? */
1663:   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1664:   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

1666:   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1667:   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

1669:   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1670:     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1671:   }
1672:   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1673:     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1674:   }

1676:   if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);}
1677:   if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}

1679: #if defined(PETSC_HAVE_CUDA)
1680:     /* Check for dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1685: #endif

1687:   return(0);
1688: }
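/* An example of the contiguity tests above (hypothetical layout): if the remote part of sf->rmine
   is {2,3,4,5}, then leafstart[1]=2, leafcontig[1] stays PETSC_TRUE and remote leaves can be packed
   with one contiguous copy; {2,3,7,8} fails the test at the third entry and instead triggers the
   per-rank PetscSFCreatePackOpt() index analysis. */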

1690: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1691: {
1693:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1694:   PetscInt       i;

1697:   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1698:     PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1699:     PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1700:     PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1701:     PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1702:   }
1703:   return(0);
1704: }