Actual source code: sfpack.c
petsc-3.14.3 2021-01-09
1: #include "petsc/private/sfimpl.h"
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
7: #if defined(PETSC_HAVE_CUDA)
8: #include <cuda_runtime.h>
9: #include <petsccublas.h>
10: #endif
11: /*
12: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
13: * therefore we pack data types manually. This file defines packing routines for the standard data types.
14: */
16: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
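/* For illustration: CPPJoin4 pastes its four arguments together with underscores, e.g.
     CPPJoin4(Pack,PetscInt,2,1)    ->  Pack_PetscInt_2_1
     CPPJoin4(UnpackAndAdd,int,8,0) ->  UnpackAndAdd_int_8_0
   These are the names of the static routines generated by the DEF_* macros below. */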
18: /* Operations working like s += t */
19: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */
20: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
21: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
22: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0)
23: /* Ref MPI MAXLOC */
24: #define OP_XLOC(op,s,t) \
25: do { \
26: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
27: else if (!((s).u op (t).u)) s = t; \
28: } while (0)
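/* For illustration, with Op '>' and the pair structs {u;i} defined near the end of this file,
   OP_XLOC gives MPI_MAXLOC semantics:
     s={5,3}, t={7,1}  ->  s becomes {7,1}   (t.u wins)
     s={5,3}, t={5,1}  ->  s becomes {5,1}   (equal u, keep the smaller index)
     s={5,3}, t={2,9}  ->  s stays   {5,3}   (s.u wins)
   With Op '<' the same macro gives MPI_MINLOC semantics. */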
30: /* DEF_PackFunc - macro defining a Pack routine
32: Arguments of the macro:
33: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
34: .BS Block size for vectorization. It is a factor of bsz.
35: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
37: Arguments of the Pack routine:
38: +count Number of indices in idx[].
39: .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
40: .opt Per-pack optimization plan. NULL means no such plan.
41: .idx Indices of entries to be packed.
42: .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
43: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
44: -packed Address of the packed data.
45: */
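/* For illustration: a unit built as MPI_Type_contiguous(2,MPIU_INT) has bs=2, so the instantiation
   DEF_PackFunc(PetscInt,2,1) below generates Pack_PetscInt_2_1, whose generic (non-contiguous,
   no-opt) branch reduces to
     for (i=0; i<count; i++)
       for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];
   i.e., it gathers the selected entries of the sparse root/leaf array into a contiguous buffer. */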
46: #define DEF_PackFunc(Type,BS,EQ) \
47: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
48: { \
50: const Type *u = (const Type*)unpacked,*u2; \
51: Type *p = (Type*)packed,*p2; \
52: PetscInt i,j,k,X,Y,r,bs=link->bs; \
53: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
54: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
56: if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);}/* idx[] are contiguous */ \
57: else if (opt) { /* has optimizations available */ \
58: p2 = p; \
59: for (r=0; r<opt->n; r++) { \
60: u2 = u + opt->start[r]*MBS; \
61: X = opt->X[r]; \
62: Y = opt->Y[r]; \
63: for (k=0; k<opt->dz[r]; k++) \
64: for (j=0; j<opt->dy[r]; j++) { \
65: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
66: p2 += opt->dx[r]*MBS; \
67: } \
68: } \
69: } else { \
70: for (i=0; i<count; i++) \
71: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
72: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
73: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
74: } \
75: return(0); \
76: }
78: /* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
79: and inserts into a sparse array.
81: Arguments:
82: .Type Type of the data
83: .BS Block size for vectorization
84: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
86: Notes:
87: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
88: */
89: #define DEF_UnpackFunc(Type,BS,EQ) \
90: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
91: { \
93: Type *u = (Type*)unpacked,*u2; \
94: const Type *p = (const Type*)packed; \
95: PetscInt i,j,k,X,Y,r,bs=link->bs; \
96: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
97: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
99: if (!idx) { \
100: u += start*MBS; \
101: if (u != p) {PetscArraycpy(u,p,count*MBS);} \
102: } else if (opt) { /* has optimizations available */ \
103: for (r=0; r<opt->n; r++) { \
104: u2 = u + opt->start[r]*MBS; \
105: X = opt->X[r]; \
106: Y = opt->Y[r]; \
107: for (k=0; k<opt->dz[r]; k++) \
108: for (j=0; j<opt->dy[r]; j++) { \
109: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
110: p += opt->dx[r]*MBS; \
111: } \
112: } \
113: } else { \
114: for (i=0; i<count; i++) \
115: for (j=0; j<M; j++) \
116: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
117: } \
118: return(0); \
119: }
121: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
123: Arguments:
124: +Opname Name of the Op, such as Add, Mult, LAND, etc.
125: .Type Type of the data
126: .BS Block size for vectorization
127: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
128: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
129: .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
130: */
131: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
132: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
133: { \
134: Type *u = (Type*)unpacked,*u2; \
135: const Type *p = (const Type*)packed; \
136: PetscInt i,j,k,X,Y,r,bs=link->bs; \
137: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
138: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
140: if (!idx) { \
141: u += start*MBS; \
142: for (i=0; i<count; i++) \
143: for (j=0; j<M; j++) \
144: for (k=0; k<BS; k++) \
145: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
146: } else if (opt) { /* idx[] has patterns */ \
147: for (r=0; r<opt->n; r++) { \
148: u2 = u + opt->start[r]*MBS; \
149: X = opt->X[r]; \
150: Y = opt->Y[r]; \
151: for (k=0; k<opt->dz[r]; k++) \
152: for (j=0; j<opt->dy[r]; j++) { \
153: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
154: p += opt->dx[r]*MBS; \
155: } \
156: } \
157: } else { \
158: for (i=0; i<count; i++) \
159: for (j=0; j<M; j++) \
160: for (k=0; k<BS; k++) \
161: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
162: } \
163: return(0); \
164: }
166: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
167: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
168: { \
169: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
170: PetscInt i,j,k,r,l,bs=link->bs; \
171: const PetscInt M = (EQ) ? 1 : bs/BS; \
172: const PetscInt MBS = M*BS; \
174: for (i=0; i<count; i++) { \
175: r = (!idx ? start+i : idx[i])*MBS; \
176: l = i*MBS; \
177: for (j=0; j<M; j++) \
178: for (k=0; k<BS; k++) { \
179: tmp = u[r+j*BS+k]; \
180: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
181: p[l+j*BS+k] = tmp; \
182: } \
183: } \
184: return(0); \
185: }
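/* For illustration: the FetchAndAdd routines generated from DEF_FetchAndOp(...,Add,+,OP_BINARY)
   return the old values of the unpacked array in p[] while accumulating p[] into it. E.g., with
   u={10,20}, idx={1}, p={5}: after the call u={10,25} and p={20}. These are the host kernels
   returned by PetscSFLinkGetFetchAndOp() below for MPI_SUM. */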
187: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
188: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
189: { \
191: const Type *u = (const Type*)src; \
192: Type *v = (Type*)dst; \
193: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
194: const PetscInt M = (EQ) ? 1 : bs/BS; \
195: const PetscInt MBS = M*BS; \
197: if (!srcIdx) { /* src is contiguous */ \
198: u += srcStart*MBS; \
199: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
200: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
201: u += srcOpt->start[0]*MBS; \
202: v += dstStart*MBS; \
203: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
204: for (k=0; k<srcOpt->dz[0]; k++) \
205: for (j=0; j<srcOpt->dy[0]; j++) { \
206: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
207: v += srcOpt->dx[0]*MBS; \
208: } \
209: } else { /* all other cases */ \
210: for (i=0; i<count; i++) { \
211: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
212: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
213: for (j=0; j<M; j++) \
214: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
215: } \
216: } \
217: return(0); \
218: }
220: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
221: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
222: { \
223: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
224: const Type *ldata = (const Type*)leafdata; \
225: PetscInt i,j,k,r,l,bs = link->bs; \
226: const PetscInt M = (EQ) ? 1 : bs/BS; \
227: const PetscInt MBS = M*BS; \
229: for (i=0; i<count; i++) { \
230: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
231: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
232: for (j=0; j<M; j++) \
233: for (k=0; k<BS; k++) { \
234: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
235: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
236: } \
237: } \
238: return(0); \
239: }
241: /* Pack, Unpack/Fetch ops */
242: #define DEF_Pack(Type,BS,EQ) \
243: DEF_PackFunc(Type,BS,EQ) \
244: DEF_UnpackFunc(Type,BS,EQ) \
245: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
246: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
247: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
248: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
249: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
250: }
252: /* Add, Mult ops */
253: #define DEF_Add(Type,BS,EQ) \
254: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
255: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
256: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
257: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
258: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
259: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
260: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
261: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
262: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
263: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
264: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
265: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
266: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
267: }
269: /* Max, Min ops */
270: #define DEF_Cmp(Type,BS,EQ) \
271: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
272: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
273: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
274: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
275: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
276: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
277: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
278: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
279: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
280: }
282: /* Logical ops.
283: The operator passed to OP_LXOR is not used and should be empty, but we pass || to avoid
284: the compilation warning "empty macro arguments are undefined in ISO C90"
285: */
286: #define DEF_Log(Type,BS,EQ) \
287: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
288: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
289: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
290: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
291: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
292: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
293: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
294: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
295: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
296: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
297: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
298: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
299: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
300: }
302: /* Bitwise ops */
303: #define DEF_Bit(Type,BS,EQ) \
304: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
305: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
306: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
307: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
308: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
309: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
310: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
311: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
312: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
313: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
314: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
315: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
316: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
317: }
319: /* Maxloc, Minloc ops */
320: #define DEF_Xloc(Type,BS,EQ) \
321: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
322: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
323: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
324: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
325: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
326: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
327: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
328: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
329: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
330: }
332: #define DEF_IntegerType(Type,BS,EQ) \
333: DEF_Pack(Type,BS,EQ) \
334: DEF_Add(Type,BS,EQ) \
335: DEF_Cmp(Type,BS,EQ) \
336: DEF_Log(Type,BS,EQ) \
337: DEF_Bit(Type,BS,EQ) \
338: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
339: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
340: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
341: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
342: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
343: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
344: }
346: #define DEF_RealType(Type,BS,EQ) \
347: DEF_Pack(Type,BS,EQ) \
348: DEF_Add(Type,BS,EQ) \
349: DEF_Cmp(Type,BS,EQ) \
350: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
351: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
352: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
353: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
354: }
356: #if defined(PETSC_HAVE_COMPLEX)
357: #define DEF_ComplexType(Type,BS,EQ) \
358: DEF_Pack(Type,BS,EQ) \
359: DEF_Add(Type,BS,EQ) \
360: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
361: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
362: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
363: }
364: #endif
366: #define DEF_DumbType(Type,BS,EQ) \
367: DEF_Pack(Type,BS,EQ) \
368: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
369: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
370: }
372: /* Maxloc, Minloc */
373: #define DEF_PairType(Type,BS,EQ) \
374: DEF_Pack(Type,BS,EQ) \
375: DEF_Xloc(Type,BS,EQ) \
376: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
377: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
378: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
379: }
381: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
382: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
383: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
384: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
385: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
386: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
387: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
388: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
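/* For illustration of the selection in PetscSFLinkSetUp_Host() below: a unit of 8 MPIU_INTs uses the
   (BS=8,EQ=1) routines; a unit of 6 MPIU_INTs is not a multiple of 8 or 4, so it falls back to
   (BS=2,EQ=0) with bs=6; a unit of a single MPIU_INT uses (BS=1,EQ=1). */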
390: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
391: DEF_IntegerType(int,1,1)
392: DEF_IntegerType(int,2,1)
393: DEF_IntegerType(int,4,1)
394: DEF_IntegerType(int,8,1)
395: DEF_IntegerType(int,1,0)
396: DEF_IntegerType(int,2,0)
397: DEF_IntegerType(int,4,0)
398: DEF_IntegerType(int,8,0)
399: #endif
401: /* The typedefs are used to get typenames without spaces, which CPPJoin4 can handle */
402: typedef signed char SignedChar;
403: DEF_IntegerType(SignedChar,1,1)
404: DEF_IntegerType(SignedChar,2,1)
405: DEF_IntegerType(SignedChar,4,1)
406: DEF_IntegerType(SignedChar,8,1)
407: DEF_IntegerType(SignedChar,1,0)
408: DEF_IntegerType(SignedChar,2,0)
409: DEF_IntegerType(SignedChar,4,0)
410: DEF_IntegerType(SignedChar,8,0)
412: typedef unsigned char UnsignedChar;
413: DEF_IntegerType(UnsignedChar,1,1)
414: DEF_IntegerType(UnsignedChar,2,1)
415: DEF_IntegerType(UnsignedChar,4,1)
416: DEF_IntegerType(UnsignedChar,8,1)
417: DEF_IntegerType(UnsignedChar,1,0)
418: DEF_IntegerType(UnsignedChar,2,0)
419: DEF_IntegerType(UnsignedChar,4,0)
420: DEF_IntegerType(UnsignedChar,8,0)
422: DEF_RealType(PetscReal,1,1)
423: DEF_RealType(PetscReal,2,1)
424: DEF_RealType(PetscReal,4,1)
425: DEF_RealType(PetscReal,8,1)
426: DEF_RealType(PetscReal,1,0)
427: DEF_RealType(PetscReal,2,0)
428: DEF_RealType(PetscReal,4,0)
429: DEF_RealType(PetscReal,8,0)
431: #if defined(PETSC_HAVE_COMPLEX)
432: DEF_ComplexType(PetscComplex,1,1)
433: DEF_ComplexType(PetscComplex,2,1)
434: DEF_ComplexType(PetscComplex,4,1)
435: DEF_ComplexType(PetscComplex,8,1)
436: DEF_ComplexType(PetscComplex,1,0)
437: DEF_ComplexType(PetscComplex,2,0)
438: DEF_ComplexType(PetscComplex,4,0)
439: DEF_ComplexType(PetscComplex,8,0)
440: #endif
442: #define PairType(Type1,Type2) Type1##_##Type2
443: typedef struct {int u; int i;} PairType(int,int);
444: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
445: DEF_PairType(PairType(int,int),1,1)
446: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
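/* For illustration: these pair structs back MPI_MAXLOC/MPI_MINLOC. In PetscSFLinkSetUp_Host() below,
   a unit of MPI_2INT maps to PairType(int,int) via PackInit_PairType_int_int_1_1, and MPIU_2INT maps
   to PairType(PetscInt,PetscInt); 'u' holds the value and 'i' the index, matching the layout MPI
   prescribes for the *LOC reductions. */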
448: /* If we don't know the basic type, we treat it as a stream of chars or ints */
449: DEF_DumbType(char,1,1)
450: DEF_DumbType(char,2,1)
451: DEF_DumbType(char,4,1)
452: DEF_DumbType(char,1,0)
453: DEF_DumbType(char,2,0)
454: DEF_DumbType(char,4,0)
456: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
457: DEF_DumbType(DumbInt,1,1)
458: DEF_DumbType(DumbInt,2,1)
459: DEF_DumbType(DumbInt,4,1)
460: DEF_DumbType(DumbInt,8,1)
461: DEF_DumbType(DumbInt,1,0)
462: DEF_DumbType(DumbInt,2,0)
463: DEF_DumbType(DumbInt,4,0)
464: DEF_DumbType(DumbInt,8,0)
466: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
467: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
468: {
469: int ierr;
470: ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
471: ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
472: return MPI_SUCCESS;
473: }
474: #endif
476: /*
477: The routine creates a communication link for the given operation. It first looks up its link cache. If
478: there is a free & suitable one, it uses it. Otherwise it creates a new one.
480: A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
481: root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
482: can be directly passed to MPI, we won't allocate them. Even when we do allocate buffers, we only allocate
483: those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.
485: The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.
487: In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
489: The routine is shared by SFBasic and SFNeighbor, based on the fact that both deal with sparse graphs and
490: need to pack/unpack data.
491: */
492: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
493: {
494: PetscErrorCode ierr;
495: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
496: PetscInt i,j,k,nrootreqs,nleafreqs,nreqs;
497: PetscSFLink *p,link;
498: PetscSFDirection direction;
499: MPI_Request *reqs = NULL;
500: PetscBool match,rootdirect[2],leafdirect[2];
501: PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
502: PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
505: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
507: /* Can we directly use root/leafdirect with the given sf, sfop and op? */
508: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
509: if (sfop == PETSCSF_BCAST) {
510: rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
511: leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
512: } else if (sfop == PETSCSF_REDUCE) {
513: leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
514: rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
515: } else { /* PETSCSF_FETCH */
516: rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */ \
517: leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
518: }
519: }
521: if (sf->use_gpu_aware_mpi) {
522: rootmtype_mpi = rootmtype;
523: leafmtype_mpi = leafmtype;
524: } else {
525: rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
526: }
527: /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
528: rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
529: leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
531: direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
532: nrootreqs = bas->nrootreqs;
533: nleafreqs = sf->nleafreqs;
535: /* Look for free links in cache */
536: for (p=&bas->avail; (link=*p); p=&link->next) {
537: MPIPetsc_Type_compare(unit,link->unit,&match);
538: if (match) {
539: /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
540: If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
541: */
542: if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
543: reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
544: for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
545: link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
546: }
547: if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
548: reqs = link->leafreqs[direction][leafmtype][1];
549: for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
550: link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
551: }
552: *p = link->next; /* Remove from available list */
553: goto found;
554: }
555: }
557: PetscNew(&link);
558: PetscSFLinkSetUp_Host(sf,link,unit);
559: PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag); /* One tag per link */
561: nreqs = (nrootreqs+nleafreqs)*8;
562: PetscMalloc1(nreqs,&link->reqs);
563: for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
565: for (i=0; i<2; i++) { /* Two communication directions */
566: for (j=0; j<2; j++) { /* Two memory types */
567: for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
568: link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
569: link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
570: }
571: }
572: }
574: found:
576: #if defined(PETSC_HAVE_DEVICE)
577: if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {
578: #if defined(PETSC_HAVE_CUDA)
579: if (sf->backend == PETSCSF_BACKEND_CUDA) {PetscSFLinkSetUp_Cuda(sf,link,unit);}
580: #endif
581: #if defined(PETSC_HAVE_KOKKOS)
582: if (sf->backend == PETSCSF_BACKEND_KOKKOS) {PetscSFLinkSetUp_Kokkos(sf,link,unit);}
583: #endif
584: }
585: #endif
587: /* Allocate buffers along root/leafdata */
588: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
589: /* For local communication, buffers are only needed when roots and leaves have different mtypes */
590: if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
591: if (bas->rootbuflen[i]) {
592: if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
593: link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
594: } else { /* Have to have a separate rootbuf */
595: if (!link->rootbuf_alloc[i][rootmtype]) {
596: PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);
597: }
598: link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
599: }
600: }
602: if (sf->leafbuflen[i]) {
603: if (leafdirect[i]) {
604: link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
605: } else {
606: if (!link->leafbuf_alloc[i][leafmtype]) {
607: PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);
608: }
609: link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
610: }
611: }
612: }
614: #if defined(PETSC_HAVE_DEVICE)
615: /* Allocate buffers on host for buffering data on device in case gpu-aware MPI is not used */
616: if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
617: if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
618: PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
619: }
620: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
621: }
622: if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
623: if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
624: PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
625: }
626: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
627: }
628: #endif
630: /* Set the `current` state of the link. These fields may change between different SF invocations that reuse the same link */
631: if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
632: if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
633: if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
634: }
636: link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
637: link->leafdata = leafdata;
638: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
639: link->rootdirect[i] = rootdirect[i];
640: link->leafdirect[i] = leafdirect[i];
641: }
642: link->rootdirect_mpi = rootdirect_mpi;
643: link->leafdirect_mpi = leafdirect_mpi;
644: link->rootmtype = rootmtype;
645: link->leafmtype = leafmtype;
646: link->rootmtype_mpi = rootmtype_mpi;
647: link->leafmtype_mpi = leafmtype_mpi;
649: link->next = bas->inuse;
650: bas->inuse = link;
651: *mylink = link;
652: return(0);
653: }
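/* Illustrative sketch of how an SF implementation such as SFBasic may drive the link machinery above
   during a broadcast (root -> leaf). This is a rough outline, not the actual SFBasic code; counts such
   as nrootreqs/nleafreqs are placeholders here.

     PetscSFLink  link;
     MPI_Request *rootreqs,*leafreqs;

     PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,MPIU_REPLACE,PETSCSF_BCAST,&link);
     PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);        // fill link->rootbuf (may be a no-op)
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,NULL,NULL,&rootreqs,&leafreqs);
     MPI_Startall(nleafreqs,leafreqs);                                // start persistent recvs into leafbuf
     MPI_Startall(nrootreqs,rootreqs);                                // start persistent sends of rootbuf
     ...
     // in the matching *End() routine:
     PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);
     // wait on the requests, unpack leafbuf into leafdata using the op chosen via PetscSFLinkGetUnpackAndOp(), then
     PetscSFLinkReclaim(sf,&link);                                    // return the link to bas->avail
*/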
655: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
656: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
657: */
658: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
659: {
660: PetscErrorCode ierr;
661: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
662: PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
663: const PetscInt *rootoffset,*leafoffset;
664: PetscMPIInt n;
665: MPI_Aint disp;
666: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
667: MPI_Datatype unit = link->unit;
668: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
669: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
672: /* Init persistent MPI requests if they have not been initialized. Currently only SFBasic uses persistent MPI requests */
673: if (sf->persistent) {
674: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
675: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
676: if (direction == PETSCSF_LEAF2ROOT) {
677: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
678: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
679: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
680: MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
681: }
682: } else { /* PETSCSF_ROOT2LEAF */
683: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
684: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
685: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
686: MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
687: }
688: }
689: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
690: }
692: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
693: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
694: if (direction == PETSCSF_LEAF2ROOT) {
695: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
696: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
697: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
698: MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
699: }
700: } else { /* PETSCSF_ROOT2LEAF */
701: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
702: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
703: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
704: MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
705: }
706: }
707: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
708: }
709: }
710: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
711: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
712: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
713: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
714: return(0);
715: }
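/* Worked example for the request setup above: with rootoffset = {0,2,5,9} and ndrootranks = 1
   (one distinguished, i.e. local, rank), the two remote root ranks get persistent requests over
   contiguous slices of rootbuf[PETSCSF_REMOTE] of 3 and 4 units, at byte offsets 0 and
   3*link->unitbytes respectively; the leaf side is handled analogously with leafoffset. */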
718: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
719: {
720: PetscErrorCode ierr;
721: PetscSFLink link,*p;
722: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
725: /* Look for types in cache */
726: for (p=&bas->inuse; (link=*p); p=&link->next) {
727: PetscBool match;
728: MPIPetsc_Type_compare(unit,link->unit,&match);
729: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
730: switch (cmode) {
731: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
732: case PETSC_USE_POINTER: break;
733: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
734: }
735: *mylink = link;
736: return(0);
737: }
738: }
739: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
740: }
742: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
743: {
744: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
747: (*link)->rootdata = NULL;
748: (*link)->leafdata = NULL;
749: (*link)->next = bas->avail;
750: bas->avail = *link;
751: *link = NULL;
752: return(0);
753: }
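/* A link thus cycles between two lists on the PetscSF_Basic object: PetscSFLinkCreate() takes it from
   (or adds it to) bas->avail and puts it on bas->inuse; the *End() routines locate it again with
   PetscSFLinkGetInUse(), keyed by (unit,rootdata,leafdata); PetscSFLinkReclaim() returns it to
   bas->avail; and PetscSFLinkDestroy() below frees the whole avail chain, typically when the SF is
   reset or destroyed. */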
755: /* Destroy all links chained in 'avail' */
756: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
757: {
758: PetscErrorCode ierr;
759: PetscSFLink link = *avail,next;
760: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
761: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
764: for (; link; link=next) {
765: next = link->next;
766: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
767: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
768: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
769: }
770: PetscFree(link->reqs);
771: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
772: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
773: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
774: #if defined(PETSC_HAVE_DEVICE)
775: PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
776: PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
777: #endif
778: }
779: #if defined(PETSC_HAVE_DEVICE)
780: if (link->Destroy) {(*link->Destroy)(link);}
781: #if defined(PETSC_HAVE_CUDA)
782: if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
783: #elif defined(PETSC_HAVE_HIP)
784: if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
785: #endif
786: #endif
787: PetscFree(link);
788: }
789: *avail = NULL;
790: return(0);
791: }
793: /* Error out on unsupported overlapped communications */
794: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
795: {
796: PetscErrorCode ierr;
797: PetscSFLink link,*p;
798: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
799: PetscBool match;
802: if (PetscDefined(USE_DEBUG)) {
803: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
804: the potential overlapping since this process does not participate in communication. Overlapping is harmless.
805: */
806: if (rootdata || leafdata) {
807: for (p=&bas->inuse; (link=*p); p=&link->next) {
808: MPIPetsc_Type_compare(unit,link->unit,&match);
809: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
810: }
811: }
812: }
813: return(0);
814: }
816: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
817: {
819: if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
820: return(0);
821: }
824: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
825: {
827: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
828: PetscBool is2Int,is2PetscInt;
829: PetscMPIInt ni,na,nd,combiner;
830: #if defined(PETSC_HAVE_COMPLEX)
831: PetscInt nPetscComplex=0;
832: #endif
835: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
836: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
837: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
838: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
839: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
840: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
841: #if defined(PETSC_HAVE_COMPLEX)
842: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
843: #endif
844: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
845: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
846: /* TODO: shall we also handle Fortran MPI_2REAL? */
847: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
848: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
849: link->bs = 1; /* default */
851: if (is2Int) {
852: PackInit_PairType_int_int_1_1(link);
853: link->bs = 1;
854: link->unitbytes = 2*sizeof(int);
855: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
856: link->basicunit = MPI_2INT;
857: link->unit = MPI_2INT;
858: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
859: PackInit_PairType_PetscInt_PetscInt_1_1(link);
860: link->bs = 1;
861: link->unitbytes = 2*sizeof(PetscInt);
862: link->basicunit = MPIU_2INT;
863: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
864: link->unit = MPIU_2INT;
865: } else if (nPetscReal) {
866: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
867: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
868: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
869: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
870: link->bs = nPetscReal;
871: link->unitbytes = nPetscReal*sizeof(PetscReal);
872: link->basicunit = MPIU_REAL;
873: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
874: } else if (nPetscInt) {
875: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
876: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
877: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
878: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
879: link->bs = nPetscInt;
880: link->unitbytes = nPetscInt*sizeof(PetscInt);
881: link->basicunit = MPIU_INT;
882: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
883: #if defined(PETSC_USE_64BIT_INDICES)
884: } else if (nInt) {
885: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
886: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
887: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
888: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
889: link->bs = nInt;
890: link->unitbytes = nInt*sizeof(int);
891: link->basicunit = MPI_INT;
892: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
893: #endif
894: } else if (nSignedChar) {
895: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
896: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
897: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
898: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
899: link->bs = nSignedChar;
900: link->unitbytes = nSignedChar*sizeof(SignedChar);
901: link->basicunit = MPI_SIGNED_CHAR;
902: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
903: } else if (nUnsignedChar) {
904: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
905: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
906: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
907: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
908: link->bs = nUnsignedChar;
909: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
910: link->basicunit = MPI_UNSIGNED_CHAR;
911: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
912: #if defined(PETSC_HAVE_COMPLEX)
913: } else if (nPetscComplex) {
914: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
915: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
916: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
917: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
918: link->bs = nPetscComplex;
919: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
920: link->basicunit = MPIU_COMPLEX;
921: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
922: #endif
923: } else {
924: MPI_Aint lb,nbyte;
925: MPI_Type_get_extent(unit,&lb,&nbyte);
926: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
927: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
928: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
929: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
930: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
931: link->bs = nbyte;
932: link->unitbytes = nbyte;
933: link->basicunit = MPI_BYTE;
934: } else {
935: nInt = nbyte / sizeof(int);
936: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
937: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
938: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
939: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
940: link->bs = nInt;
941: link->unitbytes = nbyte;
942: link->basicunit = MPI_INT;
943: }
944: if (link->isbuiltin) link->unit = unit;
945: }
947: if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
949: link->Memcpy = PetscSFLinkMemcpy_Host;
950: return(0);
951: }
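/* Worked example for the dispatch above: for a unit created with MPI_Type_contiguous(6,MPIU_REAL),
   MPIPetsc_Type_compare_contig() gives nPetscReal = 6, so PackInit_RealType_PetscReal_2_0 (BS=2,EQ=0)
   is selected, with bs = 6, unitbytes = 6*sizeof(PetscReal), basicunit = MPIU_REAL; the unit is not an
   MPI builtin, hence it is duplicated with MPI_Type_dup at the end. */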
953: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
954: {
956: *UnpackAndOp = NULL;
957: if (mtype == PETSC_MEMTYPE_HOST) {
958: if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
959: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
960: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
961: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
962: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
963: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
964: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
965: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
966: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
967: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
968: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
969: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
970: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
971: }
972: #if defined(PETSC_HAVE_DEVICE)
973: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
974: if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
975: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
976: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
977: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
978: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
979: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
980: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
981: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
982: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
983: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
984: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
985: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
986: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
987: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
988: if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
989: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
990: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
991: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
992: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
993: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
994: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
995: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
996: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
997: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
998: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
999: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
1000: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
1001: }
1002: #endif
1003: return(0);
1004: }
1006: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
1007: {
1009: *ScatterAndOp = NULL;
1010: if (mtype == PETSC_MEMTYPE_HOST) {
1011: if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
1012: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1013: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
1014: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1015: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1016: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
1017: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
1018: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
1019: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
1020: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
1021: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
1022: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
1023: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
1024: }
1025: #if defined(PETSC_HAVE_DEVICE)
1026: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1027: if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
1028: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1029: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
1030: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1031: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1032: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
1033: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
1034: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
1035: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
1036: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
1037: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
1038: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
1039: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
1040: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1041: if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
1042: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1043: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
1044: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1045: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1046: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
1047: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
1048: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
1049: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
1050: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
1051: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
1052: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
1053: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
1054: }
1055: #endif
1056: return(0);
1057: }
1059: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1060: {
1062: *FetchAndOp = NULL;
1063: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1064: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1065: #if defined(PETSC_HAVE_DEVICE)
1066: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1067: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd;
1068: #endif
1069: return(0);
1070: }
1072: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1073: {
1075: *FetchAndOpLocal = NULL;
1076: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1077: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1078: #if defined(PETSC_HAVE_DEVICE)
1079: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1080: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
1081: #endif
1082: return(0);
1083: }
1085: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1086: {
1088: PetscLogDouble flops;
1089: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1093: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1094: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1095: #if defined(PETSC_HAVE_DEVICE)
1096: if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1097: #endif
1098: {PetscLogFlops(flops);}
1099: }
1100: return(0);
1101: }
1103: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1104: {
1105: PetscLogDouble flops;
1109: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1110: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
1111: #if defined(PETSC_HAVE_DEVICE)
1112: if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1113: #endif
1114: {PetscLogFlops(flops);}
1115: }
1116: return(0);
1117: }
1119: /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1120: Input Arguments:
1121: +sf - The StarForest
1122: .link - The link
1123: .count - Number of entries to unpack
1124: .start - The first index, significant when indices=NULL
1125: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1126: .buf - A contiguous buffer to unpack from
1127: -op - Operation after unpack
1129: Output Arguments:
1130: .data - The data to unpack to
1131: */
1132: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1133: {
1135: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1136: {
1138: PetscInt i;
1139: PetscMPIInt n;
1140: if (indices) {
1141: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
1142: element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
1143: */
1144: for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
1145: } else {
1146: PetscMPIIntCast(count,&n);
1147: MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
1148: }
1149: }
1150: return(0);
1151: #else
1152: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1153: #endif
1154: }
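/* This fallback is what keeps ops without a dedicated UnpackAndOp kernel working, e.g. a user-defined
   MPI_Op created with MPI_Op_create, at the cost of one MPI_Reduce_local call per sparse index. */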
1156: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1157: {
1159: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1160: {
1162: PetscInt i,disp;
1163: if (!srcIdx) {
1164: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
1165: } else {
1166: for (i=0; i<count; i++) {
1167: disp = dstIdx? dstIdx[i] : dstStart + i;
1168: MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1169: }
1170: }
1171: }
1172: return(0);
1173: #else
1174: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1175: #endif
1176: }
1178: /*=============================================================================
1179: Pack/Unpack/Fetch/Scatter routines
1180: ============================================================================*/
1182: /* Pack rootdata to rootbuf
1183: Input Arguments:
1184: + sf - The SF this packing works on.
1185: . link - It gives the memtype of the roots and also provides root buffer.
1186: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1187: - rootdata - Where to read the roots.
1189: Notes:
1190: When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, the root data is
1191: in a place the underlying MPI can readily access (whether gpu-aware MPI is used or not).
1192: */
1193: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1194: {
1195: PetscErrorCode ierr;
1196: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1197: const PetscInt *rootindices = NULL;
1198: PetscInt count,start;
1199: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1200: PetscMemType rootmtype = link->rootmtype;
1201: PetscSFPackOpt opt = NULL;
1204: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1205: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1206: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1207: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1208: PetscSFLinkGetPack(link,rootmtype,&Pack);
1209: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1210: }
1211: if (scope == PETSCSF_REMOTE) {
1212: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1213: PetscSFLinkSyncStreamAfterPackRootData(sf,link);
1214: }
1215: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1216: return(0);
1217: }
1219: /* Pack leafdata to leafbuf */
1220: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1221: {
1222: PetscErrorCode ierr;
1223: const PetscInt *leafindices = NULL;
1224: PetscInt count,start;
1225: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1226: PetscMemType leafmtype = link->leafmtype;
1227: PetscSFPackOpt opt = NULL;
1230: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1231: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1232: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1233: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1234: PetscSFLinkGetPack(link,leafmtype,&Pack);
1235: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1236: }
1237: if (scope == PETSCSF_REMOTE) {
1238: PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1239: PetscSFLinkSyncStreamAfterPackLeafData(sf,link);
1240: }
1241: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1242: return(0);
1243: }
1245: /* Unpack rootbuf to rootdata */
1246: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1247: {
1248: PetscErrorCode ierr;
1249: const PetscInt *rootindices = NULL;
1250: PetscInt count,start;
1251: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1252: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1253: PetscMemType rootmtype = link->rootmtype;
1254: PetscSFPackOpt opt = NULL;
1257: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1258: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1259: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1260: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1261: if (UnpackAndOp) {
1262: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1263: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1264: } else {
1265: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1266: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1267: }
1268: }
1269: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);}
1270: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1271: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1272: return(0);
1273: }
1275: /* Unpack leafbuf to leafdata */
1276: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1277: {
1278: PetscErrorCode ierr;
1279: const PetscInt *leafindices = NULL;
1280: PetscInt count,start;
1281: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1282: PetscMemType leafmtype = link->leafmtype;
1283: PetscSFPackOpt opt = NULL;
1286: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1287: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1288: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1289: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1290: if (UnpackAndOp) {
1291: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1292: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1293: } else {
1294: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1295: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1296: }
1297: }
1298: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);}
1299: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1300: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1301: return(0);
1302: }
1304: /* FetchAndOp rootdata with rootbuf */
1305: PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1306: {
1307: PetscErrorCode ierr;
1308: const PetscInt *rootindices = NULL;
1309: PetscInt count,start;
1310: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1311: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1312: PetscMemType rootmtype = link->rootmtype;
1313: PetscSFPackOpt opt = NULL;
1316: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1317: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1318: if (bas->rootbuflen[scope]) {
1319: /* Do FetchAndOp on rootdata with rootbuf */
1320: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);
1321: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1322: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1323: }
1324: if (scope == PETSCSF_REMOTE) {
1325: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);
1326: PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);
1327: }
1328: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1329: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1330: return(0);
1331: }
1333: /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1334: PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1335: {
1336: PetscErrorCode ierr;
1337: const PetscInt *rootindices = NULL,*leafindices = NULL;
1338: PetscInt count,rootstart,leafstart;
1339: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1340: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1341: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1342: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1345: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1346: if (rootmtype != leafmtype) { /* Uncommon case */
1347: /* The local communication has to go through pack and unpack */
1348: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1349: (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);
1350: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1351: } else {
1352: PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);
1353: if (ScatterAndOp) {
1354: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1355: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1356: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1357: } else {
1358: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1359: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1360: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1361: }
1362: }
1363: return(0);
1364: }
1366: /* Reduce leafdata to rootdata locally */
1367: PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1368: {
1369: PetscErrorCode ierr;
1370: const PetscInt *rootindices = NULL,*leafindices = NULL;
1371: PetscInt count,rootstart,leafstart;
1372: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1373: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1374: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1375: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1378: if (!sf->leafbuflen[PETSCSF_LOCAL]) return(0);
1379: if (rootmtype != leafmtype) {
1380: /* The local communication has to go through pack and unpack */
1381: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1382: (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);
1383: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1384: } else {
1385: PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);
1386: if (ScatterAndOp) {
1387: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1388: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1389: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1390: } else {
1391: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1392: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1393: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1394: }
1395: }
1396: return(0);
1397: }
1399: /* Fetch rootdata into leafupdate and reduce leafdata into rootdata locally */
1400: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1401: {
1402: PetscErrorCode ierr;
1403: const PetscInt *rootindices = NULL,*leafindices = NULL;
1404: PetscInt count,rootstart,leafstart;
1405: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1406: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1407: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1408: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1411: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1412: if (rootmtype != leafmtype) {
1413: /* The local communication has to go through pack and unpack */
1414: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1415: } else {
1416: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1417: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1418: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1419: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1420: }
1421: return(0);
1422: }
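/* Illustrative sketch of the semantics, assuming op = MPI_SUM: for a local root r connected to a
   leaf l, the selected kernel performs, per unit,
     leafupdate[l] = rootdata[r];   fetch the root value as it was before this update
     rootdata[r]  += leafdata[l];   then apply the op
   When several leaves share a root (bas->rootdups[PETSCSF_LOCAL] is true), the kernel returned by
   PetscSFLinkGetFetchAndOpLocal() must perform the two steps for each root without interleaving,
   e.g. with atomics on the device, so every leaf observes a consistent pre-update value. */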
1424: /*
1425: Create per-rank pack/unpack optimizations based on index patterns
1427: Input Parameters:
1428: + n - Number of destination ranks
1429: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1430: - idx - [*] Array storing indices
1432: Output Parameters:
1433: . out - Pack optimizations; NULL if the indices do not match an optimizable pattern.
1434: */
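/* Illustrative example of the pattern recognized below: if one destination rank has
   idx[] = {5,6,7, 15,16,17, 25,26,27}, the analysis finds start=5, dx=3, dy=3, dz=1 with strides
   X=10, Y=3, i.e. a 3x3 face of a row-major grid whose rows hold 10 entries. Packing for that rank
   then becomes dy*dz = 3 contiguous copies of dx = 3 entries starting at indices 5, 15 and 25,
   instead of 9 individually indexed loads. */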
1435: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1436: {
1438: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1439: PetscBool optimizable = PETSC_TRUE;
1440: PetscSFPackOpt opt;
1443: PetscMalloc1(1,&opt);
1444: PetscMalloc1(7*n+2,&opt->array);
1445: opt->n = opt->array[0] = n;
1446: opt->offset = opt->array + 1;
1447: opt->start = opt->array + n + 2;
1448: opt->dx = opt->array + 2*n + 2;
1449: opt->dy = opt->array + 3*n + 2;
1450: opt->dz = opt->array + 4*n + 2;
1451: opt->X = opt->array + 5*n + 2;
1452: opt->Y = opt->array + 6*n + 2;
1454: for (r=0; r<n; r++) { /* For each destination rank */
1455: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1456: p = offset[r];
1457: start = idx[p]; /* First index for this rank */
1458: p++;
1460: /* Search in X dimension */
1461: for (dx=1; dx<m; dx++,p++) {
1462: if (start+dx != idx[p]) break;
1463: }
1465: dydz = m/dx;
1466: X = dydz > 1 ? (idx[p]-start) : dx;
1467: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1468: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1469: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1470: for (i=0; i<dx; i++,p++) {
1471: if (start+X*dy+i != idx[p]) {
1472: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1473: else goto Z_dimension;
1474: }
1475: }
1476: }
1478: Z_dimension:
1479: dz = m/(dx*dy);
1480: Y = dz > 1 ? (idx[p]-start)/X : dy;
1481: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1482: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1483: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1484: for (j=0; j<dy; j++) {
1485: for (i=0; i<dx; i++,p++) {
1486: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1487: }
1488: }
1489: }
1490: opt->start[r] = start;
1491: opt->dx[r] = dx;
1492: opt->dy[r] = dy;
1493: opt->dz[r] = dz;
1494: opt->X[r] = X;
1495: opt->Y[r] = Y;
1496: }
1498: finish:
1499: /* If not optimizable, free arrays to save memory */
1500: if (!n || !optimizable) {
1501: PetscFree(opt->array);
1502: PetscFree(opt);
1503: *out = NULL;
1504: } else {
1505: opt->offset[0] = 0;
1506: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1507: *out = opt;
1508: }
1509: return(0);
1510: }
1512: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1513: {
1515: PetscSFPackOpt opt = *out;
1518: if (opt) {
1519: PetscSFFree(sf,mtype,opt->array);
1520: PetscFree(opt);
1521: *out = NULL;
1522: }
1523: return(0);
1524: }
1526: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1527: {
1529: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1530: PetscInt i,j;
1533: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1534: for (i=0; i<2; i++) { /* Set defaults */
1535: sf->leafstart[i] = 0;
1536: sf->leafcontig[i] = PETSC_TRUE;
1537: sf->leafdups[i] = PETSC_FALSE;
1538: bas->rootstart[i] = 0;
1539: bas->rootcontig[i] = PETSC_TRUE;
1540: bas->rootdups[i] = PETSC_FALSE;
1541: }
1543: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1544: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
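/* Illustrative example: with nranks = 3 neighbor ranks, of which ndranks = 1 is the distinguished
   self rank, and roffset[] = {0,4,9,12}, the local leaf buffer holds leafbuflen[0] = 4 units and
   the remote one leafbuflen[1] = 12 - 4 = 8 units. */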
1546: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1547: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1549: /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
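/* Illustrative example: rmine[] = {4,5,6,7} for the self part is contiguous (leafstart[0] = 4),
   so packing reduces to a single array copy; rmine[] = {4,6,8} is not, in which case we either
   pack with explicit indices or try the per-rank pattern analysis further below. */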
1550: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1551: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1552: }
1553: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1554: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1555: }
1557: /* If not, see if we can have per-rank optimizations by doing index analysis */
1558: if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);}
1559: if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}
1561: /* Are root indices for self and remote contiguous? */
1562: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1563: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1565: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1566: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1568: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1569: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1570: }
1571: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1572: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1573: }
1575: if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);}
1576: if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}
1578: #if defined(PETSC_HAVE_DEVICE)
1579: /* Check for duplicate indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1584: #endif
1586: return(0);
1587: }
1589: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1590: {
1592: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1593: PetscInt i;
1596: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1597: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1598: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1599: #if defined(PETSC_HAVE_DEVICE)
1600: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1601: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1602: #endif
1603: }
1604: return(0);
1605: }