Actual source code: sfpack.c
petsc-3.12.2 2019-11-22
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: #if defined(PETSC_HAVE_CUDA)
6: #include <cuda_runtime.h>
7: #endif
8: /*
9: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10: * therefore we pack data types manually. This file defines packing routines for the standard data types.
11: */
13: #define CPPJoin2(a,b) a ##_## b
14: #define CPPJoin3(a,b,c) a ##_## b ##_## c
15: #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d
16: #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d)
18: #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
19: #define IGNORE(statement) do {} while(0)
21: #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */
22: #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */
23: #define LXOR_OP(r,s,op,t) do {(r) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */
24: #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
26: #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
28: /* DEF_PackFunc - macro defining a Pack routine
30: Arguments of the macro:
31: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
32: .BS Block size for vectorization. It is a factor of bs.
33: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
35: Arguments of the Pack routine:
36: +count Number of indices in idx[]
37: .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count)
38: .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39: .opt Pack optimization plans. NULL means no plan at all.
40: .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
41: -packed Address of the packed data for each rank
42: */
43: #define DEF_PackFunc(Type,BS,EQ) \
44: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45: { \
47: const Type *u = (const Type*)unpacked,*u2; \
48: Type *p = (Type*)packed,*p2; \
49: PetscInt i,j,k,l,r,step,bs=link->bs; \
50: const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
51: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
53: if (!idx) {PetscArraycpy(p,u,MBS*count);} /* Indices are contiguous */ \
54: else if (!opt) { /* No optimizations available */ \
55: for (i=0; i<count; i++) \
56: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
57: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
58: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
59: } else { \
60: for (r=0; r<opt->n; r++) { \
61: p2 = p + opt->offset[r]*MBS; \
62: if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \
63: idx2 = idx + opt->offset[r]; \
64: for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \
65: for (j=0; j<M; j++) \
66: for (k=0; k<BS; k++) \
67: p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \
68: } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \
69: for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \
70: u2 = u + idx[opt->copy_start[i]]*MBS; \
71: l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \
72: PetscArraycpy(p2,u2,l); \
73: p2 += l; \
74: } \
75: } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \
76: u2 = u + idx[opt->offset[r]]*MBS; \
77: step = opt->stride_step[r]; \
78: for (i=0; i<opt->stride_n[r]; i++) \
79: for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \
80: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \
81: } \
82: } \
83: return(0); \
84: }
86: /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
88: Arguments:
89: +action Unpack or Fetch
90: .Type Type of the data
91: .BS Block size for vectorization
92: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
93: .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE
94: .CType Type with or without the const qualifier, i.e., const Type or Type
95: .Cvoid void with or without the const qualifier, i.e., const void or void
97: Notes:
98: This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
99: The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
100: get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
101: sometimes we input 'const', sometimes we have to input empty.
103: If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
104: */
105: #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \
106: static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
107: { \
109: Type *u = (Type*)unpacked,*u2; \
110: CType *p = (CType*)packed,*p2; \
111: PetscInt i,j,k,l,r,step,bs=link->bs; \
112: const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
113: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
115: if (!idx) { \
116: FILTER(Type *v); \
117: FILTER(PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \
118: FILTER(PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \
119: PetscArraycpy(u,p,count*MBS); \
120: FILTER(PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \
121: FILTER(PetscFree(v);CHKERRQ(ierr)); \
122: } else if (!opt) { /* No optimizations available */ \
123: for (i=0; i<count; i++) \
124: for (j=0; j<M; j++) \
125: for (k=0; k<BS; k++) { \
126: FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \
127: u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
128: FILTER(p[i*MBS+j*BS+k] = t); \
129: } \
130: } else { \
131: for (r=0; r<opt->n; r++) { \
132: p2 = p + opt->offset[r]*MBS; \
133: if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \
134: idx2 = idx + opt->offset[r]; \
135: for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \
136: for (j=0; j<M; j++) \
137: for (k=0; k<BS; k++) { \
138: FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \
139: u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \
140: FILTER(p2[i*MBS+j*BS+k] = t); \
141: } \
142: } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \
143: FILTER(Type *v); \
144: FILTER(PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
145: for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \
146: u2 = u + idx[opt->copy_start[i]]*MBS; \
147: l = opt->copy_length[i]*MBS; \
148: FILTER(PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \
149: PetscArraycpy(u2,p2,l); \
150: FILTER(PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \
151: p2 += l; \
152: } \
153: FILTER(PetscFree(v);CHKERRQ(ierr)); \
154: } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \
155: u2 = u + idx[opt->offset[r]]*MBS; \
156: step = opt->stride_step[r]; \
157: for (i=0; i<opt->stride_n[r]; i++) \
158: for (j=0; j<M; j++) \
159: for (k=0; k<BS; k++) { \
160: FILTER(Type t = u2[i*step*MBS+j*BS+k]); \
161: u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \
162: FILTER(p2[i*MBS+j*BS+k] = t); \
163: } \
164: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \
165: } \
166: } \
167: return(0); \
168: }
170: /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
172: Arguments:
173: +action Unpack or Fetch
174: .opname Name of the Op, such as Add, Mult, LAND, etc.
175: .Type Type of the data
176: .BS Block size for vectorization
177: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
178: .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
179: .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
180: .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE
181: .CType Type with or without the const qualifier, i.e., const Type or Type
182: -Cvoid void with or without the const qualifier, i.e., const void or void
183: */
184: #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
185: static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
186: { \
187: Type *u = (Type*)unpacked,*u2,t; \
188: CType *p = (CType*)packed,*p2; \
189: PetscInt i,j,k,l,r,step,bs=link->bs; \
190: const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
191: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
193: if (!idx) { \
194: for (i=0; i<count; i++) \
195: for (j=0; j<M; j++) \
196: for (k=0; k<BS; k++) { \
197: t = u[i*MBS+j*BS+k]; \
198: APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \
199: FILTER(p[i*MBS+j*BS+k] = t); \
200: } \
201: } else if (!opt) { /* No optimizations available */ \
202: for (i=0; i<count; i++) \
203: for (j=0; j<M; j++) \
204: for (k=0; k<BS; k++) { \
205: t = u[idx[i]*MBS+j*BS+k]; \
206: APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \
207: FILTER(p[i*MBS+j*BS+k] = t); \
208: } \
209: } else { \
210: for (r=0; r<opt->n; r++) { \
211: p2 = p + opt->offset[r]*MBS; \
212: if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \
213: idx2 = idx + opt->offset[r]; \
214: for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \
215: for (j=0; j<M; j++) \
216: for (k=0; k<BS; k++) { \
217: t = u[idx2[i]*MBS+j*BS+k]; \
218: APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \
219: FILTER(p2[i*MBS+j*BS+k] = t); \
220: } \
221: } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \
222: for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \
223: u2 = u + idx[opt->copy_start[i]]*MBS; \
224: l = opt->copy_length[i]*MBS; \
225: for (j=0; j<l; j++) { \
226: t = u2[j]; \
227: APPLY (u2[j],t,op,p2[j]); \
228: FILTER(p2[j] = t); \
229: } \
230: p2 += l; \
231: } \
232: } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \
233: u2 = u + idx[opt->offset[r]]*MBS; \
234: step = opt->stride_step[r]; \
235: for (i=0; i<opt->stride_n[r]; i++) \
236: for (j=0; j<M; j++) \
237: for (k=0; k<BS; k++) { \
238: t = u2[i*step*MBS+j*BS+k]; \
239: APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \
240: FILTER(p2[i*MBS+j*BS+k] = t); \
241: } \
242: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \
243: } \
244: } \
245: return(0); \
246: }
248: /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
250: Arguments:
251: +Action Unpack or Fetch
252: .locname Max or Min
253: .type1 Type of the first data in a pair type
254: .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
255: .op > or <
256: .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE
257: .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
258: -Cvoid void with or without the const qualifier, i.e., const void or void
259: */
260: #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
261: static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
262: PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \
263: CType *p = (CType*)packed; \
264: PetscInt i,j; \
265: for (i=0; i<count; i++) { \
266: FILTER(PairType(Type1,Type2) v); \
267: j = idx? idx[i] : i; \
268: FILTER(v = u[j]); \
269: if (p[i].a op u[j].a) { \
270: u[j] = p[i]; \
271: } else if (p[i].a == u[j].a) { \
272: u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \
273: } \
274: FILTER(p[i] = v); \
275: } \
276: return(0); \
277: }
279: /* Pack, Unpack/Fetch ops */
280: #define DEF_Pack(Type,BS,EQ) \
281: DEF_PackFunc(Type,BS,EQ) \
282: DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \
283: DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \
284: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \
285: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
286: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
287: link->h_FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \
288: }
290: /* Add, Mult ops */
291: #define DEF_Add(Type,BS,EQ) \
292: DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \
293: DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \
294: DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \
295: DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \
296: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \
297: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
298: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
299: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
300: link->h_FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \
301: }
303: /* Max, Min ops */
304: #define DEF_Cmp(Type,BS,EQ) \
305: DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \
306: DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \
307: DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \
308: DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \
309: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \
310: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
311: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
312: link->h_FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \
313: link->h_FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \
314: }
316: /* Logical ops.
317: The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
318: the compilation warning "empty macro arguments are undefined in ISO C90"
319: */
320: #define DEF_Log(Type,BS,EQ) \
321: DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \
322: DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \
323: DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \
324: DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \
325: DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \
326: DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \
327: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \
328: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \
329: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
330: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \
331: link->h_FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \
332: link->h_FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \
333: link->h_FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \
334: }
336: /* Bitwise ops */
337: #define DEF_Bit(Type,BS,EQ) \
338: DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \
339: DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \
340: DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \
341: DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \
342: DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \
343: DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \
344: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \
345: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \
346: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
347: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \
348: link->h_FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \
349: link->h_FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \
350: link->h_FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \
351: }
353: /* Maxloc, Minloc */
354: #define DEF_Xloc(Type1,Type2) \
355: DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \
356: DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \
357: DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \
358: DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \
359: static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \
360: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \
361: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \
362: link->h_FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \
363: link->h_FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \
364: }
366: #define DEF_IntegerType(Type,BS,EQ) \
367: DEF_Pack(Type,BS,EQ) \
368: DEF_Add(Type,BS,EQ) \
369: DEF_Cmp(Type,BS,EQ) \
370: DEF_Log(Type,BS,EQ) \
371: DEF_Bit(Type,BS,EQ) \
372: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \
373: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
374: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
375: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
376: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
377: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
378: }
380: #define DEF_RealType(Type,BS,EQ) \
381: DEF_Pack(Type,BS,EQ) \
382: DEF_Add(Type,BS,EQ) \
383: DEF_Cmp(Type,BS,EQ) \
384: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \
385: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
386: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
387: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
388: }
390: #if defined(PETSC_HAVE_COMPLEX)
391: #define DEF_ComplexType(Type,BS,EQ) \
392: DEF_Pack(Type,BS,EQ) \
393: DEF_Add(Type,BS,EQ) \
394: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \
395: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
396: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
397: }
398: #endif
400: #define DEF_DumbType(Type,BS,EQ) \
401: DEF_Pack(Type,BS,EQ) \
402: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \
403: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
404: }
406: /* Maxloc, Minloc */
407: #define DEF_PairType(Type1,Type2) \
408: typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \
409: DEF_Pack(PairType(Type1,Type2),1,1) \
410: DEF_Xloc(Type1,Type2) \
411: static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \
412: CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \
413: CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \
414: }
416: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
417: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
425: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426: DEF_IntegerType(int,1,1)
427: DEF_IntegerType(int,2,1)
428: DEF_IntegerType(int,4,1)
429: DEF_IntegerType(int,8,1)
430: DEF_IntegerType(int,1,0)
431: DEF_IntegerType(int,2,0)
432: DEF_IntegerType(int,4,0)
433: DEF_IntegerType(int,8,0)
434: #endif
436: /* The typedefs are used to get a typename without space that CPPJoin can handle */
437: typedef signed char SignedChar;
438: DEF_IntegerType(SignedChar,1,1)
439: DEF_IntegerType(SignedChar,2,1)
440: DEF_IntegerType(SignedChar,4,1)
441: DEF_IntegerType(SignedChar,8,1)
442: DEF_IntegerType(SignedChar,1,0)
443: DEF_IntegerType(SignedChar,2,0)
444: DEF_IntegerType(SignedChar,4,0)
445: DEF_IntegerType(SignedChar,8,0)
447: typedef unsigned char UnsignedChar;
448: DEF_IntegerType(UnsignedChar,1,1)
449: DEF_IntegerType(UnsignedChar,2,1)
450: DEF_IntegerType(UnsignedChar,4,1)
451: DEF_IntegerType(UnsignedChar,8,1)
452: DEF_IntegerType(UnsignedChar,1,0)
453: DEF_IntegerType(UnsignedChar,2,0)
454: DEF_IntegerType(UnsignedChar,4,0)
455: DEF_IntegerType(UnsignedChar,8,0)
457: DEF_RealType(PetscReal,1,1)
458: DEF_RealType(PetscReal,2,1)
459: DEF_RealType(PetscReal,4,1)
460: DEF_RealType(PetscReal,8,1)
461: DEF_RealType(PetscReal,1,0)
462: DEF_RealType(PetscReal,2,0)
463: DEF_RealType(PetscReal,4,0)
464: DEF_RealType(PetscReal,8,0)
466: #if defined(PETSC_HAVE_COMPLEX)
467: DEF_ComplexType(PetscComplex,1,1)
468: DEF_ComplexType(PetscComplex,2,1)
469: DEF_ComplexType(PetscComplex,4,1)
470: DEF_ComplexType(PetscComplex,8,1)
471: DEF_ComplexType(PetscComplex,1,0)
472: DEF_ComplexType(PetscComplex,2,0)
473: DEF_ComplexType(PetscComplex,4,0)
474: DEF_ComplexType(PetscComplex,8,0)
475: #endif
477: DEF_PairType(int,int)
478: DEF_PairType(PetscInt,PetscInt)
480: /* If we don't know the basic type, we treat it as a stream of chars or ints */
481: DEF_DumbType(char,1,1)
482: DEF_DumbType(char,2,1)
483: DEF_DumbType(char,4,1)
484: DEF_DumbType(char,1,0)
485: DEF_DumbType(char,2,0)
486: DEF_DumbType(char,4,0)
488: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489: DEF_DumbType(DumbInt,1,1)
490: DEF_DumbType(DumbInt,2,1)
491: DEF_DumbType(DumbInt,4,1)
492: DEF_DumbType(DumbInt,8,1)
493: DEF_DumbType(DumbInt,1,0)
494: DEF_DumbType(DumbInt,2,0)
495: DEF_DumbType(DumbInt,4,0)
496: DEF_DumbType(DumbInt,8,0)
498: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
499: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
500: {
501: int ierr;
502: MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
503: MPI_Type_commit(newtype); if (ierr) return ierr;
504: return MPI_SUCCESS;
505: }
506: #endif
508: PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
509: {
510: PetscErrorCode ierr;
511: PetscSFPack link,*p;
512: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
515: /* Look for types in cache */
516: for (p=&bas->inuse; (link=*p); p=&link->next) {
517: PetscBool match;
518: MPIPetsc_Type_compare(unit,link->unit,&match);
519: if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
520: switch (cmode) {
521: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
522: case PETSC_USE_POINTER: break;
523: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
524: }
525: *mylink = link;
526: return(0);
527: }
528: }
529: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
530: return(0);
531: }
533: PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
534: {
535: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
538: (*link)->rkey = NULL;
539: (*link)->lkey = NULL;
540: (*link)->next = bas->avail;
541: bas->avail = *link;
542: *link = NULL;
543: return(0);
544: }
546: /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
547: PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack *avail)
548: {
549: PetscErrorCode ierr;
550: PetscSFPack link=*avail,next;
551: PetscInt i;
554: for (; link; link=next) {
555: next = link->next;
556: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
557: for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
559: }
560: PetscFree(link->reqs);
561: PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);
562: PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);
563: PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);
565: #if defined(PETSC_HAVE_CUDA)
566: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);
567: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);
568: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);
569: if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
570: #endif
571: PetscFree(link);
572: }
573: *avail = NULL;
574: return(0);
575: }
577: /* Error out on unsupported overlapped communications */
578: PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
579: {
580: PetscErrorCode ierr;
581: PetscSFPack link,*p;
582: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
583: PetscBool match;
586: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
587: the potential overlapping since this process does not participate in communication. Overlapping is harmless.
588: */
589: if (rkey || lkey) {
590: for (p=&bas->inuse; (link=*p); p=&link->next) {
591: MPIPetsc_Type_compare(unit,link->unit,&match);
592: if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
593: }
594: }
595: return(0);
596: }
598: PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
599: {
601: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
602: PetscBool is2Int,is2PetscInt;
603: PetscMPIInt ni,na,nd,combiner;
604: #if defined(PETSC_HAVE_COMPLEX)
605: PetscInt nPetscComplex=0;
606: #endif
609: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
610: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
611: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
612: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
613: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
614: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
615: #if defined(PETSC_HAVE_COMPLEX)
616: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
617: #endif
618: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
619: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
620: /* TODO: shaell we also handle Fortran MPI_2REAL? */
621: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
622: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
623: link->bs = 1; /* default */
625: if (is2Int) {
626: PackInit_PairType_int_int(link);
627: link->bs = 1;
628: link->unitbytes = 2*sizeof(int);
629: link->basicunit = MPI_2INT;
630: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
631: PackInit_PairType_PetscInt_PetscInt(link);
632: link->bs = 1;
633: link->unitbytes = 2*sizeof(PetscInt);
634: link->basicunit = MPIU_2INT;
635: } else if (nPetscReal) {
636: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
637: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
638: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
639: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
640: link->bs = nPetscReal;
641: link->unitbytes = nPetscReal*sizeof(PetscReal);
642: link->basicunit = MPIU_REAL;
643: } else if (nPetscInt) {
644: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
645: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
646: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
647: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
648: link->bs = nPetscInt;
649: link->unitbytes = nPetscInt*sizeof(PetscInt);
650: link->basicunit = MPIU_INT;
651: #if defined(PETSC_USE_64BIT_INDICES)
652: } else if (nInt) {
653: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
654: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
655: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
656: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
657: link->bs = nInt;
658: link->unitbytes = nInt*sizeof(int);
659: link->basicunit = MPI_INT;
660: #endif
661: } else if (nSignedChar) {
662: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
663: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
664: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
665: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
666: link->bs = nSignedChar;
667: link->unitbytes = nSignedChar*sizeof(SignedChar);
668: link->basicunit = MPI_SIGNED_CHAR;
669: } else if (nUnsignedChar) {
670: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
671: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
672: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
673: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
674: link->bs = nUnsignedChar;
675: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
676: link->basicunit = MPI_UNSIGNED_CHAR;
677: #if defined(PETSC_HAVE_COMPLEX)
678: } else if (nPetscComplex) {
679: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
680: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
681: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
682: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
683: link->bs = nPetscComplex;
684: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
685: link->basicunit = MPIU_COMPLEX;
686: #endif
687: } else {
688: MPI_Aint lb,nbyte;
689: MPI_Type_get_extent(unit,&lb,&nbyte);
690: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
691: if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
692: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
693: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
694: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
695: link->bs = nbyte;
696: link->unitbytes = nbyte;
697: link->basicunit = MPI_BYTE;
698: } else {
699: nInt = nbyte / sizeof(int);
700: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
701: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
702: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
703: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
704: link->bs = nInt;
705: link->unitbytes = nbyte;
706: link->basicunit = MPI_INT;
707: }
708: }
710: if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
711: else {MPI_Type_dup(unit,&link->unit);}
712: return(0);
713: }
715: PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
716: {
718: *UnpackAndOp = NULL;
719: if (mtype == PETSC_MEMTYPE_HOST) {
720: if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
721: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
722: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
723: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
724: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
725: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
726: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
727: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
728: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
729: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
730: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
731: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
732: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
733: }
734: #if defined(PETSC_HAVE_CUDA)
735: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
736: if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
737: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
738: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
739: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
740: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
741: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
742: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
743: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
744: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
745: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
746: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
747: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
748: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
749: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
750: if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
751: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
752: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
753: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
754: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
755: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
756: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
757: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
758: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
759: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
760: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
761: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
762: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
763: }
764: #endif
765: else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
767: return(0);
768: }
770: PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
771: {
773: *FetchAndOp = NULL;
774: if (mtype == PETSC_MEMTYPE_HOST) {
775: if (op == MPIU_REPLACE) *FetchAndOp = link->h_FetchAndInsert;
776: else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
777: else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
778: else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
779: else if (op == MPI_MAXLOC) *FetchAndOp = link->h_FetchAndMaxloc;
780: else if (op == MPI_MINLOC) *FetchAndOp = link->h_FetchAndMinloc;
781: else if (op == MPI_PROD) *FetchAndOp = link->h_FetchAndMult;
782: else if (op == MPI_LAND) *FetchAndOp = link->h_FetchAndLAND;
783: else if (op == MPI_BAND) *FetchAndOp = link->h_FetchAndBAND;
784: else if (op == MPI_LOR) *FetchAndOp = link->h_FetchAndLOR;
785: else if (op == MPI_BOR) *FetchAndOp = link->h_FetchAndBOR;
786: else if (op == MPI_LXOR) *FetchAndOp = link->h_FetchAndLXOR;
787: else if (op == MPI_BXOR) *FetchAndOp = link->h_FetchAndBXOR;
788: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
789: }
790: #if defined(PETSC_HAVE_CUDA)
791: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
792: if (op == MPIU_REPLACE) *FetchAndOp = link->d_FetchAndInsert;
793: else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
794: else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
795: else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
796: else if (op == MPI_MAXLOC) *FetchAndOp = link->d_FetchAndMaxloc;
797: else if (op == MPI_MINLOC) *FetchAndOp = link->d_FetchAndMinloc;
798: else if (op == MPI_PROD) *FetchAndOp = link->d_FetchAndMult;
799: else if (op == MPI_LAND) *FetchAndOp = link->d_FetchAndLAND;
800: else if (op == MPI_BAND) *FetchAndOp = link->d_FetchAndBAND;
801: else if (op == MPI_LOR) *FetchAndOp = link->d_FetchAndLOR;
802: else if (op == MPI_BOR) *FetchAndOp = link->d_FetchAndBOR;
803: else if (op == MPI_LXOR) *FetchAndOp = link->d_FetchAndLXOR;
804: else if (op == MPI_BXOR) *FetchAndOp = link->d_FetchAndBXOR;
805: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
806: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
807: if (op == MPIU_REPLACE) *FetchAndOp = link->da_FetchAndInsert;
808: else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
809: else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
810: else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
811: else if (op == MPI_MAXLOC) *FetchAndOp = link->da_FetchAndMaxloc;
812: else if (op == MPI_MINLOC) *FetchAndOp = link->da_FetchAndMinloc;
813: else if (op == MPI_PROD) *FetchAndOp = link->da_FetchAndMult;
814: else if (op == MPI_LAND) *FetchAndOp = link->da_FetchAndLAND;
815: else if (op == MPI_BAND) *FetchAndOp = link->da_FetchAndBAND;
816: else if (op == MPI_LOR) *FetchAndOp = link->da_FetchAndLOR;
817: else if (op == MPI_BOR) *FetchAndOp = link->da_FetchAndBOR;
818: else if (op == MPI_LXOR) *FetchAndOp = link->da_FetchAndLXOR;
819: else if (op == MPI_BXOR) *FetchAndOp = link->da_FetchAndBXOR;
820: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
821: }
822: #endif
823: else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
824: return(0);
825: }
827: /*
828: Create pack/unpack optimization plans based on indice patterns available
830: Input Parameters:
831: + n - Number of target ranks
832: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
833: - idx - [*] Array storing indices
835: Output Parameters:
836: + opt - Optimization plans. Maybe NULL if no optimization can be built.
837: */
838: PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
839: {
841: PetscInt i,j,k,n_copies,tot_copies=0,step;
842: PetscBool strided,optimized=PETSC_FALSE;
843: PetscSFPackOpt opt;
846: if (!n) {
847: *out = NULL;
848: return(0);
849: }
851: PetscCalloc1(1,&opt);
852: PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);
853: PetscArraycpy(opt->offset,offset,n+1);
854: if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
856: opt->n = n;
858: /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
859: for (i=0; i<n; i++) { /* for each target processor */
860: /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
861: n_copies = 1;
862: for (j=offset[i]; j<offset[i+1]-1; j++) {
863: if (idx[j]+1 != idx[j+1]) n_copies++;
864: }
865: /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
866: then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
867: */
868: if ((offset[i+1]-offset[i])/n_copies >= 32) {
869: opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
870: optimized = PETSC_TRUE;
871: tot_copies += n_copies;
872: }
873: }
875: /* Setup memcpy plan for each contiguous piece */
876: k = 0; /* k-th copy */
877: PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);
878: for (i=0; i<n; i++) { /* for each target processor */
879: if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
880: n_copies = 1;
881: opt->copy_start[k] = offset[i] - offset[0];
882: for (j=offset[i]; j<offset[i+1]-1; j++) {
883: if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
884: n_copies++;
885: opt->copy_start[k+1] = j-offset[0]+1;
886: opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k];
887: k++;
888: }
889: }
890: /* Set copy length of the last copy for this target */
891: opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
892: k++;
893: }
894: /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
895: opt->copy_offset[i+1] = k;
896: }
898: /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
899: for (i=0; i<n; i++) { /* for each remote */
900: if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
901: strided = PETSC_TRUE;
902: step = idx[offset[i]+1] - idx[offset[i]];
903: for (j=offset[i]; j<offset[i+1]-1; j++) {
904: if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
905: }
906: if (strided) {
907: opt->type[i] = PETSCSF_PACKOPT_STRIDE;
908: opt->stride_step[i] = step;
909: opt->stride_n[i] = offset[i+1] - offset[i];
910: optimized = PETSC_TRUE;
911: }
912: }
913: }
914: /* If no rank gets optimized, free arrays to save memory */
915: if (!optimized) {
916: PetscFree3(opt->type,opt->offset,opt->copy_offset);
917: PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);
918: PetscFree(opt);
919: *out = NULL;
920: } else *out = opt;
921: return(0);
922: }
924: PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out)
925: {
927: PetscSFPackOpt opt = *out;
930: if (opt) {
931: PetscFree3(opt->type,opt->offset,opt->copy_offset);
932: PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);
933: PetscFree(opt);
934: *out = NULL;
935: }
936: return(0);
937: }