Actual source code: sfpackcuda.cu
petsc-3.13.2 2020-06-02
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
2: #include <cuda_runtime.h>
4: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
5: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
6: {
7: PetscInt i,j,k,m,n,r;
8: const PetscInt *offset,*start,*dx,*dy,*X,*Y;
10: n = opt[0];
11: offset = opt + 1;
12: start = opt + n + 2;
13: dx = opt + 2*n + 2;
14: dy = opt + 3*n + 2;
15: X = opt + 5*n + 2;
16: Y = opt + 6*n + 2;
17: for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
18: m = (tid - offset[r]);
19: k = m/(dx[r]*dy[r]);
20: j = (m - k*dx[r]*dy[r])/dx[r];
21: i = m - k*dx[r]*dy[r] - j*dx[r];
23: return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
24: }
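
/* Illustrative example (a sketch inferred from the indexing above; the slot at opt+4*n+2 is assumed to hold dz and is not read here):
   for a single subdomain (n=1) of extent dx=2, dy=3, dz=4 starting at index 5 inside an X=10, Y=10 (by Z) root/leaf space,
   the opt array would be laid out as

     opt = { 1,         n
             0, 24,     offset[0..n]   (24 = 2*3*4 threads for this subdomain)
             5,         start[0]
             2, 3, 4,   dx[0], dy[0], dz[0]
             10, 10 };  X[0], Y[0]

   and tid=7 maps to r=0, m=7, k=7/(2*3)=1, j=(7-6)/2=0, i=1, i.e. index 5 + 1*10*10 + 0*10 + 1 = 106. */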
26: /*====================================================================================*/
27: /* Templated CUDA kernels for pack/unpack. The Op can be regular or atomic */
28: /*====================================================================================*/
30: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI datatype made of 16 PetscReals. Then
31:    <Type> is PetscReal, the primitive type we operate on;
32:    <bs>   is 16, i.e., <unit> contains 16 primitive types;
33:    <BS>   is 8, the maximal SIMD width with which we try to vectorize operations on <unit>;
34:    <EQ>   is 0, which is (bs == BS ? 1 : 0).
36:    If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
37:    For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
38: */
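
/* Illustration: for the 16-PetscReal unit above, PetscSFLinkSetUp_Device at the bottom of this file selects
   PackInit_RealType<PetscReal,8,0>, so the kernels are instantiated with <Type=PetscReal,BS=8,EQ=0> and called
   with bs=16; hence M = bs/BS = 2 and MBS = 16 are computed at run time. For the 8-PetscReal unit, <BS=8,EQ=1>
   makes MBS the compile-time constant 8, and for bs=1 the inner loops over MBS vanish after unrolling. */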
39: template<class Type,PetscInt BS,PetscInt EQ>
40: __global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
41: {
42: PetscInt i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
43: const PetscInt grid_size = gridDim.x * blockDim.x;
44: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
45: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
47: for (; tid<count; tid += grid_size) {
48:     /* opt != NULL                ==> idx == NULL, i.e., the indices have a pattern but are not contiguous;
49:        opt == NULL && idx == NULL ==> the indices are contiguous;
50:        opt == NULL && idx != NULL ==> the indices are irregular and are given by idx */
51: t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
52: s = tid*MBS;
53: for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
54: }
55: }
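
/* Illustration: with idx = {4,2,7} and MBS = 2, d_Pack gathers
     buf = { data[8],data[9],  data[4],data[5],  data[14],data[15] },
   i.e. unit tid of the contiguous buffer is filled from unit idx[tid] of the root/leaf data. */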
57: template<class Type,class Op,PetscInt BS,PetscInt EQ>
58: __global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
59: {
60: PetscInt i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
61: const PetscInt grid_size = gridDim.x * blockDim.x;
62: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
63: Op op;
65: for (; tid<count; tid += grid_size) {
66: t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
67: s = tid*MBS;
68: for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
69: }
70: }
72: template<class Type,class Op,PetscInt BS,PetscInt EQ>
73: __global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
74: {
75: PetscInt i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
76: const PetscInt grid_size = gridDim.x * blockDim.x;
77: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
78: Op op;
80: for (; tid<count; tid += grid_size) {
81: r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
82: l = tid*MBS;
83: for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
84: }
85: }
87: template<class Type,class Op,PetscInt BS,PetscInt EQ>
88: __global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
89: {
90: PetscInt i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
91: const PetscInt grid_size = gridDim.x * blockDim.x;
92: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
93: Op op;
95: for (; tid<count; tid += grid_size) {
96: if (!srcIdx) { /* src is either contiguous or 3D */
97: k = tid/(srcx*srcy);
98: j = (tid - k*srcx*srcy)/srcx;
99: i = tid - k*srcx*srcy - j*srcx;
100: s = srcStart + k*srcX*srcY + j*srcX + i;
101: } else {
102: s = srcIdx[tid];
103: }
105: if (!dstIdx) { /* dst is either contiguous or 3D */
106: k = tid/(dstx*dsty);
107: j = (tid - k*dstx*dsty)/dstx;
108: i = tid - k*dstx*dsty - j*dstx;
109: t = dstStart + k*dstX*dstY + j*dstX + i;
110: } else {
111: t = dstIdx[tid];
112: }
114: s *= MBS;
115: t *= MBS;
116: for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
117: }
118: }
120: template<class Type,class Op,PetscInt BS,PetscInt EQ>
121: __global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
122: {
123: PetscInt i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
124: const PetscInt grid_size = gridDim.x * blockDim.x;
125: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
126: Op op;
128: for (; tid<count; tid += grid_size) {
129: r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
130: l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
131: for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
132: }
133: }
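
/* Illustration: with Op = Add, each iteration above performs, per primitive element,
     old           = rootdata[r];
     rootdata[r]  += leafdata[l];
     leafupdate[l] = old;
   i.e. the root accumulates the leaf value while the leaf receives the pre-update root value. */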
135: /*====================================================================================*/
136: /* Regular operations on device */
137: /*====================================================================================*/
138: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = y; return old;}};
139: template<typename Type> struct Add {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y; return old;}};
140: template<typename Type> struct Mult {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y; return old;}};
141: template<typename Type> struct Min {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMin(x,y); return old;}};
142: template<typename Type> struct Max {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMax(x,y); return old;}};
143: template<typename Type> struct LAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x && y; return old;}};
144: template<typename Type> struct LOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x || y; return old;}};
145: template<typename Type> struct LXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = !x != !y; return old;}};
146: template<typename Type> struct BAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x & y; return old;}};
147: template<typename Type> struct BOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x | y; return old;}};
148: template<typename Type> struct BXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x ^ y; return old;}};
149: template<typename Type> struct Minloc {
150: __device__ Type operator() (Type& x,Type y) const {
151: Type old = x;
152: if (y.a < x.a) x = y;
153: else if (y.a == x.a) x.b = min(x.b,y.b);
154: return old;
155: }
156: };
157: template<typename Type> struct Maxloc {
158: __device__ Type operator() (Type& x,Type y) const {
159: Type old = x;
160: if (y.a > x.a) x = y;
161: else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
162: return old;
163: }
164: };
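
/* Note on the convention above: each functor updates x in place and returns the old value of x. d_UnpackAndOp and
   d_ScatterAndOp discard the return value; only d_FetchAndOp and d_FetchAndOpLocal use it, storing the fetched old
   root value into the leaf buffer while the root is updated with the leaf value. */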
166: /*====================================================================================*/
167: /* Atomic operations on device */
168: /*====================================================================================*/
170: /*
171: Atomic Insert (exchange) operations
173: CUDA C Programming Guide V10.1 Chapter B.12.1.3:
175: int atomicExch(int* address, int val);
176: unsigned int atomicExch(unsigned int* address, unsigned int val);
177: unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
178: float atomicExch(float* address, float val);
180: reads the 32-bit or 64-bit word old located at the address address in global or shared
181: memory and stores val back to memory at the same address. These two operations are
182: performed in one atomic transaction. The function returns old.
184:  PETSc notes:
186:  atomicExch() may be useful in PetscSFFetchAndOp with op = MPIU_REPLACE.
188:  VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
189:  atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
190:  be atomic by itself.
192:  With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the
193:  whole insertion is atomic. We hope no user code relies on that.
194: */
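
/* Illustration of the caveat above: for a unit of two doubles inserted element-wise by two concurrent threads, the
   destination may end up with the first double from one writer and the second double from the other; each 64-bit
   store is atomic on its own, but the pair as a whole is not. */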
196: #if defined(PETSC_USE_REAL_DOUBLE)
197: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((unsigned long long int*)address,__double_as_longlong(val)));}
198: #endif
200: #if defined(PETSC_USE_64BIT_INDICES)
201: __device__ static PetscInt atomicExch(PetscInt* address,PetscInt val) {return (PetscInt)(atomicExch((unsigned long long int*)address,(unsigned long long int)val));}
202: #endif
204: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};
206: /*
207: Atomic add operations
209: CUDA C Programming Guide V10.1 Chapter B.12.1.1:
211: int atomicAdd(int* address, int val);
212: unsigned int atomicAdd(unsigned int* address,unsigned int val);
213: unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
214: float atomicAdd(float* address, float val);
215: double atomicAdd(double* address, double val);
216: __half2 atomicAdd(__half2 *address, __half2 val);
217: __half atomicAdd(__half *address, __half val);
219: reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
220: and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
221: function returns old.
223: The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
224: The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
225: The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
226: higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
227: the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
228: The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
229: */
231: #if defined(PETSC_USE_64BIT_INDICES)
232: __device__ static PetscInt atomicAdd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAdd((unsigned long long int*)address,(unsigned long long int)val);}
233: #endif
235: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};
237: template<> struct AtomicAdd<double> {
238: __device__ double operator() (double& x,double y) const {
239: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
240: return atomicAdd(&x,y);
241: #else
242: double *address = &x, val = y;
243: unsigned long long int *address_as_ull = (unsigned long long int*)address;
244: unsigned long long int old = *address_as_ull, assumed;
245: do {
246: assumed = old;
247: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
248:       /* Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN) */
249: } while (assumed != old);
250: return __longlong_as_double(old);
251: #endif
252: }
253: };
255: template<> struct AtomicAdd<float> {
256: __device__ float operator() (float& x,float y) const {
257: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
258: return atomicAdd(&x,y);
259: #else
260: float *address = &x, val = y;
261: int *address_as_int = (int*)address;
262: int old = *address_as_int, assumed;
263: do {
264: assumed = old;
265: old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
266:       /* Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN) */
267: } while (assumed != old);
268: return __int_as_float(old);
269: #endif
270: }
271: };
273: template<> struct AtomicAdd<PetscComplex> {
274: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
275: PetscComplex old, *z = &old;
276: PetscReal *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
277: AtomicAdd<PetscReal> op;
278: z[0] = op(xp[0],yp[0]);
279: z[1] = op(xp[1],yp[1]);
280:     return old; /* The returned value may not be atomic; it can be a mix of the two component ops. Callers should discard it. */
281: }
282: };
284: /*
285: Atomic Mult operations:
287: CUDA has no atomicMult at all, so we build our own with atomicCAS
288: */
289: #if defined(PETSC_USE_REAL_DOUBLE)
290: __device__ static double atomicMult(double* address, double val)
291: {
292: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
293: unsigned long long int old = *address_as_ull, assumed;
294: do {
295: assumed = old;
296: /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
297: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
298: } while (assumed != old);
299: return __longlong_as_double(old);
300: }
301: #elif defined(PETSC_USE_REAL_SINGLE)
302: __device__ static float atomicMult(float* address,float val)
303: {
304: int *address_as_int = (int*)(address);
305: int old = *address_as_int, assumed;
306: do {
307: assumed = old;
308: old = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
309: } while (assumed != old);
310: return __int_as_float(old);
311: }
312: #endif
314: __device__ static int atomicMult(int* address,int val)
315: {
316: int *address_as_int = (int*)(address);
317: int old = *address_as_int, assumed;
318: do {
319: assumed = old;
320: old = atomicCAS(address_as_int, assumed, val*assumed);
321: } while (assumed != old);
322: return (int)old;
323: }
325: #if defined(PETSC_USE_64BIT_INDICES)
326: __device__ static PetscInt atomicMult(PetscInt* address,PetscInt val)
327: {
328: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
329: unsigned long long int old = *address_as_ull, assumed;
330: do {
331: assumed = old;
332: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val*(PetscInt)assumed));
333: } while (assumed != old);
334: return (PetscInt)old;
335: }
336: #endif
338: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};
340: /*
341: Atomic Min/Max operations
343: CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:
345: int atomicMin(int* address, int val);
346: unsigned int atomicMin(unsigned int* address,unsigned int val);
347: unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
349: reads the 32-bit or 64-bit word old located at the address address in global or shared
350: memory, computes the minimum of old and val, and stores the result back to memory
351: at the same address. These three operations are performed in one atomic transaction.
352: The function returns old.
353: The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
355: atomicMax() is similar.
356: */
358: #if defined(PETSC_USE_REAL_DOUBLE)
359: __device__ static double atomicMin(double* address, double val)
360: {
361: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
362: unsigned long long int old = *address_as_ull, assumed;
363: do {
364: assumed = old;
365: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
366: } while (assumed != old);
367: return __longlong_as_double(old);
368: }
370: __device__ static double atomicMax(double* address, double val)
371: {
372: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
373: unsigned long long int old = *address_as_ull, assumed;
374: do {
375: assumed = old;
376: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
377: } while (assumed != old);
378: return __longlong_as_double(old);
379: }
380: #elif defined(PETSC_USE_REAL_SINGLE)
381: __device__ static float atomicMin(float* address,float val)
382: {
383: int *address_as_int = (int*)(address);
384: int old = *address_as_int, assumed;
385: do {
386: assumed = old;
387: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
388: } while (assumed != old);
389: return __int_as_float(old);
390: }
392: __device__ static float atomicMax(float* address,float val)
393: {
394: int *address_as_int = (int*)(address);
395: int old = *address_as_int, assumed;
396: do {
397: assumed = old;
398: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
399: } while (assumed != old);
400: return __int_as_float(old);
401: }
402: #endif
404: /*
405:   atomicMin/Max(long long *, long long) are not in NVIDIA's documentation, but on OLCF Summit we found
406:   atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
407:   Defining our own versions then causes compilation errors with PGI compilers and 64-bit indices:
408:       error: function "atomicMin(long long *, long long)" has already been defined
410:   So we add the extra conditions defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320).
411: */
412: #if defined(PETSC_USE_64BIT_INDICES) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
413: __device__ static PetscInt atomicMin(PetscInt* address,PetscInt val)
414: {
415: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
416: unsigned long long int old = *address_as_ull, assumed;
417: do {
418: assumed = old;
419: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMin(val,(PetscInt)assumed)));
420: } while (assumed != old);
421: return (PetscInt)old;
422: }
424: __device__ static PetscInt atomicMax(PetscInt* address,PetscInt val)
425: {
426: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
427: unsigned long long int old = *address_as_ull, assumed;
428: do {
429: assumed = old;
430: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMax(val,(PetscInt)assumed)));
431: } while (assumed != old);
432: return (PetscInt)old;
433: }
434: #endif
436: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
437: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};
439: /*
440: Atomic bitwise operations
442: CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:
444: int atomicAnd(int* address, int val);
445: unsigned int atomicAnd(unsigned int* address,unsigned int val);
446: unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
448: reads the 32-bit or 64-bit word old located at the address address in global or shared
449: memory, computes (old & val), and stores the result back to memory at the same
450: address. These three operations are performed in one atomic transaction.
451: The function returns old.
453: The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
455:  atomicOr() and atomicXor() are similar.
456: */
458: #if defined(PETSC_USE_64BIT_INDICES)
459: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin(PetscInt* address,PetscInt val) */
460: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val)
461: {
462: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
463: unsigned long long int old = *address_as_ull, assumed;
464: do {
465: assumed = old;
466: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val & (PetscInt)assumed));
467: } while (assumed != old);
468: return (PetscInt)old;
469: }
470: __device__ static PetscInt atomicOr(PetscInt* address,PetscInt val)
471: {
472: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
473: unsigned long long int old = *address_as_ull, assumed;
474: do {
475: assumed = old;
476: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val | (PetscInt)assumed));
477: } while (assumed != old);
478: return (PetscInt)old;
479: }
481: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val)
482: {
483: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
484: unsigned long long int old = *address_as_ull, assumed;
485: do {
486: assumed = old;
487: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val ^ (PetscInt)assumed));
488: } while (assumed != old);
489: return (PetscInt)old;
490: }
491: #else
492: /*
493: See also comments at atomicMin(PetscInt* address,PetscInt val)
494: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAnd((unsigned long long int*)address,(unsigned long long int)val);}
495: __device__ static PetscInt atomicOr (PetscInt* address,PetscInt val) {return (PetscInt)atomicOr ((unsigned long long int*)address,(unsigned long long int)val);}
496: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val) {return (PetscInt)atomicXor((unsigned long long int*)address,(unsigned long long int)val);}
497: */
498: #endif
499: #endif
501: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
502: template<typename Type> struct AtomicBOR {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
503: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};
505: /*
506: Atomic logical operations:
508: CUDA has no atomic logical operations at all. We support them on integer types.
509: */
511: /* A template without a definition makes any instantiation that does not match one of the given specializations
512:    a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
513: */
514: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
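
/* Illustration: AtomicLogical<int,land<int>,4> and AtomicLogical<long long,lor<long long>,8> match the
   specializations below, whereas a hypothetical AtomicLogical<short,land<short>,2> has no definition and
   therefore fails at compile time, as intended. */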
516: template<typename Type,class Op>
517: struct AtomicLogical<Type,Op,4> {
518: __device__ Type operator()(Type& x,Type y) const {
519: int *address_as_int = (int*)(&x);
520: int old = *address_as_int, assumed;
521: Op op;
522: do {
523: assumed = old;
524: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
525: } while (assumed != old);
526: return (Type)old;
527: }
528: };
530: template<typename Type,class Op>
531: struct AtomicLogical<Type,Op,8> {
532: __device__ Type operator()(Type& x,Type y) const {
533: unsigned long long int *address_as_ull = (unsigned long long int*)(&x);
534: unsigned long long int old = *address_as_ull, assumed;
535: Op op;
536: do {
537: assumed = old;
538: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(op((Type)assumed,y)));
539: } while (assumed != old);
540: return (Type)old;
541: }
542: };
544: /* Note that land/lor/lxor below differ from LAND etc. above: here we pass arguments by value and return the result of the op (not the old value) */
545: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
546: template<typename Type> struct lor {__device__ Type operator()(Type x, Type y) {return x || y;}};
547: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};
549: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
550: template<typename Type> struct AtomicLOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
551: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};
553: /*====================================================================================*/
554: /* Wrapper functions of cuda kernels. Function pointers are stored in 'link' */
555: /*====================================================================================*/
556: template<typename Type,PetscInt BS,PetscInt EQ>
557: static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
558: {
559: cudaError_t cerr;
560: PetscInt nthreads=256;
561: PetscInt nblocks=(count+nthreads-1)/nthreads;
562: const PetscInt *iarray=opt ? opt->array : NULL;
565: if (!count) return(0);
566: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
567: d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
568: cerr = cudaGetLastError();CHKERRCUDA(cerr);
569: return(0);
570: }
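
/* Illustration of the launch configuration (numbers are hypothetical): with count = 1,000,000, nthreads = 256 and
   link->maxResidentThreadsPerGPU = 163840 (e.g. 80 SMs x 2048 threads each), nblocks is capped at 163840/256 = 640,
   so each resident thread handles roughly 6-7 units through the grid-stride loop in d_Pack. The wrappers below use
   the same sizing. */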
572: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
573: static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
574: {
575: cudaError_t cerr;
576: PetscInt nthreads=256;
577: PetscInt nblocks=(count+nthreads-1)/nthreads;
578: const PetscInt *iarray=opt ? opt->array : NULL;
581: if (!count) return(0);
582: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
583: d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
584: cerr = cudaGetLastError();CHKERRCUDA(cerr);
585: return(0);
586: }
588: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
589: static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
590: {
591: cudaError_t cerr;
592: PetscInt nthreads=256;
593: PetscInt nblocks=(count+nthreads-1)/nthreads;
594: const PetscInt *iarray=opt ? opt->array : NULL;
597: if (!count) return(0);
598: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
599: d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
600: cerr = cudaGetLastError();CHKERRCUDA(cerr);
601: return(0);
602: }
604: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
605: static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
606: {
607: cudaError_t cerr;
608: PetscInt nthreads=256;
609: PetscInt nblocks=(count+nthreads-1)/nthreads;
610: PetscInt srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;
613: if (!count) return(0);
614: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
616:   /* The 3D shape of the source subdomain may be different from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
617: if (srcOpt) {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
618: else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}
620: if (dstOpt) {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
621: else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}
623: d_ScatterAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
624: cerr = cudaGetLastError();CHKERRCUDA(cerr);
625: return(0);
626: }
628: /* Specialization for Insert since we may use cudaMemcpyAsync */
629: template<typename Type,PetscInt BS,PetscInt EQ>
630: static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
631: {
632: PetscErrorCode ierr;
633: cudaError_t cerr;
636: if (!count) return(0);
637:   /* If src and dst are both contiguous (and distinct), we can use a single device-to-device memcpy */
638: if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
639: cerr = cudaMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
640: } else {
641: ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);
642: }
643: return(0);
644: }
646: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
647: static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
648: {
649: cudaError_t cerr;
650: PetscInt nthreads=256;
651: PetscInt nblocks=(count+nthreads-1)/nthreads;
652: const PetscInt *rarray = rootopt ? rootopt->array : NULL;
653: const PetscInt *larray = leafopt ? leafopt->array : NULL;
656: if (!count) return(0);
657: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
658: d_FetchAndOpLocal<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
659: cerr = cudaGetLastError();CHKERRCUDA(cerr);
660: return(0);
661: }
663: /*====================================================================================*/
664: /* Init various types and instantiate pack/unpack function pointers */
665: /*====================================================================================*/
666: template<typename Type,PetscInt BS,PetscInt EQ>
667: static void PackInit_RealType(PetscSFLink link)
668: {
669: /* Pack/unpack for remote communication */
670: link->d_Pack = Pack<Type,BS,EQ>;
671: link->d_UnpackAndInsert = UnpackAndOp <Type,Insert<Type> ,BS,EQ>;
672: link->d_UnpackAndAdd = UnpackAndOp <Type,Add<Type> ,BS,EQ>;
673: link->d_UnpackAndMult = UnpackAndOp <Type,Mult<Type> ,BS,EQ>;
674: link->d_UnpackAndMin = UnpackAndOp <Type,Min<Type> ,BS,EQ>;
675: link->d_UnpackAndMax = UnpackAndOp <Type,Max<Type> ,BS,EQ>;
676: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
678: /* Scatter for local communication */
679: link->d_ScatterAndInsert = ScatterAndInsert<Type ,BS,EQ>; /* Has special optimizations */
680: link->d_ScatterAndAdd = ScatterAndOp <Type,Add<Type> ,BS,EQ>;
681: link->d_ScatterAndMult = ScatterAndOp <Type,Mult<Type> ,BS,EQ>;
682: link->d_ScatterAndMin = ScatterAndOp <Type,Min<Type> ,BS,EQ>;
683: link->d_ScatterAndMax = ScatterAndOp <Type,Max<Type> ,BS,EQ>;
684: link->d_FetchAndAddLocal = FetchAndOpLocal <Type,Add <Type> ,BS,EQ>;
686: /* Atomic versions when there are data-race possibilities */
687: link->da_UnpackAndInsert = UnpackAndOp <Type,AtomicInsert<Type>,BS,EQ>;
688: link->da_UnpackAndAdd = UnpackAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
689: link->da_UnpackAndMult = UnpackAndOp <Type,AtomicMult<Type> ,BS,EQ>;
690: link->da_UnpackAndMin = UnpackAndOp <Type,AtomicMin<Type> ,BS,EQ>;
691: link->da_UnpackAndMax = UnpackAndOp <Type,AtomicMax<Type> ,BS,EQ>;
692: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
694: link->da_ScatterAndInsert = ScatterAndOp <Type,AtomicInsert<Type>,BS,EQ>;
695: link->da_ScatterAndAdd = ScatterAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
696: link->da_ScatterAndMult = ScatterAndOp <Type,AtomicMult<Type> ,BS,EQ>;
697: link->da_ScatterAndMin = ScatterAndOp <Type,AtomicMin<Type> ,BS,EQ>;
698: link->da_ScatterAndMax = ScatterAndOp <Type,AtomicMax<Type> ,BS,EQ>;
699: link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type> ,BS,EQ>;
700: }
702: /* We use a templated class here so that it can be specialized for char-sized integers */
703: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
704: struct PackInit_IntegerType_Atomic {
705: static void Init(PetscSFLink link) {
706: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
707: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
708: link->da_UnpackAndMult = UnpackAndOp<Type,AtomicMult<Type> ,BS,EQ>;
709: link->da_UnpackAndMin = UnpackAndOp<Type,AtomicMin<Type> ,BS,EQ>;
710: link->da_UnpackAndMax = UnpackAndOp<Type,AtomicMax<Type> ,BS,EQ>;
711: link->da_UnpackAndLAND = UnpackAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
712: link->da_UnpackAndLOR = UnpackAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
713: link->da_UnpackAndLXOR = UnpackAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
714: link->da_UnpackAndBAND = UnpackAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
715: link->da_UnpackAndBOR = UnpackAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
716: link->da_UnpackAndBXOR = UnpackAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
717: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
719: link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
720: link->da_ScatterAndAdd = ScatterAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
721: link->da_ScatterAndMult = ScatterAndOp<Type,AtomicMult<Type> ,BS,EQ>;
722: link->da_ScatterAndMin = ScatterAndOp<Type,AtomicMin<Type> ,BS,EQ>;
723: link->da_ScatterAndMax = ScatterAndOp<Type,AtomicMax<Type> ,BS,EQ>;
724: link->da_ScatterAndLAND = ScatterAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
725: link->da_ScatterAndLOR = ScatterAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
726: link->da_ScatterAndLXOR = ScatterAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
727: link->da_ScatterAndBAND = ScatterAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
728: link->da_ScatterAndBOR = ScatterAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
729: link->da_ScatterAndBXOR = ScatterAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
730: link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
731: }
732: };
734: /* CUDA does not support atomics on chars; how to handle them in PETSc is TBD. */
735: template<typename Type,PetscInt BS,PetscInt EQ>
736: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
737:   static void Init(PetscSFLink link) {/* Do nothing, leaving the atomic function pointers NULL */}
738: };
740: template<typename Type,PetscInt BS,PetscInt EQ>
741: static void PackInit_IntegerType(PetscSFLink link)
742: {
743: link->d_Pack = Pack<Type,BS,EQ>;
744: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
745: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
746: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
747: link->d_UnpackAndMin = UnpackAndOp<Type,Min<Type> ,BS,EQ>;
748: link->d_UnpackAndMax = UnpackAndOp<Type,Max<Type> ,BS,EQ>;
749: link->d_UnpackAndLAND = UnpackAndOp<Type,LAND<Type> ,BS,EQ>;
750: link->d_UnpackAndLOR = UnpackAndOp<Type,LOR<Type> ,BS,EQ>;
751: link->d_UnpackAndLXOR = UnpackAndOp<Type,LXOR<Type> ,BS,EQ>;
752: link->d_UnpackAndBAND = UnpackAndOp<Type,BAND<Type> ,BS,EQ>;
753: link->d_UnpackAndBOR = UnpackAndOp<Type,BOR<Type> ,BS,EQ>;
754: link->d_UnpackAndBXOR = UnpackAndOp<Type,BXOR<Type> ,BS,EQ>;
755: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
757: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
758: link->d_ScatterAndAdd = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
759: link->d_ScatterAndMult = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
760: link->d_ScatterAndMin = ScatterAndOp<Type,Min<Type> ,BS,EQ>;
761: link->d_ScatterAndMax = ScatterAndOp<Type,Max<Type> ,BS,EQ>;
762: link->d_ScatterAndLAND = ScatterAndOp<Type,LAND<Type> ,BS,EQ>;
763: link->d_ScatterAndLOR = ScatterAndOp<Type,LOR<Type> ,BS,EQ>;
764: link->d_ScatterAndLXOR = ScatterAndOp<Type,LXOR<Type> ,BS,EQ>;
765: link->d_ScatterAndBAND = ScatterAndOp<Type,BAND<Type> ,BS,EQ>;
766: link->d_ScatterAndBOR = ScatterAndOp<Type,BOR<Type> ,BS,EQ>;
767: link->d_ScatterAndBXOR = ScatterAndOp<Type,BXOR<Type> ,BS,EQ>;
768: link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
769: PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
770: }
772: #if defined(PETSC_HAVE_COMPLEX)
773: template<typename Type,PetscInt BS,PetscInt EQ>
774: static void PackInit_ComplexType(PetscSFLink link)
775: {
776: link->d_Pack = Pack<Type,BS,EQ>;
777: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
778: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
779: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
780: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
782: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
783: link->d_ScatterAndAdd = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
784: link->d_ScatterAndMult = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
785: link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
787: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
788: link->da_UnpackAndMult = NULL; /* Not implemented yet */
789: link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
790: link->da_ScatterAndAdd = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
791: }
792: #endif
794: typedef signed char SignedChar;
795: typedef unsigned char UnsignedChar;
796: typedef struct {int a; int b; } PairInt;
797: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;
799: template<typename Type>
800: static void PackInit_PairType(PetscSFLink link)
801: {
802: link->d_Pack = Pack<Type,1,1>;
803: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
804: link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
805: link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;
807: link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
808: link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
809: link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
810: /* Atomics for pair types are not implemented yet */
811: }
813: template<typename Type,PetscInt BS,PetscInt EQ>
814: static void PackInit_DumbType(PetscSFLink link)
815: {
816: link->d_Pack = Pack<Type,BS,EQ>;
817: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
818: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
819: /* Atomics for dumb types are not implemented yet */
820: }
822: /*====================================================================================*/
823: /* Main driver to init MPI datatype on device */
824: /*====================================================================================*/
826: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
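
/* Illustration of the dispatch below: a unit of 16 contiguous MPIU_REALs yields nPetscReal = 16 and selects
   PackInit_RealType<PetscReal,8,0>; a single MPIU_REAL selects PackInit_RealType<PetscReal,1,1>; MPIU_2INT
   selects PackInit_PairType<PairPetscInt>. */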
827: PetscErrorCode PetscSFLinkSetUp_Device(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
828: {
830: cudaError_t err;
831: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
832: PetscBool is2Int,is2PetscInt;
833: #if defined(PETSC_HAVE_COMPLEX)
834: PetscInt nPetscComplex=0;
835: #endif
838: if (link->deviceinited) return(0);
839: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
840: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
841:   /* MPI_CHAR is treated below as a dumb type, since it does not support reduction according to the MPI standard */
842: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
843: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
844: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
845: #if defined(PETSC_HAVE_COMPLEX)
846: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
847: #endif
848: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
849: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
851: if (is2Int) {
852: PackInit_PairType<PairInt>(link);
853: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
854: PackInit_PairType<PairPetscInt>(link);
855: } else if (nPetscReal) {
856: if (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
857: else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
858: else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
859: else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
860: } else if (nPetscInt) {
861: if (nPetscInt == 8) PackInit_IntegerType<PetscInt,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<PetscInt,8,0>(link);
862: else if (nPetscInt == 4) PackInit_IntegerType<PetscInt,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<PetscInt,4,0>(link);
863: else if (nPetscInt == 2) PackInit_IntegerType<PetscInt,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<PetscInt,2,0>(link);
864: else if (nPetscInt == 1) PackInit_IntegerType<PetscInt,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<PetscInt,1,0>(link);
865: #if defined(PETSC_USE_64BIT_INDICES)
866: } else if (nInt) {
867: if (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
868: else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
869: else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
870: else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
871: #endif
872: } else if (nSignedChar) {
873: if (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
874: else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
875: else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
876: else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
877: } else if (nUnsignedChar) {
878: if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
879: else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
880: else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
881: else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
882: #if defined(PETSC_HAVE_COMPLEX)
883: } else if (nPetscComplex) {
884: if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
885: else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
886: else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
887: else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
888: #endif
889: } else {
890: MPI_Aint lb,nbyte;
891: MPI_Type_get_extent(unit,&lb,&nbyte);
892: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
893:   if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
894: if (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
895: else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
896: else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
897: } else {
898: nInt = nbyte / sizeof(int);
899: if (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
900: else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
901: else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
902: else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
903: }
904: }
906: if (!sf->use_default_stream) {err = cudaStreamCreate(&link->stream);CHKERRCUDA(err);}
907: if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
908: int device;
909: struct cudaDeviceProp props;
910: err = cudaGetDevice(&device);CHKERRCUDA(err);
911: err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
912: sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
913: }
914: link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
915: link->deviceinited = PETSC_TRUE;
916: return(0);
917: }
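
/* Illustrative sketch of how the function pointers installed above might be used by the SF core. The wrapper
   signatures are the ones defined in this file, but the variable names (rootbuf, leafbuf, rootcount, ...) are
   hypothetical and do not refer to actual PETSc fields:

     ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);
     ierr = (*link->d_Pack)(link,rootcount,rootstart,rootopt,rootidx,rootdata,rootbuf);CHKERRQ(ierr);
     ... MPI sends rootbuf and receives into leafbuf, asynchronously with respect to link->stream ...
     ierr = (*link->d_UnpackAndAdd)(link,leafcount,leafstart,leafopt,leafidx,leafdata,leafbuf);CHKERRQ(ierr);
*/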