Actual source code: sfpackcuda.cu

petsc-3.12.2 2019-11-22
  1:  #include <../src/vec/is/sf/impls/basic/sfpack.h>
  2: #include <cuda_runtime.h>

  4: /*====================================================================================*/
  5: /*  Templated CUDA kernels for pack/unpack. The Op can be regular or atomic           */
  6: /*====================================================================================*/

  8: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI datatype made of 16 PetscReals, then
  9:    <Type> is PetscReal, the primitive type we operate on.
 10:    <bs>   is 16, i.e., <unit> contains 16 primitive types.
 11:    <BS>   is 8, the maximal width over which we try to vectorize operations on <unit>.
 12:    <EQ>   is 0, which is (bs == BS ? 1 : 0).

 14:   If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, making MBS below a compile-time constant.
 15:   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
 16: */
 17: template<class Type,PetscInt BS,PetscInt EQ>
 18: __global__ static void d_Pack(PetscInt count,const PetscInt *idx,PetscInt bs,const void *unpacked,void *packed)
 19: {
 20:   PetscInt        i,tid = blockIdx.x*blockDim.x + threadIdx.x;
 21:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 22:   const Type      *u = (const Type*)unpacked;
 23:   Type            *p = (Type*)packed;
 24:   const PetscInt  M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
 25:   const PetscInt  MBS = M*BS;  /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

 27:   for (; tid<count; tid += grid_size) {
 28:     if (!idx) {for (i=0; i<MBS; i++) p[tid*MBS+i] = u[tid*MBS+i];}
 29:     else      {for (i=0; i<MBS; i++) p[tid*MBS+i] = u[idx[tid]*MBS+i];}
 30:   }
 31: }
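As a minimal sketch of the comment above (the call site, the function name, and the rootdata/buf names are assumptions that mirror the Pack() wrapper defined later in this file): a unit of 16 PetscReals is dispatched with BS=8 and EQ=0, so inside d_Pack M = bs/BS = 2, MBS = 16, and each thread copies 16 contiguous PetscReals.

/* Sketch only: an assumed call site for d_Pack with bs=16, BS=8, EQ=0 (MBS=16 in the kernel). */
static void ExamplePack16Reals(PetscInt count,const PetscInt *idx,const PetscReal *rootdata,PetscReal *buf,cudaStream_t stream)
{
  PetscInt nthreads = 256;                         /* same default block size as the wrappers below */
  PetscInt nblocks  = (count+nthreads-1)/nthreads;
  d_Pack<PetscReal,8,0><<<nblocks,nthreads,0,stream>>>(count,idx,/*bs=*/16,rootdata,buf);
}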

 33: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 34: __global__ static void d_UnpackAndOp(PetscInt count,const PetscInt *idx,PetscInt bs,void *unpacked,const void *packed)
 35: {
 36:   PetscInt        i,tid = blockIdx.x*blockDim.x + threadIdx.x;
 37:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 38:   Type            *u = (Type*)unpacked;
 39:   const Type      *p = (const Type*)packed;
 40:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 41:   Op              op;

 43:   for (; tid<count; tid += grid_size) {
 44:     if (!idx) {for (i=0; i<MBS; i++) op(u[tid*MBS+i],     p[tid*MBS+i]);}
 45:     else      {for (i=0; i<MBS; i++) op(u[idx[tid]*MBS+i],p[tid*MBS+i]);}
 46:   }
 47: }

 49: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 50: __global__ static void d_FetchAndOp(PetscInt count,const PetscInt *idx,PetscInt bs,void *unpacked,void *packed)
 51: {
 52:   PetscInt        i,tid = blockIdx.x*blockDim.x + threadIdx.x;
 53:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 54:   Type            *u = (Type*)unpacked,*p = (Type*)packed;
 55:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 56:   Op              op;

 58:   for (; tid<count; tid += grid_size) {
 59:     if (!idx) {for (i=0; i<MBS; i++) p[tid*MBS+i] = op(u[tid*MBS+i],p[tid*MBS+i]);}
 60:     else      {for (i=0; i<MBS; i++) p[tid*MBS+i] = op(u[idx[tid]*MBS+i],p[tid*MBS+i]);}
 61:   }
 62: }

 64: /*====================================================================================*/
 65: /*                             Regular operations on device                           */
 66: /*====================================================================================*/
 67: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
 68: template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
 69: template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
 70: template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
 71: template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
 72: template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
 73: template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
 74: template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
 75: template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
 76: template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
 77: template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
 78: template<typename Type> struct Minloc {
 79:   __device__ Type operator() (Type& x,Type y) const {
 80:     Type old = x;
 81:     if (y.a < x.a) x = y;
 82:     else if (y.a == x.a) x.b = min(x.b,y.b);
 83:     return old;
 84:   }
 85: };
 86: template<typename Type> struct Maxloc {
 87:   __device__ Type operator() (Type& x,Type y) const {
 88:     Type old = x;
 89:     if (y.a > x.a) x = y;
 90:     else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
 91:     return old;
 92:   }
 93: };
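The tie-breaking branch above follows MPI_MAXLOC/MPI_MINLOC semantics: on equal values the smaller index is kept. A minimal sketch of that behavior, using a hypothetical local pair type (the real pair types PairInt/PairPetscInt are defined near the end of this file):

/* Sketch only: MPI_MAXLOC tie-breaking. ExamplePair and ExampleMaxlocTie are hypothetical. */
struct ExamplePair {int a; int b;};   /* a = value, b = index */
__device__ static void ExampleMaxlocTie()
{
  ExamplePair x = {5,3}, y = {5,1};   /* same value 5, at indices 3 and 1 */
  Maxloc<ExamplePair> op;
  ExamplePair old = op(x,y);          /* old = {5,3}; x becomes {5,1}, i.e. a tie keeps the smaller index */
  (void)old;
}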

 95: /*====================================================================================*/
 96: /*                             Atomic operations on device                            */
 97: /*====================================================================================*/

 99: /*
100:   Atomic Insert (exchange) operations

102:   CUDA C Programming Guide V10.1 Chapter B.12.1.3:

104:   int atomicExch(int* address, int val);
105:   unsigned int atomicExch(unsigned int* address, unsigned int val);
106:   unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
107:   float atomicExch(float* address, float val);

109:   reads the 32-bit or 64-bit word old located at the address address in global or shared
110:   memory and stores val back to memory at the same address. These two operations are
111:   performed in one atomic transaction. The function returns old.

113:   PETSc notes:

115:   It may be useful in PetscSFFetchAndOp with op = MPIU_REPLACE.

117:   VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
118:   atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
119:   be atomic itself.

121:   With bs>1 and a unit > 64 bits, the current element-wise atomic approach cannot guarantee that the whole
122:   insertion is atomic. We hope no user code relies on that.
123: */

125: #if defined(PETSC_USE_REAL_DOUBLE)
126: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((unsigned long long int*)address,__double_as_longlong(val)));}
127: #endif

129: #if defined(PETSC_USE_64BIT_INDICES)
130: __device__ static PetscInt atomicExch(PetscInt* address,PetscInt val) {return (PetscInt)(atomicExch((unsigned long long int*)address,(unsigned long long int)val));}
131: #endif

133: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

135: /*
136:   Atomic add operations

138:   CUDA C Programming Guide V10.1 Chapter B.12.1.1:

140:   int atomicAdd(int* address, int val);
141:   unsigned int atomicAdd(unsigned int* address,unsigned int val);
142:   unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
143:   float atomicAdd(float* address, float val);
144:   double atomicAdd(double* address, double val);
145:   __half2 atomicAdd(__half2 *address, __half2 val);
146:   __half atomicAdd(__half *address, __half val);

148:   reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
149:   and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
150:   function returns old.

152:   The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
153:   The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
154:   The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
155:   higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
156:   the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
157:   The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
158: */

160: #if defined(PETSC_USE_64BIT_INDICES)
161: __device__ static PetscInt atomicAdd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAdd((unsigned long long int*)address,(unsigned long long int)val);}
162: #endif

164: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

166: template<> struct AtomicAdd<double> {
167:   __device__ double operator() (double& x,double y) const {
168: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
169:     return atomicAdd(&x,y);
170: #else
171:     double                 *address = &x, val = y;
172:     unsigned long long int *address_as_ull = (unsigned long long int*)address;
173:     unsigned long long int old = *address_as_ull, assumed;
174:     do {
175:       assumed = old;
176:       old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
177:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
178:     } while (assumed != old);
179:     return __longlong_as_double(old);
180: #endif
181:   }
182: };

184: template<> struct AtomicAdd<float> {
185:   __device__ float operator() (float& x,float y) const {
186: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
187:     return atomicAdd(&x,y);
188: #else
189:     float *address = &x, val = y;
190:     int   *address_as_int = (int*)address;
191:     int   old = *address_as_int, assumed;
192:     do {
193:       assumed = old;
194:       old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
195:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
196:     } while (assumed != old);
197:     return __int_as_float(old);
198: #endif
199:   }
200: };

202: template<> struct AtomicAdd<PetscComplex> {
203:  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
204:   PetscComplex         old;
205:   PetscReal            *zp = (PetscReal*)&old,*xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
206:   AtomicAdd<PetscReal> op;
207:   zp[0] = op(xp[0],yp[0]);
208:   zp[1] = op(xp[1],yp[1]);
209:   return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
210:  }
211: };

213: /*
214:   Atomic Mult operations:

216:   CUDA has no atomicMult at all, so we build our own with atomicCAS
217:  */
218: #if defined(PETSC_USE_REAL_DOUBLE)
219: __device__ static double atomicMult(double* address, double val)
220: {
221:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
222:   unsigned long long int old = *address_as_ull, assumed;
223:   do {
224:     assumed = old;
225:     /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
226:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
227:   } while (assumed != old);
228:   return __longlong_as_double(old);
229: }
230: #elif defined(PETSC_USE_REAL_SINGLE)
231: __device__ static float atomicMult(float* address,float val)
232: {
233:   int *address_as_int = (int*)(address);
234:   int old = *address_as_int, assumed;
235:   do {
236:     assumed  = old;
237:     old      = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
238:   } while (assumed != old);
239:   return __int_as_float(old);
240: }
241: #endif

243: __device__ static int atomicMult(int* address,int val)
244: {
245:   int *address_as_int = (int*)(address);
246:   int old = *address_as_int, assumed;
247:   do {
248:     assumed = old;
249:     old     = atomicCAS(address_as_int, assumed, val*assumed);
250:   } while (assumed != old);
251:   return (int)old;
252: }

254: #if defined(PETSC_USE_64BIT_INDICES)
255: __device__ static PetscInt atomicMult(PetscInt* address,PetscInt val)
256: {
257:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
258:   unsigned long long int old = *address_as_ull, assumed;
259:   do {
260:     assumed = old;
261:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val*(PetscInt)assumed));
262:   } while (assumed != old);
263:   return (PetscInt)old;
264: }
265: #endif

267: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};

269: /*
270:   Atomic Min/Max operations

272:   CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

274:   int atomicMin(int* address, int val);
275:   unsigned int atomicMin(unsigned int* address,unsigned int val);
276:   unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

278:   reads the 32-bit or 64-bit word old located at the address address in global or shared
279:   memory, computes the minimum of old and val, and stores the result back to memory
280:   at the same address. These three operations are performed in one atomic transaction.
281:   The function returns old.
282:   The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

284:   atomicMax() is similar.
285:  */

287: #if defined(PETSC_USE_REAL_DOUBLE)
288: __device__ static double atomicMin(double* address, double val)
289: {
290:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
291:   unsigned long long int old = *address_as_ull, assumed;
292:   do {
293:     assumed = old;
294:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
295:   } while (assumed != old);
296:   return __longlong_as_double(old);
297: }

299: __device__ static double atomicMax(double* address, double val)
300: {
301:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
302:   unsigned long long int old = *address_as_ull, assumed;
303:   do {
304:     assumed  = old;
305:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
306:   } while (assumed != old);
307:   return __longlong_as_double(old);
308: }
309: #elif defined(PETSC_USE_REAL_SINGLE)
310: __device__ static float atomicMin(float* address,float val)
311: {
312:   int *address_as_int = (int*)(address);
313:   int old = *address_as_int, assumed;
314:   do {
315:     assumed = old;
316:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
317:   } while (assumed != old);
318:   return __int_as_float(old);
319: }

321: __device__ static float atomicMax(float* address,float val)
322: {
323:   int *address_as_int = (int*)(address);
324:   int old = *address_as_int, assumed;
325:   do {
326:     assumed = old;
327:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
328:   } while (assumed != old);
329:   return __int_as_float(old);
330: }
331: #endif

333: #if defined(PETSC_USE_64BIT_INDICES)
334: __device__ static PetscInt atomicMin(PetscInt* address,PetscInt val)
335: {
336:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
337:   unsigned long long int old = *address_as_ull, assumed;
338:   do {
339:     assumed = old;
340:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMin(val,(PetscInt)assumed)));
341:   } while (assumed != old);
342:   return (PetscInt)old;
343: }

345: __device__ static PetscInt atomicMax(PetscInt* address,PetscInt val)
346: {
347:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
348:   unsigned long long int old = *address_as_ull, assumed;
349:   do {
350:     assumed = old;
351:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMax(val,(PetscInt)assumed)));
352:   } while (assumed != old);
353:   return (PetscInt)old;
354: }
355: #endif

357: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
358: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

360: /*
361:   Atomic bitwise operations

363:   CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

365:   int atomicAnd(int* address, int val);
366:   unsigned int atomicAnd(unsigned int* address,unsigned int val);
367:   unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

369:   reads the 32-bit or 64-bit word old located at the address address in global or shared
370:   memory, computes (old & val), and stores the result back to memory at the same
371:   address. These three operations are performed in one atomic transaction.
372:   The function returns old.

374:   The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

376:   atomicOr() and atomicXor are similar.
377: */

379: #if defined(PETSC_USE_64BIT_INDICES)
380: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 350)
381: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val)
382: {
383:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
384:   unsigned long long int old = *address_as_ull, assumed;
385:   do {
386:     assumed = old;
387:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val & (PetscInt)assumed));
388:   } while (assumed != old);
389:   return (PetscInt)old;
390: }
391: __device__ static PetscInt atomicOr(PetscInt* address,PetscInt val)
392: {
393:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
394:   unsigned long long int old = *address_as_ull, assumed;
395:   do {
396:     assumed = old;
397:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val | (PetscInt)assumed));
398:   } while (assumed != old);
399:   return (PetscInt)old;
400: }

402: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val)
403: {
404:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
405:   unsigned long long int old = *address_as_ull, assumed;
406:   do {
407:     assumed = old;
408:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val ^ (PetscInt)assumed));
409:   } while (assumed != old);
410:   return (PetscInt)old;
411: }
412: #else
413: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAnd((unsigned long long int*)address,(unsigned long long int)val);}
414: __device__ static PetscInt atomicOr (PetscInt* address,PetscInt val) {return (PetscInt)atomicOr ((unsigned long long int*)address,(unsigned long long int)val);}
415: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val) {return (PetscInt)atomicXor((unsigned long long int*)address,(unsigned long long int)val);}
416: #endif
417: #endif

419: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
420: template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
421: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

423: /*
424:   Atomic logical operations:

426:   CUDA has no atomic logical operations at all. We support them on integer types.
427: */

429: /* A primary template left without a definition makes any instantiation that does not match one of the given
430:    specializations a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
431:  */
432: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;

434: template<typename Type,class Op>
435: struct AtomicLogical<Type,Op,4> {
436:   __device__ Type operator()(Type& x,Type y) const {
437:     int *address_as_int = (int*)(&x);
438:     int old = *address_as_int, assumed;
439:     Op op;
440:     do {
441:       assumed = old;
442:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
443:     } while (assumed != old);
444:     return (Type)old;
445:   }
446: };

448: template<typename Type,class Op>
449: struct AtomicLogical<Type,Op,8> {
450:   __device__ Type operator()(Type& x,Type y) const {
451:     unsigned long long int *address_as_ull = (unsigned long long int*)(&x);
452:     unsigned long long int old = *address_as_ull, assumed;
453:     Op op;
454:     do {
455:       assumed = old;
456:       old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(op((Type)assumed,y)));
457:     } while (assumed != old);
458:     return (Type)old;
459:   }
460: };

462: /* Note land/lor/lxor below differ from LAND etc. above: they take arguments by value and return the result of the op (not the old value) */
463: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
464: template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
465: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

467: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
468: template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
469: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};
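As a minimal sketch of how the size dispatch above resolves (the helper below is hypothetical): AtomicLAND<int> selects AtomicLogical<int,land<int>,4>, i.e. a 32-bit atomicCAS loop, whereas a 2-byte integer type would select the undefined primary template and be rejected at compile time.

/* Sketch only: AtomicLAND<int> goes through the 4-byte atomicCAS specialization above. */
__device__ static int ExampleAtomicLAND(int *flag,int val)
{
  AtomicLAND<int> op;
  return op(*flag,val);   /* atomically sets *flag = (*flag && val) and returns the old *flag */
}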

471: /*====================================================================================*/
472: /*  Wrapper functions on cuda kernels. Function pointers are stored in 'link'         */
473: /*====================================================================================*/
474: template<typename Type,PetscInt BS,PetscInt EQ>
475: static PetscErrorCode Pack(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed)
476: {
477:   cudaError_t err;
478:   PetscInt    nthreads=256;
479:   PetscInt    nblocks=(count+nthreads-1)/nthreads;

482:   if (nblocks > link->MAX_CORESIDENT_THREADS/nthreads) nblocks = link->MAX_CORESIDENT_THREADS/nthreads;
483:   d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(count,idx,link->bs,unpacked,packed);
484:   err = cudaGetLastError();CHKERRCUDA(err);
485:   return(0);
486: }

488: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
489: static PetscErrorCode UnpackAndOp(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,const void *packed)
490: {
491:   cudaError_t err;
492:   PetscInt    nthreads=256;
493:   PetscInt    nblocks=(count+nthreads-1)/nthreads;

496:   if (nblocks > link->MAX_CORESIDENT_THREADS/nthreads) nblocks = link->MAX_CORESIDENT_THREADS/nthreads;
497:   d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(count,idx,link->bs,unpacked,packed);
498:   err = cudaGetLastError();CHKERRCUDA(err);
499:   return(0);
500: }

502: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
503: static PetscErrorCode FetchAndOp(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,void *packed)
504: {
505:   cudaError_t err;
506:   PetscInt    nthreads=256;
507:   PetscInt    nblocks=(count+nthreads-1)/nthreads;

510:   if (nblocks > link->MAX_CORESIDENT_THREADS/nthreads) nblocks = link->MAX_CORESIDENT_THREADS/nthreads;
511:   d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(count,idx,link->bs,unpacked,packed);
512:   err = cudaGetLastError();CHKERRCUDA(err);
513:   return(0);
514: }
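These wrappers are not called directly by name: the PackInit_* routines below store particular instantiations in 'link', and the generic SF code reaches them through those pointers. A minimal sketch of that calling pattern follows (the function and the rootdata/rootbuf names are assumptions; the real call sites live elsewhere in the SF implementation):

/* Sketch only: invoking a stored kernel wrapper through the function pointer in 'link'. */
static PetscErrorCode ExamplePackViaLink(PetscSFPack link,PetscInt count,const PetscInt *idx,const void *rootdata,void *rootbuf)
{
  PetscErrorCode ierr;
  ierr = (*link->d_Pack)(count,idx,link,NULL,rootdata,rootbuf);CHKERRQ(ierr);
  return(0);
}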

516: /*====================================================================================*/
517: /*  Init various types and instantiate pack/unpack function pointers                  */
518: /*====================================================================================*/
519: template<typename Type,PetscInt BS,PetscInt EQ>
520: static void PackInit_RealType(PetscSFPack link)
521: {
522:   link->d_Pack             = Pack<Type,BS,EQ>;
523:   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
524:   link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
525:   link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
526:   link->d_UnpackAndMin     = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
527:   link->d_UnpackAndMax     = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;

529:   link->d_FetchAndInsert   = FetchAndOp <Type,Insert<Type>,BS,EQ>;
530:   link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
531:   link->d_FetchAndMult     = FetchAndOp <Type,Mult<Type>  ,BS,EQ>;
532:   link->d_FetchAndMin      = FetchAndOp <Type,Min<Type>   ,BS,EQ>;
533:   link->d_FetchAndMax      = FetchAndOp <Type,Max<Type>   ,BS,EQ>;

535:   /* Pack() is always data race free */
536:   link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
537:   link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
538:   link->da_UnpackAndMult   = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
539:   link->da_UnpackAndMin    = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
540:   link->da_UnpackAndMax    = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;

542:   link->da_FetchAndInsert  = FetchAndOp <Type,AtomicInsert<Type>,BS,EQ>;
543:   link->da_FetchAndAdd     = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;
544:   link->da_FetchAndMult    = FetchAndOp <Type,AtomicMult<Type>  ,BS,EQ>;
545:   link->da_FetchAndMin     = FetchAndOp <Type,AtomicMin<Type>   ,BS,EQ>;
546:   link->da_FetchAndMax     = FetchAndOp <Type,AtomicMax<Type>   ,BS,EQ>;
547: }

549: /* A templated class so that it can be specialized for 1-byte (char) integers */
550: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
551: struct PackInit_IntegerType_Atomic {
552:   static void Init(PetscSFPack link) {
553:     link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
554:     link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
555:     link->da_UnpackAndMult   = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
556:     link->da_UnpackAndMin    = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
557:     link->da_UnpackAndMax    = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
558:     link->da_UnpackAndLAND   = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
559:     link->da_UnpackAndLOR    = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
560:     link->da_UnpackAndLXOR   = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
561:     link->da_UnpackAndBAND   = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
562:     link->da_UnpackAndBOR    = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
563:     link->da_UnpackAndBXOR   = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;

565:     link->da_FetchAndInsert  = FetchAndOp <Type,AtomicInsert<Type>,BS,EQ>;
566:     link->da_FetchAndAdd     = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;
567:     link->da_FetchAndMult    = FetchAndOp <Type,AtomicMult<Type>  ,BS,EQ>;
568:     link->da_FetchAndMin     = FetchAndOp <Type,AtomicMin<Type>   ,BS,EQ>;
569:     link->da_FetchAndMax     = FetchAndOp <Type,AtomicMax<Type>   ,BS,EQ>;
570:     link->da_FetchAndLAND    = FetchAndOp <Type,AtomicLAND<Type>  ,BS,EQ>;
571:     link->da_FetchAndLOR     = FetchAndOp <Type,AtomicLOR<Type>   ,BS,EQ>;
572:     link->da_FetchAndLXOR    = FetchAndOp <Type,AtomicLXOR<Type>  ,BS,EQ>;
573:     link->da_FetchAndBAND    = FetchAndOp <Type,AtomicBAND<Type>  ,BS,EQ>;
574:     link->da_FetchAndBOR     = FetchAndOp <Type,AtomicBOR<Type>   ,BS,EQ>;
575:     link->da_FetchAndBXOR    = FetchAndOp <Type,AtomicBXOR<Type>  ,BS,EQ>;
576:   }
577: };

579: /* CUDA does not support atomics on chars. It is TBD in PETSc. */
580: template<typename Type,PetscInt BS,PetscInt EQ>
581: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
582:   static void Init(PetscSFPack link) {/* Do nothing, leaving the function pointers NULL */}
583: };

585: template<typename Type,PetscInt BS,PetscInt EQ>
586: static void PackInit_IntegerType(PetscSFPack link)
587: {
588:   link->d_Pack            = Pack<Type,BS,EQ>;
589:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
590:   link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
591:   link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
592:   link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
593:   link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
594:   link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
595:   link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
596:   link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
597:   link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
598:   link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
599:   link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;

601:   link->d_FetchAndInsert  = FetchAndOp <Type,Insert<Type>,BS,EQ>;
602:   link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
603:   link->d_FetchAndMult    = FetchAndOp <Type,Mult<Type>  ,BS,EQ>;
604:   link->d_FetchAndMin     = FetchAndOp <Type,Min<Type>   ,BS,EQ>;
605:   link->d_FetchAndMax     = FetchAndOp <Type,Max<Type>   ,BS,EQ>;
606:   link->d_FetchAndLAND    = FetchAndOp <Type,LAND<Type>  ,BS,EQ>;
607:   link->d_FetchAndLOR     = FetchAndOp <Type,LOR<Type>   ,BS,EQ>;
608:   link->d_FetchAndLXOR    = FetchAndOp <Type,LXOR<Type>  ,BS,EQ>;
609:   link->d_FetchAndBAND    = FetchAndOp <Type,BAND<Type>  ,BS,EQ>;
610:   link->d_FetchAndBOR     = FetchAndOp <Type,BOR<Type>   ,BS,EQ>;
611:   link->d_FetchAndBXOR    = FetchAndOp <Type,BXOR<Type>  ,BS,EQ>;

613:   PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
614: }

616: #if defined(PETSC_HAVE_COMPLEX)
617: template<typename Type,PetscInt BS,PetscInt EQ>
618: static void PackInit_ComplexType(PetscSFPack link)
619: {
620:   link->d_Pack            = Pack<Type,BS,EQ>;

622:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
623:   link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
624:   link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
625:   link->d_FetchAndInsert  = FetchAndOp <Type,Insert<Type>,BS,EQ>;
626:   link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
627:   link->d_FetchAndMult    = FetchAndOp <Type,Mult<Type>  ,BS,EQ>;

629:   link->da_UnpackAndAdd   = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
630:   link->da_UnpackAndMult  = NULL; /* Not implemented yet */
631:   link->da_FetchAndAdd    = NULL; /* Return value of atomicAdd on complex is not atomic */
632: }
633: #endif

635: typedef signed char                      SignedChar;
636: typedef unsigned char                    UnsignedChar;
637: typedef struct {int a;      int b;     } PairInt;
638: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;

640: template<typename Type>
641: static void PackInit_PairType(PetscSFPack link)
642: {
643:   link->d_Pack            = Pack<Type,1,1>;
644:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
645:   link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;
646:   link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
647:   link->d_FetchAndInsert  = FetchAndOp <Type,Insert<Type>,1,1>;
648:   link->d_FetchAndMinloc  = FetchAndOp <Type,Minloc<Type>,1,1>;
649:   link->d_FetchAndMaxloc  = FetchAndOp <Type,Maxloc<Type>,1,1>;

651:   /* Atomics for pair types are not implemented yet */
652: }

654: template<typename Type,PetscInt BS,PetscInt EQ>
655: static void PackInit_DumbType(PetscSFPack link)
656: {
657:   link->d_Pack            = Pack<Type,BS,EQ>;
658:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
659:   link->d_FetchAndInsert  = FetchAndOp <Type,Insert<Type>,BS,EQ>;

661:   /* Atomics for dumb types are not implemented yet */
662: }

664: /*====================================================================================*/
665: /*                Main driver to init MPI datatype on device                          */
666: /*====================================================================================*/

668: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
669: PetscErrorCode PetscSFPackSetUp_Device(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
670: {
672:   cudaError_t    err;
673:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
674:   PetscBool      is2Int,is2PetscInt;
675: #if defined(PETSC_HAVE_COMPLEX)
676:   PetscInt       nPetscComplex=0;
677: #endif

680:   if (link->deviceinited) return(0);
681:   MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
682:   MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
683:   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
684:   MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
685:   MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
686:   MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
687: #if defined(PETSC_HAVE_COMPLEX)
688:   MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
689: #endif
690:   MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
691:   MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);

693:   if (is2Int) {
694:     PackInit_PairType<PairInt>(link);
695:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
696:     PackInit_PairType<PairPetscInt>(link);
697:   } else if (nPetscReal) {
698:     if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
699:     else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
700:     else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
701:     else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
702:   } else if (nPetscInt) {
703:     if      (nPetscInt == 8) PackInit_IntegerType<PetscInt,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<PetscInt,8,0>(link);
704:     else if (nPetscInt == 4) PackInit_IntegerType<PetscInt,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<PetscInt,4,0>(link);
705:     else if (nPetscInt == 2) PackInit_IntegerType<PetscInt,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<PetscInt,2,0>(link);
706:     else if (nPetscInt == 1) PackInit_IntegerType<PetscInt,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<PetscInt,1,0>(link);
707: #if defined(PETSC_USE_64BIT_INDICES)
708:   } else if (nInt) {
709:     if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
710:     else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
711:     else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
712:     else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
713: #endif
714:   } else if (nSignedChar) {
715:     if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
716:     else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
717:     else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
718:     else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
719:   }  else if (nUnsignedChar) {
720:     if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
721:     else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
722:     else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
723:     else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
724: #if defined(PETSC_HAVE_COMPLEX)
725:   } else if (nPetscComplex) {
726:     if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
727:     else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
728:     else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
729:     else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
730: #endif
731:   } else {
732:     MPI_Aint lb,nbyte;
733:     MPI_Type_get_extent(unit,&lb,&nbyte);
734:     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
735:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
736:       if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
737:       else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
738:       else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
739:     } else {
740:       nInt = nbyte / sizeof(int);
741:       if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
742:       else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
743:       else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
744:       else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
745:     }
746:   }

748:   if (!sf_use_default_cuda_stream) {err = cudaStreamCreate(&link->stream);CHKERRCUDA(err);}
749:   if (!sf->MAX_CORESIDENT_THREADS) {
750:     int                   device;
751:     struct cudaDeviceProp props;
752:     err = cudaGetDevice(&device);CHKERRCUDA(err);
753:     err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
754:     sf->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
755:   }
756:   link->MAX_CORESIDENT_THREADS = sf->MAX_CORESIDENT_THREADS;

758:   link->deviceinited = PETSC_TRUE;
759:   return(0);
760: }
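As a usage sketch of the dispatch above (the MPI type construction is an assumption; the selected branch follows from the nPetscReal tests): a contiguous datatype of 16 MPIU_REALs gives nPetscReal = 16, so PetscSFPackSetUp_Device() selects PackInit_RealType<PetscReal,8,0>, which is exactly the bs=16, BS=8, EQ=0 case described in the comment at the top of this file.

/* Sketch only: building a unit that takes the nPetscReal%8==0 branch above. */
static PetscErrorCode ExampleBuildUnitOf16Reals(MPI_Datatype *unit)
{
  MPI_Type_contiguous(16,MPIU_REAL,unit);   /* 16 contiguous PetscReals */
  MPI_Type_commit(unit);
  /* Passing this unit to PetscSFPackSetUp_Device() leads to PackInit_RealType<PetscReal,8,0>(link). */
  return(0);
}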