Actual source code: veccuda.c

petsc-3.9.1 2018-04-29
Report Typos and Errors
  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12:  #include <petsccuda.h>
 13: #include <petsc/private/vecimpl.h>          /*I <petscvec.h> I*/
 14:  #include <../src/vec/vec/impls/dvecimpl.h>
 15:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 17: static PetscErrorCode PetscCUBLASDestroyHandle();

 19: /*
 20:    Implementation for obtaining read-write access to the cuBLAS handle.
 21:    Required to properly deal with repeated calls of PetscInitizalize()/PetscFinalize().
 22:  */
 23: static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
 24: {
 25:   static cublasHandle_t cublasv2handle = NULL;
 26:   cublasStatus_t        cberr;
 27:   PetscErrorCode        ierr;

 30:   if (!cublasv2handle) {
 31:     cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
 32:     /* Make sure that the handle will be destroyed properly */
 33:     PetscRegisterFinalize(PetscCUBLASDestroyHandle);
 34:   }
 35:   *handle = &cublasv2handle;
 36:   return(0);
 37: }

 39: /*
 40:    Singleton for obtaining a handle to cuBLAS.
 41:    The handle is required for calls to routines in cuBLAS.
 42:  */
 43: PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
 44: {
 45:   cublasHandle_t *p_cublasv2handle;

 49:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 50:   *handle = *p_cublasv2handle;
 51:   return(0);
 52: }


 55: /*
 56:    Destroys the CUBLAS handle.
 57:    This function is intended and registered for PetscFinalize - do not call manually!
 58:  */
 59: PetscErrorCode PetscCUBLASDestroyHandle()
 60: {
 61:   cublasHandle_t *p_cublasv2handle;
 62:   cublasStatus_t cberr;

 66:   PetscCUBLASGetHandle_Private(&p_cublasv2handle);
 67:   cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
 68:   *p_cublasv2handle = NULL;  /* Ensures proper reinitialization */
 69:   return(0);
 70: }

 72: /*
 73:     Allocates space for the vector array on the Host if it does not exist.
 74:     Does NOT change the PetscCUDAFlag for the vector
 75:     Does NOT zero the CUDA array
 76:  */
 77: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 78: {
 80:   PetscScalar    *array;
 81:   Vec_Seq        *s = (Vec_Seq*)v->data;
 82:   PetscInt       n = v->map->n;

 85:   if (!s) {
 86:     PetscNewLog((PetscObject)v,&s);
 87:     v->data = s;
 88:   }
 89:   if (!s->array) {
 90:     PetscMalloc1(n,&array);
 91:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 92:     s->array           = array;
 93:     s->array_allocated = array;
 94:     if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
 95:       v->valid_GPU_array = PETSC_OFFLOAD_CPU;
 96:     }
 97:   }
 98:   return(0);
 99: }

101: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
102: {
103:   PetscScalar       *ya;
104:   const PetscScalar *xa;
105:   PetscErrorCode    ierr;

108:   VecCUDAAllocateCheckHost(xin);
109:   VecCUDAAllocateCheckHost(yin);
110:   if (xin != yin) {
111:     VecGetArrayRead(xin,&xa);
112:     VecGetArray(yin,&ya);
113:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
114:     VecRestoreArrayRead(xin,&xa);
115:     VecRestoreArray(yin,&ya);
116:   }
117:   return(0);
118: }

120: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
121: {
123:   PetscInt       n = xin->map->n,i;
124:   PetscScalar    *xx;

127:   VecGetArray(xin,&xx);
128:   for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
129:   VecRestoreArray(xin,&xx);
130:   return(0);
131: }

133: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
134: {
135:   Vec_Seq        *vs = (Vec_Seq*)v->data;

139:   PetscObjectSAWsViewOff(v);
140: #if defined(PETSC_USE_LOG)
141:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
142: #endif
143:   if (vs) {
144:     if (vs->array_allocated) { PetscFree(vs->array_allocated); }
145:     PetscFree(vs);
146:   }
147:   return(0);
148: }

150: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
151: {
152:   Vec_Seq *v = (Vec_Seq*)vin->data;

155:   v->array         = v->unplacedarray;
156:   v->unplacedarray = 0;
157:   return(0);
158: }

160: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
161: {

165:   VecCUDAAllocateCheck(v);
166:   return(0);
167: }

169: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
170: {

174:   VecCUDACopyToGPU(v);
175:   return(0);
176: }

178: /*
179:     VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

181:    Input Parameters:
182: .    v - the vector
183: .    indices - the requested indices, this should be created with CUDAIndicesCreate()

185: */
186: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci)
187: {

191:   VecCUDACopyToGPUSome(v,ci);
192:   return(0);
193: }

195: /*
196:   VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

198:   Input Parameters:
199:  +    v - the vector
200:  -    indices - the requested indices, this should be created with CUDAIndicesCreate()
201: */
202: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci)
203: {

207:   VecCUDACopyFromGPUSome(v,ci);
208:   return(0);
209: }

211: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
212: {

216:   VecSetRandom_SeqCUDA_Private(xin,r);
217:   xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
218:   return(0);
219: }

221: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
222: {

226:   VecCUDACopyFromGPU(vin);
227:   VecResetArray_SeqCUDA_Private(vin);
228:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
229:   return(0);
230: }

232: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
233: {

237:   VecCUDACopyFromGPU(vin);
238:   VecPlaceArray_Seq(vin,a);
239:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
240:   return(0);
241: }

243: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
244: {

248:   VecCUDACopyFromGPU(vin);
249:   VecReplaceArray_Seq(vin,a);
250:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
251:   return(0);
252: }

254: /*@
255:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

257:  Collective on MPI_Comm

259:  Input Parameters:
260:  +  comm - the communicator, should be PETSC_COMM_SELF
261:  -  n - the vector length

263:  Output Parameter:
264:  .  v - the vector

266:  Notes:
267:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
268:  same type as an existing vector.

270:  Level: intermediate

272:  Concepts: vectors^creating sequential

274:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
275:  @*/
276: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
277: {

281:   VecCreate(comm,v);
282:   VecSetSizes(*v,n,n);
283:   VecSetType(*v,VECSEQCUDA);
284:   return(0);
285: }

287: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
288: {

292:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
293:   PetscLayoutReference(win->map,&(*V)->map);
294:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
295:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
296:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
297:   return(0);
298: }

300: PetscErrorCode VecCreate_SeqCUDA(Vec V)
301: {

305:   PetscLayoutSetUp(V->map);
306:   VecCUDAAllocateCheck(V);
307:   V->valid_GPU_array = PETSC_OFFLOAD_GPU;
308:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
309:   VecSet(V,0.0);
310:   return(0);
311: }

313: /*@C
314:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
315:    where the user provides the array space to store the vector values. The array
316:    provided must be a GPU array.

318:    Collective on MPI_Comm

320:    Input Parameters:
321: +  comm - the communicator, should be PETSC_COMM_SELF
322: .  bs - the block size
323: .  n - the vector length
324: -  array - GPU memory where the vector elements are to be stored.

326:    Output Parameter:
327: .  V - the vector

329:    Notes:
330:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
331:    same type as an existing vector.

333:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
334:    at a later stage to SET the array for storing the vector values.

336:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
337:    The user should not free the array until the vector is destroyed.

339:    Level: intermediate

341:    Concepts: vectors^creating with array

343: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
344:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
345:           VecCreateMPIWithArray()
346: @*/
347: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
348: {
350:   PetscMPIInt    size;

353:   VecCreate(comm,V);
354:   VecSetSizes(*V,n,n);
355:   VecSetBlockSize(*V,bs);
356:   MPI_Comm_size(comm,&size);
357:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQ on more than one process");
358:   VecCreate_SeqCUDA_Private(*V,array);
359:   return(0);
360: }

362: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
363: {
365:   cudaError_t    err;
366:   Vec_CUDA       *veccuda;
367:   PetscMPIInt    size;

370:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
371:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
372:   VecCreate_Seq_Private(V,0);
373:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);

375:   V->ops->dot                    = VecDot_SeqCUDA;
376:   V->ops->norm                   = VecNorm_SeqCUDA;
377:   V->ops->tdot                   = VecTDot_SeqCUDA;
378:   V->ops->scale                  = VecScale_SeqCUDA;
379:   V->ops->copy                   = VecCopy_SeqCUDA;
380:   V->ops->set                    = VecSet_SeqCUDA;
381:   V->ops->swap                   = VecSwap_SeqCUDA;
382:   V->ops->axpy                   = VecAXPY_SeqCUDA;
383:   V->ops->axpby                  = VecAXPBY_SeqCUDA;
384:   V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
385:   V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
386:   V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
387:   V->ops->setrandom              = VecSetRandom_SeqCUDA;
388:   V->ops->dot_local              = VecDot_SeqCUDA;
389:   V->ops->tdot_local             = VecTDot_SeqCUDA;
390:   V->ops->norm_local             = VecNorm_SeqCUDA;
391:   V->ops->mdot_local             = VecMDot_SeqCUDA;
392:   V->ops->maxpy                  = VecMAXPY_SeqCUDA;
393:   V->ops->mdot                   = VecMDot_SeqCUDA;
394:   V->ops->aypx                   = VecAYPX_SeqCUDA;
395:   V->ops->waxpy                  = VecWAXPY_SeqCUDA;
396:   V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
397:   V->ops->placearray             = VecPlaceArray_SeqCUDA;
398:   V->ops->replacearray           = VecReplaceArray_SeqCUDA;
399:   V->ops->resetarray             = VecResetArray_SeqCUDA;
400:   V->ops->destroy                = VecDestroy_SeqCUDA;
401:   V->ops->duplicate              = VecDuplicate_SeqCUDA;
402:   V->ops->conjugate              = VecConjugate_SeqCUDA;
403:   V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
404:   V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
405:   V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
406:   V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;

408:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
409:   if (array) {
410:     if (!V->spptr) {
411:       PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
412:       veccuda = (Vec_CUDA*)V->spptr;
413:       err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
414:       veccuda->GPUarray_allocated = 0;
415:       veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
416:       V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
417:     }
418:     veccuda = (Vec_CUDA*)V->spptr;
419:     veccuda->GPUarray = (PetscScalar*)array;
420:   }

422:   return(0);
423: }