Actual source code: veccuda.c
petsc-3.12.0 2019-09-29
/*
   Implementation of the sequential CUDA vectors.

   This file contains the code that can be compiled with a C
   compiler. The companion file veccuda2.cu contains the code that
   must be compiled with nvcc or a C++ compiler.
*/

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>          /*I <petscvec.h> I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

static PetscErrorCode PetscCUBLASDestroyHandle();

/*
   Implementation for obtaining read-write access to the cuBLAS handle.
   Required to properly deal with repeated calls of PetscInitialize()/PetscFinalize().
*/
static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
{
  static cublasHandle_t cublasv2handle = NULL;
  cublasStatus_t        cberr;
  PetscErrorCode        ierr;

  if (!cublasv2handle) {
    cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
    /* Make sure that the handle will be destroyed properly */
    PetscRegisterFinalize(PetscCUBLASDestroyHandle);
  }
  *handle = &cublasv2handle;
  return(0);
}

/*
   Initializing the cuBLAS handle can take half a second, therefore
   initialize it in PetscInitialize() before timing begins so that it does
   not distort the -log_view information.
*/
PetscErrorCode PetscCUBLASInitializeHandle(void)
{
  cublasHandle_t *p_cublasv2handle;

  PetscCUBLASGetHandle_Private(&p_cublasv2handle);
  return(0);
}

/*
   Singleton for obtaining a handle to cuBLAS.
   The handle is required for calls to routines in cuBLAS.
*/
PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
{
  cublasHandle_t *p_cublasv2handle;

  PetscCUBLASGetHandle_Private(&p_cublasv2handle);
  *handle = *p_cublasv2handle;
  return(0);
}
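
/*
   Illustrative sketch (not part of the original file): this is roughly how the
   routines in veccuda2.cu use the handle, e.g. to compute a 2-norm with cuBLAS.
   The variables n, xarray, and norm are hypothetical; error checks abbreviated.

     cublasHandle_t cublasv2handle;
     cublasStatus_t cberr;
     PetscErrorCode ierr;

     ierr  = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
     cberr = cublasDnrm2(cublasv2handle,(int)n,xarray,1,&norm);CHKERRCUBLAS(cberr);
*/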

/*
   Destroys the cuBLAS handle.
   This function is intended for and registered with PetscFinalize() - do not call it manually!
*/
PetscErrorCode PetscCUBLASDestroyHandle()
{
  cublasHandle_t *p_cublasv2handle;
  cublasStatus_t cberr;

  PetscCUBLASGetHandle_Private(&p_cublasv2handle);
  cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
  *p_cublasv2handle = NULL; /* Ensures proper reinitialization */
  return(0);
}
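
/*
   Sketch of the lifecycle this reset enables (assuming standard PETSc usage;
   error checks omitted). Because PetscCUBLASDestroyHandle() NULLs the static
   handle, PETSc can be initialized and finalized repeatedly in one program:

     PetscInitialize(&argc,&argv,NULL,NULL);  // handle created lazily on first use
     ...
     PetscFinalize();                         // PetscCUBLASDestroyHandle() runs here
     PetscInitialize(&argc,&argv,NULL,NULL);  // a fresh handle is created again on demand
     ...
     PetscFinalize();
*/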

/*
   Allocates space for the vector array on the Host if it does not exist.
   Does NOT change the PetscCUDAFlag for the vector.
   Does NOT zero the CUDA array.
*/
PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
{
  PetscScalar *array;
  Vec_Seq     *s = (Vec_Seq*)v->data;
  PetscInt     n = v->map->n;

  if (!s) {
    PetscNewLog((PetscObject)v,&s);
    v->data = s;
  }
  if (!s->array) {
    PetscMalloc1(n,&array);
    PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
    s->array           = array;
    s->array_allocated = array;
    if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
      v->valid_GPU_array = PETSC_OFFLOAD_CPU;
    }
  }
  return(0);
}
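
/*
   For reference (a summary, based on the values used in this file): the
   v->valid_GPU_array flag tracks where the current data lives -
     PETSC_OFFLOAD_UNALLOCATED - no array holds valid data yet
     PETSC_OFFLOAD_CPU         - the host array is up to date
     PETSC_OFFLOAD_GPU         - the device array is up to date
     PETSC_OFFLOAD_BOTH        - host and device arrays agree
*/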

PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;
  PetscErrorCode     ierr;

  VecCUDAAllocateCheckHost(xin);
  VecCUDAAllocateCheckHost(yin);
  if (xin != yin) {
    VecGetArrayRead(xin,&xa);
    VecGetArray(yin,&ya);
    PetscArraycpy(ya,xa,xin->map->n);
    VecRestoreArrayRead(xin,&xa);
    VecRestoreArray(yin,&ya);
  }
  return(0);
}

PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
{
  PetscInt     n = xin->map->n,i;
  PetscScalar *xx;

  VecGetArray(xin,&xx);
  for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
  VecRestoreArray(xin,&xx);
  return(0);
}

PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
{
  Vec_Seq *vs = (Vec_Seq*)v->data;

  PetscObjectSAWsViewOff(v);
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
#endif
  if (vs) {
    if (vs->array_allocated) { PetscFree(vs->array_allocated); }
    PetscFree(vs);
  }
  return(0);
}

PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
{
  Vec_Seq *v = (Vec_Seq*)vin->data;

  v->array         = v->unplacedarray;
  v->unplacedarray = 0;
  return(0);
}

PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
{
  VecCUDAAllocateCheck(v);
  return(0);
}

PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
{
  VecCUDACopyToGPU(v);
  return(0);
}

/*
   VecCUDACopyToGPUSome_Public - Copies selected entries of a vector from the CPU to the GPU

   Input Parameters:
+  v    - the vector
.  ci   - the requested indices; these should be created with CUDAIndicesCreate()
-  mode - vec scatter mode used in VecScatterBegin/End
*/
PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
{
  VecCUDACopyToGPUSome(v,ci,mode);
  return(0);
}

/*
   VecCUDACopyFromGPUSome_Public - Copies selected entries of a vector from the GPU to the CPU

   Input Parameters:
+  v    - the vector
.  ci   - the requested indices; these should be created with CUDAIndicesCreate()
-  mode - vec scatter mode used in VecScatterBegin/End
*/
PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
{
  VecCUDACopyFromGPUSome(v,ci,mode);
  return(0);
}

PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
{
  VecSetRandom_SeqCUDA_Private(xin,r);
  xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
{
  VecCUDACopyFromGPU(vin);
  VecResetArray_SeqCUDA_Private(vin);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}

PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  VecCUDACopyFromGPU(vin);
  VecPlaceArray_Seq(vin,a);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}
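
/*
   Usage sketch (not from the original source): VecPlaceArray()/VecResetArray()
   dispatch to the two routines above. hostbuf is a hypothetical host buffer of
   the right length; error checks omitted.

     PetscScalar hostbuf[100];

     VecPlaceArray(v,hostbuf);  // data copied down from the GPU, then hostbuf installed
     ...                        // operate on v; results live in hostbuf on the CPU
     VecResetArray(v);          // restore the vector's original array
*/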

PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  VecCUDACopyFromGPU(vin);
  VecReplaceArray_Seq(vin,a);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  return(0);
}

/*@
   VecCreateSeqCUDA - Creates a standard, sequential, array-style vector.

   Collective

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n    - the vector length

   Output Parameter:
.  v - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   Level: intermediate

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
{
  VecCreate(comm,v);
  VecSetSizes(*v,n,n);
  VecSetType(*v,VECSEQCUDA);
  return(0);
}
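
/*
   Usage sketch (illustrative; error checks abbreviated): creating, filling,
   and destroying a sequential CUDA vector of length 100.

     Vec            x;
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,100,&x);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);   // runs on the GPU via VecSet_SeqCUDA
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/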

PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
{
  VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
  PetscLayoutReference(win->map,&(*V)->map);
  PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
  PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  return(0);
}

PetscErrorCode VecCreate_SeqCUDA(Vec V)
{
  PetscLayoutSetUp(V->map);
  VecCUDAAllocateCheck(V);
  VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
  VecCUDAAllocateCheckHost(V);
  VecSet(V,0.0);
  VecSet_Seq(V,0.0);
  V->valid_GPU_array = PETSC_OFFLOAD_BOTH;
  return(0);
}
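
/*
   For orientation: this constructor is what runs when a user builds the vector
   through the generic interface, e.g. (sketch, error checks omitted)

     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,n,n);
     VecSetType(v,VECSEQCUDA);   // dispatches to VecCreate_SeqCUDA()
*/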

/*@C
   VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
   where the user provides the array space to store the vector values. The array
   provided must be a GPU array.

   Collective

   Input Parameters:
+  comm  - the communicator, should be PETSC_COMM_SELF
.  bs    - the block size
.  n     - the vector length
-  array - GPU memory where the vector elements are to be stored.

   Output Parameter:
.  V - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.

   Level: intermediate

.seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
          VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
          VecCreateMPIWithArray()
@*/
PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
{
  PetscMPIInt size;

  VecCreate(comm,V);
  VecSetSizes(*V,n,n);
  VecSetBlockSize(*V,bs);
  MPI_Comm_size(comm,&size);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  VecCreate_SeqCUDA_Private(*V,array);
  return(0);
}
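
/*
   Usage sketch (illustrative; gpuarray is a hypothetical device buffer, PETSc
   error checks abbreviated): wrapping existing GPU memory in a Vec. Note that
   VecDestroy() does not free the buffer.

     PetscScalar *gpuarray;
     Vec          x;
     cudaError_t  cerr;

     cerr = cudaMalloc((void**)&gpuarray,100*sizeof(PetscScalar));CHKERRCUDA(cerr);
     VecCreateSeqCUDAWithArray(PETSC_COMM_SELF,1,100,gpuarray,&x);
     ...
     VecDestroy(&x);                        // gpuarray is NOT freed here
     cerr = cudaFree(gpuarray);CHKERRCUDA(cerr);
*/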

PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
{
  VecCUDAAllocateCheckHost(v);
  v->valid_GPU_array = PETSC_OFFLOAD_CPU;
  *vv = *((PetscScalar**)v->data);
  return(0);
}

PetscErrorCode VecPinToCPU_SeqCUDA(Vec V,PetscBool pin)
{
  V->pinnedtocpu = pin;
  if (pin) {
    VecCUDACopyFromGPU(V);
    V->valid_GPU_array = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
    V->ops->dot                    = VecDot_Seq;
    V->ops->norm                   = VecNorm_Seq;
    V->ops->tdot                   = VecTDot_Seq;
    V->ops->scale                  = VecScale_Seq;
    V->ops->copy                   = VecCopy_Seq;
    V->ops->set                    = VecSet_Seq;
    V->ops->swap                   = VecSwap_Seq;
    V->ops->axpy                   = VecAXPY_Seq;
    V->ops->axpby                  = VecAXPBY_Seq;
    V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
    V->ops->pointwisemult          = VecPointwiseMult_Seq;
    V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
    V->ops->setrandom              = VecSetRandom_Seq;
    V->ops->dot_local              = VecDot_Seq;
    V->ops->tdot_local             = VecTDot_Seq;
    V->ops->norm_local             = VecNorm_Seq;
    V->ops->mdot_local             = VecMDot_Seq;
    V->ops->mtdot_local            = VecMTDot_Seq;
    V->ops->maxpy                  = VecMAXPY_Seq;
    V->ops->mdot                   = VecMDot_Seq;
    V->ops->mtdot                  = VecMTDot_Seq;
    V->ops->aypx                   = VecAYPX_Seq;
    V->ops->waxpy                  = VecWAXPY_Seq;
    V->ops->dotnorm2               = NULL;
    V->ops->placearray             = VecPlaceArray_Seq;
    V->ops->replacearray           = VecReplaceArray_Seq;
    V->ops->resetarray             = VecResetArray_Seq;
    V->ops->duplicate              = VecDuplicate_Seq;
    V->ops->conjugate              = VecConjugate_Seq;
    V->ops->getlocalvector         = NULL;
    V->ops->restorelocalvector     = NULL;
    V->ops->getlocalvectorread     = NULL;
    V->ops->restorelocalvectorread = NULL;
    V->ops->getarraywrite          = NULL;
  } else {
    V->ops->dot                    = VecDot_SeqCUDA;
    V->ops->norm                   = VecNorm_SeqCUDA;
    V->ops->tdot                   = VecTDot_SeqCUDA;
    V->ops->scale                  = VecScale_SeqCUDA;
    V->ops->copy                   = VecCopy_SeqCUDA;
    V->ops->set                    = VecSet_SeqCUDA;
    V->ops->swap                   = VecSwap_SeqCUDA;
    V->ops->axpy                   = VecAXPY_SeqCUDA;
    V->ops->axpby                  = VecAXPBY_SeqCUDA;
    V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
    V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
    V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
    V->ops->setrandom              = VecSetRandom_SeqCUDA;
    V->ops->dot_local              = VecDot_SeqCUDA;
    V->ops->tdot_local             = VecTDot_SeqCUDA;
    V->ops->norm_local             = VecNorm_SeqCUDA;
    V->ops->mdot_local             = VecMDot_SeqCUDA;
    V->ops->maxpy                  = VecMAXPY_SeqCUDA;
    V->ops->mdot                   = VecMDot_SeqCUDA;
    V->ops->aypx                   = VecAYPX_SeqCUDA;
    V->ops->waxpy                  = VecWAXPY_SeqCUDA;
    V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
    V->ops->placearray             = VecPlaceArray_SeqCUDA;
    V->ops->replacearray           = VecReplaceArray_SeqCUDA;
    V->ops->resetarray             = VecResetArray_SeqCUDA;
    V->ops->destroy                = VecDestroy_SeqCUDA;
    V->ops->duplicate              = VecDuplicate_SeqCUDA;
    V->ops->conjugate              = VecConjugate_SeqCUDA;
    V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
    V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
    V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
    V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
  }
  return(0);
}
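
/*
   Usage sketch: applications would normally toggle this through the
   type-independent VecPinToCPU() interface rather than calling the routine
   above directly (error checks omitted).

     VecPinToCPU(v,PETSC_TRUE);    // force subsequent operations onto the CPU
     ...                           // e.g. a phase where GPU traffic is not worthwhile
     VecPinToCPU(v,PETSC_FALSE);   // restore the CUDA implementations
*/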

PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
{
  Vec_CUDA    *veccuda;
  PetscMPIInt  size;

  MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  VecCreate_Seq_Private(V,0);
  PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
  VecPinToCPU_SeqCUDA(V,PETSC_FALSE);
  V->ops->pintocpu = VecPinToCPU_SeqCUDA;

  /* Later, functions check for the existence of the Vec_CUDA structure, so do not create it unless an array is provided */
  if (array) {
    if (!V->spptr) {
      PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
      veccuda = (Vec_CUDA*)V->spptr;
      veccuda->stream = 0; /* using default stream */
      veccuda->GPUarray_allocated = 0;
      veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
      V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
    }
    veccuda = (Vec_CUDA*)V->spptr;
    veccuda->GPUarray = (PetscScalar*)array;
  }
  return(0);
}