Actual source code: veccuda.c
petsc-3.9.1 2018-04-29
/*
   Implementation of the sequential CUDA vectors.

   This file contains the code that can be compiled with a C
   compiler.  The companion file veccuda2.cu contains the code that
   must be compiled with nvcc or a C++ compiler.
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsccuda.h>
#include <petsc/private/vecimpl.h>          /*I <petscvec.h> I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
static PetscErrorCode PetscCUBLASDestroyHandle();
/*
   Implementation for obtaining read-write access to the cuBLAS handle.
   Required to properly deal with repeated calls of PetscInitialize()/PetscFinalize().
*/
static PetscErrorCode PetscCUBLASGetHandle_Private(cublasHandle_t **handle)
{
  static cublasHandle_t cublasv2handle = NULL;
  cublasStatus_t        cberr;
  PetscErrorCode        ierr;

  PetscFunctionBegin;
  if (!cublasv2handle) {
    cberr = cublasCreate(&cublasv2handle);CHKERRCUBLAS(cberr);
    /* Make sure that the handle will be destroyed properly */
    ierr = PetscRegisterFinalize(PetscCUBLASDestroyHandle);CHKERRQ(ierr);
  }
  *handle = &cublasv2handle;
  PetscFunctionReturn(0);
}
/*
   Singleton for obtaining a handle to cuBLAS.
   The handle is required for calls to routines in cuBLAS.
*/
PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *handle)
{
  cublasHandle_t *p_cublasv2handle;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr    = PetscCUBLASGetHandle_Private(&p_cublasv2handle);CHKERRQ(ierr);
  *handle = *p_cublasv2handle;
  PetscFunctionReturn(0);
}
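/*
   Example usage (a minimal sketch, not part of the PETSc sources): obtain the
   shared cuBLAS handle and call a cuBLAS routine on a device array. Assumes a
   real, double-precision PETSc build (so the buffer can be treated as double)
   and a current CUDA device; d_x and n are illustrative names.

     cublasHandle_t handle;
     cublasStatus_t cberr;
     cudaError_t    err;
     PetscErrorCode ierr;
     double         alpha = 2.0;
     double         *d_x;
     int            n = 128;

     err   = cudaMalloc((void**)&d_x,n*sizeof(double));CHKERRCUDA(err);
     ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
     cberr = cublasDscal(handle,n,&alpha,d_x,1);CHKERRCUBLAS(cberr);
     err   = cudaFree(d_x);CHKERRCUDA(err);
*/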
/*
   Destroys the cuBLAS handle.
   This function is registered with PetscRegisterFinalize() and is intended to
   be called by PetscFinalize() - do not call it manually!
*/
PetscErrorCode PetscCUBLASDestroyHandle()
{
  cublasHandle_t *p_cublasv2handle;
  cublasStatus_t cberr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr  = PetscCUBLASGetHandle_Private(&p_cublasv2handle);CHKERRQ(ierr);
  cberr = cublasDestroy(*p_cublasv2handle);CHKERRCUBLAS(cberr);
  *p_cublasv2handle = NULL;  /* Ensures proper reinitialization */
  PetscFunctionReturn(0);
}
/*
   Allocates space for the vector array on the host if it does not already exist.
   Does NOT change the offload flag (v->valid_GPU_array) for the vector.
   Does NOT zero the CUDA array.
*/
PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
{
  PetscErrorCode ierr;
  PetscScalar    *array;
  Vec_Seq        *s = (Vec_Seq*)v->data;
  PetscInt       n = v->map->n;

  PetscFunctionBegin;
  if (!s) {
    ierr = PetscNewLog((PetscObject)v,&s);CHKERRQ(ierr);
    v->data = s;
  }
  if (!s->array) {
    ierr = PetscMalloc1(n,&array);CHKERRQ(ierr);
    ierr = PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));CHKERRQ(ierr);
    s->array           = array;
    s->array_allocated = array;
    if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
      v->valid_GPU_array = PETSC_OFFLOAD_CPU;
    }
  }
  PetscFunctionReturn(0);
}
PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecCUDAAllocateCheckHost(xin);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheckHost(yin);CHKERRQ(ierr);
  if (xin != yin) {
    ierr = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&ya);CHKERRQ(ierr);
    ierr = PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&ya);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;
  PetscInt       n = xin->map->n,i;
  PetscScalar    *xx;

  PetscFunctionBegin;
  ierr = VecGetArray(xin,&xx);CHKERRQ(ierr);
  for (i=0; i<n; i++) { ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQ(ierr); }
  ierr = VecRestoreArray(xin,&xx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
{
  Vec_Seq        *vs = (Vec_Seq*)v->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectSAWsViewOff(v);CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
  ierr = PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);CHKERRQ(ierr);
#endif
  if (vs) {
    if (vs->array_allocated) { ierr = PetscFree(vs->array_allocated);CHKERRQ(ierr); }
    ierr = PetscFree(vs);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
{
  Vec_Seq *v = (Vec_Seq*)vin->data;

  PetscFunctionBegin;
  v->array         = v->unplacedarray;
  v->unplacedarray = NULL;
  PetscFunctionReturn(0);
}
PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDAAllocateCheck(v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDACopyToGPU(v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
    VecCUDACopyToGPUSome_Public - Copies selected entries of a vector from the CPU to the GPU

    Input Parameters:
+   v  - the vector
-   ci - the requested indices; this should be created with CUDAIndicesCreate()
*/
PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDACopyToGPUSome(v,ci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
    VecCUDACopyFromGPUSome_Public - Copies selected entries of a vector from the GPU to the CPU

    Input Parameters:
+   v  - the vector
-   ci - the requested indices; this should be created with CUDAIndicesCreate()
*/
PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDACopyFromGPUSome(v,ci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecSetRandom_SeqCUDA_Private(xin,r);CHKERRQ(ierr);
  xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
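/*
   Example usage (a sketch): fill a CUDA vector with random values. As the
   implementation above shows, the values are generated on the host, so the
   vector is marked as current on the CPU only (PETSC_OFFLOAD_CPU) and is
   copied to the GPU on demand.

     Vec            x;
     PetscRandom    rand;
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,100,&x);CHKERRQ(ierr);
     ierr = PetscRandomCreate(PETSC_COMM_SELF,&rand);CHKERRQ(ierr);
     ierr = VecSetRandom(x,rand);CHKERRQ(ierr);
     ierr = PetscRandomDestroy(&rand);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/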
PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDACopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecResetArray_SeqCUDA_Private(vin);CHKERRQ(ierr);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDACopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecPlaceArray_Seq(vin,a);CHKERRQ(ierr);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDACopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecReplaceArray_Seq(vin,a);CHKERRQ(ierr);
  vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
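/*
   Example usage (a sketch): temporarily substitute a user-provided host buffer
   for the vector's array. VecPlaceArray() dispatches to VecPlaceArray_SeqCUDA
   above, which first copies current data back from the GPU, so no values are
   lost; VecResetArray() restores the original array. buf is an illustrative
   name for a user buffer of at least the vector length.

     Vec            x;
     PetscScalar    buf[100];
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,100,&x);CHKERRQ(ierr);
     ierr = VecPlaceArray(x,buf);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = VecResetArray(x);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/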
/*@
   VecCreateSeqCUDA - Creates a standard, sequential, array-style vector whose
   data is stored on the GPU.

   Collective on MPI_Comm

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n - the vector length

   Output Parameter:
.  v - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   Level: intermediate

   Concepts: vectors^creating sequential

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate(comm,v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUDA);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
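/*
   Example usage (a sketch): create a sequential CUDA vector, set its entries,
   and compute a norm through the standard Vec interface; the norm dispatches
   to the CUDA implementation registered below.

     Vec            x;
     PetscReal      nrm;
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,100,&x);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = VecNorm(x,NORM_2,&nrm);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/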
PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);CHKERRQ(ierr);
  ierr = PetscLayoutReference(win->map,&(*V)->map);CHKERRQ(ierr);
  ierr = PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);CHKERRQ(ierr);
  ierr = PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);CHKERRQ(ierr);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  PetscFunctionReturn(0);
}
PetscErrorCode VecCreate_SeqCUDA(Vec V)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(V->map);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(V);CHKERRQ(ierr);
  V->valid_GPU_array = PETSC_OFFLOAD_GPU;
  ierr = VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);CHKERRQ(ierr);
  ierr = VecSet(V,0.0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
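/*
   Example usage (a sketch): the same vector can also be obtained through the
   generic creation path; VecSetType() with VECSEQCUDA invokes
   VecCreate_SeqCUDA() above, which is exactly what VecCreateSeqCUDA() does
   internally.

     Vec            x;
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
     ierr = VecSetSizes(x,100,100);CHKERRQ(ierr);
     ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/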
/*@C
   VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
   where the user provides the array space to store the vector values. The array
   provided must be a GPU array.

   Collective on MPI_Comm

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
.  bs - the block size
.  n - the vector length
-  array - GPU memory where the vector elements are to be stored

   Output Parameter:
.  V - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.

   Level: intermediate

   Concepts: vectors^creating with array

.seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
          VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
          VecCreateMPIWithArray()
@*/
PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = VecCreate(comm,V);CHKERRQ(ierr);
  ierr = VecSetSizes(*V,n,n);CHKERRQ(ierr);
  ierr = VecSetBlockSize(*V,bs);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  ierr = VecCreate_SeqCUDA_Private(*V,array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
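/*
   Example usage (a sketch): wrap an existing device buffer in a Vec. Assumes a
   real, double-precision PETSc build so the cudaMalloc'd buffer can serve as
   PetscScalar storage. Per the notes above, PETSc does not free the buffer;
   the user frees it after the vector is destroyed.

     Vec            x;
     PetscScalar    *d_array;
     cudaError_t    err;
     PetscErrorCode ierr;

     err  = cudaMalloc((void**)&d_array,100*sizeof(PetscScalar));CHKERRCUDA(err);
     ierr = VecCreateSeqCUDAWithArray(PETSC_COMM_SELF,1,100,d_array,&x);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
     err  = cudaFree(d_array);CHKERRCUDA(err);
*/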
PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUDA       *veccuda;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);CHKERRQ(ierr);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
  ierr = VecCreate_Seq_Private(V,NULL);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);CHKERRQ(ierr);

  V->ops->dot                    = VecDot_SeqCUDA;
  V->ops->norm                   = VecNorm_SeqCUDA;
  V->ops->tdot                   = VecTDot_SeqCUDA;
  V->ops->scale                  = VecScale_SeqCUDA;
  V->ops->copy                   = VecCopy_SeqCUDA;
  V->ops->set                    = VecSet_SeqCUDA;
  V->ops->swap                   = VecSwap_SeqCUDA;
  V->ops->axpy                   = VecAXPY_SeqCUDA;
  V->ops->axpby                  = VecAXPBY_SeqCUDA;
  V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
  V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
  V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
  V->ops->setrandom              = VecSetRandom_SeqCUDA;
  V->ops->dot_local              = VecDot_SeqCUDA;
  V->ops->tdot_local             = VecTDot_SeqCUDA;
  V->ops->norm_local             = VecNorm_SeqCUDA;
  V->ops->mdot_local             = VecMDot_SeqCUDA;
  V->ops->maxpy                  = VecMAXPY_SeqCUDA;
  V->ops->mdot                   = VecMDot_SeqCUDA;
  V->ops->aypx                   = VecAYPX_SeqCUDA;
  V->ops->waxpy                  = VecWAXPY_SeqCUDA;
  V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
  V->ops->placearray             = VecPlaceArray_SeqCUDA;
  V->ops->replacearray           = VecReplaceArray_SeqCUDA;
  V->ops->resetarray             = VecResetArray_SeqCUDA;
  V->ops->destroy                = VecDestroy_SeqCUDA;
  V->ops->duplicate              = VecDuplicate_SeqCUDA;
  V->ops->conjugate              = VecConjugate_SeqCUDA;
  V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
  V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
  V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
  V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;

  /* Later, functions check for the existence of the Vec_CUDA structure, so do not create it unless an array is provided */
  if (array) {
    if (!V->spptr) {
      ierr = PetscMalloc(sizeof(Vec_CUDA),&V->spptr);CHKERRQ(ierr);
      veccuda = (Vec_CUDA*)V->spptr;
      err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
      veccuda->GPUarray_allocated = NULL;
      veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
      V->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
    }
    veccuda = (Vec_CUDA*)V->spptr;
    veccuda->GPUarray = (PetscScalar*)array;
  }
  PetscFunctionReturn(0);
}
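/*
   Example usage (a sketch): because VecCreate_SeqCUDA_Private() installs the
   function table above, ordinary Vec calls on a VECSEQCUDA vector dispatch to
   the CUDA implementations, e.g. VecAXPY() runs VecAXPY_SeqCUDA and VecDot()
   runs VecDot_SeqCUDA.

     Vec            x,y;
     PetscScalar    dot;
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,100,&x);CHKERRQ(ierr);
     ierr = VecDuplicate(x,&y);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = VecSet(y,2.0);CHKERRQ(ierr);
     ierr = VecAXPY(y,3.0,x);CHKERRQ(ierr);
     ierr = VecDot(x,y,&dot);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
     ierr = VecDestroy(&y);CHKERRQ(ierr);
*/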