Actual source code: veccuda.c
petsc-3.12.3 2020-01-03
1: /*
2: Implementation of the sequential cuda vectors.
4: This file contains the code that can be compiled with a C
5: compiler. The companion file veccuda2.cu contains the code that
6: must be compiled with nvcc or a C++ compiler.
7: */
9: #define PETSC_SKIP_SPINLOCK
11: #include <petscconf.h>
12: #include <petsc/private/vecimpl.h> /*I <petscvec.h> I*/
13: #include <../src/vec/vec/impls/dvecimpl.h>
14: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
16: /*
17: Allocates space for the vector array on the Host if it does not exist.
18: Does NOT change the PetscCUDAFlag for the vector
19: Does NOT zero the CUDA array
20: */
21: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
22: {
24: PetscScalar *array;
25: Vec_Seq *s = (Vec_Seq*)v->data;
26: PetscInt n = v->map->n;
29: if (!s) {
30: PetscNewLog((PetscObject)v,&s);
31: v->data = s;
32: }
33: if (!s->array) {
34: PetscMalloc1(n,&array);
35: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
36: s->array = array;
37: s->array_allocated = array;
38: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
39: v->offloadmask = PETSC_OFFLOAD_CPU;
40: }
41: }
42: return(0);
43: }
45: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
46: {
47: PetscScalar *ya;
48: const PetscScalar *xa;
49: PetscErrorCode ierr;
52: VecCUDAAllocateCheckHost(xin);
53: VecCUDAAllocateCheckHost(yin);
54: if (xin != yin) {
55: VecGetArrayRead(xin,&xa);
56: VecGetArray(yin,&ya);
57: PetscArraycpy(ya,xa,xin->map->n);
58: VecRestoreArrayRead(xin,&xa);
59: VecRestoreArray(yin,&ya);
60: }
61: return(0);
62: }
64: PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r)
65: {
67: PetscInt n = xin->map->n,i;
68: PetscScalar *xx;
71: VecGetArray(xin,&xx);
72: for (i=0; i<n; i++) { PetscRandomGetValue(r,&xx[i]); }
73: VecRestoreArray(xin,&xx);
74: return(0);
75: }
77: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
78: {
79: Vec_Seq *vs = (Vec_Seq*)v->data;
83: PetscObjectSAWsViewOff(v);
84: #if defined(PETSC_USE_LOG)
85: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
86: #endif
87: if (vs) {
88: if (vs->array_allocated) { PetscFree(vs->array_allocated); }
89: PetscFree(vs);
90: }
91: return(0);
92: }
94: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
95: {
96: Vec_Seq *v = (Vec_Seq*)vin->data;
99: v->array = v->unplacedarray;
100: v->unplacedarray = 0;
101: return(0);
102: }
104: PetscErrorCode VecCUDAAllocateCheck_Public(Vec v)
105: {
109: VecCUDAAllocateCheck(v);
110: return(0);
111: }
113: PetscErrorCode VecCUDACopyToGPU_Public(Vec v)
114: {
118: VecCUDACopyToGPU(v);
119: return(0);
120: }
122: /*
123: VecCUDACopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector
125: Input Parameters:
126: + v - the vector
127: . ci - the requested indices, this should be created with CUDAIndicesCreate()
128: - mode - vec scatter mode used in VecScatterBegin/End
129: */
130: PetscErrorCode VecCUDACopyToGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
131: {
135: VecCUDACopyToGPUSome(v,ci,mode);
136: return(0);
137: }
139: /*
140: VecCUDACopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector
142: Input Parameters:
143: + v - the vector
144: . ci - the requested indices, this should be created with CUDAIndicesCreate()
145: - mode - vec scatter mode used in VecScatterBegin/End
146: */
147: PetscErrorCode VecCUDACopyFromGPUSome_Public(Vec v,PetscCUDAIndices ci,ScatterMode mode)
148: {
152: VecCUDACopyFromGPUSome(v,ci,mode);
153: return(0);
154: }
156: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
157: {
161: VecSetRandom_SeqCUDA_Private(xin,r);
162: xin->offloadmask = PETSC_OFFLOAD_CPU;
163: return(0);
164: }
166: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
167: {
171: VecCUDACopyFromGPU(vin);
172: VecResetArray_SeqCUDA_Private(vin);
173: vin->offloadmask = PETSC_OFFLOAD_CPU;
174: return(0);
175: }
177: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
178: {
182: VecCUDACopyFromGPU(vin);
183: VecPlaceArray_Seq(vin,a);
184: vin->offloadmask = PETSC_OFFLOAD_CPU;
185: return(0);
186: }
188: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
189: {
193: VecCUDACopyFromGPU(vin);
194: VecReplaceArray_Seq(vin,a);
195: vin->offloadmask = PETSC_OFFLOAD_CPU;
196: return(0);
197: }
199: /*@
200: VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
202: Collective
204: Input Parameter:
205: + comm - the communicator, should be PETSC_COMM_SELF
206: - n - the vector length
208: Output Parameter:
209: . v - the vector
211: Notes:
212: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
213: same type as an existing vector.
215: Level: intermediate
217: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
218: @*/
219: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
220: {
224: VecCreate(comm,v);
225: VecSetSizes(*v,n,n);
226: VecSetType(*v,VECSEQCUDA);
227: return(0);
228: }
230: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
231: {
235: VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
236: PetscLayoutReference(win->map,&(*V)->map);
237: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
238: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
239: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
240: return(0);
241: }
243: PetscErrorCode VecCreate_SeqCUDA(Vec V)
244: {
248: PetscLayoutSetUp(V->map);
249: VecCUDAAllocateCheck(V);
250: VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
251: VecCUDAAllocateCheckHost(V);
252: VecSet(V,0.0);
253: VecSet_Seq(V,0.0);
254: V->offloadmask = PETSC_OFFLOAD_BOTH;
255: return(0);
256: }
258: /*@C
259: VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
260: where the user provides the array space to store the vector values. The array
261: provided must be a GPU array.
263: Collective
265: Input Parameter:
266: + comm - the communicator, should be PETSC_COMM_SELF
267: . bs - the block size
268: . n - the vector length
269: - array - GPU memory where the vector elements are to be stored.
271: Output Parameter:
272: . V - the vector
274: Notes:
275: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
276: same type as an existing vector.
278: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
279: at a later stage to SET the array for storing the vector values.
281: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
282: The user should not free the array until the vector is destroyed.
284: Level: intermediate
286: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
287: VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
288: VecCreateMPIWithArray()
289: @*/
290: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
291: {
293: PetscMPIInt size;
296: VecCreate(comm,V);
297: VecSetSizes(*V,n,n);
298: VecSetBlockSize(*V,bs);
299: MPI_Comm_size(comm,&size);
300: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQ on more than one process");
301: VecCreate_SeqCUDA_Private(*V,array);
302: return(0);
303: }
305: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **vv)
306: {
310: VecCUDAAllocateCheckHost(v);
311: v->offloadmask = PETSC_OFFLOAD_CPU;
312: *vv = *((PetscScalar**)v->data);
313: return(0);
314: }
316: PetscErrorCode VecPinToCPU_SeqCUDA(Vec V,PetscBool pin)
317: {
321: V->pinnedtocpu = pin;
322: if (pin) {
323: VecCUDACopyFromGPU(V);
324: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
325: V->ops->dot = VecDot_Seq;
326: V->ops->norm = VecNorm_Seq;
327: V->ops->tdot = VecTDot_Seq;
328: V->ops->scale = VecScale_Seq;
329: V->ops->copy = VecCopy_Seq;
330: V->ops->set = VecSet_Seq;
331: V->ops->swap = VecSwap_Seq;
332: V->ops->axpy = VecAXPY_Seq;
333: V->ops->axpby = VecAXPBY_Seq;
334: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
335: V->ops->pointwisemult = VecPointwiseMult_Seq;
336: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
337: V->ops->setrandom = VecSetRandom_Seq;
338: V->ops->dot_local = VecDot_Seq;
339: V->ops->tdot_local = VecTDot_Seq;
340: V->ops->norm_local = VecNorm_Seq;
341: V->ops->mdot_local = VecMDot_Seq;
342: V->ops->mtdot_local = VecMTDot_Seq;
343: V->ops->maxpy = VecMAXPY_Seq;
344: V->ops->mdot = VecMDot_Seq;
345: V->ops->mtdot = VecMTDot_Seq;
346: V->ops->aypx = VecAYPX_Seq;
347: V->ops->waxpy = VecWAXPY_Seq;
348: V->ops->dotnorm2 = NULL;
349: V->ops->placearray = VecPlaceArray_Seq;
350: V->ops->replacearray = VecReplaceArray_Seq;
351: V->ops->resetarray = VecResetArray_Seq;
352: V->ops->duplicate = VecDuplicate_Seq;
353: V->ops->conjugate = VecConjugate_Seq;
354: V->ops->getlocalvector = NULL;
355: V->ops->restorelocalvector = NULL;
356: V->ops->getlocalvectorread = NULL;
357: V->ops->restorelocalvectorread = NULL;
358: V->ops->getarraywrite = NULL;
359: } else {
360: V->ops->dot = VecDot_SeqCUDA;
361: V->ops->norm = VecNorm_SeqCUDA;
362: V->ops->tdot = VecTDot_SeqCUDA;
363: V->ops->scale = VecScale_SeqCUDA;
364: V->ops->copy = VecCopy_SeqCUDA;
365: V->ops->set = VecSet_SeqCUDA;
366: V->ops->swap = VecSwap_SeqCUDA;
367: V->ops->axpy = VecAXPY_SeqCUDA;
368: V->ops->axpby = VecAXPBY_SeqCUDA;
369: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
370: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
371: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
372: V->ops->setrandom = VecSetRandom_SeqCUDA;
373: V->ops->dot_local = VecDot_SeqCUDA;
374: V->ops->tdot_local = VecTDot_SeqCUDA;
375: V->ops->norm_local = VecNorm_SeqCUDA;
376: V->ops->mdot_local = VecMDot_SeqCUDA;
377: V->ops->maxpy = VecMAXPY_SeqCUDA;
378: V->ops->mdot = VecMDot_SeqCUDA;
379: V->ops->aypx = VecAYPX_SeqCUDA;
380: V->ops->waxpy = VecWAXPY_SeqCUDA;
381: V->ops->dotnorm2 = VecDotNorm2_SeqCUDA;
382: V->ops->placearray = VecPlaceArray_SeqCUDA;
383: V->ops->replacearray = VecReplaceArray_SeqCUDA;
384: V->ops->resetarray = VecResetArray_SeqCUDA;
385: V->ops->destroy = VecDestroy_SeqCUDA;
386: V->ops->duplicate = VecDuplicate_SeqCUDA;
387: V->ops->conjugate = VecConjugate_SeqCUDA;
388: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
389: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
390: V->ops->getlocalvectorread = VecGetLocalVector_SeqCUDA;
391: V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
392: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
393: }
394: return(0);
395: }
397: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
398: {
400: Vec_CUDA *veccuda;
401: PetscMPIInt size;
404: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
405: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUDA on more than one process");
406: VecCreate_Seq_Private(V,0);
407: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
408: VecPinToCPU_SeqCUDA(V,PETSC_FALSE);
409: V->ops->pintocpu = VecPinToCPU_SeqCUDA;
411: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
412: if (array) {
413: if (!V->spptr) {
414: PetscMalloc(sizeof(Vec_CUDA),&V->spptr);
415: veccuda = (Vec_CUDA*)V->spptr;
416: veccuda->stream = 0; /* using default stream */
417: veccuda->GPUarray_allocated = 0;
418: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
419: V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
420: }
421: veccuda = (Vec_CUDA*)V->spptr;
422: veccuda->GPUarray = (PetscScalar*)array;
423: }
425: return(0);
426: }