Actual source code: mpicuda.cu
petsc-3.11.0 2019-03-29
/*
   This file contains routines for Parallel vector operations.
 */
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/vec/vec/impls/mpi/pvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
PetscErrorCode VecDestroy_MPICUDA(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  if (v->spptr) {
    if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
      err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
      ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
    }
    err  = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
    ierr = PetscFree(v->spptr);CHKERRQ(ierr);
  }
  ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
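
/* Note (editorial, not in the original file): the norms below follow a two-step
   pattern. Each rank runs the sequential CUDA kernel on its local part, then a
   single MPI_Allreduce combines the local results (a sum for the 1- and 2-norms,
   a max for the infinity norm). For the 2-norm the local value is squared before
   the reduction and the square root is taken after it. */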
PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
{
  PetscReal      sum,work = 0.0;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr  = VecNorm_SeqCUDA(xin,NORM_2,&work);CHKERRQ(ierr);
    work *= work;
    ierr  = MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
    *z    = PetscSqrtReal(sum);
  } else if (type == NORM_1) {
    /* Find the local part */
    ierr = VecNorm_SeqCUDA(xin,NORM_1,&work);CHKERRQ(ierr);
    /* Find the global sum */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    /* Find the local max */
    ierr = VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);CHKERRQ(ierr);
    /* Find the global max */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    PetscReal temp[2];
    ierr    = VecNorm_SeqCUDA(xin,NORM_1,temp);CHKERRQ(ierr);
    ierr    = VecNorm_SeqCUDA(xin,NORM_2,temp+1);CHKERRQ(ierr);
    temp[1] = temp[1]*temp[1];
    ierr    = MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
    z[1]    = PetscSqrtReal(z[1]);
  }
  PetscFunctionReturn(0);
}
PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
{
  PetscScalar    sum,work;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecDot_SeqCUDA(xin,yin,&work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  *z   = sum;
  PetscFunctionReturn(0);
}
PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
{
  PetscScalar    sum,work;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecTDot_SeqCUDA(xin,yin,&work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  *z   = sum;
  PetscFunctionReturn(0);
}
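
/* Note (editorial, not in the original file): VecMDot computes nv dot products
   against xin in one pass and combines them in a single reduction; the 128-entry
   stack buffer avoids a heap allocation in the common case of small nv. */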
PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
{
  PetscScalar    awork[128],*work = awork;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (nv > 128) {
    ierr = PetscMalloc1(nv,&work);CHKERRQ(ierr);
  }
  ierr = VecMDot_SeqCUDA(xin,nv,y,work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  if (nv > 128) {
    ierr = PetscFree(work);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*MC
   VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

   Options Database Keys:
. -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

  Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI()
M*/
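
/* Example (editorial, not in the original file): a minimal sketch of creating a
   VECMPICUDA vector explicitly. The same type can instead be selected at runtime
   with -vec_type mpicuda if VecSetFromOptions() is called.

     Vec  x;
     ierr = VecCreate(PETSC_COMM_WORLD,&x);CHKERRQ(ierr);
     ierr = VecSetSizes(x,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(x,VECMPICUDA);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/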
PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
{
  PetscErrorCode ierr;
  Vec_MPI        *vw,*w = (Vec_MPI*)win->data;
  PetscScalar    *array;

  PetscFunctionBegin;
  ierr = VecCreate(PetscObjectComm((PetscObject)win),v);CHKERRQ(ierr);
  ierr = PetscLayoutReference(win->map,&(*v)->map);CHKERRQ(ierr);

  ierr = VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);CHKERRQ(ierr);
  vw   = (Vec_MPI*)(*v)->data;
  ierr = PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));CHKERRQ(ierr);

  /* save local representation of the parallel vector (and scatter) if it exists */
  if (w->localrep) {
    ierr = VecGetArray(*v,&array);CHKERRQ(ierr);
    ierr = VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);CHKERRQ(ierr);
    ierr = PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));CHKERRQ(ierr);
    ierr = VecRestoreArray(*v,&array);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);CHKERRQ(ierr);
    vw->localupdate = w->localupdate;
    if (vw->localupdate) {
      ierr = PetscObjectReference((PetscObject)vw->localupdate);CHKERRQ(ierr);
    }
  }

  /* New vector should inherit stashing property of parent */
  (*v)->stash.donotstash   = win->stash.donotstash;
  (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

  /* change type_name appropriately */
  ierr = VecCUDAAllocateCheck(*v);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);CHKERRQ(ierr);

  ierr = PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);CHKERRQ(ierr);
  ierr = PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);CHKERRQ(ierr);
  (*v)->map->bs   = PetscAbs(win->map->bs);
  (*v)->bstash.bs = win->bstash.bs;
  PetscFunctionReturn(0);
}
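
/* Note (editorial, not in the original file): VecDotNorm2 computes the inner
   product of s and t together with the squared 2-norm of t, so the two values
   share a single two-scalar reduction instead of requiring two separate ones. */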
PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
{
  PetscErrorCode ierr;
  PetscScalar    work[2],sum[2];

  PetscFunctionBegin;
  ierr = VecDotNorm2_SeqCUDA(s,t,work,work+1);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(work,sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));CHKERRQ(ierr);
  *dp  = sum[0];
  *nm  = sum[1];
  PetscFunctionReturn(0);
}
PetscErrorCode VecCreate_MPICUDA(Vec vv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(vv->map);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(vv);CHKERRQ(ierr);
  vv->valid_GPU_array = PETSC_OFFLOAD_GPU;
  ierr = VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);CHKERRQ(ierr);
  ierr = VecSet(vv,0.0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
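
/* Note (editorial, not in the original file): this constructor dispatches on the
   communicator size, selecting the sequential CUDA vector on one process and the
   parallel CUDA vector otherwise. */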
PetscErrorCode VecCreate_CUDA(Vec v)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);CHKERRQ(ierr);
  if (size == 1) {
    ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);
  } else {
    ierr = VecSetType(v,VECMPICUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*@C
   VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
   where the user provides the GPU array space to store the vector values.

   Collective on MPI_Comm

   Input Parameters:
+  comm  - the MPI communicator to use
.  bs    - block size, same meaning as VecSetBlockSize()
.  n     - local vector length, cannot be PETSC_DECIDE
.  N     - global vector length (or PETSC_DECIDE to have it calculated)
-  array - the user provided GPU array to store the vector values

   Output Parameter:
.  vv - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
   at a later stage to SET the array for storing the vector values.

   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
   The user should not free the array until the vector is destroyed.

   Level: intermediate

   Concepts: vectors^creating with array

.seealso: VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
          VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
          VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()

@*/
PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
  ierr = PetscSplitOwnership(comm,&n,&N);CHKERRQ(ierr);
  ierr = VecCreate(comm,vv);CHKERRQ(ierr);
  ierr = VecSetSizes(*vv,n,N);CHKERRQ(ierr);
  ierr = VecSetBlockSize(*vv,bs);CHKERRQ(ierr);
  ierr = VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
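
/* Example (editorial, not in the original file): a minimal sketch of wrapping a
   user-allocated device array, assuming n is the local length on each rank and
   the array holds at least n PetscScalars on the GPU.

     PetscScalar *gpuarray;
     cudaError_t  err;
     Vec          x;
     err  = cudaMalloc((void**)&gpuarray,n*sizeof(PetscScalar));CHKERRCUDA(err);
     ierr = VecCreateMPICUDAWithArray(PETSC_COMM_WORLD,1,n,PETSC_DETERMINE,gpuarray,&x);CHKERRQ(ierr);
     // ... use x ...
     ierr = VecDestroy(&x);CHKERRQ(ierr);   // PETSc does not free gpuarray
     err  = cudaFree(gpuarray);CHKERRCUDA(err);
*/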
PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUDA       *veccuda;

  PetscFunctionBegin;
  ierr = VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);CHKERRQ(ierr);

  vv->ops->dotnorm2               = VecDotNorm2_MPICUDA;
  vv->ops->waxpy                  = VecWAXPY_SeqCUDA;
  vv->ops->duplicate              = VecDuplicate_MPICUDA;
  vv->ops->dot                    = VecDot_MPICUDA;
  vv->ops->mdot                   = VecMDot_MPICUDA;
  vv->ops->tdot                   = VecTDot_MPICUDA;
  vv->ops->norm                   = VecNorm_MPICUDA;
  vv->ops->scale                  = VecScale_SeqCUDA;
  vv->ops->copy                   = VecCopy_SeqCUDA;
  vv->ops->set                    = VecSet_SeqCUDA;
  vv->ops->swap                   = VecSwap_SeqCUDA;
  vv->ops->axpy                   = VecAXPY_SeqCUDA;
  vv->ops->axpby                  = VecAXPBY_SeqCUDA;
  vv->ops->maxpy                  = VecMAXPY_SeqCUDA;
  vv->ops->aypx                   = VecAYPX_SeqCUDA;
  vv->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
  vv->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
  vv->ops->setrandom              = VecSetRandom_SeqCUDA;
  vv->ops->placearray             = VecPlaceArray_SeqCUDA;
  vv->ops->replacearray           = VecReplaceArray_SeqCUDA;
  vv->ops->resetarray             = VecResetArray_SeqCUDA;
  vv->ops->dot_local              = VecDot_SeqCUDA;
  vv->ops->tdot_local             = VecTDot_SeqCUDA;
  vv->ops->norm_local             = VecNorm_SeqCUDA;
  vv->ops->mdot_local             = VecMDot_SeqCUDA;
  vv->ops->destroy                = VecDestroy_MPICUDA;
  vv->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
  vv->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
  vv->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
  vv->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
  vv->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;

  /* Later, functions check for the existence of the Vec_CUDA structure, so create it only when allocation is requested or an array is supplied */
  if (alloc && !array) {
    ierr = VecCUDAAllocateCheck(vv);CHKERRQ(ierr);
    ierr = VecSet(vv,0.0);CHKERRQ(ierr);
  }
  if (array) {
    if (!vv->spptr) {
      /* Cannot use PetscNew() here because spptr is void* */
      ierr = PetscMalloc(sizeof(Vec_CUDA),&vv->spptr);CHKERRQ(ierr);
      veccuda = (Vec_CUDA*)vv->spptr;
      err = cudaStreamCreate(&veccuda->stream);CHKERRCUDA(err);
      veccuda->GPUarray_allocated             = 0;
      veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
      vv->valid_GPU_array                     = PETSC_OFFLOAD_UNALLOCATED;
    }
    veccuda = (Vec_CUDA*)vv->spptr;
    veccuda->GPUarray = (PetscScalar*)array;
  }

  PetscFunctionReturn(0);
}