Actual source code: vecscattercuda.cu
petsc-3.14.5 2021-03-03
/*
   Implements the various scatter operations on cuda vectors
*/

#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/cudavecimpl.h>

#include <cuda_runtime.h>
PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_StoS stos_scatter;
  cudaError_t                err;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;
  PetscErrorCode             ierr;

  cci          = new struct _p_PetscCUDAIndices;
  stos_scatter = new struct _p_VecScatterCUDAIndices_StoS;
  /* create the "from" indices */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

      /* assign the pointer to the struct */
      stos_scatter->fslots   = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }
  /* create the "to" indices */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }
  /* The scatter uses the default stream as well.
     Note: a separate stream was used here in earlier versions.
     However, without proper synchronization with the default stream, one inevitably runs into data races.
     Please keep that in mind when trying to reintroduce streams for scatters.
     Ultimately, it is better to have correct code/results at 90 percent of peak performance than incorrect code/results at 99 percent of peak performance. */
  stos_scatter->stream = 0;
  /* the number of indices */
  stos_scatter->n = n;

  /* get the maximum number of coresident thread blocks */
  err = cudaGetDevice(&device);CHKERRCUDA(err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major >= 3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter     = (VecScatterCUDAIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_STOS;
  *ci = cci;
  return(0);
}
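A minimal usage sketch, not part of the PETSc source: the index counts, index arrays, and the helper name ExampleCreateStoSIndices are made-up assumptions. It shows the two ways the creation routine above can be driven, either with explicit index lists (general mode) or with first/step pairs (strided mode):

/* Illustrative sketch only; assumes the headers included at the top of this file. */
static PetscErrorCode ExampleCreateStoSIndices(void)
{
  PetscCUDAIndices ci_general,ci_strided;
  PetscInt         fslots[4] = {0,2,5,7};   /* hypothetical "from" locations */
  PetscInt         tslots[4] = {1,3,6,9};   /* hypothetical "to" locations   */
  PetscErrorCode   ierr;

  /* general-to-general: the explicit index lists are copied to the GPU */
  ierr = VecScatterCUDAIndicesCreate_StoS(4,0,0,0,0,tslots,fslots,&ci_general);CHKERRQ(ierr);

  /* strided-to-strided: no index arrays, only first/step pairs (to: 0,2,4,...  from: 0,3,6,...) */
  ierr = VecScatterCUDAIndicesCreate_StoS(4,0,0,2,3,NULL,NULL,&ci_strided);CHKERRQ(ierr);

  ierr = VecScatterCUDAIndicesDestroy(&ci_general);CHKERRQ(ierr);
  ierr = VecScatterCUDAIndicesDestroy(&ci_strided);CHKERRQ(ierr);
  return(0);
}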
PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_PtoP ptop_scatter;

  cci          = new struct _p_PetscCUDAIndices;
  ptop_scatter = new struct _p_VecScatterCUDAIndices_PtoP;

  /* this calculation assumes that the input indices are sorted */
  if (sendIndices) {
    ptop_scatter->ns              = sendIndices[ns-1]-sendIndices[0]+1;
    ptop_scatter->sendLowestIndex = sendIndices[0];
  } else {
    ptop_scatter->ns              = 0;
    ptop_scatter->sendLowestIndex = 0;
  }
  if (recvIndices) {
    ptop_scatter->nr              = recvIndices[nr-1]-recvIndices[0]+1;
    ptop_scatter->recvLowestIndex = recvIndices[0];
  } else {
    ptop_scatter->nr              = 0;
    ptop_scatter->recvLowestIndex = 0;
  }

  /* assign indices */
  cci->scatter     = (VecScatterCUDAIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_PTOP;

  *ci = cci;
  return(0);
}
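The computation above stores only the lowest index and the length of the contiguous span covering all indices; a short worked example with made-up values makes the sorted-input assumption concrete:

/* Worked example (hypothetical values): with sorted sendIndices = {3,4,7,9} and ns = 4,

     sendLowestIndex = sendIndices[0]       = 3
     ns              = sendIndices[3]-3+1   = 9-3+1 = 7

   i.e. the span [3,9] of length 7 is staged, including the unused entries 5, 6 and 8.
   If the indices were not sorted, sendIndices[ns-1] would not be the maximum and the
   span would come out too small. */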
PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices *ci)
{
  if (*ci) {
    if ((*ci)->scatterType == VEC_SCATTER_CUDA_PTOP) {
      delete (VecScatterCUDAIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err;
      VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUDA(err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUDA(err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUDA(err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  }
  return(0);
}
/* Insert operator */
class Insert {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a;
    }
};

/* Add operator */
class Add {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a+b;
    }
};
/* Max operator (compares the real parts of the two scalars) */
class Max {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return PetscMax(PetscRealPart(a),PetscRealPart(b));
    }
};
/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSG_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSS_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}
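The operator classes and the templated kernels compose: instantiating a kernel with a different functor only changes the combine step inside the grid-stride loop, and one specialized kernel is generated per functor type at compile time. A minimal sketch of the pattern (the helper name ExampleLaunches, the launch geometry, and the device pointers are placeholder assumptions; the real launches are issued by VecScatterCUDA_StoS_Dispatcher() below):

/* Illustrative sketch only: d_xind and d_yind are device index arrays, d_x and d_y
   device scalar arrays of length >= n. */
static void ExampleLaunches(PetscInt n,PetscInt *d_xind,const PetscScalar *d_x,PetscInt *d_yind,PetscScalar *d_y)
{
  dim3 block(128,1,1),grid(8,1,1);
  VecScatterCUDA_SGtoSG_kernel<<<grid,block>>>(n,d_xind,d_x,d_yind,d_y,Insert()); /* y[yind[i]] = x[xind[i]]              */
  VecScatterCUDA_SGtoSG_kernel<<<grid,block>>>(n,d_xind,d_x,d_yind,d_y,Add());    /* y[yind[i]] = x[xind[i]] + y[yind[i]] */
}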
template<class OPERATOR>
void VecScatterCUDA_StoS_Dispatcher(const PetscScalar *xarray,PetscScalar *yarray,PetscCUDAIndices ci,ScatterMode mode,OPERATOR OP)
{
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads)) + 1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->tslots,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->tslots,yarray,OP);
    }
  } else {
    if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fslots,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fslots,yarray,OP);
    }
  }
}
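The launch geometry is sized from the index count and then clamped; because the kernels use a grid-stride loop, a clamped grid still covers all n entries. A worked example with made-up numbers:

/* Worked example (hypothetical values): n = 10000, nThreads = 128,
   MAX_CORESIDENT_THREADS = 2048.

     nBlocks = ceil(10000/128) + 1 = 79 + 1 = 80
     cap     = 2048/128            = 16        -> nBlocks is clamped to 16

   Each of the 16*128 = 2048 launched threads then strides through the indices in
   steps of grid_size = 2048 (thread 0 handles i = 0, 2048, 4096, ...), so all
   10000 entries are still processed. */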
PetscErrorCode VecScatterCUDA_StoS(Vec x,Vec y,PetscCUDAIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  const PetscScalar          *xarray;
  PetscScalar                *yarray;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;
  cudaError_t                err;

  ierr = VecCUDAAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
    else if (addv == MAX_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUDA(err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(y,&yarray);CHKERRQ(ierr);
  return(0);
}
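To tie the pieces together, a hedged end-to-end sketch, not from the PETSc source: the helper name ExampleStridedScatter, the stride choices, and the vector length are assumptions. It creates strided indices, scatters forward entirely on the GPU, and destroys the indices:

/* Illustrative sketch: copy every second entry of a CUDA vector x into the first
   n entries of a CUDA vector y. Assumes x has at least 2*n entries, y at least n,
   and both are sequential CUDA vectors created elsewhere. */
static PetscErrorCode ExampleStridedScatter(Vec x,Vec y,PetscInt n)
{
  PetscCUDAIndices ci;
  PetscErrorCode   ierr;

  /* from: indices 0,2,4,...   to: indices 0,1,2,... */
  ierr = VecScatterCUDAIndicesCreate_StoS(n,0,0,1,2,NULL,NULL,&ci);CHKERRQ(ierr);
  ierr = VecScatterCUDA_StoS(x,y,ci,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);
  return(0);
}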