Actual source code: vecscattercuda.cu

petsc-3.12.2 2019-11-22
  1: /*
  2:    Implements the various scatter operations on cuda vectors
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9:  #include <petsc/private/vecimpl.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 13: #include <cuda_runtime.h>

 15: PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep, PetscInt fromStep,PetscInt *tslots, PetscInt *fslots,PetscCUDAIndices *ci) {

 17:   PetscCUDAIndices           cci;
 18:   VecScatterCUDAIndices_StoS stos_scatter;
 19:   cudaError_t                err;
 20:   PetscInt                   *intVecGPU;
 21:   int                        device;
 22:   cudaDeviceProp             props;
 23:   PetscErrorCode             ierr;

 26:   cci = new struct _p_PetscCUDAIndices;
 27:   stos_scatter = new struct _p_VecScatterCUDAIndices_StoS;

 29:   /* create the "from" indices */
 30:   stos_scatter->fslots = 0;
 31:   stos_scatter->fromFirst = 0;
 32:   stos_scatter->fromStep = 0;
 33:   if (n) {
 34:     if (fslots) {
 35:       /* allocate GPU memory for the from-slots */
 36:       err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
 37:       err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 38:       ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

 40:       /* assign the pointer to the struct */
 41:       stos_scatter->fslots = intVecGPU;
 42:       stos_scatter->fromMode = VEC_SCATTER_CUDA_GENERAL;
 43:     } else if (fromStep) {
 44:       stos_scatter->fromFirst = fromFirst;
 45:       stos_scatter->fromStep = fromStep;
 46:       stos_scatter->fromMode = VEC_SCATTER_CUDA_STRIDED;
 47:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
 48:   }

 50:   /* create the "to" indices */
 51:   stos_scatter->tslots = 0;
 52:   stos_scatter->toFirst = 0;
 53:   stos_scatter->toStep = 0;
 54:   if (n) {
 55:     if (tslots) {
 56:       /* allocate GPU memory for the to-slots */
 57:       err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
 58:       err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 59:       ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

 61:       /* assign the pointer to the struct */
 62:       stos_scatter->tslots = intVecGPU;
 63:       stos_scatter->toMode = VEC_SCATTER_CUDA_GENERAL;
 64:     } else if (toStep) {
 65:       stos_scatter->toFirst = toFirst;
 66:       stos_scatter->toStep = toStep;
 67:       stos_scatter->toMode = VEC_SCATTER_CUDA_STRIDED;
 68:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
 69:   }

 71:   /* The scatter uses the default stream as well.
 72:      Note: a separate stream was used here in earlier versions.
 73:            However, without proper synchronization with the default stream, one inevitably runs into data races.
 74:            Please keep that in mind when trying to reintroduce streams for scatters.
 75:            Ultimately, it is better to have correct code/results at 90 percent of peak performance than incorrect code/results at 99 percent of peak performance. */
 76:   stos_scatter->stream = 0;

 78:   /* the number of indices */
 79:   stos_scatter->n = n;

 81:   /* query the device for the maximum number of co-resident threads per multiprocessor and a bound on resident blocks */
 82:   err = cudaGetDevice(&device);CHKERRCUDA(err);
 83:   err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
 84:   stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
 85:   if (props.major>=3) {
 86:     stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
 87:   } else {
 88:     stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
 89:   }

 91:   /* assign the indices */
 92:   cci->scatter = (VecScatterCUDAIndices_StoS)stos_scatter;
 93:   cci->scatterType = VEC_SCATTER_CUDA_STOS;
 94:   *ci = cci;
 95:   return(0);
 96: }
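
A minimal usage sketch (not part of this file; in practice the VecScatter setup code drives this routine): building indices for a purely strided scatter, where passing NULL for both slot arrays selects the strided code path on each side. The variable n is assumed to be the number of entries to scatter.

  PetscCUDAIndices ci;
  PetscErrorCode   ierr;

  /* read n consecutive entries (fromFirst = 0, fromStep = 1) and write them
     to every other slot of the destination (toFirst = 0, toStep = 2) */
  ierr = VecScatterCUDAIndicesCreate_StoS(n,0,0,2,1,NULL,NULL,&ci);CHKERRQ(ierr);
  /* ... perform scatters with VecScatterCUDA_StoS() ... */
  ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);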

 98: PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUDAIndices *ci)
 99: {
100:   PetscCUDAIndices           cci;
101:   VecScatterCUDAIndices_PtoP ptop_scatter;

104:   cci = new struct _p_PetscCUDAIndices;
105:   ptop_scatter = new struct _p_VecScatterCUDAIndices_PtoP;

107:   /* this calculation assumes that the input indices are sorted (see the sketch after this routine) */
108:   if (sendIndices) {
109:     ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
110:     ptop_scatter->sendLowestIndex = sendIndices[0];
111:   } else {
112:     ptop_scatter->ns = 0;
113:     ptop_scatter->sendLowestIndex = 0;
114:   }
115:   if (recvIndices) {
116:     ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
117:     ptop_scatter->recvLowestIndex = recvIndices[0];
118:   } else {
119:     ptop_scatter->nr = 0;
120:     ptop_scatter->recvLowestIndex = 0;
121:   }

123:   /* assign indices */
124:   cci->scatter = (VecScatterCUDAIndices_PtoP)ptop_scatter;
125:   cci->scatterType = VEC_SCATTER_CUDA_PTOP;

127:   *ci = cci;
128:   return(0);
129: }
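
As an illustration of the span computation above (a sketch only; the arrays and values are made up): sorted send indices {3,5,9} give ns = 9-3+1 = 7 and sendLowestIndex = 3, i.e. the length and offset of the contiguous block that has to be staged for communication. A hypothetical call would be:

  /* sorted local indices for the send and receive sides (illustrative values) */
  PetscInt         sendIdx[3] = {3,5,9};   /* ns = 9-3+1 = 7, sendLowestIndex = 3 */
  PetscInt         recvIdx[3] = {0,1,4};   /* nr = 4-0+1 = 5, recvLowestIndex = 0 */
  PetscCUDAIndices ci;
  PetscErrorCode   ierr;

  ierr = VecScatterCUDAIndicesCreate_PtoP(3,sendIdx,3,recvIdx,&ci);CHKERRQ(ierr);
  /* ... */
  ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);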

131: PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices *ci)
132: {
134:   if (*ci) {
135:     if ((*ci)->scatterType == VEC_SCATTER_CUDA_PTOP) {
136:       delete (VecScatterCUDAIndices_PtoP)(*ci)->scatter;
137:       (*ci)->scatter = 0;
138:     } else {
139:       cudaError_t err;
140:       VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)(*ci)->scatter;
141:       if (stos_scatter->fslots) {
142:         err = cudaFree(stos_scatter->fslots);CHKERRCUDA(err);
143:         stos_scatter->fslots = 0;
144:       }

146:       /* free the GPU memory for the to-slots */
147:       if (stos_scatter->tslots) {
148:         err = cudaFree(stos_scatter->tslots);CHKERRCUDA(err);
149:         stos_scatter->tslots = 0;
150:       }

152:       /* free the stream variable */
153:       if (stos_scatter->stream) {
154:         err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUDA(err);
155:         stos_scatter->stream = 0;
156:       }
157:       delete stos_scatter;
158:       (*ci)->scatter = 0;
159:     }
160:     delete *ci;
161:     *ci = 0;
162:   }
163:   return(0);
164: }

166: /* Insert operator */
167: class Insert {
168:   public:
169:     __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
170:       return a;
171:     }
172: };

174: /* Add operator */
175: class Add {
176:   public:
177:     __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
178:       return a+b;
179:     }
180: };

182: /* Max operator */
183: class Max {
184:   public:
185:     __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
186:       return PetscMax(PetscRealPart(a),PetscRealPart(b));
187:     }
188: };
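
The kernels below are templated on these functor classes, so a new combine rule only requires another functor with the same device call operator. For example, a hypothetical Min rule (not present in this file) would follow the same pattern as Max:

  /* hypothetical example: select the smaller (real) value, mirroring Max above */
  class Min {
    public:
      __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
        return PetscMin(PetscRealPart(a),PetscRealPart(b));
      }
  };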

190: /* Sequential general to sequential general GPU kernel */
191: template<class OPERATOR>
192: __global__ void VecScatterCUDA_SGtoSG_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
193:   const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
194:   const int grid_size = gridDim.x * blockDim.x;
195:   for (int i = tidx; i < n; i += grid_size) {
196:     y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
197:   }
198: }

200: /* Sequential general to sequential strided GPU kernel */
201: template<class OPERATOR>
202: __global__ void VecScatterCUDA_SGtoSS_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
203:   const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
204:   const int grid_size = gridDim.x * blockDim.x;
205:   for (int i = tidx; i < n; i += grid_size) {
206:     y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
207:   }
208: }

210: /* Sequential strided to sequential strided GPU kernel */
211: template<class OPERATOR>
212: __global__ void VecScatterCUDA_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
213:   const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
214:   const int grid_size = gridDim.x * blockDim.x;
215:   for (int i = tidx; i < n; i += grid_size) {
216:     y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
217:   }
218: }

220: /* Sequential strided to sequential general GPU kernel */
221: template<class OPERATOR>
222: __global__ void VecScatterCUDA_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
223:   const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
224:   const int grid_size = gridDim.x * blockDim.x;
225:   for (int i = tidx; i < n; i += grid_size) {
226:     y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
227:   }
228: }
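
All four kernels use the same grid-stride loop: a thread starts at its global index and advances by gridDim.x*blockDim.x, so a fixed-size grid covers any n. As a stand-alone sketch (assuming device arrays d_x, d_y and device-resident index arrays d_xind, d_yind of length n already exist), the general-to-general kernel could be launched on the default stream like this:

  /* gather d_x[d_xind[i]] into d_y[d_yind[i]] for i = 0..n-1, overwriting
     the destination entries (Insert functor) */
  dim3        block(128,1,1);
  dim3        grid((n+127)/128,1,1);
  cudaError_t err;

  VecScatterCUDA_SGtoSG_kernel<<<grid,block>>>(n,d_xind,d_x,d_yind,d_y,Insert());
  err = cudaGetLastError();CHKERRCUDA(err);
  err = cudaStreamSynchronize(0);CHKERRCUDA(err);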

230: template<class OPERATOR>
231: void VecScatterCUDA_StoS_Dispatcher(const PetscScalar *xarray,PetscScalar *yarray,PetscCUDAIndices ci,ScatterMode mode,OPERATOR OP) {

233:   PetscInt                   nBlocks=0,nThreads=128;
234:   VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;

236:   nBlocks=(int)ceil(((float) stos_scatter->n)/((float) nThreads))+1;
237:   if (nBlocks>stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
238:     nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
239:   }
240:   dim3 block(nThreads,1,1);
241:   dim3 grid(nBlocks,1,1);

243:   if (mode == SCATTER_FORWARD) {
244:     if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
245:       VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->tslots,yarray,OP);
246:     } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
247:       VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
248:     } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
249:       VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
250:     } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
251:       VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->tslots,yarray,OP);
252:     }
253:   } else {
254:     if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
255:       VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fslots,yarray,OP);
256:     } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
257:       VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
258:     } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
259:       VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
260:     } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
261:       VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fslots,yarray,OP);
262:     }
263:   }
264: }
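
For concreteness, a worked example of the launch configuration computed in the dispatcher above (the device limit is assumed):

  n = 10000, nThreads = 128
  nBlocks = ceil(10000/128) + 1 = 79 + 1 = 80
  MAX_CORESIDENT_THREADS/nThreads = 2048/128 = 16   (assuming maxThreadsPerMultiProcessor = 2048)

so nBlocks is clamped to 16, and the grid-stride loop in each kernel covers the remaining iterations.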

266: PetscErrorCode VecScatterCUDA_StoS(Vec x,Vec y,PetscCUDAIndices ci,InsertMode addv,ScatterMode mode)
267: {
268:   PetscErrorCode             ierr;
269:   const PetscScalar          *xarray;
270:   PetscScalar                *yarray;
271:   VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;
272:   cudaError_t                err;

275:   ierr = VecCUDAAllocateCheck(x);CHKERRQ(ierr);
276:   ierr = VecCUDAAllocateCheck(y);CHKERRQ(ierr);
277:   ierr = VecCUDAGetArrayRead(x,&xarray);CHKERRQ(ierr);
278:   ierr = VecCUDAGetArray(y,&yarray);CHKERRQ(ierr);
279:   if (stos_scatter->n) {
280:     if (addv == INSERT_VALUES)
281:       VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
282:     else if (addv == ADD_VALUES)
283:       VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
284:     else if (addv == MAX_VALUES)
285:       VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
286:     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
287:     err = cudaGetLastError();CHKERRCUDA(err);
288:     err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUDA(err);
289:   }
290:   ierr = VecCUDARestoreArrayRead(x,&xarray);CHKERRQ(ierr);
291:   ierr = VecCUDARestoreArray(y,&yarray);CHKERRQ(ierr);
292:   return(0);
293: }
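
Finally, an end-to-end sketch of how the pieces in this file fit together (illustrative only: these are internal routines normally driven by VecScatter, and the vector size and index values below are made up):

  Vec              x,y;
  PetscInt         from[4] = {0,1,2,3};   /* general "from" slots in x */
  PetscInt         to[4]   = {1,3,5,7};   /* general "to" slots in y   */
  PetscCUDAIndices ci;
  PetscErrorCode   ierr;

  ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,8,&x);CHKERRQ(ierr);
  ierr = VecCreateSeqCUDA(PETSC_COMM_SELF,8,&y);CHKERRQ(ierr);
  /* ... fill x and y ... */
  ierr = VecScatterCUDAIndicesCreate_StoS(4,0,0,0,0,to,from,&ci);CHKERRQ(ierr);
  ierr = VecScatterCUDA_StoS(x,y,ci,ADD_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);
  ierr = VecDestroy(&x);CHKERRQ(ierr);
  ierr = VecDestroy(&y);CHKERRQ(ierr);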