Actual source code: vecscattercusp.cu
petsc-3.6.4 2016-04-12
/*
   Implements the various scatter operations on cusp vectors
*/

#define PETSC_SKIP_COMPLEX

#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
PETSC_CUDA_EXTERN_C_END
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

#include <cuda_runtime.h>
PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUSPIndices *ci) {
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_StoS stos_scatter;
  cudaError_t                err = cudaSuccess;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci          = new struct _p_PetscCUSPIndices;
  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->fslots   = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUSP_STRIDED;
    } else {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or a nonzero fromStep.");
    }
  }

  /* create the "to" indices */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUSP_STRIDED;
    } else {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or a nonzero toStep.");
    }
  }

  /* allocate the stream variable */
  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* get the maximum number of co-resident thread blocks */
  err = cudaGetDevice(&device);CHKERRCUSP((int)err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUSP((int)err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major >= 3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter     = (VecScatterCUSPIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_STOS;
  *ci = cci;
  return(0);
}
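/* Usage sketch (illustrative, not part of the original source): to scatter the
   first n entries of one sequential CUSP vector onto the first n entries of
   another with unit stride, a caller could pass no slot arrays so that the
   strided branch above is taken for both sides:

     PetscCUSPIndices ci;
     ierr = VecScatterCUSPIndicesCreate_StoS(n,0,0,1,1,NULL,NULL,&ci);CHKERRQ(ierr);

   Passing a NULL slot array together with a zero step is the one combination
   rejected above with PETSC_ERR_ARG_WRONGSTATE. */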
PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_PtoP ptop_scatter;

  cci          = new struct _p_PetscCUSPIndices;
  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

  /* this calculation assumes that the input indices are sorted */
  ptop_scatter->ns              = sendIndices[ns-1]-sendIndices[0]+1;
  ptop_scatter->sendLowestIndex = sendIndices[0];
  ptop_scatter->nr              = recvIndices[nr-1]-recvIndices[0]+1;
  ptop_scatter->recvLowestIndex = recvIndices[0];

  /* assign indices */
  cci->scatter     = (VecScatterCUSPIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_PTOP;

  *ci = cci;
  return(0);
}
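/* Worked example of the span arithmetic above (illustrative, not part of the
   original source): for sorted sendIndices = {3,5,9} with ns = 3, the packed
   send span is sendIndices[2]-sendIndices[0]+1 = 9-3+1 = 7 entries starting at
   sendLowestIndex = 3.  Unsorted input such as {9,5,3} would yield a negative
   span (3-9+1 = -5), which is why the sortedness assumption matters. */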
PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!ci || !(*ci)) return(0);
  try {
    if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
      delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t                err = cudaSuccess;
      VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}
/* Insert operator */
class Insert {
public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a;
  }
};

/* Add operator */
class Add {
public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a+b;
  }
};

/* Max operator; PetscMax is not defined for complex scalars, and this file is
   compiled with PETSC_SKIP_COMPLEX, so the complex branch is never reached */
class Max {
public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    return PetscMax(a,b);
#endif
  }
};
/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}
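/* All four kernels use the grid-stride loop idiom: each thread starts at its
   global thread index and advances by the total number of launched threads, so
   a grid of any size covers all n entries.  A minimal standalone sketch of the
   same pattern (illustrative, not part of the original source):

     __global__ void grid_stride_copy(PetscInt n,const PetscScalar *x,PetscScalar *y) {
       const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
       const int grid_size = gridDim.x*blockDim.x;
       for (int i = tidx; i < n; i += grid_size) y[i] = x[i];  // each thread handles every grid_size-th entry
     }
*/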
template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP) {
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads)) + 1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    }
  } else {
    /* SCATTER_REVERSE: the "to" indices are read from and the "from" indices written to */
    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    }
  }
}
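/* Illustrative arithmetic for the launch configuration above (not part of the
   original source): with nThreads = 128 and n = 1000, nBlocks is
   ceil(1000/128)+1 = 9; on a device reporting MAX_CORESIDENT_THREADS = 2048
   per multiprocessor the cap is 2048/128 = 16 blocks, so all 9 blocks are
   launched.  For larger n the cap takes effect and the grid-stride loops in
   the kernels pick up the remaining entries. */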
PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if      (addv == INSERT_VALUES) VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)    VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)    VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
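/* End-to-end lifecycle sketch (illustrative, not part of the original source;
   in normal use these routines are invoked through the VecScatter machinery
   rather than called directly):

     PetscCUSPIndices ci;
     ierr = VecScatterCUSPIndicesCreate_StoS(n,0,0,1,1,NULL,NULL,&ci);CHKERRQ(ierr);
     ierr = VecScatterCUSP_StoS(x,y,ci,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
     ierr = VecScatterCUSPIndicesDestroy(&ci);CHKERRQ(ierr);

   Every Create call must be paired with a Destroy call, which releases the GPU
   slot arrays and the CUDA stream allocated in VecScatterCUSPIndicesCreate_StoS. */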