Actual source code: vecscattercusp.cu

petsc-3.6.4 2016-04-12
/*
   Implements the various scatter operations on cusp vectors
*/

#define PETSC_SKIP_COMPLEX
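/* Defining PETSC_SKIP_COMPLEX before including any PETSc header keeps the
   complex-scalar code paths out of this translation unit (see the Max
   functor below, which is only meaningful for real scalars). */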

#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
PETSC_CUDA_EXTERN_C_END
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

#include <cuda_runtime.h>

PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_StoS stos_scatter;
  cudaError_t                err = cudaSuccess;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci = new struct _p_PetscCUSPIndices;
  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->fslots = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep = fromStep;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_STRIDED;
    } else {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
    }
  }

  /* create the "to" indices */
  stos_scatter->tslots = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep = toStep;
      stos_scatter->toMode = VEC_SCATTER_CUSP_STRIDED;
    } else {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
    }
  }

  /* allocate the stream variable */
  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* determine the kernel launch limits: the maximum number of co-resident
     threads per multiprocessor and of thread blocks on the device */
  err = cudaGetDevice(&device);CHKERRCUSP((int)err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUSP((int)err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter = (VecScatterCUSPIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_STOS;
  *ci = cci;
  return(0);
}
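
/* Illustrative usage sketch (variable names hypothetical, not from this file):
   build indices for copying n contiguous entries, i.e. unit stride on both
   sides starting at offset 0, by passing NULL slot arrays so the strided
   branches are taken:

     PetscCUSPIndices ci;
     ierr = VecScatterCUSPIndicesCreate_StoS(n,0,0,1,1,NULL,NULL,&ci);CHKERRQ(ierr);
     ... perform scatters ...
     ierr = VecScatterCUSPIndicesDestroy(&ci);CHKERRQ(ierr);
*/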

PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_PtoP ptop_scatter;

  cci = new struct _p_PetscCUSPIndices;
  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

  /* this calculation assumes that the input index arrays are nonempty and sorted in ascending order */
  ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
  ptop_scatter->sendLowestIndex = sendIndices[0];
  ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
  ptop_scatter->recvLowestIndex = recvIndices[0];
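
  /* e.g. (illustrative): sorted sendIndices {3,5,9} give sendLowestIndex = 3 and
     ns = 9-3+1 = 7, the extent of the contiguous range covering every send entry */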

  /* assign indices */
  cci->scatter = (VecScatterCUSPIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_PTOP;

  *ci = cci;
  return(0);
}
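
/* Note: only the contiguous send/receive ranges are recorded here; the actual
   parallel (PtoP) communication is driven by PETSc's MPI scatter code elsewhere. */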

PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!ci || !(*ci)) return(0);
  try {
    if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
      delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err = cudaSuccess;
      VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}

/* Insert operator */
class Insert {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a;
  }
};

/* Add operator */
class Add {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a+b;
  }
};

/* Max operator (real scalars only; complex is excluded via PETSC_SKIP_COMPLEX above) */
class Max {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    return PetscMax(a,b);
#endif
  }
};
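
/* The four kernels below share a grid-stride loop: each thread starts at its
   global index and advances by the total number of launched threads, so any
   grid size covers all n entries. OP is one of the device functors above
   (Insert/Add/Max), combining the incoming value with the destination value. */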

/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}

template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP)
{
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;

  nBlocks=(int)ceil(((float) stos_scatter->n)/((float) nThreads))+1;
  if (nBlocks>stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
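  /* Illustrative numbers (assumed, not from this file): with n = 1000 and
     nThreads = 128, ceil(1000/128) = 8, +1 gives 9 blocks; on a device with
     MAX_CORESIDENT_THREADS = 2048 the cap is 2048/128 = 16, so 9 is kept.
     The grid-stride loops above tolerate any capped grid size. */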
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    /* forward scatter: read through the "from" indices, write through the "to" indices */
    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    }
  } else {
    /* reverse scatter: the roles of the "to" and "from" indices are swapped */
    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    }
  }
}

PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
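
/* Illustrative usage sketch (x, y, ci hypothetical): a forward scatter that
   inserts entries of x into y according to the indices recorded in ci:

     ierr = VecScatterCUSP_StoS(x,y,ci,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
*/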