Actual source code: veccuda2.cu

petsc-3.8.3 2017-12-09
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <petsc/private/vecimpl.h>
  9:  #include <../src/vec/vec/impls/dvecimpl.h>
 10:  #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>

 17: /*
 18:     Allocates space for the vector array on the GPU if it does not exist.
 19:     Does NOT change the PetscCUDAFlag for the vector
 20:     Does NOT zero the CUDA array

 22:  */
 23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 24: {
 26:   cudaError_t    err;
 27:   cudaStream_t   stream;
 28:   Vec_CUDA       *veccuda;

 31:   if (!v->spptr) {
 32:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 33:     veccuda = (Vec_CUDA*)v->spptr;
 34:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 35:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 36:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 37:     veccuda->stream = stream;
 38:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 39:     if (v->valid_GPU_array == PETSC_CUDA_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->valid_GPU_array = PETSC_CUDA_CPU;
 42:       } else {
 43:         v->valid_GPU_array = PETSC_CUDA_GPU;
 44:       }
 45:     }
 46:   }
 47:   return(0);
 48: }

 50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 51: PetscErrorCode VecCUDACopyToGPU(Vec v)
 52: {
 54:   cudaError_t    err;
 55:   Vec_CUDA       *veccuda;
 56:   PetscScalar    *varray;

 60:   VecCUDAAllocateCheck(v);
 61:   if (v->valid_GPU_array == PETSC_CUDA_CPU) {
 62:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 63:     veccuda=(Vec_CUDA*)v->spptr;
 64:     varray=veccuda->GPUarray;
 65:     err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 66:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 67:     v->valid_GPU_array = PETSC_CUDA_BOTH;
 68:   }
 69:   return(0);
 70: }

 72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci)
 73: {
 74:   PetscScalar    *varray;
 76:   cudaError_t    err;
 77:   PetscScalar    *cpuPtr, *gpuPtr;
 78:   Vec_Seq        *s;
 79:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;

 83:   VecCUDAAllocateCheck(v);
 84:   if (v->valid_GPU_array == PETSC_CUDA_CPU) {
 85:     s = (Vec_Seq*)v->data;

 87:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 88:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 89:     gpuPtr = varray + ptop_scatter->recvLowestIndex;
 90:     cpuPtr = s->array + ptop_scatter->recvLowestIndex;

 92:     /* Note : this code copies the smallest contiguous chunk of data
 93:        containing ALL of the indices */
 94:     err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

 96:     // Set the buffer states
 97:     v->valid_GPU_array = PETSC_CUDA_BOTH;
 98:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
 99:   }
100:   return(0);
101: }


104: /*
105:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
106: */
107: PetscErrorCode VecCUDACopyFromGPU(Vec v)
108: {
110:   cudaError_t    err;
111:   Vec_CUDA       *veccuda;
112:   PetscScalar    *varray;

116:   VecCUDAAllocateCheckHost(v);
117:   if (v->valid_GPU_array == PETSC_CUDA_GPU) {
118:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
119:     veccuda=(Vec_CUDA*)v->spptr;
120:     varray=veccuda->GPUarray;
121:     err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
122:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
123:     v->valid_GPU_array = PETSC_CUDA_BOTH;
124:   }
125:   return(0);
126: }

128: /* Note that this function only copies *some* of the values up from the GPU to CPU,
129:    which means that we need to recombine the data at some point before using any of the standard functions.
130:    We could add another few flag-types to keep track of this, or treat things like VecGetArray()/VecRestoreArray()
131:    where the calls always have to come in pairs
132: */
133: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci)
134: {
135:   const PetscScalar *varray, *gpuPtr;
136:   PetscErrorCode    ierr;
137:   cudaError_t       err;
138:   PetscScalar       *cpuPtr;
139:   Vec_Seq           *s;
140:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;

144:   VecCUDAAllocateCheckHost(v);
145:   if (v->valid_GPU_array == PETSC_CUDA_GPU) {
146:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);

148:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
149:     s = (Vec_Seq*)v->data;
150:     gpuPtr = varray + ptop_scatter->sendLowestIndex;
151:     cpuPtr = s->array + ptop_scatter->sendLowestIndex;

153:     /* Note : this code copies the smallest contiguous chunk of data
154:        containing ALL of the indices */
155:     err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);

157:     VecCUDARestoreArrayRead(v,&varray);
158:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
159:   }
160:   return(0);
161: }

163: /*MC
164:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

166:    Options Database Keys:
167: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

169:   Level: beginner

171: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
172: M*/
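/* Illustrative usage sketch (added for exposition; not part of veccuda2.cu): creating a
   VECSEQCUDA vector from application code. The helper name is hypothetical; it assumes
   PETSc was configured with CUDA support. */
static PetscErrorCode CreateSeqCUDAVecExample(PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate(PETSC_COMM_SELF,v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUDA);CHKERRQ(ierr);  /* or use -vec_type seqcuda with VecSetFromOptions() */
  ierr = VecSet(*v,1.0);CHKERRQ(ierr);             /* runs VecSet_SeqCUDA on the device */
  PetscFunctionReturn(0);
}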

174: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
175: {
176:   const PetscScalar *xarray;
177:   PetscScalar       *yarray;
178:   PetscErrorCode    ierr;
179:   PetscBLASInt      one=1,bn;
180:   PetscScalar       sone=1.0;
181:   cublasStatus_t    cberr;
182:   cudaError_t       err;

185:   PetscBLASIntCast(yin->map->n,&bn);
186:   VecCUDAGetArrayRead(xin,&xarray);
187:   VecCUDAGetArrayReadWrite(yin,&yarray);
188:   if (alpha == (PetscScalar)0.0) {
189:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
190:   } else if (alpha == (PetscScalar)1.0) {
191:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
192:     PetscLogFlops(2.0*yin->map->n);
193:   } else {
194:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
195:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
196:     PetscLogFlops(2.0*yin->map->n);
197:   }
198:   WaitForGPU();CHKERRCUDA(ierr);
199:   VecCUDARestoreArrayRead(xin,&xarray);
200:   VecCUDARestoreArrayReadWrite(yin,&yarray);
201:   return(0);
202: }

204: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
205: {
206:   const PetscScalar *xarray;
207:   PetscScalar       *yarray;
208:   PetscErrorCode    ierr;
209:   PetscBLASInt      one=1,bn;
210:   cublasStatus_t    cberr;

213:   if (alpha != (PetscScalar)0.0) {
214:     PetscBLASIntCast(yin->map->n,&bn);
215:     VecCUDAGetArrayRead(xin,&xarray);
216:     VecCUDAGetArrayReadWrite(yin,&yarray);
217:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
218:     WaitForGPU();CHKERRCUDA(ierr);
219:     VecCUDARestoreArrayRead(xin,&xarray);
220:     VecCUDARestoreArrayReadWrite(yin,&yarray);
221:     PetscLogFlops(2.0*yin->map->n);
222:   }
223:   return(0);
224: }

226: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
227: {
228:   PetscInt                              n = xin->map->n;
229:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
230:   PetscScalar                           *warray=NULL;
231:   thrust::device_ptr<const PetscScalar> xptr,yptr;
232:   thrust::device_ptr<PetscScalar>       wptr;
233:   PetscErrorCode                        ierr;

236:   VecCUDAGetArrayWrite(win,&warray);
237:   VecCUDAGetArrayRead(xin,&xarray);
238:   VecCUDAGetArrayRead(yin,&yarray);
239:   try {
240:     wptr = thrust::device_pointer_cast(warray);
241:     xptr = thrust::device_pointer_cast(xarray);
242:     yptr = thrust::device_pointer_cast(yarray);
243:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
244:     WaitForGPU();CHKERRCUDA(ierr);
245:   } catch (char *ex) {
246:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
247:   }
248:   PetscLogFlops(n);
249:   VecCUDARestoreArrayRead(xin,&xarray);
250:   VecCUDARestoreArrayRead(yin,&yarray);
251:   VecCUDARestoreArrayWrite(win,&warray);
252:   return(0);
253: }

255: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
256: {
257:   const PetscScalar *xarray=NULL,*yarray=NULL;
258:   PetscScalar       *warray=NULL;
259:   PetscErrorCode    ierr;
260:   PetscBLASInt      one=1,bn;
261:   cublasStatus_t    cberr;
262:   cudaError_t       err;

265:   PetscBLASIntCast(win->map->n,&bn);
266:   if (alpha == (PetscScalar)0.0) {
267:     VecCopy_SeqCUDA(yin,win);
268:   } else {
269:     VecCUDAGetArrayRead(xin,&xarray);
270:     VecCUDAGetArrayRead(yin,&yarray);
271:     VecCUDAGetArrayWrite(win,&warray);
272:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
273:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
274:     PetscLogFlops(2*win->map->n);
275:     WaitForGPU();CHKERRCUDA(ierr);
276:     VecCUDARestoreArrayRead(xin,&xarray);
277:     VecCUDARestoreArrayRead(yin,&yarray);
278:     VecCUDARestoreArrayWrite(win,&warray);
279:   }
280:   return(0);
281: }

283: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
284: {
286:   PetscInt       n = xin->map->n,j,j_rem;
287:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

290:   PetscLogFlops(nv*2.0*n);
291:   switch (j_rem=nv&0x3) {
292:     case 3:
293:       alpha0 = alpha[0];
294:       alpha1 = alpha[1];
295:       alpha2 = alpha[2];
296:       alpha += 3;
297:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
298:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
299:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
300:       y   += 3;
301:       break;
302:     case 2:
303:       alpha0 = alpha[0];
304:       alpha1 = alpha[1];
305:       alpha +=2;
306:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
307:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
308:       y +=2;
309:       break;
310:     case 1:
311:       alpha0 = *alpha++;
312:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
313:       y     +=1;
314:       break;
315:   }
316:   for (j=j_rem; j<nv; j+=4) {
317:     alpha0 = alpha[0];
318:     alpha1 = alpha[1];
319:     alpha2 = alpha[2];
320:     alpha3 = alpha[3];
321:     alpha += 4;
322:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
323:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
324:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
325:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
326:     y   += 4;
327:   }
328:   WaitForGPU();CHKERRCUDA(ierr);
329:   return(0);
330: }

332: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
333: {
334:   const PetscScalar *xarray,*yarray;
335:   PetscErrorCode    ierr;
336:   PetscBLASInt      one=1,bn;
337:   cublasStatus_t    cberr;

340:   PetscBLASIntCast(yin->map->n,&bn);
341:   VecCUDAGetArrayRead(xin,&xarray);
342:   VecCUDAGetArrayRead(yin,&yarray);
343:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
344:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
345:   WaitForGPU();CHKERRCUDA(ierr);
346:   if (xin->map->n >0) {
347:     PetscLogFlops(2.0*xin->map->n-1);
348:   }
349:   VecCUDARestoreArrayRead(xin,&xarray);
350:   VecCUDARestoreArrayRead(yin,&yarray);
351:   return(0);
352: }

354: //
355: // CUDA kernels for MDot to follow
356: //

358: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
359: #define MDOT_WORKGROUP_SIZE 128
360: #define MDOT_WORKGROUP_NUM  128
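// Reduction layout used by the kernels below (comment added for clarity): each of the
// MDOT_WORKGROUP_NUM thread blocks writes one partial sum per y-vector into
// group_results, stored as group_results[block + j*gridDim.x] for the j-th vector.
// VecMDot_SeqCUDA() then copies these partial sums to the host and finishes the
// reduction there by summing MDOT_WORKGROUP_NUM entries per vector.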

362: #if !defined(PETSC_USE_COMPLEX)
363: // M = 2:
364: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
365:                                         PetscInt size, PetscScalar *group_results)
366: {
367:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
368:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
369:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
370:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
371:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

373:   PetscScalar entry_x    = 0;
374:   PetscScalar group_sum0 = 0;
375:   PetscScalar group_sum1 = 0;
376:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
377:     entry_x     = x[i];   // load only once from global memory!
378:     group_sum0 += entry_x * y0[i];
379:     group_sum1 += entry_x * y1[i];
380:   }
381:   tmp_buffer[threadIdx.x]                       = group_sum0;
382:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

384:   // parallel reduction
385:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
386:     __syncthreads();
387:     if (threadIdx.x < stride) {
388:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
389:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
390:     }
391:   }

393:   // write result of group to group_results
394:   if (threadIdx.x == 0) {
395:     group_results[blockIdx.x]             = tmp_buffer[0];
396:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
397:   }
398: }

400: // M = 3:
401: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
402:                                         PetscInt size, PetscScalar *group_results)
403: {
404:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
405:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
406:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
407:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
408:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

410:   PetscScalar entry_x    = 0;
411:   PetscScalar group_sum0 = 0;
412:   PetscScalar group_sum1 = 0;
413:   PetscScalar group_sum2 = 0;
414:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
415:     entry_x     = x[i];   // load only once from global memory!
416:     group_sum0 += entry_x * y0[i];
417:     group_sum1 += entry_x * y1[i];
418:     group_sum2 += entry_x * y2[i];
419:   }
420:   tmp_buffer[threadIdx.x]                           = group_sum0;
421:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
422:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

424:   // parallel reduction
425:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
426:     __syncthreads();
427:     if (threadIdx.x < stride) {
428:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
429:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
430:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
431:     }
432:   }

434:   // write result of group to group_results
435:   if (threadIdx.x == 0) {
436:     group_results[blockIdx.x                ] = tmp_buffer[0];
437:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
438:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
439:   }
440: }

442: // M = 4:
443: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
444:                                         PetscInt size, PetscScalar *group_results)
445: {
446:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
447:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
448:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
449:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
450:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

452:   PetscScalar entry_x    = 0;
453:   PetscScalar group_sum0 = 0;
454:   PetscScalar group_sum1 = 0;
455:   PetscScalar group_sum2 = 0;
456:   PetscScalar group_sum3 = 0;
457:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
458:     entry_x     = x[i];   // load only once from global memory!
459:     group_sum0 += entry_x * y0[i];
460:     group_sum1 += entry_x * y1[i];
461:     group_sum2 += entry_x * y2[i];
462:     group_sum3 += entry_x * y3[i];
463:   }
464:   tmp_buffer[threadIdx.x]                           = group_sum0;
465:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
466:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
467:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

469:   // parallel reduction
470:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
471:     __syncthreads();
472:     if (threadIdx.x < stride) {
473:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
474:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
475:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
476:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
477:     }
478:   }

480:   // write result of group to group_results
481:   if (threadIdx.x == 0) {
482:     group_results[blockIdx.x                ] = tmp_buffer[0];
483:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
484:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
485:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
486:   }
487: }

489: // M = 8:
490: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
491:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
492:                                           PetscInt size, PetscScalar *group_results)
493: {
494:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
495:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
496:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
497:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
498:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

500:   PetscScalar entry_x    = 0;
501:   PetscScalar group_sum0 = 0;
502:   PetscScalar group_sum1 = 0;
503:   PetscScalar group_sum2 = 0;
504:   PetscScalar group_sum3 = 0;
505:   PetscScalar group_sum4 = 0;
506:   PetscScalar group_sum5 = 0;
507:   PetscScalar group_sum6 = 0;
508:   PetscScalar group_sum7 = 0;
509:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
510:     entry_x     = x[i];   // load only once from global memory!
511:     group_sum0 += entry_x * y0[i];
512:     group_sum1 += entry_x * y1[i];
513:     group_sum2 += entry_x * y2[i];
514:     group_sum3 += entry_x * y3[i];
515:     group_sum4 += entry_x * y4[i];
516:     group_sum5 += entry_x * y5[i];
517:     group_sum6 += entry_x * y6[i];
518:     group_sum7 += entry_x * y7[i];
519:   }
520:   tmp_buffer[threadIdx.x]                           = group_sum0;
521:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
522:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
523:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
524:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
525:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
526:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
527:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

529:   // parallel reduction
530:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
531:     __syncthreads();
532:     if (threadIdx.x < stride) {
533:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
534:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
535:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
536:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
537:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
538:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
539:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
540:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
541:     }
542:   }

544:   // write result of group to group_results
545:   if (threadIdx.x == 0) {
546:     group_results[blockIdx.x                ] = tmp_buffer[0];
547:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
548:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
549:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
550:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
551:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
552:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
553:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
554:   }
555: }
556: #endif /* !defined(PETSC_USE_COMPLEX) */

558: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
559: {
560:   PetscErrorCode    ierr;
561:   PetscInt          i,n = xin->map->n,current_y_index = 0;
562:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
563:   PetscScalar       *group_results_gpu;
564: #if !defined(PETSC_USE_COMPLEX)
565:   PetscInt          j;
566:   PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
567: #endif
568:   cudaError_t    cuda_ierr;
569:   PetscBLASInt   one=1,bn;
570:   cublasStatus_t cberr;

573:   PetscBLASIntCast(xin->map->n,&bn);
574:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
575:   /* Handle the case of local size zero first */
576:   if (!xin->map->n) {
577:     for (i=0; i<nv; ++i) z[i] = 0;
578:     return(0);
579:   }

581:   // allocate scratchpad memory for the results of individual work groups:
582:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

584:   VecCUDAGetArrayRead(xin,&xptr);

586:   while (current_y_index < nv)
587:   {
588:     switch (nv - current_y_index) {

590:       case 7:
591:       case 6:
592:       case 5:
593:       case 4:
594:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
595:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
596:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
597:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);

599: #if defined(PETSC_USE_COMPLEX)
600:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
601:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
602:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
603:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
604: #else
605:         // run kernel:
606:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

608:         // copy results back to CPU:
609:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

611:         // sum group results into z:
612:         for (j=0; j<4; ++j) {
613:           z[current_y_index + j] = 0;
614:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
615:         }
616: #endif
617:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
618:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
619:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
620:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
621:         current_y_index += 4;
622:         break;

624:       case 3:
625:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
626:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
627:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

629: #if defined(PETSC_USE_COMPLEX)
630:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
631:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
632:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
633: #else
634:         // run kernel:
635:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

637:         // copy results back to CPU:
638:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

640:         // sum group results into z:
641:         for (j=0; j<3; ++j) {
642:           z[current_y_index + j] = 0;
643:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
644:         }
645: #endif

647:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
648:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
649:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
650:         current_y_index += 3;
651:         break;

653:       case 2:
654:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
655:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);

657: #if defined(PETSC_USE_COMPLEX)
658:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
659:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
660: #else
661:         // run kernel:
662:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

664:         // copy results back to CPU:
665:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

667:         // sum group results into z:
668:         for (j=0; j<2; ++j) {
669:           z[current_y_index + j] = 0;
670:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
671:         }
672: #endif
673:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
674:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
675:         current_y_index += 2;
676:         break;

678:       case 1:
679:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
680:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
681:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
682:         current_y_index += 1;
683:         break;

685:       default: // 8 or more vectors left
686:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
687:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
688:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
689:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
690:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
691:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
692:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
693:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);

695: #if defined(PETSC_USE_COMPLEX)
696:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
697:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
698:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
699:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
700:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
701:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
702:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
703:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
704: #else
705:         // run kernel:
706:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

708:         // copy results back to CPU:
709:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

711:         // sum group results into z:
712:         for (j=0; j<8; ++j) {
713:           z[current_y_index + j] = 0;
714:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
715:         }
716: #endif
717:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
718:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
719:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
720:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
721:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
722:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
723:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
724:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
725:         current_y_index += 8;
726:         break;
727:     }
728:   }
729:   VecCUDARestoreArrayRead(xin,&xptr);

731:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
732:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
733:   return(0);
734: }

736: #undef MDOT_WORKGROUP_SIZE
737: #undef MDOT_WORKGROUP_NUM

739: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
740: {
741:   PetscInt                        n = xin->map->n;
742:   PetscScalar                     *xarray=NULL;
743:   thrust::device_ptr<PetscScalar> xptr;
744:   PetscErrorCode                  ierr;
745:   cudaError_t                     err;

748:   VecCUDAGetArrayWrite(xin,&xarray);
749:   if (alpha == (PetscScalar)0.0) {
750:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
751:   } else {
752:     try {
753:       xptr = thrust::device_pointer_cast(xarray);
754:       thrust::fill(xptr,xptr+n,alpha);
755:     } catch (char *ex) {
756:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
757:     }
758:   }
759:   WaitForGPU();CHKERRCUDA(ierr);
760:   VecCUDARestoreArrayWrite(xin,&xarray);
761:   return(0);
762: }

764: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
765: {
766:   PetscScalar    *xarray;
768:   PetscBLASInt   one=1,bn;
769:   cublasStatus_t cberr;

772:   if (alpha == (PetscScalar)0.0) {
773:     VecSet_SeqCUDA(xin,alpha);
774:   } else if (alpha != (PetscScalar)1.0) {
775:     PetscBLASIntCast(xin->map->n,&bn);
776:     VecCUDAGetArrayReadWrite(xin,&xarray);
777:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
778:     VecCUDARestoreArrayReadWrite(xin,&xarray);
779:   }
780:   WaitForGPU();CHKERRCUDA(ierr);
781:   PetscLogFlops(xin->map->n);
782:   return(0);
783: }

785: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
786: {
787:   const PetscScalar *xarray,*yarray;
788:   PetscErrorCode    ierr;
789:   PetscBLASInt      one=1,bn;
790:   cublasStatus_t    cberr;

793:   PetscBLASIntCast(xin->map->n,&bn);
794:   VecCUDAGetArrayRead(xin,&xarray);
795:   VecCUDAGetArrayRead(yin,&yarray);
796:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
797:   WaitForGPU();CHKERRCUDA(ierr);
798:   if (xin->map->n > 0) {
799:     PetscLogFlops(2.0*xin->map->n-1);
800:   }
801:   VecCUDARestoreArrayRead(yin,&yarray);
802:   VecCUDARestoreArrayRead(xin,&xarray);
803:   return(0);
804: }

806: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
807: {
808:   const PetscScalar *xarray;
809:   PetscScalar       *yarray;
810:   PetscErrorCode    ierr;
811:   cudaError_t       err;

814:   if (xin != yin) {
815:     if (xin->valid_GPU_array == PETSC_CUDA_GPU) {
816:       VecCUDAGetArrayRead(xin,&xarray);
817:       VecCUDAGetArrayWrite(yin,&yarray);
818:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
819:       WaitForGPU();CHKERRCUDA(ierr);
820:       VecCUDARestoreArrayRead(xin,&xarray);
821:       VecCUDARestoreArrayWrite(yin,&yarray);

823:     } else if (xin->valid_GPU_array == PETSC_CUDA_CPU) {
824:       /* copy in CPU if we are on the CPU*/
825:       VecCopy_SeqCUDA_Private(xin,yin);
826:     } else if (xin->valid_GPU_array == PETSC_CUDA_BOTH) {
827:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
828:       if (yin->valid_GPU_array == PETSC_CUDA_CPU) {
829:         /* copy in CPU */
830:         VecCopy_SeqCUDA_Private(xin,yin);

832:       } else if (yin->valid_GPU_array == PETSC_CUDA_GPU) {
833:         /* copy in GPU */
834:         VecCUDAGetArrayRead(xin,&xarray);
835:         VecCUDAGetArrayWrite(yin,&yarray);
836:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
837:         VecCUDARestoreArrayRead(xin,&xarray);
838:         VecCUDARestoreArrayWrite(yin,&yarray);
839:       } else if (yin->valid_GPU_array == PETSC_CUDA_BOTH) {
840:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
841:            default to copy in GPU (this is an arbitrary choice) */
842:         VecCUDAGetArrayRead(xin,&xarray);
843:         VecCUDAGetArrayWrite(yin,&yarray);
844:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
845:         VecCUDARestoreArrayRead(xin,&xarray);
846:         VecCUDARestoreArrayWrite(yin,&yarray);
847:       } else {
848:         VecCopy_SeqCUDA_Private(xin,yin);
849:       }
850:     }
851:   }
852:   return(0);
853: }

855: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
856: {
858:   PetscBLASInt   one = 1,bn;
859:   PetscScalar    *xarray,*yarray;
860:   cublasStatus_t cberr;

863:   PetscBLASIntCast(xin->map->n,&bn);
864:   if (xin != yin) {
865:     VecCUDAGetArrayReadWrite(xin,&xarray);
866:     VecCUDAGetArrayReadWrite(yin,&yarray);
867:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
868:     WaitForGPU();CHKERRCUDA(ierr);
869:     VecCUDARestoreArrayReadWrite(xin,&xarray);
870:     VecCUDARestoreArrayReadWrite(yin,&yarray);
871:   }
872:   return(0);
873: }

875: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
876: {
877:   PetscErrorCode    ierr;
878:   PetscScalar       a = alpha,b = beta;
879:   const PetscScalar *xarray;
880:   PetscScalar       *yarray;
881:   PetscBLASInt      one = 1, bn;
882:   cublasStatus_t    cberr;
883:   cudaError_t       err;

886:   PetscBLASIntCast(yin->map->n,&bn);
887:   if (a == (PetscScalar)0.0) {
888:     VecScale_SeqCUDA(yin,beta);
889:   } else if (b == (PetscScalar)1.0) {
890:     VecAXPY_SeqCUDA(yin,alpha,xin);
891:   } else if (a == (PetscScalar)1.0) {
892:     VecAYPX_SeqCUDA(yin,beta,xin);
893:   } else if (b == (PetscScalar)0.0) {
894:     VecCUDAGetArrayRead(xin,&xarray);
895:     VecCUDAGetArrayReadWrite(yin,&yarray);
896:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
897:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
898:     PetscLogFlops(xin->map->n);
899:     WaitForGPU();CHKERRCUDA(ierr);
900:     VecCUDARestoreArrayRead(xin,&xarray);
901:     VecCUDARestoreArrayReadWrite(yin,&yarray);
902:   } else {
903:     VecCUDAGetArrayRead(xin,&xarray);
904:     VecCUDAGetArrayReadWrite(yin,&yarray);
905:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
906:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
907:     VecCUDARestoreArrayRead(xin,&xarray);
908:     VecCUDARestoreArrayReadWrite(yin,&yarray);
909:     WaitForGPU();CHKERRCUDA(ierr);
910:     PetscLogFlops(3.0*xin->map->n);
911:   }
912:   return(0);
913: }

915: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
916: {
918:   PetscInt       n = zin->map->n;

921:   if (gamma == (PetscScalar)1.0) {
922:     /* z = ax + b*y + z */
923:     VecAXPY_SeqCUDA(zin,alpha,xin);
924:     VecAXPY_SeqCUDA(zin,beta,yin);
925:     PetscLogFlops(4.0*n);
926:   } else {
927:     /* z = a*x + b*y + c*z */
928:     VecScale_SeqCUDA(zin,gamma);
929:     VecAXPY_SeqCUDA(zin,alpha,xin);
930:     VecAXPY_SeqCUDA(zin,beta,yin);
931:     PetscLogFlops(5.0*n);
932:   }
933:   WaitForGPU();CHKERRCUDA(ierr);
934:   return(0);
935: }

937: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
938: {
939:   PetscInt                              n = win->map->n;
940:   const PetscScalar                     *xarray,*yarray;
941:   PetscScalar                           *warray;
942:   thrust::device_ptr<const PetscScalar> xptr,yptr;
943:   thrust::device_ptr<PetscScalar>       wptr;
944:   PetscErrorCode                        ierr;

947:   VecCUDAGetArrayReadWrite(win,&warray);
948:   VecCUDAGetArrayRead(xin,&xarray);
949:   VecCUDAGetArrayRead(yin,&yarray);
950:   try {
951:     wptr = thrust::device_pointer_cast(warray);
952:     xptr = thrust::device_pointer_cast(xarray);
953:     yptr = thrust::device_pointer_cast(yarray);
954:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
955:     WaitForGPU();CHKERRCUDA(ierr);
956:   } catch (char *ex) {
957:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
958:   }
959:   VecCUDARestoreArrayRead(xin,&xarray);
960:   VecCUDARestoreArrayRead(yin,&yarray);
961:   VecCUDARestoreArrayReadWrite(win,&warray);
962:   PetscLogFlops(n);
963:   return(0);
964: }

966: /* should do infinity norm in cuda */

968: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
969: {
970:   PetscErrorCode    ierr;
971:   PetscInt          n = xin->map->n;
972:   PetscBLASInt      one = 1, bn;
973:   const PetscScalar *xarray;
974:   cublasStatus_t    cberr;
975:   cudaError_t       err;

978:   PetscBLASIntCast(n,&bn);
979:   if (type == NORM_2 || type == NORM_FROBENIUS) {
980:     VecCUDAGetArrayRead(xin,&xarray);
981:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
982:     WaitForGPU();CHKERRCUDA(ierr);
983:     VecCUDARestoreArrayRead(xin,&xarray);
984:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
985:   } else if (type == NORM_INFINITY) {
986:     int  i;
987:     VecCUDAGetArrayRead(xin,&xarray);
988:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
989:     err = cudaMemcpy(z,xarray+i,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
990:     VecCUDARestoreArrayRead(xin,&xarray);
991:   } else if (type == NORM_1) {
992:     VecCUDAGetArrayRead(xin,&xarray);
993:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
994:     VecCUDARestoreArrayRead(xin,&xarray);
995:     WaitForGPU();CHKERRCUDA(ierr);
996:     PetscLogFlops(PetscMax(n-1.0,0.0));
997:   } else if (type == NORM_1_AND_2) {
998:     VecNorm_SeqCUDA(xin,NORM_1,z);
999:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1000:   }
1001:   return(0);
1002: }

1004: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1005: {
1006:   PetscErrorCode    ierr;
1007:   PetscReal         n=s->map->n;
1008:   const PetscScalar *sarray,*tarray;

1011:   VecCUDAGetArrayRead(s,&sarray);
1012:   VecCUDAGetArrayRead(t,&tarray);
1013:   VecDot_SeqCUDA(s,t,dp);
1014:   VecDot_SeqCUDA(t,t,nm);
1015:   VecCUDARestoreArrayRead(s,&sarray);
1016:   VecCUDARestoreArrayRead(t,&tarray);
1017:   WaitForGPU();CHKERRCUDA(ierr);
1018:   PetscLogFlops(4.0*n);
1019:   return(0);
1020: }

1022: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1023: {
1025:   cudaError_t    err;

1028:   if (v->spptr) {
1029:     if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1030:       err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1031:     }
1032:     if (((Vec_CUDA*)v->spptr)->stream) {
1033:       err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1034:     }
1035:     PetscFree(v->spptr);
1036:   }
1037:   VecDestroy_SeqCUDA_Private(v);
1038:   return(0);
1039: }

1041: #if defined(PETSC_USE_COMPLEX)
1042: struct conjugate
1043: {
1044:   __host__ __device__
1045:     PetscScalar operator()(PetscScalar x)
1046:     {
1047:       return PetscConj(x);
1048:     }
1049: };
1050: #endif

1052: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1053: {
1054:   PetscScalar                     *xarray;
1055:   PetscErrorCode                  ierr;
1056: #if defined(PETSC_USE_COMPLEX)
1057:   PetscInt                        n = xin->map->n;
1058:   thrust::device_ptr<PetscScalar> xptr;
1059: #endif

1062:   VecCUDAGetArrayReadWrite(xin,&xarray);
1063: #if defined(PETSC_USE_COMPLEX)
1064:   try {
1065:     xptr = thrust::device_pointer_cast(xarray);
1066:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1067:     WaitForGPU();CHKERRCUDA(ierr);
1068:   } catch (char *ex) {
1069:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1070:   }
1071: #endif
1072:   VecCUDARestoreArrayReadWrite(xin,&xarray);
1073:   return(0);
1074: }

1076: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1077: {
1079:   cudaError_t    err;


1086:   if (w->data) {
1087:     if (((Vec_Seq*)w->data)->array_allocated) {
1088:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1089:     }
1090:     ((Vec_Seq*)w->data)->array = NULL;
1091:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1092:   }
1093:   if (w->spptr) {
1094:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1095:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1096:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1097:     }
1098:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1099:     PetscFree(w->spptr);
1100:   }

1102:   if (v->petscnative) {
1103:     PetscFree(w->data);
1104:     w->data = v->data;
1105:     w->valid_GPU_array = v->valid_GPU_array;
1106:     w->spptr = v->spptr;
1107:     PetscObjectStateIncrease((PetscObject)w);
1108:   } else {
1109:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1110:     w->valid_GPU_array = PETSC_CUDA_CPU;
1111:     VecCUDAAllocateCheck(w);
1112:   }
1113:   return(0);
1114: }

1116: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1117: {
1119:   cudaError_t    err;


1126:   if (v->petscnative) {
1127:     v->data = w->data;
1128:     v->valid_GPU_array = w->valid_GPU_array;
1129:     v->spptr = w->spptr;
1130:     VecCUDACopyFromGPU(v);
1131:     PetscObjectStateIncrease((PetscObject)v);
1132:     w->data = 0;
1133:     w->valid_GPU_array = PETSC_CUDA_UNALLOCATED;
1134:     w->spptr = 0;
1135:   } else {
1136:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1137:     if ((Vec_CUDA*)w->spptr) {
1138:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1139:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1140:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1141:       PetscFree(w->spptr);
1142:     }
1143:   }
1144:   return(0);
1145: }

1147: /*@C
1148:    VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.

1150:    This function has semantics similar to VecGetArray():  the pointer
1151:    returned by this function points to a consistent view of the vector
1152:    data.  This may involve a copy operation of data from the host to the
1153:    device if the data on the device is out of date.  If the device
1154:    memory hasn't been allocated previously it will be allocated as part
1155:    of this function call.  VecCUDAGetArrayReadWrite() assumes that
1156:    the user will modify the vector data.  This is similar to
1157:    intent(inout) in Fortran.

1159:    The CUDA device pointer has to be released by calling
1160:    VecCUDARestoreArrayReadWrite().  Upon restoring the vector data
1161:    the data on the host will be marked as out of date.  A subsequent
1162:    access of the host data will thus incur a data transfer from the
1163:    device to the host.


1166:    Input Parameter:
1167: .  v - the vector

1169:    Output Parameter:
1170: .  a - the CUDA device pointer

1172:    Fortran note:
1173:    This function is not currently available from Fortran.

1175:    Level: intermediate

1177: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1178: @*/
1179: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1180: {

1185:   *a   = 0;
1186:   VecCUDACopyToGPU(v);
1187:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1188:   return(0);
1189: }
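/* Illustrative usage sketch (added for exposition; not part of veccuda2.cu): running a
   user-written CUDA kernel on the device data of a vector through
   VecCUDAGetArrayReadWrite(). The kernel, helper name and launch configuration are
   hypothetical; only the VecCUDA*Array* calls come from this file. */
__global__ void ScaleAndShiftKernel(PetscScalar *x,PetscInt n,PetscScalar a,PetscScalar b)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] = a*x[i] + b;   /* x[i] <- a*x[i] + b */
}

static PetscErrorCode VecScaleAndShiftExample(Vec v,PetscScalar a,PetscScalar b)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_x;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(v,&d_x);CHKERRQ(ierr);      /* copies host data to the GPU if it is out of date */
  ScaleAndShiftKernel<<<(n+255)/256,256>>>(d_x,n,a,b);
  err  = cudaDeviceSynchronize();CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayReadWrite(v,&d_x);CHKERRQ(ierr);  /* marks the host copy as out of date */
  PetscFunctionReturn(0);
}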

1191: /*@C
1192:    VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().

1194:    This marks the host data as out of date.  Subsequent access to the
1195:    vector data on the host side with for instance VecGetArray() incurs a
1196:    data transfer.

1198:    Input Parameter:
1199: +  v - the vector
1200: -  a - the CUDA device pointer.  This pointer is invalid after
1201:        VecCUDARestoreArrayReadWrite() returns.

1203:    Fortran note:
1204:    This function is not currently available from Fortran.

1206:    Level: intermediate

1208: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1209: @*/
1210: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1211: {

1216:   v->valid_GPU_array = PETSC_CUDA_GPU;

1218:   PetscObjectStateIncrease((PetscObject)v);
1219:   return(0);
1220: }

1222: /*@C
1223:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1225:    This function is analogous to VecGetArrayRead():  The pointer
1226:    returned by this function points to a consistent view of the vector
1227:    data.  This may involve a copy operation of data from the host to the
1228:    device if the data on the device is out of date.  If the device
1229:    memory hasn't been allocated previously it will be allocated as part
1230:    of this function call.  VecCUDAGetArrayRead() assumes that the
1231:    user will not modify the vector data.  This is analogous to
1232:    intent(in) in Fortran.

1234:    The CUDA device pointer has to be released by calling
1235:    VecCUDARestoreArrayRead().  If the data on the host side was
1236:    previously up to date it will remain so, i.e. data on both the device
1237:    and the host is up to date.  Accessing data on the host side does not
1238:    incur a device to host data transfer.

1240:    Input Parameter:
1241: .  v - the vector

1243:    Output Parameter:
1244: .  a - the CUDA pointer.

1246:    Fortran note:
1247:    This function is not currently available from Fortran.

1249:    Level: intermediate

1251: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1252: @*/
1253: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1254: {

1259:   *a   = 0;
1260:   VecCUDACopyToGPU(v);
1261:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1262:   return(0);
1263: }
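/* Illustrative usage sketch (added for exposition; not part of veccuda2.cu): read-only
   access to the device data, here used to copy the vector into a user-managed CUDA
   buffer. The helper name is hypothetical and d_copy is assumed to have been allocated
   by the caller with cudaMalloc(). */
static PetscErrorCode CopyVecToUserBufferExample(Vec v,PetscScalar *d_copy)
{
  PetscErrorCode    ierr;
  cudaError_t       err;
  const PetscScalar *d_x;
  PetscInt          n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(v,&d_x);CHKERRQ(ierr);      /* no transfer if the GPU copy is already current */
  err  = cudaMemcpy(d_copy,d_x,n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v,&d_x);CHKERRQ(ierr);  /* a valid host copy stays valid */
  PetscFunctionReturn(0);
}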

1265: /*@C
1266:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1268:    If the data on the host side was previously up to date it will remain
1269:    so, i.e. data on both the device and the host is up to date.
1270:    Accessing data on the host side e.g. with VecGetArray() does not
1271:    incur a device to host data transfer.

1273:    Input Parameter:
1274: +  v - the vector
1275: -  a - the CUDA device pointer.  This pointer is invalid after
1276:        VecCUDARestoreArrayRead() returns.

1278:    Fortran note:
1279:    This function is not currently available from Fortran.

1281:    Level: intermediate

1283: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1284: @*/
1285: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1286: {
1289:   return(0);
1290: }

1292: /*@C
1293:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1295:    The data pointed to by the device pointer is uninitialized.  The user
1296:    may not read from this data.  Furthermore, the entire array needs to
1297:    be filled by the user to obtain well-defined behaviour.  The device
1298:    memory will be allocated by this function if it hasn't been allocated
1299:    previously.  This is analogous to intent(out) in Fortran.

1301:    The device pointer needs to be released with
1302:    VecCUDARestoreArrayWrite().  When the pointer is released the
1303:    host data of the vector is marked as out of date.  Subsequent access
1304:    of the host data with e.g. VecGetArray() incurs a device to host data
1305:    transfer.


1308:    Input Parameter:
1309: .  v - the vector

1311:    Output Parameter:
1312: .  a - the CUDA pointer

1314:    Fortran note:
1315:    This function is not currently available from Fortran.

1317:    Level: advanced

1319: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1320: @*/
1321: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1322: {

1327:   *a   = 0;
1328:   VecCUDAAllocateCheck(v);
1329:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1330:   return(0);
1331: }
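/* Illustrative usage sketch (added for exposition; not part of veccuda2.cu): overwriting
   all entries of a vector directly on the GPU via VecCUDAGetArrayWrite(); no
   host-to-device transfer is triggered because the old contents are not needed. The
   helper name is hypothetical. */
static PetscErrorCode ZeroVecOnGPUExample(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_x;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(v,&d_x);CHKERRQ(ierr);      /* uninitialized device buffer; fill it completely */
  err  = cudaMemset(d_x,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayWrite(v,&d_x);CHKERRQ(ierr);  /* host copy is now marked out of date */
  PetscFunctionReturn(0);
}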

1333: /*@C
1334:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1336:    Data on the host will be marked as out of date.  Subsequent access of
1337:    the data on the host side e.g. with VecGetArray() will incur a device
1338:    to host data transfer.

1340:    Input Parameter:
1341: +  v - the vector
1342: -  a - the CUDA device pointer.  This pointer is invalid after
1343:        VecCUDARestoreArrayWrite() returns.

1345:    Fortran note:
1346:    This function is not currently available from Fortran.

1348:    Level: intermediate

1350: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1351: @*/
1352: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1353: {

1358:   v->valid_GPU_array = PETSC_CUDA_GPU;

1360:   PetscObjectStateIncrease((PetscObject)v);
1361:   return(0);
1362: }

1364: /*@C
1365:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1366:    GPU array provided by the user. This is useful to avoid copying an
1367:    array into a vector.

1369:    Not Collective

1371:    Input Parameters:
1372: +  vec - the vector
1373: -  array - the GPU array

1375:    Notes:
1376:    You can return to the original GPU array with a call to VecCUDAResetArray()
1377:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1378:    same time on the same vector.

1380:    Level: developer

1382: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1384: @*/
1385: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1386: {

1391:   VecCUDACopyToGPU(vin);
1392:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1393:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1394:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1395:   vin->valid_GPU_array = PETSC_CUDA_GPU;
1396:   PetscObjectStateIncrease((PetscObject)vin);
1397:   return(0);
1398: }
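/* Illustrative usage sketch (added for exposition; not part of veccuda2.cu): temporarily
   wrapping a user-allocated GPU array in an existing VECSEQCUDA vector and then
   restoring the vector's own storage. The helper name is hypothetical; d_user must hold
   at least the local length of v. */
static PetscErrorCode UsePlacedGPUArrayExample(Vec v,PetscScalar *d_user)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);  /* v now operates on d_user */
  ierr = VecScale(v,2.0);CHKERRQ(ierr);              /* runs on the placed GPU array */
  ierr = VecCUDAResetArray(v);CHKERRQ(ierr);         /* back to the vector's original GPU array */
  PetscFunctionReturn(0);
}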

1400: /*@C
1401:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1402:    with a GPU array provided by the user. This is useful to avoid copying
1403:    a GPU array into a vector.

1405:    Not Collective

1407:    Input Parameters:
1408: +  vec - the vector
1409: -  array - the GPU array

1411:    Notes:
1412:    This permanently replaces the GPU array and frees the memory associated
1413:    with the old GPU array.

1415:    The memory passed in CANNOT be freed by the user. It will be freed
1416:    when the vector is destroyed.

1418:    Not supported from Fortran

1420:    Level: developer

1422: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1424: @*/
1425: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1426: {
1427:   cudaError_t err;

1432:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1433:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1434:   vin->valid_GPU_array = PETSC_CUDA_GPU;
1435:   PetscObjectStateIncrease((PetscObject)vin);
1436:   return(0);
1437: }

1439: /*@C
1440:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1441:    after the use of VecCUDAPlaceArray().

1443:    Not Collective

1445:    Input Parameters:
1446: .  vec - the vector

1448:    Level: developer

1450: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1452: @*/
1453: PetscErrorCode VecCUDAResetArray(Vec vin)
1454: {

1459:   VecCUDACopyToGPU(vin);
1460:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1461:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1462:   vin->valid_GPU_array = PETSC_CUDA_GPU;
1463:   PetscObjectStateIncrease((PetscObject)vin);
1464:   return(0);
1465: }