Actual source code: veccuda2.cu

petsc-3.7.1 2016-05-15
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>

 19: /*
 20:     Allocates space for the vector array on the GPU if it does not exist.
 21:     Does NOT change the PetscCUDAFlag for the vector
 22:     Does NOT zero the CUDA array

 24:  */
 25: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 26: {
 28:   cudaError_t    err;
 29:   cudaStream_t   stream;
 30:   Vec_CUDA       *veccuda;

 33:   if (!v->spptr) {
 34:     PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
 35:     veccuda = (Vec_CUDA*)v->spptr;
 36:     err = cudaMalloc((void **) &veccuda->GPUarray, sizeof(PetscScalar) * ((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 37:     err = cudaStreamCreate(&stream);CHKERRCUDA(err);
 38:     veccuda->stream = stream;
 39:     veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 40:     v->ops->destroy = VecDestroy_SeqCUDA;
 41:     if (v->valid_GPU_array == PETSC_CUDA_UNALLOCATED) {
 42:       if (v->data && ((Vec_Seq*)v->data)->array) {
 43:         v->valid_GPU_array = PETSC_CUDA_CPU;
 44:       } else {
 45:         v->valid_GPU_array = PETSC_CUDA_GPU;
 46:       }
 47:     }
 48:   }
 49:   return(0);
 50: }

 54: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 55: PetscErrorCode VecCUDACopyToGPU(Vec v)
 56: {
 58:   cudaError_t    err;
 59:   Vec_CUDA       *veccuda;
 60:   PetscScalar    *varray;

 63:   VecCUDAAllocateCheck(v);
 64:   if (v->valid_GPU_array == PETSC_CUDA_CPU) {
 65:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 66:     veccuda=(Vec_CUDA*)v->spptr;
 67:     varray=veccuda->GPUarray;
 68:     err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 69:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 70:     v->valid_GPU_array = PETSC_CUDA_BOTH;
 71:   }
 72:   return(0);
 73: }

 77: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci)
 78: {
 79:   PetscScalar    *varray;
 81:   cudaError_t    err;
 82:   PetscScalar    *cpuPtr, *gpuPtr;
 83:   Vec_Seq        *s;
 84:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;

 87:   VecCUDAAllocateCheck(v);
 88:   if (v->valid_GPU_array == PETSC_CUDA_CPU) {
 89:     s = (Vec_Seq*)v->data;

 91:     PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
 92:     varray = ((Vec_CUDA*)v->spptr)->GPUarray;
 93:     gpuPtr = varray + ptop_scatter->recvLowestIndex;
 94:     cpuPtr = s->array + ptop_scatter->recvLowestIndex;

 96:     /* Note : this code copies the smallest contiguous chunk of data
 97:        containing ALL of the indices */
 98:     err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

100:     // Set the buffer states
101:     v->valid_GPU_array = PETSC_CUDA_BOTH;
102:     PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
103:   }
104:   return(0);
105: }


110: /*
111:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
112: */
113: PetscErrorCode VecCUDACopyFromGPU(Vec v)
114: {
116:   cudaError_t    err;
117:   Vec_CUDA       *veccuda;
118:   PetscScalar    *varray;

121:   VecCUDAAllocateCheckHost(v);
122:   if (v->valid_GPU_array == PETSC_CUDA_GPU) {
123:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
124:     veccuda=(Vec_CUDA*)v->spptr;
125:     varray=veccuda->GPUarray;
126:     err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
127:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
128:     v->valid_GPU_array = PETSC_CUDA_BOTH;
129:   }
130:   return(0);
131: }

 135: /* Note that this function only copies *some* of the values from the GPU to the CPU,
 136:    which means that we need to recombine the data at some point before using any of the standard functions.
 137:    We could add a few more flag types to keep track of this, or treat these calls like VecGetArray()/VecRestoreArray(),
 138:    which must always be called in pairs.
 139: */
140: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci)
141: {
142:   PetscScalar    *varray;
144:   cudaError_t    err;
145:   PetscScalar    *cpuPtr, *gpuPtr;
146:   Vec_Seq        *s;
147:   VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;

150:   VecCUDAAllocateCheckHost(v);
151:   if (v->valid_GPU_array == PETSC_CUDA_GPU) {
152:     PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);

154:     varray=((Vec_CUDA*)v->spptr)->GPUarray;
155:     s = (Vec_Seq*)v->data;
156:     gpuPtr = varray + ptop_scatter->sendLowestIndex;
157:     cpuPtr = s->array + ptop_scatter->sendLowestIndex;

159:     /* Note : this code copies the smallest contiguous chunk of data
160:        containing ALL of the indices */
161:     err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);

163:     VecCUDARestoreArrayRead(v,&varray);
164:     PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
165:   }
166:   return(0);
167: }

169: /*MC
170:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

172:    Options Database Keys:
173: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

175:   Level: beginner

177: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
178: M*/
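
/*
   A minimal usage sketch for VECSEQCUDA (illustrative only, not part of this file; it
   assumes a PETSc build configured with CUDA and a single MPI rank).  The helper name
   VecSeqCUDAUsageExample is hypothetical, not a PETSc routine.
*/
static PetscErrorCode VecSeqCUDAUsageExample(void)
{
  Vec            x;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
  ierr = VecSetSizes(x,PETSC_DECIDE,100);CHKERRQ(ierr);
  ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);   /* or call VecSetFromOptions() and pass -vec_type seqcuda */
  ierr = VecSet(x,1.0);CHKERRQ(ierr);              /* dispatches to VecSet_SeqCUDA() below and runs on the GPU */
  ierr = VecDestroy(&x);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}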

182: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
183: {
184:   PetscScalar    *xarray,*yarray;
186:   PetscBLASInt   one=1,bn;
187:   PetscScalar    sone=1.0;
188:   cublasStatus_t cberr;
189:   cudaError_t    err;

192:   PetscBLASIntCast(yin->map->n,&bn);
193:   VecCUDAGetArrayRead(xin,&xarray);
194:   VecCUDAGetArrayReadWrite(yin,&yarray);
195:   if (alpha == (PetscScalar)0.0) {
196:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
197:   } else if (alpha == (PetscScalar)1.0) {
198:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
199:     PetscLogFlops(2.0*yin->map->n);
200:   } else {
201:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
202:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
203:     PetscLogFlops(2.0*yin->map->n);
204:   }
205:   WaitForGPU();CHKERRCUDA(ierr);
206:   VecCUDARestoreArrayRead(xin,&xarray);
207:   VecCUDARestoreArrayReadWrite(yin,&yarray);
208:   return(0);
209: }

213: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
214: {
215:   PetscScalar    *xarray,*yarray;
217:   PetscBLASInt   one=1,bn;
218:   cublasStatus_t cberr;

221:   if (alpha != (PetscScalar)0.0) {
222:     PetscBLASIntCast(yin->map->n,&bn);
223:     VecCUDAGetArrayRead(xin,&xarray);
224:     VecCUDAGetArrayReadWrite(yin,&yarray);
225:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
226:     WaitForGPU();CHKERRCUDA(ierr);
227:     VecCUDARestoreArrayRead(xin,&xarray);
228:     VecCUDARestoreArrayReadWrite(yin,&yarray);
229:     PetscLogFlops(2.0*yin->map->n);
230:   }
231:   return(0);
232: }

236: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
237: {
238:   PetscInt                        n = xin->map->n;
239:   PetscScalar                     *warray=NULL,*xarray=NULL,*yarray=NULL;
240:   thrust::device_ptr<PetscScalar> wptr,xptr,yptr;
241:   PetscErrorCode                  ierr;

244:   VecCUDAGetArrayWrite(win,&warray);
245:   VecCUDAGetArrayRead(xin,&xarray);
246:   VecCUDAGetArrayRead(yin,&yarray);
247:   try {
248:     wptr = thrust::device_pointer_cast(warray);
249:     xptr = thrust::device_pointer_cast(xarray);
250:     yptr = thrust::device_pointer_cast(yarray);
251:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
252:     WaitForGPU();CHKERRCUDA(ierr);
253:   } catch (char *ex) {
254:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
255:   }
256:   PetscLogFlops(n);
257:   VecCUDARestoreArrayRead(xin,&xarray);
258:   VecCUDARestoreArrayRead(yin,&yarray);
259:   VecCUDARestoreArrayWrite(win,&warray);
260:   return(0);
261: }

265: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
266: {
267:   PetscScalar    *xarray=NULL,*yarray=NULL,*warray=NULL;
269:   PetscBLASInt   one=1,bn;
270:   cublasStatus_t cberr;
271:   cudaError_t    err;

274:   PetscBLASIntCast(win->map->n,&bn);
275:   if (alpha == (PetscScalar)0.0) {
276:     VecCopy_SeqCUDA(yin,win);
277:   } else {
278:     VecCUDAGetArrayRead(xin,&xarray);
279:     VecCUDAGetArrayRead(yin,&yarray);
280:     VecCUDAGetArrayWrite(win,&warray);
281:     err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
282:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
283:     PetscLogFlops(2*win->map->n);
284:     WaitForGPU();CHKERRCUDA(ierr);
285:     VecCUDARestoreArrayRead(xin,&xarray);
286:     VecCUDARestoreArrayRead(yin,&yarray);
287:     VecCUDARestoreArrayWrite(win,&warray);
288:   }
289:   return(0);
290: }

294: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
295: {
297:   PetscInt       n = xin->map->n,j,j_rem;
298:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

301:   PetscLogFlops(nv*2.0*n);
302:   switch (j_rem=nv&0x3) {
303:     case 3:
304:       alpha0 = alpha[0];
305:       alpha1 = alpha[1];
306:       alpha2 = alpha[2];
307:       alpha += 3;
308:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
309:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
310:       VecAXPY_SeqCUDA(xin,alpha2,y[2]);
311:       y   += 3;
312:       break;
313:     case 2:
314:       alpha0 = alpha[0];
315:       alpha1 = alpha[1];
316:       alpha +=2;
317:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
318:       VecAXPY_SeqCUDA(xin,alpha1,y[1]);
319:       y +=2;
320:       break;
321:     case 1:
322:       alpha0 = *alpha++;
323:       VecAXPY_SeqCUDA(xin,alpha0,y[0]);
324:       y     +=1;
325:       break;
326:   }
327:   for (j=j_rem; j<nv; j+=4) {
328:     alpha0 = alpha[0];
329:     alpha1 = alpha[1];
330:     alpha2 = alpha[2];
331:     alpha3 = alpha[3];
332:     alpha += 4;
333:     VecAXPY_SeqCUDA(xin,alpha0,y[0]);
334:     VecAXPY_SeqCUDA(xin,alpha1,y[1]);
335:     VecAXPY_SeqCUDA(xin,alpha2,y[2]);
336:     VecAXPY_SeqCUDA(xin,alpha3,y[3]);
337:     y   += 4;
338:   }
339:   WaitForGPU();CHKERRCUDA(ierr);
340:   return(0);
341: }

345: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
346: {
347:   PetscScalar    *xarray,*yarray;
349:   PetscBLASInt   one=1,bn;
350:   cublasStatus_t cberr;

353:   PetscBLASIntCast(yin->map->n,&bn);
354:   VecCUDAGetArrayRead(xin,&xarray);
355:   VecCUDAGetArrayRead(yin,&yarray);
356:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
357:   cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
358:   WaitForGPU();CHKERRCUDA(ierr);
359:   if (xin->map->n >0) {
360:     PetscLogFlops(2.0*xin->map->n-1);
361:   }
362:   VecCUDARestoreArrayRead(xin,&xarray);
363:   VecCUDARestoreArrayRead(yin,&yarray);
364:   return(0);
365: }
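
/*
   Why the arguments are reversed above (a worked check, relevant for complex scalars):
   PETSc defines VecDot(x,y) = y^H x = sum_i x_i * conj(y_i), whereas the conjugating
   cuBLAS dot product behind cublasXdot() computes sum_i conj(a_i) * b_i for arguments
   (a,b).  Calling it with (a,b) = (y,x) therefore gives
       sum_i conj(y_i) * x_i = VecDot(x,y),
   which is exactly the value stored in z.  For real scalars the argument order is irrelevant.
*/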

367: //
368: // CUDA kernels for MDot to follow
369: //

371: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
372: #define MDOT_WORKGROUP_SIZE 128
373: #define MDOT_WORKGROUP_NUM  128
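
/*
   How the kernels below and the host code in VecMDot_SeqCUDA() fit together (a summary of
   the code, not additional API): each of the MDOT_WORKGROUP_NUM thread blocks b accumulates
   partial dot products of x with each y_j over its assigned index range and, after a
   shared-memory reduction, stores them as

       group_results[j * gridDim.x + b]

   so the host finishes the reduction with

       z[j] = sum over b of group_results[j * MDOT_WORKGROUP_NUM + b]

   since the kernels are launched with gridDim.x == MDOT_WORKGROUP_NUM.
*/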

375: #if !defined(PETSC_USE_COMPLEX)
376: // M = 2:
377: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
378:                                         PetscInt size, PetscScalar *group_results)
379: {
380:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
381:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
382:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
383:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
384:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

386:   PetscScalar entry_x    = 0;
387:   PetscScalar group_sum0 = 0;
388:   PetscScalar group_sum1 = 0;
389:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
390:     entry_x     = x[i];   // load only once from global memory!
391:     group_sum0 += entry_x * y0[i];
392:     group_sum1 += entry_x * y1[i];
393:   }
394:   tmp_buffer[threadIdx.x]                       = group_sum0;
395:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

397:   // parallel reduction
398:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
399:     __syncthreads();
400:     if (threadIdx.x < stride) {
401:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
402:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
403:     }
404:   }

406:   // write result of group to group_results
407:   if (threadIdx.x == 0) {
408:     group_results[blockIdx.x]             = tmp_buffer[0];
409:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
410:   }
411: }

413: // M = 3:
414: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
415:                                         PetscInt size, PetscScalar *group_results)
416: {
417:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
418:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
419:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
420:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
421:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

423:   PetscScalar entry_x    = 0;
424:   PetscScalar group_sum0 = 0;
425:   PetscScalar group_sum1 = 0;
426:   PetscScalar group_sum2 = 0;
427:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
428:     entry_x     = x[i];   // load only once from global memory!
429:     group_sum0 += entry_x * y0[i];
430:     group_sum1 += entry_x * y1[i];
431:     group_sum2 += entry_x * y2[i];
432:   }
433:   tmp_buffer[threadIdx.x]                           = group_sum0;
434:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
435:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

437:   // parallel reduction
438:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
439:     __syncthreads();
440:     if (threadIdx.x < stride) {
441:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
442:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
443:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
444:     }
445:   }

447:   // write result of group to group_results
448:   if (threadIdx.x == 0) {
449:     group_results[blockIdx.x                ] = tmp_buffer[0];
450:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
451:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
452:   }
453: }

455: // M = 4:
456: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
457:                                         PetscInt size, PetscScalar *group_results)
458: {
459:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
460:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
461:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
462:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
463:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

465:   PetscScalar entry_x    = 0;
466:   PetscScalar group_sum0 = 0;
467:   PetscScalar group_sum1 = 0;
468:   PetscScalar group_sum2 = 0;
469:   PetscScalar group_sum3 = 0;
470:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
471:     entry_x     = x[i];   // load only once from global memory!
472:     group_sum0 += entry_x * y0[i];
473:     group_sum1 += entry_x * y1[i];
474:     group_sum2 += entry_x * y2[i];
475:     group_sum3 += entry_x * y3[i];
476:   }
477:   tmp_buffer[threadIdx.x]                           = group_sum0;
478:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
479:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
480:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

482:   // parallel reduction
483:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
484:     __syncthreads();
485:     if (threadIdx.x < stride) {
486:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
487:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
488:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
489:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
490:     }
491:   }

493:   // write result of group to group_results
494:   if (threadIdx.x == 0) {
495:     group_results[blockIdx.x                ] = tmp_buffer[0];
496:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
497:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
498:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
499:   }
500: }

502: // M = 8:
503: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
504:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
505:                                           PetscInt size, PetscScalar *group_results)
506: {
507:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
508:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
509:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
510:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
511:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

513:   PetscScalar entry_x    = 0;
514:   PetscScalar group_sum0 = 0;
515:   PetscScalar group_sum1 = 0;
516:   PetscScalar group_sum2 = 0;
517:   PetscScalar group_sum3 = 0;
518:   PetscScalar group_sum4 = 0;
519:   PetscScalar group_sum5 = 0;
520:   PetscScalar group_sum6 = 0;
521:   PetscScalar group_sum7 = 0;
522:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
523:     entry_x     = x[i];   // load only once from global memory!
524:     group_sum0 += entry_x * y0[i];
525:     group_sum1 += entry_x * y1[i];
526:     group_sum2 += entry_x * y2[i];
527:     group_sum3 += entry_x * y3[i];
528:     group_sum4 += entry_x * y4[i];
529:     group_sum5 += entry_x * y5[i];
530:     group_sum6 += entry_x * y6[i];
531:     group_sum7 += entry_x * y7[i];
532:   }
533:   tmp_buffer[threadIdx.x]                           = group_sum0;
534:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
535:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
536:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
537:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
538:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
539:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
540:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

542:   // parallel reduction
543:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
544:     __syncthreads();
545:     if (threadIdx.x < stride) {
546:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
547:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
548:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
549:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
550:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
551:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
552:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
553:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
554:     }
555:   }

557:   // write result of group to group_results
558:   if (threadIdx.x == 0) {
559:     group_results[blockIdx.x                ] = tmp_buffer[0];
560:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
561:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
562:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
563:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
564:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
565:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
566:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
567:   }
568: }
569: #endif /* !defined(PETSC_USE_COMPLEX) */

573: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
574: {
576:   PetscInt       i,n = xin->map->n,current_y_index = 0;
577:   PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
578: #if !defined(PETSC_USE_COMPLEX)
579:   PetscInt       j;
580:   PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
581: #endif
582:   cudaError_t    cuda_ierr;
583:   PetscBLASInt   one=1,bn;
584:   cublasStatus_t cberr;

587:   PetscBLASIntCast(xin->map->n,&bn);
588:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
589:   /* Handle the case of local size zero first */
590:   if (!xin->map->n) {
591:     for (i=0; i<nv; ++i) z[i] = 0;
592:     return(0);
593:   }

595:   // allocate scratchpad memory for the results of individual work groups:
 596:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

598:   VecCUDAGetArrayRead(xin,&xptr);

600:   while (current_y_index < nv)
601:   {
602:     switch (nv - current_y_index) {

604:       case 7:
605:       case 6:
606:       case 5:
607:       case 4:
608:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
609:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
610:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
611:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);

613: #if defined(PETSC_USE_COMPLEX)
614:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
615:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
616:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
617:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
618: #else
619:         // run kernel:
620:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

 622:         // copy results back to the host
 623:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

625:         // sum group results into z:
626:         for (j=0; j<4; ++j) {
627:           z[current_y_index + j] = 0;
628:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
629:         }
630: #endif
631:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
632:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
633:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
634:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
635:         current_y_index += 4;
636:         break;

638:       case 3:
639:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
640:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
641:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

643: #if defined(PETSC_USE_COMPLEX)
644:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
645:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
646:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
647: #else
648:         // run kernel:
649:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

 651:         // copy results back to the host
 652:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

654:         // sum group results into z:
655:         for (j=0; j<3; ++j) {
656:           z[current_y_index + j] = 0;
657:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
658:         }
659: #endif

661:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
662:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
663:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
664:         current_y_index += 3;
665:         break;

667:       case 2:
668:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
669:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);

671: #if defined(PETSC_USE_COMPLEX)
672:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
673:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
674: #else
675:         // run kernel:
676:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

 678:         // copy results back to the host
 679:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

681:         // sum group results into z:
682:         for (j=0; j<2; ++j) {
683:           z[current_y_index + j] = 0;
684:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
685:         }
686: #endif
687:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
688:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
689:         current_y_index += 2;
690:         break;

692:       case 1:
693:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
694:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
695:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
696:         current_y_index += 1;
697:         break;

699:       default: // 8 or more vectors left
700:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
701:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
702:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
703:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
704:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
705:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
706:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
707:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);

709: #if defined(PETSC_USE_COMPLEX)
710:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
711:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
712:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
713:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
714:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
715:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
716:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
717:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
718: #else
719:         // run kernel:
720:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

 722:         // copy results back to the host
 723:         cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

725:         // sum group results into z:
726:         for (j=0; j<8; ++j) {
727:           z[current_y_index + j] = 0;
728:           for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
729:         }
730: #endif
731:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
732:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
733:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
734:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
735:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
736:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
737:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
738:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
739:         current_y_index += 8;
740:         break;
741:     }
742:   }
743:   VecCUDARestoreArrayRead(xin,&xptr);

 745:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
746:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
747:   return(0);
748: }

750: #undef MDOT_WORKGROUP_SIZE
751: #undef MDOT_WORKGROUP_NUM

755: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
756: {
757:   PetscInt                        n = xin->map->n;
758:   PetscScalar                     *xarray=NULL;
759:   thrust::device_ptr<PetscScalar> xptr;
760:   PetscErrorCode                  ierr;
761:   cudaError_t                     err;

764:   VecCUDAGetArrayWrite(xin,&xarray);
765:   if (alpha == (PetscScalar)0.0) {
766:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
767:   } else {
768:     try {
769:       xptr = thrust::device_pointer_cast(xarray);
770:       thrust::fill(xptr,xptr+n,alpha);
771:     } catch (char *ex) {
772:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
773:     }
774:   }
775:   WaitForGPU();CHKERRCUDA(ierr);
776:   VecCUDARestoreArrayWrite(xin,&xarray);
777:   return(0);
778: }

782: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
783: {
784:   PetscScalar    *xarray;
786:   PetscBLASInt   one=1,bn;
787:   cublasStatus_t cberr;

790:   if (alpha == (PetscScalar)0.0) {
791:     VecSet_SeqCUDA(xin,alpha);
792:   } else if (alpha != (PetscScalar)1.0) {
793:     PetscBLASIntCast(xin->map->n,&bn);
794:     VecCUDAGetArrayReadWrite(xin,&xarray);
795:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
796:     VecCUDARestoreArrayReadWrite(xin,&xarray);
797:   }
798:   WaitForGPU();CHKERRCUDA(ierr);
799:   PetscLogFlops(xin->map->n);
800:   return(0);
801: }

805: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
806: {
807:   PetscScalar    *xarray,*yarray;
809:   PetscBLASInt   one=1,bn;
810:   cublasStatus_t cberr;

813:   PetscBLASIntCast(xin->map->n,&bn);
814:   VecCUDAGetArrayRead(xin,&xarray);
815:   VecCUDAGetArrayRead(yin,&yarray);
816:   cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
817:   WaitForGPU();CHKERRCUDA(ierr);
818:   if (xin->map->n > 0) {
819:     PetscLogFlops(2.0*xin->map->n-1);
820:   }
821:   VecCUDARestoreArrayRead(yin,&yarray);
822:   VecCUDARestoreArrayRead(xin,&xarray);
823:   return(0);
824: }

828: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
829: {
830:   PetscScalar    *xarray,*yarray;
832:   cudaError_t    err;

835:   if (xin != yin) {
836:     if (xin->valid_GPU_array == PETSC_CUDA_GPU) {
837:       VecCUDAGetArrayRead(xin,&xarray);
838:       VecCUDAGetArrayWrite(yin,&yarray);
839:       err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
840:       WaitForGPU();CHKERRCUDA(ierr);
841:       VecCUDARestoreArrayRead(xin,&xarray);
842:       VecCUDARestoreArrayWrite(yin,&yarray);

844:     } else if (xin->valid_GPU_array == PETSC_CUDA_CPU) {
845:       /* copy in CPU if we are on the CPU*/
846:       VecCopy_SeqCUDA_Private(xin,yin);
847:     } else if (xin->valid_GPU_array == PETSC_CUDA_BOTH) {
848:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
849:       if (yin->valid_GPU_array == PETSC_CUDA_CPU) {
850:         /* copy in CPU */
851:         VecCopy_SeqCUDA_Private(xin,yin);

853:       } else if (yin->valid_GPU_array == PETSC_CUDA_GPU) {
854:         /* copy in GPU */
855:         VecCUDAGetArrayRead(xin,&xarray);
856:         VecCUDAGetArrayWrite(yin,&yarray);
857:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
858:         VecCUDARestoreArrayRead(xin,&xarray);
859:         VecCUDARestoreArrayWrite(yin,&yarray);
860:       } else if (yin->valid_GPU_array == PETSC_CUDA_BOTH) {
 861:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
 862:            default to copying on the GPU (this is an arbitrary choice) */
863:         VecCUDAGetArrayRead(xin,&xarray);
864:         VecCUDAGetArrayWrite(yin,&yarray);
865:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
866:         VecCUDARestoreArrayRead(xin,&xarray);
867:         VecCUDARestoreArrayWrite(yin,&yarray);
868:       } else {
869:         VecCopy_SeqCUDA_Private(xin,yin);
870:       }
871:     }
872:   }
873:   return(0);
874: }

878: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
879: {
881:   PetscBLASInt   one = 1,bn;
882:   PetscScalar    *xarray,*yarray;
883:   cublasStatus_t cberr;

886:   PetscBLASIntCast(xin->map->n,&bn);
887:   if (xin != yin) {
888:     VecCUDAGetArrayReadWrite(xin,&xarray);
889:     VecCUDAGetArrayReadWrite(yin,&yarray);
890:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
891:     WaitForGPU();CHKERRCUDA(ierr);
892:     VecCUDARestoreArrayReadWrite(xin,&xarray);
893:     VecCUDARestoreArrayReadWrite(yin,&yarray);
894:   }
895:   return(0);
896: }

900: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
901: {
903:   PetscScalar    a = alpha,b = beta;
904:   PetscScalar    *xarray,*yarray;
905:   PetscBLASInt   one = 1, bn;
906:   cublasStatus_t cberr;
907:   cudaError_t    err;

910:   PetscBLASIntCast(yin->map->n,&bn);
911:   if (a == (PetscScalar)0.0) {
912:     VecScale_SeqCUDA(yin,beta);
913:   } else if (b == (PetscScalar)1.0) {
914:     VecAXPY_SeqCUDA(yin,alpha,xin);
915:   } else if (a == (PetscScalar)1.0) {
916:     VecAYPX_SeqCUDA(yin,beta,xin);
917:   } else if (b == (PetscScalar)0.0) {
918:     VecCUDAGetArrayRead(xin,&xarray);
919:     VecCUDAGetArrayReadWrite(yin,&yarray);
920:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
921:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
922:     PetscLogFlops(xin->map->n);
923:     WaitForGPU();CHKERRCUDA(ierr);
924:     VecCUDARestoreArrayRead(xin,&xarray);
925:     VecCUDARestoreArrayReadWrite(yin,&yarray);
926:   } else {
927:     VecCUDAGetArrayRead(xin,&xarray);
928:     VecCUDAGetArrayReadWrite(yin,&yarray);
929:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
930:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
931:     VecCUDARestoreArrayRead(xin,&xarray);
932:     VecCUDARestoreArrayReadWrite(yin,&yarray);
933:     WaitForGPU();CHKERRCUDA(ierr);
934:     PetscLogFlops(3.0*xin->map->n);
935:   }
936:   return(0);
937: }

941: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
942: {
944:   PetscInt       n = zin->map->n;

947:   if (gamma == (PetscScalar)1.0) {
948:     /* z = ax + b*y + z */
949:     VecAXPY_SeqCUDA(zin,alpha,xin);
950:     VecAXPY_SeqCUDA(zin,beta,yin);
951:     PetscLogFlops(4.0*n);
952:   } else {
953:     /* z = a*x + b*y + c*z */
954:     VecScale_SeqCUDA(zin,gamma);
955:     VecAXPY_SeqCUDA(zin,alpha,xin);
956:     VecAXPY_SeqCUDA(zin,beta,yin);
957:     PetscLogFlops(5.0*n);
958:   }
959:   WaitForGPU();CHKERRCUDA(ierr);
960:   return(0);
961: }

965: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
966: {
967:   PetscInt                        n = win->map->n;
968:   PetscScalar                     *warray,*xarray,*yarray;
969:   thrust::device_ptr<PetscScalar> wptr,xptr,yptr;

973:   VecCUDAGetArrayReadWrite(win,&warray);
974:   VecCUDAGetArrayRead(xin,&xarray);
975:   VecCUDAGetArrayRead(yin,&yarray);
976:   try {
977:     wptr = thrust::device_pointer_cast(warray);
978:     xptr = thrust::device_pointer_cast(xarray);
979:     yptr = thrust::device_pointer_cast(yarray);
980:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
981:     WaitForGPU();CHKERRCUDA(ierr);
982:   } catch (char *ex) {
983:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
984:   }
985:   VecCUDARestoreArrayRead(xin,&xarray);
986:   VecCUDARestoreArrayRead(yin,&yarray);
987:   VecCUDARestoreArrayReadWrite(win,&warray);
988:   PetscLogFlops(n);
989:   return(0);
990: }

992: /* should do infinity norm in cuda */

996: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
997: {
998:   PetscErrorCode    ierr;
999:   PetscInt          n = xin->map->n;
1000:   PetscBLASInt      one = 1, bn;
1001:   PetscScalar       *xarray;
1002:   cublasStatus_t    cberr;
1003:   cudaError_t       err;

1006:   PetscBLASIntCast(n,&bn);
1007:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1008:     VecCUDAGetArrayRead(xin,&xarray);
1009:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1010:     WaitForGPU();CHKERRCUDA(ierr);
1011:     VecCUDARestoreArrayRead(xin,&xarray);
1012:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1013:   } else if (type == NORM_INFINITY) {
1014:     PetscInt  i;
1015:     VecCUDAGetArrayRead(xin,&xarray);
1016:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1017:     err = cudaMemcpy(z,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err); /* cublasIXamax() returns a 1-based index */
1018:     VecCUDARestoreArrayRead(xin,&xarray);
1019:   } else if (type == NORM_1) {
1020:     VecCUDAGetArrayRead(xin,&xarray);
1021:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1022:     VecCUDARestoreArrayRead(xin,&xarray);
1023:     WaitForGPU();CHKERRCUDA(ierr);
1024:     PetscLogFlops(PetscMax(n-1.0,0.0));
1025:   } else if (type == NORM_1_AND_2) {
1026:     VecNorm_SeqCUDA(xin,NORM_1,z);
1027:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1028:   }
1029:   return(0);
1030: }

1034: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1035: {
1036:   PetscErrorCode                         ierr;
1037:   PetscReal                              n=s->map->n;
1038:   PetscScalar                            *sarray,*tarray;

1041:   VecCUDAGetArrayRead(s,&sarray);
1042:   VecCUDAGetArrayRead(t,&tarray);
1043:   VecDot_SeqCUDA(s,t,dp);
1044:   VecDot_SeqCUDA(t,t,nm);
1045:   VecCUDARestoreArrayRead(s,&sarray);
1046:   VecCUDARestoreArrayRead(t,&tarray);
1047:   WaitForGPU();CHKERRCUDA(ierr);
1048:   PetscLogFlops(4.0*n);
1049:   return(0);
1050: }

1054: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1055: {
1057:   cudaError_t    err;

1060:   if (v->spptr) {
1061:     err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray);CHKERRCUDA(err);
1062:     ((Vec_CUDA*)v->spptr)->GPUarray = NULL;
1063:     err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1064:     PetscFree(v->spptr);
1065:   }
1066:   VecDestroy_SeqCUDA_Private(v);
1067:   return(0);
1068: }

1070: #if defined(PETSC_USE_COMPLEX)
1071: struct conjugate
1072: {
1073:   __host__ __device__
1074:     PetscScalar operator()(PetscScalar x)
1075:     {
1076:       return PetscConj(x);
1077:     }
1078: };
1079: #endif

1083: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1084: {
1085:   PetscScalar                     *xarray;
1086:   PetscErrorCode                  ierr;
1087: #if defined(PETSC_USE_COMPLEX)
1088:   PetscInt                        n = xin->map->n;
1089:   thrust::device_ptr<PetscScalar> xptr;
1090: #endif

1093:   VecCUDAGetArrayReadWrite(xin,&xarray);
1094: #if defined(PETSC_USE_COMPLEX)
1095:   try {
1096:     xptr = thrust::device_pointer_cast(xarray);
1097:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1098:     WaitForGPU();CHKERRCUDA(ierr);
1099:   } catch (char *ex) {
1100:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1101:   }
1102: #endif
1103:   VecCUDARestoreArrayReadWrite(xin,&xarray);
1104:   return(0);
1105: }

1109: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1110: {
1111:   VecType        t;
1113:   cudaError_t    err;
1114:   PetscBool      flg;

1119:   VecGetType(w,&t);
1120:   PetscStrcmp(t,VECSEQCUDA,&flg);
1121:   if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUDA);

1123:   if (w->data) {
1124:     if (((Vec_Seq*)w->data)->array_allocated) {
1125:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1126:     }
1127:     ((Vec_Seq*)w->data)->array = NULL;
1128:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1129:   }
1130:   if (w->spptr) {
1131:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1132:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1133:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1134:     }
1135:     err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1136:     PetscFree(w->spptr);
1137:   }

1139:   if (v->petscnative) {
1140:     PetscFree(w->data);
1141:     w->data = v->data;
1142:     w->valid_GPU_array = v->valid_GPU_array;
1143:     w->spptr = v->spptr;
1144:     PetscObjectStateIncrease((PetscObject)w);
1145:   } else {
1146:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1147:     w->valid_GPU_array = PETSC_CUDA_CPU;
1148:     VecCUDAAllocateCheck(w);
1149:   }
1150:   return(0);
1151: }

1155: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1156: {
1157:   VecType        t;
1159:   cudaError_t    err;
1160:   PetscBool      flg;

1165:   VecGetType(w,&t);
1166:   PetscStrcmp(t,VECSEQCUDA,&flg);
1167:   if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUDA);

1169:   if (v->petscnative) {
1170:     v->data = w->data;
1171:     v->valid_GPU_array = w->valid_GPU_array;
1172:     v->spptr = w->spptr;
1173:     VecCUDACopyFromGPU(v);
1174:     PetscObjectStateIncrease((PetscObject)v);
1175:     w->data = 0;
1176:     w->valid_GPU_array = PETSC_CUDA_UNALLOCATED;
1177:     w->spptr = 0;
1178:   } else {
1179:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1180:     if ((Vec_CUDA*)w->spptr) {
1181:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1182:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1183:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1184:       PetscFree(w->spptr);
1185:     }
1186:   }
1187:   return(0);
1188: }

1192: /*@C
1193:    VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.

1195:    This function has semantics similar to VecGetArray():  the pointer
1196:    returned by this function points to a consistent view of the vector
1197:    data.  This may involve a copy operation of data from the host to the
1198:    device if the data on the device is out of date.  If the device
1199:    memory hasn't been allocated previously it will be allocated as part
1200:    of this function call.  VecCUDAGetArrayReadWrite() assumes that
1201:    the user will modify the vector data.  This is similar to
1202:    intent(inout) in Fortran.

1204:    The CUDA device pointer has to be released by calling
1205:    VecCUDARestoreArrayReadWrite().  Upon restoring the vector data
1206:    the data on the host will be marked as out of date.  A subsequent
1207:    access of the host data will thus incur a data transfer from the
1208:    device to the host.


1211:    Input Parameter:
1212: .  v - the vector

1214:    Output Parameter:
1215: .  a - the CUDA device pointer

1217:    Fortran note:
1218:    This function is not currently available from Fortran.

1220:    Level: intermediate

1222: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1223: @*/
1224: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1225: {

1229:   *a   = 0;
1230:   VecCUDACopyToGPU(v);
1231:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1232:   return(0);
1233: }

1237: /*@C
1238:    VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().

1240:    This marks the host data as out of date.  Subsequent access to the
1241:    vector data on the host side with for instance VecGetArray() incurs a
1242:    data transfer.

1244:    Input Parameter:
1245: +  v - the vector
1246: -  a - the CUDA device pointer.  This pointer is invalid after
1247:        VecCUDARestoreArrayReadWrite() returns.

1249:    Fortran note:
1250:    This function is not currently available from Fortran.

1252:    Level: intermediate

1254: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1255: @*/
1256: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1257: {

1261:   v->valid_GPU_array = PETSC_CUDA_GPU;

1263:   PetscObjectStateIncrease((PetscObject)v);
1264:   return(0);
1265: }
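
/*
   A usage sketch for VecCUDAGetArrayReadWrite()/VecCUDARestoreArrayReadWrite()
   (illustrative only; the kernel and helper names are hypothetical and the arithmetic
   assumes a real-valued PetscScalar build): obtain the device pointer, update the
   entries in place with a user CUDA kernel, and restore it so the host copy is marked
   out of date.
*/
__global__ void SquareEntries_Example(PetscScalar *x,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] = x[i]*x[i];
}

static PetscErrorCode SquareVectorOnGPU_Example(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_v;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(v,&d_v);CHKERRQ(ierr);      /* copies host data to the GPU only if it is out of date there */
  if (n) SquareEntries_Example<<<(n+255)/256,256>>>(d_v,n);
  err  = cudaDeviceSynchronize();CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayReadWrite(v,&d_v);CHKERRQ(ierr);  /* marks the host copy as out of date */
  PetscFunctionReturn(0);
}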

1269: /*@C
1270:    VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.

1272:    This function is analogous to VecGetArrayRead():  The pointer
1273:    returned by this function points to a consistent view of the vector
1274:    data.  This may involve a copy operation of data from the host to the
1275:    device if the data on the device is out of date.  If the device
1276:    memory hasn't been allocated previously it will be allocated as part
1277:    of this function call.  VecCUDAGetArrayRead() assumes that the
1278:    user will not modify the vector data.  This is analogous to
1279:    intent(in) in Fortran.

1281:    The CUDA device pointer has to be released by calling
1282:    VecCUDARestoreArrayRead().  If the data on the host side was
1283:    previously up to date it will remain so, i.e. data on both the device
1284:    and the host is up to date.  Accessing data on the host side does not
1285:    incur a device to host data transfer.

1287:    Input Parameter:
1288: .  v - the vector

1290:    Output Parameter:
1291: .  a - the CUDA pointer.

1293:    Fortran note:
1294:    This function is not currently available from Fortran.

1296:    Level: intermediate

1298: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1299: @*/
1300: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, PetscScalar **a)
1301: {

1305:   *a   = 0;
1306:   VecCUDACopyToGPU(v);
1307:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1308:   return(0);
1309: }

1313: /*@C
1314:    VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().

1316:    If the data on the host side was previously up to date it will remain
1317:    so, i.e. data on both the device and the host is up to date.
1318:    Accessing data on the host side e.g. with VecGetArray() does not
1319:    incur a device to host data transfer.

1321:    Input Parameter:
1322: +  v - the vector
1323: -  a - the CUDA device pointer.  This pointer is invalid after
1324:        VecCUDARestoreArrayRead() returns.

1326:    Fortran note:
1327:    This function is not currently available from Fortran.

1329:    Level: intermediate

1331: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1332: @*/
1333: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, PetscScalar **a)
1334: {
1336:   return(0);
1337: }
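
/*
   A read-only access sketch (illustrative; d_user is assumed to be a user-managed device
   buffer with at least n entries): the pointer obtained from VecCUDAGetArrayRead() may be
   read, here via a device-to-device copy, but must not be written through, and an
   up-to-date host copy remains valid after the restore.
*/
static PetscErrorCode CopyVecToDeviceBuffer_Example(Vec v,PetscScalar *d_user,PetscInt n)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_v;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v,&d_v);CHKERRQ(ierr);        /* triggers a host-to-device copy only if the GPU data is stale */
  err  = cudaMemcpy(d_user,d_v,n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v,&d_v);CHKERRQ(ierr);    /* the host data, if it was valid, stays valid */
  PetscFunctionReturn(0);
}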

1341: /*@C
1342:    VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.

1344:    The data pointed to by the device pointer is uninitialized.  The user
1345:    may not read from this data.  Furthermore, the entire array needs to
1346:    be filled by the user to obtain well-defined behaviour.  The device
1347:    memory will be allocated by this function if it hasn't been allocated
1348:    previously.  This is analogous to intent(out) in Fortran.

1350:    The device pointer needs to be released with
1351:    VecCUDARestoreArrayWrite().  When the pointer is released the
1352:    host data of the vector is marked as out of date.  Subsequent access
1353:    of the host data with e.g. VecGetArray() incurs a device to host data
1354:    transfer.


1357:    Input Parameter:
1358: .  v - the vector

1360:    Output Parameter:
1361: .  a - the CUDA pointer

1363:    Fortran note:
1364:    This function is not currently available from Fortran.

1366:    Level: advanced

1368: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1369: @*/
1370: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1371: {

1375:   *a   = 0;
1376:   VecCUDAAllocateCheck(v);
1377:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
1378:   return(0);
1379: }

1383: /*@C
1384:    VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().

1386:    Data on the host will be marked as out of date.  Subsequent access of
1387:    the data on the host side e.g. with VecGetArray() will incur a device
1388:    to host data transfer.

1390:    Input Parameter:
1391: +  v - the vector
1392: -  a - the CUDA device pointer.  This pointer is invalid after
1393:        VecCUDARestoreArrayWrite() returns.

1395:    Fortran note:
1396:    This function is not currently available from Fortran.

1398:    Level: intermediate

1400: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1401: @*/
1402: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1403: {

1407:   v->valid_GPU_array = PETSC_CUDA_GPU;

1409:   PetscObjectStateIncrease((PetscObject)v);
1410:   return(0);
1411: }
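
/*
   A write-only access sketch (illustrative; the kernel and helper names are hypothetical
   and the fill value assumes a real-valued PetscScalar build): every entry is overwritten
   on the device, so the get does not trigger a host-to-device transfer and the restore
   marks the host copy as out of date.
*/
__global__ void FillWithOnes_Example(PetscScalar *x,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] = 1.0;
}

static PetscErrorCode FillVectorOnGPU_Example(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_v;
  PetscInt       n;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(v,&d_v);CHKERRQ(ierr);      /* allocates the GPU array if needed, no data transfer */
  if (n) FillWithOnes_Example<<<(n+255)/256,256>>>(d_v,n);
  err  = cudaDeviceSynchronize();CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayWrite(v,&d_v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}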

1415: /*@C
1416:    VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1417:    GPU array provided by the user. This is useful to avoid copying an
1418:    array into a vector.

1420:    Not Collective

1422:    Input Parameters:
1423: +  vec - the vector
1424: -  array - the GPU array

1426:    Notes:
1427:    You can return to the original GPU array with a call to VecCUDAResetArray().
1428:    It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1429:    same time on the same vector.

1431:    Level: developer

1433: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()

1435: @*/
1436: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1437: {

1441:   VecCUDACopyToGPU(vin);
1442:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1443:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1444:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1445:   vin->valid_GPU_array = PETSC_CUDA_GPU;
1446:   return(0);
1447: }

1451: /*@C
1452:    VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1453:    with a GPU array provided by the user. This is useful to avoid copying
1454:    a GPU array into a vector.

1456:    Not Collective

1458:    Input Parameters:
1459: +  vec - the vector
1460: -  array - the GPU array

1462:    Notes:
1463:    This permanently replaces the GPU array and frees the memory associated
1464:    with the old GPU array.

1466:    The memory passed in CANNOT be freed by the user. It will be freed
1467:    when the vector is destroyed.

1469:    Not supported from Fortran

1471:    Level: developer

1473: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()

1475: @*/
1476: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1477: {
1478:   cudaError_t err;

1481:   err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1482:   ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1483:   vin->valid_GPU_array = PETSC_CUDA_GPU;
1484:   return(0);
1485: }
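
/*
   A sketch of VecCUDAReplaceArray() (illustrative only; n is assumed to be the vector's
   local size): allocate a device buffer, hand ownership to the vector, and do not free
   it yourself; it is released when the vector is destroyed.
*/
static PetscErrorCode ReplaceGPUArray_Example(Vec v,PetscInt n)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_new;

  PetscFunctionBegin;
  err  = cudaMalloc((void**)&d_new,n*sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemset(d_new,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  ierr = VecCUDAReplaceArray(v,d_new);CHKERRQ(ierr);   /* frees the old GPU array; d_new now belongs to v */
  PetscFunctionReturn(0);
}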

1489: /*@C
1490:    VecCUDAResetArray - Resets a vector to use its default memory. Call this
1491:    after the use of VecCUDAPlaceArray().

1493:    Not Collective

1495:    Input Parameters:
1496: .  vec - the vector

1498:    Level: developer

1500: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()

1502: @*/
1503: PetscErrorCode VecCUDAResetArray(Vec vin)
1504: {

1508:   VecCUDACopyToGPU(vin);
1509:   ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1510:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
1511:   vin->valid_GPU_array = PETSC_CUDA_GPU;
1512:   return(0);
1513: }
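
/*
   A sketch of VecCUDAPlaceArray()/VecCUDAResetArray() (illustrative only; d_user is
   assumed to be a user-managed device allocation with at least as many entries as the
   vector's local size): temporarily make the vector operate on the user buffer, then
   return it to its own GPU array.
*/
static PetscErrorCode UseBorrowedGPUArray_Example(Vec v,PetscScalar *d_user)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);   /* v now computes with d_user on the GPU */
  ierr = VecSet(v,1.0);CHKERRQ(ierr);                 /* fills d_user via VecSet_SeqCUDA() */
  ierr = VecCUDAResetArray(v);CHKERRQ(ierr);          /* v goes back to its original GPU array */
  PetscFunctionReturn(0);
}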