Actual source code: veccuda2.cu
petsc-3.11.0 2019-03-29
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <petsc/private/vecimpl.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
12: #include <cuda_runtime.h>
13: #include <thrust/device_ptr.h>
14: #include <thrust/transform.h>
15: #include <thrust/functional.h>
17: /*
18: Allocates space for the vector array on the GPU if it does not exist.
19: Does NOT change the PetscCUDAFlag for the vector
20: Does NOT zero the CUDA array
22: */
23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
24: {
26: cudaError_t err;
27: cudaStream_t stream;
28: Vec_CUDA *veccuda;
31: if (!v->spptr) {
32: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
33: veccuda = (Vec_CUDA*)v->spptr;
34: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
35: veccuda->GPUarray = veccuda->GPUarray_allocated;
36: err = cudaStreamCreate(&stream);CHKERRCUDA(err);
37: veccuda->stream = stream;
38: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
39: if (v->valid_GPU_array == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->valid_GPU_array = PETSC_OFFLOAD_CPU;
42: } else {
43: v->valid_GPU_array = PETSC_OFFLOAD_GPU;
44: }
45: }
46: }
47: return(0);
48: }
50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
51: PetscErrorCode VecCUDACopyToGPU(Vec v)
52: {
54: cudaError_t err;
55: Vec_CUDA *veccuda;
56: PetscScalar *varray;
60: VecCUDAAllocateCheck(v);
61: if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
62: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
63: veccuda=(Vec_CUDA*)v->spptr;
64: varray=veccuda->GPUarray;
65: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
66: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
67: v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
68: }
69: return(0);
70: }
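/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   A minimal example, under the stated assumptions, of how the lazy mirroring
   above surfaces at the user level: VecSetValues() dirties the host copy
   (PETSC_OFFLOAD_CPU), and the first subsequent GPU operation, here VecNorm(),
   triggers VecCUDACopyToGPU() exactly once.  The helper name is invented for
   this sketch; error checking is omitted, matching the rendered listing. */
static PetscErrorCode ExampleLazyMirroring(void)
{
  Vec         v;
  PetscReal   nrm;
  PetscInt    ix  = 0;
  PetscScalar one = 1.0;

  VecCreateSeqCUDA(PETSC_COMM_SELF,100,&v);
  VecSetValues(v,1,&ix,&one,INSERT_VALUES);  /* host-side write: flag becomes PETSC_OFFLOAD_CPU */
  VecAssemblyBegin(v);
  VecAssemblyEnd(v);
  VecNorm(v,NORM_2,&nrm);                    /* first GPU use: VecCUDACopyToGPU() copies the array up */
  VecDestroy(&v);
  return(0);
}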
72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
73: {
74: PetscScalar *varray;
76: cudaError_t err;
77: PetscScalar *cpuPtr, *gpuPtr;
78: Vec_Seq *s;
79: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
80: PetscInt lowestIndex,n;
84: VecCUDAAllocateCheck(v);
85: if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
86: s = (Vec_Seq*)v->data;
87: if (mode & SCATTER_REVERSE) {
88: lowestIndex = ptop_scatter->sendLowestIndex;
89: n = ptop_scatter->ns;
90: } else {
91: lowestIndex = ptop_scatter->recvLowestIndex;
92: n = ptop_scatter->nr;
93: }
95: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
96: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
97: gpuPtr = varray + lowestIndex;
98: cpuPtr = s->array + lowestIndex;
100: /* Note : this code copies the smallest contiguous chunk of data
101: containing ALL of the indices */
102: err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
104: // Set the buffer states
105: v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
106: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
107: }
108: return(0);
109: }
112: /*
113: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
114: */
115: PetscErrorCode VecCUDACopyFromGPU(Vec v)
116: {
118: cudaError_t err;
119: Vec_CUDA *veccuda;
120: PetscScalar *varray;
124: VecCUDAAllocateCheckHost(v);
125: if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
126: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
127: veccuda=(Vec_CUDA*)v->spptr;
128: varray=veccuda->GPUarray;
129: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
130: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
131: v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
132: }
133: return(0);
134: }
136: /* Note that this function only copies *some* of the values from the GPU back to the CPU,
137:    which means that we need to recombine the data at some point before using any of the standard functions.
138:    We could add another few flag-types to keep track of this, or treat these calls like VecGetArray()/VecRestoreArray(),
139:    which always have to be called in pairs.
140: */
141: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
142: {
143: const PetscScalar *varray, *gpuPtr;
144: PetscErrorCode ierr;
145: cudaError_t err;
146: PetscScalar *cpuPtr;
147: Vec_Seq *s;
148: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
149: PetscInt lowestIndex,n;
153: VecCUDAAllocateCheckHost(v);
154: if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
155: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
156: if (mode & SCATTER_REVERSE) {
157: lowestIndex = ptop_scatter->recvLowestIndex;
158: n = ptop_scatter->nr;
159: } else {
160: lowestIndex = ptop_scatter->sendLowestIndex;
161: n = ptop_scatter->ns;
162: }
164: varray=((Vec_CUDA*)v->spptr)->GPUarray;
165: s = (Vec_Seq*)v->data;
166: gpuPtr = varray + lowestIndex;
167: cpuPtr = s->array + lowestIndex;
169: /* Note : this code copies the smallest contiguous chunk of data
170: containing ALL of the indices */
171: err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
173: VecCUDARestoreArrayRead(v,&varray);
174: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
175: }
176: return(0);
177: }
179: /*MC
180: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
182: Options Database Keys:
183: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
185: Level: beginner
187: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
188: M*/
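/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   Creating a VECSEQCUDA vector, either by setting the type explicitly or by
   relying on -vec_type seqcuda together with VecSetFromOptions() as described
   above.  The helper name is invented for this sketch; error checking is
   omitted, matching the rendered listing. */
static PetscErrorCode ExampleCreateSeqCUDA(PetscInt n,Vec *v)
{
  VecCreate(PETSC_COMM_SELF,v);
  VecSetSizes(*v,n,n);
  VecSetType(*v,VECSEQCUDA);  /* explicit choice ... */
  VecSetFromOptions(*v);      /* ... or let -vec_type seqcuda override it here */
  VecSet(*v,1.0);
  return(0);
}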
190: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
191: {
192: const PetscScalar *xarray;
193: PetscScalar *yarray;
194: PetscErrorCode ierr;
195: PetscBLASInt one=1,bn;
196: PetscScalar sone=1.0;
197: cublasHandle_t cublasv2handle;
198: cublasStatus_t cberr;
199: cudaError_t err;
202: PetscCUBLASGetHandle(&cublasv2handle);
203: PetscBLASIntCast(yin->map->n,&bn);
204: VecCUDAGetArrayRead(xin,&xarray);
205: VecCUDAGetArrayReadWrite(yin,&yarray);
206: if (alpha == (PetscScalar)0.0) {
207: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
208: } else if (alpha == (PetscScalar)1.0) {
209: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
210: PetscLogFlops(2.0*yin->map->n);
211: } else {
212: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
213: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
214: PetscLogFlops(2.0*yin->map->n);
215: }
216: WaitForGPU();CHKERRCUDA(ierr);
217: VecCUDARestoreArrayRead(xin,&xarray);
218: VecCUDARestoreArrayReadWrite(yin,&yarray);
219: return(0);
220: }
222: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
223: {
224: const PetscScalar *xarray;
225: PetscScalar *yarray;
226: PetscErrorCode ierr;
227: PetscBLASInt one=1,bn;
228: cublasHandle_t cublasv2handle;
229: cublasStatus_t cberr;
232: PetscCUBLASGetHandle(&cublasv2handle);
233: if (alpha != (PetscScalar)0.0) {
234: PetscBLASIntCast(yin->map->n,&bn);
235: VecCUDAGetArrayRead(xin,&xarray);
236: VecCUDAGetArrayReadWrite(yin,&yarray);
237: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
238: WaitForGPU();CHKERRCUDA(ierr);
239: VecCUDARestoreArrayRead(xin,&xarray);
240: VecCUDARestoreArrayReadWrite(yin,&yarray);
241: PetscLogFlops(2.0*yin->map->n);
242: }
243: return(0);
244: }
246: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
247: {
248: PetscInt n = xin->map->n;
249: const PetscScalar *xarray=NULL,*yarray=NULL;
250: PetscScalar *warray=NULL;
251: thrust::device_ptr<const PetscScalar> xptr,yptr;
252: thrust::device_ptr<PetscScalar> wptr;
253: PetscErrorCode ierr;
256: VecCUDAGetArrayWrite(win,&warray);
257: VecCUDAGetArrayRead(xin,&xarray);
258: VecCUDAGetArrayRead(yin,&yarray);
259: try {
260: wptr = thrust::device_pointer_cast(warray);
261: xptr = thrust::device_pointer_cast(xarray);
262: yptr = thrust::device_pointer_cast(yarray);
263: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
264: WaitForGPU();CHKERRCUDA(ierr);
265: } catch (char *ex) {
266: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
267: }
268: PetscLogFlops(n);
269: VecCUDARestoreArrayRead(xin,&xarray);
270: VecCUDARestoreArrayRead(yin,&yarray);
271: VecCUDARestoreArrayWrite(win,&warray);
272: return(0);
273: }
275: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
276: {
277: const PetscScalar *xarray=NULL,*yarray=NULL;
278: PetscScalar *warray=NULL;
279: PetscErrorCode ierr;
280: PetscBLASInt one=1,bn;
281: cublasHandle_t cublasv2handle;
282: cublasStatus_t cberr;
283: cudaError_t err;
286: PetscCUBLASGetHandle(&cublasv2handle);
287: PetscBLASIntCast(win->map->n,&bn);
288: if (alpha == (PetscScalar)0.0) {
289: VecCopy_SeqCUDA(yin,win);
290: } else {
291: VecCUDAGetArrayRead(xin,&xarray);
292: VecCUDAGetArrayRead(yin,&yarray);
293: VecCUDAGetArrayWrite(win,&warray);
294: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
295: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
296: PetscLogFlops(2*win->map->n);
297: WaitForGPU();CHKERRCUDA(ierr);
298: VecCUDARestoreArrayRead(xin,&xarray);
299: VecCUDARestoreArrayRead(yin,&yarray);
300: VecCUDARestoreArrayWrite(win,&warray);
301: }
302: return(0);
303: }
305: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
306: {
308: PetscInt n = xin->map->n,j,j_rem;
309: PetscScalar alpha0,alpha1,alpha2,alpha3;
312: PetscLogFlops(nv*2.0*n);
313: switch (j_rem=nv&0x3) {
314: case 3:
315: alpha0 = alpha[0];
316: alpha1 = alpha[1];
317: alpha2 = alpha[2];
318: alpha += 3;
319: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
320: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
321: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
322: y += 3;
323: break;
324: case 2:
325: alpha0 = alpha[0];
326: alpha1 = alpha[1];
327: alpha +=2;
328: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
329: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
330: y +=2;
331: break;
332: case 1:
333: alpha0 = *alpha++;
334: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
335: y +=1;
336: break;
337: }
338: for (j=j_rem; j<nv; j+=4) {
339: alpha0 = alpha[0];
340: alpha1 = alpha[1];
341: alpha2 = alpha[2];
342: alpha3 = alpha[3];
343: alpha += 4;
344: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
345: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
346: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
347: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
348: y += 4;
349: }
350: WaitForGPU();CHKERRCUDA(ierr);
351: return(0);
352: }
354: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
355: {
356: const PetscScalar *xarray,*yarray;
357: PetscErrorCode ierr;
358: PetscBLASInt one=1,bn;
359: cublasHandle_t cublasv2handle;
360: cublasStatus_t cberr;
363: PetscCUBLASGetHandle(&cublasv2handle);
364: PetscBLASIntCast(yin->map->n,&bn);
365: VecCUDAGetArrayRead(xin,&xarray);
366: VecCUDAGetArrayRead(yin,&yarray);
367: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
368: cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
369: WaitForGPU();CHKERRCUDA(ierr);
370: if (xin->map->n >0) {
371: PetscLogFlops(2.0*xin->map->n-1);
372: }
373: VecCUDARestoreArrayRead(xin,&xarray);
374: VecCUDARestoreArrayRead(yin,&yarray);
375: return(0);
376: }
378: //
379: // CUDA kernels for MDot to follow
380: //
382: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
383: #define MDOT_WORKGROUP_SIZE 128
384: #define MDOT_WORKGROUP_NUM 128
386: #if !defined(PETSC_USE_COMPLEX)
387: // M = 2:
388: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
389: PetscInt size, PetscScalar *group_results)
390: {
391: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
392: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
393: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
394: PetscInt vec_start_index = blockIdx.x * entries_per_group;
395: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
397: PetscScalar entry_x = 0;
398: PetscScalar group_sum0 = 0;
399: PetscScalar group_sum1 = 0;
400: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
401: entry_x = x[i]; // load only once from global memory!
402: group_sum0 += entry_x * y0[i];
403: group_sum1 += entry_x * y1[i];
404: }
405: tmp_buffer[threadIdx.x] = group_sum0;
406: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
408: // parallel reduction
409: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
410: __syncthreads();
411: if (threadIdx.x < stride) {
412: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
413: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
414: }
415: }
417: // write result of group to group_results
418: if (threadIdx.x == 0) {
419: group_results[blockIdx.x] = tmp_buffer[0];
420: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
421: }
422: }
424: // M = 3:
425: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
426: PetscInt size, PetscScalar *group_results)
427: {
428: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
429: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
430: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
431: PetscInt vec_start_index = blockIdx.x * entries_per_group;
432: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
434: PetscScalar entry_x = 0;
435: PetscScalar group_sum0 = 0;
436: PetscScalar group_sum1 = 0;
437: PetscScalar group_sum2 = 0;
438: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
439: entry_x = x[i]; // load only once from global memory!
440: group_sum0 += entry_x * y0[i];
441: group_sum1 += entry_x * y1[i];
442: group_sum2 += entry_x * y2[i];
443: }
444: tmp_buffer[threadIdx.x] = group_sum0;
445: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
446: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
448: // parallel reduction
449: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
450: __syncthreads();
451: if (threadIdx.x < stride) {
452: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
453: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
454: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
455: }
456: }
458: // write result of group to group_results
459: if (threadIdx.x == 0) {
460: group_results[blockIdx.x ] = tmp_buffer[0];
461: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
462: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
463: }
464: }
466: // M = 4:
467: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
468: PetscInt size, PetscScalar *group_results)
469: {
470: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
471: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
472: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
473: PetscInt vec_start_index = blockIdx.x * entries_per_group;
474: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
476: PetscScalar entry_x = 0;
477: PetscScalar group_sum0 = 0;
478: PetscScalar group_sum1 = 0;
479: PetscScalar group_sum2 = 0;
480: PetscScalar group_sum3 = 0;
481: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
482: entry_x = x[i]; // load only once from global memory!
483: group_sum0 += entry_x * y0[i];
484: group_sum1 += entry_x * y1[i];
485: group_sum2 += entry_x * y2[i];
486: group_sum3 += entry_x * y3[i];
487: }
488: tmp_buffer[threadIdx.x] = group_sum0;
489: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
490: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
491: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
493: // parallel reduction
494: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
495: __syncthreads();
496: if (threadIdx.x < stride) {
497: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
498: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
499: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
500: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
501: }
502: }
504: // write result of group to group_results
505: if (threadIdx.x == 0) {
506: group_results[blockIdx.x ] = tmp_buffer[0];
507: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
508: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
509: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
510: }
511: }
513: // M = 8:
514: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
515: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
516: PetscInt size, PetscScalar *group_results)
517: {
518: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
519: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
520: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
521: PetscInt vec_start_index = blockIdx.x * entries_per_group;
522: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
524: PetscScalar entry_x = 0;
525: PetscScalar group_sum0 = 0;
526: PetscScalar group_sum1 = 0;
527: PetscScalar group_sum2 = 0;
528: PetscScalar group_sum3 = 0;
529: PetscScalar group_sum4 = 0;
530: PetscScalar group_sum5 = 0;
531: PetscScalar group_sum6 = 0;
532: PetscScalar group_sum7 = 0;
533: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
534: entry_x = x[i]; // load only once from global memory!
535: group_sum0 += entry_x * y0[i];
536: group_sum1 += entry_x * y1[i];
537: group_sum2 += entry_x * y2[i];
538: group_sum3 += entry_x * y3[i];
539: group_sum4 += entry_x * y4[i];
540: group_sum5 += entry_x * y5[i];
541: group_sum6 += entry_x * y6[i];
542: group_sum7 += entry_x * y7[i];
543: }
544: tmp_buffer[threadIdx.x] = group_sum0;
545: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
546: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
547: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
548: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
549: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
550: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
551: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
553: // parallel reduction
554: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
555: __syncthreads();
556: if (threadIdx.x < stride) {
557: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
558: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
559: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
560: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
561: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
562: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
563: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
564: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
565: }
566: }
568: // write result of group to group_results
569: if (threadIdx.x == 0) {
570: group_results[blockIdx.x ] = tmp_buffer[0];
571: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
572: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
573: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
574: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
575: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
576: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
577: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
578: }
579: }
580: #endif /* !defined(PETSC_USE_COMPLEX) */
582: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
583: {
584: PetscErrorCode ierr;
585: PetscInt i,n = xin->map->n,current_y_index = 0;
586: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
587: PetscScalar *group_results_gpu;
588: #if !defined(PETSC_USE_COMPLEX)
589: PetscInt j;
590: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
591: #endif
592: cudaError_t cuda_ierr;
593: PetscBLASInt one=1,bn;
594: cublasHandle_t cublasv2handle;
595: cublasStatus_t cberr;
598: PetscCUBLASGetHandle(&cublasv2handle);
599: PetscBLASIntCast(xin->map->n,&bn);
600: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
601: /* Handle the case of local size zero first */
602: if (!xin->map->n) {
603: for (i=0; i<nv; ++i) z[i] = 0;
604: return(0);
605: }
607: // allocate scratchpad memory for the results of individual work groups:
608:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
610: VecCUDAGetArrayRead(xin,&xptr);
612: while (current_y_index < nv)
613: {
614: switch (nv - current_y_index) {
616: case 7:
617: case 6:
618: case 5:
619: case 4:
620: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
621: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
622: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
623: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
625: #if defined(PETSC_USE_COMPLEX)
626: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
627: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
628: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
629: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
630: #else
631: // run kernel:
632: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
634:       // copy results back to the CPU:
635:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
637: // sum group results into z:
638: for (j=0; j<4; ++j) {
639: z[current_y_index + j] = 0;
640: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
641: }
642: #endif
643: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
644: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
645: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
646: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
647: current_y_index += 4;
648: break;
650: case 3:
651: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
652: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
653: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
655: #if defined(PETSC_USE_COMPLEX)
656: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
657: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
658: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
659: #else
660: // run kernel:
661: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
663:       // copy results back to the CPU:
664:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
666: // sum group results into z:
667: for (j=0; j<3; ++j) {
668: z[current_y_index + j] = 0;
669: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
670: }
671: #endif
673: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
674: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
675: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
676: current_y_index += 3;
677: break;
679: case 2:
680: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
681: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
683: #if defined(PETSC_USE_COMPLEX)
684: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
685: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
686: #else
687: // run kernel:
688: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
690:       // copy results back to the CPU:
691:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
693: // sum group results into z:
694: for (j=0; j<2; ++j) {
695: z[current_y_index + j] = 0;
696: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
697: }
698: #endif
699: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
700: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
701: current_y_index += 2;
702: break;
704: case 1:
705: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
706: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
707: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
708: current_y_index += 1;
709: break;
711: default: // 8 or more vectors left
712: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
713: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
714: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
715: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
716: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
717: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
718: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
719: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
721: #if defined(PETSC_USE_COMPLEX)
722: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
723: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
724: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
725: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
726: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
727: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
728: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
729: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
730: #else
731: // run kernel:
732: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
734:       // copy results back to the CPU:
735:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
737: // sum group results into z:
738: for (j=0; j<8; ++j) {
739: z[current_y_index + j] = 0;
740: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
741: }
742: #endif
743: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
744: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
745: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
746: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
747: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
748: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
749: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
750: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
751: current_y_index += 8;
752: break;
753: }
754: }
755: VecCUDARestoreArrayRead(xin,&xptr);
757:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
758: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
759: return(0);
760: }
762: #undef MDOT_WORKGROUP_SIZE
763: #undef MDOT_WORKGROUP_NUM
765: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
766: {
767: PetscInt n = xin->map->n;
768: PetscScalar *xarray=NULL;
769: thrust::device_ptr<PetscScalar> xptr;
770: PetscErrorCode ierr;
771: cudaError_t err;
774: VecCUDAGetArrayWrite(xin,&xarray);
775: if (alpha == (PetscScalar)0.0) {
776: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
777: } else {
778: try {
779: xptr = thrust::device_pointer_cast(xarray);
780: thrust::fill(xptr,xptr+n,alpha);
781: } catch (char *ex) {
782: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
783: }
784: }
785: WaitForGPU();CHKERRCUDA(ierr);
786: VecCUDARestoreArrayWrite(xin,&xarray);
787: return(0);
788: }
790: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
791: {
792: PetscScalar *xarray;
794: PetscBLASInt one=1,bn;
795: cublasHandle_t cublasv2handle;
796: cublasStatus_t cberr;
799: if (alpha == (PetscScalar)0.0) {
800: VecSet_SeqCUDA(xin,alpha);
801: } else if (alpha != (PetscScalar)1.0) {
802: PetscCUBLASGetHandle(&cublasv2handle);
803: PetscBLASIntCast(xin->map->n,&bn);
804: VecCUDAGetArrayReadWrite(xin,&xarray);
805: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
806: VecCUDARestoreArrayReadWrite(xin,&xarray);
807: }
808: WaitForGPU();CHKERRCUDA(ierr);
809: PetscLogFlops(xin->map->n);
810: return(0);
811: }
813: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
814: {
815: const PetscScalar *xarray,*yarray;
816: PetscErrorCode ierr;
817: PetscBLASInt one=1,bn;
818: cublasHandle_t cublasv2handle;
819: cublasStatus_t cberr;
822: PetscCUBLASGetHandle(&cublasv2handle);
823: PetscBLASIntCast(xin->map->n,&bn);
824: VecCUDAGetArrayRead(xin,&xarray);
825: VecCUDAGetArrayRead(yin,&yarray);
826: cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
827: WaitForGPU();CHKERRCUDA(ierr);
828: if (xin->map->n > 0) {
829: PetscLogFlops(2.0*xin->map->n-1);
830: }
831: VecCUDARestoreArrayRead(yin,&yarray);
832: VecCUDARestoreArrayRead(xin,&xarray);
833: return(0);
834: }
836: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
837: {
838: const PetscScalar *xarray;
839: PetscScalar *yarray;
840: PetscErrorCode ierr;
841: cudaError_t err;
844: if (xin != yin) {
845: if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
846: VecCUDAGetArrayRead(xin,&xarray);
847: VecCUDAGetArrayWrite(yin,&yarray);
848: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
849: WaitForGPU();CHKERRCUDA(ierr);
850: VecCUDARestoreArrayRead(xin,&xarray);
851: VecCUDARestoreArrayWrite(yin,&yarray);
853: } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
854:       /* copy in CPU if we are on the CPU */
855: VecCopy_SeqCUDA_Private(xin,yin);
856: } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
857: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
858: if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
859: /* copy in CPU */
860: VecCopy_SeqCUDA_Private(xin,yin);
862: } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
863: /* copy in GPU */
864: VecCUDAGetArrayRead(xin,&xarray);
865: VecCUDAGetArrayWrite(yin,&yarray);
866: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
867: VecCUDARestoreArrayRead(xin,&xarray);
868: VecCUDARestoreArrayWrite(yin,&yarray);
869: } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
870:       /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
871:          default to copy in GPU (this is an arbitrary choice) */
872: VecCUDAGetArrayRead(xin,&xarray);
873: VecCUDAGetArrayWrite(yin,&yarray);
874: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
875: VecCUDARestoreArrayRead(xin,&xarray);
876: VecCUDARestoreArrayWrite(yin,&yarray);
877: } else {
878: VecCopy_SeqCUDA_Private(xin,yin);
879: }
880: }
881: }
882: return(0);
883: }
885: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
886: {
888: PetscBLASInt one = 1,bn;
889: PetscScalar *xarray,*yarray;
890: cublasHandle_t cublasv2handle;
891: cublasStatus_t cberr;
894: PetscCUBLASGetHandle(&cublasv2handle);
895: PetscBLASIntCast(xin->map->n,&bn);
896: if (xin != yin) {
897: VecCUDAGetArrayReadWrite(xin,&xarray);
898: VecCUDAGetArrayReadWrite(yin,&yarray);
899: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
900: WaitForGPU();CHKERRCUDA(ierr);
901: VecCUDARestoreArrayReadWrite(xin,&xarray);
902: VecCUDARestoreArrayReadWrite(yin,&yarray);
903: }
904: return(0);
905: }
907: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
908: {
909: PetscErrorCode ierr;
910: PetscScalar a = alpha,b = beta;
911: const PetscScalar *xarray;
912: PetscScalar *yarray;
913: PetscBLASInt one = 1, bn;
914: cublasHandle_t cublasv2handle;
915: cublasStatus_t cberr;
916: cudaError_t err;
919: PetscCUBLASGetHandle(&cublasv2handle);
920: PetscBLASIntCast(yin->map->n,&bn);
921: if (a == (PetscScalar)0.0) {
922: VecScale_SeqCUDA(yin,beta);
923: } else if (b == (PetscScalar)1.0) {
924: VecAXPY_SeqCUDA(yin,alpha,xin);
925: } else if (a == (PetscScalar)1.0) {
926: VecAYPX_SeqCUDA(yin,beta,xin);
927: } else if (b == (PetscScalar)0.0) {
928: VecCUDAGetArrayRead(xin,&xarray);
929: VecCUDAGetArrayReadWrite(yin,&yarray);
930: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
931: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
932: PetscLogFlops(xin->map->n);
933: WaitForGPU();CHKERRCUDA(ierr);
934: VecCUDARestoreArrayRead(xin,&xarray);
935: VecCUDARestoreArrayReadWrite(yin,&yarray);
936: } else {
937: VecCUDAGetArrayRead(xin,&xarray);
938: VecCUDAGetArrayReadWrite(yin,&yarray);
939: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
940: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
941: VecCUDARestoreArrayRead(xin,&xarray);
942: VecCUDARestoreArrayReadWrite(yin,&yarray);
943: WaitForGPU();CHKERRCUDA(ierr);
944: PetscLogFlops(3.0*xin->map->n);
945: }
946: return(0);
947: }
949: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
950: {
952: PetscInt n = zin->map->n;
955: if (gamma == (PetscScalar)1.0) {
956: /* z = ax + b*y + z */
957: VecAXPY_SeqCUDA(zin,alpha,xin);
958: VecAXPY_SeqCUDA(zin,beta,yin);
959: PetscLogFlops(4.0*n);
960: } else {
961: /* z = a*x + b*y + c*z */
962: VecScale_SeqCUDA(zin,gamma);
963: VecAXPY_SeqCUDA(zin,alpha,xin);
964: VecAXPY_SeqCUDA(zin,beta,yin);
965: PetscLogFlops(5.0*n);
966: }
967: WaitForGPU();CHKERRCUDA(ierr);
968: return(0);
969: }
971: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
972: {
973: PetscInt n = win->map->n;
974: const PetscScalar *xarray,*yarray;
975: PetscScalar *warray;
976: thrust::device_ptr<const PetscScalar> xptr,yptr;
977: thrust::device_ptr<PetscScalar> wptr;
978: PetscErrorCode ierr;
981: VecCUDAGetArrayReadWrite(win,&warray);
982: VecCUDAGetArrayRead(xin,&xarray);
983: VecCUDAGetArrayRead(yin,&yarray);
984: try {
985: wptr = thrust::device_pointer_cast(warray);
986: xptr = thrust::device_pointer_cast(xarray);
987: yptr = thrust::device_pointer_cast(yarray);
988: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
989: WaitForGPU();CHKERRCUDA(ierr);
990: } catch (char *ex) {
991: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
992: }
993: VecCUDARestoreArrayRead(xin,&xarray);
994: VecCUDARestoreArrayRead(yin,&yarray);
995: VecCUDARestoreArrayReadWrite(win,&warray);
996: PetscLogFlops(n);
997: return(0);
998: }
1000: /* should do infinity norm in cuda */
1002: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1003: {
1004: PetscErrorCode ierr;
1005: PetscInt n = xin->map->n;
1006: PetscBLASInt one = 1, bn;
1007: const PetscScalar *xarray;
1008: cublasHandle_t cublasv2handle;
1009: cublasStatus_t cberr;
1010: cudaError_t err;
1013: PetscCUBLASGetHandle(&cublasv2handle);
1014: PetscBLASIntCast(n,&bn);
1015: if (type == NORM_2 || type == NORM_FROBENIUS) {
1016: VecCUDAGetArrayRead(xin,&xarray);
1017: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1018: WaitForGPU();CHKERRCUDA(ierr);
1019: VecCUDARestoreArrayRead(xin,&xarray);
1020: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1021: } else if (type == NORM_INFINITY) {
1022: int i;
1023: VecCUDAGetArrayRead(xin,&xarray);
1024: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1025: if (bn) {
1026: PetscScalar zs;
1028: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1029: *z = PetscAbsScalar(zs);
1030: } else *z = 0.0;
1031: VecCUDARestoreArrayRead(xin,&xarray);
1032: } else if (type == NORM_1) {
1033: VecCUDAGetArrayRead(xin,&xarray);
1034: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1035: VecCUDARestoreArrayRead(xin,&xarray);
1036: WaitForGPU();CHKERRCUDA(ierr);
1037: PetscLogFlops(PetscMax(n-1.0,0.0));
1038: } else if (type == NORM_1_AND_2) {
1039: VecNorm_SeqCUDA(xin,NORM_1,z);
1040: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1041: }
1042: return(0);
1043: }
1045: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1046: {
1047: PetscErrorCode ierr;
1048: PetscReal n=s->map->n;
1049: const PetscScalar *sarray,*tarray;
1052: VecCUDAGetArrayRead(s,&sarray);
1053: VecCUDAGetArrayRead(t,&tarray);
1054: VecDot_SeqCUDA(s,t,dp);
1055: VecDot_SeqCUDA(t,t,nm);
1056: VecCUDARestoreArrayRead(s,&sarray);
1057: VecCUDARestoreArrayRead(t,&tarray);
1058: WaitForGPU();CHKERRCUDA(ierr);
1059: PetscLogFlops(4.0*n);
1060: return(0);
1061: }
1063: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1064: {
1066: cudaError_t err;
1069: if (v->spptr) {
1070: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1071: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1072: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1073: }
1074: if (((Vec_CUDA*)v->spptr)->stream) {
1075: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1076: }
1077: PetscFree(v->spptr);
1078: }
1079: VecDestroy_SeqCUDA_Private(v);
1080: return(0);
1081: }
1083: #if defined(PETSC_USE_COMPLEX)
1084: struct conjugate
1085: {
1086: __host__ __device__
1087: PetscScalar operator()(PetscScalar x)
1088: {
1089: return PetscConj(x);
1090: }
1091: };
1092: #endif
1094: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1095: {
1096: PetscScalar *xarray;
1097: PetscErrorCode ierr;
1098: #if defined(PETSC_USE_COMPLEX)
1099: PetscInt n = xin->map->n;
1100: thrust::device_ptr<PetscScalar> xptr;
1101: #endif
1104: VecCUDAGetArrayReadWrite(xin,&xarray);
1105: #if defined(PETSC_USE_COMPLEX)
1106: try {
1107: xptr = thrust::device_pointer_cast(xarray);
1108: thrust::transform(xptr,xptr+n,xptr,conjugate());
1109: WaitForGPU();CHKERRCUDA(ierr);
1110: } catch (char *ex) {
1111: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1112: }
1113: #endif
1114: VecCUDARestoreArrayReadWrite(xin,&xarray);
1115: return(0);
1116: }
1118: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1119: {
1121: cudaError_t err;
1128: if (w->data) {
1129: if (((Vec_Seq*)w->data)->array_allocated) {
1130: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1131: }
1132: ((Vec_Seq*)w->data)->array = NULL;
1133: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1134: }
1135: if (w->spptr) {
1136: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1137: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1138: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1139: }
1140: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1141: PetscFree(w->spptr);
1142: }
1144: if (v->petscnative) {
1145: PetscFree(w->data);
1146: w->data = v->data;
1147: w->valid_GPU_array = v->valid_GPU_array;
1148: w->spptr = v->spptr;
1149: PetscObjectStateIncrease((PetscObject)w);
1150: } else {
1151: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1152: w->valid_GPU_array = PETSC_OFFLOAD_CPU;
1153: VecCUDAAllocateCheck(w);
1154: }
1155: return(0);
1156: }
1158: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1159: {
1161: cudaError_t err;
1168: if (v->petscnative) {
1169: v->data = w->data;
1170: v->valid_GPU_array = w->valid_GPU_array;
1171: v->spptr = w->spptr;
1172: VecCUDACopyFromGPU(v);
1173: PetscObjectStateIncrease((PetscObject)v);
1174: w->data = 0;
1175: w->valid_GPU_array = PETSC_OFFLOAD_UNALLOCATED;
1176: w->spptr = 0;
1177: } else {
1178: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1179: if ((Vec_CUDA*)w->spptr) {
1180: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1181: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1182: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1183: PetscFree(w->spptr);
1184: }
1185: }
1186: return(0);
1187: }
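/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   The two routines above back the public VecGetLocalVector()/VecRestoreLocalVector()
   interface: for native vectors, w temporarily takes over v's host and GPU buffers,
   so the local part of v can be used as a standalone sequential CUDA vector without
   a copy.  The helper name is invented for this sketch; error checking is omitted,
   matching the rendered listing. */
static PetscErrorCode ExampleLocalVector(Vec v)
{
  Vec      w;
  PetscInt n;

  VecGetLocalSize(v,&n);
  VecCreateSeqCUDA(PETSC_COMM_SELF,n,&w);
  VecGetLocalVector(v,w);      /* w now aliases v's buffers (no copy for native vectors) */
  /* ... use w with any sequential operation, e.g. VecScale(w,2.0); ... */
  VecRestoreLocalVector(v,w);  /* hand the buffers back to v */
  VecDestroy(&w);
  return(0);
}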
1189: /*@C
1190: VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.
1192: This function has semantics similar to VecGetArray(): the pointer
1193: returned by this function points to a consistent view of the vector
1194: data. This may involve a copy operation of data from the host to the
1195: device if the data on the device is out of date. If the device
1196: memory hasn't been allocated previously it will be allocated as part
1197: of this function call. VecCUDAGetArrayReadWrite() assumes that
1198: the user will modify the vector data. This is similar to
1199:    intent(inout) in Fortran.
1201: The CUDA device pointer has to be released by calling
1202: VecCUDARestoreArrayReadWrite(). Upon restoring the vector data
1203: the data on the host will be marked as out of date. A subsequent
1204: access of the host data will thus incur a data transfer from the
1205: device to the host.
1208: Input Parameter:
1209: . v - the vector
1211: Output Parameter:
1212: . a - the CUDA device pointer
1214: Fortran note:
1215: This function is not currently available from Fortran.
1217: Level: intermediate
1219: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1220: @*/
1221: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1222: {
1227: *a = 0;
1228: VecCUDACopyToGPU(v);
1229: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1230: return(0);
1231: }
1233: /*@C
1234: VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().
1236: This marks the host data as out of date. Subsequent access to the
1237: vector data on the host side with for instance VecGetArray() incurs a
1238: data transfer.
1240: Input Parameter:
1241: + v - the vector
1242: - a - the CUDA device pointer. This pointer is invalid after
1243: VecCUDARestoreArrayReadWrite() returns.
1245: Fortran note:
1246: This function is not currently available from Fortran.
1248: Level: intermediate
1250: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1251: @*/
1252: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1253: {
1258: v->valid_GPU_array = PETSC_OFFLOAD_GPU;
1260: PetscObjectStateIncrease((PetscObject)v);
1261: return(0);
1262: }
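/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   Typical use of the read-write accessor pair documented above: obtain the device
   pointer, launch a user kernel that updates the entries in place, then restore so
   the host copy is marked out of date.  The kernel and helper names are invented
   for this sketch; error checking is omitted, matching the rendered listing. */
__global__ void Example_SquareEntries(PetscScalar *x,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] = x[i]*x[i];
}

static PetscErrorCode ExampleSquareEntries(Vec v)
{
  PetscScalar *d_x;
  PetscInt    n;
  int         nblocks;

  VecGetLocalSize(v,&n);
  nblocks = (int)((n+255)/256);
  VecCUDAGetArrayReadWrite(v,&d_x);       /* copies host->device only if the device copy is stale */
  if (n) Example_SquareEntries<<<nblocks,256>>>(d_x,n);
  VecCUDARestoreArrayReadWrite(v,&d_x);   /* the host copy is now marked out of date */
  return(0);
}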
1264: /*@C
1265: VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.
1267: This function is analogous to VecGetArrayRead(): The pointer
1268: returned by this function points to a consistent view of the vector
1269: data. This may involve a copy operation of data from the host to the
1270: device if the data on the device is out of date. If the device
1271: memory hasn't been allocated previously it will be allocated as part
1272: of this function call. VecCUDAGetArrayRead() assumes that the
1273:    user will not modify the vector data. This is analogous to
1274: intent(in) in Fortran.
1276: The CUDA device pointer has to be released by calling
1277: VecCUDARestoreArrayRead(). If the data on the host side was
1278: previously up to date it will remain so, i.e. data on both the device
1279: and the host is up to date. Accessing data on the host side does not
1280: incur a device to host data transfer.
1282: Input Parameter:
1283: . v - the vector
1285: Output Parameter:
1286: . a - the CUDA pointer.
1288: Fortran note:
1289: This function is not currently available from Fortran.
1291: Level: intermediate
1293: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1294: @*/
1295: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1296: {
1301: *a = 0;
1302: VecCUDACopyToGPU(v);
1303: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1304: return(0);
1305: }
1307: /*@C
1308: VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().
1310: If the data on the host side was previously up to date it will remain
1311: so, i.e. data on both the device and the host is up to date.
1312: Accessing data on the host side e.g. with VecGetArray() does not
1313: incur a device to host data transfer.
1315: Input Parameter:
1316: + v - the vector
1317: - a - the CUDA device pointer. This pointer is invalid after
1318: VecCUDARestoreArrayRead() returns.
1320: Fortran note:
1321: This function is not currently available from Fortran.
1323: Level: intermediate
1325: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1326: @*/
1327: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1328: {
1331: return(0);
1332: }
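/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   Read-only access as documented above: the device pointer may be handed to any
   CUDA call that does not modify it, here a plain cudaMemcpy of the first m
   entries (m is assumed to be at most the local size) into a user buffer.
   Because only the read accessors are used, a host copy that was up to date stays
   up to date.  The helper name is invented for this sketch; error checking is
   omitted, matching the rendered listing. */
static PetscErrorCode ExamplePeekEntries(Vec v,PetscInt m,PetscScalar *host_buf)
{
  const PetscScalar *d_x;

  VecCUDAGetArrayRead(v,&d_x);
  cudaMemcpy(host_buf,d_x,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
  VecCUDARestoreArrayRead(v,&d_x);
  return(0);
}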
1334: /*@C
1335: VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.
1337: The data pointed to by the device pointer is uninitialized. The user
1338: may not read from this data. Furthermore, the entire array needs to
1339: be filled by the user to obtain well-defined behaviour. The device
1340: memory will be allocated by this function if it hasn't been allocated
1341: previously. This is analogous to intent(out) in Fortran.
1343: The device pointer needs to be released with
1344: VecCUDARestoreArrayWrite(). When the pointer is released the
1345:    host data of the vector is marked as out of date. Subsequent access
1346: of the host data with e.g. VecGetArray() incurs a device to host data
1347: transfer.
1350: Input Parameter:
1351: . v - the vector
1353: Output Parameter:
1354: . a - the CUDA pointer
1356: Fortran note:
1357: This function is not currently available from Fortran.
1359: Level: advanced
1361: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1362: @*/
1363: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1364: {
1369: *a = 0;
1370: VecCUDAAllocateCheck(v);
1371: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1372: return(0);
1373: }
1375: /*@C
1376: VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().
1378: Data on the host will be marked as out of date. Subsequent access of
1379: the data on the host side e.g. with VecGetArray() will incur a device
1380: to host data transfer.
1382: Input Parameter:
1383: + v - the vector
1384: - a - the CUDA device pointer. This pointer is invalid after
1385: VecCUDARestoreArrayWrite() returns.
1387: Fortran note:
1388: This function is not currently available from Fortran.
1390: Level: intermediate
1392: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1393: @*/
1394: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1395: {
1400: v->valid_GPU_array = PETSC_OFFLOAD_GPU;
1402: PetscObjectStateIncrease((PetscObject)v);
1403: return(0);
1404: }
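/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   Write-only access as documented above: the previous contents are never read, so
   no host-to-device copy is triggered, but the caller must fill the entire array
   before restoring.  The kernel and helper names are invented for this sketch;
   error checking is omitted, matching the rendered listing. */
__global__ void Example_FillValue(PetscScalar *x,PetscInt n,PetscScalar a)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] = a;
}

static PetscErrorCode ExampleFillValue(Vec v,PetscScalar a)
{
  PetscScalar *d_x;
  PetscInt    n;

  VecGetLocalSize(v,&n);
  VecCUDAGetArrayWrite(v,&d_x);            /* no copy: contents are undefined on entry */
  if (n) Example_FillValue<<<(int)((n+255)/256),256>>>(d_x,n,a);
  VecCUDARestoreArrayWrite(v,&d_x);        /* host copy marked out of date */
  return(0);
}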
1406: /*@C
1407: VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1408: GPU array provided by the user. This is useful to avoid copying an
1409: array into a vector.
1411: Not Collective
1413: Input Parameters:
1414: + vec - the vector
1415: - array - the GPU array
1417: Notes:
1418:    You can return to the original GPU array with a call to VecCUDAResetArray().
1419: It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1420: same time on the same vector.
1422: Level: developer
1424: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()
1426: @*/
1427: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1428: {
1433: VecCUDACopyToGPU(vin);
1434: if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1435: ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1436: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1437: vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1438: PetscObjectStateIncrease((PetscObject)vin);
1439: return(0);
1440: }
1442: /*@C
1443: VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1444: with a GPU array provided by the user. This is useful to avoid copying
1445: a GPU array into a vector.
1447: Not Collective
1449: Input Parameters:
1450: + vec - the vector
1451: - array - the GPU array
1453: Notes:
1454: This permanently replaces the GPU array and frees the memory associated
1455: with the old GPU array.
1457: The memory passed in CANNOT be freed by the user. It will be freed
1458: when the vector is destroyed.
1460: Not supported from Fortran
1462: Level: developer
1464: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()
1466: @*/
1467: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1468: {
1469: cudaError_t err;
1474: err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1475: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1476: vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1477: PetscObjectStateIncrease((PetscObject)vin);
1478: return(0);
1479: }
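/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   Permanently replacing the GPU array as documented above: the vector takes
   ownership of the new device buffer and frees the old one, so the caller must
   not cudaFree() the buffer it passed in.  The helper name is invented for this
   sketch; error checking is omitted, matching the rendered listing. */
static PetscErrorCode ExampleReplaceGPUArray(Vec v)
{
  PetscScalar *new_d;
  PetscInt    n;

  VecGetLocalSize(v,&n);
  cudaMalloc((void**)&new_d,n*sizeof(PetscScalar));
  cudaMemset(new_d,0,n*sizeof(PetscScalar));
  VecCUDAReplaceArray(v,new_d);  /* v now owns new_d; it is freed when v is destroyed */
  return(0);
}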
1481: /*@C
1482: VecCUDAResetArray - Resets a vector to use its default memory. Call this
1483: after the use of VecCUDAPlaceArray().
1485: Not Collective
1487: Input Parameters:
1488: . vec - the vector
1490: Level: developer
1492: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()
1494: @*/
1495: PetscErrorCode VecCUDAResetArray(Vec vin)
1496: {
1501: VecCUDACopyToGPU(vin);
1502: ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1503: ((Vec_Seq*)vin->data)->unplacedarray = 0;
1504: vin->valid_GPU_array = PETSC_OFFLOAD_GPU;
1505: PetscObjectStateIncrease((PetscObject)vin);
1506: return(0);
1507: }
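/* ----------------------------------------------------------------------------
   Illustrative usage sketch -- not part of the original veccuda2.cu.
   Pairing VecCUDAPlaceArray() with VecCUDAResetArray() as documented above: a
   user-owned device buffer temporarily becomes the vector's GPU storage, and the
   original array is brought back afterwards; the placed buffer remains owned (and
   is freed) by the caller.  The helper name is invented for this sketch; error
   checking is omitted, matching the rendered listing. */
static PetscErrorCode ExamplePlaceAndReset(Vec v)
{
  PetscScalar *user_d;
  PetscInt    n;

  VecGetLocalSize(v,&n);
  cudaMalloc((void**)&user_d,n*sizeof(PetscScalar));
  cudaMemset(user_d,0,n*sizeof(PetscScalar));

  VecCUDAPlaceArray(v,user_d);   /* v's GPU storage is now user_d (its contents, here zeros, become v's values) */
  /* ... GPU operations on v work on user_d ... */
  VecCUDAResetArray(v);          /* the original GPU array is restored */

  cudaFree(user_d);              /* still owned by the caller */
  return(0);
}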