Actual source code: veccuda2.cu
petsc-3.8.3 2017-12-09
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <petsc/private/vecimpl.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
12: #include <cuda_runtime.h>
13: #include <thrust/device_ptr.h>
14: #include <thrust/transform.h>
15: #include <thrust/functional.h>
17: /*
18: Allocates space for the vector array on the GPU if it does not exist.
19: Does NOT change the PetscCUDAFlag for the vector
20: Does NOT zero the CUDA array
22: */
23: PetscErrorCode VecCUDAAllocateCheck(Vec v)
24: {
26: cudaError_t err;
27: cudaStream_t stream;
28: Vec_CUDA *veccuda;
31: if (!v->spptr) {
32: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
33: veccuda = (Vec_CUDA*)v->spptr;
34: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
35: veccuda->GPUarray = veccuda->GPUarray_allocated;
36: err = cudaStreamCreate(&stream);CHKERRCUDA(err);
37: veccuda->stream = stream;
38: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
39: if (v->valid_GPU_array == PETSC_CUDA_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->valid_GPU_array = PETSC_CUDA_CPU;
42: } else {
43: v->valid_GPU_array = PETSC_CUDA_GPU;
44: }
45: }
46: }
47: return(0);
48: }
50: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
51: PetscErrorCode VecCUDACopyToGPU(Vec v)
52: {
54: cudaError_t err;
55: Vec_CUDA *veccuda;
56: PetscScalar *varray;
60: VecCUDAAllocateCheck(v);
61: if (v->valid_GPU_array == PETSC_CUDA_CPU) {
62: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
63: veccuda=(Vec_CUDA*)v->spptr;
64: varray=veccuda->GPUarray;
65: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
66: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
67: v->valid_GPU_array = PETSC_CUDA_BOTH;
68: }
69: return(0);
70: }
72: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci)
73: {
74: PetscScalar *varray;
76: cudaError_t err;
77: PetscScalar *cpuPtr, *gpuPtr;
78: Vec_Seq *s;
79: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
83: VecCUDAAllocateCheck(v);
84: if (v->valid_GPU_array == PETSC_CUDA_CPU) {
85: s = (Vec_Seq*)v->data;
87: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
88: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
89: gpuPtr = varray + ptop_scatter->recvLowestIndex;
90: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
92: /* Note : this code copies the smallest contiguous chunk of data
93: containing ALL of the indices */
94: err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
96: // Set the buffer states
97: v->valid_GPU_array = PETSC_CUDA_BOTH;
98: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
99: }
100: return(0);
101: }
104: /*
105: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
106: */
107: PetscErrorCode VecCUDACopyFromGPU(Vec v)
108: {
110: cudaError_t err;
111: Vec_CUDA *veccuda;
112: PetscScalar *varray;
116: VecCUDAAllocateCheckHost(v);
117: if (v->valid_GPU_array == PETSC_CUDA_GPU) {
118: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
119: veccuda=(Vec_CUDA*)v->spptr;
120: varray=veccuda->GPUarray;
121: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
122: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
123: v->valid_GPU_array = PETSC_CUDA_BOTH;
124: }
125: return(0);
126: }
128: /* Note that this function only copies *some* of the values up from the GPU to CPU,
129:    which means that we need to recombine the data at some point before using any of the standard functions.
130:    We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
131:    where the calls always have to come in pairs.
132: */
133: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci)
134: {
135: const PetscScalar *varray, *gpuPtr;
136: PetscErrorCode ierr;
137: cudaError_t err;
138: PetscScalar *cpuPtr;
139: Vec_Seq *s;
140: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
144: VecCUDAAllocateCheckHost(v);
145: if (v->valid_GPU_array == PETSC_CUDA_GPU) {
146: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
148: varray=((Vec_CUDA*)v->spptr)->GPUarray;
149: s = (Vec_Seq*)v->data;
150: gpuPtr = varray + ptop_scatter->sendLowestIndex;
151: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
153: /* Note : this code copies the smallest contiguous chunk of data
154: containing ALL of the indices */
155: err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
157: VecCUDARestoreArrayRead(v,&varray);
158: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
159: }
160: return(0);
161: }
163: /*MC
164: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
166: Options Database Keys:
167: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
169: Level: beginner
171: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
172: M*/
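/*
   As a brief illustration of the manual page above: a minimal sketch of creating a
   VECSEQCUDA vector directly with VecSetType(). The helper name and the length n are
   illustrative only; standard PETSc error checking with CHKERRQ() is assumed.
*/
static PetscErrorCode ExampleCreateSeqCUDA(PetscInt n)
{
  PetscErrorCode ierr;
  Vec            x;
  PetscReal      nrm;

  ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
  ierr = VecSetSizes(x,n,n);CHKERRQ(ierr);
  ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);  /* equivalently, -vec_type seqcuda with VecSetFromOptions() */
  ierr = VecSet(x,1.0);CHKERRQ(ierr);             /* executed on the GPU via VecSet_SeqCUDA() below */
  ierr = VecNorm(x,NORM_2,&nrm);CHKERRQ(ierr);
  ierr = VecDestroy(&x);CHKERRQ(ierr);
  return(0);
}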
174: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
175: {
176: const PetscScalar *xarray;
177: PetscScalar *yarray;
178: PetscErrorCode ierr;
179: PetscBLASInt one=1,bn;
180: PetscScalar sone=1.0;
181: cublasStatus_t cberr;
182: cudaError_t err;
185: PetscBLASIntCast(yin->map->n,&bn);
186: VecCUDAGetArrayRead(xin,&xarray);
187: VecCUDAGetArrayReadWrite(yin,&yarray);
188: if (alpha == (PetscScalar)0.0) {
189: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
190: } else if (alpha == (PetscScalar)1.0) {
191: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
192: PetscLogFlops(2.0*yin->map->n);
193: } else {
194: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
195: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
196: PetscLogFlops(2.0*yin->map->n);
197: }
198: WaitForGPU();CHKERRCUDA(ierr);
199: VecCUDARestoreArrayRead(xin,&xarray);
200: VecCUDARestoreArrayReadWrite(yin,&yarray);
201: return(0);
202: }
204: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
205: {
206: const PetscScalar *xarray;
207: PetscScalar *yarray;
208: PetscErrorCode ierr;
209: PetscBLASInt one=1,bn;
210: cublasStatus_t cberr;
213: if (alpha != (PetscScalar)0.0) {
214: PetscBLASIntCast(yin->map->n,&bn);
215: VecCUDAGetArrayRead(xin,&xarray);
216: VecCUDAGetArrayReadWrite(yin,&yarray);
217: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
218: WaitForGPU();CHKERRCUDA(ierr);
219: VecCUDARestoreArrayRead(xin,&xarray);
220: VecCUDARestoreArrayReadWrite(yin,&yarray);
221: PetscLogFlops(2.0*yin->map->n);
222: }
223: return(0);
224: }
226: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
227: {
228: PetscInt n = xin->map->n;
229: const PetscScalar *xarray=NULL,*yarray=NULL;
230: PetscScalar *warray=NULL;
231: thrust::device_ptr<const PetscScalar> xptr,yptr;
232: thrust::device_ptr<PetscScalar> wptr;
233: PetscErrorCode ierr;
236: VecCUDAGetArrayWrite(win,&warray);
237: VecCUDAGetArrayRead(xin,&xarray);
238: VecCUDAGetArrayRead(yin,&yarray);
239: try {
240: wptr = thrust::device_pointer_cast(warray);
241: xptr = thrust::device_pointer_cast(xarray);
242: yptr = thrust::device_pointer_cast(yarray);
243: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
244: WaitForGPU();CHKERRCUDA(ierr);
245: } catch (char *ex) {
246: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
247: }
248: PetscLogFlops(n);
249: VecCUDARestoreArrayRead(xin,&xarray);
250: VecCUDARestoreArrayRead(yin,&yarray);
251: VecCUDARestoreArrayWrite(win,&warray);
252: return(0);
253: }
255: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
256: {
257: const PetscScalar *xarray=NULL,*yarray=NULL;
258: PetscScalar *warray=NULL;
259: PetscErrorCode ierr;
260: PetscBLASInt one=1,bn;
261: cublasStatus_t cberr;
262: cudaError_t err;
265: PetscBLASIntCast(win->map->n,&bn);
266: if (alpha == (PetscScalar)0.0) {
267: VecCopy_SeqCUDA(yin,win);
268: } else {
269: VecCUDAGetArrayRead(xin,&xarray);
270: VecCUDAGetArrayRead(yin,&yarray);
271: VecCUDAGetArrayWrite(win,&warray);
272: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
273: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
274: PetscLogFlops(2*win->map->n);
275: WaitForGPU();CHKERRCUDA(ierr);
276: VecCUDARestoreArrayRead(xin,&xarray);
277: VecCUDARestoreArrayRead(yin,&yarray);
278: VecCUDARestoreArrayWrite(win,&warray);
279: }
280: return(0);
281: }
283: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
284: {
286: PetscInt n = xin->map->n,j,j_rem;
287: PetscScalar alpha0,alpha1,alpha2,alpha3;
290: PetscLogFlops(nv*2.0*n);
291: switch (j_rem=nv&0x3) {
292: case 3:
293: alpha0 = alpha[0];
294: alpha1 = alpha[1];
295: alpha2 = alpha[2];
296: alpha += 3;
297: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
298: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
299: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
300: y += 3;
301: break;
302: case 2:
303: alpha0 = alpha[0];
304: alpha1 = alpha[1];
305: alpha +=2;
306: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
307: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
308: y +=2;
309: break;
310: case 1:
311: alpha0 = *alpha++;
312: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
313: y +=1;
314: break;
315: }
316: for (j=j_rem; j<nv; j+=4) {
317: alpha0 = alpha[0];
318: alpha1 = alpha[1];
319: alpha2 = alpha[2];
320: alpha3 = alpha[3];
321: alpha += 4;
322: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
323: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
324: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
325: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
326: y += 4;
327: }
328: WaitForGPU();CHKERRCUDA(ierr);
329: return(0);
330: }
332: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
333: {
334: const PetscScalar *xarray,*yarray;
335: PetscErrorCode ierr;
336: PetscBLASInt one=1,bn;
337: cublasStatus_t cberr;
340: PetscBLASIntCast(yin->map->n,&bn);
341: VecCUDAGetArrayRead(xin,&xarray);
342: VecCUDAGetArrayRead(yin,&yarray);
343: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
344: cberr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cberr);
345: WaitForGPU();CHKERRCUDA(ierr);
346: if (xin->map->n >0) {
347: PetscLogFlops(2.0*xin->map->n-1);
348: }
349: VecCUDARestoreArrayRead(xin,&xarray);
350: VecCUDARestoreArrayRead(yin,&yarray);
351: return(0);
352: }
354: //
355: // CUDA kernels for MDot to follow
356: //
358: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
359: #define MDOT_WORKGROUP_SIZE 128
360: #define MDOT_WORKGROUP_NUM 128
362: #if !defined(PETSC_USE_COMPLEX)
363: // M = 2:
364: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
365: PetscInt size, PetscScalar *group_results)
366: {
367: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
368: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
369: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
370: PetscInt vec_start_index = blockIdx.x * entries_per_group;
371: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
373: PetscScalar entry_x = 0;
374: PetscScalar group_sum0 = 0;
375: PetscScalar group_sum1 = 0;
376: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
377: entry_x = x[i]; // load only once from global memory!
378: group_sum0 += entry_x * y0[i];
379: group_sum1 += entry_x * y1[i];
380: }
381: tmp_buffer[threadIdx.x] = group_sum0;
382: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
384: // parallel reduction
385: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
386: __syncthreads();
387: if (threadIdx.x < stride) {
388: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
389: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
390: }
391: }
393: // write result of group to group_results
394: if (threadIdx.x == 0) {
395: group_results[blockIdx.x] = tmp_buffer[0];
396: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
397: }
398: }
400: // M = 3:
401: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
402: PetscInt size, PetscScalar *group_results)
403: {
404: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
405: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
406: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
407: PetscInt vec_start_index = blockIdx.x * entries_per_group;
408: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
410: PetscScalar entry_x = 0;
411: PetscScalar group_sum0 = 0;
412: PetscScalar group_sum1 = 0;
413: PetscScalar group_sum2 = 0;
414: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
415: entry_x = x[i]; // load only once from global memory!
416: group_sum0 += entry_x * y0[i];
417: group_sum1 += entry_x * y1[i];
418: group_sum2 += entry_x * y2[i];
419: }
420: tmp_buffer[threadIdx.x] = group_sum0;
421: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
422: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
424: // parallel reduction
425: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
426: __syncthreads();
427: if (threadIdx.x < stride) {
428: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
429: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
430: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
431: }
432: }
434: // write result of group to group_results
435: if (threadIdx.x == 0) {
436: group_results[blockIdx.x ] = tmp_buffer[0];
437: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
438: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
439: }
440: }
442: // M = 4:
443: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
444: PetscInt size, PetscScalar *group_results)
445: {
446: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
447: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
448: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
449: PetscInt vec_start_index = blockIdx.x * entries_per_group;
450: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
452: PetscScalar entry_x = 0;
453: PetscScalar group_sum0 = 0;
454: PetscScalar group_sum1 = 0;
455: PetscScalar group_sum2 = 0;
456: PetscScalar group_sum3 = 0;
457: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
458: entry_x = x[i]; // load only once from global memory!
459: group_sum0 += entry_x * y0[i];
460: group_sum1 += entry_x * y1[i];
461: group_sum2 += entry_x * y2[i];
462: group_sum3 += entry_x * y3[i];
463: }
464: tmp_buffer[threadIdx.x] = group_sum0;
465: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
466: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
467: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
469: // parallel reduction
470: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
471: __syncthreads();
472: if (threadIdx.x < stride) {
473: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
474: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
475: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
476: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
477: }
478: }
480: // write result of group to group_results
481: if (threadIdx.x == 0) {
482: group_results[blockIdx.x ] = tmp_buffer[0];
483: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
484: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
485: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
486: }
487: }
489: // M = 8:
490: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
491: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
492: PetscInt size, PetscScalar *group_results)
493: {
494: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
495: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
496: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
497: PetscInt vec_start_index = blockIdx.x * entries_per_group;
498: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
500: PetscScalar entry_x = 0;
501: PetscScalar group_sum0 = 0;
502: PetscScalar group_sum1 = 0;
503: PetscScalar group_sum2 = 0;
504: PetscScalar group_sum3 = 0;
505: PetscScalar group_sum4 = 0;
506: PetscScalar group_sum5 = 0;
507: PetscScalar group_sum6 = 0;
508: PetscScalar group_sum7 = 0;
509: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
510: entry_x = x[i]; // load only once from global memory!
511: group_sum0 += entry_x * y0[i];
512: group_sum1 += entry_x * y1[i];
513: group_sum2 += entry_x * y2[i];
514: group_sum3 += entry_x * y3[i];
515: group_sum4 += entry_x * y4[i];
516: group_sum5 += entry_x * y5[i];
517: group_sum6 += entry_x * y6[i];
518: group_sum7 += entry_x * y7[i];
519: }
520: tmp_buffer[threadIdx.x] = group_sum0;
521: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
522: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
523: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
524: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
525: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
526: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
527: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
529: // parallel reduction
530: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
531: __syncthreads();
532: if (threadIdx.x < stride) {
533: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
534: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
535: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
536: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
537: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
538: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
539: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
540: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
541: }
542: }
544: // write result of group to group_results
545: if (threadIdx.x == 0) {
546: group_results[blockIdx.x ] = tmp_buffer[0];
547: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
548: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
549: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
550: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
551: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
552: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
553: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
554: }
555: }
556: #endif /* !defined(PETSC_USE_COMPLEX) */
558: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
559: {
560: PetscErrorCode ierr;
561: PetscInt i,n = xin->map->n,current_y_index = 0;
562: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
563: PetscScalar *group_results_gpu;
564: #if !defined(PETSC_USE_COMPLEX)
565: PetscInt j;
566: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
567: #endif
568: cudaError_t cuda_ierr;
569: PetscBLASInt one=1,bn;
570: cublasStatus_t cberr;
573: PetscBLASIntCast(xin->map->n,&bn);
574: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
575: /* Handle the case of local size zero first */
576: if (!xin->map->n) {
577: for (i=0; i<nv; ++i) z[i] = 0;
578: return(0);
579: }
581: // allocate scratchpad memory for the results of individual work groups:
582:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
584: VecCUDAGetArrayRead(xin,&xptr);
586: while (current_y_index < nv)
587: {
588: switch (nv - current_y_index) {
590: case 7:
591: case 6:
592: case 5:
593: case 4:
594: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
595: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
596: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
597: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
599: #if defined(PETSC_USE_COMPLEX)
600: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
601: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
602: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
603: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
604: #else
605: // run kernel:
606: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
608:       // copy results back to the host:
609:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
611: // sum group results into z:
612: for (j=0; j<4; ++j) {
613: z[current_y_index + j] = 0;
614: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
615: }
616: #endif
617: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
618: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
619: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
620: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
621: current_y_index += 4;
622: break;
624: case 3:
625: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
626: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
627: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
629: #if defined(PETSC_USE_COMPLEX)
630: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
631: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
632: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
633: #else
634: // run kernel:
635: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
637:       // copy results back to the host:
638:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
640: // sum group results into z:
641: for (j=0; j<3; ++j) {
642: z[current_y_index + j] = 0;
643: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
644: }
645: #endif
647: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
648: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
649: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
650: current_y_index += 3;
651: break;
653: case 2:
654: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
655: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
657: #if defined(PETSC_USE_COMPLEX)
658: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
659: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
660: #else
661: // run kernel:
662: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
664:       // copy results back to the host:
665:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
667: // sum group results into z:
668: for (j=0; j<2; ++j) {
669: z[current_y_index + j] = 0;
670: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
671: }
672: #endif
673: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
674: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
675: current_y_index += 2;
676: break;
678: case 1:
679: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
680: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
681: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
682: current_y_index += 1;
683: break;
685: default: // 8 or more vectors left
686: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
687: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
688: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
689: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
690: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
691: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
692: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
693: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
695: #if defined(PETSC_USE_COMPLEX)
696: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
697: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
698: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
699: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
700: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
701: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
702: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
703: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
704: #else
705: // run kernel:
706: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
708:       // copy results back to the host:
709:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
711: // sum group results into z:
712: for (j=0; j<8; ++j) {
713: z[current_y_index + j] = 0;
714: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
715: }
716: #endif
717: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
718: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
719: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
720: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
721: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
722: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
723: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
724: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
725: current_y_index += 8;
726: break;
727: }
728: }
729: VecCUDARestoreArrayRead(xin,&xptr);
731:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
732: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
733: return(0);
734: }
736: #undef MDOT_WORKGROUP_SIZE
737: #undef MDOT_WORKGROUP_NUM
739: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
740: {
741: PetscInt n = xin->map->n;
742: PetscScalar *xarray=NULL;
743: thrust::device_ptr<PetscScalar> xptr;
744: PetscErrorCode ierr;
745: cudaError_t err;
748: VecCUDAGetArrayWrite(xin,&xarray);
749: if (alpha == (PetscScalar)0.0) {
750: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
751: } else {
752: try {
753: xptr = thrust::device_pointer_cast(xarray);
754: thrust::fill(xptr,xptr+n,alpha);
755: } catch (char *ex) {
756: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
757: }
758: }
759: WaitForGPU();CHKERRCUDA(ierr);
760: VecCUDARestoreArrayWrite(xin,&xarray);
761: return(0);
762: }
764: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
765: {
766: PetscScalar *xarray;
768: PetscBLASInt one=1,bn;
769: cublasStatus_t cberr;
772: if (alpha == (PetscScalar)0.0) {
773: VecSet_SeqCUDA(xin,alpha);
774: } else if (alpha != (PetscScalar)1.0) {
775: PetscBLASIntCast(xin->map->n,&bn);
776: VecCUDAGetArrayReadWrite(xin,&xarray);
777: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
778: VecCUDARestoreArrayReadWrite(xin,&xarray);
779: }
780: WaitForGPU();CHKERRCUDA(ierr);
781: PetscLogFlops(xin->map->n);
782: return(0);
783: }
785: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
786: {
787: const PetscScalar *xarray,*yarray;
788: PetscErrorCode ierr;
789: PetscBLASInt one=1,bn;
790: cublasStatus_t cberr;
793: PetscBLASIntCast(xin->map->n,&bn);
794: VecCUDAGetArrayRead(xin,&xarray);
795: VecCUDAGetArrayRead(yin,&yarray);
796: cberr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cberr);
797: WaitForGPU();CHKERRCUDA(ierr);
798: if (xin->map->n > 0) {
799: PetscLogFlops(2.0*xin->map->n-1);
800: }
801: VecCUDARestoreArrayRead(yin,&yarray);
802: VecCUDARestoreArrayRead(xin,&xarray);
803: return(0);
804: }
806: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
807: {
808: const PetscScalar *xarray;
809: PetscScalar *yarray;
810: PetscErrorCode ierr;
811: cudaError_t err;
814: if (xin != yin) {
815: if (xin->valid_GPU_array == PETSC_CUDA_GPU) {
816: VecCUDAGetArrayRead(xin,&xarray);
817: VecCUDAGetArrayWrite(yin,&yarray);
818: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
819: WaitForGPU();CHKERRCUDA(ierr);
820: VecCUDARestoreArrayRead(xin,&xarray);
821: VecCUDARestoreArrayWrite(yin,&yarray);
823: } else if (xin->valid_GPU_array == PETSC_CUDA_CPU) {
824:       /* copy in CPU if we are on the CPU */
825: VecCopy_SeqCUDA_Private(xin,yin);
826: } else if (xin->valid_GPU_array == PETSC_CUDA_BOTH) {
827:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to use it next) */
828: if (yin->valid_GPU_array == PETSC_CUDA_CPU) {
829: /* copy in CPU */
830: VecCopy_SeqCUDA_Private(xin,yin);
832: } else if (yin->valid_GPU_array == PETSC_CUDA_GPU) {
833: /* copy in GPU */
834: VecCUDAGetArrayRead(xin,&xarray);
835: VecCUDAGetArrayWrite(yin,&yarray);
836: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
837: VecCUDARestoreArrayRead(xin,&xarray);
838: VecCUDARestoreArrayWrite(yin,&yarray);
839: } else if (yin->valid_GPU_array == PETSC_CUDA_BOTH) {
840:       /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
841:          default to copying on the GPU (this is an arbitrary choice) */
842: VecCUDAGetArrayRead(xin,&xarray);
843: VecCUDAGetArrayWrite(yin,&yarray);
844: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
845: VecCUDARestoreArrayRead(xin,&xarray);
846: VecCUDARestoreArrayWrite(yin,&yarray);
847: } else {
848: VecCopy_SeqCUDA_Private(xin,yin);
849: }
850: }
851: }
852: return(0);
853: }
855: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
856: {
858: PetscBLASInt one = 1,bn;
859: PetscScalar *xarray,*yarray;
860: cublasStatus_t cberr;
863: PetscBLASIntCast(xin->map->n,&bn);
864: if (xin != yin) {
865: VecCUDAGetArrayReadWrite(xin,&xarray);
866: VecCUDAGetArrayReadWrite(yin,&yarray);
867: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
868: WaitForGPU();CHKERRCUDA(ierr);
869: VecCUDARestoreArrayReadWrite(xin,&xarray);
870: VecCUDARestoreArrayReadWrite(yin,&yarray);
871: }
872: return(0);
873: }
875: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
876: {
877: PetscErrorCode ierr;
878: PetscScalar a = alpha,b = beta;
879: const PetscScalar *xarray;
880: PetscScalar *yarray;
881: PetscBLASInt one = 1, bn;
882: cublasStatus_t cberr;
883: cudaError_t err;
886: PetscBLASIntCast(yin->map->n,&bn);
887: if (a == (PetscScalar)0.0) {
888: VecScale_SeqCUDA(yin,beta);
889: } else if (b == (PetscScalar)1.0) {
890: VecAXPY_SeqCUDA(yin,alpha,xin);
891: } else if (a == (PetscScalar)1.0) {
892: VecAYPX_SeqCUDA(yin,beta,xin);
893: } else if (b == (PetscScalar)0.0) {
894: VecCUDAGetArrayRead(xin,&xarray);
895: VecCUDAGetArrayReadWrite(yin,&yarray);
896: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
897: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
898: PetscLogFlops(xin->map->n);
899: WaitForGPU();CHKERRCUDA(ierr);
900: VecCUDARestoreArrayRead(xin,&xarray);
901: VecCUDARestoreArrayReadWrite(yin,&yarray);
902: } else {
903: VecCUDAGetArrayRead(xin,&xarray);
904: VecCUDAGetArrayReadWrite(yin,&yarray);
905: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
906: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
907: VecCUDARestoreArrayRead(xin,&xarray);
908: VecCUDARestoreArrayReadWrite(yin,&yarray);
909: WaitForGPU();CHKERRCUDA(ierr);
910: PetscLogFlops(3.0*xin->map->n);
911: }
912: return(0);
913: }
915: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
916: {
918: PetscInt n = zin->map->n;
921: if (gamma == (PetscScalar)1.0) {
922: /* z = ax + b*y + z */
923: VecAXPY_SeqCUDA(zin,alpha,xin);
924: VecAXPY_SeqCUDA(zin,beta,yin);
925: PetscLogFlops(4.0*n);
926: } else {
927: /* z = a*x + b*y + c*z */
928: VecScale_SeqCUDA(zin,gamma);
929: VecAXPY_SeqCUDA(zin,alpha,xin);
930: VecAXPY_SeqCUDA(zin,beta,yin);
931: PetscLogFlops(5.0*n);
932: }
933: WaitForGPU();CHKERRCUDA(ierr);
934: return(0);
935: }
937: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
938: {
939: PetscInt n = win->map->n;
940: const PetscScalar *xarray,*yarray;
941: PetscScalar *warray;
942: thrust::device_ptr<const PetscScalar> xptr,yptr;
943: thrust::device_ptr<PetscScalar> wptr;
944: PetscErrorCode ierr;
947: VecCUDAGetArrayReadWrite(win,&warray);
948: VecCUDAGetArrayRead(xin,&xarray);
949: VecCUDAGetArrayRead(yin,&yarray);
950: try {
951: wptr = thrust::device_pointer_cast(warray);
952: xptr = thrust::device_pointer_cast(xarray);
953: yptr = thrust::device_pointer_cast(yarray);
954: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
955: WaitForGPU();CHKERRCUDA(ierr);
956: } catch (char *ex) {
957: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
958: }
959: VecCUDARestoreArrayRead(xin,&xarray);
960: VecCUDARestoreArrayRead(yin,&yarray);
961: VecCUDARestoreArrayReadWrite(win,&warray);
962: PetscLogFlops(n);
963: return(0);
964: }
966: /* should do infinity norm in cuda */
968: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
969: {
970: PetscErrorCode ierr;
971: PetscInt n = xin->map->n;
972: PetscBLASInt one = 1, bn;
973: const PetscScalar *xarray;
974: cublasStatus_t cberr;
975: cudaError_t err;
978: PetscBLASIntCast(n,&bn);
979: if (type == NORM_2 || type == NORM_FROBENIUS) {
980: VecCUDAGetArrayRead(xin,&xarray);
981: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
982: WaitForGPU();CHKERRCUDA(ierr);
983: VecCUDARestoreArrayRead(xin,&xarray);
984: PetscLogFlops(PetscMax(2.0*n-1,0.0));
985: } else if (type == NORM_INFINITY) {
986: int i;
987: VecCUDAGetArrayRead(xin,&xarray);
988: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
989: err = cudaMemcpy(z,xarray+i,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
990: VecCUDARestoreArrayRead(xin,&xarray);
991: } else if (type == NORM_1) {
992: VecCUDAGetArrayRead(xin,&xarray);
993: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
994: VecCUDARestoreArrayRead(xin,&xarray);
995: WaitForGPU();CHKERRCUDA(ierr);
996: PetscLogFlops(PetscMax(n-1.0,0.0));
997: } else if (type == NORM_1_AND_2) {
998: VecNorm_SeqCUDA(xin,NORM_1,z);
999: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1000: }
1001: return(0);
1002: }
1004: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1005: {
1006: PetscErrorCode ierr;
1007: PetscReal n=s->map->n;
1008: const PetscScalar *sarray,*tarray;
1011: VecCUDAGetArrayRead(s,&sarray);
1012: VecCUDAGetArrayRead(t,&tarray);
1013: VecDot_SeqCUDA(s,t,dp);
1014: VecDot_SeqCUDA(t,t,nm);
1015: VecCUDARestoreArrayRead(s,&sarray);
1016: VecCUDARestoreArrayRead(t,&tarray);
1017: WaitForGPU();CHKERRCUDA(ierr);
1018: PetscLogFlops(4.0*n);
1019: return(0);
1020: }
1022: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1023: {
1025: cudaError_t err;
1028: if (v->spptr) {
1029: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1030: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1031: }
1032: if (((Vec_CUDA*)v->spptr)->stream) {
1033: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1034: }
1035: PetscFree(v->spptr);
1036: }
1037: VecDestroy_SeqCUDA_Private(v);
1038: return(0);
1039: }
1041: #if defined(PETSC_USE_COMPLEX)
1042: struct conjugate
1043: {
1044: __host__ __device__
1045: PetscScalar operator()(PetscScalar x)
1046: {
1047: return PetscConj(x);
1048: }
1049: };
1050: #endif
1052: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1053: {
1054: PetscScalar *xarray;
1055: PetscErrorCode ierr;
1056: #if defined(PETSC_USE_COMPLEX)
1057: PetscInt n = xin->map->n;
1058: thrust::device_ptr<PetscScalar> xptr;
1059: #endif
1062: VecCUDAGetArrayReadWrite(xin,&xarray);
1063: #if defined(PETSC_USE_COMPLEX)
1064: try {
1065: xptr = thrust::device_pointer_cast(xarray);
1066: thrust::transform(xptr,xptr+n,xptr,conjugate());
1067: WaitForGPU();CHKERRCUDA(ierr);
1068: } catch (char *ex) {
1069: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1070: }
1071: #endif
1072: VecCUDARestoreArrayReadWrite(xin,&xarray);
1073: return(0);
1074: }
1076: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1077: {
1079: cudaError_t err;
1086: if (w->data) {
1087: if (((Vec_Seq*)w->data)->array_allocated) {
1088: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1089: }
1090: ((Vec_Seq*)w->data)->array = NULL;
1091: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1092: }
1093: if (w->spptr) {
1094: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1095: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1096: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1097: }
1098: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1099: PetscFree(w->spptr);
1100: }
1102: if (v->petscnative) {
1103: PetscFree(w->data);
1104: w->data = v->data;
1105: w->valid_GPU_array = v->valid_GPU_array;
1106: w->spptr = v->spptr;
1107: PetscObjectStateIncrease((PetscObject)w);
1108: } else {
1109: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1110: w->valid_GPU_array = PETSC_CUDA_CPU;
1111: VecCUDAAllocateCheck(w);
1112: }
1113: return(0);
1114: }
1116: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1117: {
1119: cudaError_t err;
1126: if (v->petscnative) {
1127: v->data = w->data;
1128: v->valid_GPU_array = w->valid_GPU_array;
1129: v->spptr = w->spptr;
1130: VecCUDACopyFromGPU(v);
1131: PetscObjectStateIncrease((PetscObject)v);
1132: w->data = 0;
1133: w->valid_GPU_array = PETSC_CUDA_UNALLOCATED;
1134: w->spptr = 0;
1135: } else {
1136: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1137: if ((Vec_CUDA*)w->spptr) {
1138: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1139: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1140: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1141: PetscFree(w->spptr);
1142: }
1143: }
1144: return(0);
1145: }
1147: /*@C
1148: VecCUDAGetArrayReadWrite - Provides access to the CUDA buffer inside a vector.
1150: This function has semantics similar to VecGetArray(): the pointer
1151: returned by this function points to a consistent view of the vector
1152: data. This may involve a copy operation of data from the host to the
1153: device if the data on the device is out of date. If the device
1154: memory hasn't been allocated previously it will be allocated as part
1155: of this function call. VecCUDAGetArrayReadWrite() assumes that
1156: the user will modify the vector data. This is similar to
1157:    intent(inout) in Fortran.
1159: The CUDA device pointer has to be released by calling
1160: VecCUDARestoreArrayReadWrite(). Upon restoring the vector data
1161: the data on the host will be marked as out of date. A subsequent
1162: access of the host data will thus incur a data transfer from the
1163: device to the host.
1166: Input Parameter:
1167: . v - the vector
1169: Output Parameter:
1170: . a - the CUDA device pointer
1172: Fortran note:
1173: This function is not currently available from Fortran.
1175: Level: intermediate
1177: .seealso: VecCUDARestoreArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1178: @*/
1179: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayReadWrite(Vec v, PetscScalar **a)
1180: {
1185: *a = 0;
1186: VecCUDACopyToGPU(v);
1187: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1188: return(0);
1189: }
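/*
   A usage sketch for VecCUDAGetArrayReadWrite()/VecCUDARestoreArrayReadWrite(): the raw
   device pointer is handed to a user kernel that updates every entry in place.
   ExampleScaleKernel and ExampleScaleOnDevice are illustrative names only, and the
   kernel assumes a real (non-complex) PetscScalar.
*/
__global__ void ExampleScaleKernel(PetscScalar *x,PetscScalar a,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

static PetscErrorCode ExampleScaleOnDevice(Vec v,PetscScalar a)
{
  PetscErrorCode ierr;
  PetscScalar    *d_x;
  PetscInt       n;

  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(v,&d_x);CHKERRQ(ierr);     /* copies host data to the GPU if it is out of date */
  if (n) ExampleScaleKernel<<<(n+255)/256,256>>>(d_x,a,n);
  ierr = VecCUDARestoreArrayReadWrite(v,&d_x);CHKERRQ(ierr); /* host copy is now marked out of date */
  return(0);
}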
1191: /*@C
1192: VecCUDARestoreArrayReadWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayReadWrite().
1194: This marks the host data as out of date. Subsequent access to the
1195: vector data on the host side with for instance VecGetArray() incurs a
1196: data transfer.
1198: Input Parameter:
1199: + v - the vector
1200: - a - the CUDA device pointer. This pointer is invalid after
1201: VecCUDARestoreArrayReadWrite() returns.
1203: Fortran note:
1204: This function is not currently available from Fortran.
1206: Level: intermediate
1208: .seealso: VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1209: @*/
1210: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayReadWrite(Vec v, PetscScalar **a)
1211: {
1216: v->valid_GPU_array = PETSC_CUDA_GPU;
1218: PetscObjectStateIncrease((PetscObject)v);
1219: return(0);
1220: }
1222: /*@C
1223: VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.
1225: This function is analogous to VecGetArrayRead(): The pointer
1226: returned by this function points to a consistent view of the vector
1227: data. This may involve a copy operation of data from the host to the
1228: device if the data on the device is out of date. If the device
1229: memory hasn't been allocated previously it will be allocated as part
1230: of this function call. VecCUDAGetArrayRead() assumes that the
1231:    user will not modify the vector data. This is analogous to
1232: intent(in) in Fortran.
1234: The CUDA device pointer has to be released by calling
1235: VecCUDARestoreArrayRead(). If the data on the host side was
1236: previously up to date it will remain so, i.e. data on both the device
1237: and the host is up to date. Accessing data on the host side does not
1238: incur a device to host data transfer.
1240: Input Parameter:
1241: . v - the vector
1243: Output Parameter:
1244: . a - the CUDA pointer.
1246: Fortran note:
1247: This function is not currently available from Fortran.
1249: Level: intermediate
1251: .seealso: VecCUDARestoreArrayRead(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1252: @*/
1253: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
1254: {
1259: *a = 0;
1260: VecCUDACopyToGPU(v);
1261: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1262: return(0);
1263: }
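/*
   A usage sketch for VecCUDAGetArrayRead()/VecCUDARestoreArrayRead(): the device data is
   copied into a caller-supplied host buffer without marking anything out of date. The
   helper name is illustrative only, and the buffer is assumed to hold at least n entries.
*/
static PetscErrorCode ExampleCopyDeviceToHostBuffer(Vec v,PetscScalar *host,PetscInt n)
{
  PetscErrorCode    ierr;
  cudaError_t       err;
  const PetscScalar *d_x;

  ierr = VecCUDAGetArrayRead(v,&d_x);CHKERRQ(ierr);
  err  = cudaMemcpy(host,d_x,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v,&d_x);CHKERRQ(ierr);  /* read-only access: the flag is unchanged */
  return(0);
}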
1265: /*@C
1266: VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayRead().
1268: If the data on the host side was previously up to date it will remain
1269: so, i.e. data on both the device and the host is up to date.
1270: Accessing data on the host side e.g. with VecGetArray() does not
1271: incur a device to host data transfer.
1273: Input Parameter:
1274: + v - the vector
1275: - a - the CUDA device pointer. This pointer is invalid after
1276: VecCUDARestoreArrayRead() returns.
1278: Fortran note:
1279: This function is not currently available from Fortran.
1281: Level: intermediate
1283: .seealso: VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1284: @*/
1285: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
1286: {
1289: return(0);
1290: }
1292: /*@C
1293: VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.
1295: The data pointed to by the device pointer is uninitialized. The user
1296: may not read from this data. Furthermore, the entire array needs to
1297: be filled by the user to obtain well-defined behaviour. The device
1298: memory will be allocated by this function if it hasn't been allocated
1299: previously. This is analogous to intent(out) in Fortran.
1301: The device pointer needs to be released with
1302: VecCUDARestoreArrayWrite(). When the pointer is released the
1303:    host data of the vector is marked as out of date. Subsequent access
1304: of the host data with e.g. VecGetArray() incurs a device to host data
1305: transfer.
1308: Input Parameter:
1309: . v - the vector
1311: Output Parameter:
1312: . a - the CUDA pointer
1314: Fortran note:
1315: This function is not currently available from Fortran.
1317: Level: advanced
1319: .seealso: VecCUDARestoreArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1320: @*/
1321: PETSC_EXTERN PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
1322: {
1327: *a = 0;
1328: VecCUDAAllocateCheck(v);
1329: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
1330: return(0);
1331: }
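/*
   A usage sketch for VecCUDAGetArrayWrite()/VecCUDARestoreArrayWrite(): every entry is
   written on the device (here with thrust::fill, mirroring VecSet_SeqCUDA() above), so
   no host-to-device transfer is needed on the get. The helper name is illustrative only.
*/
static PetscErrorCode ExampleFillOnDevice(Vec v,PetscScalar a)
{
  PetscErrorCode                  ierr;
  PetscScalar                     *d_x;
  PetscInt                        n;
  thrust::device_ptr<PetscScalar> xptr;

  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(v,&d_x);CHKERRQ(ierr);   /* contents are uninitialized on entry */
  xptr = thrust::device_pointer_cast(d_x);
  thrust::fill(xptr,xptr+n,a);
  ierr = VecCUDARestoreArrayWrite(v,&d_x);CHKERRQ(ierr);
  return(0);
}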
1333: /*@C
1334: VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with VecCUDAGetArrayWrite().
1336: Data on the host will be marked as out of date. Subsequent access of
1337: the data on the host side e.g. with VecGetArray() will incur a device
1338: to host data transfer.
1340: Input Parameter:
1341: + v - the vector
1342: - a - the CUDA device pointer. This pointer is invalid after
1343: VecCUDARestoreArrayWrite() returns.
1345: Fortran note:
1346: This function is not currently available from Fortran.
1348: Level: intermediate
1350: .seealso: VecCUDAGetArrayWrite(), VecCUDAGetArrayReadWrite(), VecCUDAGetArrayRead(), VecCUDAGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1351: @*/
1352: PETSC_EXTERN PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
1353: {
1358: v->valid_GPU_array = PETSC_CUDA_GPU;
1360: PetscObjectStateIncrease((PetscObject)v);
1361: return(0);
1362: }
1364: /*@C
1365: VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a
1366: GPU array provided by the user. This is useful to avoid copying an
1367: array into a vector.
1369: Not Collective
1371: Input Parameters:
1372: + vec - the vector
1373: - array - the GPU array
1375: Notes:
1376:    You can return to the original GPU array with a call to VecCUDAResetArray().
1377: It is not possible to use VecCUDAPlaceArray() and VecPlaceArray() at the
1378: same time on the same vector.
1380: Level: developer
1382: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAReplaceArray()
1384: @*/
1385: PetscErrorCode VecCUDAPlaceArray(Vec vin,PetscScalar *a)
1386: {
1391: VecCUDACopyToGPU(vin);
1392: if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUDAPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUDAResetArray()/VecResetArray()");
1393: ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar *) ((Vec_CUDA*)vin->spptr)->GPUarray; /* save previous GPU array so reset can bring it back */
1394: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1395: vin->valid_GPU_array = PETSC_CUDA_GPU;
1396: PetscObjectStateIncrease((PetscObject)vin);
1397: return(0);
1398: }
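/*
   A sketch of pairing VecCUDAPlaceArray() with VecCUDAResetArray() (defined further
   below): a user-supplied device buffer temporarily backs the vector, and the original
   GPU array is restored afterwards. d_user is assumed to be a valid device allocation
   of at least the local length of v; the helper name is illustrative only.
*/
static PetscErrorCode ExamplePlaceDeviceArray(Vec v,PetscScalar *d_user)
{
  PetscErrorCode ierr;

  ierr = VecCUDAPlaceArray(v,d_user);CHKERRQ(ierr);  /* v now wraps d_user on the GPU */
  /* ... operate on v; results land in d_user ... */
  ierr = VecCUDAResetArray(v);CHKERRQ(ierr);         /* the saved GPU array is put back */
  return(0);
}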
1400: /*@C
1401: VecCUDAReplaceArray - Allows one to replace the GPU array in a vector
1402: with a GPU array provided by the user. This is useful to avoid copying
1403: a GPU array into a vector.
1405: Not Collective
1407: Input Parameters:
1408: + vec - the vector
1409: - array - the GPU array
1411: Notes:
1412: This permanently replaces the GPU array and frees the memory associated
1413: with the old GPU array.
1415: The memory passed in CANNOT be freed by the user. It will be freed
1416: when the vector is destroyed.
1418: Not supported from Fortran
1420: Level: developer
1422: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUDAResetArray(), VecCUDAPlaceArray(), VecReplaceArray()
1424: @*/
1425: PetscErrorCode VecCUDAReplaceArray(Vec vin,PetscScalar *a)
1426: {
1427: cudaError_t err;
1432: err = cudaFree(((Vec_CUDA*)vin->spptr)->GPUarray);CHKERRCUDA(err);
1433: ((Vec_CUDA*)vin->spptr)->GPUarray = a;
1434: vin->valid_GPU_array = PETSC_CUDA_GPU;
1435: PetscObjectStateIncrease((PetscObject)vin);
1436: return(0);
1437: }
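/*
   A sketch of VecCUDAReplaceArray(): a freshly allocated device buffer permanently
   replaces the vector's GPU array, and, per the notes above, ownership passes to the
   vector, so the caller must not free it. The helper name is illustrative only.
*/
static PetscErrorCode ExampleReplaceDeviceArray(Vec v,PetscInt n)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *d_new;

  err  = cudaMalloc((void**)&d_new,n*sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemset(d_new,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  ierr = VecCUDAReplaceArray(v,d_new);CHKERRQ(ierr);  /* the previous GPU array is freed here */
  return(0);
}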
1439: /*@C
1440: VecCUDAResetArray - Resets a vector to use its default memory. Call this
1441: after the use of VecCUDAPlaceArray().
1443: Not Collective
1445: Input Parameters:
1446: . vec - the vector
1448: Level: developer
1450: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUDAPlaceArray(), VecCUDAReplaceArray()
1452: @*/
1453: PetscErrorCode VecCUDAResetArray(Vec vin)
1454: {
1459: VecCUDACopyToGPU(vin);
1460: ((Vec_CUDA*)vin->spptr)->GPUarray = (PetscScalar *) ((Vec_Seq*)vin->data)->unplacedarray;
1461: ((Vec_Seq*)vin->data)->unplacedarray = 0;
1462: vin->valid_GPU_array = PETSC_CUDA_GPU;
1463: PetscObjectStateIncrease((PetscObject)vin);
1464: return(0);
1465: }