Actual source code: veccusp2.cu
petsc-3.7.1 2016-05-15
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #define PETSC_SKIP_COMPLEX
6: #define PETSC_SKIP_SPINLOCK
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
13: #include <cuda_runtime.h>
17: /*
18: Allocates space for the vector array on the GPU if it does not exist.
19: Does NOT change the PetscCUSPFlag for the vector
20: Does NOT zero the CUSP array
22: */
23: PetscErrorCode VecCUSPAllocateCheck(Vec v)
24: {
25: cudaError_t err;
26: cudaStream_t stream;
27: Vec_CUSP *veccusp;
30: if (!v->spptr) {
31: try {
32: v->spptr = new Vec_CUSP;
33: veccusp = (Vec_CUSP*)v->spptr;
34: veccusp->GPUarray = new CUSPARRAY;
35: veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
36: err = cudaStreamCreate(&stream);CHKERRCUSP(err);
37: veccusp->stream = stream;
38: veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
39: v->ops->destroy = VecDestroy_SeqCUSP;
40: if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
41: if (v->data && ((Vec_Seq*)v->data)->array) {
42: v->valid_GPU_array = PETSC_CUSP_CPU;
43: } else {
44: v->valid_GPU_array = PETSC_CUSP_GPU;
45: }
46: }
47: } catch(char *ex) {
48: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
49: }
50: }
51: return(0);
52: }
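/* Summary of the PetscCUSPFlag coherence states used throughout this file
   (inferred from their use below):
     PETSC_CUSP_UNALLOCATED - no GPU array has been allocated yet
     PETSC_CUSP_CPU         - the host copy is the current one
     PETSC_CUSP_GPU         - the device copy is the current one
     PETSC_CUSP_BOTH        - host and device copies are both current */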
56: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
57: PetscErrorCode VecCUSPCopyToGPU(Vec v)
58: {
60: cudaError_t err;
61: Vec_CUSP *veccusp;
62: CUSPARRAY *varray;
65: VecCUSPAllocateCheck(v);
66: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
67: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
68: try {
69: veccusp=(Vec_CUSP*)v->spptr;
70: varray=veccusp->GPUarray;
71: err = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
72: } catch(char *ex) {
73: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
74: }
75: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
76: v->valid_GPU_array = PETSC_CUSP_BOTH;
77: }
78: return(0);
79: }
83: PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
84: {
85: CUSPARRAY *varray;
87: cudaError_t err;
88: PetscScalar *cpuPtr, *gpuPtr;
89: Vec_Seq *s;
90: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
93: VecCUSPAllocateCheck(v);
94: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
95: s = (Vec_Seq*)v->data;
97: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
98: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
99: gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
100: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
102: /* Note : this code copies the smallest contiguous chunk of data
103: containing ALL of the indices */
104: err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
106: // Set the buffer states
107: v->valid_GPU_array = PETSC_CUSP_BOTH;
108: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
109: }
110: return(0);
111: }
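/* Worked example of the chunking note in VecCUSPCopyToGPUSome() above
   (illustrative): if the scatter receives indices {3,7,9}, then
   recvLowestIndex = 3 and the chunk length nr = 7, so a single cudaMemcpy()
   moves entries 3..9 in one contiguous transfer instead of three separate ones. */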
116: /*
117: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
118: */
119: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
120: {
122: cudaError_t err;
123: Vec_CUSP *veccusp;
124: CUSPARRAY *varray;
127: VecCUSPAllocateCheckHost(v);
128: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
129: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
130: try {
131: veccusp=(Vec_CUSP*)v->spptr;
132: varray=veccusp->GPUarray;
133: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
134: } catch(char *ex) {
135: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
136: }
137: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
138: v->valid_GPU_array = PETSC_CUSP_BOTH;
139: }
140: return(0);
141: }
145: /* Note that this function only copies *some* of the values from the GPU back to the CPU,
146:    which means that we need to recombine the data at some point before using any of the standard functions.
147:    We could add another few flag types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
148:    where the calls always have to be made in pairs.
149: */
150: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
151: {
152: CUSPARRAY *varray;
154: cudaError_t err;
155: PetscScalar *cpuPtr, *gpuPtr;
156: Vec_Seq *s;
157: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
160: VecCUSPAllocateCheckHost(v);
161: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
162: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
164: varray=((Vec_CUSP*)v->spptr)->GPUarray;
165: s = (Vec_Seq*)v->data;
166: gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
167: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
169: /* Note : this code copies the smallest contiguous chunk of data
170: containing ALL of the indices */
171: err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
173: VecCUSPRestoreArrayRead(v,&varray);
174: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
175: }
176: return(0);
177: }
179: /*MC
180: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
182: Options Database Keys:
183: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
185: Level: beginner
187: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
188: M*/
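/* Example usage of VECSEQCUSP (an illustrative sketch; error checking omitted):

     Vec v;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,PETSC_DECIDE,100);
     VecSetType(v,VECSEQCUSP);    (equivalently: -vec_type seqcusp with VecSetFromOptions())
     VecSet(v,1.0);
     VecDestroy(&v);
*/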
190: /* for VecAYPX_SeqCUSP*/
191: namespace cusp
192: {
193: namespace blas
194: {
195: namespace detail
196: {
197: template <typename T>
198: struct AYPX : public thrust::binary_function<T,T,T>
199: {
200: T alpha;
202: AYPX(T _alpha) : alpha(_alpha) {}
204: __host__ __device__
205: T operator()(T x, T y)
206: {
207: return alpha * y + x;
208: }
209: };
210: }
212: template <typename ForwardIterator1,
213: typename ForwardIterator2,
214: typename ScalarType>
215: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
216: {
217: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
218: }
219: template <typename Array1, typename Array2, typename ScalarType>
220: void aypx(const Array1& x, Array2& y, ScalarType alpha)
221: {
222: #if defined(CUSP_VERSION) && CUSP_VERSION >= 500
223: cusp::assert_same_dimensions(x,y);
224: #else
225: detail::assert_same_dimensions(x,y);
226: #endif
227: aypx(x.begin(),x.end(),y.begin(),alpha);
228: }
229: }
230: }
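/* Worked example for the aypx() helpers above (illustrative values): with
   x = {1,2}, y = {10,20} and alpha = 2, cusp::blas::aypx(x,y,alpha) overwrites
   y with {21,42}, i.e. y = alpha*y + x. */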
234: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
235: {
236: CUSPARRAY *xarray,*yarray;
237: PetscErrorCode ierr;
240: VecCUSPGetArrayRead(xin,&xarray);
241: VecCUSPGetArrayReadWrite(yin,&yarray);
242: try {
243: if (alpha != 0.0) {
244: cusp::blas::aypx(*xarray,*yarray,alpha);
245: PetscLogFlops(2.0*yin->map->n);
246: } else {
247: cusp::blas::copy(*xarray,*yarray);
248: }
249: ierr = WaitForGPU();CHKERRCUSP(ierr);
250: } catch(char *ex) {
251: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
252: }
253: VecCUSPRestoreArrayRead(xin,&xarray);
254: VecCUSPRestoreArrayReadWrite(yin,&yarray);
255: return(0);
256: }
261: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
262: {
263: CUSPARRAY *xarray,*yarray;
264: PetscErrorCode ierr;
267: if (alpha != 0.0) {
268: VecCUSPGetArrayRead(xin,&xarray);
269: VecCUSPGetArrayReadWrite(yin,&yarray);
270: try {
271: cusp::blas::axpy(*xarray,*yarray,alpha);
272: ierr = WaitForGPU();CHKERRCUSP(ierr);
273: } catch(char *ex) {
274: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
275: }
276: VecCUSPRestoreArrayRead(xin,&xarray);
277: VecCUSPRestoreArrayReadWrite(yin,&yarray);
278: PetscLogFlops(2.0*yin->map->n);
279: }
280: return(0);
281: }
283: struct VecCUSPPointwiseDivide
284: {
285: template <typename Tuple>
286: __host__ __device__
287: void operator()(Tuple t)
288: {
289: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
290: }
291: };
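/* A minimal standalone sketch of the zip-iterator pattern used here and in the
   rest of this file (assumes <thrust/device_vector.h>; values are illustrative):

     thrust::device_vector<PetscScalar> w(3), x(3,6.0), y(3,2.0);
     thrust::for_each(
       thrust::make_zip_iterator(thrust::make_tuple(w.begin(),x.begin(),y.begin())),
       thrust::make_zip_iterator(thrust::make_tuple(w.end(),x.end(),y.end())),
       VecCUSPPointwiseDivide());

   Afterwards every w[i] holds x[i]/y[i] = 3.0: the functor receives one tuple of
   references per element, so a single pass implements the elementwise divide. */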
295: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
296: {
297: CUSPARRAY *warray=NULL,*xarray=NULL,*yarray=NULL;
298: PetscErrorCode ierr;
301: VecCUSPGetArrayRead(xin,&xarray);
302: VecCUSPGetArrayRead(yin,&yarray);
303: VecCUSPGetArrayWrite(win,&warray);
304: try {
305: thrust::for_each(
306: thrust::make_zip_iterator(
307: thrust::make_tuple(
308: warray->begin(),
309: xarray->begin(),
310: yarray->begin())),
311: thrust::make_zip_iterator(
312: thrust::make_tuple(
313: warray->end(),
314: xarray->end(),
315: yarray->end())),
316: VecCUSPPointwiseDivide());
317: ierr = WaitForGPU();CHKERRCUSP(ierr);
318: } catch(char *ex) {
319: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
320: }
321: PetscLogFlops(win->map->n);
322: VecCUSPRestoreArrayRead(xin,&xarray);
323: VecCUSPRestoreArrayRead(yin,&yarray);
324: VecCUSPRestoreArrayWrite(win,&warray);
325: return(0);
326: }
329: struct VecCUSPWAXPY
330: {
331: template <typename Tuple>
332: __host__ __device__
333: void operator()(Tuple t)
334: {
335: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
336: }
337: };
339: struct VecCUSPSum
340: {
341: template <typename Tuple>
342: __host__ __device__
343: void operator()(Tuple t)
344: {
345: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
346: }
347: };
349: struct VecCUSPDiff
350: {
351: template <typename Tuple>
352: __host__ __device__
353: void operator()(Tuple t)
354: {
355: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
356: }
357: };
361: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
362: {
363: CUSPARRAY *xarray=NULL,*yarray=NULL,*warray=NULL;
364: PetscErrorCode ierr;
367: if (alpha == 0.0) {
368: VecCopy_SeqCUSP(yin,win);
369: } else {
370: VecCUSPGetArrayRead(xin,&xarray);
371: VecCUSPGetArrayRead(yin,&yarray);
372: VecCUSPGetArrayWrite(win,&warray);
373: if (alpha == 1.0) {
374: try {
375: thrust::for_each(
376: thrust::make_zip_iterator(
377: thrust::make_tuple(
378: warray->begin(),
379: yarray->begin(),
380: xarray->begin())),
381: thrust::make_zip_iterator(
382: thrust::make_tuple(
383: warray->end(),
384: yarray->end(),
385: xarray->end())),
386: VecCUSPSum());
387: } catch(char *ex) {
388: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
389: }
390: PetscLogFlops(win->map->n);
391: } else if (alpha == -1.0) {
392: try {
393: thrust::for_each(
394: thrust::make_zip_iterator(
395: thrust::make_tuple(
396: warray->begin(),
397: yarray->begin(),
398: xarray->begin())),
399: thrust::make_zip_iterator(
400: thrust::make_tuple(
401: warray->end(),
402: yarray->end(),
403: xarray->end())),
404: VecCUSPDiff());
405: } catch(char *ex) {
406: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
407: }
408: PetscLogFlops(win->map->n);
409: } else {
410: try {
411: thrust::for_each(
412: thrust::make_zip_iterator(
413: thrust::make_tuple(
414: warray->begin(),
415: yarray->begin(),
416: thrust::make_constant_iterator(alpha),
417: xarray->begin())),
418: thrust::make_zip_iterator(
419: thrust::make_tuple(
420: warray->end(),
421: yarray->end(),
422: thrust::make_constant_iterator(alpha),
423: xarray->end())),
424: VecCUSPWAXPY());
425: } catch(char *ex) {
426: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
427: }
428: PetscLogFlops(2*win->map->n);
429: }
430: ierr = WaitForGPU();CHKERRCUSP(ierr);
431: VecCUSPRestoreArrayRead(xin,&xarray);
432: VecCUSPRestoreArrayRead(yin,&yarray);
433: VecCUSPRestoreArrayWrite(win,&warray);
434: }
435: return(0);
436: }
438: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
439: struct VecCUSPMAXPY4
440: {
441: template <typename Tuple>
442: __host__ __device__
443: void operator()(Tuple t)
444: {
445: /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
446: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
447: }
448: };
451: struct VecCUSPMAXPY3
452: {
453: template <typename Tuple>
454: __host__ __device__
455: void operator()(Tuple t)
456: {
457: /*y += a1*x1 +a2*x2 + a3*x3 */
458: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
459: }
460: };
462: struct VecCUSPMAXPY2
463: {
464: template <typename Tuple>
465: __host__ __device__
466: void operator()(Tuple t)
467: {
468: /*y += a1*x1 +a2*x2*/
469: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
470: }
471: };
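/* Worked example of the unrolling in VecMAXPY_SeqCUSP() below: for nv = 7,
   j_rem = nv & 0x3 = 3, so the switch first applies the 3-way functor to
   y[0..2] and the main loop then handles y[3..6] in a single 4-way pass. */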
474: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
475: {
476: PetscErrorCode ierr;
477: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
478: PetscInt n = xin->map->n,j,j_rem;
479: PetscScalar alpha0,alpha1,alpha2,alpha3;
482: PetscLogFlops(nv*2.0*n);
483: VecCUSPGetArrayReadWrite(xin,&xarray);
484: switch (j_rem=nv&0x3) {
485: case 3:
486: alpha0 = alpha[0];
487: alpha1 = alpha[1];
488: alpha2 = alpha[2];
489: alpha += 3;
490: VecCUSPGetArrayRead(y[0],&yy0);
491: VecCUSPGetArrayRead(y[1],&yy1);
492: VecCUSPGetArrayRead(y[2],&yy2);
493: try {
494: thrust::for_each(
495: thrust::make_zip_iterator(
496: thrust::make_tuple(
497: xarray->begin(),
498: thrust::make_constant_iterator(alpha0),
499: yy0->begin(),
500: thrust::make_constant_iterator(alpha1),
501: yy1->begin(),
502: thrust::make_constant_iterator(alpha2),
503: yy2->begin())),
504: thrust::make_zip_iterator(
505: thrust::make_tuple(
506: xarray->end(),
507: thrust::make_constant_iterator(alpha0),
508: yy0->end(),
509: thrust::make_constant_iterator(alpha1),
510: yy1->end(),
511: thrust::make_constant_iterator(alpha2),
512: yy2->end())),
513: VecCUSPMAXPY3());
514: } catch(char *ex) {
515: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
516: }
517: VecCUSPRestoreArrayRead(y[0],&yy0);
518: VecCUSPRestoreArrayRead(y[1],&yy1);
519: VecCUSPRestoreArrayRead(y[2],&yy2);
520: y += 3;
521: break;
522: case 2:
523: alpha0 = alpha[0];
524: alpha1 = alpha[1];
525: alpha +=2;
526: VecCUSPGetArrayRead(y[0],&yy0);
527: VecCUSPGetArrayRead(y[1],&yy1);
528: try {
529: thrust::for_each(
530: thrust::make_zip_iterator(
531: thrust::make_tuple(
532: xarray->begin(),
533: thrust::make_constant_iterator(alpha0),
534: yy0->begin(),
535: thrust::make_constant_iterator(alpha1),
536: yy1->begin())),
537: thrust::make_zip_iterator(
538: thrust::make_tuple(
539: xarray->end(),
540: thrust::make_constant_iterator(alpha0),
541: yy0->end(),
542: thrust::make_constant_iterator(alpha1),
543: yy1->end())),
544: VecCUSPMAXPY2());
545: } catch(char *ex) {
546: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
547: }
548: VecCUSPRestoreArrayRead(y[0],&yy0); VecCUSPRestoreArrayRead(y[1],&yy1); y +=2;
549: break;
550: case 1:
551: alpha0 = *alpha++;
552: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
553: y +=1;
554: break;
555: }
556: for (j=j_rem; j<nv; j+=4) {
557: alpha0 = alpha[0];
558: alpha1 = alpha[1];
559: alpha2 = alpha[2];
560: alpha3 = alpha[3];
561: alpha += 4;
562: VecCUSPGetArrayRead(y[0],&yy0);
563: VecCUSPGetArrayRead(y[1],&yy1);
564: VecCUSPGetArrayRead(y[2],&yy2);
565: VecCUSPGetArrayRead(y[3],&yy3);
566: try {
567: thrust::for_each(
568: thrust::make_zip_iterator(
569: thrust::make_tuple(
570: xarray->begin(),
571: thrust::make_constant_iterator(alpha0),
572: yy0->begin(),
573: thrust::make_constant_iterator(alpha1),
574: yy1->begin(),
575: thrust::make_constant_iterator(alpha2),
576: yy2->begin(),
577: thrust::make_constant_iterator(alpha3),
578: yy3->begin())),
579: thrust::make_zip_iterator(
580: thrust::make_tuple(
581: xarray->end(),
582: thrust::make_constant_iterator(alpha0),
583: yy0->end(),
584: thrust::make_constant_iterator(alpha1),
585: yy1->end(),
586: thrust::make_constant_iterator(alpha2),
587: yy2->end(),
588: thrust::make_constant_iterator(alpha3),
589: yy3->end())),
590: VecCUSPMAXPY4());
591: } catch(char *ex) {
592: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
593: }
594: VecCUSPRestoreArrayRead(y[0],&yy0);
595: VecCUSPRestoreArrayRead(y[1],&yy1);
596: VecCUSPRestoreArrayRead(y[2],&yy2);
597: VecCUSPRestoreArrayRead(y[3],&yy3);
598: y += 4;
599: }
600: VecCUSPRestoreArrayReadWrite(xin,&xarray);
601: ierr = WaitForGPU();CHKERRCUSP(ierr);
602: return(0);
603: }
608: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
609: {
610: CUSPARRAY *xarray,*yarray;
611: PetscErrorCode ierr;
612: // PetscScalar *xptr,*yptr,*zgpu;
613: //PetscReal tmp;
616: //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
617: //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
618: VecCUSPGetArrayRead(xin,&xarray);
619: VecCUSPGetArrayRead(yin,&yarray);
620: try {
621: #if defined(PETSC_USE_COMPLEX)
622: *z = cusp::blas::dotc(*yarray,*xarray);
623: #else
624: *z = cusp::blas::dot(*yarray,*xarray);
625: #endif
626: } catch(char *ex) {
627: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
628: }
629: ierr = WaitForGPU();CHKERRCUSP(ierr);
630: if (xin->map->n >0) {
631: PetscLogFlops(2.0*xin->map->n-1);
632: }
633: VecCUSPRestoreArrayRead(xin,&xarray);
634: VecCUSPRestoreArrayRead(yin,&yarray);
635: return(0);
636: }
638: //
639: // CUDA kernels for MDot to follow
640: //
642: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
643: #define MDOT_WORKGROUP_SIZE 128
644: #define MDOT_WORKGROUP_NUM 128
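// Layout of the partial results produced by the kernels below (summary): each of
// the MDOT_WORKGROUP_NUM blocks writes one partial sum per dot product, stored
// column-wise as
//   group_results[b + k*gridDim.x] = partial sum of dot product k from block b,
// and the host finishes the reduction with
//   z[k] = sum over b of group_results[b + k*MDOT_WORKGROUP_NUM].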
646: // M = 2:
647: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
648: PetscInt size, PetscScalar *group_results)
649: {
650: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
651: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
652: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
653: PetscInt vec_start_index = blockIdx.x * entries_per_group;
654: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
656: PetscScalar entry_x = 0;
657: PetscScalar group_sum0 = 0;
658: PetscScalar group_sum1 = 0;
659: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
660: entry_x = x[i]; // load only once from global memory!
661: group_sum0 += entry_x * y0[i];
662: group_sum1 += entry_x * y1[i];
663: }
664: tmp_buffer[threadIdx.x] = group_sum0;
665: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
667: // parallel reduction
668: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
669: __syncthreads();
670: if (threadIdx.x < stride) {
671: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
672: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
673: }
674: }
676: // write result of group to group_results
677: if (threadIdx.x == 0) {
678: group_results[blockIdx.x] = tmp_buffer[0];
679: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
680: }
681: }
683: // M = 3:
684: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
685: PetscInt size, PetscScalar *group_results)
686: {
687: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
688: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
689: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
690: PetscInt vec_start_index = blockIdx.x * entries_per_group;
691: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
693: PetscScalar entry_x = 0;
694: PetscScalar group_sum0 = 0;
695: PetscScalar group_sum1 = 0;
696: PetscScalar group_sum2 = 0;
697: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
698: entry_x = x[i]; // load only once from global memory!
699: group_sum0 += entry_x * y0[i];
700: group_sum1 += entry_x * y1[i];
701: group_sum2 += entry_x * y2[i];
702: }
703: tmp_buffer[threadIdx.x] = group_sum0;
704: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
705: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
707: // parallel reduction
708: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
709: __syncthreads();
710: if (threadIdx.x < stride) {
711: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
712: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
713: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
714: }
715: }
717: // write result of group to group_results
718: if (threadIdx.x == 0) {
719: group_results[blockIdx.x ] = tmp_buffer[0];
720: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
721: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
722: }
723: }
725: // M = 4:
726: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
727: PetscInt size, PetscScalar *group_results)
728: {
729: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
730: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
731: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
732: PetscInt vec_start_index = blockIdx.x * entries_per_group;
733: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
735: PetscScalar entry_x = 0;
736: PetscScalar group_sum0 = 0;
737: PetscScalar group_sum1 = 0;
738: PetscScalar group_sum2 = 0;
739: PetscScalar group_sum3 = 0;
740: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
741: entry_x = x[i]; // load only once from global memory!
742: group_sum0 += entry_x * y0[i];
743: group_sum1 += entry_x * y1[i];
744: group_sum2 += entry_x * y2[i];
745: group_sum3 += entry_x * y3[i];
746: }
747: tmp_buffer[threadIdx.x] = group_sum0;
748: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
749: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
750: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
752: // parallel reduction
753: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
754: __syncthreads();
755: if (threadIdx.x < stride) {
756: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
757: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
758: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
759: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
760: }
761: }
763: // write result of group to group_results
764: if (threadIdx.x == 0) {
765: group_results[blockIdx.x ] = tmp_buffer[0];
766: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
767: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
768: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
769: }
770: }
772: // M = 8:
773: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
774: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
775: PetscInt size, PetscScalar *group_results)
776: {
777: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
778: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
779: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
780: PetscInt vec_start_index = blockIdx.x * entries_per_group;
781: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
783: PetscScalar entry_x = 0;
784: PetscScalar group_sum0 = 0;
785: PetscScalar group_sum1 = 0;
786: PetscScalar group_sum2 = 0;
787: PetscScalar group_sum3 = 0;
788: PetscScalar group_sum4 = 0;
789: PetscScalar group_sum5 = 0;
790: PetscScalar group_sum6 = 0;
791: PetscScalar group_sum7 = 0;
792: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
793: entry_x = x[i]; // load only once from global memory!
794: group_sum0 += entry_x * y0[i];
795: group_sum1 += entry_x * y1[i];
796: group_sum2 += entry_x * y2[i];
797: group_sum3 += entry_x * y3[i];
798: group_sum4 += entry_x * y4[i];
799: group_sum5 += entry_x * y5[i];
800: group_sum6 += entry_x * y6[i];
801: group_sum7 += entry_x * y7[i];
802: }
803: tmp_buffer[threadIdx.x] = group_sum0;
804: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
805: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
806: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
807: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
808: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
809: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
810: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
812: // parallel reduction
813: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
814: __syncthreads();
815: if (threadIdx.x < stride) {
816: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
817: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
818: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
819: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
820: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
821: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
822: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
823: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
824: }
825: }
827: // write result of group to group_results
828: if (threadIdx.x == 0) {
829: group_results[blockIdx.x ] = tmp_buffer[0];
830: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
831: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
832: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
833: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
834: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
835: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
836: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
837: }
838: }
843: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
844: {
846: PetscInt i,j,n = xin->map->n,current_y_index = 0;
847: CUSPARRAY *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
848: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
849: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
850: cudaError_t cuda_ierr;
853: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
854: /* Handle the case of local size zero first */
855: if (!xin->map->n) {
856: for (i=0; i<nv; ++i) z[i] = 0;
857: return(0);
858: }
860: // allocate scratchpad memory for the results of individual work groups:
861: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
862: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);
864: VecCUSPGetArrayRead(xin,&xarray);
865: xptr = thrust::raw_pointer_cast(xarray->data());
867: while (current_y_index < nv)
868: {
869: switch (nv - current_y_index) {
871: case 7:
872: case 6:
873: case 5:
874: case 4:
875: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
876: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
877: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
878: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
880: #if defined(PETSC_USE_COMPLEX)
881: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
882: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
883: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
884: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
885: #else
886: // extract raw device pointers:
887: y0ptr = thrust::raw_pointer_cast(y0array->data());
888: y1ptr = thrust::raw_pointer_cast(y1array->data());
889: y2ptr = thrust::raw_pointer_cast(y2array->data());
890: y3ptr = thrust::raw_pointer_cast(y3array->data());
892: // run kernel:
893: VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
895: // copy results back to host:
896: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
897: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
899: // sum group results into z:
900: for (j=0; j<4; ++j) {
901: z[current_y_index + j] = 0;
902: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
903: }
904: #endif
905: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
906: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
907: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
908: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
909: current_y_index += 4;
910: break;
912: case 3:
913: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
914: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
915: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
917: #if defined(PETSC_USE_COMPLEX)
918: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
919: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
920: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
921: #else
922: // extract raw device pointers:
923: y0ptr = thrust::raw_pointer_cast(y0array->data());
924: y1ptr = thrust::raw_pointer_cast(y1array->data());
925: y2ptr = thrust::raw_pointer_cast(y2array->data());
927: // run kernel:
928: VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
930: // copy results back to host:
931: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
932: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
934: // sum group results into z:
935: for (j=0; j<3; ++j) {
936: z[current_y_index + j] = 0;
937: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
938: }
939: #endif
941: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
942: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
943: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
944: current_y_index += 3;
945: break;
947: case 2:
948: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
949: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
951: #if defined(PETSC_USE_COMPLEX)
952: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
953: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
954: #else
955: // extract raw device pointers:
956: y0ptr = thrust::raw_pointer_cast(y0array->data());
957: y1ptr = thrust::raw_pointer_cast(y1array->data());
959: // run kernel:
960: VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
962: // copy results back to host:
963: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
964: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
966: // sum group results into z:
967: for (j=0; j<2; ++j) {
968: z[current_y_index + j] = 0;
969: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
970: }
971: #endif
972: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
973: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
974: current_y_index += 2;
975: break;
977: case 1:
978: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
979: #if defined(PETSC_USE_COMPLEX)
980: z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
981: #else
982: z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
983: #endif
984: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
985: current_y_index += 1;
986: break;
988: default: // 8 or more vectors left
989: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
990: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
991: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
992: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
993: VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
994: VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
995: VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
996: VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);
998: #if defined(PETSC_USE_COMPLEX)
999: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1000: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1001: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1002: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1003: z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
1004: z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
1005: z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
1006: z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
1007: #else
1008: // extract raw device pointers:
1009: y0ptr = thrust::raw_pointer_cast(y0array->data());
1010: y1ptr = thrust::raw_pointer_cast(y1array->data());
1011: y2ptr = thrust::raw_pointer_cast(y2array->data());
1012: y3ptr = thrust::raw_pointer_cast(y3array->data());
1013: y4ptr = thrust::raw_pointer_cast(y4array->data());
1014: y5ptr = thrust::raw_pointer_cast(y5array->data());
1015: y6ptr = thrust::raw_pointer_cast(y6array->data());
1016: y7ptr = thrust::raw_pointer_cast(y7array->data());
1018: // run kernel:
1019: VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
1021: // copy results back to host:
1022: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1023: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1025: // sum group results into z:
1026: for (j=0; j<8; ++j) {
1027: z[current_y_index + j] = 0;
1028: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1029: }
1030: #endif
1031: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1032: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1033: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1034: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1035: VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1036: VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1037: VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1038: VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1039: current_y_index += 8;
1040: break;
1041: }
1042: }
1043: VecCUSPRestoreArrayRead(xin,&xarray);
1045: cuda_ierr = cudaFree(group_results_gpu);
1046: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1047: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1048: return(0);
1049: }
1051: #undef MDOT_WORKGROUP_SIZE
1052: #undef MDOT_WORKGROUP_NUM
1058: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1059: {
1060: CUSPARRAY *xarray=NULL;
1061: PetscErrorCode ierr;
1064: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1065: VecCUSPGetArrayWrite(xin,&xarray);
1066: try {
1067: cusp::blas::fill(*xarray,alpha);
1068: } catch(char *ex) {
1069: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1070: }
1071: ierr = WaitForGPU();CHKERRCUSP(ierr);
1072: VecCUSPRestoreArrayWrite(xin,&xarray);
1073: return(0);
1074: }
1078: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1079: {
1080: CUSPARRAY *xarray;
1081: PetscErrorCode ierr;
1084: if (alpha == 0.0) {
1085: VecSet_SeqCUSP(xin,alpha);
1086: } else if (alpha != 1.0) {
1087: VecCUSPGetArrayReadWrite(xin,&xarray);
1088: try {
1089: cusp::blas::scal(*xarray,alpha);
1090: } catch(char *ex) {
1091: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1092: }
1093: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1094: }
1095: ierr = WaitForGPU();CHKERRCUSP(ierr);
1096: PetscLogFlops(xin->map->n);
1097: return(0);
1098: }
1102: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1103: {
1104: CUSPARRAY *xarray,*yarray;
1105: PetscErrorCode ierr;
1108: //#if defined(PETSC_USE_COMPLEX)
1109: /*Not working for complex*/
1110: //#else
1111: VecCUSPGetArrayRead(xin,&xarray);
1112: VecCUSPGetArrayRead(yin,&yarray);
1113: try {
1114: *z = cusp::blas::dot(*xarray,*yarray);
1115: } catch(char *ex) {
1116: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1117: }
1118: //#endif
1119: ierr = WaitForGPU();CHKERRCUSP(ierr);
1120: if (xin->map->n > 0) {
1121: PetscLogFlops(2.0*xin->map->n-1);
1122: }
1123: VecCUSPRestoreArrayRead(yin,&yarray);
1124: VecCUSPRestoreArrayRead(xin,&xarray);
1125: return(0);
1126: }
1130: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1131: {
1132: CUSPARRAY *xarray,*yarray;
1133: PetscErrorCode ierr;
1136: if (xin != yin) {
1137: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1138: VecCUSPGetArrayRead(xin,&xarray);
1139: VecCUSPGetArrayWrite(yin,&yarray);
1140: try {
1141: cusp::blas::copy(*xarray,*yarray);
1142: } catch(char *ex) {
1143: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1144: }
1145: ierr = WaitForGPU();CHKERRCUSP(ierr);
1146: VecCUSPRestoreArrayRead(xin,&xarray);
1147: VecCUSPRestoreArrayWrite(yin,&yarray);
1149: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1150: /* copy in CPU if we are on the CPU*/
1151: VecCopy_SeqCUSP_Private(xin,yin);
1152: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1153: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1154: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1155: /* copy in CPU */
1156: VecCopy_SeqCUSP_Private(xin,yin);
1158: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1159: /* copy in GPU */
1160: VecCUSPGetArrayRead(xin,&xarray);
1161: VecCUSPGetArrayWrite(yin,&yarray);
1162: try {
1163: cusp::blas::copy(*xarray,*yarray);
1164: ierr = WaitForGPU();CHKERRCUSP(ierr);
1165: } catch(char *ex) {
1166: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1167: }
1168: VecCUSPRestoreArrayRead(xin,&xarray);
1169: VecCUSPRestoreArrayWrite(yin,&yarray);
1170: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1171: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1172: default to copy in GPU (this is an arbitrary choice) */
1173: VecCUSPGetArrayRead(xin,&xarray);
1174: VecCUSPGetArrayWrite(yin,&yarray);
1175: try {
1176: cusp::blas::copy(*xarray,*yarray);
1177: ierr = WaitForGPU();CHKERRCUSP(ierr);
1178: } catch(char *ex) {
1179: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1180: }
1181: VecCUSPRestoreArrayRead(xin,&xarray);
1182: VecCUSPRestoreArrayWrite(yin,&yarray);
1183: } else {
1184: VecCopy_SeqCUSP_Private(xin,yin);
1185: }
1186: }
1187: }
1188: return(0);
1189: }
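/* Summary of the copy-location choices made above:
     xin valid on GPU  -> copy on the GPU
     xin valid on CPU  -> copy on the CPU
     xin valid on BOTH -> follow yin and copy wherever yin currently lives,
                          defaulting to the GPU when yin is also valid in both */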
1194: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1195: {
1196: PetscErrorCode ierr;
1197: PetscBLASInt one = 1,bn;
1198: CUSPARRAY *xarray,*yarray;
1199: cublasStatus_t cberr;
1202: PetscBLASIntCast(xin->map->n,&bn);
1203: if (xin != yin) {
1204: VecCUSPGetArrayReadWrite(xin,&xarray);
1205: VecCUSPGetArrayReadWrite(yin,&yarray);
1207: #if defined(PETSC_USE_COMPLEX)
1208: #if defined(PETSC_USE_REAL_SINGLE)
1209: cberr = cublasCswap(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1210: #else
1211: cberr = cublasZswap(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1212: #endif
1213: #else
1214: #if defined(PETSC_USE_REAL_SINGLE)
1215: cberr = cublasSswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1216: #else
1217: cberr = cublasDswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1218: #endif
1219: #endif
1220: ierr = WaitForGPU();CHKERRCUSP(ierr);
1221: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1222: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1223: }
1224: return(0);
1225: }
1227: struct VecCUSPAX
1228: {
1229: template <typename Tuple>
1230: __host__ __device__
1231: void operator()(Tuple t)
1232: {
1233: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1234: }
1235: };
1239: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1240: {
1241: PetscErrorCode ierr;
1242: PetscScalar a = alpha,b = beta;
1243: CUSPARRAY *xarray,*yarray;
1246: if (a == 0.0) {
1247: VecScale_SeqCUSP(yin,beta);
1248: } else if (b == 1.0) {
1249: VecAXPY_SeqCUSP(yin,alpha,xin);
1250: } else if (a == 1.0) {
1251: VecAYPX_SeqCUSP(yin,beta,xin);
1252: } else if (b == 0.0) {
1253: VecCUSPGetArrayRead(xin,&xarray);
1254: VecCUSPGetArrayReadWrite(yin,&yarray);
1255: try {
1256: thrust::for_each(
1257: thrust::make_zip_iterator(
1258: thrust::make_tuple(
1259: yarray->begin(),
1260: thrust::make_constant_iterator(a),
1261: xarray->begin())),
1262: thrust::make_zip_iterator(
1263: thrust::make_tuple(
1264: yarray->end(),
1265: thrust::make_constant_iterator(a),
1266: xarray->end())),
1267: VecCUSPAX());
1268: } catch(char *ex) {
1269: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1270: }
1271: PetscLogFlops(xin->map->n);
1272: ierr = WaitForGPU();CHKERRCUSP(ierr);
1273: VecCUSPRestoreArrayRead(xin,&xarray);
1274: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1275: } else {
1276: VecCUSPGetArrayRead(xin,&xarray);
1277: VecCUSPGetArrayReadWrite(yin,&yarray);
1278: try {
1279: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1280: } catch(char *ex) {
1281: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1282: }
1283: VecCUSPRestoreArrayRead(xin,&xarray);
1284: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1285: ierr = WaitForGPU();CHKERRCUSP(ierr);
1286: PetscLogFlops(3.0*xin->map->n);
1287: }
1288: return(0);
1289: }
1291: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1292: struct VecCUSPXPBYPCZ
1293: {
1294: /* z = x + b*y + c*z */
1295: template <typename Tuple>
1296: __host__ __device__
1297: void operator()(Tuple t)
1298: {
1299: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1300: }
1301: };
1303: struct VecCUSPAXPBYPZ
1304: {
1305: /* z = ax + b*y + z */
1306: template <typename Tuple>
1307: __host__ __device__
1308: void operator()(Tuple t)
1309: {
1310: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1311: }
1312: };
1316: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1317: {
1318: PetscErrorCode ierr;
1319: PetscInt n = zin->map->n;
1320: CUSPARRAY *xarray,*yarray,*zarray;
1323: VecCUSPGetArrayRead(xin,&xarray);
1324: VecCUSPGetArrayRead(yin,&yarray);
1325: VecCUSPGetArrayReadWrite(zin,&zarray);
1326: if (alpha == 1.0) {
1327: try {
1328: thrust::for_each(
1329: thrust::make_zip_iterator(
1330: thrust::make_tuple(
1331: zarray->begin(),
1332: thrust::make_constant_iterator(gamma),
1333: xarray->begin(),
1334: yarray->begin(),
1335: thrust::make_constant_iterator(beta))),
1336: thrust::make_zip_iterator(
1337: thrust::make_tuple(
1338: zarray->end(),
1339: thrust::make_constant_iterator(gamma),
1340: xarray->end(),
1341: yarray->end(),
1342: thrust::make_constant_iterator(beta))),
1343: VecCUSPXPBYPCZ());
1344: } catch(char *ex) {
1345: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1346: }
1347: PetscLogFlops(4.0*n);
1348: } else if (gamma == 1.0) {
1349: try {
1350: thrust::for_each(
1351: thrust::make_zip_iterator(
1352: thrust::make_tuple(
1353: zarray->begin(),
1354: xarray->begin(),
1355: thrust::make_constant_iterator(alpha),
1356: yarray->begin(),
1357: thrust::make_constant_iterator(beta))),
1358: thrust::make_zip_iterator(
1359: thrust::make_tuple(
1360: zarray->end(),
1361: xarray->end(),
1362: thrust::make_constant_iterator(alpha),
1363: yarray->end(),
1364: thrust::make_constant_iterator(beta))),
1365: VecCUSPAXPBYPZ());
1366: } catch(char *ex) {
1367: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1368: }
1369: PetscLogFlops(4.0*n);
1370: } else {
1371: try {
1372: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1373: } catch(char *ex) {
1374: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1375: }
1376: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1377: VecCUSPRestoreArrayRead(xin,&xarray);
1378: VecCUSPRestoreArrayRead(yin,&yarray);
1379: PetscLogFlops(5.0*n);
1380: }
1381: ierr = WaitForGPU();CHKERRCUSP(ierr);
1382: return(0);
1383: }
1387: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1388: {
1389: PetscErrorCode ierr;
1390: PetscInt n = win->map->n;
1391: CUSPARRAY *xarray,*yarray,*warray;
1394: VecCUSPGetArrayRead(xin,&xarray);
1395: VecCUSPGetArrayRead(yin,&yarray);
1396: VecCUSPGetArrayReadWrite(win,&warray);
1397: try {
1398: cusp::blas::xmy(*xarray,*yarray,*warray);
1399: } catch(char *ex) {
1400: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1401: }
1402: VecCUSPRestoreArrayRead(xin,&xarray);
1403: VecCUSPRestoreArrayRead(yin,&yarray);
1404: VecCUSPRestoreArrayReadWrite(win,&warray);
1405: PetscLogFlops(n);
1406: ierr = WaitForGPU();CHKERRCUSP(ierr);
1407: return(0);
1408: }
1411: /* should do infinity norm in cusp */
1415: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1416: {
1417: const PetscScalar *xx;
1418: PetscErrorCode ierr;
1419: PetscInt n = xin->map->n;
1420: PetscBLASInt one = 1, bn;
1421: CUSPARRAY *xarray;
1422: cublasStatus_t cberr;
1425: PetscBLASIntCast(n,&bn);
1426: if (type == NORM_2 || type == NORM_FROBENIUS) {
1427: VecCUSPGetArrayRead(xin,&xarray);
1428: try {
1429: *z = cusp::blas::nrm2(*xarray);
1430: } catch(char *ex) {
1431: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1432: }
1433: ierr = WaitForGPU();CHKERRCUSP(ierr);
1434: VecCUSPRestoreArrayRead(xin,&xarray);
1435: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1436: } else if (type == NORM_INFINITY) {
1437: PetscInt i;
1438: PetscReal max = 0.0,tmp;
1440: VecGetArrayRead(xin,&xx);
1441: for (i=0; i<n; i++) {
1442: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1443: /* check special case of tmp == NaN */
1444: if (tmp != tmp) {max = tmp; break;}
1445: xx++;
1446: }
1447: VecRestoreArrayRead(xin,&xx);
1448: *z = max;
1449: } else if (type == NORM_1) {
1450: VecCUSPGetArrayRead(xin,&xarray);
1451: #if defined(PETSC_USE_COMPLEX)
1452: #if defined(PETSC_USE_REAL_SINGLE)
1453: cberr = cublasScasum(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1454: #else
1455: cberr = cublasDzasum(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1456: #endif
1457: #else
1458: #if defined(PETSC_USE_REAL_SINGLE)
1459: cberr = cublasSasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1460: #else
1461: cberr = cublasDasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1462: #endif
1463: #endif
1464: VecCUSPRestoreArrayRead(xin,&xarray);
1465: ierr = WaitForGPU();CHKERRCUSP(ierr);
1466: PetscLogFlops(PetscMax(n-1.0,0.0));
1467: } else if (type == NORM_1_AND_2) {
1468: VecNorm_SeqCUSP(xin,NORM_1,z);
1469: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1470: }
1471: return(0);
1472: }
1474: /*The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written*/
1475: template <typename T>
1476: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1477: {
1478: __host__ __device__
1479: T operator()(T x)
1480: {
1481: #if defined(PETSC_USE_COMPLEX)
1482: //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1483: #else
1484: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1485: #endif
1486: }
1487: };
1489: template <typename T>
1490: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1491: {
1492: __host__ __device__
1493: T operator()(T x,T y)
1494: {
1495: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1496: }
1497: };
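/* Worked example for the fused transform_reduce in VecDotNorm2_SeqCUSP() below:
   with s = {1,2} and t = {3,4}, cuspdotnormcalculate produces the per-element
   tuples (3,9) and (8,16), and cuspdotnormreduce sums them to (dp,nm) = (11,25),
   i.e. dp = s.t and nm = t.t in a single pass over the data. */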
1501: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1502: {
1503: PetscErrorCode ierr;
1504: PetscScalar zero = 0.0;
1505: PetscReal n=s->map->n;
1506: thrust::tuple<PetscScalar,PetscScalar> result;
1507: CUSPARRAY *sarray,*tarray;
1510: /*VecCUSPCopyToGPU(s);
1511: VecCUSPCopyToGPU(t);*/
1512: VecCUSPGetArrayRead(s,&sarray);
1513: VecCUSPGetArrayRead(t,&tarray);
1514: try {
1515: #if defined(PETSC_USE_COMPLEX)
1516: VecDot_SeqCUSP(s,t,dp);
1517: VecDot_SeqCUSP(t,t,nm);
1518: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1519: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1520: #else
1521: result = thrust::transform_reduce(
1522: thrust::make_zip_iterator(
1523: thrust::make_tuple(
1524: sarray->begin(),
1525: tarray->begin())),
1526: thrust::make_zip_iterator(
1527: thrust::make_tuple(
1528: sarray->end(),
1529: tarray->end())),
1530: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1531: thrust::make_tuple(zero,zero), /*init */
1532: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1533: *dp = thrust::get<0>(result);
1534: *nm = thrust::get<1>(result);
1535: #endif
1536: } catch(char *ex) {
1537: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1538: }
1539: VecCUSPRestoreArrayRead(s,&sarray);
1540: VecCUSPRestoreArrayRead(t,&tarray);
1541: ierr = WaitForGPU();CHKERRCUSP(ierr);
1542: PetscLogFlops(4.0*n);
1543: return(0);
1544: }
1548: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1549: {
1551: cudaError_t err;
1554: try {
1555: if (v->spptr) {
1556: delete ((Vec_CUSP*)v->spptr)->GPUarray;
1557: err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1558: delete (Vec_CUSP*)v->spptr;
1559: }
1560: } catch(char *ex) {
1561: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1562: }
1563: VecDestroy_SeqCUSP_Private(v);
1564: return(0);
1565: }
1568: #if defined(PETSC_USE_COMPLEX)
1569: struct conjugate
1570: {
1571: __host__ __device__
1572: PetscScalar operator()(PetscScalar x)
1573: {
1574: return cusp::conj(x);
1575: }
1576: };
1577: #endif
1582: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1583: {
1585: CUSPARRAY *xarray;
1588: VecCUSPGetArrayReadWrite(xin,&xarray);
1589: #if defined(PETSC_USE_COMPLEX)
1590: thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1591: #endif
1592: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1593: return(0);
1594: }
1598: PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
1599: {
1600: VecType t;
1602: cudaError_t err;
1603: PetscBool flg;
1608: VecGetType(w,&t);
1609: PetscStrcmp(t,VECSEQCUSP,&flg);
1610: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);
1612: if (w->data) {
1613: if (((Vec_Seq*)w->data)->array_allocated) PetscFree(((Vec_Seq*)w->data)->array_allocated);
1614: ((Vec_Seq*)w->data)->array = 0;
1615: ((Vec_Seq*)w->data)->array_allocated = 0;
1616: ((Vec_Seq*)w->data)->unplacedarray = 0;
1617: }
1618: if (w->spptr) {
1619: if (((Vec_CUSP*)w->spptr)->GPUarray) delete ((Vec_CUSP*)w->spptr)->GPUarray;
1620: err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1621: delete (Vec_CUSP*)w->spptr;
1622: w->spptr = 0;
1623: }
1625: if (v->petscnative) {
1626: PetscFree(w->data);
1627: w->data = v->data;
1628: w->valid_GPU_array = v->valid_GPU_array;
1629: w->spptr = v->spptr;
1630: PetscObjectStateIncrease((PetscObject)w);
1631: } else {
1632: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1633: w->valid_GPU_array = PETSC_CUSP_CPU;
1634: VecCUSPAllocateCheck(w);
1635: }
1636: return(0);
1637: }
1641: PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
1642: {
1643: VecType t;
1645: cudaError_t err;
1646: PetscBool flg;
1651: VecGetType(w,&t);
1652: PetscStrcmp(t,VECSEQCUSP,&flg);
1653: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);
1655: if (v->petscnative) {
1656: v->data = w->data;
1657: v->valid_GPU_array = w->valid_GPU_array;
1658: v->spptr = w->spptr;
1659: VecCUSPCopyFromGPU(v);
1660: PetscObjectStateIncrease((PetscObject)v);
1661: w->data = 0;
1662: w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
1663: w->spptr = 0;
1664: } else {
1665: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1666: if ((Vec_CUSP*)w->spptr) {
1667: delete ((Vec_CUSP*)w->spptr)->GPUarray;
1668: err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1669: delete (Vec_CUSP*)w->spptr;
1670: }
1671: }
1672: return(0);
1673: }
1677: /*@C
1678: VecCUSPGetArrayReadWrite - Provides access to the CUSP vector inside a vector.
1680: This function has semantics similar to VecGetArray(): the CUSP
1681: vector returned by this function points to a consistent view of the
1682: vector data. This may involve a copy operation of data from the host
1683: to the device if the data on the device is out of date. If the
1684: device memory hasn't been allocated previously it will be allocated
1685: as part of this function call. VecCUSPGetArrayReadWrite() assumes
1686: that the user will modify the vector data. This is similar to
1687: intent(inout) in Fortran.
1689: The CUSP device vector has to be released by calling
1690: VecCUSPRestoreArrayReadWrite(). Upon restoring the vector data the
1691: data on the host will be marked as out of date. A subsequent access
1692: of the host data will thus incur a data transfer from the device to
1693: the host.
1696: Input Parameter:
1697: . v - the vector
1699: Output Parameter:
1700: . a - the CUSP device vector
1701:
1704: Fortran note:
1705: This function is not currently available from Fortran.
1707: Level: intermediate
1709: .seealso: VecCUSPRestoreArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1710: @*/
1711: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1712: {
1716: *a = 0;
1717: VecCUSPCopyToGPU(v);
1718: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1719: return(0);
1720: }
1724: /*@C
1725: VecCUSPRestoreArrayReadWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayReadWrite().
1727: This marks the host data as out of date. Subsequent access to the
1728: vector data on the host side with for instance VecGetArray() incurs a
1729: data transfer.
1731: Input Parameter:
1732: + v - the vector
1733: - a - the CUSP device vector. This pointer is invalid after
1734: VecCUSPRestoreArrayReadWrite() returns.
1736: Fortran note:
1737: This function is not currently available from Fortran.
1739: Level: intermediate
1741: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1742: @*/
1743: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1744: {
1748: v->valid_GPU_array = PETSC_CUSP_GPU;
1750: PetscObjectStateIncrease((PetscObject)v);
1751: return(0);
1752: }
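/* Usage sketch for the accessor pair above (illustrative; error checking omitted):

     CUSPARRAY *a;
     VecCUSPGetArrayReadWrite(v,&a);
     cusp::blas::scal(*a,(PetscScalar)2.0);    (operate directly on the device data)
     VecCUSPRestoreArrayReadWrite(v,&a);       (host copy is now marked out of date)
*/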
1756: /*@C
1757: VecCUSPGetArrayRead - Provides read access to the CUSP device vector inside a vector.
1759: This function is analogous to VecGetArrayRead(): The CUSP vector
1760: returned by this function points to a consistent view of the vector
1761: data. This may involve a copy operation of data from the host to the
1762: device if the data on the device is out of date. If the device
1763: memory hasn't been allocated previously it will be allocated as part
1764: of this function call. VecCUSPGetArrayRead() assumes that the user
1765: will not modify the vector data. This is analogous to intent(in) in
1766: Fortran.
1768: The CUSP device vector has to be released by calling
1769: VecCUSPRestoreArrayRead(). If the data on the host side was
1770: previously up to date it will remain so, i.e. data on both the device
1771: and the host is up to date. Accessing data on the host side does not
1772: incur a device to host data transfer.
1774: Input Parameter:
1775: . v - the vector
1777: Output Parameter:
1778: . a - the CUSP device vector
1780: Fortran note:
1781: This function is not currently available from Fortran.
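   Example usage, a minimal sketch (error checking omitted; thrust::reduce
   is assumed to be available through the CUSP headers):
.vb
   CUSPARRAY   *a;
   PetscScalar sum;
   VecCUSPGetArrayRead(v,&a);
   /* read-only reduction on the device; a valid host copy stays valid */
   sum = thrust::reduce(a->begin(),a->end(),(PetscScalar)0.0);
   VecCUSPRestoreArrayRead(v,&a);
.ve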
1783: Level: intermediate
1785: .seealso: VecCUSPRestoreArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1786: @*/
1787: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1788: {
1792: *a = 0;
1793: VecCUSPCopyToGPU(v);
1794: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1795: return(0);
1796: }
1800: /*@C
1801: VecCUSPRestoreArrayRead - Restore a CUSP device vector previously acquired with VecCUSPGetArrayRead().
1803: If the data on the host side was previously up to date it will remain
1804: so, i.e. data on both the device and the host is up to date.
1805: Accessing data on the host side e.g. with VecGetArray() does not
1806: incur a device to host data transfer.
1808: Input Parameters:
1809: + v - the vector
1810: - a - the CUSP device vector. This pointer is invalid after
1811: VecCUSPRestoreArrayRead() returns.
1813: Fortran note:
1814: This function is not currently available from Fortran.
1816: Level: intermediate
1818: .seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1819: @*/
1820: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1821: {
1823: return(0);
1824: }
1828: /*@C
1829: VecCUSPGetArrayWrite - Provides write access to the CUSP device vector inside a vector.
1831: The data pointed to by the device vector is uninitialized. The user
1832: must not read this data. Furthermore, the entire array needs to be
1833: filled by the user to obtain well-defined behaviour. The device
1834: memory will be allocated by this function if it hasn't been allocated
1835: previously. This is analogous to intent(out) in Fortran.
1837: The CUSP device vector needs to be released with
1838: VecCUSPRestoreArrayWrite(). When the pointer is released the host
1839: data of the vector is marked as out of date. Subsequent access of
1840: the host data with e.g. VecGetArray() incurs a device to host data
1841: transfer.
1844: Input Parameter:
1845: . v - the vector
1847: Output Parameter:
1848: . a - the CUSP device vector
1850: Fortran note:
1851: This function is not currently available from Fortran.
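   Example usage, a minimal sketch (error checking omitted). Note that the
   entire array is written, as required:
.vb
   CUSPARRAY *a;
   VecCUSPGetArrayWrite(v,&a);
   thrust::fill(a->begin(),a->end(),(PetscScalar)1.0);  /* overwrite every entry */
   VecCUSPRestoreArrayWrite(v,&a);
.ve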
1853: Level: intermediate
1855: .seealso: VecCUSPRestoreArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecGetArray(), VecGetArrayRead()
1856: @*/
1857: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1858: {
1862: *a = 0;
1863: VecCUSPAllocateCheck(v);
1864: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1865: return(0);
1866: }
1870: /*@C
1871: VecCUSPRestoreArrayWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayWrite().
1873: Data on the host will be marked as out of date. Subsequent access of
1874: the data on the host side e.g. with VecGetArray() will incur a device
1875: to host data transfer.
1877: Input Parameters:
1878: + v - the vector
1879: - a - the CUSP device vector. This pointer is invalid after
1880: VecCUSPRestoreArrayWrite() returns.
1882: Fortran note:
1883: This function is not currently available from Fortran.
1885: Level: intermediate
1887: .seealso: VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1888: @*/
1889: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
1890: {
1894: v->valid_GPU_array = PETSC_CUSP_GPU;
1896: PetscObjectStateIncrease((PetscObject)v);
1897: return(0);
1898: }
1902: /*@C
1903: VecCUSPGetCUDAArrayReadWrite - Provides access to the CUDA buffer inside a vector.
1905: This function has semantics similar to VecGetArray(): the pointer
1906: returned by this function points to a consistent view of the vector
1907: data. This may involve a copy operation of data from the host to the
1908: device if the data on the device is out of date. If the device
1909: memory hasn't been allocated previously it will be allocated as part
1910: of this function call. VecCUSPGetCUDAArrayReadWrite() assumes that
1911: the user will modify the vector data. This is similar to
1912: intent(inout) in Fortran.
1914: The CUDA device pointer has to be released by calling
1915: VecCUSPRestoreCUDAArrayReadWrite(). Upon restoring the vector data
1916: the data on the host will be marked as out of date. A subsequent
1917: access of the host data will thus incur a data transfer from the
1918: device to the host.
1921: Input Parameter:
1922: . v - the vector
1924: Output Parameter:
1925: . a - the CUDA device pointer
1927: Fortran note:
1928: This function is not currently available from Fortran.
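   Example usage, a minimal sketch; scale_kernel is a hypothetical user
   kernel, and error checking is omitted:
.vb
   PetscScalar *d_a;
   PetscInt    n;
   VecGetLocalSize(v,&n);
   VecCUSPGetCUDAArrayReadWrite(v,&d_a);
   scale_kernel<<<(n+255)/256,256>>>(d_a,n);  /* hypothetical kernel reading and writing the raw buffer */
   VecCUSPRestoreCUDAArrayReadWrite(v,&d_a);
.ve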
1930: Level: advanced
1932: .seealso: VecCUSPRestoreCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1933: @*/
1934: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayReadWrite(Vec v, PetscScalar **a)
1935: {
1937: CUSPARRAY *cusparray;
1941: VecCUSPGetArrayReadWrite(v, &cusparray);
1942: *a = thrust::raw_pointer_cast(cusparray->data());
1943: return(0);
1944: }
1948: /*@C
1949: VecCUSPRestoreCUDAArrayReadWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayReadWrite().
1951: This marks the host data as out of date. Subsequent access to the
1952: vector data on the host side with, for instance, VecGetArray() incurs a
1953: data transfer.
1955: Input Parameters:
1956: + v - the vector
1957: - a - the CUDA device pointer. This pointer is invalid after
1958: VecCUSPRestoreCUDAArrayReadWrite() returns.
1960: Fortran note:
1961: This function is not currently available from Fortran.
1963: Level: advanced
1965: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1966: @*/
1967: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayReadWrite(Vec v, PetscScalar **a)
1968: {
1972: v->valid_GPU_array = PETSC_CUSP_GPU;
1973: PetscObjectStateIncrease((PetscObject)v);
1974: return(0);
1975: }
1979: /*@C
1980: VecCUSPGetCUDAArrayRead - Provides read access to the CUDA buffer inside a vector.
1982: This function is analogous to VecGetArrayRead(): The pointer
1983: returned by this function points to a consistent view of the vector
1984: data. This may involve a copy operation of data from the host to the
1985: device if the data on the device is out of date. If the device
1986: memory hasn't been allocated previously it will be allocated as part
1987: of this function call. VecCUSPGetCUDAArrayRead() assumes that the
1988: user will not modify the vector data. This is analogous to
1989: intent(in) in Fortran.
1991: The CUDA device pointer has to be released by calling
1992: VecCUSPRestoreCUDAArrayRead(). If the data on the host side was
1993: previously up to date it will remain so, i.e. data on both the device
1994: and the host is up to date. Accessing data on the host side does not
1995: incur a device to host data transfer.
1997: Input Parameter:
1998: . v - the vector
2000: Output Parameter:
2001: . a - the CUDA device pointer
2003: Fortran note:
2004: This function is not currently available from Fortran.
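   Example usage, a minimal sketch that copies the first entry back to the
   host (further error checking omitted):
.vb
   PetscScalar *d_a,first;
   cudaError_t err;
   VecCUSPGetCUDAArrayRead(v,&d_a);
   err = cudaMemcpy(&first,d_a,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
   VecCUSPRestoreCUDAArrayRead(v,&d_a);
.ve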
2006: Level: advanced
2008: .seealso: VecCUSPRestoreCUDAArrayRead(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
2009: @*/
2010: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayRead(Vec v, PetscScalar **a)
2011: {
2013: CUSPARRAY *cusparray;
2017: VecCUSPGetArrayRead(v, &cusparray);
2018: *a = thrust::raw_pointer_cast(cusparray->data());
2019: return(0);
2020: }
2024: /*@C
2025: VecCUSPRestoreCUDAArrayRead - Restore a device vector previously acquired with VecCUSPGetCUDAArrayRead()
2027: If the data on the host side was previously up to date it will remain
2028: so, i.e. data on both the device and the host is up to date.
2029: Accessing data on the host side e.g. with VecGetArray() does not
2030: incur a device to host data transfer.
2032: Input Parameters:
2033: + v - the vector
2035: - a - the CUDA device pointer. This pointer is invalid after
2036: VecCUSPRestoreCUDAArrayRead() returns.
2038: Fortran note:
2039: This function is not currently available from Fortran.
2041: Level: advanced
2045: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
2046: @*/
2047: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayRead(Vec v, PetscScalar **a)
2048: {
2050: v->valid_GPU_array = PETSC_CUSP_BOTH;
2051: return(0);
2052: }
2056: /*@C
2057: VecCUSPGetCUDAArrayWrite - Provides write access to the CUDA buffer inside a vector.
2059: The data pointed to by the device pointer is uninitialized. The user
2060: may not read from this data. Furthermore, the entire array needs to
2061: be filled by the user to obtain well-defined behaviour. The device
2062: memory will be allocated by this function if it hasn't been allocated
2063: previously. This is analogous to intent(out) in Fortran.
2065: The device pointer needs to be released with
2066: VecCUSPRestoreCUDAArrayWrite(). When the pointer is released the
2067: host data of the vector is marked as out of date. Subsequent access
2068: of the host data with e.g. VecGetArray() incurs a device to host data
2069: transfer.
2072: Input Parameter:
2073: . v - the vector
2075: Output Parameter:
2076: . a - the CUDA pointer
2078: Fortran note:
2079: This function is not currently available from Fortran.
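   Example usage, a minimal sketch that zeros the whole buffer (further
   error checking omitted; zeroing all bytes yields 0.0 entries):
.vb
   PetscScalar *d_a;
   PetscInt    n;
   cudaError_t err;
   VecGetLocalSize(v,&n);
   VecCUSPGetCUDAArrayWrite(v,&d_a);
   err = cudaMemset(d_a,0,n*sizeof(PetscScalar));CHKERRCUSP(err);  /* every entry must be written */
   VecCUSPRestoreCUDAArrayWrite(v,&d_a);
.ve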
2081: Level: advanced
2083: .seealso: VecCUSPRestoreCUDAArrayWrite(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
2084: @*/
2085: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayWrite(Vec v, PetscScalar **a)
2086: {
2088: CUSPARRAY *cusparray;
2092: VecCUSPGetArrayWrite(v, &cusparray);
2093: *a = thrust::raw_pointer_cast(cusparray->data());
2094: return(0);
2095: }
2099: /*@C
2100: VecCUSPRestoreCUDAArrayWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayWrite().
2102: Data on the host will be marked as out of date. Subsequent access of
2103: the data on the host side e.g. with VecGetArray() will incur a device
2104: to host data transfer.
2106: Input Parameters:
2107: + v - the vector
2109: - a - the CUDA device pointer. This pointer is invalid after
2110: VecCUSPRestoreCUDAArrayWrite() returns.
2112: Fortran note:
2113: This function is not currently available from Fortran.
2115: Level: advanced
2117: .seealso: VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
2118: @*/
2119: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayWrite(Vec v, PetscScalar **a)
2120: {
2124: v->valid_GPU_array = PETSC_CUSP_GPU;
2125: PetscObjectStateIncrease((PetscObject)v);
2126: return(0);
2127: }
2132: /*@C
2133: VecCUSPPlaceArray - Allows one to replace the array in a vector with a
2134: CUSPARRAY provided by the user. This is useful to avoid copying a
2135: CUSPARRAY into a vector.
2137: Not Collective
2139: Input Parameters:
2140: + vin - the vector
2141: - a - the CUSPARRAY
2143: Notes:
2144: You can return to the original CUSPARRAY with a call to VecCUSPResetArray().
2145: It is not possible to use VecCUSPPlaceArray() and VecPlaceArray() at the
2146: same time on the same vector.
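   Example usage, a minimal sketch (myarray and its local length n are
   assumptions; error checking omitted):
.vb
   CUSPARRAY myarray(n);            /* user-managed cusp::array1d of local length n */
   VecCUSPPlaceArray(vin,&myarray);
   /* ... operate on vin; it now uses myarray as its device storage ... */
   VecCUSPResetArray(vin);          /* vin uses its original device array again */
.ve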
2148: Level: developer
2150: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPReplaceArray()
2152: @*/
2153: PetscErrorCode VecCUSPPlaceArray(Vec vin,CUSPARRAY *a)
2154: {
2158: VecCUSPCopyToGPU(vin);
2159: if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUSPPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUSPResetArray()/VecResetArray()");
2160: ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar *) ((Vec_CUSP*)vin->spptr)->GPUarray; /* save the previous CUSPARRAY so VecCUSPResetArray() can restore it */
2161: ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2162: vin->valid_GPU_array = PETSC_CUSP_GPU;
2163: return(0);
2164: }
2168: /*@C
2169: VecCUSPReplaceArray - Allows one to replace the CUSPARRAY in a vector
2170: with a CUSPARRAY provided by the user. This is useful to avoid copying
2171: a CUSPARRAY into a vector.
2173: Not Collective
2175: Input Parameters:
2176: + vin - the vector
2177: - a - the CUSPARRAY
2179: Notes:
2180: This permanently replaces the CUSPARRAY and frees the memory associated
2181: with the old CUSPARRAY.
2183: The memory passed in CANNOT be freed by the user. It will be freed
2184: when the vector is destroyed.
2186: Not supported from Fortran
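   Example usage, a minimal sketch (n is the local length; error checking
   omitted):
.vb
   CUSPARRAY *a = new CUSPARRAY(n); /* ownership passes to the vector */
   VecCUSPReplaceArray(vin,a);      /* the old array is freed; do NOT delete a yourself */
.ve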
2188: Level: developer
2190: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPPlaceArray(), VecReplaceArray()
2192: @*/
2193: PetscErrorCode VecCUSPReplaceArray(Vec vin,CUSPARRAY *a)
2194: {
2196: delete ((Vec_CUSP*)vin->spptr)->GPUarray;
2197: ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2198: vin->valid_GPU_array = PETSC_CUSP_GPU;
2199: return(0);
2200: }
2204: /*@C
2205: VecCUSPResetArray - Resets a vector to use its default memory. Call this
2206: after the use of VecCUSPPlaceArray().
2208: Not Collective
2210: Input Parameter:
2211: . vin - the vector
2213: Level: developer
2215: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUSPPlaceArray(), VecCUSPReplaceArray()
2217: @*/
2218: PetscErrorCode VecCUSPResetArray(Vec vin)
2219: {
2223: VecCUSPCopyToGPU(vin);
2224: ((Vec_CUSP*)vin->spptr)->GPUarray = (CUSPARRAY *) ((Vec_Seq*)vin->data)->unplacedarray;
2225: ((Vec_Seq*)vin->data)->unplacedarray = 0;
2226: vin->valid_GPU_array = PETSC_CUSP_GPU;
2227: return(0);
2228: }