Actual source code: veccusp.cu
petsc-3.6.4 2016-04-12
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #define PETSC_SKIP_COMPLEX
7: #include <petscconf.h>
8: PETSC_CUDA_EXTERN_C_BEGIN
9: #include <petsc/private/vecimpl.h> /*I "petscvec.h" I*/
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: PETSC_CUDA_EXTERN_C_END
12: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
14: #include <cuda_runtime.h>
18: /*
19: Allocates space for the vector array on the Host if it does not exist.
20: Does NOT change the PetscCUSPFlag for the vector
21: Does NOT zero the CUSP array
22: */
23: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
24: {
26: PetscScalar *array;
27: Vec_Seq *s = (Vec_Seq*)v->data;
28: PetscInt n = v->map->n;
31: if (!s) {
32: PetscNewLog((PetscObject)v,&s);
33: v->data = s;
34: }
35: if (!s->array) {
36: PetscMalloc1(n,&array);
37: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
38: s->array = array;
39: s->array_allocated = array;
40: if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
41: v->valid_GPU_array = PETSC_CUSP_CPU;
42: }
43: }
44: return(0);
45: }
49: /*
50: Allocates space for the vector array on the GPU if it does not exist.
51: Does NOT change the PetscCUSPFlag for the vector
52: Does NOT zero the CUSP array
54: */
55: PetscErrorCode VecCUSPAllocateCheck(Vec v)
56: {
57: cudaError_t err;
58: cudaStream_t stream;
59: Vec_CUSP *veccusp;
62: if (!v->spptr) {
63: try {
64: v->spptr = new Vec_CUSP;
65: veccusp = (Vec_CUSP*)v->spptr;
66: veccusp->GPUarray = new CUSPARRAY;
67: veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
68: err = cudaStreamCreate(&stream);CHKERRCUSP(err);
69: veccusp->stream = stream;
70: veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
71: v->ops->destroy = VecDestroy_SeqCUSP;
72: if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
73: if (v->data && ((Vec_Seq*)v->data)->array) {
74: v->valid_GPU_array = PETSC_CUSP_CPU;
75: } else {
76: v->valid_GPU_array = PETSC_CUSP_GPU;
77: }
78: }
79: } catch(char *ex) {
80: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
81: }
82: }
83: return(0);
84: }
89: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
90: PetscErrorCode VecCUSPCopyToGPU(Vec v)
91: {
93: cudaError_t err;
94: Vec_CUSP *veccusp;
95: CUSPARRAY *varray;
98: VecCUSPAllocateCheck(v);
99: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
100: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
101: try {
102: veccusp=(Vec_CUSP*)v->spptr;
103: varray=veccusp->GPUarray;
104: err = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
105: } catch(char *ex) {
106: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
107: }
108: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
109: v->valid_GPU_array = PETSC_CUSP_BOTH;
110: }
111: return(0);
112: }
116: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
117: {
118: CUSPARRAY *varray;
120: cudaError_t err;
121: PetscScalar *cpuPtr, *gpuPtr;
122: Vec_Seq *s;
123: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
126: VecCUSPAllocateCheck(v);
127: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
128: s = (Vec_Seq*)v->data;
130: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
131: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
132: gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
133: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
135: /* Note: this code copies the smallest contiguous chunk of data
136: containing ALL of the indices */
137: err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
139: // Set the buffer states
140: v->valid_GPU_array = PETSC_CUSP_BOTH;
141: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
142: }
143: return(0);
144: }
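
/* Worked example (added for exposition, not in the original source): if the scatter
   receives indices {3,7,9}, then recvLowestIndex = 3 and nr -- assumed here to be the
   length of the covering chunk -- is 9-3+1 = 7, so the cudaMemcpy above transfers
   elements 3..9 inclusive in a single call rather than issuing one copy per index. */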
149: /*
150: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
151: */
152: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
153: {
155: cudaError_t err;
156: Vec_CUSP *veccusp;
157: CUSPARRAY *varray;
160: VecCUSPAllocateCheckHost(v);
161: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
162: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
163: try {
164: veccusp=(Vec_CUSP*)v->spptr;
165: varray=veccusp->GPUarray;
166: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
167: } catch(char *ex) {
168: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
169: }
170: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
171: v->valid_GPU_array = PETSC_CUSP_BOTH;
172: }
173: return(0);
174: }
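
/* Summary (added for exposition; derived from the code in this file) of the
   valid_GPU_array coherence states and the transitions the helpers above perform:
     PETSC_CUSP_UNALLOCATED -> PETSC_CUSP_CPU or PETSC_CUSP_GPU   (first allocation)
     PETSC_CUSP_CPU  --VecCUSPCopyToGPU-->   PETSC_CUSP_BOTH
     PETSC_CUSP_GPU  --VecCUSPCopyFromGPU--> PETSC_CUSP_BOTH
     PETSC_CUSP_BOTH --GPU write (VecCUSPRestoreArrayWrite/ReadWrite)--> PETSC_CUSP_GPU
     PETSC_CUSP_BOTH --CPU write (e.g. VecPlaceArray_SeqCUSP)-->         PETSC_CUSP_CPU */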
178: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
179:    which means that we need to recombine the data at some point before using any of the standard functions.
180:    We could add another few flag types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
181:    which must always be called in pairs.
182: */
183: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
184: {
185: CUSPARRAY *varray;
187: cudaError_t err;
188: PetscScalar *cpuPtr, *gpuPtr;
189: Vec_Seq *s;
190: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
193: VecCUSPAllocateCheckHost(v);
194: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
195: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
197: varray=((Vec_CUSP*)v->spptr)->GPUarray;
198: s = (Vec_Seq*)v->data;
199: gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
200: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
202: /* Note: this code copies the smallest contiguous chunk of data
203: containing ALL of the indices */
204: err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
206: VecCUSPRestoreArrayRead(v,&varray);
207: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
208: }
209: return(0);
210: }
214: static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
215: {
216: PetscScalar *ya;
217: const PetscScalar *xa;
218: PetscErrorCode ierr;
221: VecCUSPAllocateCheckHost(xin);
222: VecCUSPAllocateCheckHost(yin);
223: if (xin != yin) {
224: VecGetArrayRead(xin,&xa);
225: VecGetArray(yin,&ya);
226: PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
227: VecRestoreArrayRead(xin,&xa);
228: VecRestoreArray(yin,&ya);
229: }
230: return(0);
231: }
235: static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
236: {
238: PetscInt n = xin->map->n,i;
239: PetscScalar *xx;
242: VecGetArray(xin,&xx);
243: for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
244: VecRestoreArray(xin,&xx);
245: return(0);
246: }
250: static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
251: {
252: Vec_Seq *vs = (Vec_Seq*)v->data;
256: PetscObjectSAWsViewOff(v);
257: #if defined(PETSC_USE_LOG)
258: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
259: #endif
260: if (vs) {
261: if (vs->array_allocated) PetscFree(vs->array_allocated);
262: PetscFree(vs);
263: }
264: return(0);
265: }
269: static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
270: {
271: Vec_Seq *v = (Vec_Seq*)vin->data;
274: v->array = v->unplacedarray;
275: v->unplacedarray = 0;
276: return(0);
277: }
279: /* The following public versions are necessary because we use CUSP in the regular PETSc code and they need to be called from plain C code; a usage sketch follows these wrappers. */
282: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
283: {
287: VecCUSPAllocateCheck(v);
288: return(0);
289: }
293: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
294: {
298: VecCUSPCopyToGPU(v);
299: return(0);
300: }
306: /*
307:   VecCUSPCopyToGPUSome_Public - Copies certain entries of a vector down from the CPU to the GPU
309:   Input Parameters:
310: + v - the vector
311: - ci - the requested indices; these should be created with CUSPIndicesCreate()
313: */
314: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
315: {
319: VecCUSPCopyToGPUSome(v,ci);
320: return(0);
321: }
325: /*
326:   VecCUSPCopyFromGPUSome_Public - Copies certain entries of a vector up from the GPU to the CPU
328:   Input Parameters:
329: + v - the vector
330: - ci - the requested indices; these should be created with CUSPIndicesCreate()
331: */
332: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
333: {
337: VecCUSPCopyFromGPUSome(v,ci);
338: return(0);
339: }
341: /*MC
342: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
344: Options Database Keys:
345: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
347: Level: beginner
349: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
350: M*/
352: /* for VecAYPX_SeqCUSP */
353: namespace cusp
354: {
355: namespace blas
356: {
357: namespace detail
358: {
359: template <typename T>
360: struct AYPX : public thrust::binary_function<T,T,T>
361: {
362: T alpha;
364: AYPX(T _alpha) : alpha(_alpha) {}
366: __host__ __device__
367: T operator()(T x, T y)
368: {
369: return alpha * y + x;
370: }
371: };
372: }
374: template <typename ForwardIterator1,
375: typename ForwardIterator2,
376: typename ScalarType>
377: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
378: {
379: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
380: }
381: template <typename Array1, typename Array2, typename ScalarType>
382: void aypx(const Array1& x, Array2& y, ScalarType alpha)
383: {
384: #if defined(CUSP_VERSION) && CUSP_VERSION >= 500
385: cusp::assert_same_dimensions(x,y);
386: #else
387: detail::assert_same_dimensions(x,y);
388: #endif
389: aypx(x.begin(),x.end(),y.begin(),alpha);
390: }
391: }
392: }
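
/* Standalone sketch (added for illustration, not in the original source; it would live
   in its own .cu file) of the transform the aypx() injection above performs:
   y[i] = alpha*y[i] + x[i].  Only Thrust is assumed; `aypx_demo` is an illustrative name. */
#include <thrust/device_vector.h>
#include <thrust/transform.h>
static void aypx_demo()
{
  const double alpha = 2.0;
  thrust::device_vector<double> x(4, 1.0);   /* x = (1,1,1,1) */
  thrust::device_vector<double> y(4, 3.0);   /* y = (3,3,3,3) */
  /* apply AYPX(x_i, y_i) = alpha*y_i + x_i elementwise, writing back into y */
  thrust::transform(x.begin(),x.end(),y.begin(),y.begin(),
                    cusp::blas::detail::AYPX<double>(alpha));
  /* y is now (7,7,7,7): 2*3 + 1 */
}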
396: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
397: {
398: CUSPARRAY *xarray,*yarray;
402: VecCUSPGetArrayRead(xin,&xarray);
403: VecCUSPGetArrayReadWrite(yin,&yarray);
404: try {
405: if (alpha != 0.0) {
406: cusp::blas::aypx(*xarray,*yarray,alpha);
407: PetscLogFlops(2.0*yin->map->n);
408: } else {
409: cusp::blas::copy(*xarray,*yarray);
410: }
411: WaitForGPU();CHKERRCUSP(ierr);
412: } catch(char *ex) {
413: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
414: }
415: VecCUSPRestoreArrayRead(xin,&xarray);
416: VecCUSPRestoreArrayReadWrite(yin,&yarray);
417: return(0);
418: }
423: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
424: {
425: CUSPARRAY *xarray,*yarray;
429: if (alpha != 0.0) {
430: VecCUSPGetArrayRead(xin,&xarray);
431: VecCUSPGetArrayReadWrite(yin,&yarray);
432: try {
433: cusp::blas::axpy(*xarray,*yarray,alpha);
434: WaitForGPU();CHKERRCUSP(ierr);
435: } catch(char *ex) {
436: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
437: }
438: VecCUSPRestoreArrayRead(xin,&xarray);
439: VecCUSPRestoreArrayReadWrite(yin,&yarray);
440: PetscLogFlops(2.0*yin->map->n);
441: }
442: return(0);
443: }
445: struct VecCUSPPointwiseDivide
446: {
447: template <typename Tuple>
448: __host__ __device__
449: void operator()(Tuple t)
450: {
451: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
452: }
453: };
457: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
458: {
459: CUSPARRAY *warray=NULL,*xarray=NULL,*yarray=NULL;
463: VecCUSPGetArrayRead(xin,&xarray);
464: VecCUSPGetArrayRead(yin,&yarray);
465: VecCUSPGetArrayWrite(win,&warray);
466: try {
467: thrust::for_each(
468: thrust::make_zip_iterator(
469: thrust::make_tuple(
470: warray->begin(),
471: xarray->begin(),
472: yarray->begin())),
473: thrust::make_zip_iterator(
474: thrust::make_tuple(
475: warray->end(),
476: xarray->end(),
477: yarray->end())),
478: VecCUSPPointwiseDivide());
479: WaitForGPU();CHKERRCUSP(ierr);
480: } catch(char *ex) {
481: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
482: }
483: PetscLogFlops(win->map->n);
484: VecCUSPRestoreArrayRead(xin,&xarray);
485: VecCUSPRestoreArrayRead(yin,&yarray);
486: VecCUSPRestoreArrayWrite(win,&warray);
487: return(0);
488: }
491: struct VecCUSPWAXPY
492: {
493: template <typename Tuple>
494: __host__ __device__
495: void operator()(Tuple t)
496: {
497: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
498: }
499: };
501: struct VecCUSPSum
502: {
503: template <typename Tuple>
504: __host__ __device__
505: void operator()(Tuple t)
506: {
507: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
508: }
509: };
511: struct VecCUSPDiff
512: {
513: template <typename Tuple>
514: __host__ __device__
515: void operator()(Tuple t)
516: {
517: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
518: }
519: };
523: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
524: {
525: CUSPARRAY *xarray=NULL,*yarray=NULL,*warray=NULL;
529: if (alpha == 0.0) {
530: VecCopy_SeqCUSP(yin,win);
531: } else {
532: VecCUSPGetArrayRead(xin,&xarray);
533: VecCUSPGetArrayRead(yin,&yarray);
534: VecCUSPGetArrayWrite(win,&warray);
535: if (alpha == 1.0) {
536: try {
537: thrust::for_each(
538: thrust::make_zip_iterator(
539: thrust::make_tuple(
540: warray->begin(),
541: yarray->begin(),
542: xarray->begin())),
543: thrust::make_zip_iterator(
544: thrust::make_tuple(
545: warray->end(),
546: yarray->end(),
547: xarray->end())),
548: VecCUSPSum());
549: } catch(char *ex) {
550: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
551: }
552: PetscLogFlops(win->map->n);
553: } else if (alpha == -1.0) {
554: try {
555: thrust::for_each(
556: thrust::make_zip_iterator(
557: thrust::make_tuple(
558: warray->begin(),
559: yarray->begin(),
560: xarray->begin())),
561: thrust::make_zip_iterator(
562: thrust::make_tuple(
563: warray->end(),
564: yarray->end(),
565: xarray->end())),
566: VecCUSPDiff());
567: } catch(char *ex) {
568: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
569: }
570: PetscLogFlops(win->map->n);
571: } else {
572: try {
573: thrust::for_each(
574: thrust::make_zip_iterator(
575: thrust::make_tuple(
576: warray->begin(),
577: yarray->begin(),
578: thrust::make_constant_iterator(alpha),
579: xarray->begin())),
580: thrust::make_zip_iterator(
581: thrust::make_tuple(
582: warray->end(),
583: yarray->end(),
584: thrust::make_constant_iterator(alpha),
585: xarray->end())),
586: VecCUSPWAXPY());
587: } catch(char *ex) {
588: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
589: }
590: PetscLogFlops(2*win->map->n);
591: }
592: WaitForGPU();CHKERRCUSP(ierr);
593: VecCUSPRestoreArrayRead(xin,&xarray);
594: VecCUSPRestoreArrayRead(yin,&yarray);
595: VecCUSPRestoreArrayWrite(win,&warray);
596: }
597: return(0);
598: }
600: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
601: struct VecCUSPMAXPY4
602: {
603: template <typename Tuple>
604: __host__ __device__
605: void operator()(Tuple t)
606: {
607: /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
608: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
609: }
610: };
613: struct VecCUSPMAXPY3
614: {
615: template <typename Tuple>
616: __host__ __device__
617: void operator()(Tuple t)
618: {
619: /* y += a1*x1 + a2*x2 + a3*x3 */
620: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
621: }
622: };
624: struct VecCUSPMAXPY2
625: {
626: template <typename Tuple>
627: __host__ __device__
628: void operator()(Tuple t)
629: {
630: /* y += a1*x1 + a2*x2 */
631: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
632: }
633: };
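
/* Worked example (added for exposition): for nv = 7, j_rem = nv & 0x3 = 3, so the
   switch below first folds in 3 vectors with VecCUSPMAXPY3, and the subsequent loop
   handles the remaining 4 with VecCUSPMAXPY4.  Unrolling by 4 amortizes the kernel
   launch and the read-modify-write traffic on x over several y vectors. */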
636: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
637: {
639: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
640: PetscInt n = xin->map->n,j,j_rem;
641: PetscScalar alpha0,alpha1,alpha2,alpha3;
644: PetscLogFlops(nv*2.0*n);
645: VecCUSPGetArrayReadWrite(xin,&xarray);
646: switch (j_rem=nv&0x3) {
647: case 3:
648: alpha0 = alpha[0];
649: alpha1 = alpha[1];
650: alpha2 = alpha[2];
651: alpha += 3;
652: VecCUSPGetArrayRead(y[0],&yy0);
653: VecCUSPGetArrayRead(y[1],&yy1);
654: VecCUSPGetArrayRead(y[2],&yy2);
655: try {
656: thrust::for_each(
657: thrust::make_zip_iterator(
658: thrust::make_tuple(
659: xarray->begin(),
660: thrust::make_constant_iterator(alpha0),
661: yy0->begin(),
662: thrust::make_constant_iterator(alpha1),
663: yy1->begin(),
664: thrust::make_constant_iterator(alpha2),
665: yy2->begin())),
666: thrust::make_zip_iterator(
667: thrust::make_tuple(
668: xarray->end(),
669: thrust::make_constant_iterator(alpha0),
670: yy0->end(),
671: thrust::make_constant_iterator(alpha1),
672: yy1->end(),
673: thrust::make_constant_iterator(alpha2),
674: yy2->end())),
675: VecCUSPMAXPY3());
676: } catch(char *ex) {
677: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
678: }
679: VecCUSPRestoreArrayRead(y[0],&yy0);
680: VecCUSPRestoreArrayRead(y[1],&yy1);
681: VecCUSPRestoreArrayRead(y[2],&yy2);
682: y += 3;
683: break;
684: case 2:
685: alpha0 = alpha[0];
686: alpha1 = alpha[1];
687: alpha +=2;
688: VecCUSPGetArrayRead(y[0],&yy0);
689: VecCUSPGetArrayRead(y[1],&yy1);
690: try {
691: thrust::for_each(
692: thrust::make_zip_iterator(
693: thrust::make_tuple(
694: xarray->begin(),
695: thrust::make_constant_iterator(alpha0),
696: yy0->begin(),
697: thrust::make_constant_iterator(alpha1),
698: yy1->begin())),
699: thrust::make_zip_iterator(
700: thrust::make_tuple(
701: xarray->end(),
702: thrust::make_constant_iterator(alpha0),
703: yy0->end(),
704: thrust::make_constant_iterator(alpha1),
705: yy1->end())),
706: VecCUSPMAXPY2());
707: } catch(char *ex) {
708: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
709: }
710: y +=2;
711: break;
712: case 1:
713: alpha0 = *alpha++;
714: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
715: y +=1;
716: break;
717: }
718: for (j=j_rem; j<nv; j+=4) {
719: alpha0 = alpha[0];
720: alpha1 = alpha[1];
721: alpha2 = alpha[2];
722: alpha3 = alpha[3];
723: alpha += 4;
724: VecCUSPGetArrayRead(y[0],&yy0);
725: VecCUSPGetArrayRead(y[1],&yy1);
726: VecCUSPGetArrayRead(y[2],&yy2);
727: VecCUSPGetArrayRead(y[3],&yy3);
728: try {
729: thrust::for_each(
730: thrust::make_zip_iterator(
731: thrust::make_tuple(
732: xarray->begin(),
733: thrust::make_constant_iterator(alpha0),
734: yy0->begin(),
735: thrust::make_constant_iterator(alpha1),
736: yy1->begin(),
737: thrust::make_constant_iterator(alpha2),
738: yy2->begin(),
739: thrust::make_constant_iterator(alpha3),
740: yy3->begin())),
741: thrust::make_zip_iterator(
742: thrust::make_tuple(
743: xarray->end(),
744: thrust::make_constant_iterator(alpha0),
745: yy0->end(),
746: thrust::make_constant_iterator(alpha1),
747: yy1->end(),
748: thrust::make_constant_iterator(alpha2),
749: yy2->end(),
750: thrust::make_constant_iterator(alpha3),
751: yy3->end())),
752: VecCUSPMAXPY4());
753: } catch(char *ex) {
754: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
755: }
756: VecCUSPRestoreArrayRead(y[0],&yy0);
757: VecCUSPRestoreArrayRead(y[1],&yy1);
758: VecCUSPRestoreArrayRead(y[2],&yy2);
759: VecCUSPRestoreArrayRead(y[3],&yy3);
760: y += 4;
761: }
762: VecCUSPRestoreArrayReadWrite(xin,&xarray);
763: WaitForGPU();CHKERRCUSP(ierr);
764: return(0);
765: }
770: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
771: {
772: CUSPARRAY *xarray,*yarray;
774: // PetscScalar *xptr,*yptr,*zgpu;
775: //PetscReal tmp;
778: //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
779: //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
780: VecCUSPGetArrayRead(xin,&xarray);
781: VecCUSPGetArrayRead(yin,&yarray);
782: try {
783: #if defined(PETSC_USE_COMPLEX)
784: *z = cusp::blas::dotc(*yarray,*xarray);
785: #else
786: *z = cusp::blas::dot(*yarray,*xarray);
787: #endif
788: } catch(char *ex) {
789: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
790: }
791: WaitForGPU();CHKERRCUSP(ierr);
792: if (xin->map->n >0) {
793: PetscLogFlops(2.0*xin->map->n-1);
794: }
795: VecCUSPRestoreArrayRead(xin,&xarray);
796: VecCUSPRestoreArrayRead(yin,&yarray);
797: return(0);
798: }
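
/* Note (added for exposition): in the complex case the arguments to dotc() are
   deliberately (y,x), since dotc conjugates its first argument; this yields
   z = sum_i conj(y_i)*x_i, matching PETSc's definition of VecDot(x,y) = y^H x. */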
800: //
801: // CUDA kernels for MDot to follow
802: //
804: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
805: #define MDOT_WORKGROUP_SIZE 128
806: #define MDOT_WORKGROUP_NUM 128
808: // M = 2:
809: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
810: PetscInt size, PetscScalar *group_results)
811: {
812: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
813: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
814: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
815: PetscInt vec_start_index = blockIdx.x * entries_per_group;
816: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
818: PetscScalar entry_x = 0;
819: PetscScalar group_sum0 = 0;
820: PetscScalar group_sum1 = 0;
821: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
822: entry_x = x[i]; // load only once from global memory!
823: group_sum0 += entry_x * y0[i];
824: group_sum1 += entry_x * y1[i];
825: }
826: tmp_buffer[threadIdx.x] = group_sum0;
827: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
829: // parallel reduction
830: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
831: __syncthreads();
832: if (threadIdx.x < stride) {
833: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
834: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
835: }
836: }
838: // write result of group to group_results
839: if (threadIdx.x == 0) {
840: group_results[blockIdx.x] = tmp_buffer[0];
841: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
842: }
843: }
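
/* Layout note (added for exposition): each kernel stores the per-block partial sum of
   dot product j from block b at group_results[j*gridDim.x + b].  With
   MDOT_WORKGROUP_NUM = 128 blocks of MDOT_WORKGROUP_SIZE = 128 threads and, say,
   size = 1000000, each block strides over entries_per_group = (1000000-1)/128 + 1 = 7813
   entries, and the host later sums the 128 partials per dot product. */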
845: // M = 3:
846: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
847: PetscInt size, PetscScalar *group_results)
848: {
849: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
850: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
851: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
852: PetscInt vec_start_index = blockIdx.x * entries_per_group;
853: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
855: PetscScalar entry_x = 0;
856: PetscScalar group_sum0 = 0;
857: PetscScalar group_sum1 = 0;
858: PetscScalar group_sum2 = 0;
859: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
860: entry_x = x[i]; // load only once from global memory!
861: group_sum0 += entry_x * y0[i];
862: group_sum1 += entry_x * y1[i];
863: group_sum2 += entry_x * y2[i];
864: }
865: tmp_buffer[threadIdx.x] = group_sum0;
866: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
867: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
869: // parallel reduction
870: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
871: __syncthreads();
872: if (threadIdx.x < stride) {
873: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
874: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
875: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
876: }
877: }
879: // write result of group to group_results
880: if (threadIdx.x == 0) {
881: group_results[blockIdx.x ] = tmp_buffer[0];
882: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
883: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
884: }
885: }
887: // M = 4:
888: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
889: PetscInt size, PetscScalar *group_results)
890: {
891: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
892: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
893: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
894: PetscInt vec_start_index = blockIdx.x * entries_per_group;
895: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
897: PetscScalar entry_x = 0;
898: PetscScalar group_sum0 = 0;
899: PetscScalar group_sum1 = 0;
900: PetscScalar group_sum2 = 0;
901: PetscScalar group_sum3 = 0;
902: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
903: entry_x = x[i]; // load only once from global memory!
904: group_sum0 += entry_x * y0[i];
905: group_sum1 += entry_x * y1[i];
906: group_sum2 += entry_x * y2[i];
907: group_sum3 += entry_x * y3[i];
908: }
909: tmp_buffer[threadIdx.x] = group_sum0;
910: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
911: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
912: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
914: // parallel reduction
915: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
916: __syncthreads();
917: if (threadIdx.x < stride) {
918: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
919: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
920: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
921: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
922: }
923: }
925: // write result of group to group_results
926: if (threadIdx.x == 0) {
927: group_results[blockIdx.x ] = tmp_buffer[0];
928: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
929: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
930: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
931: }
932: }
934: // M = 8:
935: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
936: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
937: PetscInt size, PetscScalar *group_results)
938: {
939: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
940: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
941: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
942: PetscInt vec_start_index = blockIdx.x * entries_per_group;
943: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
945: PetscScalar entry_x = 0;
946: PetscScalar group_sum0 = 0;
947: PetscScalar group_sum1 = 0;
948: PetscScalar group_sum2 = 0;
949: PetscScalar group_sum3 = 0;
950: PetscScalar group_sum4 = 0;
951: PetscScalar group_sum5 = 0;
952: PetscScalar group_sum6 = 0;
953: PetscScalar group_sum7 = 0;
954: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
955: entry_x = x[i]; // load only once from global memory!
956: group_sum0 += entry_x * y0[i];
957: group_sum1 += entry_x * y1[i];
958: group_sum2 += entry_x * y2[i];
959: group_sum3 += entry_x * y3[i];
960: group_sum4 += entry_x * y4[i];
961: group_sum5 += entry_x * y5[i];
962: group_sum6 += entry_x * y6[i];
963: group_sum7 += entry_x * y7[i];
964: }
965: tmp_buffer[threadIdx.x] = group_sum0;
966: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
967: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
968: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
969: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
970: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
971: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
972: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
974: // parallel reduction
975: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
976: __syncthreads();
977: if (threadIdx.x < stride) {
978: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
979: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
980: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
981: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
982: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
983: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
984: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
985: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
986: }
987: }
989: // write result of group to group_results
990: if (threadIdx.x == 0) {
991: group_results[blockIdx.x ] = tmp_buffer[0];
992: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
993: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
994: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
995: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
996: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
997: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
998: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
999: }
1000: }
1005: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1006: {
1008: PetscInt i,j,n = xin->map->n,current_y_index = 0;
1009: CUSPARRAY *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
1010: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
1011: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
1012: cudaError_t cuda_ierr;
1015: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
1016: /* Handle the case of local size zero first */
1017: if (!xin->map->n) {
1018: for (i=0; i<nv; ++i) z[i] = 0;
1019: return(0);
1020: }
1022: // allocate scratchpad memory for the results of individual work groups:
1023: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
1024: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);
1026: VecCUSPGetArrayRead(xin,&xarray);
1027: xptr = thrust::raw_pointer_cast(xarray->data());
1029: while (current_y_index < nv)
1030: {
1031: switch (nv - current_y_index) {
1033: case 7:
1034: case 6:
1035: case 5:
1036: case 4:
1037: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1038: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1039: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1040: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1042: #if defined(PETSC_USE_COMPLEX)
1043: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1044: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1045: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1046: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1047: #else
1048: // extract raw device pointers:
1049: y0ptr = thrust::raw_pointer_cast(y0array->data());
1050: y1ptr = thrust::raw_pointer_cast(y1array->data());
1051: y2ptr = thrust::raw_pointer_cast(y2array->data());
1052: y3ptr = thrust::raw_pointer_cast(y3array->data());
1054: // run kernel:
1055: VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
1057: // copy results back to CPU:
1058: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
1059: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1061: // sum group results into z:
1062: for (j=0; j<4; ++j) {
1063: z[current_y_index + j] = 0;
1064: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1065: }
1066: #endif
1067: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1068: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1069: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1070: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1071: current_y_index += 4;
1072: break;
1074: case 3:
1075: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1076: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1077: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1079: #if defined(PETSC_USE_COMPLEX)
1080: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1081: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1082: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1083: #else
1084: // extract raw device pointers:
1085: y0ptr = thrust::raw_pointer_cast(y0array->data());
1086: y1ptr = thrust::raw_pointer_cast(y1array->data());
1087: y2ptr = thrust::raw_pointer_cast(y2array->data());
1089: // run kernel:
1090: VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
1092: // copy results back to CPU:
1093: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
1094: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1096: // sum group results into z:
1097: for (j=0; j<3; ++j) {
1098: z[current_y_index + j] = 0;
1099: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1100: }
1101: #endif
1103: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1104: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1105: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1106: current_y_index += 3;
1107: break;
1109: case 2:
1110: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1111: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1113: #if defined(PETSC_USE_COMPLEX)
1114: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1115: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1116: #else
1117: // extract raw device pointers:
1118: y0ptr = thrust::raw_pointer_cast(y0array->data());
1119: y1ptr = thrust::raw_pointer_cast(y1array->data());
1121: // run kernel:
1122: VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
1124: // copy results back to CPU:
1125: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
1126: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1128: // sum group results into z:
1129: for (j=0; j<2; ++j) {
1130: z[current_y_index + j] = 0;
1131: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1132: }
1133: #endif
1134: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1135: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1136: current_y_index += 2;
1137: break;
1139: case 1:
1140: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1141: #if defined(PETSC_USE_COMPLEX)
1142: z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
1143: #else
1144: z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
1145: #endif
1146: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1147: current_y_index += 1;
1148: break;
1150: default: // 8 or more vectors left
1151: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1152: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1153: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1154: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1155: VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
1156: VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
1157: VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
1158: VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);
1160: #if defined(PETSC_USE_COMPLEX)
1161: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1162: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1163: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1164: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1165: z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
1166: z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
1167: z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
1168: z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
1169: #else
1170: // extract raw device pointers:
1171: y0ptr = thrust::raw_pointer_cast(y0array->data());
1172: y1ptr = thrust::raw_pointer_cast(y1array->data());
1173: y2ptr = thrust::raw_pointer_cast(y2array->data());
1174: y3ptr = thrust::raw_pointer_cast(y3array->data());
1175: y4ptr = thrust::raw_pointer_cast(y4array->data());
1176: y5ptr = thrust::raw_pointer_cast(y5array->data());
1177: y6ptr = thrust::raw_pointer_cast(y6array->data());
1178: y7ptr = thrust::raw_pointer_cast(y7array->data());
1180: // run kernel:
1181: VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
1183: // copy results back to CPU:
1184: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1185: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1187: // sum group results into z:
1188: for (j=0; j<8; ++j) {
1189: z[current_y_index + j] = 0;
1190: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1191: }
1192: #endif
1193: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1194: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1195: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1196: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1197: VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1198: VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1199: VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1200: VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1201: current_y_index += 8;
1202: break;
1203: }
1204: }
1205: VecCUSPRestoreArrayRead(xin,&xarray);
1207: cuda_ierr = cudaFree(group_results_gpu);
1208: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1209: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1210: return(0);
1211: }
1213: #undef MDOT_WORKGROUP_SIZE
1214: #undef MDOT_WORKGROUP_NUM
1220: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1221: {
1222: CUSPARRAY *xarray=NULL;
1226: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that */
1227: VecCUSPGetArrayWrite(xin,&xarray);
1228: try {
1229: cusp::blas::fill(*xarray,alpha);
1230: } catch(char *ex) {
1231: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1232: }
1233: WaitForGPU();CHKERRCUSP(ierr);
1234: VecCUSPRestoreArrayWrite(xin,&xarray);
1235: return(0);
1236: }
1240: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1241: {
1242: CUSPARRAY *xarray;
1246: if (alpha == 0.0) {
1247: VecSet_SeqCUSP(xin,alpha);
1248: } else if (alpha != 1.0) {
1249: VecCUSPGetArrayReadWrite(xin,&xarray);
1250: try {
1251: cusp::blas::scal(*xarray,alpha);
1252: } catch(char *ex) {
1253: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1254: }
1255: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1256: }
1257: WaitForGPU();CHKERRCUSP(ierr);
1258: PetscLogFlops(xin->map->n);
1259: return(0);
1260: }
1265: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1266: {
1267: CUSPARRAY *xarray,*yarray;
1271: //#if defined(PETSC_USE_COMPLEX)
1272: /*Not working for complex*/
1273: //#else
1274: VecCUSPGetArrayRead(xin,&xarray);
1275: VecCUSPGetArrayRead(yin,&yarray);
1276: try {
1277: *z = cusp::blas::dot(*xarray,*yarray);
1278: } catch(char *ex) {
1279: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1280: }
1281: //#endif
1282: WaitForGPU();CHKERRCUSP(ierr);
1283: if (xin->map->n > 0) {
1284: PetscLogFlops(2.0*xin->map->n-1);
1285: }
1286: VecCUSPRestoreArrayRead(yin,&yarray);
1287: VecCUSPRestoreArrayRead(xin,&xarray);
1288: return(0);
1289: }
1292: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1293: {
1294: CUSPARRAY *xarray,*yarray;
1298: if (xin != yin) {
1299: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1300: VecCUSPGetArrayRead(xin,&xarray);
1301: VecCUSPGetArrayWrite(yin,&yarray);
1302: try {
1303: cusp::blas::copy(*xarray,*yarray);
1304: } catch(char *ex) {
1305: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1306: }
1307: WaitForGPU();CHKERRCUSP(ierr);
1308: VecCUSPRestoreArrayRead(xin,&xarray);
1309: VecCUSPRestoreArrayWrite(yin,&yarray);
1311: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1312: /* copy in CPU if we are on the CPU*/
1313: VecCopy_SeqCUSP_Private(xin,yin);
1314: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1315: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1316: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1317: /* copy in CPU */
1318: VecCopy_SeqCUSP_Private(xin,yin);
1320: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1321: /* copy in GPU */
1322: VecCUSPGetArrayRead(xin,&xarray);
1323: VecCUSPGetArrayWrite(yin,&yarray);
1324: try {
1325: cusp::blas::copy(*xarray,*yarray);
1326: WaitForGPU();CHKERRCUSP(ierr);
1327: } catch(char *ex) {
1328: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1329: }
1330: VecCUSPRestoreArrayRead(xin,&xarray);
1331: VecCUSPRestoreArrayWrite(yin,&yarray);
1332: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1333: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1334:    default to copying on the GPU (this is an arbitrary choice) */
1335: VecCUSPGetArrayRead(xin,&xarray);
1336: VecCUSPGetArrayWrite(yin,&yarray);
1337: try {
1338: cusp::blas::copy(*xarray,*yarray);
1339: WaitForGPU();CHKERRCUSP(ierr);
1340: } catch(char *ex) {
1341: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1342: }
1343: VecCUSPRestoreArrayRead(xin,&xarray);
1344: VecCUSPRestoreArrayWrite(yin,&yarray);
1345: } else {
1346: VecCopy_SeqCUSP_Private(xin,yin);
1347: }
1348: }
1349: }
1350: return(0);
1351: }
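
/* Summary (added for exposition) of where VecCopy_SeqCUSP performs the copy,
   given xin's and yin's valid_GPU_array flags:
     xin = GPU               : copy on the GPU
     xin = CPU               : copy on the CPU
     xin = BOTH, yin = CPU   : copy on the CPU
     xin = BOTH, yin = GPU   : copy on the GPU
     xin = BOTH, yin = BOTH  : copy on the GPU (arbitrary choice)
     xin = BOTH, otherwise   : copy on the CPU (fallback) */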
1356: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1357: {
1359: PetscBLASInt one = 1,bn;
1360: CUSPARRAY *xarray,*yarray;
1363: PetscBLASIntCast(xin->map->n,&bn);
1364: if (xin != yin) {
1365: VecCUSPGetArrayReadWrite(xin,&xarray);
1366: VecCUSPGetArrayReadWrite(yin,&yarray);
1368: #if defined(PETSC_USE_COMPLEX)
1369: #if defined(PETSC_USE_REAL_SINGLE)
1370: cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
1371: #else
1372: cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
1373: #endif
1374: #else
1375: #if defined(PETSC_USE_REAL_SINGLE)
1376: cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1377: #else
1378: cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1379: #endif
1380: #endif
1381: cublasGetError();CHKERRCUSP(ierr);
1382: WaitForGPU();CHKERRCUSP(ierr);
1383: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1384: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1385: }
1386: return(0);
1387: }
1389: struct VecCUSPAX
1390: {
1391: template <typename Tuple>
1392: __host__ __device__
1393: void operator()(Tuple t)
1394: {
1395: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1396: }
1397: };
1400: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1401: {
1403: PetscScalar a = alpha,b = beta;
1404: CUSPARRAY *xarray,*yarray;
1407: if (a == 0.0) {
1408: VecScale_SeqCUSP(yin,beta);
1409: } else if (b == 1.0) {
1410: VecAXPY_SeqCUSP(yin,alpha,xin);
1411: } else if (a == 1.0) {
1412: VecAYPX_SeqCUSP(yin,beta,xin);
1413: } else if (b == 0.0) {
1414: VecCUSPGetArrayRead(xin,&xarray);
1415: VecCUSPGetArrayReadWrite(yin,&yarray);
1416: try {
1417: thrust::for_each(
1418: thrust::make_zip_iterator(
1419: thrust::make_tuple(
1420: yarray->begin(),
1421: thrust::make_constant_iterator(a),
1422: xarray->begin())),
1423: thrust::make_zip_iterator(
1424: thrust::make_tuple(
1425: yarray->end(),
1426: thrust::make_constant_iterator(a),
1427: xarray->end())),
1428: VecCUSPAX());
1429: } catch(char *ex) {
1430: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1431: }
1432: PetscLogFlops(xin->map->n);
1433: WaitForGPU();CHKERRCUSP(ierr);
1434: VecCUSPRestoreArrayRead(xin,&xarray);
1435: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1436: } else {
1437: VecCUSPGetArrayRead(xin,&xarray);
1438: VecCUSPGetArrayReadWrite(yin,&yarray);
1439: try {
1440: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1441: } catch(char *ex) {
1442: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1443: }
1444: VecCUSPRestoreArrayRead(xin,&xarray);
1445: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1446: WaitForGPU();CHKERRCUSP(ierr);
1447: PetscLogFlops(3.0*xin->map->n);
1448: }
1449: return(0);
1450: }
1452: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1453: struct VecCUSPXPBYPCZ
1454: {
1455: /* z = x + b*y + c*z */
1456: template <typename Tuple>
1457: __host__ __device__
1458: void operator()(Tuple t)
1459: {
1460: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1461: }
1462: };
1463: struct VecCUSPAXPBYPZ
1464: {
1465: /* z = ax + b*y + z */
1466: template <typename Tuple>
1467: __host__ __device__
1468: void operator()(Tuple t)
1469: {
1470: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1471: }
1472: };
1476: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1477: {
1479: PetscInt n = zin->map->n;
1480: CUSPARRAY *xarray,*yarray,*zarray;
1483: VecCUSPGetArrayRead(xin,&xarray);
1484: VecCUSPGetArrayRead(yin,&yarray);
1485: VecCUSPGetArrayReadWrite(zin,&zarray);
1486: if (alpha == 1.0) {
1487: try {
1488: thrust::for_each(
1489: thrust::make_zip_iterator(
1490: thrust::make_tuple(
1491: zarray->begin(),
1492: thrust::make_constant_iterator(gamma),
1493: xarray->begin(),
1494: yarray->begin(),
1495: thrust::make_constant_iterator(beta))),
1496: thrust::make_zip_iterator(
1497: thrust::make_tuple(
1498: zarray->end(),
1499: thrust::make_constant_iterator(gamma),
1500: xarray->end(),
1501: yarray->end(),
1502: thrust::make_constant_iterator(beta))),
1503: VecCUSPXPBYPCZ());
1504: } catch(char *ex) {
1505: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1506: }
1507: PetscLogFlops(4.0*n);
1508: } else if (gamma == 1.0) {
1509: try {
1510: thrust::for_each(
1511: thrust::make_zip_iterator(
1512: thrust::make_tuple(
1513: zarray->begin(),
1514: xarray->begin(),
1515: thrust::make_constant_iterator(alpha),
1516: yarray->begin(),
1517: thrust::make_constant_iterator(beta))),
1518: thrust::make_zip_iterator(
1519: thrust::make_tuple(
1520: zarray->end(),
1521: xarray->end(),
1522: thrust::make_constant_iterator(alpha),
1523: yarray->end(),
1524: thrust::make_constant_iterator(beta))),
1525: VecCUSPAXPBYPZ());
1526: } catch(char *ex) {
1527: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1528: }
1529: PetscLogFlops(4.0*n);
1530: } else {
1531: try {
1532: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1533: } catch(char *ex) {
1534: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1535: }
1536: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1537: VecCUSPRestoreArrayRead(xin,&xarray);
1538: VecCUSPRestoreArrayRead(yin,&yarray);
1539: PetscLogFlops(5.0*n);
1540: }
1541: WaitForGPU();CHKERRCUSP(ierr);
1542: return(0);
1543: }
1547: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1548: {
1550: PetscInt n = win->map->n;
1551: CUSPARRAY *xarray,*yarray,*warray;
1554: VecCUSPGetArrayRead(xin,&xarray);
1555: VecCUSPGetArrayRead(yin,&yarray);
1556: VecCUSPGetArrayReadWrite(win,&warray);
1557: try {
1558: cusp::blas::xmy(*xarray,*yarray,*warray);
1559: } catch(char *ex) {
1560: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1561: }
1562: VecCUSPRestoreArrayRead(xin,&xarray);
1563: VecCUSPRestoreArrayRead(yin,&yarray);
1564: VecCUSPRestoreArrayReadWrite(win,&warray);
1565: PetscLogFlops(n);
1566: WaitForGPU();CHKERRCUSP(ierr);
1567: return(0);
1568: }
1571: /* should do infinity norm in cusp */
1575: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1576: {
1577: const PetscScalar *xx;
1578: PetscErrorCode ierr;
1579: PetscInt n = xin->map->n;
1580: PetscBLASInt one = 1, bn;
1581: CUSPARRAY *xarray;
1584: PetscBLASIntCast(n,&bn);
1585: if (type == NORM_2 || type == NORM_FROBENIUS) {
1586: VecCUSPGetArrayRead(xin,&xarray);
1587: try {
1588: *z = cusp::blas::nrm2(*xarray);
1589: } catch(char *ex) {
1590: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1591: }
1592: WaitForGPU();CHKERRCUSP(ierr);
1593: VecCUSPRestoreArrayRead(xin,&xarray);
1594: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1595: } else if (type == NORM_INFINITY) {
1596: PetscInt i;
1597: PetscReal max = 0.0,tmp;
1599: VecGetArrayRead(xin,&xx);
1600: for (i=0; i<n; i++) {
1601: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1602: /* check special case of tmp == NaN */
1603: if (tmp != tmp) {max = tmp; break;}
1604: xx++;
1605: }
1606: VecRestoreArrayRead(xin,&xx);
1607: *z = max;
1608: } else if (type == NORM_1) {
1609: VecCUSPGetArrayRead(xin,&xarray);
1610: #if defined(PETSC_USE_COMPLEX)
1611: #if defined(PETSC_USE_REAL_SINGLE)
1612: *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
1613: #else
1614: *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
1615: #endif
1616: #else
1617: #if defined(PETSC_USE_REAL_SINGLE)
1618: *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1619: #else
1620: *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1621: #endif
1622: #endif
1623: cublasGetError();CHKERRCUSP(ierr);
1624: VecCUSPRestoreArrayRead(xin,&xarray);
1625: WaitForGPU();CHKERRCUSP(ierr);
1626: PetscLogFlops(PetscMax(n-1.0,0.0));
1627: } else if (type == NORM_1_AND_2) {
1628: VecNorm_SeqCUSP(xin,NORM_1,z);
1629: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1630: }
1631: return(0);
1632: }
1635: /* The following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */
1639: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1640: {
1644: VecSetRandom_SeqCUSP_Private(xin,r);
1645: xin->valid_GPU_array = PETSC_CUSP_CPU;
1646: return(0);
1647: }
1651: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1652: {
1656: VecCUSPCopyFromGPU(vin);
1657: VecResetArray_SeqCUSP_Private(vin);
1658: vin->valid_GPU_array = PETSC_CUSP_CPU;
1659: return(0);
1660: }
1664: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1665: {
1669: VecCUSPAllocateCheckHost(vin);
1670: VecPlaceArray_Seq(vin,a);
1671: vin->valid_GPU_array = PETSC_CUSP_CPU;
1672: return(0);
1673: }
1678: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1679: {
1683: VecCUSPCopyFromGPU(vin);
1684: VecReplaceArray_Seq(vin,a);
1685: vin->valid_GPU_array = PETSC_CUSP_CPU;
1686: return(0);
1687: }
1692: /*@
1693: VecCreateSeqCUSP - Creates a standard, sequential array-style vector.
1695: Collective on MPI_Comm
1697: Input Parameters:
1698: + comm - the communicator, should be PETSC_COMM_SELF
1699: - n - the vector length
1701: Output Parameter:
1702: . V - the vector
1704: Notes:
1705: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1706: same type as an existing vector.
1708: Level: intermediate
1710: Concepts: vectors^creating sequential
1712: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1713: @*/
1714: PetscErrorCode VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1715: {
1719: VecCreate(comm,v);
1720: VecSetSizes(*v,n,n);
1721: VecSetType(*v,VECSEQCUSP);
1722: return(0);
1723: }
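
/* Usage sketch (added for illustration, not part of the original source): create,
   fill, and destroy a sequential CUSP vector.  Assumes PETSc was configured with
   CUSP support so that VECSEQCUSP is available. */
#include <petscvec.h>
int main(int argc,char **argv)
{
  Vec            x;
  PetscReal      nrm;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = VecCreateSeqCUSP(PETSC_COMM_SELF,100,&x);CHKERRQ(ierr);
  ierr = VecSet(x,1.0);CHKERRQ(ierr);           /* runs on the GPU via VecSet_SeqCUSP */
  ierr = VecNorm(x,NORM_2,&nrm);CHKERRQ(ierr);  /* nrm == 10 for 100 ones */
  ierr = VecDestroy(&x);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}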
1725: /* The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written. */
1726: template <typename T>
1727: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1728: {
1729: __host__ __device__
1730: T operator()(T x)
1731: {
1732: #if defined(PETSC_USE_COMPLEX)
1733: //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1734: #else
1735: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1736: #endif
1737: }
1738: };
1740: template <typename T>
1741: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1742: {
1743: __host__ __device__
1744: T operator()(T x,T y)
1745: {
1746: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1747: }
1748: };
1752: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1753: {
1754: PetscErrorCode ierr;
1755: PetscScalar zero = 0.0;
1756: PetscReal n=s->map->n;
1757: thrust::tuple<PetscScalar,PetscScalar> result;
1758: CUSPARRAY *sarray,*tarray;
1761: /*VecCUSPCopyToGPU(s);
1762: VecCUSPCopyToGPU(t);*/
1763: VecCUSPGetArrayRead(s,&sarray);
1764: VecCUSPGetArrayRead(t,&tarray);
1765: try {
1766: #if defined(PETSC_USE_COMPLEX)
1767: VecDot_SeqCUSP(s,t,dp);
1768: VecDot_SeqCUSP(t,t,nm);
1769: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1770: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1771: #else
1772: result = thrust::transform_reduce(
1773: thrust::make_zip_iterator(
1774: thrust::make_tuple(
1775: sarray->begin(),
1776: tarray->begin())),
1777: thrust::make_zip_iterator(
1778: thrust::make_tuple(
1779: sarray->end(),
1780: tarray->end())),
1781: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1782: thrust::make_tuple(zero,zero), /*init */
1783: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1784: *dp = thrust::get<0>(result);
1785: *nm = thrust::get<1>(result);
1786: #endif
1787: } catch(char *ex) {
1788: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1789: }
1790: VecCUSPRestoreArrayRead(s,&sarray);
1791: VecCUSPRestoreArrayRead(t,&tarray);
1792: WaitForGPU();CHKERRCUSP(ierr);
1793: PetscLogFlops(4.0*n);
1794: return(0);
1795: }
1799: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1800: {
1804: VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1805: PetscLayoutReference(win->map,&(*V)->map);
1806: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1807: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1808: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1809: return(0);
1810: }
1814: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1815: {
1817: cudaError_t err;
1819: try {
1820: if (v->spptr) {
1821: delete ((Vec_CUSP*)v->spptr)->GPUarray;
1822: err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1823: delete (Vec_CUSP*)v->spptr;
1824: }
1825: } catch(char *ex) {
1826: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1827: }
1828: VecDestroy_SeqCUSP_Private(v);
1829: return(0);
1830: }
1833: #if defined(PETSC_USE_COMPLEX)
1834: struct conjugate
1835: {
1836: __host__ __device__
1837: PetscScalar operator()(PetscScalar x)
1838: {
1839: return cusp::conj(x);
1840: }
1841: };
1842: #endif
1847: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1848: {
1850: CUSPARRAY *xarray;
1853: VecCUSPGetArrayReadWrite(xin,&xarray);
1854: #if defined(PETSC_USE_COMPLEX)
1855: thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1856: #endif
1857: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1858: return(0);
1859: }
1863: PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
1864: {
1865: VecType t;
1867: cudaError_t err;
1868: PetscBool flg;
1873: VecGetType(w,&t);
1874: PetscStrcmp(t,VECSEQCUSP,&flg);
1875: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);
1876:
1877: if (w->data) {
1878: if (((Vec_Seq*)w->data)->array_allocated) PetscFree(((Vec_Seq*)w->data)->array_allocated);
1879: ((Vec_Seq*)w->data)->array = 0;
1880: ((Vec_Seq*)w->data)->array_allocated = 0;
1881: ((Vec_Seq*)w->data)->unplacedarray = 0;
1882: }
1883: if (w->spptr) {
1884: if (((Vec_CUSP*)w->spptr)->GPUarray) delete ((Vec_CUSP*)w->spptr)->GPUarray;
1885: err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1886: delete (Vec_CUSP*)w->spptr;
1887: w->spptr = 0;
1888: }
1890: if (v->petscnative) {
1891: w->data = v->data;
1892: w->valid_GPU_array = v->valid_GPU_array;
1893: w->spptr = v->spptr;
1894: PetscObjectStateIncrease((PetscObject)w);
1895: } else {
1896: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1897: w->valid_GPU_array = PETSC_CUSP_CPU;
1898: VecCUSPAllocateCheck(w);
1899: }
1900: return(0);
1901: }
1905: PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
1906: {
1907: VecType t;
1909: cudaError_t err;
1910: PetscBool flg;
1915: VecGetType(w,&t);
1916: PetscStrcmp(t,VECSEQCUSP,&flg);
1917: if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);
1919: if (v->petscnative) {
1920: v->data = w->data;
1921: v->valid_GPU_array = w->valid_GPU_array;
1922: v->spptr = w->spptr;
1923: VecCUSPCopyFromGPU(v);
1924: PetscObjectStateIncrease((PetscObject)v);
1925: w->data = 0;
1926: w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
1927: w->spptr = 0;
1928: } else {
1929: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1930: if ((Vec_CUSP*)w->spptr) {
1931: delete ((Vec_CUSP*)w->spptr)->GPUarray;
1932: err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1933: delete (Vec_CUSP*)w->spptr;
1934: }
1935: }
1936: return(0);
1937: }
1941: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
1942: {
1944: PetscMPIInt size;
1947: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1948: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1949: VecCreate_Seq_Private(V,0);
1950: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);
1952: V->ops->dot = VecDot_SeqCUSP;
1953: V->ops->norm = VecNorm_SeqCUSP;
1954: V->ops->tdot = VecTDot_SeqCUSP;
1955: V->ops->scale = VecScale_SeqCUSP;
1956: V->ops->copy = VecCopy_SeqCUSP;
1957: V->ops->set = VecSet_SeqCUSP;
1958: V->ops->swap = VecSwap_SeqCUSP;
1959: V->ops->axpy = VecAXPY_SeqCUSP;
1960: V->ops->axpby = VecAXPBY_SeqCUSP;
1961: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUSP;
1962: V->ops->pointwisemult = VecPointwiseMult_SeqCUSP;
1963: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1964: V->ops->setrandom = VecSetRandom_SeqCUSP;
1965: V->ops->dot_local = VecDot_SeqCUSP;
1966: V->ops->tdot_local = VecTDot_SeqCUSP;
1967: V->ops->norm_local = VecNorm_SeqCUSP;
1968: V->ops->mdot_local = VecMDot_SeqCUSP;
1969: V->ops->maxpy = VecMAXPY_SeqCUSP;
1970: V->ops->mdot = VecMDot_SeqCUSP;
1971: V->ops->aypx = VecAYPX_SeqCUSP;
1972: V->ops->waxpy = VecWAXPY_SeqCUSP;
1973: V->ops->dotnorm2 = VecDotNorm2_SeqCUSP;
1974: V->ops->placearray = VecPlaceArray_SeqCUSP;
1975: V->ops->replacearray = VecReplaceArray_SeqCUSP;
1976: V->ops->resetarray = VecResetArray_SeqCUSP;
1977: V->ops->destroy = VecDestroy_SeqCUSP;
1978: V->ops->duplicate = VecDuplicate_SeqCUSP;
1979: V->ops->conjugate = VecConjugate_SeqCUSP;
1980: V->ops->getlocalvector = VecGetLocalVector_SeqCUSP;
1981: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUSP;
1982: V->ops->getlocalvectorread = VecGetLocalVector_SeqCUSP;
1983: V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUSP;
1985: VecCUSPAllocateCheck(V);
1986: V->valid_GPU_array = PETSC_CUSP_GPU;
1987: VecSet(V,0.0);
1988: return(0);
1989: }
1993: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1994: {
1998: *a = 0;
1999: VecCUSPCopyToGPU(v);
2000: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
2001: return(0);
2002: }
2006: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
2007: {
2011: v->valid_GPU_array = PETSC_CUSP_GPU;
2013: PetscObjectStateIncrease((PetscObject)v);
2014: return(0);
2015: }
2019: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
2020: {
2024: *a = 0;
2025: VecCUSPCopyToGPU(v);
2026: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
2027: return(0);
2028: }
2032: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
2033: {
2035: return(0);
2036: }
2040: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
2041: {
2045: *a = 0;
2046: VecCUSPAllocateCheck(v);
2047: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
2048: return(0);
2049: }
2053: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2054: {
2058: v->valid_GPU_array = PETSC_CUSP_GPU;
2060: PetscObjectStateIncrease((PetscObject)v);
2061: return(0);
2062: }
2067: /*MC
2068: VecCUSPGetCUDAArray - Provides write access to the CUDA buffer inside a vector.
2070: Input Parameter:
2071: . v - the vector
2073: Output Parameter:
2074: . a - the CUDA pointer
2076: Level: intermediate
2078: .seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite()
2079: M*/
2080: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArray(Vec v, PetscScalar **a)
2081: {
2083: CUSPARRAY *cusparray;
2087: VecCUSPAllocateCheck(v);
2088: VecCUSPGetArrayWrite(v, &cusparray);
2089: *a = thrust::raw_pointer_cast(cusparray->data());
2090: return(0);
2091: }
2097: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArray(Vec v, PetscScalar **a)
2098: {
2102: /* Note: cannot call VecCUSPRestoreArrayWrite() here because the CUSP vector is not available. */
2103: v->valid_GPU_array = PETSC_CUSP_GPU;
2104: PetscObjectStateIncrease((PetscObject)v);
2105: return(0);
2106: }
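
/* Sketch (added for illustration; `ScaleKernel` and `ScaleOnGPU` are hypothetical
   names, not part of PETSc) of feeding the raw device pointer from
   VecCUSPGetCUDAArray() to a hand-written CUDA kernel.  Assumes real PetscScalar,
   consistent with the PETSC_SKIP_COMPLEX define at the top of this file. */
__global__ void ScaleKernel(PetscScalar *a,PetscInt n,PetscScalar alpha)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) a[i] *= alpha;   /* scale each entry in place */
}

static PetscErrorCode ScaleOnGPU(Vec v,PetscScalar alpha)
{
  PetscScalar    *d_a;
  PetscInt       n;
  PetscErrorCode ierr;

  ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
  ierr = VecCUSPGetCUDAArray(v,&d_a);CHKERRQ(ierr);      /* device pointer, marked write */
  ScaleKernel<<<(n+255)/256,256>>>(d_a,n,alpha);
  ierr = VecCUSPRestoreCUDAArray(v,&d_a);CHKERRQ(ierr);  /* flag the GPU copy as current */
  return 0;
}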