Actual source code: veccusp.cu

petsc-3.6.4 2016-04-12
Report Typos and Errors
  1: /*
  2:    Implements the sequential cusp vectors.
  3: */

  5: #define PETSC_SKIP_COMPLEX

  7: #include <petscconf.h>
  8: PETSC_CUDA_EXTERN_C_BEGIN
  9: #include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: PETSC_CUDA_EXTERN_C_END
 12: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 14: #include <cuda_runtime.h>

 18: /*
 19:     Allocates space for the vector array on the Host if it does not exist.
 20:     Does NOT change the PetscCUSPFlag for the vector
 21:     Does NOT zero the CUSP array
 22:  */
 23: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
 24: {
 26:   PetscScalar    *array;
 27:   Vec_Seq        *s = (Vec_Seq*)v->data;
 28:   PetscInt       n = v->map->n;

 31:   if (!s) {
 32:     PetscNewLog((PetscObject)v,&s);
 33:     v->data = s;
 34:   }
 35:   if (!s->array) {
 36:     PetscMalloc1(n,&array);
 37:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 38:     s->array           = array;
 39:     s->array_allocated = array;
 40:     if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
 41:       v->valid_GPU_array = PETSC_CUSP_CPU;
 42:     }
 43:   }
 44:   return(0);
 45: }

 49: /*
 50:     Allocates space for the vector array on the GPU if it does not exist.
 51:     Does NOT change the PetscCUSPFlag for the vector
 52:     Does NOT zero the CUSP array

 54:  */
 55: PetscErrorCode VecCUSPAllocateCheck(Vec v)
 56: {
 57:   cudaError_t    err;
 58:   cudaStream_t   stream;
 59:   Vec_CUSP       *veccusp;

 62:   if (!v->spptr) {
 63:     try {
 64:       v->spptr = new Vec_CUSP;
 65:       veccusp = (Vec_CUSP*)v->spptr;
 66:       veccusp->GPUarray = new CUSPARRAY;
 67:       veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
 68:       err = cudaStreamCreate(&stream);CHKERRCUSP(err);
 69:       veccusp->stream = stream;
 70:       veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 71:       v->ops->destroy = VecDestroy_SeqCUSP;
 72:       if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
 73:         if (v->data && ((Vec_Seq*)v->data)->array) {
 74:           v->valid_GPU_array = PETSC_CUSP_CPU;
 75:         } else {
 76:           v->valid_GPU_array = PETSC_CUSP_GPU;
 77:         }
 78:       }
 79:     } catch(char *ex) {
 80:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
 81:     }
 82:   }
 83:   return(0);
 84: }


 89: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 90: PetscErrorCode VecCUSPCopyToGPU(Vec v)
 91: {
 93:   cudaError_t    err;
 94:   Vec_CUSP       *veccusp;
 95:   CUSPARRAY      *varray;

 98:   VecCUSPAllocateCheck(v);
 99:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
100:     PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
101:     try {
102:       veccusp=(Vec_CUSP*)v->spptr;
103:       varray=veccusp->GPUarray;
104:       err = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
105:     } catch(char *ex) {
106:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
107:     }
108:     PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
109:     v->valid_GPU_array = PETSC_CUSP_BOTH;
110:   }
111:   return(0);
112: }

116: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
117: {
118:   CUSPARRAY      *varray;
120:   cudaError_t    err;
121:   PetscScalar    *cpuPtr, *gpuPtr;
122:   Vec_Seq        *s;
123:   VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

126:   VecCUSPAllocateCheck(v);
127:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
128:     s = (Vec_Seq*)v->data;

130:     PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
131:     varray = ((Vec_CUSP*)v->spptr)->GPUarray;
132:     gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
133:     cpuPtr = s->array + ptop_scatter->recvLowestIndex;

135:     /* Note : this code copies the smallest contiguous chunk of data
136:        containing ALL of the indices */
137:     err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);

139:     // Set the buffer states
140:     v->valid_GPU_array = PETSC_CUSP_BOTH;
141:     PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
142:   }
143:   return(0);
144: }


149: /*
150:      VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
151: */
152: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
153: {
155:   cudaError_t    err;
156:   Vec_CUSP       *veccusp;
157:   CUSPARRAY      *varray;

160:   VecCUSPAllocateCheckHost(v);
161:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
162:     PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
163:     try {
164:       veccusp=(Vec_CUSP*)v->spptr;
165:       varray=veccusp->GPUarray;
166:       err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
167:     } catch(char *ex) {
168:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
169:     }
170:     PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
171:     v->valid_GPU_array = PETSC_CUSP_BOTH;
172:   }
173:   return(0);
174: }

178: /* Note that this function only copies *some* of the values up from the GPU to CPU,
179:    which means that we need recombine the data at some point before using any of the standard functions.
180:    We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
181:    where you have to always call in pairs
182: */
183: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
184: {
185:   CUSPARRAY      *varray;
187:   cudaError_t    err;
188:   PetscScalar    *cpuPtr, *gpuPtr;
189:   Vec_Seq        *s;
190:   VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

193:   VecCUSPAllocateCheckHost(v);
194:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
195:     PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);

197:     varray=((Vec_CUSP*)v->spptr)->GPUarray;
198:     s = (Vec_Seq*)v->data;
199:     gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
200:     cpuPtr = s->array + ptop_scatter->sendLowestIndex;

202:     /* Note : this code copies the smallest contiguous chunk of data
203:        containing ALL of the indices */
204:     err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);

206:     VecCUSPRestoreArrayRead(v,&varray);
207:     PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
208:   }
209:   return(0);
210: }

214: static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
215: {
216:   PetscScalar       *ya;
217:   const PetscScalar *xa;
218:   PetscErrorCode    ierr;

221:   VecCUSPAllocateCheckHost(xin);
222:   VecCUSPAllocateCheckHost(yin);
223:   if (xin != yin) {
224:     VecGetArrayRead(xin,&xa);
225:     VecGetArray(yin,&ya);
226:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
227:     VecRestoreArrayRead(xin,&xa);
228:     VecRestoreArray(yin,&ya);
229:   }
230:   return(0);
231: }

235: static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
236: {
238:   PetscInt       n = xin->map->n,i;
239:   PetscScalar    *xx;

242:   VecGetArray(xin,&xx);
243:   for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
244:   VecRestoreArray(xin,&xx);
245:   return(0);
246: }

250: static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
251: {
252:   Vec_Seq        *vs = (Vec_Seq*)v->data;

256:   PetscObjectSAWsViewOff(v);
257: #if defined(PETSC_USE_LOG)
258:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
259: #endif
260:   if (vs) {
261:     if (vs->array_allocated) PetscFree(vs->array_allocated);
262:     PetscFree(vs);
263:   }
264:   return(0);
265: }

269: static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
270: {
271:   Vec_Seq *v = (Vec_Seq*)vin->data;

274:   v->array         = v->unplacedarray;
275:   v->unplacedarray = 0;
276:   return(0);
277: }

 279: /* The following public versions are necessary because we use CUSP in the regular PETSc code and these need to be called from plain C code. */
282: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
283: {

287:   VecCUSPAllocateCheck(v);
288:   return(0);
289: }

293: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
294: {

298:   VecCUSPCopyToGPU(v);
299:   return(0);
300: }



306: /*
307:     VecCUSPCopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

309:    Input Parameters:
310: +    v - the vector
311: -    indices - the requested indices, this should be created with CUSPIndicesCreate()

313: */
314: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
315: {

319:   VecCUSPCopyToGPUSome(v,ci);
320:   return(0);
321: }

325: /*
326:   VecCUSPCopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

328:   Input Parameters:
329:  +    v - the vector
330:  -    indices - the requested indices, this should be created with CUSPIndicesCreate()
331: */
332: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
333: {

337:   VecCUSPCopyFromGPUSome(v,ci);
338:   return(0);
339: }

341: /*MC
342:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

344:    Options Database Keys:
345: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

347:   Level: beginner

349: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
350: M*/

352: /* for VecAYPX_SeqCUSP*/
353: namespace cusp
354: {
355: namespace blas
356: {
357: namespace detail
358: {
359:   template <typename T>
360:     struct AYPX : public thrust::binary_function<T,T,T>
361:     {
362:       T alpha;

364:       AYPX(T _alpha) : alpha(_alpha) {}

366:       __host__ __device__
367:       T operator()(T x, T y)
368:       {
369:         return alpha * y + x;
370:       }
371:     };
372: }

374:  template <typename ForwardIterator1,
375:            typename ForwardIterator2,
376:            typename ScalarType>
377: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
378:            {
379:              thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
380:            }
381:  template <typename Array1, typename Array2, typename ScalarType>
382:    void aypx(const Array1& x, Array2& y, ScalarType alpha)
383:  {
384: #if defined(CUSP_VERSION) && CUSP_VERSION >= 500
385:    cusp::assert_same_dimensions(x,y);
386: #else
387:    detail::assert_same_dimensions(x,y);
388: #endif
389:    aypx(x.begin(),x.end(),y.begin(),alpha);
390:  }
391: }
392: }

396: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
397: {
398:   CUSPARRAY      *xarray,*yarray;

402:   VecCUSPGetArrayRead(xin,&xarray);
403:   VecCUSPGetArrayReadWrite(yin,&yarray);
404:   try {
405:     if (alpha != 0.0) {
406:       cusp::blas::aypx(*xarray,*yarray,alpha);
407:       PetscLogFlops(2.0*yin->map->n);
408:     } else {
409:       cusp::blas::copy(*xarray,*yarray);
410:     }
411:     WaitForGPU();CHKERRCUSP(ierr);
412:   } catch(char *ex) {
413:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
414:   }
415:   VecCUSPRestoreArrayRead(xin,&xarray);
416:   VecCUSPRestoreArrayReadWrite(yin,&yarray);
417:   return(0);
418: }


423: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
424: {
425:   CUSPARRAY      *xarray,*yarray;

429:   if (alpha != 0.0) {
430:     VecCUSPGetArrayRead(xin,&xarray);
431:     VecCUSPGetArrayReadWrite(yin,&yarray);
432:     try {
433:       cusp::blas::axpy(*xarray,*yarray,alpha);
434:       WaitForGPU();CHKERRCUSP(ierr);
435:     } catch(char *ex) {
436:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
437:     }
438:     VecCUSPRestoreArrayRead(xin,&xarray);
439:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
440:     PetscLogFlops(2.0*yin->map->n);
441:   }
442:   return(0);
443: }

445: struct VecCUSPPointwiseDivide
446: {
447:   template <typename Tuple>
448:   __host__ __device__
449:   void operator()(Tuple t)
450:   {
451:     thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
452:   }
453: };

457: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
458: {
459:   CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;

463:   VecCUSPGetArrayRead(xin,&xarray);
464:   VecCUSPGetArrayRead(yin,&yarray);
465:   VecCUSPGetArrayWrite(win,&warray);
466:   try {
467:     thrust::for_each(
468:       thrust::make_zip_iterator(
469:         thrust::make_tuple(
470:           warray->begin(),
471:           xarray->begin(),
472:           yarray->begin())),
473:       thrust::make_zip_iterator(
474:         thrust::make_tuple(
475:           warray->end(),
476:           xarray->end(),
477:           yarray->end())),
478:       VecCUSPPointwiseDivide());
479:     WaitForGPU();CHKERRCUSP(ierr);
480:   } catch(char *ex) {
481:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
482:   }
483:   PetscLogFlops(win->map->n);
484:   VecCUSPRestoreArrayRead(xin,&xarray);
485:   VecCUSPRestoreArrayRead(yin,&yarray);
486:   VecCUSPRestoreArrayWrite(win,&warray);
487:   return(0);
488: }


491: struct VecCUSPWAXPY
492: {
493:   template <typename Tuple>
494:   __host__ __device__
495:   void operator()(Tuple t)
496:   {
497:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
498:   }
499: };

501: struct VecCUSPSum
502: {
503:   template <typename Tuple>
504:   __host__ __device__
505:   void operator()(Tuple t)
506:   {
507:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
508:   }
509: };

511: struct VecCUSPDiff
512: {
513:   template <typename Tuple>
514:   __host__ __device__
515:   void operator()(Tuple t)
516:   {
517:     thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
518:   }
519: };

523: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
524: {
525:   CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;

529:   if (alpha == 0.0) {
530:     VecCopy_SeqCUSP(yin,win);
531:   } else {
532:     VecCUSPGetArrayRead(xin,&xarray);
533:     VecCUSPGetArrayRead(yin,&yarray);
534:     VecCUSPGetArrayWrite(win,&warray);
535:     if (alpha == 1.0) {
536:       try {
537:         thrust::for_each(
538:           thrust::make_zip_iterator(
539:             thrust::make_tuple(
540:               warray->begin(),
541:               yarray->begin(),
542:               xarray->begin())),
543:           thrust::make_zip_iterator(
544:             thrust::make_tuple(
545:               warray->end(),
546:               yarray->end(),
547:               xarray->end())),
548:           VecCUSPSum());
549:       } catch(char *ex) {
550:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
551:       }
552:       PetscLogFlops(win->map->n);
553:     } else if (alpha == -1.0) {
554:       try {
555:         thrust::for_each(
556:           thrust::make_zip_iterator(
557:             thrust::make_tuple(
558:               warray->begin(),
559:               yarray->begin(),
560:               xarray->begin())),
561:           thrust::make_zip_iterator(
562:             thrust::make_tuple(
563:               warray->end(),
564:               yarray->end(),
565:               xarray->end())),
566:           VecCUSPDiff());
567:       } catch(char *ex) {
568:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
569:       }
570:       PetscLogFlops(win->map->n);
571:     } else {
572:       try {
573:         thrust::for_each(
574:           thrust::make_zip_iterator(
575:             thrust::make_tuple(
576:               warray->begin(),
577:               yarray->begin(),
578:               thrust::make_constant_iterator(alpha),
579:               xarray->begin())),
580:           thrust::make_zip_iterator(
581:             thrust::make_tuple(
582:               warray->end(),
583:               yarray->end(),
584:               thrust::make_constant_iterator(alpha),
585:               xarray->end())),
586:           VecCUSPWAXPY());
587:       } catch(char *ex) {
588:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
589:       }
590:       PetscLogFlops(2*win->map->n);
591:     }
592:     WaitForGPU();CHKERRCUSP(ierr);
593:     VecCUSPRestoreArrayRead(xin,&xarray);
594:     VecCUSPRestoreArrayRead(yin,&yarray);
595:     VecCUSPRestoreArrayWrite(win,&warray);
596:   }
597:   return(0);
598: }

600: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
601: struct VecCUSPMAXPY4
602: {
603:   template <typename Tuple>
604:   __host__ __device__
605:   void operator()(Tuple t)
606:   {
607:     /*y += a1*x1 +a2*x2 + 13*x3 +a4*x4 */
608:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
609:   }
610: };


613: struct VecCUSPMAXPY3
614: {
615:   template <typename Tuple>
616:   __host__ __device__
617:   void operator()(Tuple t)
618:   {
619:     /*y += a1*x1 +a2*x2 + a3*x3 */
620:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
621:   }
622: };

624: struct VecCUSPMAXPY2
625: {
626:   template <typename Tuple>
627:   __host__ __device__
628:   void operator()(Tuple t)
629:   {
630:     /*y += a1*x1 +a2*x2*/
631:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
632:   }
633: };
636: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
637: {
639:   CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
640:   PetscInt       n = xin->map->n,j,j_rem;
641:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

644:   PetscLogFlops(nv*2.0*n);
645:   VecCUSPGetArrayReadWrite(xin,&xarray);
646:   switch (j_rem=nv&0x3) {
647:   case 3:
648:     alpha0 = alpha[0];
649:     alpha1 = alpha[1];
650:     alpha2 = alpha[2];
651:     alpha += 3;
652:     VecCUSPGetArrayRead(y[0],&yy0);
653:     VecCUSPGetArrayRead(y[1],&yy1);
654:     VecCUSPGetArrayRead(y[2],&yy2);
655:     try {
656:       thrust::for_each(
657:         thrust::make_zip_iterator(
658:           thrust::make_tuple(
659:             xarray->begin(),
660:             thrust::make_constant_iterator(alpha0),
661:             yy0->begin(),
662:             thrust::make_constant_iterator(alpha1),
663:             yy1->begin(),
664:             thrust::make_constant_iterator(alpha2),
665:             yy2->begin())),
666:         thrust::make_zip_iterator(
667:           thrust::make_tuple(
668:             xarray->end(),
669:             thrust::make_constant_iterator(alpha0),
670:             yy0->end(),
671:             thrust::make_constant_iterator(alpha1),
672:             yy1->end(),
673:             thrust::make_constant_iterator(alpha2),
674:             yy2->end())),
675:         VecCUSPMAXPY3());
676:     } catch(char *ex) {
677:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
678:     }
679:     VecCUSPRestoreArrayRead(y[0],&yy0);
680:     VecCUSPRestoreArrayRead(y[1],&yy1);
681:     VecCUSPRestoreArrayRead(y[2],&yy2);
682:     y   += 3;
683:     break;
684:   case 2:
685:     alpha0 = alpha[0];
686:     alpha1 = alpha[1];
687:     alpha +=2;
688:     VecCUSPGetArrayRead(y[0],&yy0);
689:     VecCUSPGetArrayRead(y[1],&yy1);
690:     try {
691:       thrust::for_each(
692:         thrust::make_zip_iterator(
693:           thrust::make_tuple(
694:             xarray->begin(),
695:             thrust::make_constant_iterator(alpha0),
696:             yy0->begin(),
697:             thrust::make_constant_iterator(alpha1),
698:             yy1->begin())),
699:         thrust::make_zip_iterator(
700:           thrust::make_tuple(
701:             xarray->end(),
702:             thrust::make_constant_iterator(alpha0),
703:             yy0->end(),
704:             thrust::make_constant_iterator(alpha1),
705:             yy1->end())),
706:         VecCUSPMAXPY2());
707:     } catch(char *ex) {
708:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
709:     }
710:     y +=2;
711:     break;
712:   case 1:
713:     alpha0 = *alpha++;
714:     VecAXPY_SeqCUSP(xin,alpha0,y[0]);
715:     y     +=1;
716:     break;
717:   }
718:   for (j=j_rem; j<nv; j+=4) {
719:     alpha0 = alpha[0];
720:     alpha1 = alpha[1];
721:     alpha2 = alpha[2];
722:     alpha3 = alpha[3];
723:     alpha += 4;
724:     VecCUSPGetArrayRead(y[0],&yy0);
725:     VecCUSPGetArrayRead(y[1],&yy1);
726:     VecCUSPGetArrayRead(y[2],&yy2);
727:     VecCUSPGetArrayRead(y[3],&yy3);
728:     try {
729:       thrust::for_each(
730:         thrust::make_zip_iterator(
731:           thrust::make_tuple(
732:             xarray->begin(),
733:             thrust::make_constant_iterator(alpha0),
734:             yy0->begin(),
735:             thrust::make_constant_iterator(alpha1),
736:             yy1->begin(),
737:             thrust::make_constant_iterator(alpha2),
738:             yy2->begin(),
739:             thrust::make_constant_iterator(alpha3),
740:             yy3->begin())),
741:         thrust::make_zip_iterator(
742:           thrust::make_tuple(
743:             xarray->end(),
744:             thrust::make_constant_iterator(alpha0),
745:             yy0->end(),
746:             thrust::make_constant_iterator(alpha1),
747:             yy1->end(),
748:             thrust::make_constant_iterator(alpha2),
749:             yy2->end(),
750:             thrust::make_constant_iterator(alpha3),
751:             yy3->end())),
752:         VecCUSPMAXPY4());
753:     } catch(char *ex) {
754:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
755:     }
756:     VecCUSPRestoreArrayRead(y[0],&yy0);
757:     VecCUSPRestoreArrayRead(y[1],&yy1);
758:     VecCUSPRestoreArrayRead(y[2],&yy2);
759:     VecCUSPRestoreArrayRead(y[3],&yy3);
760:     y   += 4;
761:   }
762:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
763:   WaitForGPU();CHKERRCUSP(ierr);
764:   return(0);
765: }


770: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
771: {
772:   CUSPARRAY      *xarray,*yarray;
774:   //  PetscScalar    *xptr,*yptr,*zgpu;
775:   //PetscReal tmp;

778:   //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
779:   //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
780:   VecCUSPGetArrayRead(xin,&xarray);
781:   VecCUSPGetArrayRead(yin,&yarray);
782:   try {
783: #if defined(PETSC_USE_COMPLEX)
784:     *z = cusp::blas::dotc(*yarray,*xarray);
785: #else
786:     *z = cusp::blas::dot(*yarray,*xarray);
787: #endif
788:   } catch(char *ex) {
789:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
790:   }
791:   WaitForGPU();CHKERRCUSP(ierr);
792:   if (xin->map->n >0) {
793:     PetscLogFlops(2.0*xin->map->n-1);
794:   }
795:   VecCUSPRestoreArrayRead(xin,&xarray);
796:   VecCUSPRestoreArrayRead(yin,&yarray);
797:   return(0);
798: }

800: //
801: // CUDA kernels for MDot to follow
802: //

 804: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
      // MDOT_WORKGROUP_SIZE is the per-dot-product segment length of the shared tmp_buffer in the
      // VecMDot_SeqCUSP_kernel* functions, which index it with threadIdx.x.
      // NOTE(review): MDOT_WORKGROUP_NUM is not used in the visible part of this file; presumably it is
      // the number of thread blocks of the kernel launch - confirm at the launch site.
 805: #define MDOT_WORKGROUP_SIZE 128
 806: #define MDOT_WORKGROUP_NUM  128

808: // M = 2:
809: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
810:                                         PetscInt size, PetscScalar *group_results)
811: {
812:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
813:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
814:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
815:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
816:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

818:   PetscScalar entry_x    = 0;
819:   PetscScalar group_sum0 = 0;
820:   PetscScalar group_sum1 = 0;
821:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
822:     entry_x     = x[i];   // load only once from global memory!
823:     group_sum0 += entry_x * y0[i];
824:     group_sum1 += entry_x * y1[i];
825:   }
826:   tmp_buffer[threadIdx.x]                       = group_sum0;
827:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

829:   // parallel reduction
830:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
831:     __syncthreads();
832:     if (threadIdx.x < stride) {
833:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
834:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
835:     }
836:   }

838:   // write result of group to group_results
839:   if (threadIdx.x == 0) {
840:     group_results[blockIdx.x]             = tmp_buffer[0];
841:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
842:   }
843: }

845: // M = 3:
846: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
847:                                         PetscInt size, PetscScalar *group_results)
848: {
849:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
850:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
851:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
852:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
853:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

855:   PetscScalar entry_x    = 0;
856:   PetscScalar group_sum0 = 0;
857:   PetscScalar group_sum1 = 0;
858:   PetscScalar group_sum2 = 0;
859:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
860:     entry_x     = x[i];   // load only once from global memory!
861:     group_sum0 += entry_x * y0[i];
862:     group_sum1 += entry_x * y1[i];
863:     group_sum2 += entry_x * y2[i];
864:   }
865:   tmp_buffer[threadIdx.x]                           = group_sum0;
866:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
867:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

869:   // parallel reduction
870:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
871:     __syncthreads();
872:     if (threadIdx.x < stride) {
873:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
874:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
875:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
876:     }
877:   }

879:   // write result of group to group_results
880:   if (threadIdx.x == 0) {
881:     group_results[blockIdx.x                ] = tmp_buffer[0];
882:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
883:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
884:   }
885: }

887: // M = 4:
888: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
889:                                         PetscInt size, PetscScalar *group_results)
890: {
891:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
892:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
893:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
894:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
895:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

897:   PetscScalar entry_x    = 0;
898:   PetscScalar group_sum0 = 0;
899:   PetscScalar group_sum1 = 0;
900:   PetscScalar group_sum2 = 0;
901:   PetscScalar group_sum3 = 0;
902:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
903:     entry_x     = x[i];   // load only once from global memory!
904:     group_sum0 += entry_x * y0[i];
905:     group_sum1 += entry_x * y1[i];
906:     group_sum2 += entry_x * y2[i];
907:     group_sum3 += entry_x * y3[i];
908:   }
909:   tmp_buffer[threadIdx.x]                           = group_sum0;
910:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
911:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
912:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

914:   // parallel reduction
915:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
916:     __syncthreads();
917:     if (threadIdx.x < stride) {
918:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
919:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
920:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
921:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
922:     }
923:   }

925:   // write result of group to group_results
926:   if (threadIdx.x == 0) {
927:     group_results[blockIdx.x                ] = tmp_buffer[0];
928:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
929:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
930:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
931:   }
932: }

// M = 8:
/*
   Partial dot products of x with the eight vectors y0..y7.

   Every block works on one contiguous slice of the index range [0,size).  Its threads walk
   that slice with a stride of blockDim.x, accumulate x[i]*yk[i] privately and then combine
   their sums through a tree reduction in shared memory.  Thread 0 of each block stores the
   block's sum for vector k at group_results[blockIdx.x + k*gridDim.x], so group_results must
   hold at least 8*gridDim.x entries.

   The shared buffer reserves MDOT_WORKGROUP_SIZE entries per vector, so blockDim.x must not
   exceed MDOT_WORKGROUP_SIZE.  The halving reduction only folds in every thread's value when
   blockDim.x is a power of two.
*/
__global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                          const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                          PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  const PetscScalar      *yvec[8] = {y0,y1,y2,y3,y4,y5,y6,y7};
  PetscScalar            thread_sum[8];
  PetscScalar            x_i = 0;
  int                    k;

  // slice of the vector owned by this block
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // for very small vectors, a block should still do some work
  PetscInt first = blockIdx.x * chunk;
  PetscInt last  = min((blockIdx.x + 1) * chunk, size); // never run past the end of the vector

#pragma unroll
  for (k = 0; k < 8; ++k) thread_sum[k] = 0;

  // per-thread accumulation; x[i] is read from global memory once and reused for all eight products
  for (PetscInt i = first + threadIdx.x; i < last; i += blockDim.x) {
    x_i = x[i];
#pragma unroll
    for (k = 0; k < 8; ++k) thread_sum[k] += x_i * yvec[k][i];
  }

  // publish the private sums; vector k occupies the k-th MDOT_WORKGROUP_SIZE-wide segment
#pragma unroll
  for (k = 0; k < 8; ++k) tmp_buffer[threadIdx.x + k * MDOT_WORKGROUP_SIZE] = thread_sum[k];

  // tree reduction; the barrier at the top of each pass makes the previous pass's writes visible
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
#pragma unroll
      for (k = 0; k < 8; ++k) tmp_buffer[threadIdx.x + k * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + k * MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 holds the block totals in the first entry of each segment
  if (threadIdx.x == 0) {
#pragma unroll
    for (k = 0; k < 8; ++k) group_results[blockIdx.x + k * gridDim.x] = tmp_buffer[k * MDOT_WORKGROUP_SIZE];
  }
}


/*
   VecMDot_SeqCUSP - computes the nv dot products z[k] = xin . yin[k] on the GPU.

   Input:
     xin - the common vector
     nv  - number of vectors in yin; must be positive
     yin - the nv vectors to take dot products with
   Output:
     z   - array of length nv receiving the results

   The vectors in yin are consumed in batches of 8, 4, 3, 2 or 1.  A batch of one goes through
   cusp::blas; larger batches (real scalars) launch the matching VecMDot_SeqCUSP_kernelN with
   MDOT_WORKGROUP_NUM blocks of MDOT_WORKGROUP_SIZE threads, copy the per-block partial sums to
   the host and add them up there.  Five to seven remaining vectors are handled as a batch of
   four followed by the rest.

   NOTE(review): with complex scalars the multi-vector batches call cusp::blas::dot while the
   single-vector batch calls cusp::blas::dotc; confirm which conjugation is intended.
*/
PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscInt       i,j,k,batch,n = xin->map->n,current_y_index = 0;
  CUSPARRAY      *xarray,*yarray[8];
  PetscScalar    *group_results_gpu,*xptr,*yptr[8];
  PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
  cudaError_t    cuda_ierr;

  if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    return(0);
  }

  // allocate scratchpad memory for the results of individual work groups:
  cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

  VecCUSPGetArrayRead(xin,&xarray);
  xptr = thrust::raw_pointer_cast(xarray->data());

  while (current_y_index < nv) {
    // pick the batch size: 8 if at least eight vectors remain, 4 for 4..7, otherwise all that is left (1..3)
    batch = nv - current_y_index;
    if (batch >= 8)      batch = 8;
    else if (batch >= 4) batch = 4;

    for (k=0; k<batch; ++k) VecCUSPGetArrayRead(yin[current_y_index+k],&yarray[k]);

    if (batch == 1) {
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index] = cusp::blas::dotc(*yarray[0], *xarray);
#else
      z[current_y_index] = cusp::blas::dot(*xarray, *yarray[0]);
#endif
    } else {
#if defined(PETSC_USE_COMPLEX)
      for (k=0; k<batch; ++k) z[current_y_index+k] = cusp::blas::dot(*yarray[k],*xarray);
#else
      // extract raw device pointers:
      for (k=0; k<batch; ++k) yptr[k] = thrust::raw_pointer_cast(yarray[k]->data());

      // run the kernel specialised for this batch size:
      switch (batch) {
      case 8:
        VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,yptr[0],yptr[1],yptr[2],yptr[3],yptr[4],yptr[5],yptr[6],yptr[7],n,group_results_gpu);
        break;
      case 4:
        VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,yptr[0],yptr[1],yptr[2],yptr[3],n,group_results_gpu);
        break;
      case 3:
        VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,yptr[0],yptr[1],yptr[2],n,group_results_gpu);
        break;
      default: // batch == 2
        VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,yptr[0],yptr[1],n,group_results_gpu);
        break;
      }

      // a launch does not return a status itself; pick up configuration errors here
      cuda_ierr = cudaGetLastError();
      if (cuda_ierr != cudaSuccess) {
        cudaFree(group_results_gpu); // best effort: do not leak the scratch buffer on the error path
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUDA kernel launch failed. Error code: %d", (int)cuda_ierr);
      }

      // copy the per-block partial sums back to the host:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * batch,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) {
        cudaFree(group_results_gpu); // best effort: do not leak the scratch buffer on the error path
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
      }

      // sum group results into z; vector j owns entries [j*MDOT_WORKGROUP_NUM,(j+1)*MDOT_WORKGROUP_NUM):
      for (j=0; j<batch; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
    }

    for (k=0; k<batch; ++k) VecCUSPRestoreArrayRead(yin[current_y_index+k],&yarray[k]);
    current_y_index += batch;
  }
  VecCUSPRestoreArrayRead(xin,&xarray);

  cuda_ierr = cudaFree(group_results_gpu);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
  PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
  return(0);
}

1213: #undef MDOT_WORKGROUP_SIZE
1214: #undef MDOT_WORKGROUP_NUM



/*
   VecSet_SeqCUSP - assigns alpha to every entry of xin using cusp::blas::fill on the
   vector's GPU array.
*/
PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
{
  CUSPARRAY *gpu_x = NULL;

  /* a dedicated fast path for alpha=0.0 on the GPU would be preferable if one exists */
  VecCUSPGetArrayWrite(xin,&gpu_x);
  try {
    cusp::blas::fill(*gpu_x,alpha);
  }
  catch (char *msg) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
  }
  WaitForGPU();CHKERRCUSP(ierr);
  VecCUSPRestoreArrayWrite(xin,&gpu_x);
  return(0);
}

/*
   VecScale_SeqCUSP - multiplies every entry of xin by alpha.

   alpha == 0 is delegated to VecSet_SeqCUSP, alpha == 1 leaves the data untouched, and every
   other value goes through cusp::blas::scal.  The GPU wait and the flop logging happen in
   all three cases.
*/
PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
{
  CUSPARRAY *gpu_x;

  if (alpha == 0.0) {
    /* scaling by zero is a fill with zero */
    VecSet_SeqCUSP(xin,alpha);
  } else if (alpha != 1.0) {
    VecCUSPGetArrayReadWrite(xin,&gpu_x);
    try {
      cusp::blas::scal(*gpu_x,alpha);
    }
    catch (char *msg) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
    }
    VecCUSPRestoreArrayReadWrite(xin,&gpu_x);
  }
  WaitForGPU();CHKERRCUSP(ierr);
  PetscLogFlops(xin->map->n);
  return(0);
}


/*
   VecTDot_SeqCUSP - stores cusp::blas::dot(x,y) of the two GPU arrays in *z.

   An earlier note in this function states that it does not work for complex scalars.
*/
PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  CUSPARRAY *gpu_x,*gpu_y;

  VecCUSPGetArrayRead(xin,&gpu_x);
  VecCUSPGetArrayRead(yin,&gpu_y);
  try {
    *z = cusp::blas::dot(*gpu_x,*gpu_y);
  }
  catch (char *msg) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
  }
  WaitForGPU();CHKERRCUSP(ierr);
  /* n multiplications and n-1 additions; nothing to log for an empty vector */
  if (xin->map->n > 0) {
    PetscLogFlops(2.0*xin->map->n-1);
  }
  VecCUSPRestoreArrayRead(yin,&gpu_y);
  VecCUSPRestoreArrayRead(xin,&gpu_x);
  return(0);
}
/*
   VecCopy_SeqCUSP - copies xin into yin, choosing the side (CPU or GPU) from the
   valid_GPU_array flags of the two vectors:

     xin flag GPU   -> copy on the GPU
     xin flag CPU   -> copy on the CPU
     xin flag BOTH  -> copy on the GPU when yin's flag is GPU or BOTH (BOTH is an arbitrary
                       choice), otherwise on the CPU, which is where yin will probably be
                       used next
     any other flag -> nothing is copied

   Copying a vector onto itself is a no-op.
*/
PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
{
  CUSPARRAY *src,*dst;
  PetscBool copy_on_gpu = PETSC_FALSE,copy_on_cpu = PETSC_FALSE;

  if (xin == yin) return(0);

  /* decide where the copy takes place */
  if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
    copy_on_gpu = PETSC_TRUE;
  } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
    copy_on_cpu = PETSC_TRUE;
  } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
    if (yin->valid_GPU_array == PETSC_CUSP_GPU || yin->valid_GPU_array == PETSC_CUSP_BOTH) copy_on_gpu = PETSC_TRUE;
    else copy_on_cpu = PETSC_TRUE;
  }

  if (copy_on_gpu) {
    VecCUSPGetArrayRead(xin,&src);
    VecCUSPGetArrayWrite(yin,&dst);
    try {
      cusp::blas::copy(*src,*dst);
    }
    catch (char *msg) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
    }
    WaitForGPU();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayRead(xin,&src);
    VecCUSPRestoreArrayWrite(yin,&dst);
  } else if (copy_on_cpu) {
    VecCopy_SeqCUSP_Private(xin,yin);
  }
  return(0);
}


/*
   VecSwap_SeqCUSP - exchanges the contents of xin and yin on the GPU with the cuBLAS swap
   routine that matches the scalar type (C/Z for complex, S/D for real; single/double).
   Swapping a vector with itself does nothing beyond the length conversion.
*/
PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
{
  PetscBLASInt unit_stride = 1,len;
  CUSPARRAY    *gpu_x,*gpu_y;

  PetscBLASIntCast(xin->map->n,&len);
  if (xin == yin) return(0);

  VecCUSPGetArrayReadWrite(xin,&gpu_x);
  VecCUSPGetArrayReadWrite(yin,&gpu_y);

#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
  cublasCswap(len,(cuFloatComplex*)VecCUSPCastToRawPtr(*gpu_x),unit_stride,(cuFloatComplex*)VecCUSPCastToRawPtr(*gpu_y),unit_stride);
#else
  cublasZswap(len,(cuDoubleComplex*)VecCUSPCastToRawPtr(*gpu_x),unit_stride,(cuDoubleComplex*)VecCUSPCastToRawPtr(*gpu_y),unit_stride);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
  cublasSswap(len,VecCUSPCastToRawPtr(*gpu_x),unit_stride,VecCUSPCastToRawPtr(*gpu_y),unit_stride);
#else
  cublasDswap(len,VecCUSPCastToRawPtr(*gpu_x),unit_stride,VecCUSPCastToRawPtr(*gpu_y),unit_stride);
#endif
#endif
  cublasGetError();CHKERRCUSP(ierr);
  WaitForGPU();CHKERRCUSP(ierr);
  VecCUSPRestoreArrayReadWrite(xin,&gpu_x);
  VecCUSPRestoreArrayReadWrite(yin,&gpu_y);
  return(0);
}

/* Functor for zipped tuples (out, a, x): stores a*x into out. */
struct VecCUSPAX
{
  template <typename ZipEntry>
  __host__ __device__
  void operator()(ZipEntry entry)
  {
    /* element 0 is the destination, elements 1 and 2 are the two factors */
    thrust::get<0>(entry) = thrust::get<1>(entry)*thrust::get<2>(entry);
  }
};
/*
   VecAXPBY_SeqCUSP - computes y = alpha*x + beta*y.

   Special cases are tried in this order:
     alpha == 0 -> VecScale_SeqCUSP(y,beta)
     beta  == 1 -> VecAXPY_SeqCUSP(y,alpha,x)
     alpha == 1 -> VecAYPX_SeqCUSP(y,beta,x)
     beta  == 0 -> y = alpha*x through a thrust zip/for_each with VecCUSPAX
   Everything else goes through cusp::blas::axpby.
*/
PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  CUSPARRAY *gpu_x,*gpu_y;

  /* cases that are handled entirely by another routine */
  if (alpha == 0.0) {
    VecScale_SeqCUSP(yin,beta);
    return(0);
  }
  if (beta == 1.0) {
    VecAXPY_SeqCUSP(yin,alpha,xin);
    return(0);
  }
  if (alpha == 1.0) {
    VecAYPX_SeqCUSP(yin,beta,xin);
    return(0);
  }

  VecCUSPGetArrayRead(xin,&gpu_x);
  VecCUSPGetArrayReadWrite(yin,&gpu_y);
  if (beta == 0.0) {
    /* y = alpha*x: zip (y, alpha, x) and let VecCUSPAX write the product into y */
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            gpu_y->begin(),
            thrust::make_constant_iterator(alpha),
            gpu_x->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            gpu_y->end(),
            thrust::make_constant_iterator(alpha),
            gpu_x->end())),
        VecCUSPAX());
    }
    catch (char *msg) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
    }
    PetscLogFlops(xin->map->n);
    WaitForGPU();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayRead(xin,&gpu_x);
    VecCUSPRestoreArrayReadWrite(yin,&gpu_y);
  } else {
    /* general case */
    try {
      cusp::blas::axpby(*gpu_x,*gpu_y,*gpu_y,alpha,beta);
    }
    catch (char *msg) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
    }
    VecCUSPRestoreArrayRead(xin,&gpu_x);
    VecCUSPRestoreArrayReadWrite(yin,&gpu_y);
    WaitForGPU();CHKERRCUSP(ierr);
    PetscLogFlops(3.0*xin->map->n);
  }
  return(0);
}

1452: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
struct VecCUSPXPBYPCZ
{
  /* Tuple layout (z, c, x, y, b): overwrites z with c*z + x + b*y */
  template <typename ZipEntry>
  __host__ __device__
  void operator()(ZipEntry entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry)*thrust::get<0>(entry)+thrust::get<2>(entry)+thrust::get<4>(entry)*thrust::get<3>(entry);
  }
};
struct VecCUSPAXPBYPZ
{
  /* Tuple layout (z, x, a, y, b): adds a*x + b*y to z */
  template <typename ZipEntry>
  __host__ __device__
  void operator()(ZipEntry entry)
  {
    thrust::get<0>(entry) += thrust::get<2>(entry)*thrust::get<1>(entry)+thrust::get<4>(entry)*thrust::get<3>(entry);
  }
};

/*
   VecAXPBYPCZ_SeqCUSP - computes z = alpha*x + beta*y + gamma*z on the GPU.

   alpha == 1 and gamma == 1 use fused thrust kernels (VecCUSPXPBYPCZ and VecCUSPAXPBYPZ);
   every other combination goes through cusp::blas::axpbypcz.

   The three arrays are obtained once at the top and restored once after the computation, on
   every path.  Previously only the general branch restored them, so the two special-case
   branches returned without ever calling the Restore routines.
*/
PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscInt       n = zin->map->n;
  CUSPARRAY      *xarray,*yarray,*zarray;

  VecCUSPGetArrayRead(xin,&xarray);
  VecCUSPGetArrayRead(yin,&yarray);
  VecCUSPGetArrayReadWrite(zin,&zarray);
  if (alpha == 1.0) {
    /* z = x + beta*y + gamma*z; tuple layout (z, gamma, x, y, beta) */
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->begin(),
            thrust::make_constant_iterator(gamma),
            xarray->begin(),
            yarray->begin(),
            thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->end(),
            thrust::make_constant_iterator(gamma),
            xarray->end(),
            yarray->end(),
            thrust::make_constant_iterator(beta))),
        VecCUSPXPBYPCZ());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    PetscLogFlops(4.0*n);
  } else if (gamma == 1.0) {
    /* z = alpha*x + beta*y + z; tuple layout (z, x, alpha, y, beta) */
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->begin(),
            xarray->begin(),
            thrust::make_constant_iterator(alpha),
            yarray->begin(),
            thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->end(),
            xarray->end(),
            thrust::make_constant_iterator(alpha),
            yarray->end(),
            thrust::make_constant_iterator(beta))),
        VecCUSPAXPBYPZ());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    PetscLogFlops(4.0*n);
  } else {
    /* general case */
    try {
      cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    PetscLogFlops(5.0*n);
  }
  /* hand the arrays back regardless of which branch did the work */
  VecCUSPRestoreArrayReadWrite(zin,&zarray);
  VecCUSPRestoreArrayRead(xin,&xarray);
  VecCUSPRestoreArrayRead(yin,&yarray);
  WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}

/*
   VecPointwiseMult_SeqCUSP - entrywise product of xin and yin, computed with
   cusp::blas::xmy and stored in win.
*/
PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
{
  CUSPARRAY *gpu_x,*gpu_y,*gpu_w;
  PetscInt  len = win->map->n;

  VecCUSPGetArrayRead(xin,&gpu_x);
  VecCUSPGetArrayRead(yin,&gpu_y);
  VecCUSPGetArrayReadWrite(win,&gpu_w);
  try {
    cusp::blas::xmy(*gpu_x,*gpu_y,*gpu_w);
  }
  catch (char *msg) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
  }
  VecCUSPRestoreArrayRead(xin,&gpu_x);
  VecCUSPRestoreArrayRead(yin,&gpu_y);
  VecCUSPRestoreArrayReadWrite(win,&gpu_w);
  /* one multiplication per entry */
  PetscLogFlops(len);
  WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}


1571: /* should do infinity norm in cusp */

/*
   VecNorm_SeqCUSP - computes a norm of xin.

   Input:
     xin  - the vector
     type - NORM_1, NORM_2, NORM_FROBENIUS, NORM_INFINITY or NORM_1_AND_2
   Output:
     z    - the norm; for NORM_1_AND_2 z[0] receives the 1-norm and z[1] the 2-norm

   NORM_2 and NORM_FROBENIUS use cusp::blas::nrm2 and NORM_1 uses the cuBLAS asum routine
   matching the scalar type.  NORM_INFINITY is computed by a loop over the array returned by
   VecGetArrayRead and stops at the first NaN, which is then returned.  For any other
   value of type, *z is left untouched.
*/
PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
{
  const PetscScalar *xx;
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn;
  CUSPARRAY         *xarray;

  PetscBLASIntCast(n,&bn);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecCUSPGetArrayRead(xin,&xarray);
    try {
      *z = cusp::blas::nrm2(*xarray);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    WaitForGPU();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayRead(xin,&xarray);
    PetscLogFlops(PetscMax(2.0*n-1,0.0));
  } else if (type == NORM_INFINITY) {
    PetscInt  i;
    PetscReal max = 0.0,tmp;

    VecGetArrayRead(xin,&xx);
    /* index the array instead of advancing xx, so that the pointer handed back to
       VecRestoreArrayRead is the one VecGetArrayRead returned */
    for (i=0; i<n; i++) {
      if ((tmp = PetscAbsScalar(xx[i])) > max) max = tmp;
      /* check special case of tmp == NaN */
      if (tmp != tmp) {max = tmp; break;}
    }
    VecRestoreArrayRead(xin,&xx);
    *z   = max;
  } else if (type == NORM_1) {
    VecCUSPGetArrayRead(xin,&xarray);
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
    *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
#else
    *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
    *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#else
    *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#endif
#endif
    cublasGetError();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayRead(xin,&xarray);
    WaitForGPU();CHKERRCUSP(ierr);
    PetscLogFlops(PetscMax(n-1.0,0.0));
  } else if (type == NORM_1_AND_2) {
    /* two independent passes; z must have room for two values */
    VecNorm_SeqCUSP(xin,NORM_1,z);
    VecNorm_SeqCUSP(xin,NORM_2,z+1);
  }
  return(0);
}


1635: /* the following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */

/*
   VecSetRandom_SeqCUSP - fills xin through VecSetRandom_SeqCUSP_Private and then sets the
   vector's flag to PETSC_CUSP_CPU.
*/
PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
{
  VecSetRandom_SeqCUSP_Private(xin,r);
  /* the flag is set to CPU after the fill */
  xin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}

/*
   VecResetArray_SeqCUSP - calls VecCUSPCopyFromGPU on the vector, resets its array through
   VecResetArray_SeqCUSP_Private and sets the flag to PETSC_CUSP_CPU.
*/
PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
{
  /* copy from the GPU before the array is reset */
  VecCUSPCopyFromGPU(vin);
  VecResetArray_SeqCUSP_Private(vin);
  vin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}

/*
   VecPlaceArray_SeqCUSP - makes sure the host-side storage of vin exists, places the
   user-provided array a via VecPlaceArray_Seq and sets the flag to PETSC_CUSP_CPU.
*/
PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  /* allocate host storage if needed before handing over to VecPlaceArray_Seq */
  VecCUSPAllocateCheckHost(vin);
  VecPlaceArray_Seq(vin,a);
  vin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}


/*
   VecReplaceArray_SeqCUSP - calls VecCUSPCopyFromGPU on the vector, replaces its array with a
   through VecReplaceArray_Seq and sets the flag to PETSC_CUSP_CPU.
*/
PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  /* copy from the GPU before the array is replaced */
  VecCUSPCopyFromGPU(vin);
  VecReplaceArray_Seq(vin,a);
  vin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}


/*@
   VecCreateSeqCUSP - Creates a sequential vector of type VECSEQCUSP.

   Collective on MPI_Comm

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n - the vector length (used as both the local and the global size)

   Output Parameter:
.  v - the vector

   Notes:
   Additional vectors of the same type as an existing one can be obtained with
   VecDuplicate() or VecDuplicateVecs().

   Level: intermediate

   Concepts: vectors^creating sequential

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode  VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
{
  /* create an empty vector, size it, then pick the CUSP implementation */
  VecCreate(comm,v);
  VecSetSizes(*v,n,n);
  VecSetType(*v,VECSEQCUSP);
  return(0);
}

1725: /*The following template functions are for VecDotNorm2_SeqCUSP.  Note that there is no complex support as currently written*/
/* Transform step of VecDotNorm2_SeqCUSP: maps a pair (s,t) to (s*t, t*t).
   With complex scalars the body is compiled out and the operator returns nothing. */
template <typename T>
struct cuspdotnormcalculate : thrust::unary_function<T,T>
{
  __host__ __device__
  T operator()(T pair)
  {
#if defined(PETSC_USE_COMPLEX)
    //return thrust::make_tuple(thrust::get<0>(pair)*thrust::get<1>(pair), thrust::get<1>(pair)*thrust::get<1>(pair));
#else
    return thrust::make_tuple(thrust::get<0>(pair)*thrust::get<1>(pair), thrust::get<1>(pair)*thrust::get<1>(pair));
#endif
  }
};

/* Reduction step of VecDotNorm2_SeqCUSP: componentwise sum of two 2-tuples. */
template <typename T>
struct cuspdotnormreduce : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T lhs,T rhs)
  {
    return thrust::make_tuple(thrust::get<0>(lhs)+thrust::get<0>(rhs), thrust::get<1>(lhs)+thrust::get<1>(rhs));
  }
};

/*
   VecDotNorm2_SeqCUSP - computes two reductions over s and t at once.

   Real scalars: a single thrust::transform_reduce over the zipped arrays yields
   *dp = sum(s_i*t_i) and *nm = sum(t_i*t_i).
   Complex scalars: falls back to two VecDot_SeqCUSP calls, (s,t) for *dp and (t,t) for *nm.
*/
PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscErrorCode                         ierr;
  PetscScalar                            init = 0.0;
  PetscReal                              len=s->map->n;
  thrust::tuple<PetscScalar,PetscScalar> sums;
  CUSPARRAY                              *gpu_s,*gpu_t;

  /*VecCUSPCopyToGPU(s);
   VecCUSPCopyToGPU(t);*/
  VecCUSPGetArrayRead(s,&gpu_s);
  VecCUSPGetArrayRead(t,&gpu_t);
  try {
#if defined(PETSC_USE_COMPLEX)
    VecDot_SeqCUSP(s,t,dp);
    VecDot_SeqCUSP(t,t,nm);
    //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
    //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
#else
    sums = thrust::transform_reduce(
             thrust::make_zip_iterator(
               thrust::make_tuple(
                 gpu_s->begin(),
                 gpu_t->begin())),
             thrust::make_zip_iterator(
               thrust::make_tuple(
                 gpu_s->end(),
                 gpu_t->end())),
             cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),  /* (s,t) -> (s*t, t*t) */
             thrust::make_tuple(init,init),                                    /* starting value */
             cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >());   /* componentwise sum */
    *dp = thrust::get<0>(sums);
    *nm = thrust::get<1>(sums);
#endif
  }
  catch (char *msg) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
  }
  VecCUSPRestoreArrayRead(s,&gpu_s);
  VecCUSPRestoreArrayRead(t,&gpu_t);
  WaitForGPU();CHKERRCUSP(ierr);
  PetscLogFlops(4.0*len);
  return(0);
}

/*
   VecDuplicate_SeqCUSP - creates a new VECSEQCUSP vector *V with the local size of win.
   The new vector references win's layout and receives duplicates of win's composed object
   and function lists as well as its ignorenegidx stash setting.  Entries are not copied.
*/
PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
{
  PetscObject src = (PetscObject)win;

  VecCreateSeqCUSP(PetscObjectComm(src),win->map->n,V);
  /* share the layout with the source vector */
  PetscLayoutReference(win->map,&(*V)->map);
  /* carry over composed objects and functions */
  PetscObjectListDuplicate(src->olist,&((PetscObject)(*V))->olist);
  PetscFunctionListDuplicate(src->qlist,&((PetscObject)(*V))->qlist);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  return(0);
}

/*
   VecDestroy_SeqCUSP - releases the GPU-side data of v (device array, CUDA stream and the
   Vec_CUSP record itself) when present, then calls VecDestroy_SeqCUSP_Private.
*/
PetscErrorCode VecDestroy_SeqCUSP(Vec v)
{
  cudaError_t err;

  try {
    Vec_CUSP *veccusp = (Vec_CUSP*)v->spptr;

    if (veccusp) {
      delete veccusp->GPUarray;
      err = cudaStreamDestroy(veccusp->stream);CHKERRCUSP(err);
      delete veccusp;
    }
  }
  catch (char *msg) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", msg);
  }
  VecDestroy_SeqCUSP_Private(v);
  return(0);
}


#if defined(PETSC_USE_COMPLEX)
/*
   conjugate - Unary functor, callable from host and device code, that maps a scalar
   to cusp::conj() of that scalar. Only compiled for complex builds.
*/
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar value)
  {
    const PetscScalar conjugated = cusp::conj(value);
    return conjugated;
  }
};
#endif


/*
   VecConjugate_SeqCUSP - Replaces every entry of the vector by its complex conjugate,
   operating on the GPU array.

   Input Parameter:
.  xin - the vector to conjugate in place

   In builds without PETSC_USE_COMPLEX the transform is compiled out; the array is still
   acquired and released through the read-write accessors.
*/
PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray;

  ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
  thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
#endif
  ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  return(0);
}

/*
   VecGetLocalVector_SeqCUSP - Makes w a view of the local data of v.

   Input Parameters:
+  v - the vector whose local data is exposed
-  w - a VECSEQCUSP vector that receives the data; its own host array and GPU state
       are released first

   When v is PETSc-native, w simply adopts v's data pointer, GPU record and validity flag.
   Otherwise w borrows v's host array through VecGetArray(), is marked as valid on the CPU,
   and gets a GPU array allocated for it.

   Errors: PETSC_ERR_ARG_WRONG if w is not of type VECSEQCUSP.
*/
PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
{
  PetscErrorCode ierr;
  VecType        t;
  cudaError_t    err;
  PetscBool      flg;

  ierr = VecGetType(w,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);

  /* drop whatever host storage w currently owns */
  if (w->data) {
    Vec_Seq *ws = (Vec_Seq*)w->data;

    if (ws->array_allocated) {
      ierr = PetscFree(ws->array_allocated);CHKERRQ(ierr);
    }
    ws->array           = 0;
    ws->array_allocated = 0;
    ws->unplacedarray   = 0;
  }
  /* drop w's GPU record; detach it first so a stream error cannot leave a dangling pointer */
  if (w->spptr) {
    Vec_CUSP *wcusp = (Vec_CUSP*)w->spptr;

    w->spptr = 0;
    delete wcusp->GPUarray;
    err = cudaStreamDestroy(wcusp->stream);
    delete wcusp;
    CHKERRCUSP(err);
  }

  if (v->petscnative) {
    /* NOTE(review): a Vec_Seq header previously held in w->data is overwritten here
       without being freed - confirm whether that is intended */
    w->data            = v->data;
    w->valid_GPU_array = v->valid_GPU_array;
    w->spptr           = v->spptr;
    ierr = PetscObjectStateIncrease((PetscObject)w);CHKERRQ(ierr);
  } else {
    Vec_Seq *ws = (Vec_Seq*)w->data;

    /* w->data is reset to 0 by a native restore; recreate the header instead of
       dereferencing a null pointer */
    if (!ws) {
      ierr = PetscNewLog((PetscObject)w,&ws);CHKERRQ(ierr);
      w->data = ws;
    }
    ierr = VecGetArray(v,&ws->array);CHKERRQ(ierr);
    w->valid_GPU_array = PETSC_CUSP_CPU;
    ierr = VecCUSPAllocateCheck(w);CHKERRQ(ierr);
  }
  return(0);
}

/*
   VecRestoreLocalVector_SeqCUSP - Undoes VecGetLocalVector_SeqCUSP().

   Input Parameters:
+  v - the vector whose local data was exposed
-  w - the VECSEQCUSP vector that was used as the view

   When v is PETSc-native, v takes back the data pointer, GPU record and validity flag from
   w, is synchronised to the host, and w is left empty. Otherwise any GPU-side changes in w
   are copied to the borrowed host array, the array is returned to v with VecRestoreArray(),
   and w's GPU record is destroyed and cleared.

   Errors: PETSC_ERR_ARG_WRONG if w is not of type VECSEQCUSP.
*/
PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
{
  PetscErrorCode ierr;
  VecType        t;
  cudaError_t    err;
  PetscBool      flg;

  ierr = VecGetType(w,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);

  if (v->petscnative) {
    v->data            = w->data;
    v->valid_GPU_array = w->valid_GPU_array;
    v->spptr           = w->spptr;
    ierr = VecCUSPCopyFromGPU(v);CHKERRQ(ierr);
    ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
    w->data            = 0;
    w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
    w->spptr           = 0;
  } else {
    /* bring the borrowed host array up to date before handing it back to v */
    ierr = VecCUSPCopyFromGPU(w);CHKERRQ(ierr);
    ierr = VecRestoreArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
    if (w->spptr) {
      Vec_CUSP *wcusp = (Vec_CUSP*)w->spptr;

      /* clear the pointer so a later get/destroy cannot free the record twice */
      w->spptr = 0;
      delete wcusp->GPUarray;
      err = cudaStreamDestroy(wcusp->stream);
      delete wcusp;
      CHKERRCUSP(err);
    }
    /* w no longer has a GPU array; same end state as the native branch */
    w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
  }
  return(0);
}

/*
   VecCreate_SeqCUSP - Type constructor for VECSEQCUSP: builds the sequential base vector,
   installs the CUSP implementations in the operations table, allocates the GPU array and
   zeroes the vector.

   Input Parameter:
.  V - the vector object being given the VECSEQCUSP type

   Errors: PETSC_ERR_ARG_WRONG if the communicator of V has more than one process.
*/
PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);CHKERRQ(ierr);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
  ierr = VecCreate_Seq_Private(V,0);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);CHKERRQ(ierr);

  /* reductions */
  V->ops->dot                    = VecDot_SeqCUSP;
  V->ops->norm                   = VecNorm_SeqCUSP;
  V->ops->tdot                   = VecTDot_SeqCUSP;
  V->ops->mdot                   = VecMDot_SeqCUSP;
  V->ops->dotnorm2               = VecDotNorm2_SeqCUSP;
  V->ops->dot_local              = VecDot_SeqCUSP;
  V->ops->tdot_local             = VecTDot_SeqCUSP;
  V->ops->norm_local             = VecNorm_SeqCUSP;
  V->ops->mdot_local             = VecMDot_SeqCUSP;

  /* element-wise updates */
  V->ops->scale                  = VecScale_SeqCUSP;
  V->ops->copy                   = VecCopy_SeqCUSP;
  V->ops->set                    = VecSet_SeqCUSP;
  V->ops->swap                   = VecSwap_SeqCUSP;
  V->ops->axpy                   = VecAXPY_SeqCUSP;
  V->ops->axpby                  = VecAXPBY_SeqCUSP;
  V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUSP;
  V->ops->maxpy                  = VecMAXPY_SeqCUSP;
  V->ops->aypx                   = VecAYPX_SeqCUSP;
  V->ops->waxpy                  = VecWAXPY_SeqCUSP;
  V->ops->pointwisemult          = VecPointwiseMult_SeqCUSP;
  V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUSP;
  V->ops->setrandom              = VecSetRandom_SeqCUSP;
  V->ops->conjugate              = VecConjugate_SeqCUSP;

  /* storage management */
  V->ops->placearray             = VecPlaceArray_SeqCUSP;
  V->ops->replacearray           = VecReplaceArray_SeqCUSP;
  V->ops->resetarray             = VecResetArray_SeqCUSP;
  V->ops->destroy                = VecDestroy_SeqCUSP;
  V->ops->duplicate              = VecDuplicate_SeqCUSP;

  /* local-vector access; the read-only variants reuse the read-write routines */
  V->ops->getlocalvector         = VecGetLocalVector_SeqCUSP;
  V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUSP;
  V->ops->getlocalvectorread     = VecGetLocalVector_SeqCUSP;
  V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUSP;

  ierr = VecCUSPAllocateCheck(V);CHKERRQ(ierr);
  V->valid_GPU_array      = PETSC_CUSP_GPU;
  ierr = VecSet(V,0.0);CHKERRQ(ierr);
  return(0);
}

/*
   VecCUSPGetArrayReadWrite - Returns the GPU array of a vector after calling
   VecCUSPCopyToGPU() on it.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUSP array; left as 0 if the copy to the GPU fails
*/
PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  *a   = 0;
  ierr = VecCUSPCopyToGPU(v);CHKERRQ(ierr);
  *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
  return(0);
}

/*
   VecCUSPRestoreArrayReadWrite - Ends read-write access to the GPU array: marks the GPU
   copy as the valid one and increases the object state of the vector.

   Input Parameters:
+  v - the vector
-  a - the array pointer obtained earlier; not read or modified here
*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  v->valid_GPU_array = PETSC_CUSP_GPU;

  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  return(0);
}

/*
   VecCUSPGetArrayRead - Returns the GPU array of a vector for reading, after calling
   VecCUSPCopyToGPU() on it.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUSP array; left as 0 if the copy to the GPU fails
*/
PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  *a   = 0;
  ierr = VecCUSPCopyToGPU(v);CHKERRQ(ierr);
  *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
  return(0);
}

/*
   VecCUSPRestoreArrayRead - Ends read-only access to the GPU array. Performs no work:
   neither the vector nor the array pointer is touched.

   Input Parameters:
+  v - the vector (unused)
-  a - the array pointer obtained earlier (unused)
*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
{
  /* intentionally empty: nothing is updated on the vector */
  return 0;
}

/*
   VecCUSPGetArrayWrite - Returns the GPU array of a vector for writing. The array is
   allocated if necessary via VecCUSPAllocateCheck(); no copy to the GPU is requested here.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUSP array; left as 0 if the allocation check fails
*/
PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  *a   = 0;
  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
  return(0);
}

/*
   VecCUSPRestoreArrayWrite - Ends write access to the GPU array: marks the GPU copy as the
   valid one and increases the object state of the vector.

   Input Parameters:
+  v - the vector
-  a - the array pointer obtained earlier; not read or modified here
*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  v->valid_GPU_array = PETSC_CUSP_GPU;

  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  return(0);
}


/*MC
   VecCUSPGetCUDAArray - Provides write access to the CUDA buffer inside a vector.

   Input Parameter:
-  v - the vector

   Output Parameter:
.  a - the CUDA pointer

   Level: intermediate

   Notes:
   The pointer is the raw device pointer of the CUSP array returned by VecCUSPGetArrayWrite(),
   which also takes care of allocating the GPU array if needed.

.seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite()
M*/
PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArray(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;
  CUSPARRAY      *cusparray;

  /* VecCUSPGetArrayWrite() performs the allocation check itself */
  ierr = VecCUSPGetArrayWrite(v, &cusparray);CHKERRQ(ierr);
  *a   = thrust::raw_pointer_cast(cusparray->data());
  return(0);
}



/*
   VecCUSPRestoreCUDAArray - Ends access to the raw CUDA buffer of a vector: marks the GPU
   copy as the valid one and increases the object state of the vector.

   Input Parameters:
+  v - the vector
-  a - the CUDA pointer obtained earlier; not read or modified here
*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArray(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;

  /* Note: cannot call VecCUSPRestoreArrayWrite() here because the CUSP vector is not available. */
  v->valid_GPU_array = PETSC_CUSP_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  return(0);
}