Actual source code: veccusp.cu

petsc-3.4.2 2013-07-02
  1: /*
  2:    Implements the sequential cusp vectors.
  3: */

  5: #include <petscconf.h>
  6: PETSC_CUDA_EXTERN_C_BEGIN
  7: #include <petsc-private/vecimpl.h>          /*I "petscvec.h" I*/
  8: #include <../src/vec/vec/impls/dvecimpl.h>
  9: PETSC_CUDA_EXTERN_C_END
 10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 12: #include <cuda_runtime.h>

 16: /*
 17:     Allocates space for the vector array on the Host if it does not exist.
 18:     Does NOT change the PetscCUSPFlag for the vector
 19:     Does NOT zero the CUSP array
 20:  */
 21: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
 22: {
 24:   PetscScalar    *array;
 25:   Vec_Seq        *s;
 26:   PetscInt       n = v->map->n;

 29:   s    = (Vec_Seq*)v->data;
 30:   VecCUSPAllocateCheck(v);
 31:   if (s->array == 0) {
 32:     //#if defined(PETSC_HAVE_TXPETSCGPU)
 33:     //if (n>0)
 34:     // cudaMallocHost((void **) &array, n*sizeof(PetscScalar));CHKERRCUSP(ierr);
 35:     //#else
 36:     PetscMalloc(n*sizeof(PetscScalar),&array);
 37:     PetscLogObjectMemory(v,n*sizeof(PetscScalar));
 38:     s->array           = array;
 39:     s->array_allocated = array;
 40:   }
 41:   return(0);
 42: }


 47: /*
 48:     Allocates space for the vector array on the GPU if it does not exist.
 49:     Does NOT change the PetscCUSPFlag for the vector
 50:     Does NOT zero the CUSP array

 52:  */
 53: PetscErrorCode VecCUSPAllocateCheck(Vec v)
 54: {
 56:   int            rank;

 59:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
 60:   // First allocate memory on the GPU if needed
 61:   if (!v->spptr) {
 62:     try {
 63:       v->spptr                        = new Vec_CUSP;
 64:       ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
 65:       ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);

 67: #if defined(PETSC_HAVE_TXPETSCGPU)
 69:       ((Vec_CUSP*)v->spptr)->GPUvector = new GPU_Vector<PetscInt, PetscScalar>(((Vec_CUSP*)v->spptr)->GPUarray, rank);
 70:       ((Vec_CUSP*)v->spptr)->GPUvector->buildStreamsAndEvents();CHKERRCUSP(ierr);

 72:       Vec_Seq *s;
 73:       s = (Vec_Seq*)v->data;
 74:       if (v->map->n>0) {
 75:         if (s->array==0) {
 76:           // In this branch, GPUvector owns the ptr and manages the memory
 77:           ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);

 79:           s->array           = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
 80:           s->array_allocated = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
 81:         } else {
 82:           // In this branch, Petsc owns the ptr to start, however we want to use
 83:           // page locked host memory for faster data transfers. So, a new
 84:           // page-locked buffer is allocated. Then, the old Petsc memory
 85:           // is copied in to the new buffer. Then the old Petsc memory is freed.
 86:           // GPUvector owns the new ptr.
 87:           ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
 88:           PetscScalar * temp = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();

 90:           PetscMemcpy(temp,s->array,v->map->n*sizeof(PetscScalar));
 91:           PetscFree(s->array);

 93:           s->array           = temp;
 94:           s->array_allocated = temp;
 95:         }
 96:         WaitForGPU();CHKERRCUSP(ierr);
 97:       }
 98:       v->ops->destroy = VecDestroy_SeqCUSP;
 99: #endif
100:     } catch(char *ex) {
101:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
102:     }
103:   }
104:   return(0);
105: }
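/* Background note (illustrative, not part of the original source): page-locked
   (pinned) host memory, as returned by cudaMallocHost(), can be used for
   asynchronous cudaMemcpyAsync() transfers and generally sustains higher
   host<->device bandwidth than pageable malloc() memory.  That is why the
   second branch above migrates an existing pageable PETSc buffer into a
   pinned buffer owned by GPUvector. */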


110: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
111: PetscErrorCode VecCUSPCopyToGPU(Vec v)
112: {

116:   VecCUSPAllocateCheck(v);
117:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
118:     PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
119:     try {
120: #if defined(PETSC_HAVE_TXPETSCGPU)
121:       ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUAll();CHKERRCUSP(ierr);
122: #else
123:       CUSPARRAY *varray;
124:       varray = ((Vec_CUSP*)v->spptr)->GPUarray;
125:       varray->assign(*(PetscScalar**)v->data,*(PetscScalar**)v->data + v->map->n);
126:       WaitForGPU();CHKERRCUSP(ierr);
127: #endif

129:     } catch(char *ex) {
130:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
131:     }
132:     PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
133:     v->valid_GPU_array = PETSC_CUSP_BOTH;
134:   }
135:   return(0);
136: }
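/* Coherence sketch (illustrative, not part of the original source):
   valid_GPU_array records where the authoritative copy of the data lives.
   VecCUSPCopyToGPU() takes PETSC_CUSP_CPU -> PETSC_CUSP_BOTH, and
   VecCUSPCopyFromGPU() below takes PETSC_CUSP_GPU -> PETSC_CUSP_BOTH;
   a subsequent write on one side would set the flag back to PETSC_CUSP_CPU
   or PETSC_CUSP_GPU, invalidating the other copy. */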

140: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
141: {
143:   CUSPARRAY      *varray;

146:   VecCUSPAllocateCheck(v);
147:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
148:     PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
149:     varray = ((Vec_CUSP*)v->spptr)->GPUarray;
150: #if defined(PETSC_HAVE_TXPETSCGPU)
151:     ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUSome(varray, ci->recvIndices);CHKERRCUSP(ierr);
152: #else
153:     Vec_Seq *s;
154:     s = (Vec_Seq*)v->data;

156:     CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
157:     CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;

159:     thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
160:                  thrust::make_permutation_iterator(s->array,indicesCPU->end()),
161:                  thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
162: #endif
163:     // Set the buffer states
164:     v->valid_GPU_array = PETSC_CUSP_BOTH;
165:     PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
166:   }
167:   return(0);
168: }
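/* For reference (illustrative, not part of the original source): the
   permutation-iterator copy above is a gather/scatter.  A plain-loop sketch
   of the same operation, where nidx, cpuArray and gpuArray are stand-ins
   rather than PETSc names:

     for (PetscInt k = 0; k < nidx; k++)
       gpuArray[indicesGPU[k]] = cpuArray[indicesCPU[k]];

   i.e. host entry indicesCPU[k] lands at device entry indicesGPU[k]. */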


173: /*
174:      VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
175: */
176: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
177: {

181:   VecCUSPAllocateCheckHost(v);
182:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
183:     PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
184:     try {
185: #if defined(PETSC_HAVE_TXPETSCGPU)
186:       ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUAll();CHKERRCUSP(ierr);
187: #else
188:       CUSPARRAY *varray;
189:       varray = ((Vec_CUSP*)v->spptr)->GPUarray;
190:       thrust::copy(varray->begin(),varray->end(),*(PetscScalar**)v->data);
191:       WaitForGPU();CHKERRCUSP(ierr);
192: #endif
193:     } catch(char *ex) {
194:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
195:     }
196:     PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
197:     v->valid_GPU_array = PETSC_CUSP_BOTH;
198:   }
199:   return(0);
200: }

204: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
205:    which means that we need to recombine the data at some point before using any of the standard functions.
206:    We could add another few flag types to keep track of this, or treat these calls like VecGetArray()/VecRestoreArray(),
207:    which must always be called in pairs.
208: */
209: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
210: {
211:   CUSPARRAY      *varray;

215:   VecCUSPAllocateCheck(v);
216:   VecCUSPAllocateCheckHost(v);
217:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
218:     PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
219:     varray = ((Vec_CUSP*)v->spptr)->GPUarray;
220: #if defined(PETSC_HAVE_TXPETSCGPU)
221:     ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUSome(varray, ci->sendIndices);CHKERRCUSP(ierr);
222: #else
223:     Vec_Seq *s;
224:     s = (Vec_Seq*)v->data;
225:     CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
226:     CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;

228:     thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
229:                  thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
230:                  thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
231: #endif
232:     VecCUSPRestoreArrayRead(v,&varray);
233:     PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
234:     v->valid_GPU_array = PETSC_CUSP_BOTH;
235:   }
236:   return(0);
237: }


242: static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
243: {
244:   PetscScalar       *ya;
245:   const PetscScalar *xa;
246:   PetscErrorCode    ierr;

249:   if (xin != yin) {
250:     VecGetArrayRead(xin,&xa);
251:     VecGetArray(yin,&ya);
252:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
253:     VecRestoreArrayRead(xin,&xa);
254:     VecRestoreArray(yin,&ya);
255:   }
256:   return(0);
257: }

261: static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
262: {
264:   PetscInt       n = xin->map->n,i;
265:   PetscScalar    *xx;

268:   VecGetArray(xin,&xx);
269:   for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
270:   VecRestoreArray(xin,&xx);
271:   return(0);
272: }

276: static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
277: {
278:   Vec_Seq        *vs = (Vec_Seq*)v->data;

282:   PetscObjectAMSViewOff(v);
283: #if defined(PETSC_USE_LOG)
284:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
285: #endif
286:   if (vs->array_allocated) PetscFree(vs->array_allocated);
287:   PetscFree(vs);
288:   return(0);
289: }

293: static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
294: {
295:   Vec_Seq *v = (Vec_Seq*)vin->data;

298:   v->array         = v->unplacedarray;
299:   v->unplacedarray = 0;
300:   return(0);
301: }

303: /* The following three public versions are necessary because we use CUSP in the regular PETSc code and they need to be callable from plain C code. */
306: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
307: {

311:   VecCUSPAllocateCheck(v);
312:   return(0);
313: }

317: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
318: {

322:   VecCUSPCopyToGPU(v);
323:   return(0);
324: }

328: /*
329:     PetscCUSPIndicesCreate - creates the data structure needed by VecCUSPCopyToGPUSome_Public()

331:    Input Parameters:
+    ns - the number of indices to send
.    sendIndices - integer list of indices to send
.    nr - the number of indices to receive
-    recvIndices - integer list of indices to receive

335:    Output Parameter:
336: .    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()

338: .seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
339: */
340: PetscErrorCode PetscCUSPIndicesCreate(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
341: {
342:   PetscCUSPIndices cci;

345:   cci = new struct _p_PetscCUSPIndices;
346: #if defined(PETSC_HAVE_TXPETSCGPU)
347:   cci->sendIndices = new GPU_Indices<PetscInt, PetscScalar>();
348:   cci->sendIndices->buildIndices(sendIndices, ns);
349:   cci->recvIndices = new GPU_Indices<PetscInt, PetscScalar>();
350:   cci->recvIndices->buildIndices(recvIndices, nr);
351: #else
352:   cci->sendIndicesCPU.assign(sendIndices,sendIndices+ns);
353:   cci->sendIndicesGPU.assign(sendIndices,sendIndices+ns);

355:   cci->recvIndicesCPU.assign(recvIndices,recvIndices+nr);
356:   cci->recvIndicesGPU.assign(recvIndices,recvIndices+nr);
357: #endif
358:   *ci = cci;
359:   return(0);
360: }
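/* Usage sketch (illustrative, not part of the original source): build the
   index object once per communication pattern, reuse it for repeated partial
   copies, and destroy it when done.  The index values and the vector v below
   are made up for the example. */
#if 0
  PetscCUSPIndices ci;
  PetscInt         send[2] = {0,3},recv[2] = {1,2};
  PetscCUSPIndicesCreate(2,send,2,recv,&ci);
  VecCUSPCopyToGPUSome_Public(v,ci);   /* v is a VECSEQCUSP vector */
  PetscCUSPIndicesDestroy(&ci);
#endif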

364: /*
365:     PetscCUSPIndicesDestroy - destroys the data structure needed by VecCUSPCopyToGPUSome_Public()

367:    Input Parameters:
368: .    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()

370: .seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
371: */
372: PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
373: {
375:   if (!(*ci)) return(0);
376:   try {
377: #if defined(PETSC_HAVE_TXPETSCGPU)
378:     if ((*ci)->sendIndices) delete (*ci)->sendIndices;
379:     if ((*ci)->recvIndices) delete (*ci)->recvIndices;
380: #endif
381:     if (ci) delete *ci;
382:   } catch(char *ex) {
383:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
384:   }
385:   *ci = 0;
386:   return(0);
387: }

389: #if defined(PETSC_HAVE_TXPETSCGPU)
392: /*
393:  * VecCUSPResetIndexBuffersFlagsGPU_Public - resets the index buffer status flags; only called from VecScatterFinalizeForGPU()
394:  */
395: PetscErrorCode VecCUSPResetIndexBuffersFlagsGPU_Public(PetscCUSPIndices ci)
396: {
398:   if (ci->sendIndices) ci->sendIndices->resetStatusFlag();
399:   if (ci->recvIndices) ci->recvIndices->resetStatusFlag();
400:   return(0);
401: }
402: #endif


407: /*
408:     VecCUSPCopyToGPUSome_Public - Copies certain entries of a vector down from the CPU to the GPU

410:    Input Parameters:
411: +    v - the vector
412: -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()

414: */
415: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
416: {

420:   VecCUSPCopyToGPUSome(v,ci);
421:   return(0);
422: }

426: /*
427:   VecCUSPCopyFromGPUSome_Public - Copies certain entries of a vector up from the GPU to the CPU

429:   Input Parameters:
430:  +    v - the vector
431:  -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
432: */
433: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
434: {

438:   VecCUSPCopyFromGPUSome(v,ci);
439:   return(0);
440: }

442: #if defined(PETSC_HAVE_TXPETSCGPU)
445: /* Note that this function only moves *some* of the data from a GPU vector to a contiguous buffer on the GPU.
446:    Afterwards, this buffer can be moved to the host cheaply with asynchronous memory transfers,
447:    which means that we need to recombine the data at some point before using any of the standard functions.
448:    We could add another few flag types to keep track of this, or treat these calls like VecGetArray()/VecRestoreArray(),
449:    which must always be called in pairs.
450: */
451: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
452: {
453:   CUSPARRAY      *varray;

457:   VecCUSPAllocateCheck(v);
458:   if (v->valid_GPU_array == PETSC_CUSP_GPU || v->valid_GPU_array == PETSC_CUSP_BOTH) {
459:     VecCUSPGetArrayRead(v,&varray);
460:     ((Vec_CUSP*)v->spptr)->GPUvector->copySomeToContiguousBuffer(varray, ci->sendIndices);CHKERRCUSP(ierr);
461:     VecCUSPRestoreArrayRead(v,&varray);
462:   }
463:   return(0);
464: }



470: /*
471:   VecCUSPCopySomeToContiguousBufferGPU_Public - Copies certain entries of a GPU vector into a contiguous buffer on the GPU

473:   Input Parameters:
474:  +    v - the vector
475:  -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
476: */
477: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
478: {

482:   VecCUSPCopySomeToContiguousBufferGPU(v,ci);
483:   return(0);
484: }

486: /* Note that this function only moves *some* of the data from a contiguous buffer on the GPU to arbitrary locations
487:    in a GPU vector.  It will typically be called after an asynchronous memory transfer from the host to the device,
488:    which means that we need to recombine the data at some point before using any of the standard functions.
489:    We could add another few flag types to keep track of this, or treat these calls like VecGetArray()/VecRestoreArray(),
490:    which must always be called in pairs.
491: */
492: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
493: {
494:   CUSPARRAY      *varray;

498:   VecCUSPAllocateCheck(v);
499:   if (v->valid_GPU_array == PETSC_CUSP_CPU  || v->valid_GPU_array == PETSC_CUSP_BOTH) {
500:     VecCUSPGetArrayRead(v,&varray);
501:     ((Vec_CUSP*)v->spptr)->GPUvector->copySomeFromContiguousBuffer(varray, ci->recvIndices);CHKERRCUSP(ierr);
502:     VecCUSPRestoreArrayRead(v,&varray);
503:   }
504:   return(0);
505: }

509: /*
510:   VecCUSPCopySomeFromContiguousBufferGPU_Public - Copies certain entries from a contiguous buffer on the GPU into arbitrary locations of a GPU vector

512:   Input Parameters:
513:  +    v - the vector
514:  -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
515: */
516: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
517: {

521:   VecCUSPCopySomeFromContiguousBufferGPU(v,ci);
522:   return(0);
523: }

525: #endif


528: /*MC
529:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

531:    Options Database Keys:
532: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

534:   Level: beginner

536: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
537: M*/
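/* Usage sketch for VECSEQCUSP (illustrative, not part of the original source): */
#if 0
  Vec v;
  VecCreate(PETSC_COMM_SELF,&v);
  VecSetSizes(v,PETSC_DECIDE,100);
  VecSetType(v,VECSEQCUSP);   /* or VecSetFromOptions(v) with -vec_type seqcusp */
  VecSet(v,1.0);              /* dispatches to VecSet_SeqCUSP() below */
  VecDestroy(&v);
#endif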

539: /* for VecAYPX_SeqCUSP */
540: namespace cusp
541: {
542: namespace blas
543: {
544: namespace detail
545: {
546:   template <typename T>
547:     struct AYPX : public thrust::binary_function<T,T,T>
548:     {
549:       T alpha;

551:       AYPX(T _alpha) : alpha(_alpha) {}

553:       __host__ __device__
554:       T operator()(T x, T y)
555:       {
556:         return alpha * y + x;
557:       }
558:     };
559: }

561:  template <typename ForwardIterator1,
562:            typename ForwardIterator2,
563:            typename ScalarType>
564: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
565:            {
566:              thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
567:            }
568:  template <typename Array1, typename Array2, typename ScalarType>
569:    void aypx(const Array1& x, Array2& y, ScalarType alpha)
570:  {
571:    detail::assert_same_dimensions(x,y);
572:    aypx(x.begin(),x.end(),y.begin(),alpha);
573:  }
574: }
575: }
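/* Illustrative check of the AYPX functor above (not part of the original
   source): with alpha = 2, x = (1,2) and y = (10,20), aypx(x,y,alpha)
   overwrites y with (2*10+1, 2*20+2) = (21,42), i.e. y <- alpha*y + x,
   which is exactly what VecAYPX() requires. */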

579: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
580: {
581:   CUSPARRAY      *xarray,*yarray;

585:   if (alpha == 0.0) {
    /* y = x + 0.0*y is just a copy of x into y */
    VecCopy_SeqCUSP(xin,yin);
  } else {
586:     VecCUSPGetArrayRead(xin,&xarray);
587:     VecCUSPGetArrayReadWrite(yin,&yarray);
588:     try {
589:       cusp::blas::aypx(*xarray,*yarray,alpha);
590:       WaitForGPU();CHKERRCUSP(ierr);
591:     } catch(char *ex) {
592:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
593:     }
594:     VecCUSPRestoreArrayRead(xin,&xarray);
595:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
596:     PetscLogFlops(2.0*yin->map->n);
597:   }
598:   return(0);
599: }


604: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
605: {
606:   CUSPARRAY      *xarray,*yarray;

610:   if (alpha != 0.0) {
611:     VecCUSPGetArrayRead(xin,&xarray);
612:     VecCUSPGetArrayReadWrite(yin,&yarray);
613:     try {
614:       cusp::blas::axpy(*xarray,*yarray,alpha);
615:       WaitForGPU();CHKERRCUSP(ierr);
616:     } catch(char *ex) {
617:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
618:     }
619:     VecCUSPRestoreArrayRead(xin,&xarray);
620:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
621:     PetscLogFlops(2.0*yin->map->n);
622:   }
623:   return(0);
624: }

626: struct VecCUSPPointwiseDivide
627: {
628:   template <typename Tuple>
629:   __host__ __device__
630:   void operator()(Tuple t)
631:   {
632:     thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
633:   }
634: };
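/* Illustrative note (not part of the original source): thrust::for_each over
   a zip_iterator applies the functor once per element position, so with
   tuple slots (w,x,y) the functor above computes, elementwise,

     w[i] = x[i] / y[i];

   the same tuple-plus-functor pattern is reused by the WAXPY and MAXPY
   kernels below. */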

638: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
639: {
640:   CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;

644:   VecCUSPGetArrayRead(xin,&xarray);
645:   VecCUSPGetArrayRead(yin,&yarray);
646:   VecCUSPGetArrayWrite(win,&warray);
647:   try {
648:     thrust::for_each(
649:       thrust::make_zip_iterator(
650:         thrust::make_tuple(
651:           warray->begin(),
652:           xarray->begin(),
653:           yarray->begin())),
654:       thrust::make_zip_iterator(
655:         thrust::make_tuple(
656:           warray->end(),
657:           xarray->end(),
658:           yarray->end())),
659:       VecCUSPPointwiseDivide());
660:     WaitForGPU();CHKERRCUSP(ierr);
661:   } catch(char *ex) {
662:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
663:   }
664:   PetscLogFlops(win->map->n);
665:   VecCUSPRestoreArrayRead(xin,&xarray);
666:   VecCUSPRestoreArrayRead(yin,&yarray);
667:   VecCUSPRestoreArrayWrite(win,&warray);
668:   return(0);
669: }


672: struct VecCUSPWAXPY
673: {
674:   template <typename Tuple>
675:   __host__ __device__
676:   void operator()(Tuple t)
677:   {
678:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
679:   }
680: };

682: struct VecCUSPSum
683: {
684:   template <typename Tuple>
685:   __host__ __device__
686:   void operator()(Tuple t)
687:   {
688:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
689:   }
690: };

692: struct VecCUSPDiff
693: {
694:   template <typename Tuple>
695:   __host__ __device__
696:   void operator()(Tuple t)
697:   {
698:     thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
699:   }
700: };

704: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
705: {
706:   CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;

710:   if (alpha == 0.0) {
711:     VecCopy_SeqCUSP(yin,win);
712:   } else {
713:     VecCUSPGetArrayRead(xin,&xarray);
714:     VecCUSPGetArrayRead(yin,&yarray);
715:     VecCUSPGetArrayWrite(win,&warray);
716:     if (alpha == 1.0) {
717:       try {
718:         thrust::for_each(
719:           thrust::make_zip_iterator(
720:             thrust::make_tuple(
721:               warray->begin(),
722:               yarray->begin(),
723:               xarray->begin())),
724:           thrust::make_zip_iterator(
725:             thrust::make_tuple(
726:               warray->end(),
727:               yarray->end(),
728:               xarray->end())),
729:           VecCUSPSum());
730:       } catch(char *ex) {
731:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
732:       }
733:       PetscLogFlops(win->map->n);
734:     } else if (alpha == -1.0) {
735:       try {
736:         thrust::for_each(
737:           thrust::make_zip_iterator(
738:             thrust::make_tuple(
739:               warray->begin(),
740:               yarray->begin(),
741:               xarray->begin())),
742:           thrust::make_zip_iterator(
743:             thrust::make_tuple(
744:               warray->end(),
745:               yarray->end(),
746:               xarray->end())),
747:           VecCUSPDiff());
748:       } catch(char *ex) {
749:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
750:       }
751:       PetscLogFlops(win->map->n);
752:     } else {
753:       try {
754:         thrust::for_each(
755:           thrust::make_zip_iterator(
756:             thrust::make_tuple(
757:               warray->begin(),
758:               yarray->begin(),
759:               thrust::make_constant_iterator(alpha),
760:               xarray->begin())),
761:           thrust::make_zip_iterator(
762:             thrust::make_tuple(
763:               warray->end(),
764:               yarray->end(),
765:               thrust::make_constant_iterator(alpha),
766:               xarray->end())),
767:           VecCUSPWAXPY());
768:       } catch(char *ex) {
769:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
770:       }
771:       PetscLogFlops(2*win->map->n);
772:     }
773:     WaitForGPU();CHKERRCUSP(ierr);
774:     VecCUSPRestoreArrayRead(xin,&xarray);
775:     VecCUSPRestoreArrayRead(yin,&yarray);
776:     VecCUSPRestoreArrayWrite(win,&warray);
777:   }
778:   return(0);
779: }

781: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
782: struct VecCUSPMAXPY4
783: {
784:   template <typename Tuple>
785:   __host__ __device__
786:   void operator()(Tuple t)
787:   {
788:     /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
789:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
790:   }
791: };


794: struct VecCUSPMAXPY3
795: {
796:   template <typename Tuple>
797:   __host__ __device__
798:   void operator()(Tuple t)
799:   {
800:     /* y += a1*x1 + a2*x2 + a3*x3 */
801:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
802:   }
803: };

805: struct VecCUSPMAXPY2
806: {
807:   template <typename Tuple>
808:   __host__ __device__
809:   void operator()(Tuple t)
810:   {
811:     /* y += a1*x1 + a2*x2 */
812:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
813:   }
814: };
817: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
818: {
820:   CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
821:   PetscInt       n = xin->map->n,j,j_rem;
822:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

825:   PetscLogFlops(nv*2.0*n);
826:   VecCUSPGetArrayReadWrite(xin,&xarray);
827:   switch (j_rem=nv&0x3) {
828:   case 3:
829:     alpha0 = alpha[0];
830:     alpha1 = alpha[1];
831:     alpha2 = alpha[2];
832:     alpha += 3;
833:     VecCUSPGetArrayRead(y[0],&yy0);
834:     VecCUSPGetArrayRead(y[1],&yy1);
835:     VecCUSPGetArrayRead(y[2],&yy2);
836:     try {
837:       thrust::for_each(
838:         thrust::make_zip_iterator(
839:           thrust::make_tuple(
840:             xarray->begin(),
841:             thrust::make_constant_iterator(alpha0),
842:             yy0->begin(),
843:             thrust::make_constant_iterator(alpha1),
844:             yy1->begin(),
845:             thrust::make_constant_iterator(alpha2),
846:             yy2->begin())),
847:         thrust::make_zip_iterator(
848:           thrust::make_tuple(
849:             xarray->end(),
850:             thrust::make_constant_iterator(alpha0),
851:             yy0->end(),
852:             thrust::make_constant_iterator(alpha1),
853:             yy1->end(),
854:             thrust::make_constant_iterator(alpha2),
855:             yy2->end())),
856:         VecCUSPMAXPY3());
857:     } catch(char *ex) {
858:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
859:     }
860:     VecCUSPRestoreArrayRead(y[0],&yy0);
861:     VecCUSPRestoreArrayRead(y[1],&yy1);
862:     VecCUSPRestoreArrayRead(y[2],&yy2);
863:     y   += 3;
864:     break;
865:   case 2:
866:     alpha0 = alpha[0];
867:     alpha1 = alpha[1];
868:     alpha +=2;
869:     VecCUSPGetArrayRead(y[0],&yy0);
870:     VecCUSPGetArrayRead(y[1],&yy1);
871:     try {
872:       thrust::for_each(
873:         thrust::make_zip_iterator(
874:           thrust::make_tuple(
875:             xarray->begin(),
876:             thrust::make_constant_iterator(alpha0),
877:             yy0->begin(),
878:             thrust::make_constant_iterator(alpha1),
879:             yy1->begin())),
880:         thrust::make_zip_iterator(
881:           thrust::make_tuple(
882:             xarray->end(),
883:             thrust::make_constant_iterator(alpha0),
884:             yy0->end(),
885:             thrust::make_constant_iterator(alpha1),
886:             yy1->end())),
887:         VecCUSPMAXPY2());
888:     } catch(char *ex) {
889:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
890:     }
    VecCUSPRestoreArrayRead(y[0],&yy0);
    VecCUSPRestoreArrayRead(y[1],&yy1);
891:     y +=2;
892:     break;
893:   case 1:
894:     alpha0 = *alpha++;
895:     VecAXPY_SeqCUSP(xin,alpha0,y[0]);
896:     y     +=1;
897:     break;
898:   }
899:   for (j=j_rem; j<nv; j+=4) {
900:     alpha0 = alpha[0];
901:     alpha1 = alpha[1];
902:     alpha2 = alpha[2];
903:     alpha3 = alpha[3];
904:     alpha += 4;
905:     VecCUSPGetArrayRead(y[0],&yy0);
906:     VecCUSPGetArrayRead(y[1],&yy1);
907:     VecCUSPGetArrayRead(y[2],&yy2);
908:     VecCUSPGetArrayRead(y[3],&yy3);
909:     try {
910:       thrust::for_each(
911:         thrust::make_zip_iterator(
912:           thrust::make_tuple(
913:             xarray->begin(),
914:             thrust::make_constant_iterator(alpha0),
915:             yy0->begin(),
916:             thrust::make_constant_iterator(alpha1),
917:             yy1->begin(),
918:             thrust::make_constant_iterator(alpha2),
919:             yy2->begin(),
920:             thrust::make_constant_iterator(alpha3),
921:             yy3->begin())),
922:         thrust::make_zip_iterator(
923:           thrust::make_tuple(
924:             xarray->end(),
925:             thrust::make_constant_iterator(alpha0),
926:             yy0->end(),
927:             thrust::make_constant_iterator(alpha1),
928:             yy1->end(),
929:             thrust::make_constant_iterator(alpha2),
930:             yy2->end(),
931:             thrust::make_constant_iterator(alpha3),
932:             yy3->end())),
933:         VecCUSPMAXPY4());
934:     } catch(char *ex) {
935:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
936:     }
937:     VecCUSPRestoreArrayRead(y[0],&yy0);
938:     VecCUSPRestoreArrayRead(y[1],&yy1);
939:     VecCUSPRestoreArrayRead(y[2],&yy2);
940:     VecCUSPRestoreArrayRead(y[3],&yy3);
941:     y   += 4;
942:   }
943:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
944:   WaitForGPU();CHKERRCUSP(ierr);
945:   return(0);
946: }


951: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
952: {
953:   CUSPARRAY      *xarray,*yarray;
955:   //  PetscScalar    *xptr,*yptr,*zgpu;
956:   //PetscReal tmp;

959:   //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
960:   //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
961:   VecCUSPGetArrayRead(xin,&xarray);
962:   VecCUSPGetArrayRead(yin,&yarray);
963:   try {
964: #if defined(PETSC_USE_COMPLEX)
965:     *z = cusp::blas::dotc(*yarray,*xarray);
966: #else
967:     *z = cusp::blas::dot(*yarray,*xarray);
968: #endif
969:   } catch(char *ex) {
970:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
971:   }
972:   WaitForGPU();CHKERRCUSP(ierr);
973:   if (xin->map->n >0) {
974:     PetscLogFlops(2.0*xin->map->n-1);
975:   }
976:   VecCUSPRestoreArrayRead(xin,&xarray);
977:   VecCUSPRestoreArrayRead(yin,&yarray);
978:   //printf("VecDot_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*z),PetscImaginaryPart(*z));
979:   return(0);
980: }

982: //
983: // CUDA kernels for MDot to follow
984: //

986: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
987: #define MDOT_WORKGROUP_SIZE 128
988: #define MDOT_WORKGROUP_NUM  128
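// Reduction strategy of the kernels below (illustrative summary, not part of
// the original source): each of the MDOT_WORKGROUP_NUM blocks accumulates
// grid-strided partial sums (one per y-vector) in shared memory, halves the
// number of active threads in each step of the tree reduction (stride =
// 64,32,...,1 for blockDim.x = 128), and writes one partial result per
// (block,vector) pair to group_results.  The host then finishes with
//
//   z[j] = sum_b group_results[j*MDOT_WORKGROUP_NUM + b]
//
// which is the small summation loop in VecMDot_SeqCUSP() further below.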

990: // M = 2:
991: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
992:                                         PetscInt size, PetscScalar *group_results)
993: {
994:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
995:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
996:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
997:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
998:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

1000:   PetscScalar entry_x    = 0;
1001:   PetscScalar group_sum0 = 0;
1002:   PetscScalar group_sum1 = 0;
1003:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1004:     entry_x     = x[i];   // load only once from global memory!
1005:     group_sum0 += entry_x * y0[i];
1006:     group_sum1 += entry_x * y1[i];
1007:   }
1008:   tmp_buffer[threadIdx.x]                       = group_sum0;
1009:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

1011:   // parallel reduction
1012:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1013:     __syncthreads();
1014:     if (threadIdx.x < stride) {
1015:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
1016:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1017:     }
1018:   }

1020:   // write result of group to group_results
1021:   if (threadIdx.x == 0) {
1022:     group_results[blockIdx.x]             = tmp_buffer[0];
1023:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
1024:   }
1025: }

1027: // M = 3:
1028: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
1029:                                         PetscInt size, PetscScalar *group_results)
1030: {
1031:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
1032:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
1033:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
1034:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
1035:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

1037:   PetscScalar entry_x    = 0;
1038:   PetscScalar group_sum0 = 0;
1039:   PetscScalar group_sum1 = 0;
1040:   PetscScalar group_sum2 = 0;
1041:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1042:     entry_x     = x[i];   // load only once from global memory!
1043:     group_sum0 += entry_x * y0[i];
1044:     group_sum1 += entry_x * y1[i];
1045:     group_sum2 += entry_x * y2[i];
1046:   }
1047:   tmp_buffer[threadIdx.x]                           = group_sum0;
1048:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
1049:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

1051:   // parallel reduction
1052:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1053:     __syncthreads();
1054:     if (threadIdx.x < stride) {
1055:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
1056:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
1057:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1058:     }
1059:   }

1061:   // write result of group to group_results
1062:   if (threadIdx.x == 0) {
1063:     group_results[blockIdx.x                ] = tmp_buffer[0];
1064:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
1065:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1066:   }
1067: }

1069: // M = 4:
1070: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
1071:                                         PetscInt size, PetscScalar *group_results)
1072: {
1073:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
1074:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
1075:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
1076:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
1077:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

1079:   PetscScalar entry_x    = 0;
1080:   PetscScalar group_sum0 = 0;
1081:   PetscScalar group_sum1 = 0;
1082:   PetscScalar group_sum2 = 0;
1083:   PetscScalar group_sum3 = 0;
1084:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1085:     entry_x     = x[i];   // load only once from global memory!
1086:     group_sum0 += entry_x * y0[i];
1087:     group_sum1 += entry_x * y1[i];
1088:     group_sum2 += entry_x * y2[i];
1089:     group_sum3 += entry_x * y3[i];
1090:   }
1091:   tmp_buffer[threadIdx.x]                           = group_sum0;
1092:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
1093:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
1094:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

1096:   // parallel reduction
1097:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1098:     __syncthreads();
1099:     if (threadIdx.x < stride) {
1100:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
1101:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
1102:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1103:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
1104:     }
1105:   }

1107:   // write result of group to group_results
1108:   if (threadIdx.x == 0) {
1109:     group_results[blockIdx.x                ] = tmp_buffer[0];
1110:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
1111:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1112:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
1113:   }
1114: }

1116: // M = 8:
1117: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
1118:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
1119:                                           PetscInt size, PetscScalar *group_results)
1120: {
1121:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
1122:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
1123:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
1124:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
1125:   PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

1127:   PetscScalar entry_x    = 0;
1128:   PetscScalar group_sum0 = 0;
1129:   PetscScalar group_sum1 = 0;
1130:   PetscScalar group_sum2 = 0;
1131:   PetscScalar group_sum3 = 0;
1132:   PetscScalar group_sum4 = 0;
1133:   PetscScalar group_sum5 = 0;
1134:   PetscScalar group_sum6 = 0;
1135:   PetscScalar group_sum7 = 0;
1136:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1137:     entry_x     = x[i];   // load only once from global memory!
1138:     group_sum0 += entry_x * y0[i];
1139:     group_sum1 += entry_x * y1[i];
1140:     group_sum2 += entry_x * y2[i];
1141:     group_sum3 += entry_x * y3[i];
1142:     group_sum4 += entry_x * y4[i];
1143:     group_sum5 += entry_x * y5[i];
1144:     group_sum6 += entry_x * y6[i];
1145:     group_sum7 += entry_x * y7[i];
1146:   }
1147:   tmp_buffer[threadIdx.x]                           = group_sum0;
1148:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
1149:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
1150:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
1151:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
1152:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
1153:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
1154:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

1156:   // parallel reduction
1157:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1158:     __syncthreads();
1159:     if (threadIdx.x < stride) {
1160:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
1161:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
1162:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1163:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
1164:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
1165:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
1166:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
1167:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
1168:     }
1169:   }

1171:   // write result of group to group_results
1172:   if (threadIdx.x == 0) {
1173:     group_results[blockIdx.x                ] = tmp_buffer[0];
1174:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
1175:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1176:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
1177:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
1178:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
1179:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
1180:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
1181:   }
1182: }


1187: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1188: {
1190:   PetscInt       i,j,n = xin->map->n,current_y_index = 0;
1191:   CUSPARRAY      *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
1192:   PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
1193:   PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
1194:   cudaError_t    cuda_ierr;

1197:   // allocate scratchpad memory for the results of individual work groups:
1198:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
1199:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
1200:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

1202:   VecCUSPGetArrayRead(xin,&xarray);
1203:   xptr = thrust::raw_pointer_cast(xarray->data());

1205:   while (current_y_index < nv)
1206:   {
1207:     switch (nv - current_y_index) {

1209:     case 7:
1210:     case 6:
1211:     case 5:
1212:     case 4:
1213:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
1214:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1215:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1216:       VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);

1218: #if defined(PETSC_USE_COMPLEX)
1219:       z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
1220:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1221:       z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
1222:       z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
1223: #else
1224:       // extract raw device pointers:
1225:       y0ptr = thrust::raw_pointer_cast(y0array->data());
1226:       y1ptr = thrust::raw_pointer_cast(y1array->data());
1227:       y2ptr = thrust::raw_pointer_cast(y2array->data());
1228:       y3ptr = thrust::raw_pointer_cast(y3array->data());

1230:       // run kernel:
1231:       VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

1233:       // copy results back to the CPU:
1234:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
1235:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

1237:       // sum group results into z:
1238:       for (j=0; j<4; ++j) {
1239:         z[current_y_index + j] = 0;
1240:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1241:       }
1242: #endif
1243:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
1244:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1245:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1246:       VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1247:       current_y_index += 4;
1248:       break;

1250:     case 3:
1251:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
1252:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1253:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);

1255: #if defined(PETSC_USE_COMPLEX)
1256:       z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
1257:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1258:       z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
1259: #else
1260:       // extract raw device pointers:
1261:       y0ptr = thrust::raw_pointer_cast(y0array->data());
1262:       y1ptr = thrust::raw_pointer_cast(y1array->data());
1263:       y2ptr = thrust::raw_pointer_cast(y2array->data());

1265:       // run kernel:
1266:       VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

1268:       // copy results back to the CPU:
1269:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
1270:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

1272:       // sum group results into z:
1273:       for (j=0; j<3; ++j) {
1274:         z[current_y_index + j] = 0;
1275:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1276:       }
1277: #endif

1279:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
1280:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1281:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1282:       current_y_index += 3;
1283:       break;

1285:     case 2:
1286:       VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1287:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);

1289: #if defined(PETSC_USE_COMPLEX)
1290:       z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
1291:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1292: #else
1293:       // extract raw device pointers:
1294:       y0ptr = thrust::raw_pointer_cast(y0array->data());
1295:       y1ptr = thrust::raw_pointer_cast(y1array->data());

1297:       // run kernel:
1298:       VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

1300:       // copy results back to the CPU:
1301:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
1302:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

1304:       // sum group results into z:
1305:       for (j=0; j<2; ++j) {
1306:         z[current_y_index + j] = 0;
1307:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1308:       }
1309: #endif
1310:       VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1311:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1312:       current_y_index += 2;
1313:       break;

1315:     case 1:
1316:       VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1317: #if defined(PETSC_USE_COMPLEX)
1318:       z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
1319: #else
1320:       z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
1321: #endif
1322:       VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1323:       current_y_index += 1;
1324:       break;

1326:     default: // 8 or more vectors left
1327:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
1328:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1329:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1330:       VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1331:       VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
1332:       VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
1333:       VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
1334:       VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);

1336: #if defined(PETSC_USE_COMPLEX)
1337:       z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
1338:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1339:       z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
1340:       z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
1341:       z[current_y_index+4] = cusp::blas::dotc(*y4array,*xarray);
1342:       z[current_y_index+5] = cusp::blas::dotc(*y5array,*xarray);
1343:       z[current_y_index+6] = cusp::blas::dotc(*y6array,*xarray);
1344:       z[current_y_index+7] = cusp::blas::dotc(*y7array,*xarray);
1345: #else
1346:       // extract raw device pointers:
1347:       y0ptr = thrust::raw_pointer_cast(y0array->data());
1348:       y1ptr = thrust::raw_pointer_cast(y1array->data());
1349:       y2ptr = thrust::raw_pointer_cast(y2array->data());
1350:       y3ptr = thrust::raw_pointer_cast(y3array->data());
1351:       y4ptr = thrust::raw_pointer_cast(y4array->data());
1352:       y5ptr = thrust::raw_pointer_cast(y5array->data());
1353:       y6ptr = thrust::raw_pointer_cast(y6array->data());
1354:       y7ptr = thrust::raw_pointer_cast(y7array->data());

1356:       // run kernel:
1357:       VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

1359:       // copy results back to the CPU:
1360:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1361:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

1363:       // sum group results into z:
1364:       for (j=0; j<8; ++j) {
1365:         z[current_y_index + j] = 0;
1366:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1367:       }
1368: #endif
1369:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
1370:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1371:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1372:       VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1373:       VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1374:       VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1375:       VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1376:       VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1377:       current_y_index += 8;
1378:       break;
1379:     }
1380:   }
1381:   VecCUSPRestoreArrayRead(xin,&xarray);

1383:   cuda_ierr = cudaFree(group_results_gpu);
1384:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1385:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1386:   return(0);
1387: }

1389: #undef MDOT_WORKGROUP_SIZE
1390: #undef MDOT_WORKGROUP_NUM



1396: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1397: {
1398:   CUSPARRAY      *xarray=NULL;

1402:   /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1403:   VecCUSPGetArrayWrite(xin,&xarray);
1404:   try {
1405:     cusp::blas::fill(*xarray,alpha);
1406:   } catch(char *ex) {
1407:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1408:   }
1409:   WaitForGPU();CHKERRCUSP(ierr);
1410:   VecCUSPRestoreArrayWrite(xin,&xarray);
1411:   return(0);
1412: }

1416: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1417: {
1418:   CUSPARRAY      *xarray;

1422:   if (alpha == 0.0) {
1423:     VecSet_SeqCUSP(xin,alpha);
1424:   } else if (alpha != 1.0) {
1425:     VecCUSPGetArrayReadWrite(xin,&xarray);
1426:     try {
1427:       cusp::blas::scal(*xarray,alpha);
1428:     } catch(char *ex) {
1429:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1430:     }
1431:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1432:   }
1433:   WaitForGPU();CHKERRCUSP(ierr);
1434:   PetscLogFlops(xin->map->n);
1435:   return(0);
1436: }


1441: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1442: {
1443:   CUSPARRAY      *xarray,*yarray;

1447:   //#if defined(PETSC_USE_COMPLEX)
1448:   /*Not working for complex*/
1449:   //#else
1450:   VecCUSPGetArrayRead(xin,&xarray);
1451:   VecCUSPGetArrayRead(yin,&yarray);
1452:   try {
1453:     *z = cusp::blas::dot(*xarray,*yarray);
1454:   } catch(char *ex) {
1455:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1456:   }
1457:   //#endif
1458:   WaitForGPU();CHKERRCUSP(ierr);
1459:   if (xin->map->n > 0) {
1460:     PetscLogFlops(2.0*xin->map->n-1);
1461:   }
1462:   VecCUSPRestoreArrayRead(yin,&yarray);
1463:   VecCUSPRestoreArrayRead(xin,&xarray);
1464:   return(0);
1465: }
1468: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1469: {
1470:   CUSPARRAY      *xarray,*yarray;

1474:   if (xin != yin) {
1475:     if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1476:       VecCUSPGetArrayRead(xin,&xarray);
1477:       VecCUSPGetArrayWrite(yin,&yarray);
1478:       try {
1479:         cusp::blas::copy(*xarray,*yarray);
1480:       } catch(char *ex) {
1481:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1482:       }
1483:       WaitForGPU();CHKERRCUSP(ierr);
1484:       VecCUSPRestoreArrayRead(xin,&xarray);
1485:       VecCUSPRestoreArrayWrite(yin,&yarray);

1487:     } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1488:       /* copy in CPU if we are on the CPU*/
1489:       VecCopy_SeqCUSP_Private(xin,yin);
1490:     } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1491:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1492:       if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1493:         /* copy in CPU */
1494:         VecCopy_SeqCUSP_Private(xin,yin);

1496:       } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1497:         /* copy in GPU */
1498:         VecCUSPGetArrayRead(xin,&xarray);
1499:         VecCUSPGetArrayWrite(yin,&yarray);
1500:         try {
1501:           cusp::blas::copy(*xarray,*yarray);
1502:           WaitForGPU();CHKERRCUSP(ierr);
1503:         } catch(char *ex) {
1504:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1505:         }
1506:         VecCUSPRestoreArrayRead(xin,&xarray);
1507:         VecCUSPRestoreArrayWrite(yin,&yarray);
1508:       } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1509:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1510:            default to copying on the GPU (this is an arbitrary choice) */
1511:         VecCUSPGetArrayRead(xin,&xarray);
1512:         VecCUSPGetArrayWrite(yin,&yarray);
1513:         try {
1514:           cusp::blas::copy(*xarray,*yarray);
1515:           WaitForGPU();CHKERRCUSP(ierr);
1516:         } catch(char *ex) {
1517:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1518:         }
1519:         VecCUSPRestoreArrayRead(xin,&xarray);
1520:         VecCUSPRestoreArrayWrite(yin,&yarray);
1521:       } else {
1522:         VecCopy_SeqCUSP_Private(xin,yin);
1523:       }
1524:     }
1525:   }
1526:   return(0);
1527: }
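/* Summary of the copy-location choice above (illustrative, not part of the
   original source): with S(x) the valid_GPU_array state of xin,
     S(x) = GPU  -> copy on the GPU
     S(x) = CPU  -> copy on the CPU
     S(x) = BOTH -> follow yin: CPU -> copy on the CPU; GPU or BOTH -> copy on
                    the GPU; otherwise fall back to the CPU copy
   so a copy never forces an extra host<->device transfer of xin's data. */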


1532: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1533: {
1535:   PetscBLASInt   one = 1,bn;
1536:   CUSPARRAY      *xarray,*yarray;

1539:   PetscBLASIntCast(xin->map->n,&bn);
1540:   if (xin != yin) {
1541:     VecCUSPGetArrayReadWrite(xin,&xarray);
1542:     VecCUSPGetArrayReadWrite(yin,&yarray);

1544: #if defined(PETSC_USE_COMPLEX)
1545: #if defined(PETSC_USE_REAL_SINGLE)
1546:     cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
1547: #else
1548:     cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
1549: #endif
1550: #else
1551: #if defined(PETSC_USE_REAL_SINGLE)
1552:     cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1553: #else
1554:     cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1555: #endif
1556: #endif
1557:     cublasGetError();CHKERRCUSP(ierr);
1558:     WaitForGPU();CHKERRCUSP(ierr);
1559:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1560:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1561:   }
1562:   return(0);
1563: }

1565: struct VecCUSPAX
1566: {
1567:   template <typename Tuple>
1568:   __host__ __device__
1569:   void operator()(Tuple t)
1570:   {
1571:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1572:   }
1573: };
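/* Editorial sketch (not part of veccusp.cu): the zip-iterator pattern used
   with VecCUSPAX below, reduced to a standalone Thrust program.  Each tuple
   is (y[i], a, x[i]) and the functor writes y[i] = a*x[i]; the constant
   iterator replicates the scalar a across all elements.  Names are
   illustrative assumptions. */
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/for_each.h>

struct AX {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t); /* y[i] = a*x[i] */
  }
};

int main(void)
{
  thrust::device_vector<double> x(4,2.0),y(4);
  double a = 3.0;
  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(y.begin(),thrust::make_constant_iterator(a),x.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(y.end(),thrust::make_constant_iterator(a),x.end())),
    AX());  /* every y[i] becomes 3.0*2.0 = 6.0 */
  return 0;
}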
1576: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1577: {
1579:   PetscScalar    a = alpha,b = beta;
1580:   CUSPARRAY      *xarray,*yarray;

1583:   if (a == 0.0) {
1584:     VecScale_SeqCUSP(yin,beta);
1585:   } else if (b == 1.0) {
1586:     VecAXPY_SeqCUSP(yin,alpha,xin);
1587:   } else if (a == 1.0) {
1588:     VecAYPX_SeqCUSP(yin,beta,xin);
1589:   } else if (b == 0.0) {
1590:     VecCUSPGetArrayRead(xin,&xarray);
1591:     VecCUSPGetArrayReadWrite(yin,&yarray);
1592:     try {
1593:       thrust::for_each(
1594:         thrust::make_zip_iterator(
1595:           thrust::make_tuple(
1596:             yarray->begin(),
1597:             thrust::make_constant_iterator(a),
1598:             xarray->begin())),
1599:         thrust::make_zip_iterator(
1600:           thrust::make_tuple(
1601:             yarray->end(),
1602:             thrust::make_constant_iterator(a),
1603:             xarray->end())),
1604:         VecCUSPAX());
1605:     } catch(char *ex) {
1606:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1607:     }
1608:     PetscLogFlops(xin->map->n);
1609:     WaitForGPU();CHKERRCUSP(ierr);
1610:     VecCUSPRestoreArrayRead(xin,&xarray);
1611:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1612:   } else {
1613:     VecCUSPGetArrayRead(xin,&xarray);
1614:     VecCUSPGetArrayReadWrite(yin,&yarray);
1615:     try {
1616:       cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1617:     } catch(char *ex) {
1618:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1619:     }
1620:     VecCUSPRestoreArrayRead(xin,&xarray);
1621:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1622:     WaitForGPU();CHKERRCUSP(ierr);
1623:     PetscLogFlops(3.0*xin->map->n);
1624:   }
1625:   return(0);
1626: }
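/* Editorial note (not part of veccusp.cu): VecAXPBY computes y = alpha*x + beta*y.
   The branches above special-case the cheap combinations before falling back to
   the general kernel:
     alpha == 0.0  ->  y = beta*y           (VecScale_SeqCUSP)
     beta  == 1.0  ->  y = alpha*x + y      (VecAXPY_SeqCUSP)
     alpha == 1.0  ->  y = x + beta*y       (VecAYPX_SeqCUSP)
     beta  == 0.0  ->  y = alpha*x          (thrust::for_each with VecCUSPAX)
     otherwise     ->  cusp::blas::axpby(x,y,y,alpha,beta) */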

1628: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1629: struct VecCUSPXPBYPCZ
1630: {
1631:   /* z = x + b*y + c*z */
1632:   template <typename Tuple>
1633:   __host__ __device__
1634:   void operator()(Tuple t)
1635:   {
1636:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1637:   }
1638: };
1639: struct VecCUSPAXPBYPZ
1640: {
1641:   /* z = a*x + b*y + z */
1642:   template <typename Tuple>
1643:   __host__ __device__
1644:   void operator()(Tuple t)
1645:   {
1646:     thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1647:   }
1648: };
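/* Editorial spot check (not part of veccusp.cu): with the tuple layouts used
   in VecAXPBYPCZ_SeqCUSP below, VecCUSPXPBYPCZ receives (z, gamma, x, y, beta)
   and computes z = gamma*z + x + beta*y, while VecCUSPAXPBYPZ receives
   (z, x, alpha, y, beta) and computes z += alpha*x + beta*y.  A host-side
   sketch with made-up values, assuming the two functors above are in scope: */
#include <thrust/tuple.h>
#include <cassert>
int main(void)
{
  double z = 1.0, x = 3.0, y = 4.0, alpha = 2.0, beta = 5.0, gamma = 2.0;
  VecCUSPXPBYPCZ()(thrust::tie(z,gamma,x,y,beta));  /* z = 2*1 + 3 + 5*4 */
  assert(z == 25.0);
  z = 1.0;
  VecCUSPAXPBYPZ()(thrust::tie(z,x,alpha,y,beta));  /* z = 1 + 2*3 + 5*4 */
  assert(z == 27.0);
  return 0;
}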

1652: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1653: {
1655:   PetscInt       n = zin->map->n;
1656:   CUSPARRAY      *xarray,*yarray,*zarray;

1659:   VecCUSPGetArrayRead(xin,&xarray);
1660:   VecCUSPGetArrayRead(yin,&yarray);
1661:   VecCUSPGetArrayReadWrite(zin,&zarray);
1662:   if (alpha == 1.0) {
1663:     try {
1664:       thrust::for_each(
1665:         thrust::make_zip_iterator(
1666:           thrust::make_tuple(
1667:             zarray->begin(),
1668:             thrust::make_constant_iterator(gamma),
1669:             xarray->begin(),
1670:             yarray->begin(),
1671:             thrust::make_constant_iterator(beta))),
1672:         thrust::make_zip_iterator(
1673:           thrust::make_tuple(
1674:             zarray->end(),
1675:             thrust::make_constant_iterator(gamma),
1676:             xarray->end(),
1677:             yarray->end(),
1678:             thrust::make_constant_iterator(beta))),
1679:         VecCUSPXPBYPCZ());
1680:     } catch(char *ex) {
1681:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1682:     }
1683:     PetscLogFlops(4.0*n);
1684:   } else if (gamma == 1.0) {
1685:     try {
1686:       thrust::for_each(
1687:         thrust::make_zip_iterator(
1688:           thrust::make_tuple(
1689:             zarray->begin(),
1690:             xarray->begin(),
1691:             thrust::make_constant_iterator(alpha),
1692:             yarray->begin(),
1693:             thrust::make_constant_iterator(beta))),
1694:         thrust::make_zip_iterator(
1695:           thrust::make_tuple(
1696:             zarray->end(),
1697:             xarray->end(),
1698:             thrust::make_constant_iterator(alpha),
1699:             yarray->end(),
1700:             thrust::make_constant_iterator(beta))),
1701:         VecCUSPAXPBYPZ());
1702:     } catch(char *ex) {
1703:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1704:     }
1705:     PetscLogFlops(4.0*n);
1706:   } else {
1707:     try {
1708:       cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1709:     } catch(char *ex) {
1710:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1711:     }
1712:     PetscLogFlops(5.0*n);
1713:   }
1714:   VecCUSPRestoreArrayRead(xin,&xarray);   /* restore in every branch, not just the general case */
1715:   VecCUSPRestoreArrayRead(yin,&yarray);
1716:   VecCUSPRestoreArrayReadWrite(zin,&zarray);
1717:   WaitForGPU();CHKERRCUSP(ierr);
1718:   return(0);
1719: }

1723: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1724: {
1726:   PetscInt       n = win->map->n;
1727:   CUSPARRAY      *xarray,*yarray,*warray;

1730:   VecCUSPGetArrayRead(xin,&xarray);
1731:   VecCUSPGetArrayRead(yin,&yarray);
1732:   VecCUSPGetArrayReadWrite(win,&warray);
1733:   try {
1734:     cusp::blas::xmy(*xarray,*yarray,*warray);
1735:   } catch(char *ex) {
1736:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1737:   }
1738:   VecCUSPRestoreArrayRead(xin,&xarray);
1739:   VecCUSPRestoreArrayRead(yin,&yarray);
1740:   VecCUSPRestoreArrayReadWrite(win,&warray);
1741:   PetscLogFlops(n);
1742:   WaitForGPU();CHKERRCUSP(ierr);
1743:   return(0);
1744: }


1747: /* the infinity norm below should eventually be done in CUSP instead of on the CPU */

1751: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1752: {
1753:   const PetscScalar *xx;
1754:   PetscErrorCode    ierr;
1755:   PetscInt          n = xin->map->n;
1756:   PetscBLASInt      one = 1, bn;
1757:   CUSPARRAY         *xarray;

1760:   PetscBLASIntCast(n,&bn);
1761:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1762:     VecCUSPGetArrayRead(xin,&xarray);
1763:     try {
1764:       *z = cusp::blas::nrm2(*xarray);
1765:     } catch(char *ex) {
1766:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1767:     }
1768:     WaitForGPU();CHKERRCUSP(ierr);
1769:     VecCUSPRestoreArrayRead(xin,&xarray);
1770:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1771:   } else if (type == NORM_INFINITY) {
1772:     PetscInt  i;
1773:     PetscReal max = 0.0,tmp;

1775:     VecGetArrayRead(xin,&xx);
1776:     for (i=0; i<n; i++) {
1777:       if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1778:       /* check special case of tmp == NaN */
1779:       if (tmp != tmp) {max = tmp; break;}
1780:       xx++;
1781:     }
1782:     VecRestoreArrayRead(xin,&xx);
1783:     *z   = max;
1784:   } else if (type == NORM_1) {
1785:     VecCUSPGetArrayRead(xin,&xarray);
1786: #if defined(PETSC_USE_COMPLEX)
1787: #if defined(PETSC_USE_REAL_SINGLE)
1788:     *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
1789: #else
1790:     *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
1791: #endif
1792: #else
1793: #if defined(PETSC_USE_REAL_SINGLE)
1794:     *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1795: #else
1796:     *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1797: #endif
1798: #endif
1799:     cublasGetError();CHKERRCUSP(ierr);
1800:     VecCUSPRestoreArrayRead(xin,&xarray);
1801:     WaitForGPU();CHKERRCUSP(ierr);
1802:     PetscLogFlops(PetscMax(n-1.0,0.0));
1803:   } else if (type == NORM_1_AND_2) {
1804:     VecNorm_SeqCUSP(xin,NORM_1,z);
1805:     VecNorm_SeqCUSP(xin,NORM_2,z+1);
1806:   }
1807:   //printf("VecNorm_SeqCUSP=%1.5g\n",*z);
1808:   return(0);
1809: }
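/* Editorial sketch (not part of veccusp.cu): the NORM_INFINITY loop above runs
   on the CPU and relies on the IEEE-754 rule that NaN != NaN, so a NaN entry
   both becomes the result and ends the scan early.  Standalone illustration: */
#include <math.h>
#include <stdio.h>
int main(void)
{
  double xx[4] = {1.0, -3.0, NAN, 2.0};
  double max = 0.0, tmp;
  int    i;
  for (i=0; i<4; i++) {
    if ((tmp = fabs(xx[i])) > max) max = tmp;
    if (tmp != tmp) {max = tmp; break;}  /* NaN: record it and stop scanning */
  }
  printf("inf-norm = %g\n", max);  /* prints nan */
  return 0;
}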


1812: /* The following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */

1816: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1817: {

1821:   VecSetRandom_SeqCUSP_Private(xin,r);
1822:   xin->valid_GPU_array = PETSC_CUSP_CPU;
1823:   return(0);
1824: }

1828: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1829: {

1833:   VecCUSPCopyFromGPU(vin);
1834:   VecResetArray_SeqCUSP_Private(vin);
1835:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1836:   return(0);
1837: }

1841: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1842: {

1846:   VecCUSPCopyFromGPU(vin);
1847:   VecPlaceArray_Seq(vin,a);
1848:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1849:   return(0);
1850: }


1855: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1856: {

1860:   VecCUSPCopyFromGPU(vin);
1861:   VecReplaceArray_Seq(vin,a);
1862:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1863:   return(0);
1864: }


1869: /*@
1870:    VecCreateSeqCUSP - Creates a standard, sequential array-style vector.

1872:    Collective on MPI_Comm

1874:    Input Parameters:
1875: +  comm - the communicator, should be PETSC_COMM_SELF
1876: -  n - the vector length

1878:    Output Parameter:
1879: .  V - the vector

1881:    Notes:
1882:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1883:    same type as an existing vector.

1885:    Level: intermediate

1887:    Concepts: vectors^creating sequential

1889: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1890: @*/
1891: PetscErrorCode  VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1892: {

1896:   VecCreate(comm,v);
1897:   VecSetSizes(*v,n,n);
1898:   VecSetType(*v,VECSEQCUSP);
1899:   return(0);
1900: }

1902: /* The following template functions are for VecDotNorm2_SeqCUSP.  Note that there is no complex support as currently written. */
1903: template <typename T>
1904: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1905: {
1906:   __host__ __device__
1907:   T operator()(T x)
1908:   {
1909: #if defined(PETSC_USE_COMPLEX)
1910:     //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1911: #else
1912:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1913: #endif
1914:   }
1915: };

1917: template <typename T>
1918: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1919: {
1920:   __host__ __device__
1921:   T operator()(T x,T y)
1922:   {
1923:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1924:   }
1925: };

1929: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1930: {
1931:   PetscErrorCode                         ierr;
1932:   PetscScalar                            zero = 0.0;
1933:   PetscInt                               n=s->map->n;
1934:   thrust::tuple<PetscScalar,PetscScalar> result;
1935:   CUSPARRAY                              *sarray,*tarray;

1938:   /*VecCUSPCopyToGPU(s);
1939:    VecCUSPCopyToGPU(t);*/
1940:   VecCUSPGetArrayRead(s,&sarray);
1941:   VecCUSPGetArrayRead(t,&tarray);
1942:   try {
1943: #if defined(PETSC_USE_COMPLEX)
1944:     VecDot_SeqCUSP(s,t,dp);
1945:     VecDot_SeqCUSP(t,t,nm);
1946:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1947:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1948: #else
1949:     result = thrust::transform_reduce(
1950:               thrust::make_zip_iterator(
1951:                 thrust::make_tuple(
1952:                   sarray->begin(),
1953:                   tarray->begin())),
1954:               thrust::make_zip_iterator(
1955:                 thrust::make_tuple(
1956:                   sarray->end(),
1957:                   tarray->end())),
1958:               cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1959:               thrust::make_tuple(zero,zero),                                   /*init */
1960:               cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >());  /* binary function */
1961:     *dp = thrust::get<0>(result);
1962:     *nm = thrust::get<1>(result);
1963: #endif
1964:   } catch(char *ex) {
1965:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1966:   }
1967:   VecCUSPRestoreArrayRead(s,&sarray);
1968:   VecCUSPRestoreArrayRead(t,&tarray);
1969:   WaitForGPU();CHKERRCUSP(ierr);
1970:   PetscLogFlops(4.0*n);
1971:   return(0);
1972: }
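/* Editorial sketch (not part of veccusp.cu): the fused reduction above makes a
   single pass that yields both dp = sum_i s[i]*t[i] and nm = sum_i t[i]*t[i].
   A minimal standalone version of the same transform_reduce pattern over host
   data, with all names illustrative: */
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>
#include <cstdio>

typedef thrust::tuple<double,double> Pair;

struct DotNormCalc {
  __host__ __device__ Pair operator()(Pair x) const
  { return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),
                              thrust::get<1>(x)*thrust::get<1>(x)); }
};
struct DotNormReduce {
  __host__ __device__ Pair operator()(Pair a,Pair b) const
  { return thrust::make_tuple(thrust::get<0>(a)+thrust::get<0>(b),
                              thrust::get<1>(a)+thrust::get<1>(b)); }
};

int main(void)
{
  thrust::host_vector<double> s(3),t(3);
  s[0] = 1; s[1] = 2; s[2] = 3;
  t[0] = 4; t[1] = 5; t[2] = 6;
  Pair r = thrust::transform_reduce(
    thrust::make_zip_iterator(thrust::make_tuple(s.begin(),t.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(s.end(),t.end())),
    DotNormCalc(),thrust::make_tuple(0.0,0.0),DotNormReduce());
  printf("dot = %g, nrm2^2 = %g\n",thrust::get<0>(r),thrust::get<1>(r));  /* 32 and 77 */
  return 0;
}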

1976: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1977: {

1981:   VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1982:   PetscLayoutReference(win->map,&(*V)->map);
1983:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1984:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1985:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1986:   return(0);
1987: }

1991: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1992: {

1996:   try {
1997:     if (v->spptr) {
1998: #if defined(PETSC_HAVE_TXPETSCGPU)
1999:       if (((Vec_CUSP*)v->spptr)->GPUvector) delete ((Vec_CUSP*)v->spptr)->GPUvector;
2000:       Vec_Seq *s;
2001:       s                  = (Vec_Seq*)v->data;
2002:       s->array           = NULL;
2003:       s->array_allocated = NULL;
2004: #endif
2005:       delete ((Vec_CUSP*)v->spptr)->GPUarray;
2006:       delete (Vec_CUSP*) v->spptr;
2007:     }
2008:   } catch(char *ex) {
2009:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
2010:   }
2011:   VecDestroy_SeqCUSP_Private(v);
2012:   return(0);
2013: }


2016: #if defined(PETSC_USE_COMPLEX)
2017: struct conjugate 
2018: {
2019:   __host__ __device__
2020:   PetscScalar operator()(PetscScalar x)
2021:   {
2022:     return cusp::conj(x);
2023:   }
2024: };
2025: #endif


2030: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
2031: {
2033:   CUSPARRAY      *xarray;

2036:   VecCUSPGetArrayReadWrite(xin,&xarray);
2037: #if defined(PETSC_USE_COMPLEX)
2038:   thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
2039: #endif
2040:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
2041:   return(0);
2042: }

2046: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
2047: {
2049:   PetscMPIInt    size;

2052:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
2053:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
2054:   VecCreate_Seq_Private(V,0);
2055:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);

2057:   V->ops->dot             = VecDot_SeqCUSP;
2058:   V->ops->norm            = VecNorm_SeqCUSP;
2059:   V->ops->tdot            = VecTDot_SeqCUSP;
2060:   V->ops->scale           = VecScale_SeqCUSP;
2061:   V->ops->copy            = VecCopy_SeqCUSP;
2062:   V->ops->set             = VecSet_SeqCUSP;
2063:   V->ops->swap            = VecSwap_SeqCUSP;
2064:   V->ops->axpy            = VecAXPY_SeqCUSP;
2065:   V->ops->axpby           = VecAXPBY_SeqCUSP;
2066:   V->ops->axpbypcz        = VecAXPBYPCZ_SeqCUSP;
2067:   V->ops->pointwisemult   = VecPointwiseMult_SeqCUSP;
2068:   V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
2069:   V->ops->setrandom       = VecSetRandom_SeqCUSP;
2070:   V->ops->dot_local       = VecDot_SeqCUSP;
2071:   V->ops->tdot_local      = VecTDot_SeqCUSP;
2072:   V->ops->norm_local      = VecNorm_SeqCUSP;
2073:   V->ops->mdot_local      = VecMDot_SeqCUSP;
2074:   V->ops->maxpy           = VecMAXPY_SeqCUSP;
2075:   V->ops->mdot            = VecMDot_SeqCUSP;
2076:   V->ops->aypx            = VecAYPX_SeqCUSP;
2077:   V->ops->waxpy           = VecWAXPY_SeqCUSP;
2078:   V->ops->dotnorm2        = VecDotNorm2_SeqCUSP;
2079:   V->ops->placearray      = VecPlaceArray_SeqCUSP;
2080:   V->ops->replacearray    = VecReplaceArray_SeqCUSP;
2081:   V->ops->resetarray      = VecResetArray_SeqCUSP;
2082:   V->ops->destroy         = VecDestroy_SeqCUSP;
2083:   V->ops->duplicate       = VecDuplicate_SeqCUSP;
2084:   V->ops->conjugate       = VecConjugate_SeqCUSP;

2086:   VecCUSPAllocateCheck(V);
2087:   V->valid_GPU_array      = PETSC_CUSP_GPU;
2088:   VecSet(V,0.0);
2089:   return(0);
2090: }
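/* Editorial usage sketch (not part of veccusp.cu): VecCreate_SeqCUSP is the
   type constructor registered for VECSEQCUSP, so a sequential vector can be
   switched to this implementation with VecSetType().  Sizes are illustrative. */
#include <petscvec.h>
int main(int argc,char **argv)
{
  Vec            v;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
  ierr = VecSetSizes(v,8,8);CHKERRQ(ierr);
  ierr = VecSetType(v,VECSEQCUSP);CHKERRQ(ierr);  /* installs the ops table above */
  ierr = VecDestroy(&v);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}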

2094: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
2095: {

2099:   *a   = 0;
2100:   VecCUSPCopyToGPU(v);
2101:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
2102:   return(0);
2103: }

2107: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
2108: {

2112:   v->valid_GPU_array = PETSC_CUSP_GPU;

2114:   PetscObjectStateIncrease((PetscObject)v);
2115:   return(0);
2116: }

2120: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
2121: {

2125:   *a   = 0;
2126:   VecCUSPCopyToGPU(v);
2127:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
2128:   return(0);
2129: }

2133: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
2134: {
2136:   return(0);
2137: }

2141: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
2142: {

2146:   *a   = 0;
2147:   VecCUSPAllocateCheck(v);
2148:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
2149:   return(0);
2150: }

2154: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2155: {

2159:   v->valid_GPU_array = PETSC_CUSP_GPU;

2161:   PetscObjectStateIncrease((PetscObject)v);
2162:   return(0);
2163: }
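/* Editorial sketch (not part of veccusp.cu): the Get/Restore pairs above
   encode an access contract.  GetArrayRead and GetArrayReadWrite first copy
   the data to the GPU if it is stale there; GetArrayWrite only allocates,
   since the old values will be overwritten; and the Write/ReadWrite Restore
   variants mark the GPU copy as the valid one.  A hedged fragment (error
   checking written out explicitly here): */
CUSPARRAY *xarray,*yarray;
ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);   /* copies x to the GPU if needed */
ierr = VecCUSPGetArrayWrite(y,&yarray);CHKERRQ(ierr);  /* allocates; does not copy old y values */
cusp::blas::copy(*xarray,*yarray);                     /* y now holds x on the GPU */
ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);   /* x's validity flag is unchanged */
ierr = VecCUSPRestoreArrayWrite(y,&yarray);CHKERRQ(ierr);  /* y is marked PETSC_CUSP_GPU */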