Actual source code: veccusp2.cu

petsc-3.7.2 2016-06-05
Report Typos and Errors
  1: /*
  2:    Implements the sequential cusp vectors.
  3: */

  5: #define PETSC_SKIP_COMPLEX
  6: #define PETSC_SKIP_SPINLOCK

  8: #include <petscconf.h>
  9: #include <petsc/private/vecimpl.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 13: #include <cuda_runtime.h>

 17: /*
 18:     Allocates space for the vector array on the GPU if it does not exist.
 19:     Does NOT change the PetscCUSPFlag for the vector
 20:     Does NOT zero the CUSP array

 22:  */
 23: PetscErrorCode VecCUSPAllocateCheck(Vec v)
 24: {
 25:   cudaError_t    err;
 26:   cudaStream_t   stream;
 27:   Vec_CUSP       *veccusp;

 30:   if (!v->spptr) {
 31:     try {
 32:       v->spptr = new Vec_CUSP;
 33:       veccusp = (Vec_CUSP*)v->spptr;
 34:       veccusp->GPUarray = new CUSPARRAY;
 35:       veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
 36:       err = cudaStreamCreate(&stream);CHKERRCUSP(err);
 37:       veccusp->stream = stream;
 38:       veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 39:       v->ops->destroy = VecDestroy_SeqCUSP;
 40:       if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
 41:         if (v->data && ((Vec_Seq*)v->data)->array) {
 42:           v->valid_GPU_array = PETSC_CUSP_CPU;
 43:         } else {
 44:           v->valid_GPU_array = PETSC_CUSP_GPU;
 45:         }
 46:       }
 47:     } catch(char *ex) {
 48:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
 49:     }
 50:   }
 51:   return(0);
 52: }

 56: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 57: PetscErrorCode VecCUSPCopyToGPU(Vec v)
 58: {
 60:   cudaError_t    err;
 61:   Vec_CUSP       *veccusp;
 62:   CUSPARRAY      *varray;

 65:   VecCUSPAllocateCheck(v);
 66:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
 67:     PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
 68:     try {
 69:       veccusp=(Vec_CUSP*)v->spptr;
 70:       varray=veccusp->GPUarray;
 71:       err = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
 72:     } catch(char *ex) {
 73:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
 74:     }
 75:     PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
 76:     v->valid_GPU_array = PETSC_CUSP_BOTH;
 77:   }
 78:   return(0);
 79: }

 83: PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
 84: {
 85:   CUSPARRAY      *varray;
 87:   cudaError_t    err;
 88:   PetscScalar    *cpuPtr, *gpuPtr;
 89:   Vec_Seq        *s;
 90:   VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

 93:   VecCUSPAllocateCheck(v);
 94:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
 95:     s = (Vec_Seq*)v->data;

 97:     PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
 98:     varray = ((Vec_CUSP*)v->spptr)->GPUarray;
 99:     gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
100:     cpuPtr = s->array + ptop_scatter->recvLowestIndex;

102:     /* Note : this code copies the smallest contiguous chunk of data
103:        containing ALL of the indices */
104:     err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);

106:     // Set the buffer states
107:     v->valid_GPU_array = PETSC_CUSP_BOTH;
108:     PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
109:   }
110:   return(0);
111: }


116: /*
117:      VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
118: */
119: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
120: {
122:   cudaError_t    err;
123:   Vec_CUSP       *veccusp;
124:   CUSPARRAY      *varray;

127:   VecCUSPAllocateCheckHost(v);
128:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
129:     PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
130:     try {
131:       veccusp=(Vec_CUSP*)v->spptr;
132:       varray=veccusp->GPUarray;
133:       err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
134:     } catch(char *ex) {
135:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
136:     }
137:     PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
138:     v->valid_GPU_array = PETSC_CUSP_BOTH;
139:   }
140:   return(0);
141: }

145: /* Note that this function only copies *some* of the values up from the GPU to CPU,
146:    which means that we need recombine the data at some point before using any of the standard functions.
147:    We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
148:    where you have to always call in pairs
149: */
150: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
151: {
152:   CUSPARRAY      *varray;
154:   cudaError_t    err;
155:   PetscScalar    *cpuPtr, *gpuPtr;
156:   Vec_Seq        *s;
157:   VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

160:   VecCUSPAllocateCheckHost(v);
161:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
162:     PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);

164:     varray=((Vec_CUSP*)v->spptr)->GPUarray;
165:     s = (Vec_Seq*)v->data;
166:     gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
167:     cpuPtr = s->array + ptop_scatter->sendLowestIndex;

169:     /* Note : this code copies the smallest contiguous chunk of data
170:        containing ALL of the indices */
171:     err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);

173:     VecCUSPRestoreArrayRead(v,&varray);
174:     PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
175:   }
176:   return(0);
177: }

179: /*MC
180:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

182:    Options Database Keys:
183: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

185:   Level: beginner

187: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
188: M*/

190: /* for VecAYPX_SeqCUSP*/
191: namespace cusp
192: {
193: namespace blas
194: {
195: namespace detail
196: {
197:   template <typename T>
198:     struct AYPX : public thrust::binary_function<T,T,T>
199:     {
200:       T alpha;

202:       AYPX(T _alpha) : alpha(_alpha) {}

204:       __host__ __device__
205:       T operator()(T x, T y)
206:       {
207:         return alpha * y + x;
208:       }
209:     };
210: }

212:  template <typename ForwardIterator1,
213:            typename ForwardIterator2,
214:            typename ScalarType>
215: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
216:            {
217:              thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
218:            }
219:  template <typename Array1, typename Array2, typename ScalarType>
220:    void aypx(const Array1& x, Array2& y, ScalarType alpha)
221:  {
222: #if defined(CUSP_VERSION) && CUSP_VERSION >= 500
223:    cusp::assert_same_dimensions(x,y);
224: #else
225:    detail::assert_same_dimensions(x,y);
226: #endif
227:    aypx(x.begin(),x.end(),y.begin(),alpha);
228:  }
229: }
230: }

234: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
235: {
236:   CUSPARRAY      *xarray,*yarray;

240:   VecCUSPGetArrayRead(xin,&xarray);
241:   VecCUSPGetArrayReadWrite(yin,&yarray);
242:   try {
243:     if (alpha != 0.0) {
244:       cusp::blas::aypx(*xarray,*yarray,alpha);
245:       PetscLogFlops(2.0*yin->map->n);
246:     } else {
247:       cusp::blas::copy(*xarray,*yarray);
248:     }
249:     WaitForGPU();CHKERRCUSP(ierr);
250:   } catch(char *ex) {
251:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
252:   }
253:   VecCUSPRestoreArrayRead(xin,&xarray);
254:   VecCUSPRestoreArrayReadWrite(yin,&yarray);
255:   return(0);
256: }


261: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
262: {
263:   CUSPARRAY      *xarray,*yarray;

267:   if (alpha != 0.0) {
268:     VecCUSPGetArrayRead(xin,&xarray);
269:     VecCUSPGetArrayReadWrite(yin,&yarray);
270:     try {
271:       cusp::blas::axpy(*xarray,*yarray,alpha);
272:       WaitForGPU();CHKERRCUSP(ierr);
273:     } catch(char *ex) {
274:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
275:     }
276:     VecCUSPRestoreArrayRead(xin,&xarray);
277:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
278:     PetscLogFlops(2.0*yin->map->n);
279:   }
280:   return(0);
281: }

283: struct VecCUSPPointwiseDivide
284: {
285:   template <typename Tuple>
286:   __host__ __device__
287:   void operator()(Tuple t)
288:   {
289:     thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
290:   }
291: };

295: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
296: {
297:   CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;

301:   VecCUSPGetArrayRead(xin,&xarray);
302:   VecCUSPGetArrayRead(yin,&yarray);
303:   VecCUSPGetArrayWrite(win,&warray);
304:   try {
305:     thrust::for_each(
306:       thrust::make_zip_iterator(
307:         thrust::make_tuple(
308:           warray->begin(),
309:           xarray->begin(),
310:           yarray->begin())),
311:       thrust::make_zip_iterator(
312:         thrust::make_tuple(
313:           warray->end(),
314:           xarray->end(),
315:           yarray->end())),
316:       VecCUSPPointwiseDivide());
317:     WaitForGPU();CHKERRCUSP(ierr);
318:   } catch(char *ex) {
319:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
320:   }
321:   PetscLogFlops(win->map->n);
322:   VecCUSPRestoreArrayRead(xin,&xarray);
323:   VecCUSPRestoreArrayRead(yin,&yarray);
324:   VecCUSPRestoreArrayWrite(win,&warray);
325:   return(0);
326: }


329: struct VecCUSPWAXPY
330: {
331:   template <typename Tuple>
332:   __host__ __device__
333:   void operator()(Tuple t)
334:   {
335:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
336:   }
337: };

339: struct VecCUSPSum
340: {
341:   template <typename Tuple>
342:   __host__ __device__
343:   void operator()(Tuple t)
344:   {
345:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
346:   }
347: };

349: struct VecCUSPDiff
350: {
351:   template <typename Tuple>
352:   __host__ __device__
353:   void operator()(Tuple t)
354:   {
355:     thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
356:   }
357: };

361: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
362: {
363:   CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;

367:   if (alpha == 0.0) {
368:     VecCopy_SeqCUSP(yin,win);
369:   } else {
370:     VecCUSPGetArrayRead(xin,&xarray);
371:     VecCUSPGetArrayRead(yin,&yarray);
372:     VecCUSPGetArrayWrite(win,&warray);
373:     if (alpha == 1.0) {
374:       try {
375:         thrust::for_each(
376:           thrust::make_zip_iterator(
377:             thrust::make_tuple(
378:               warray->begin(),
379:               yarray->begin(),
380:               xarray->begin())),
381:           thrust::make_zip_iterator(
382:             thrust::make_tuple(
383:               warray->end(),
384:               yarray->end(),
385:               xarray->end())),
386:           VecCUSPSum());
387:       } catch(char *ex) {
388:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
389:       }
390:       PetscLogFlops(win->map->n);
391:     } else if (alpha == -1.0) {
392:       try {
393:         thrust::for_each(
394:           thrust::make_zip_iterator(
395:             thrust::make_tuple(
396:               warray->begin(),
397:               yarray->begin(),
398:               xarray->begin())),
399:           thrust::make_zip_iterator(
400:             thrust::make_tuple(
401:               warray->end(),
402:               yarray->end(),
403:               xarray->end())),
404:           VecCUSPDiff());
405:       } catch(char *ex) {
406:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
407:       }
408:       PetscLogFlops(win->map->n);
409:     } else {
410:       try {
411:         thrust::for_each(
412:           thrust::make_zip_iterator(
413:             thrust::make_tuple(
414:               warray->begin(),
415:               yarray->begin(),
416:               thrust::make_constant_iterator(alpha),
417:               xarray->begin())),
418:           thrust::make_zip_iterator(
419:             thrust::make_tuple(
420:               warray->end(),
421:               yarray->end(),
422:               thrust::make_constant_iterator(alpha),
423:               xarray->end())),
424:           VecCUSPWAXPY());
425:       } catch(char *ex) {
426:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
427:       }
428:       PetscLogFlops(2*win->map->n);
429:     }
430:     WaitForGPU();CHKERRCUSP(ierr);
431:     VecCUSPRestoreArrayRead(xin,&xarray);
432:     VecCUSPRestoreArrayRead(yin,&yarray);
433:     VecCUSPRestoreArrayWrite(win,&warray);
434:   }
435:   return(0);
436: }

438: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
439: struct VecCUSPMAXPY4
440: {
441:   template <typename Tuple>
442:   __host__ __device__
443:   void operator()(Tuple t)
444:   {
445:     /*y += a1*x1 +a2*x2 + 13*x3 +a4*x4 */
446:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
447:   }
448: };


451: struct VecCUSPMAXPY3
452: {
453:   template <typename Tuple>
454:   __host__ __device__
455:   void operator()(Tuple t)
456:   {
457:     /*y += a1*x1 +a2*x2 + a3*x3 */
458:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
459:   }
460: };

462: struct VecCUSPMAXPY2
463: {
464:   template <typename Tuple>
465:   __host__ __device__
466:   void operator()(Tuple t)
467:   {
468:     /*y += a1*x1 +a2*x2*/
469:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
470:   }
471: };
474: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
475: {
477:   CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
478:   PetscInt       n = xin->map->n,j,j_rem;
479:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

482:   PetscLogFlops(nv*2.0*n);
483:   VecCUSPGetArrayReadWrite(xin,&xarray);
484:   switch (j_rem=nv&0x3) {
485:   case 3:
486:     alpha0 = alpha[0];
487:     alpha1 = alpha[1];
488:     alpha2 = alpha[2];
489:     alpha += 3;
490:     VecCUSPGetArrayRead(y[0],&yy0);
491:     VecCUSPGetArrayRead(y[1],&yy1);
492:     VecCUSPGetArrayRead(y[2],&yy2);
493:     try {
494:       thrust::for_each(
495:         thrust::make_zip_iterator(
496:           thrust::make_tuple(
497:             xarray->begin(),
498:             thrust::make_constant_iterator(alpha0),
499:             yy0->begin(),
500:             thrust::make_constant_iterator(alpha1),
501:             yy1->begin(),
502:             thrust::make_constant_iterator(alpha2),
503:             yy2->begin())),
504:         thrust::make_zip_iterator(
505:           thrust::make_tuple(
506:             xarray->end(),
507:             thrust::make_constant_iterator(alpha0),
508:             yy0->end(),
509:             thrust::make_constant_iterator(alpha1),
510:             yy1->end(),
511:             thrust::make_constant_iterator(alpha2),
512:             yy2->end())),
513:         VecCUSPMAXPY3());
514:     } catch(char *ex) {
515:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
516:     }
517:     VecCUSPRestoreArrayRead(y[0],&yy0);
518:     VecCUSPRestoreArrayRead(y[1],&yy1);
519:     VecCUSPRestoreArrayRead(y[2],&yy2);
520:     y   += 3;
521:     break;
522:   case 2:
523:     alpha0 = alpha[0];
524:     alpha1 = alpha[1];
525:     alpha +=2;
526:     VecCUSPGetArrayRead(y[0],&yy0);
527:     VecCUSPGetArrayRead(y[1],&yy1);
528:     try {
529:       thrust::for_each(
530:         thrust::make_zip_iterator(
531:           thrust::make_tuple(
532:             xarray->begin(),
533:             thrust::make_constant_iterator(alpha0),
534:             yy0->begin(),
535:             thrust::make_constant_iterator(alpha1),
536:             yy1->begin())),
537:         thrust::make_zip_iterator(
538:           thrust::make_tuple(
539:             xarray->end(),
540:             thrust::make_constant_iterator(alpha0),
541:             yy0->end(),
542:             thrust::make_constant_iterator(alpha1),
543:             yy1->end())),
544:         VecCUSPMAXPY2());
545:     } catch(char *ex) {
546:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
547:     }
548:     y +=2;
549:     break;
550:   case 1:
551:     alpha0 = *alpha++;
552:     VecAXPY_SeqCUSP(xin,alpha0,y[0]);
553:     y     +=1;
554:     break;
555:   }
556:   for (j=j_rem; j<nv; j+=4) {
557:     alpha0 = alpha[0];
558:     alpha1 = alpha[1];
559:     alpha2 = alpha[2];
560:     alpha3 = alpha[3];
561:     alpha += 4;
562:     VecCUSPGetArrayRead(y[0],&yy0);
563:     VecCUSPGetArrayRead(y[1],&yy1);
564:     VecCUSPGetArrayRead(y[2],&yy2);
565:     VecCUSPGetArrayRead(y[3],&yy3);
566:     try {
567:       thrust::for_each(
568:         thrust::make_zip_iterator(
569:           thrust::make_tuple(
570:             xarray->begin(),
571:             thrust::make_constant_iterator(alpha0),
572:             yy0->begin(),
573:             thrust::make_constant_iterator(alpha1),
574:             yy1->begin(),
575:             thrust::make_constant_iterator(alpha2),
576:             yy2->begin(),
577:             thrust::make_constant_iterator(alpha3),
578:             yy3->begin())),
579:         thrust::make_zip_iterator(
580:           thrust::make_tuple(
581:             xarray->end(),
582:             thrust::make_constant_iterator(alpha0),
583:             yy0->end(),
584:             thrust::make_constant_iterator(alpha1),
585:             yy1->end(),
586:             thrust::make_constant_iterator(alpha2),
587:             yy2->end(),
588:             thrust::make_constant_iterator(alpha3),
589:             yy3->end())),
590:         VecCUSPMAXPY4());
591:     } catch(char *ex) {
592:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
593:     }
594:     VecCUSPRestoreArrayRead(y[0],&yy0);
595:     VecCUSPRestoreArrayRead(y[1],&yy1);
596:     VecCUSPRestoreArrayRead(y[2],&yy2);
597:     VecCUSPRestoreArrayRead(y[3],&yy3);
598:     y   += 4;
599:   }
600:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
601:   WaitForGPU();CHKERRCUSP(ierr);
602:   return(0);
603: }


608: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
609: {
610:   CUSPARRAY      *xarray,*yarray;
612:   //  PetscScalar    *xptr,*yptr,*zgpu;
613:   //PetscReal tmp;

616:   //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
617:   //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
618:   VecCUSPGetArrayRead(xin,&xarray);
619:   VecCUSPGetArrayRead(yin,&yarray);
620:   try {
621: #if defined(PETSC_USE_COMPLEX)
622:     *z = cusp::blas::dotc(*yarray,*xarray);
623: #else
624:     *z = cusp::blas::dot(*yarray,*xarray);
625: #endif
626:   } catch(char *ex) {
627:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
628:   }
629:   WaitForGPU();CHKERRCUSP(ierr);
630:   if (xin->map->n >0) {
631:     PetscLogFlops(2.0*xin->map->n-1);
632:   }
633:   VecCUSPRestoreArrayRead(xin,&xarray);
634:   VecCUSPRestoreArrayRead(yin,&yarray);
635:   return(0);
636: }

638: //
639: // CUDA kernels for MDot to follow
640: //

642: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
643: #define MDOT_WORKGROUP_SIZE 128
644: #define MDOT_WORKGROUP_NUM  128

646: // M = 2:
647: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
648:                                         PetscInt size, PetscScalar *group_results)
649: {
650:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
651:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
652:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
653:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
654:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

656:   PetscScalar entry_x    = 0;
657:   PetscScalar group_sum0 = 0;
658:   PetscScalar group_sum1 = 0;
659:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
660:     entry_x     = x[i];   // load only once from global memory!
661:     group_sum0 += entry_x * y0[i];
662:     group_sum1 += entry_x * y1[i];
663:   }
664:   tmp_buffer[threadIdx.x]                       = group_sum0;
665:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

667:   // parallel reduction
668:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
669:     __syncthreads();
670:     if (threadIdx.x < stride) {
671:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
672:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
673:     }
674:   }

676:   // write result of group to group_results
677:   if (threadIdx.x == 0) {
678:     group_results[blockIdx.x]             = tmp_buffer[0];
679:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
680:   }
681: }

683: // M = 3:
684: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
685:                                         PetscInt size, PetscScalar *group_results)
686: {
687:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
688:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
689:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
690:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
691:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

693:   PetscScalar entry_x    = 0;
694:   PetscScalar group_sum0 = 0;
695:   PetscScalar group_sum1 = 0;
696:   PetscScalar group_sum2 = 0;
697:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
698:     entry_x     = x[i];   // load only once from global memory!
699:     group_sum0 += entry_x * y0[i];
700:     group_sum1 += entry_x * y1[i];
701:     group_sum2 += entry_x * y2[i];
702:   }
703:   tmp_buffer[threadIdx.x]                           = group_sum0;
704:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
705:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

707:   // parallel reduction
708:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
709:     __syncthreads();
710:     if (threadIdx.x < stride) {
711:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
712:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
713:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
714:     }
715:   }

717:   // write result of group to group_results
718:   if (threadIdx.x == 0) {
719:     group_results[blockIdx.x                ] = tmp_buffer[0];
720:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
721:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
722:   }
723: }

725: // M = 4:
726: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
727:                                         PetscInt size, PetscScalar *group_results)
728: {
729:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
730:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
731:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
732:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
733:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

735:   PetscScalar entry_x    = 0;
736:   PetscScalar group_sum0 = 0;
737:   PetscScalar group_sum1 = 0;
738:   PetscScalar group_sum2 = 0;
739:   PetscScalar group_sum3 = 0;
740:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
741:     entry_x     = x[i];   // load only once from global memory!
742:     group_sum0 += entry_x * y0[i];
743:     group_sum1 += entry_x * y1[i];
744:     group_sum2 += entry_x * y2[i];
745:     group_sum3 += entry_x * y3[i];
746:   }
747:   tmp_buffer[threadIdx.x]                           = group_sum0;
748:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
749:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
750:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

752:   // parallel reduction
753:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
754:     __syncthreads();
755:     if (threadIdx.x < stride) {
756:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
757:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
758:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
759:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
760:     }
761:   }

763:   // write result of group to group_results
764:   if (threadIdx.x == 0) {
765:     group_results[blockIdx.x                ] = tmp_buffer[0];
766:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
767:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
768:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
769:   }
770: }

772: // M = 8:
773: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
774:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
775:                                           PetscInt size, PetscScalar *group_results)
776: {
777:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
778:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
779:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
780:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
781:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

783:   PetscScalar entry_x    = 0;
784:   PetscScalar group_sum0 = 0;
785:   PetscScalar group_sum1 = 0;
786:   PetscScalar group_sum2 = 0;
787:   PetscScalar group_sum3 = 0;
788:   PetscScalar group_sum4 = 0;
789:   PetscScalar group_sum5 = 0;
790:   PetscScalar group_sum6 = 0;
791:   PetscScalar group_sum7 = 0;
792:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
793:     entry_x     = x[i];   // load only once from global memory!
794:     group_sum0 += entry_x * y0[i];
795:     group_sum1 += entry_x * y1[i];
796:     group_sum2 += entry_x * y2[i];
797:     group_sum3 += entry_x * y3[i];
798:     group_sum4 += entry_x * y4[i];
799:     group_sum5 += entry_x * y5[i];
800:     group_sum6 += entry_x * y6[i];
801:     group_sum7 += entry_x * y7[i];
802:   }
803:   tmp_buffer[threadIdx.x]                           = group_sum0;
804:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
805:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
806:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
807:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
808:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
809:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
810:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

812:   // parallel reduction
813:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
814:     __syncthreads();
815:     if (threadIdx.x < stride) {
816:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
817:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
818:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
819:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
820:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
821:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
822:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
823:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
824:     }
825:   }

827:   // write result of group to group_results
828:   if (threadIdx.x == 0) {
829:     group_results[blockIdx.x                ] = tmp_buffer[0];
830:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
831:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
832:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
833:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
834:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
835:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
836:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
837:   }
838: }


843: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
844: {
846:   PetscInt       i,j,n = xin->map->n,current_y_index = 0;
847:   CUSPARRAY      *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
848:   PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
849:   PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
850:   cudaError_t    cuda_ierr;

853:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
854:   /* Handle the case of local size zero first */
855:   if (!xin->map->n) {
856:     for (i=0; i<nv; ++i) z[i] = 0;
857:     return(0);
858:   }

860:   // allocate scratchpad memory for the results of individual work groups:
861:   cuda_cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
862:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

864:   VecCUSPGetArrayRead(xin,&xarray);
865:   xptr = thrust::raw_pointer_cast(xarray->data());

867:   while (current_y_index < nv)
868:   {
869:     switch (nv - current_y_index) {

871:     case 7:
872:     case 6:
873:     case 5:
874:     case 4:
875:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
876:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
877:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
878:       VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);

880: #if defined(PETSC_USE_COMPLEX)
881:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
882:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
883:       z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
884:       z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
885: #else
886:       // extract raw device pointers:
887:       y0ptr = thrust::raw_pointer_cast(y0array->data());
888:       y1ptr = thrust::raw_pointer_cast(y1array->data());
889:       y2ptr = thrust::raw_pointer_cast(y2array->data());
890:       y3ptr = thrust::raw_pointer_cast(y3array->data());

892:       // run kernel:
893:       VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

895:       // copy results back to
896:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
897:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

899:       // sum group results into z:
900:       for (j=0; j<4; ++j) {
901:         z[current_y_index + j] = 0;
902:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
903:       }
904: #endif
905:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
906:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
907:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
908:       VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
909:       current_y_index += 4;
910:       break;

912:     case 3:
913:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
914:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
915:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);

917: #if defined(PETSC_USE_COMPLEX)
918:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
919:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
920:       z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
921: #else
922:       // extract raw device pointers:
923:       y0ptr = thrust::raw_pointer_cast(y0array->data());
924:       y1ptr = thrust::raw_pointer_cast(y1array->data());
925:       y2ptr = thrust::raw_pointer_cast(y2array->data());

927:       // run kernel:
928:       VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

930:       // copy results back to
931:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
932:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

934:       // sum group results into z:
935:       for (j=0; j<3; ++j) {
936:         z[current_y_index + j] = 0;
937:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
938:       }
939: #endif

941:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
942:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
943:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
944:       current_y_index += 3;
945:       break;

947:     case 2:
948:       VecCUSPGetArrayRead(yin[current_y_index],&y0array);
949:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);

951: #if defined(PETSC_USE_COMPLEX)
952:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
953:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
954: #else
955:       // extract raw device pointers:
956:       y0ptr = thrust::raw_pointer_cast(y0array->data());
957:       y1ptr = thrust::raw_pointer_cast(y1array->data());

959:       // run kernel:
960:       VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

962:       // copy results back to
963:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
964:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

966:       // sum group results into z:
967:       for (j=0; j<2; ++j) {
968:         z[current_y_index + j] = 0;
969:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
970:       }
971: #endif
972:       VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
973:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
974:       current_y_index += 2;
975:       break;

977:     case 1:
978:       VecCUSPGetArrayRead(yin[current_y_index],&y0array);
979: #if defined(PETSC_USE_COMPLEX)
980:       z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
981: #else
982:       z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
983: #endif
984:       VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
985:       current_y_index += 1;
986:       break;

988:     default: // 8 or more vectors left
989:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
990:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
991:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
992:       VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
993:       VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
994:       VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
995:       VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
996:       VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);

998: #if defined(PETSC_USE_COMPLEX)
999:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
1000:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1001:       z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1002:       z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1003:       z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
1004:       z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
1005:       z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
1006:       z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
1007: #else
1008:       // extract raw device pointers:
1009:       y0ptr = thrust::raw_pointer_cast(y0array->data());
1010:       y1ptr = thrust::raw_pointer_cast(y1array->data());
1011:       y2ptr = thrust::raw_pointer_cast(y2array->data());
1012:       y3ptr = thrust::raw_pointer_cast(y3array->data());
1013:       y4ptr = thrust::raw_pointer_cast(y4array->data());
1014:       y5ptr = thrust::raw_pointer_cast(y5array->data());
1015:       y6ptr = thrust::raw_pointer_cast(y6array->data());
1016:       y7ptr = thrust::raw_pointer_cast(y7array->data());

1018:       // run kernel:
1019:       VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

1021:       // copy results back to
1022:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1023:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

1025:       // sum group results into z:
1026:       for (j=0; j<8; ++j) {
1027:         z[current_y_index + j] = 0;
1028:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1029:       }
1030: #endif
1031:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
1032:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1033:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1034:       VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1035:       VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1036:       VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1037:       VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1038:       VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1039:       current_y_index += 8;
1040:       break;
1041:     }
1042:   }
1043:   VecCUSPRestoreArrayRead(xin,&xarray);

1045:   cuda_cudaFree(group_results_gpu);
1046:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host: %d", (int)cuda_ierr);
1047:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1048:   return(0);
1049: }

1051: #undef MDOT_WORKGROUP_SIZE
1052: #undef MDOT_WORKGROUP_NUM



1058: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1059: {
1060:   CUSPARRAY      *xarray=NULL;

1064:   /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1065:   VecCUSPGetArrayWrite(xin,&xarray);
1066:   try {
1067:     cusp::blas::fill(*xarray,alpha);
1068:   } catch(char *ex) {
1069:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1070:   }
1071:   WaitForGPU();CHKERRCUSP(ierr);
1072:   VecCUSPRestoreArrayWrite(xin,&xarray);
1073:   return(0);
1074: }

1078: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1079: {
1080:   CUSPARRAY      *xarray;

1084:   if (alpha == 0.0) {
1085:     VecSet_SeqCUSP(xin,alpha);
1086:   } else if (alpha != 1.0) {
1087:     VecCUSPGetArrayReadWrite(xin,&xarray);
1088:     try {
1089:       cusp::blas::scal(*xarray,alpha);
1090:     } catch(char *ex) {
1091:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1092:     }
1093:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1094:   }
1095:   WaitForGPU();CHKERRCUSP(ierr);
1096:   PetscLogFlops(xin->map->n);
1097:   return(0);
1098: }

1102: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1103: {
1104:   CUSPARRAY      *xarray,*yarray;

1108:   //#if defined(PETSC_USE_COMPLEX)
1109:   /*Not working for complex*/
1110:   //#else
1111:   VecCUSPGetArrayRead(xin,&xarray);
1112:   VecCUSPGetArrayRead(yin,&yarray);
1113:   try {
1114:     *z = cusp::blas::dot(*xarray,*yarray);
1115:   } catch(char *ex) {
1116:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1117:   }
1118:   //#endif
1119:   WaitForGPU();CHKERRCUSP(ierr);
1120:   if (xin->map->n > 0) {
1121:     PetscLogFlops(2.0*xin->map->n-1);
1122:   }
1123:   VecCUSPRestoreArrayRead(yin,&yarray);
1124:   VecCUSPRestoreArrayRead(xin,&xarray);
1125:   return(0);
1126: }

1130: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1131: {
1132:   CUSPARRAY      *xarray,*yarray;

1136:   if (xin != yin) {
1137:     if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1138:       VecCUSPGetArrayRead(xin,&xarray);
1139:       VecCUSPGetArrayWrite(yin,&yarray);
1140:       try {
1141:         cusp::blas::copy(*xarray,*yarray);
1142:       } catch(char *ex) {
1143:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1144:       }
1145:       WaitForGPU();CHKERRCUSP(ierr);
1146:       VecCUSPRestoreArrayRead(xin,&xarray);
1147:       VecCUSPRestoreArrayWrite(yin,&yarray);

1149:     } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1150:       /* copy in CPU if we are on the CPU*/
1151:       VecCopy_SeqCUSP_Private(xin,yin);
1152:     } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1153:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1154:       if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1155:         /* copy in CPU */
1156:         VecCopy_SeqCUSP_Private(xin,yin);

1158:       } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1159:         /* copy in GPU */
1160:         VecCUSPGetArrayRead(xin,&xarray);
1161:         VecCUSPGetArrayWrite(yin,&yarray);
1162:         try {
1163:           cusp::blas::copy(*xarray,*yarray);
1164:           WaitForGPU();CHKERRCUSP(ierr);
1165:         } catch(char *ex) {
1166:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1167:         }
1168:         VecCUSPRestoreArrayRead(xin,&xarray);
1169:         VecCUSPRestoreArrayWrite(yin,&yarray);
1170:       } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1171:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
1172:            default to copy in GPU (this is an arbitrary choice) */
1173:         VecCUSPGetArrayRead(xin,&xarray);
1174:         VecCUSPGetArrayWrite(yin,&yarray);
1175:         try {
1176:           cusp::blas::copy(*xarray,*yarray);
1177:           WaitForGPU();CHKERRCUSP(ierr);
1178:         } catch(char *ex) {
1179:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1180:         }
1181:         VecCUSPRestoreArrayRead(xin,&xarray);
1182:         VecCUSPRestoreArrayWrite(yin,&yarray);
1183:       } else {
1184:         VecCopy_SeqCUSP_Private(xin,yin);
1185:       }
1186:     }
1187:   }
1188:   return(0);
1189: }


1194: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1195: {
1197:   PetscBLASInt   one = 1,bn;
1198:   CUSPARRAY      *xarray,*yarray;
1199:   cublasStatus_t cberr;

1202:   PetscBLASIntCast(xin->map->n,&bn);
1203:   if (xin != yin) {
1204:     VecCUSPGetArrayReadWrite(xin,&xarray);
1205:     VecCUSPGetArrayReadWrite(yin,&yarray);

1207: #if defined(PETSC_USE_COMPLEX)
1208: #if defined(PETSC_USE_REAL_SINGLE)
1209:     cberr = cublasCswap(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1210: #else
1211:     cberr = cublasZswap(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1212: #endif
1213: #else
1214: #if defined(PETSC_USE_REAL_SINGLE)
1215:     cberr = cublasSswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1216: #else
1217:     cberr = cublasDswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1218: #endif
1219: #endif
1220:     WaitForGPU();CHKERRCUSP(ierr);
1221:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1222:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1223:   }
1224:   return(0);
1225: }

1227: struct VecCUSPAX
1228: {
1229:   template <typename Tuple>
1230:   __host__ __device__
1231:   void operator()(Tuple t)
1232:   {
1233:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1234:   }
1235: };

1239: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1240: {
1242:   PetscScalar    a = alpha,b = beta;
1243:   CUSPARRAY      *xarray,*yarray;

1246:   if (a == 0.0) {
1247:     VecScale_SeqCUSP(yin,beta);
1248:   } else if (b == 1.0) {
1249:     VecAXPY_SeqCUSP(yin,alpha,xin);
1250:   } else if (a == 1.0) {
1251:     VecAYPX_SeqCUSP(yin,beta,xin);
1252:   } else if (b == 0.0) {
1253:     VecCUSPGetArrayRead(xin,&xarray);
1254:     VecCUSPGetArrayReadWrite(yin,&yarray);
1255:     try {
1256:       thrust::for_each(
1257:         thrust::make_zip_iterator(
1258:           thrust::make_tuple(
1259:             yarray->begin(),
1260:             thrust::make_constant_iterator(a),
1261:             xarray->begin())),
1262:         thrust::make_zip_iterator(
1263:           thrust::make_tuple(
1264:             yarray->end(),
1265:             thrust::make_constant_iterator(a),
1266:             xarray->end())),
1267:         VecCUSPAX());
1268:     } catch(char *ex) {
1269:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1270:     }
1271:     PetscLogFlops(xin->map->n);
1272:     WaitForGPU();CHKERRCUSP(ierr);
1273:     VecCUSPRestoreArrayRead(xin,&xarray);
1274:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1275:   } else {
1276:     VecCUSPGetArrayRead(xin,&xarray);
1277:     VecCUSPGetArrayReadWrite(yin,&yarray);
1278:     try {
1279:       cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1280:     } catch(char *ex) {
1281:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1282:     }
1283:     VecCUSPRestoreArrayRead(xin,&xarray);
1284:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1285:     WaitForGPU();CHKERRCUSP(ierr);
1286:     PetscLogFlops(3.0*xin->map->n);
1287:   }
1288:   return(0);
1289: }

1291: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1292: struct VecCUSPXPBYPCZ
1293: {
1294:   /* z = x + b*y + c*z */
1295:   template <typename Tuple>
1296:   __host__ __device__
1297:   void operator()(Tuple t)
1298:   {
1299:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1300:   }
1301: };

1303: struct VecCUSPAXPBYPZ
1304: {
1305:   /* z = ax + b*y + z */
1306:   template <typename Tuple>
1307:   __host__ __device__
1308:   void operator()(Tuple t)
1309:   {
1310:     thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1311:   }
1312: };

1316: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1317: {
1319:   PetscInt       n = zin->map->n;
1320:   CUSPARRAY      *xarray,*yarray,*zarray;

1323:   VecCUSPGetArrayRead(xin,&xarray);
1324:   VecCUSPGetArrayRead(yin,&yarray);
1325:   VecCUSPGetArrayReadWrite(zin,&zarray);
1326:   if (alpha == 1.0) {
1327:     try {
1328:       thrust::for_each(
1329:         thrust::make_zip_iterator(
1330:           thrust::make_tuple(
1331:             zarray->begin(),
1332:             thrust::make_constant_iterator(gamma),
1333:             xarray->begin(),
1334:             yarray->begin(),
1335:             thrust::make_constant_iterator(beta))),
1336:         thrust::make_zip_iterator(
1337:           thrust::make_tuple(
1338:             zarray->end(),
1339:             thrust::make_constant_iterator(gamma),
1340:             xarray->end(),
1341:             yarray->end(),
1342:             thrust::make_constant_iterator(beta))),
1343:         VecCUSPXPBYPCZ());
1344:     } catch(char *ex) {
1345:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1346:     }
1347:     PetscLogFlops(4.0*n);
1348:   } else if (gamma == 1.0) {
1349:     try {
1350:       thrust::for_each(
1351:         thrust::make_zip_iterator(
1352:           thrust::make_tuple(
1353:             zarray->begin(),
1354:             xarray->begin(),
1355:             thrust::make_constant_iterator(alpha),
1356:             yarray->begin(),
1357:             thrust::make_constant_iterator(beta))),
1358:         thrust::make_zip_iterator(
1359:           thrust::make_tuple(
1360:             zarray->end(),
1361:             xarray->end(),
1362:             thrust::make_constant_iterator(alpha),
1363:             yarray->end(),
1364:             thrust::make_constant_iterator(beta))),
1365:         VecCUSPAXPBYPZ());
1366:     } catch(char *ex) {
1367:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1368:     }
1369:     PetscLogFlops(4.0*n);
1370:   } else {
1371:     try {
1372:       cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1373:     } catch(char *ex) {
1374:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1375:     }
1376:     VecCUSPRestoreArrayReadWrite(zin,&zarray);
1377:     VecCUSPRestoreArrayRead(xin,&xarray);
1378:     VecCUSPRestoreArrayRead(yin,&yarray);
1379:     PetscLogFlops(5.0*n);
1380:   }
1381:   WaitForGPU();CHKERRCUSP(ierr);
1382:   return(0);
1383: }

1387: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1388: {
1390:   PetscInt       n = win->map->n;
1391:   CUSPARRAY      *xarray,*yarray,*warray;

1394:   VecCUSPGetArrayRead(xin,&xarray);
1395:   VecCUSPGetArrayRead(yin,&yarray);
1396:   VecCUSPGetArrayReadWrite(win,&warray);
1397:   try {
1398:     cusp::blas::xmy(*xarray,*yarray,*warray);
1399:   } catch(char *ex) {
1400:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1401:   }
1402:   VecCUSPRestoreArrayRead(xin,&xarray);
1403:   VecCUSPRestoreArrayRead(yin,&yarray);
1404:   VecCUSPRestoreArrayReadWrite(win,&warray);
1405:   PetscLogFlops(n);
1406:   WaitForGPU();CHKERRCUSP(ierr);
1407:   return(0);
1408: }


1411: /* should do infinity norm in cusp */

1415: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1416: {
1417:   const PetscScalar *xx;
1418:   PetscErrorCode    ierr;
1419:   PetscInt          n = xin->map->n;
1420:   PetscBLASInt      one = 1, bn;
1421:   CUSPARRAY         *xarray;
1422:   cublasStatus_t    cberr;

1425:   PetscBLASIntCast(n,&bn);
1426:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1427:     VecCUSPGetArrayRead(xin,&xarray);
1428:     try {
1429:       *z = cusp::blas::nrm2(*xarray);
1430:     } catch(char *ex) {
1431:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1432:     }
1433:     WaitForGPU();CHKERRCUSP(ierr);
1434:     VecCUSPRestoreArrayRead(xin,&xarray);
1435:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1436:   } else if (type == NORM_INFINITY) {
1437:     PetscInt  i;
1438:     PetscReal max = 0.0,tmp;

1440:     VecGetArrayRead(xin,&xx);
1441:     for (i=0; i<n; i++) {
1442:       if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1443:       /* check special case of tmp == NaN */
1444:       if (tmp != tmp) {max = tmp; break;}
1445:       xx++;
1446:     }
1447:     VecRestoreArrayRead(xin,&xx);
1448:     *z   = max;
1449:   } else if (type == NORM_1) {
1450:     VecCUSPGetArrayRead(xin,&xarray);
1451: #if defined(PETSC_USE_COMPLEX)
1452: #if defined(PETSC_USE_REAL_SINGLE)
1453:     cberr = cublasSasum(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1454: #else
1455:     cberr = cublasDasum(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1456: #endif
1457: #else
1458: #if defined(PETSC_USE_REAL_SINGLE)
1459:     cberr = cublasSasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1460: #else
1461:     cberr = cublasDasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1462: #endif
1463: #endif
1464:     VecCUSPRestoreArrayRead(xin,&xarray);
1465:     WaitForGPU();CHKERRCUSP(ierr);
1466:     PetscLogFlops(PetscMax(n-1.0,0.0));
1467:   } else if (type == NORM_1_AND_2) {
1468:     VecNorm_SeqCUSP(xin,NORM_1,z);
1469:     VecNorm_SeqCUSP(xin,NORM_2,z+1);
1470:   }
1471:   return(0);
1472: }

1474: /*The following template functions are for VecDotNorm2_SeqCUSP.  Note that there is no complex support as currently written*/
1475: template <typename T>
1476: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1477: {
1478:   __host__ __device__
1479:   T operator()(T x)
1480:   {
1481: #if defined(PETSC_USE_COMPLEX)
1482:     //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1483: #else
1484:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1485: #endif
1486:   }
1487: };

1489: template <typename T>
1490: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1491: {
1492:   __host__ __device__
1493:   T operator()(T x,T y)
1494:   {
1495:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1496:   }
1497: };

1501: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1502: {
1503:   PetscErrorCode                         ierr;
1504:   PetscScalar                            zero = 0.0;
1505:   PetscReal                              n=s->map->n;
1506:   thrust::tuple<PetscScalar,PetscScalar> result;
1507:   CUSPARRAY                              *sarray,*tarray;

1510:   /*VecCUSPCopyToGPU(s);
1511:    VecCUSPCopyToGPU(t);*/
1512:   VecCUSPGetArrayRead(s,&sarray);
1513:   VecCUSPGetArrayRead(t,&tarray);
1514:   try {
1515: #if defined(PETSC_USE_COMPLEX)
1516:     VecDot_SeqCUSP(s,t,dp);
1517:     VecDot_SeqCUSP(t,t,nm);
1518:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1519:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1520: #else
1521:     result = thrust::transform_reduce(
1522:               thrust::make_zip_iterator(
1523:                 thrust::make_tuple(
1524:                   sarray->begin(),
1525:                   tarray->begin())),
1526:               thrust::make_zip_iterator(
1527:                 thrust::make_tuple(
1528:                   sarray->end(),
1529:                   tarray->end())),
1530:               cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1531:               thrust::make_tuple(zero,zero),                                   /*init */
1532:               cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >());  /* binary function */
1533:     *dp = thrust::get<0>(result);
1534:     *nm = thrust::get<1>(result);
1535: #endif
1536:   } catch(char *ex) {
1537:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1538:   }
1539:   VecCUSPRestoreArrayRead(s,&sarray);
1540:   VecCUSPRestoreArrayRead(t,&tarray);
1541:   WaitForGPU();CHKERRCUSP(ierr);
1542:   PetscLogFlops(4.0*n);
1543:   return(0);
1544: }

1548: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1549: {
1551:   cudaError_t    err;

1554:   try {
1555:     if (v->spptr) {
1556:       delete ((Vec_CUSP*)v->spptr)->GPUarray;
1557:       err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1558:       delete (Vec_CUSP*)v->spptr;
1559:     }
1560:   } catch(char *ex) {
1561:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1562:   }
1563:   VecDestroy_SeqCUSP_Private(v);
1564:   return(0);
1565: }


1568: #if defined(PETSC_USE_COMPLEX)
1569: struct conjugate
1570: {
1571:   __host__ __device__
1572:   PetscScalar operator()(PetscScalar x)
1573:   {
1574:     return cusp::conj(x);
1575:   }
1576: };
1577: #endif


1582: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1583: {
1585:   CUSPARRAY      *xarray;

1588:   VecCUSPGetArrayReadWrite(xin,&xarray);
1589: #if defined(PETSC_USE_COMPLEX)
1590:   thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1591: #endif
1592:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
1593:   return(0);
1594: }

1598: PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
1599: {
1600:   VecType        t;
1602:   cudaError_t    err;
1603:   PetscBool      flg;

1608:   VecGetType(w,&t);
1609:   PetscStrcmp(t,VECSEQCUSP,&flg);
1610:   if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);

1612:   if (w->data) {
1613:     if (((Vec_Seq*)w->data)->array_allocated) PetscFree(((Vec_Seq*)w->data)->array_allocated);
1614:     ((Vec_Seq*)w->data)->array = 0;
1615:     ((Vec_Seq*)w->data)->array_allocated = 0;
1616:     ((Vec_Seq*)w->data)->unplacedarray = 0;
1617:   }
1618:   if (w->spptr) {
1619:     if (((Vec_CUSP*)w->spptr)->GPUarray) delete ((Vec_CUSP*)w->spptr)->GPUarray;
1620:     err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1621:     delete (Vec_CUSP*)w->spptr;
1622:     w->spptr = 0;
1623:   }

1625:   if (v->petscnative) {
1626:     PetscFree(w->data);
1627:     w->data = v->data;
1628:     w->valid_GPU_array = v->valid_GPU_array;
1629:     w->spptr = v->spptr;
1630:     PetscObjectStateIncrease((PetscObject)w);
1631:   } else {
1632:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1633:     w->valid_GPU_array = PETSC_CUSP_CPU;
1634:     VecCUSPAllocateCheck(w);
1635:   }
1636:   return(0);
1637: }

1641: PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
1642: {
1643:   VecType        t;
1645:   cudaError_t    err;
1646:   PetscBool      flg;

1651:   VecGetType(w,&t);
1652:   PetscStrcmp(t,VECSEQCUSP,&flg);
1653:   if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);

1655:   if (v->petscnative) {
1656:     v->data = w->data;
1657:     v->valid_GPU_array = w->valid_GPU_array;
1658:     v->spptr = w->spptr;
1659:     VecCUSPCopyFromGPU(v);
1660:     PetscObjectStateIncrease((PetscObject)v);
1661:     w->data = 0;
1662:     w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
1663:     w->spptr = 0;
1664:   } else {
1665:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1666:     if ((Vec_CUSP*)w->spptr) {
1667:       delete ((Vec_CUSP*)w->spptr)->GPUarray;
1668:       err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1669:       delete (Vec_CUSP*)w->spptr;
1670:     }
1671:   }
1672:   return(0);
1673: }

1677: /*@C
1678:    VecCUSPGetArrayReadWrite - Provides access to the CUSP vector inside a vector.

1680:    This function has semantics similar to VecGetArray():  the CUSP
1681:    vector returned by this function points to a consistent view of the
1682:    vector data.  This may involve a copy operation of data from the host
1683:    to the device if the data on the device is out of date.  If the
1684:    device memory hasn't been allocated previously it will be allocated
1685:    as part of this function call.  VecCUSPGetArrayReadWrite() assumes
1686:    that the user will modify the vector data.  This is similar to
1687:    intent(inout) in fortran.

1689:    The CUSP device vector has to be released by calling
1690:    VecCUSPRestoreArrayReadWrite().  Upon restoring the vector data the
1691:    data on the host will be marked as out of date.  A subsequent access
1692:    of the host data will thus incur a data transfer from the device to
1693:    the host.


1696:    Input Parameter:
1697: .  v - the vector

1699:    Output Parameter:
1700: .  a - the CUSP device vector
1701:    
1702:    Fortran note: This function is not currently available from Fortran.

1704:    Fortran note:
1705:    This function is not currently available from Fortran.

1707:    Level: intermediate

1709: .seealso: VecCUSPRestoreArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1710: @*/
1711: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1712: {

1716:   *a   = 0;
1717:   VecCUSPCopyToGPU(v);
1718:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1719:   return(0);
1720: }

1724: /*@C
1725:    VecCUSPRestoreArrayReadWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayReadWrite().

1727:    This marks the host data as out of date.  Subsequent access to the
1728:    vector data on the host side with for instance VecGetArray() incurs a
1729:    data transfer.

1731:    Input Parameter:
1732: +  v - the vector
1733: -  a - the CUSP device vector.  This pointer is invalid after
1734:        VecCUSPRestoreArrayReadWrite() returns.

1736:    Fortran note:
1737:    This function is not currently available from Fortran.

1739:    Level: intermediate

1741: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1742: @*/
1743: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1744: {

1748:   v->valid_GPU_array = PETSC_CUSP_GPU;

1750:   PetscObjectStateIncrease((PetscObject)v);
1751:   return(0);
1752: }

1756: /*@C
1757:    VecCUSPGetArrayRead - Provides read access to the CUSP device vector inside a vector.

1759:    This function is analogous to VecGetArrayRead():  The CUSP vector
1760:    returned by this function points to a consistent view of the vector
1761:    data.  This may involve a copy operation of data from the host to the
1762:    device if the data on the device is out of date.  If the device
1763:    memory hasn't been allocated previously it will be allocated as part
1764:    of this function call.  VecCUSPGetArrayRead() assumes that the user
1765:    will not modify the vector data.  This is analogous to intent(in) in
1766:    Fortran.

1768:    The CUSP device vector has to be released by calling
1769:    VecCUSPRestoreArrayRead().  If the data on the host side was
1770:    previously up to date it will remain so, i.e. data on both the device
1771:    and the host is up to date.  Accessing data on the host side does not
1772:    incur a device to host data transfer.

1774:    Input Parameter:
1775: .  v - the vector

1777:    Output Parameter:
1778: .  a - the CUSP device vector

1780:    Fortran note:
1781:    This function is not currently available from Fortran.

1783:    Level: intermediate

1785: .seealso: VecCUSPRestoreArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecGetArrayRead()
1786: @*/
1787: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1788: {

1792:   *a   = 0;
1793:   VecCUSPCopyToGPU(v);
1794:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1795:   return(0);
1796: }

1800: /*@C
1801:    VecCUSPRestoreArrayRead - Restore a CUSP device vector previously acquired with VecCUSPGetArrayRead().

1803:    If the data on the host side was previously up to date it will remain
1804:    so, i.e. data on both the device and the host is up to date.
1805:    Accessing data on the host side e.g. with VecGetArray() does not
1806:    incur a device to host data transfer.

1808:    Input Parameter:
1809: +  v - the vector
1810: -  a - the CUSP device vector.  This pointer is invalid after
1811:        VecCUSPRestoreArrayRead() returns.

1813:    Fortran note:
1814:    This function is not currently available from Fortran.

1816:    Level: intermediate

1818: .seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1819: @*/
1820: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1821: {
1823:   return(0);
1824: }

1828: /*@C
1829:    VecCUSPGetArrayWrite - Provides write access to the CUSP device vector inside a vector.

1831:    The data pointed to by the device vector is uninitialized.  The user
1832:    must not read this data.  Furthermore, the entire array needs to be
1833:    filled by the user to obtain well-defined behaviour.  The device
1834:    memory will be allocated by this function if it hasn't been allocated
1835:    previously.  This is analogous to intent(out) in Fortran.

1837:    The CUSP device vector needs to be released with
1838:    VecCUSPRestoreArrayWrite().  When the pointer is released the host
1839:    data of the vector is marked as out of data.  Subsequent access of
1840:    the host data with e.g. VecGetArray() incurs a device to host data
1841:    transfer.


1844:    Input Parameter:
1845: .  v - the vector

1847:    Output Parameter:
1848: .  a - the CUDA pointer

1850:    Fortran note:
1851:    This function is not currently available from Fortran.

1853:    Level: intermediate

1855: .seealso: VecCUSPRestoreArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1856: @*/
1857: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1858: {

1862:   *a   = 0;
1863:   VecCUSPAllocateCheck(v);
1864:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1865:   return(0);
1866: }

1870: /*@C
1871:    VecCUSPRestoreArrayWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayWrite().

1873:    Data on the host will be marked as out of date.  Subsequent access of
1874:    the data on the host side e.g. with VecGetArray() will incur a device
1875:    to host data transfer.

1877:    Input Parameter:
1878: +  v - the vector
1879: -  a - the CUDA device pointer.  This pointer is invalid after
1880:        VecCUSPRestoreArrayWrite() returns.

1882:    Fortran note:
1883:    This function is not currently available from Fortran.

1885:    Level: intermediate

1887: .seealso: VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1888: @*/
1889: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
1890: {

1894:   v->valid_GPU_array = PETSC_CUSP_GPU;

1896:   PetscObjectStateIncrease((PetscObject)v);
1897:   return(0);
1898: }

1902: /*@C
1903:    VecCUSPGetCUDAArrayReadWrite - Provides access to the CUDA buffer inside a vector.

1905:    This function has semantics similar to VecGetArray():  the pointer
1906:    returned by this function points to a consistent view of the vector
1907:    data.  This may involve a copy operation of data from the host to the
1908:    device if the data on the device is out of date.  If the device
1909:    memory hasn't been allocated previously it will be allocated as part
1910:    of this function call.  VecCUSPGetCUDAArrayReadWrite() assumes that
1911:    the user will modify the vector data.  This is similar to
1912:    intent(inout) in fortran.

1914:    The CUDA device pointer has to be released by calling
1915:    VecCUSPRestoreCUDAArrayReadWrite().  Upon restoring the vector data
1916:    the data on the host will be marked as out of date.  A subsequent
1917:    access of the host data will thus incur a data transfer from the
1918:    device to the host.


1921:    Input Parameter:
1922: .  v - the vector

1924:    Output Parameter:
1925: .  a - the CUDA device pointer

1927:    Fortran note:
1928:    This function is not currently available from Fortran.

1930:    Level: advanced

1932: .seealso: VecCUSPRestoreCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1933: @*/
1934: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayReadWrite(Vec v, PetscScalar **a)
1935: {
1937:   CUSPARRAY      *cusparray;

1941:   VecCUSPGetArrayReadWrite(v, &cusparray);
1942:   *a   = thrust::raw_pointer_cast(cusparray->data());
1943:   return(0);
1944: }

1948: /*@C
1949:    VecCUSPRestoreCUDAArrayReadWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayReadWrite().

1951:    This marks the host data as out of date.  Subsequent access to the
1952:    vector data on the host side with for instance VecGetArray() incurs a
1953:    data transfer.

1955:    Input Parameter:
1956: +  v - the vector
1957: -  a - the CUDA device pointer.  This pointer is invalid after
1958:        VecCUSPRestoreCUDAArrayReadWrite() returns.

1960:    Fortran note:
1961:    This function is not currently available from Fortran.

1963:    Level: advanced

1965: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1966: @*/
1967: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayReadWrite(Vec v, PetscScalar **a)
1968: {

1972:   v->valid_GPU_array = PETSC_CUSP_GPU;
1973:   PetscObjectStateIncrease((PetscObject)v);
1974:   return(0);
1975: }

1979: /*@C
1980:    VecCUSPGetCUDAArrayRead - Provides read access to the CUDA buffer inside a vector.

1982:    This function is analogous to VecGetArrayRead():  The pointer
1983:    returned by this function points to a consistent view of the vector
1984:    data.  This may involve a copy operation of data from the host to the
1985:    device if the data on the device is out of date.  If the device
1986:    memory hasn't been allocated previously it will be allocated as part
1987:    of this function call.  VecCUSPGetCUDAArrayRead() assumes that the
1988:    user will not modify the vector data.  This is analgogous to
1989:    intent(in) in Fortran.

1991:    The CUDA device pointer has to be released by calling
1992:    VecCUSPRestoreCUDAArrayRead().  If the data on the host side was
1993:    previously up to date it will remain so, i.e. data on both the device
1994:    and the host is up to date.  Accessing data on the host side does not
1995:    incur a device to host data transfer.

1997:    Input Parameter:
1998: .  v - the vector

2000:    Output Parameter:
2001: .  a - the CUDA pointer.

2003:    Fortran note:
2004:    This function is not currently available from Fortran.

2006:    Level: advanced

2008: .seealso: VecCUSPRestoreCUDAArrayRead(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
2009: @*/
2010: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayRead(Vec v, PetscScalar **a)
2011: {
2013:   CUSPARRAY      *cusparray;

2017:   VecCUSPGetArrayRead(v, &cusparray);
2018:   *a   = thrust::raw_pointer_cast(cusparray->data());
2019:   return(0);
2020: }

2024: /*@C
2025:    VecCUSPRestoreCUDAArrayRead - Restore a device vector previously acquired with VecCUSPGetCUDAArrayRead()

2027:    If the data on the host side was previously up to date it will remain
2028:    so, i.e. data on both the device and the host is up to date.
2029:    Accessing data on the host side e.g. with VecGetArray() does not
2030:    incur a device to host data transfer.

2032:    Input Parameter:
2033: +  v - the vector

2035: -  a - the CUDA device pointer.  This pointer is invalid after
2036:        VecCUSPRestoreCUDAArrayRead() returns.

2038:    Fortran note:
2039:    This function is not currently available from Fortran.

2041:    Level: advanced

2043:    Fortran note: This function is not currently available from Fortran.

2045: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
2046: @*/
2047: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayRead(Vec v, PetscScalar **a)
2048: {
2050:   v->valid_GPU_array = PETSC_CUSP_BOTH;
2051:   return(0);
2052: }

2056: /*@C
2057:    VecCUSPGetCUDAArrayWrite - Provides write access to the CUDA buffer inside a vector.

2059:    The data pointed to by the device pointer is uninitialized.  The user
2060:    may not read from this data.  Furthermore, the entire array needs to
2061:    be filled by the user to obtain well-defined behaviour.  The device
2062:    memory will be allocated by this function if it hasn't been allocated
2063:    previously.  This is analogous to intent(out) in Fortran.

2065:    The device pointer needs to be released with
2066:    VecCUSPRestoreCUDAArrayWrite().  When the pointer is released the
2067:    host data of the vector is marked as out of data.  Subsequent access
2068:    of the host data with e.g. VecGetArray() incurs a device to host data
2069:    transfer.


2072:    Input Parameter:
2073: .  v - the vector

2075:    Output Parameter:
2076: .  a - the CUDA pointer

2078:    Fortran note:
2079:    This function is not currently available from Fortran.

2081:    Level: advanced

2083: .seealso: VecCUSPRestoreCUDAArrayWrite(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
2084: @*/
2085: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayWrite(Vec v, PetscScalar **a)
2086: {
2088:   CUSPARRAY      *cusparray;

2092:   VecCUSPGetArrayWrite(v, &cusparray);
2093:   *a   = thrust::raw_pointer_cast(cusparray->data());
2094:   return(0);
2095: }

2099: /*@C
2100:    VecCUSPRestoreCUDAArrayWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayWrite().

2102:    Data on the host will be marked as out of date.  Subsequent access of
2103:    the data on the host side e.g. with VecGetArray() will incur a device
2104:    to host data transfer.

2106:    Input Parameter:
2107: +  v - the vector

2109: -  a - the CUDA device pointer.  This pointer is invalid after
2110:        VecCUSPRestoreCUDAArrayWrite() returns.

2112:    Fortran note:
2113:    This function is not currently available from Fortran.

2115:    Level: advanced

2117: .seealso: VecCUSPGetCUDAArrayWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
2118: @*/
2119: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayWrite(Vec v, PetscScalar **a)
2120: {

2124:   v->valid_GPU_array = PETSC_CUSP_GPU;
2125:   PetscObjectStateIncrease((PetscObject)v);
2126:   return(0);
2127: }


2132: /*@C
2133:    VecCUSPPlaceArray - Allows one to replace the array in a vector with a
2134:    CUSPARRAY provided by the user. This is useful to avoid copying a
2135:    CUSPARRAY into a vector.

2137:    Not Collective

2139:    Input Parameters:
2140: +  vec - the vector
2141: -  array - the CUSPARRAY

2143:    Notes:
2144:    You can return to the original CUSPARRAY with a call to VecCUSPResetArray()
2145:    It is not possible to use VecCUSPPlaceArray() and VecPlaceArray() at the
2146:    same time on the same vector.

2148:    Level: developer

2150: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPReplaceArray()

2152: @*/
2153: PetscErrorCode VecCUSPPlaceArray(Vec vin,CUSPARRAY *a)
2154: {

2158:   VecCUSPCopyToGPU(vin);
2159:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUSPPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUSPResetArray()/VecResetArray()");
2160:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUSP*)vin->spptr)->GPUarray; /* save previous CUDAARRAY so reset can bring it back */
2161:   ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2162:   vin->valid_GPU_array = PETSC_CUSP_GPU;
2163:   return(0);
2164: }

2168: /*@C
2169:    VecCUSPReplaceArray - Allows one to replace the CUSPARRAY in a vector
2170:    with a CUSPARRAY provided by the user. This is useful to avoid copying
2171:    a CUSPARRAY into a vector.

2173:    Not Collective

2175:    Input Parameters:
2176: +  vec - the vector
2177: -  array - the CUSPARRAY

2179:    Notes:
2180:    This permanently replaces the CUSPARRAY and frees the memory associated
2181:    with the old CUSPARRAY.

2183:    The memory passed in CANNOT be freed by the user. It will be freed
2184:    when the vector is destroy.

2186:    Not supported from Fortran

2188:    Level: developer

2190: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPPlaceArray(), VecReplaceArray()

2192: @*/
2193: PetscErrorCode VecCUSPReplaceArray(Vec vin,CUSPARRAY *a)
2194: {
2196:   delete ((Vec_CUSP*)vin->spptr)->GPUarray;
2197:   ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2198:   vin->valid_GPU_array = PETSC_CUSP_GPU;
2199:   return(0);
2200: }

2204: /*@C
2205:    VecCUSPResetArray - Resets a vector to use its default memory. Call this
2206:    after the use of VecCUSPPlaceArray().

2208:    Not Collective

2210:    Input Parameters:
2211: .  vec - the vector

2213:    Level: developer

2215: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUSPPlaceArray(), VecCUSPReplaceArray()

2217: @*/
2218: PetscErrorCode VecCUSPResetArray(Vec vin)
2219: {

2223:   VecCUSPCopyToGPU(vin);
2224:   ((Vec_CUSP*)vin->spptr)->GPUarray = (CUSPARRAY *) ((Vec_Seq*)vin->data)->unplacedarray;
2225:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
2226:   vin->valid_GPU_array = PETSC_CUSP_GPU;
2227:   return(0);
2228: }