Actual source code: veccusp2.cu

petsc-3.8.3 2017-12-09
Report Typos and Errors
  1: /*
  2:    Implements the sequential cusp vectors.
  3: */

  5: #define PETSC_SKIP_COMPLEX
  6: #define PETSC_SKIP_SPINLOCK

  8: #include <petscconf.h>
  9:  #include <petsc/private/vecimpl.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 13: #include <cuda_runtime.h>

 15: /*
 16:     Allocates space for the vector array on the GPU if it does not exist.
 17:     Does NOT change the PetscCUSPFlag for the vector
 18:     Does NOT zero the CUSP array

 20:  */
 21: PetscErrorCode VecCUSPAllocateCheck(Vec v)
 22: {
 23:   cudaError_t    err;
 24:   cudaStream_t   stream;
 25:   Vec_CUSP       *veccusp;

 28:   if (!v->spptr) {
 29:     try {
 30:       v->spptr = new Vec_CUSP;
 31:       veccusp = (Vec_CUSP*)v->spptr;
 32:       veccusp->GPUarray = new CUSPARRAY;
 33:       veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
 34:       err = cudaStreamCreate(&stream);CHKERRCUSP(err);
 35:       veccusp->stream = stream;
 36:       veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
 37:       v->ops->destroy = VecDestroy_SeqCUSP;
 38:       if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
 39:         if (v->data && ((Vec_Seq*)v->data)->array) {
 40:           v->valid_GPU_array = PETSC_CUSP_CPU;
 41:         } else {
 42:           v->valid_GPU_array = PETSC_CUSP_GPU;
 43:         }
 44:       }
 45:     } catch(char *ex) {
 46:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
 47:     }
 48:   }
 49:   return(0);
 50: }

 52: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 53: PetscErrorCode VecCUSPCopyToGPU(Vec v)
 54: {
 56:   cudaError_t    err;
 57:   Vec_CUSP       *veccusp;
 58:   CUSPARRAY      *varray;

 62:   VecCUSPAllocateCheck(v);
 63:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
 64:     PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
 65:     try {
 66:       veccusp=(Vec_CUSP*)v->spptr;
 67:       varray=veccusp->GPUarray;
 68:       err = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
 69:     } catch(char *ex) {
 70:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
 71:     }
 72:     PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
 73:     v->valid_GPU_array = PETSC_CUSP_BOTH;
 74:   }
 75:   return(0);
 76: }

 78: PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
 79: {
 80:   CUSPARRAY      *varray;
 82:   cudaError_t    err;
 83:   PetscScalar    *cpuPtr, *gpuPtr;
 84:   Vec_Seq        *s;
 85:   VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

 89:   VecCUSPAllocateCheck(v);
 90:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
 91:     s = (Vec_Seq*)v->data;

 93:     PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
 94:     varray = ((Vec_CUSP*)v->spptr)->GPUarray;
 95:     gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
 96:     cpuPtr = s->array + ptop_scatter->recvLowestIndex;

 98:     /* Note : this code copies the smallest contiguous chunk of data
 99:        containing ALL of the indices */
100:     err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);

102:     // Set the buffer states
103:     v->valid_GPU_array = PETSC_CUSP_BOTH;
104:     PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
105:   }
106:   return(0);
107: }


110: /*
111:      VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
112: */
113: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
114: {
116:   cudaError_t    err;
117:   Vec_CUSP       *veccusp;
118:   CUSPARRAY      *varray;

122:   VecCUSPAllocateCheckHost(v);
123:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
124:     PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
125:     try {
126:       veccusp=(Vec_CUSP*)v->spptr;
127:       varray=veccusp->GPUarray;
128:       err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
129:     } catch(char *ex) {
130:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
131:     }
132:     PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
133:     v->valid_GPU_array = PETSC_CUSP_BOTH;
134:   }
135:   return(0);
136: }

138: /* Note that this function only copies *some* of the values up from the GPU to CPU,
139:    which means that we need recombine the data at some point before using any of the standard functions.
140:    We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
141:    where you have to always call in pairs
142: */
143: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
144: {
145:   CUSPARRAY      *varray;
147:   cudaError_t    err;
148:   PetscScalar    *cpuPtr, *gpuPtr;
149:   Vec_Seq        *s;
150:   VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

154:   VecCUSPAllocateCheckHost(v);
155:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
156:     PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);

158:     varray=((Vec_CUSP*)v->spptr)->GPUarray;
159:     s = (Vec_Seq*)v->data;
160:     gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
161:     cpuPtr = s->array + ptop_scatter->sendLowestIndex;

163:     /* Note : this code copies the smallest contiguous chunk of data
164:        containing ALL of the indices */
165:     err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);

167:     VecCUSPRestoreArrayRead(v,&varray);
168:     PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
169:   }
170:   return(0);
171: }

173: /*MC
174:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

176:    Options Database Keys:
177: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

179:   Level: beginner

181: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
182: M*/

184: /* for VecAYPX_SeqCUSP*/
185: namespace cusp
186: {
187: namespace blas
188: {
189: namespace detail
190: {
191:   template <typename T>
192:     struct AYPX : public thrust::binary_function<T,T,T>
193:     {
194:       T alpha;

196:       AYPX(T _alpha) : alpha(_alpha) {}

198:       __host__ __device__
199:       T operator()(T x, T y)
200:       {
201:         return alpha * y + x;
202:       }
203:     };
204: }

206:  template <typename ForwardIterator1,
207:            typename ForwardIterator2,
208:            typename ScalarType>
209: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
210:            {
211:              thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
212:            }
213:  template <typename Array1, typename Array2, typename ScalarType>
214:    void aypx(const Array1& x, Array2& y, ScalarType alpha)
215:  {
216: #if defined(CUSP_VERSION) && CUSP_VERSION >= 500
217:    cusp::assert_same_dimensions(x,y);
218: #else
219:    detail::assert_same_dimensions(x,y);
220: #endif
221:    aypx(x.begin(),x.end(),y.begin(),alpha);
222:  }
223: }
224: }

226: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
227: {
228:   CUSPARRAY      *xarray,*yarray;

232:   VecCUSPGetArrayRead(xin,&xarray);
233:   VecCUSPGetArrayReadWrite(yin,&yarray);
234:   try {
235:     if (alpha != 0.0) {
236:       cusp::blas::aypx(*xarray,*yarray,alpha);
237:       PetscLogFlops(2.0*yin->map->n);
238:     } else {
239:       cusp::blas::copy(*xarray,*yarray);
240:     }
241:     WaitForGPU();CHKERRCUSP(ierr);
242:   } catch(char *ex) {
243:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
244:   }
245:   VecCUSPRestoreArrayRead(xin,&xarray);
246:   VecCUSPRestoreArrayReadWrite(yin,&yarray);
247:   return(0);
248: }


251: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
252: {
253:   CUSPARRAY      *xarray,*yarray;

257:   if (alpha != 0.0) {
258:     VecCUSPGetArrayRead(xin,&xarray);
259:     VecCUSPGetArrayReadWrite(yin,&yarray);
260:     try {
261:       cusp::blas::axpy(*xarray,*yarray,alpha);
262:       WaitForGPU();CHKERRCUSP(ierr);
263:     } catch(char *ex) {
264:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
265:     }
266:     VecCUSPRestoreArrayRead(xin,&xarray);
267:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
268:     PetscLogFlops(2.0*yin->map->n);
269:   }
270:   return(0);
271: }

273: struct VecCUSPPointwiseDivide
274: {
275:   template <typename Tuple>
276:   __host__ __device__
277:   void operator()(Tuple t)
278:   {
279:     thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
280:   }
281: };

283: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
284: {
285:   CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;

289:   VecCUSPGetArrayRead(xin,&xarray);
290:   VecCUSPGetArrayRead(yin,&yarray);
291:   VecCUSPGetArrayWrite(win,&warray);
292:   try {
293:     thrust::for_each(
294:       thrust::make_zip_iterator(
295:         thrust::make_tuple(
296:           warray->begin(),
297:           xarray->begin(),
298:           yarray->begin())),
299:       thrust::make_zip_iterator(
300:         thrust::make_tuple(
301:           warray->end(),
302:           xarray->end(),
303:           yarray->end())),
304:       VecCUSPPointwiseDivide());
305:     WaitForGPU();CHKERRCUSP(ierr);
306:   } catch(char *ex) {
307:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
308:   }
309:   PetscLogFlops(win->map->n);
310:   VecCUSPRestoreArrayRead(xin,&xarray);
311:   VecCUSPRestoreArrayRead(yin,&yarray);
312:   VecCUSPRestoreArrayWrite(win,&warray);
313:   return(0);
314: }


317: struct VecCUSPWAXPY
318: {
319:   template <typename Tuple>
320:   __host__ __device__
321:   void operator()(Tuple t)
322:   {
323:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
324:   }
325: };

327: struct VecCUSPSum
328: {
329:   template <typename Tuple>
330:   __host__ __device__
331:   void operator()(Tuple t)
332:   {
333:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
334:   }
335: };

337: struct VecCUSPDiff
338: {
339:   template <typename Tuple>
340:   __host__ __device__
341:   void operator()(Tuple t)
342:   {
343:     thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
344:   }
345: };

347: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
348: {
349:   CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;

353:   if (alpha == 0.0) {
354:     VecCopy_SeqCUSP(yin,win);
355:   } else {
356:     VecCUSPGetArrayRead(xin,&xarray);
357:     VecCUSPGetArrayRead(yin,&yarray);
358:     VecCUSPGetArrayWrite(win,&warray);
359:     if (alpha == 1.0) {
360:       try {
361:         thrust::for_each(
362:           thrust::make_zip_iterator(
363:             thrust::make_tuple(
364:               warray->begin(),
365:               yarray->begin(),
366:               xarray->begin())),
367:           thrust::make_zip_iterator(
368:             thrust::make_tuple(
369:               warray->end(),
370:               yarray->end(),
371:               xarray->end())),
372:           VecCUSPSum());
373:       } catch(char *ex) {
374:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
375:       }
376:       PetscLogFlops(win->map->n);
377:     } else if (alpha == -1.0) {
378:       try {
379:         thrust::for_each(
380:           thrust::make_zip_iterator(
381:             thrust::make_tuple(
382:               warray->begin(),
383:               yarray->begin(),
384:               xarray->begin())),
385:           thrust::make_zip_iterator(
386:             thrust::make_tuple(
387:               warray->end(),
388:               yarray->end(),
389:               xarray->end())),
390:           VecCUSPDiff());
391:       } catch(char *ex) {
392:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
393:       }
394:       PetscLogFlops(win->map->n);
395:     } else {
396:       try {
397:         thrust::for_each(
398:           thrust::make_zip_iterator(
399:             thrust::make_tuple(
400:               warray->begin(),
401:               yarray->begin(),
402:               thrust::make_constant_iterator(alpha),
403:               xarray->begin())),
404:           thrust::make_zip_iterator(
405:             thrust::make_tuple(
406:               warray->end(),
407:               yarray->end(),
408:               thrust::make_constant_iterator(alpha),
409:               xarray->end())),
410:           VecCUSPWAXPY());
411:       } catch(char *ex) {
412:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
413:       }
414:       PetscLogFlops(2*win->map->n);
415:     }
416:     WaitForGPU();CHKERRCUSP(ierr);
417:     VecCUSPRestoreArrayRead(xin,&xarray);
418:     VecCUSPRestoreArrayRead(yin,&yarray);
419:     VecCUSPRestoreArrayWrite(win,&warray);
420:   }
421:   return(0);
422: }

424: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
425: struct VecCUSPMAXPY4
426: {
427:   template <typename Tuple>
428:   __host__ __device__
429:   void operator()(Tuple t)
430:   {
431:     /*y += a1*x1 +a2*x2 + 13*x3 +a4*x4 */
432:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
433:   }
434: };


437: struct VecCUSPMAXPY3
438: {
439:   template <typename Tuple>
440:   __host__ __device__
441:   void operator()(Tuple t)
442:   {
443:     /*y += a1*x1 +a2*x2 + a3*x3 */
444:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
445:   }
446: };

448: struct VecCUSPMAXPY2
449: {
450:   template <typename Tuple>
451:   __host__ __device__
452:   void operator()(Tuple t)
453:   {
454:     /*y += a1*x1 +a2*x2*/
455:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
456:   }
457: };
458: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
459: {
461:   CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
462:   PetscInt       n = xin->map->n,j,j_rem;
463:   PetscScalar    alpha0,alpha1,alpha2,alpha3;

466:   PetscLogFlops(nv*2.0*n);
467:   VecCUSPGetArrayReadWrite(xin,&xarray);
468:   switch (j_rem=nv&0x3) {
469:   case 3:
470:     alpha0 = alpha[0];
471:     alpha1 = alpha[1];
472:     alpha2 = alpha[2];
473:     alpha += 3;
474:     VecCUSPGetArrayRead(y[0],&yy0);
475:     VecCUSPGetArrayRead(y[1],&yy1);
476:     VecCUSPGetArrayRead(y[2],&yy2);
477:     try {
478:       thrust::for_each(
479:         thrust::make_zip_iterator(
480:           thrust::make_tuple(
481:             xarray->begin(),
482:             thrust::make_constant_iterator(alpha0),
483:             yy0->begin(),
484:             thrust::make_constant_iterator(alpha1),
485:             yy1->begin(),
486:             thrust::make_constant_iterator(alpha2),
487:             yy2->begin())),
488:         thrust::make_zip_iterator(
489:           thrust::make_tuple(
490:             xarray->end(),
491:             thrust::make_constant_iterator(alpha0),
492:             yy0->end(),
493:             thrust::make_constant_iterator(alpha1),
494:             yy1->end(),
495:             thrust::make_constant_iterator(alpha2),
496:             yy2->end())),
497:         VecCUSPMAXPY3());
498:     } catch(char *ex) {
499:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
500:     }
501:     VecCUSPRestoreArrayRead(y[0],&yy0);
502:     VecCUSPRestoreArrayRead(y[1],&yy1);
503:     VecCUSPRestoreArrayRead(y[2],&yy2);
504:     y   += 3;
505:     break;
506:   case 2:
507:     alpha0 = alpha[0];
508:     alpha1 = alpha[1];
509:     alpha +=2;
510:     VecCUSPGetArrayRead(y[0],&yy0);
511:     VecCUSPGetArrayRead(y[1],&yy1);
512:     try {
513:       thrust::for_each(
514:         thrust::make_zip_iterator(
515:           thrust::make_tuple(
516:             xarray->begin(),
517:             thrust::make_constant_iterator(alpha0),
518:             yy0->begin(),
519:             thrust::make_constant_iterator(alpha1),
520:             yy1->begin())),
521:         thrust::make_zip_iterator(
522:           thrust::make_tuple(
523:             xarray->end(),
524:             thrust::make_constant_iterator(alpha0),
525:             yy0->end(),
526:             thrust::make_constant_iterator(alpha1),
527:             yy1->end())),
528:         VecCUSPMAXPY2());
529:     } catch(char *ex) {
530:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
531:     }
532:     y +=2;
533:     break;
534:   case 1:
535:     alpha0 = *alpha++;
536:     VecAXPY_SeqCUSP(xin,alpha0,y[0]);
537:     y     +=1;
538:     break;
539:   }
540:   for (j=j_rem; j<nv; j+=4) {
541:     alpha0 = alpha[0];
542:     alpha1 = alpha[1];
543:     alpha2 = alpha[2];
544:     alpha3 = alpha[3];
545:     alpha += 4;
546:     VecCUSPGetArrayRead(y[0],&yy0);
547:     VecCUSPGetArrayRead(y[1],&yy1);
548:     VecCUSPGetArrayRead(y[2],&yy2);
549:     VecCUSPGetArrayRead(y[3],&yy3);
550:     try {
551:       thrust::for_each(
552:         thrust::make_zip_iterator(
553:           thrust::make_tuple(
554:             xarray->begin(),
555:             thrust::make_constant_iterator(alpha0),
556:             yy0->begin(),
557:             thrust::make_constant_iterator(alpha1),
558:             yy1->begin(),
559:             thrust::make_constant_iterator(alpha2),
560:             yy2->begin(),
561:             thrust::make_constant_iterator(alpha3),
562:             yy3->begin())),
563:         thrust::make_zip_iterator(
564:           thrust::make_tuple(
565:             xarray->end(),
566:             thrust::make_constant_iterator(alpha0),
567:             yy0->end(),
568:             thrust::make_constant_iterator(alpha1),
569:             yy1->end(),
570:             thrust::make_constant_iterator(alpha2),
571:             yy2->end(),
572:             thrust::make_constant_iterator(alpha3),
573:             yy3->end())),
574:         VecCUSPMAXPY4());
575:     } catch(char *ex) {
576:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
577:     }
578:     VecCUSPRestoreArrayRead(y[0],&yy0);
579:     VecCUSPRestoreArrayRead(y[1],&yy1);
580:     VecCUSPRestoreArrayRead(y[2],&yy2);
581:     VecCUSPRestoreArrayRead(y[3],&yy3);
582:     y   += 4;
583:   }
584:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
585:   WaitForGPU();CHKERRCUSP(ierr);
586:   return(0);
587: }


590: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
591: {
592:   CUSPARRAY      *xarray,*yarray;
594:   //  PetscScalar    *xptr,*yptr,*zgpu;
595:   //PetscReal tmp;

598:   //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
599:   //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
600:   VecCUSPGetArrayRead(xin,&xarray);
601:   VecCUSPGetArrayRead(yin,&yarray);
602:   try {
603: #if defined(PETSC_USE_COMPLEX)
604:     *z = cusp::blas::dotc(*yarray,*xarray);
605: #else
606:     *z = cusp::blas::dot(*yarray,*xarray);
607: #endif
608:   } catch(char *ex) {
609:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
610:   }
611:   WaitForGPU();CHKERRCUSP(ierr);
612:   if (xin->map->n >0) {
613:     PetscLogFlops(2.0*xin->map->n-1);
614:   }
615:   VecCUSPRestoreArrayRead(xin,&xarray);
616:   VecCUSPRestoreArrayRead(yin,&yarray);
617:   return(0);
618: }

620: //
621: // CUDA kernels for MDot to follow
622: //

624: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
625: #define MDOT_WORKGROUP_SIZE 128
626: #define MDOT_WORKGROUP_NUM  128

628: // M = 2:
629: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
630:                                         PetscInt size, PetscScalar *group_results)
631: {
632:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
633:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
634:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
635:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
636:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

638:   PetscScalar entry_x    = 0;
639:   PetscScalar group_sum0 = 0;
640:   PetscScalar group_sum1 = 0;
641:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
642:     entry_x     = x[i];   // load only once from global memory!
643:     group_sum0 += entry_x * y0[i];
644:     group_sum1 += entry_x * y1[i];
645:   }
646:   tmp_buffer[threadIdx.x]                       = group_sum0;
647:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

649:   // parallel reduction
650:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
651:     __syncthreads();
652:     if (threadIdx.x < stride) {
653:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
654:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
655:     }
656:   }

658:   // write result of group to group_results
659:   if (threadIdx.x == 0) {
660:     group_results[blockIdx.x]             = tmp_buffer[0];
661:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
662:   }
663: }

665: // M = 3:
666: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
667:                                         PetscInt size, PetscScalar *group_results)
668: {
669:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
670:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
671:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
672:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
673:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

675:   PetscScalar entry_x    = 0;
676:   PetscScalar group_sum0 = 0;
677:   PetscScalar group_sum1 = 0;
678:   PetscScalar group_sum2 = 0;
679:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
680:     entry_x     = x[i];   // load only once from global memory!
681:     group_sum0 += entry_x * y0[i];
682:     group_sum1 += entry_x * y1[i];
683:     group_sum2 += entry_x * y2[i];
684:   }
685:   tmp_buffer[threadIdx.x]                           = group_sum0;
686:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
687:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

689:   // parallel reduction
690:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
691:     __syncthreads();
692:     if (threadIdx.x < stride) {
693:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
694:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
695:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
696:     }
697:   }

699:   // write result of group to group_results
700:   if (threadIdx.x == 0) {
701:     group_results[blockIdx.x                ] = tmp_buffer[0];
702:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
703:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
704:   }
705: }

707: // M = 4:
708: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
709:                                         PetscInt size, PetscScalar *group_results)
710: {
711:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
712:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
713:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
714:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
715:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

717:   PetscScalar entry_x    = 0;
718:   PetscScalar group_sum0 = 0;
719:   PetscScalar group_sum1 = 0;
720:   PetscScalar group_sum2 = 0;
721:   PetscScalar group_sum3 = 0;
722:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
723:     entry_x     = x[i];   // load only once from global memory!
724:     group_sum0 += entry_x * y0[i];
725:     group_sum1 += entry_x * y1[i];
726:     group_sum2 += entry_x * y2[i];
727:     group_sum3 += entry_x * y3[i];
728:   }
729:   tmp_buffer[threadIdx.x]                           = group_sum0;
730:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
731:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
732:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

734:   // parallel reduction
735:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
736:     __syncthreads();
737:     if (threadIdx.x < stride) {
738:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
739:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
740:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
741:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
742:     }
743:   }

745:   // write result of group to group_results
746:   if (threadIdx.x == 0) {
747:     group_results[blockIdx.x                ] = tmp_buffer[0];
748:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
749:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
750:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
751:   }
752: }

754: // M = 8:
755: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
756:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
757:                                           PetscInt size, PetscScalar *group_results)
758: {
759:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
760:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
761:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
762:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
763:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

765:   PetscScalar entry_x    = 0;
766:   PetscScalar group_sum0 = 0;
767:   PetscScalar group_sum1 = 0;
768:   PetscScalar group_sum2 = 0;
769:   PetscScalar group_sum3 = 0;
770:   PetscScalar group_sum4 = 0;
771:   PetscScalar group_sum5 = 0;
772:   PetscScalar group_sum6 = 0;
773:   PetscScalar group_sum7 = 0;
774:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
775:     entry_x     = x[i];   // load only once from global memory!
776:     group_sum0 += entry_x * y0[i];
777:     group_sum1 += entry_x * y1[i];
778:     group_sum2 += entry_x * y2[i];
779:     group_sum3 += entry_x * y3[i];
780:     group_sum4 += entry_x * y4[i];
781:     group_sum5 += entry_x * y5[i];
782:     group_sum6 += entry_x * y6[i];
783:     group_sum7 += entry_x * y7[i];
784:   }
785:   tmp_buffer[threadIdx.x]                           = group_sum0;
786:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
787:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
788:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
789:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
790:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
791:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
792:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

794:   // parallel reduction
795:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
796:     __syncthreads();
797:     if (threadIdx.x < stride) {
798:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
799:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
800:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
801:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
802:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
803:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
804:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
805:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
806:     }
807:   }

809:   // write result of group to group_results
810:   if (threadIdx.x == 0) {
811:     group_results[blockIdx.x                ] = tmp_buffer[0];
812:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
813:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
814:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
815:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
816:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
817:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
818:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
819:   }
820: }


823: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
824: {
826:   PetscInt       i,j,n = xin->map->n,current_y_index = 0;
827:   CUSPARRAY      *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
828:   PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
829:   PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
830:   cudaError_t    cuda_ierr;

833:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
834:   /* Handle the case of local size zero first */
835:   if (!xin->map->n) {
836:     for (i=0; i<nv; ++i) z[i] = 0;
837:     return(0);
838:   }

840:   // allocate scratchpad memory for the results of individual work groups:
841:   cuda_cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
842:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

844:   VecCUSPGetArrayRead(xin,&xarray);
845:   xptr = thrust::raw_pointer_cast(xarray->data());

847:   while (current_y_index < nv)
848:   {
849:     switch (nv - current_y_index) {

851:     case 7:
852:     case 6:
853:     case 5:
854:     case 4:
855:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
856:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
857:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
858:       VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);

860: #if defined(PETSC_USE_COMPLEX)
861:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
862:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
863:       z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
864:       z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
865: #else
866:       // extract raw device pointers:
867:       y0ptr = thrust::raw_pointer_cast(y0array->data());
868:       y1ptr = thrust::raw_pointer_cast(y1array->data());
869:       y2ptr = thrust::raw_pointer_cast(y2array->data());
870:       y3ptr = thrust::raw_pointer_cast(y3array->data());

872:       // run kernel:
873:       VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

875:       // copy results back to
876:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
877:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

879:       // sum group results into z:
880:       for (j=0; j<4; ++j) {
881:         z[current_y_index + j] = 0;
882:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
883:       }
884: #endif
885:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
886:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
887:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
888:       VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
889:       current_y_index += 4;
890:       break;

892:     case 3:
893:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
894:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
895:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);

897: #if defined(PETSC_USE_COMPLEX)
898:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
899:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
900:       z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
901: #else
902:       // extract raw device pointers:
903:       y0ptr = thrust::raw_pointer_cast(y0array->data());
904:       y1ptr = thrust::raw_pointer_cast(y1array->data());
905:       y2ptr = thrust::raw_pointer_cast(y2array->data());

907:       // run kernel:
908:       VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

910:       // copy results back to
911:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
912:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

914:       // sum group results into z:
915:       for (j=0; j<3; ++j) {
916:         z[current_y_index + j] = 0;
917:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
918:       }
919: #endif

921:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
922:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
923:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
924:       current_y_index += 3;
925:       break;

927:     case 2:
928:       VecCUSPGetArrayRead(yin[current_y_index],&y0array);
929:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);

931: #if defined(PETSC_USE_COMPLEX)
932:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
933:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
934: #else
935:       // extract raw device pointers:
936:       y0ptr = thrust::raw_pointer_cast(y0array->data());
937:       y1ptr = thrust::raw_pointer_cast(y1array->data());

939:       // run kernel:
940:       VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

942:       // copy results back to
943:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
944:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

946:       // sum group results into z:
947:       for (j=0; j<2; ++j) {
948:         z[current_y_index + j] = 0;
949:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
950:       }
951: #endif
952:       VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
953:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
954:       current_y_index += 2;
955:       break;

957:     case 1:
958:       VecCUSPGetArrayRead(yin[current_y_index],&y0array);
959: #if defined(PETSC_USE_COMPLEX)
960:       z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
961: #else
962:       z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
963: #endif
964:       VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
965:       current_y_index += 1;
966:       break;

968:     default: // 8 or more vectors left
969:       VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
970:       VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
971:       VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
972:       VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
973:       VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
974:       VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
975:       VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
976:       VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);

978: #if defined(PETSC_USE_COMPLEX)
979:       z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
980:       z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
981:       z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
982:       z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
983:       z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
984:       z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
985:       z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
986:       z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
987: #else
988:       // extract raw device pointers:
989:       y0ptr = thrust::raw_pointer_cast(y0array->data());
990:       y1ptr = thrust::raw_pointer_cast(y1array->data());
991:       y2ptr = thrust::raw_pointer_cast(y2array->data());
992:       y3ptr = thrust::raw_pointer_cast(y3array->data());
993:       y4ptr = thrust::raw_pointer_cast(y4array->data());
994:       y5ptr = thrust::raw_pointer_cast(y5array->data());
995:       y6ptr = thrust::raw_pointer_cast(y6array->data());
996:       y7ptr = thrust::raw_pointer_cast(y7array->data());

998:       // run kernel:
999:       VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

1001:       // copy results back to
1002:       cuda_cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1003:       if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

1005:       // sum group results into z:
1006:       for (j=0; j<8; ++j) {
1007:         z[current_y_index + j] = 0;
1008:         for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1009:       }
1010: #endif
1011:       VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
1012:       VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1013:       VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1014:       VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1015:       VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1016:       VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1017:       VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1018:       VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1019:       current_y_index += 8;
1020:       break;
1021:     }
1022:   }
1023:   VecCUSPRestoreArrayRead(xin,&xarray);

1025:   cuda_cudaFree(group_results_gpu);
1026:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host: %d", (int)cuda_ierr);
1027:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1028:   return(0);
1029: }

1031: #undef MDOT_WORKGROUP_SIZE
1032: #undef MDOT_WORKGROUP_NUM



1036: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1037: {
1038:   CUSPARRAY      *xarray=NULL;

1042:   /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1043:   VecCUSPGetArrayWrite(xin,&xarray);
1044:   try {
1045:     cusp::blas::fill(*xarray,alpha);
1046:   } catch(char *ex) {
1047:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1048:   }
1049:   WaitForGPU();CHKERRCUSP(ierr);
1050:   VecCUSPRestoreArrayWrite(xin,&xarray);
1051:   return(0);
1052: }

1054: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1055: {
1056:   CUSPARRAY      *xarray;

1060:   if (alpha == 0.0) {
1061:     VecSet_SeqCUSP(xin,alpha);
1062:   } else if (alpha != 1.0) {
1063:     VecCUSPGetArrayReadWrite(xin,&xarray);
1064:     try {
1065:       cusp::blas::scal(*xarray,alpha);
1066:     } catch(char *ex) {
1067:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1068:     }
1069:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1070:   }
1071:   WaitForGPU();CHKERRCUSP(ierr);
1072:   PetscLogFlops(xin->map->n);
1073:   return(0);
1074: }

1076: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1077: {
1078:   CUSPARRAY      *xarray,*yarray;

1082:   //#if defined(PETSC_USE_COMPLEX)
1083:   /*Not working for complex*/
1084:   //#else
1085:   VecCUSPGetArrayRead(xin,&xarray);
1086:   VecCUSPGetArrayRead(yin,&yarray);
1087:   try {
1088:     *z = cusp::blas::dot(*xarray,*yarray);
1089:   } catch(char *ex) {
1090:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1091:   }
1092:   //#endif
1093:   WaitForGPU();CHKERRCUSP(ierr);
1094:   if (xin->map->n > 0) {
1095:     PetscLogFlops(2.0*xin->map->n-1);
1096:   }
1097:   VecCUSPRestoreArrayRead(yin,&yarray);
1098:   VecCUSPRestoreArrayRead(xin,&xarray);
1099:   return(0);
1100: }

1102: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1103: {
1104:   CUSPARRAY      *xarray,*yarray;

1108:   if (xin != yin) {
1109:     if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1110:       VecCUSPGetArrayRead(xin,&xarray);
1111:       VecCUSPGetArrayWrite(yin,&yarray);
1112:       try {
1113:         cusp::blas::copy(*xarray,*yarray);
1114:       } catch(char *ex) {
1115:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1116:       }
1117:       WaitForGPU();CHKERRCUSP(ierr);
1118:       VecCUSPRestoreArrayRead(xin,&xarray);
1119:       VecCUSPRestoreArrayWrite(yin,&yarray);

1121:     } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1122:       /* copy in CPU if we are on the CPU*/
1123:       VecCopy_SeqCUSP_Private(xin,yin);
1124:     } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1125:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1126:       if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1127:         /* copy in CPU */
1128:         VecCopy_SeqCUSP_Private(xin,yin);

1130:       } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1131:         /* copy in GPU */
1132:         VecCUSPGetArrayRead(xin,&xarray);
1133:         VecCUSPGetArrayWrite(yin,&yarray);
1134:         try {
1135:           cusp::blas::copy(*xarray,*yarray);
1136:           WaitForGPU();CHKERRCUSP(ierr);
1137:         } catch(char *ex) {
1138:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1139:         }
1140:         VecCUSPRestoreArrayRead(xin,&xarray);
1141:         VecCUSPRestoreArrayWrite(yin,&yarray);
1142:       } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1143:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
1144:            default to copy in GPU (this is an arbitrary choice) */
1145:         VecCUSPGetArrayRead(xin,&xarray);
1146:         VecCUSPGetArrayWrite(yin,&yarray);
1147:         try {
1148:           cusp::blas::copy(*xarray,*yarray);
1149:           WaitForGPU();CHKERRCUSP(ierr);
1150:         } catch(char *ex) {
1151:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1152:         }
1153:         VecCUSPRestoreArrayRead(xin,&xarray);
1154:         VecCUSPRestoreArrayWrite(yin,&yarray);
1155:       } else {
1156:         VecCopy_SeqCUSP_Private(xin,yin);
1157:       }
1158:     }
1159:   }
1160:   return(0);
1161: }


1164: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1165: {
1167:   PetscBLASInt   one = 1,bn;
1168:   CUSPARRAY      *xarray,*yarray;
1169:   cublasStatus_t cberr;

1172:   PetscBLASIntCast(xin->map->n,&bn);
1173:   if (xin != yin) {
1174:     VecCUSPGetArrayReadWrite(xin,&xarray);
1175:     VecCUSPGetArrayReadWrite(yin,&yarray);

1177: #if defined(PETSC_USE_COMPLEX)
1178: #if defined(PETSC_USE_REAL_SINGLE)
1179:     cberr = cublasCswap(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1180: #else
1181:     cberr = cublasZswap(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1182: #endif
1183: #else
1184: #if defined(PETSC_USE_REAL_SINGLE)
1185:     cberr = cublasSswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1186: #else
1187:     cberr = cublasDswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1188: #endif
1189: #endif
1190:     WaitForGPU();CHKERRCUSP(ierr);
1191:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1192:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1193:   }
1194:   return(0);
1195: }

1197: struct VecCUSPAX
1198: {
1199:   template <typename Tuple>
1200:   __host__ __device__
1201:   void operator()(Tuple t)
1202:   {
1203:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1204:   }
1205: };

1207: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1208: {
1210:   PetscScalar    a = alpha,b = beta;
1211:   CUSPARRAY      *xarray,*yarray;

1214:   if (a == 0.0) {
1215:     VecScale_SeqCUSP(yin,beta);
1216:   } else if (b == 1.0) {
1217:     VecAXPY_SeqCUSP(yin,alpha,xin);
1218:   } else if (a == 1.0) {
1219:     VecAYPX_SeqCUSP(yin,beta,xin);
1220:   } else if (b == 0.0) {
1221:     VecCUSPGetArrayRead(xin,&xarray);
1222:     VecCUSPGetArrayReadWrite(yin,&yarray);
1223:     try {
1224:       thrust::for_each(
1225:         thrust::make_zip_iterator(
1226:           thrust::make_tuple(
1227:             yarray->begin(),
1228:             thrust::make_constant_iterator(a),
1229:             xarray->begin())),
1230:         thrust::make_zip_iterator(
1231:           thrust::make_tuple(
1232:             yarray->end(),
1233:             thrust::make_constant_iterator(a),
1234:             xarray->end())),
1235:         VecCUSPAX());
1236:     } catch(char *ex) {
1237:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1238:     }
1239:     PetscLogFlops(xin->map->n);
1240:     WaitForGPU();CHKERRCUSP(ierr);
1241:     VecCUSPRestoreArrayRead(xin,&xarray);
1242:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1243:   } else {
1244:     VecCUSPGetArrayRead(xin,&xarray);
1245:     VecCUSPGetArrayReadWrite(yin,&yarray);
1246:     try {
1247:       cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1248:     } catch(char *ex) {
1249:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1250:     }
1251:     VecCUSPRestoreArrayRead(xin,&xarray);
1252:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1253:     WaitForGPU();CHKERRCUSP(ierr);
1254:     PetscLogFlops(3.0*xin->map->n);
1255:   }
1256:   return(0);
1257: }

1259: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1260: struct VecCUSPXPBYPCZ
1261: {
1262:   /* z = x + b*y + c*z */
1263:   template <typename Tuple>
1264:   __host__ __device__
1265:   void operator()(Tuple t)
1266:   {
1267:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1268:   }
1269: };

1271: struct VecCUSPAXPBYPZ
1272: {
1273:   /* z = ax + b*y + z */
1274:   template <typename Tuple>
1275:   __host__ __device__
1276:   void operator()(Tuple t)
1277:   {
1278:     thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1279:   }
1280: };

1282: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1283: {
1285:   PetscInt       n = zin->map->n;
1286:   CUSPARRAY      *xarray,*yarray,*zarray;

1289:   VecCUSPGetArrayRead(xin,&xarray);
1290:   VecCUSPGetArrayRead(yin,&yarray);
1291:   VecCUSPGetArrayReadWrite(zin,&zarray);
1292:   if (alpha == 1.0) {
1293:     try {
1294:       thrust::for_each(
1295:         thrust::make_zip_iterator(
1296:           thrust::make_tuple(
1297:             zarray->begin(),
1298:             thrust::make_constant_iterator(gamma),
1299:             xarray->begin(),
1300:             yarray->begin(),
1301:             thrust::make_constant_iterator(beta))),
1302:         thrust::make_zip_iterator(
1303:           thrust::make_tuple(
1304:             zarray->end(),
1305:             thrust::make_constant_iterator(gamma),
1306:             xarray->end(),
1307:             yarray->end(),
1308:             thrust::make_constant_iterator(beta))),
1309:         VecCUSPXPBYPCZ());
1310:     } catch(char *ex) {
1311:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1312:     }
1313:     PetscLogFlops(4.0*n);
1314:   } else if (gamma == 1.0) {
1315:     try {
1316:       thrust::for_each(
1317:         thrust::make_zip_iterator(
1318:           thrust::make_tuple(
1319:             zarray->begin(),
1320:             xarray->begin(),
1321:             thrust::make_constant_iterator(alpha),
1322:             yarray->begin(),
1323:             thrust::make_constant_iterator(beta))),
1324:         thrust::make_zip_iterator(
1325:           thrust::make_tuple(
1326:             zarray->end(),
1327:             xarray->end(),
1328:             thrust::make_constant_iterator(alpha),
1329:             yarray->end(),
1330:             thrust::make_constant_iterator(beta))),
1331:         VecCUSPAXPBYPZ());
1332:     } catch(char *ex) {
1333:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1334:     }
1335:     PetscLogFlops(4.0*n);
1336:   } else {
1337:     try {
1338:       cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1339:     } catch(char *ex) {
1340:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1341:     }
1342:     VecCUSPRestoreArrayReadWrite(zin,&zarray);
1343:     VecCUSPRestoreArrayRead(xin,&xarray);
1344:     VecCUSPRestoreArrayRead(yin,&yarray);
1345:     PetscLogFlops(5.0*n);
1346:   }
1347:   WaitForGPU();CHKERRCUSP(ierr);
1348:   return(0);
1349: }

1351: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1352: {
1354:   PetscInt       n = win->map->n;
1355:   CUSPARRAY      *xarray,*yarray,*warray;

1358:   VecCUSPGetArrayRead(xin,&xarray);
1359:   VecCUSPGetArrayRead(yin,&yarray);
1360:   VecCUSPGetArrayReadWrite(win,&warray);
1361:   try {
1362:     cusp::blas::xmy(*xarray,*yarray,*warray);
1363:   } catch(char *ex) {
1364:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1365:   }
1366:   VecCUSPRestoreArrayRead(xin,&xarray);
1367:   VecCUSPRestoreArrayRead(yin,&yarray);
1368:   VecCUSPRestoreArrayReadWrite(win,&warray);
1369:   PetscLogFlops(n);
1370:   WaitForGPU();CHKERRCUSP(ierr);
1371:   return(0);
1372: }


1375: /* should do infinity norm in cusp */

1377: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1378: {
1379:   const PetscScalar *xx;
1380:   PetscErrorCode    ierr;
1381:   PetscInt          n = xin->map->n;
1382:   PetscBLASInt      one = 1, bn;
1383:   CUSPARRAY         *xarray;
1384:   cublasStatus_t    cberr;

1387:   PetscBLASIntCast(n,&bn);
1388:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1389:     VecCUSPGetArrayRead(xin,&xarray);
1390:     try {
1391:       *z = cusp::blas::nrm2(*xarray);
1392:     } catch(char *ex) {
1393:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1394:     }
1395:     WaitForGPU();CHKERRCUSP(ierr);
1396:     VecCUSPRestoreArrayRead(xin,&xarray);
1397:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1398:   } else if (type == NORM_INFINITY) {
1399:     PetscInt  i;
1400:     PetscReal max = 0.0,tmp;

1402:     VecGetArrayRead(xin,&xx);
1403:     for (i=0; i<n; i++) {
1404:       if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1405:       /* check special case of tmp == NaN */
1406:       if (tmp != tmp) {max = tmp; break;}
1407:       xx++;
1408:     }
1409:     VecRestoreArrayRead(xin,&xx);
1410:     *z   = max;
1411:   } else if (type == NORM_1) {
1412:     VecCUSPGetArrayRead(xin,&xarray);
1413: #if defined(PETSC_USE_COMPLEX)
1414: #if defined(PETSC_USE_REAL_SINGLE)
1415:     cberr = cublasSasum(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1416: #else
1417:     cberr = cublasDasum(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1418: #endif
1419: #else
1420: #if defined(PETSC_USE_REAL_SINGLE)
1421:     cberr = cublasSasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1422: #else
1423:     cberr = cublasDasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1424: #endif
1425: #endif
1426:     VecCUSPRestoreArrayRead(xin,&xarray);
1427:     WaitForGPU();CHKERRCUSP(ierr);
1428:     PetscLogFlops(PetscMax(n-1.0,0.0));
1429:   } else if (type == NORM_1_AND_2) {
1430:     VecNorm_SeqCUSP(xin,NORM_1,z);
1431:     VecNorm_SeqCUSP(xin,NORM_2,z+1);
1432:   }
1433:   return(0);
1434: }

1436: /*The following template functions are for VecDotNorm2_SeqCUSP.  Note that there is no complex support as currently written*/
1437: template <typename T>
1438: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1439: {
1440:   __host__ __device__
1441:   T operator()(T x)
1442:   {
1443: #if defined(PETSC_USE_COMPLEX)
1444:     //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1445: #else
1446:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1447: #endif
1448:   }
1449: };

1451: template <typename T>
1452: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1453: {
1454:   __host__ __device__
1455:   T operator()(T x,T y)
1456:   {
1457:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1458:   }
1459: };

1461: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1462: {
1463:   PetscErrorCode                         ierr;
1464:   PetscScalar                            zero = 0.0;
1465:   PetscReal                              n=s->map->n;
1466:   thrust::tuple<PetscScalar,PetscScalar> result;
1467:   CUSPARRAY                              *sarray,*tarray;

1470:   /*VecCUSPCopyToGPU(s);
1471:    VecCUSPCopyToGPU(t);*/
1472:   VecCUSPGetArrayRead(s,&sarray);
1473:   VecCUSPGetArrayRead(t,&tarray);
1474:   try {
1475: #if defined(PETSC_USE_COMPLEX)
1476:     VecDot_SeqCUSP(s,t,dp);
1477:     VecDot_SeqCUSP(t,t,nm);
1478:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1479:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1480: #else
1481:     result = thrust::transform_reduce(
1482:               thrust::make_zip_iterator(
1483:                 thrust::make_tuple(
1484:                   sarray->begin(),
1485:                   tarray->begin())),
1486:               thrust::make_zip_iterator(
1487:                 thrust::make_tuple(
1488:                   sarray->end(),
1489:                   tarray->end())),
1490:               cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1491:               thrust::make_tuple(zero,zero),                                   /*init */
1492:               cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >());  /* binary function */
1493:     *dp = thrust::get<0>(result);
1494:     *nm = thrust::get<1>(result);
1495: #endif
1496:   } catch(char *ex) {
1497:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1498:   }
1499:   VecCUSPRestoreArrayRead(s,&sarray);
1500:   VecCUSPRestoreArrayRead(t,&tarray);
1501:   WaitForGPU();CHKERRCUSP(ierr);
1502:   PetscLogFlops(4.0*n);
1503:   return(0);
1504: }

1506: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1507: {
1509:   cudaError_t    err;

1512:   try {
1513:     if (v->spptr) {
1514:       delete ((Vec_CUSP*)v->spptr)->GPUarray;
1515:       err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1516:       delete (Vec_CUSP*)v->spptr;
1517:     }
1518:   } catch(char *ex) {
1519:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1520:   }
1521:   VecDestroy_SeqCUSP_Private(v);
1522:   return(0);
1523: }


1526: #if defined(PETSC_USE_COMPLEX)
1527: struct conjugate
1528: {
1529:   __host__ __device__
1530:   PetscScalar operator()(PetscScalar x)
1531:   {
1532:     return cusp::conj(x);
1533:   }
1534: };
1535: #endif


1538: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1539: {
1541:   CUSPARRAY      *xarray;

1544:   VecCUSPGetArrayReadWrite(xin,&xarray);
1545: #if defined(PETSC_USE_COMPLEX)
1546:   thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1547: #endif
1548:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
1549:   return(0);
1550: }

1552: PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
1553: {
1555:   cudaError_t    err;


1562:   if (w->data) {
1563:     if (((Vec_Seq*)w->data)->array_allocated) PetscFree(((Vec_Seq*)w->data)->array_allocated);
1564:     ((Vec_Seq*)w->data)->array = 0;
1565:     ((Vec_Seq*)w->data)->array_allocated = 0;
1566:     ((Vec_Seq*)w->data)->unplacedarray = 0;
1567:   }
1568:   if (w->spptr) {
1569:     if (((Vec_CUSP*)w->spptr)->GPUarray) delete ((Vec_CUSP*)w->spptr)->GPUarray;
1570:     err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1571:     delete (Vec_CUSP*)w->spptr;
1572:     w->spptr = 0;
1573:   }

1575:   if (v->petscnative) {
1576:     PetscFree(w->data);
1577:     w->data = v->data;
1578:     w->valid_GPU_array = v->valid_GPU_array;
1579:     w->spptr = v->spptr;
1580:     PetscObjectStateIncrease((PetscObject)w);
1581:   } else {
1582:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1583:     w->valid_GPU_array = PETSC_CUSP_CPU;
1584:     VecCUSPAllocateCheck(w);
1585:   }
1586:   return(0);
1587: }

1589: PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
1590: {
1592:   cudaError_t    err;


1599:   if (v->petscnative) {
1600:     v->data = w->data;
1601:     v->valid_GPU_array = w->valid_GPU_array;
1602:     v->spptr = w->spptr;
1603:     VecCUSPCopyFromGPU(v);
1604:     PetscObjectStateIncrease((PetscObject)v);
1605:     w->data = 0;
1606:     w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
1607:     w->spptr = 0;
1608:   } else {
1609:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1610:     if ((Vec_CUSP*)w->spptr) {
1611:       delete ((Vec_CUSP*)w->spptr)->GPUarray;
1612:       err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1613:       delete (Vec_CUSP*)w->spptr;
1614:     }
1615:   }
1616:   return(0);
1617: }

1619: /*@C
1620:    VecCUSPGetArrayReadWrite - Provides access to the CUSP vector inside a vector.

1622:    This function has semantics similar to VecGetArray():  the CUSP
1623:    vector returned by this function points to a consistent view of the
1624:    vector data.  This may involve a copy operation of data from the host
1625:    to the device if the data on the device is out of date.  If the
1626:    device memory hasn't been allocated previously it will be allocated
1627:    as part of this function call.  VecCUSPGetArrayReadWrite() assumes
1628:    that the user will modify the vector data.  This is similar to
1629:    intent(inout) in fortran.

1631:    The CUSP device vector has to be released by calling
1632:    VecCUSPRestoreArrayReadWrite().  Upon restoring the vector data the
1633:    data on the host will be marked as out of date.  A subsequent access
1634:    of the host data will thus incur a data transfer from the device to
1635:    the host.


1638:    Input Parameter:
1639: .  v - the vector

1641:    Output Parameter:
1642: .  a - the CUSP device vector
1643:    
1644:    Fortran note: This function is not currently available from Fortran.

1646:    Fortran note:
1647:    This function is not currently available from Fortran.

1649:    Level: intermediate

1651: .seealso: VecCUSPRestoreArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1652: @*/
1653: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1654: {

1659:   *a   = 0;
1660:   VecCUSPCopyToGPU(v);
1661:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1662:   return(0);
1663: }

1665: /*@C
1666:    VecCUSPRestoreArrayReadWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayReadWrite().

1668:    This marks the host data as out of date.  Subsequent access to the
1669:    vector data on the host side with for instance VecGetArray() incurs a
1670:    data transfer.

1672:    Input Parameter:
1673: +  v - the vector
1674: -  a - the CUSP device vector.  This pointer is invalid after
1675:        VecCUSPRestoreArrayReadWrite() returns.

1677:    Fortran note:
1678:    This function is not currently available from Fortran.

1680:    Level: intermediate

1682: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1683: @*/
1684: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1685: {

1690:   v->valid_GPU_array = PETSC_CUSP_GPU;

1692:   PetscObjectStateIncrease((PetscObject)v);
1693:   return(0);
1694: }

1696: /*@C
1697:    VecCUSPGetArrayRead - Provides read access to the CUSP device vector inside a vector.

1699:    This function is analogous to VecGetArrayRead():  The CUSP vector
1700:    returned by this function points to a consistent view of the vector
1701:    data.  This may involve a copy operation of data from the host to the
1702:    device if the data on the device is out of date.  If the device
1703:    memory hasn't been allocated previously it will be allocated as part
1704:    of this function call.  VecCUSPGetArrayRead() assumes that the user
1705:    will not modify the vector data.  This is analogous to intent(in) in
1706:    Fortran.

1708:    The CUSP device vector has to be released by calling
1709:    VecCUSPRestoreArrayRead().  If the data on the host side was
1710:    previously up to date it will remain so, i.e. data on both the device
1711:    and the host is up to date.  Accessing data on the host side does not
1712:    incur a device to host data transfer.

1714:    Input Parameter:
1715: .  v - the vector

1717:    Output Parameter:
1718: .  a - the CUSP device vector

1720:    Fortran note:
1721:    This function is not currently available from Fortran.

1723:    Level: intermediate

1725: .seealso: VecCUSPRestoreArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecGetArrayRead()
1726: @*/
1727: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1728: {

1733:   *a   = 0;
1734:   VecCUSPCopyToGPU(v);
1735:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1736:   return(0);
1737: }

1739: /*@C
1740:    VecCUSPRestoreArrayRead - Restore a CUSP device vector previously acquired with VecCUSPGetArrayRead().

1742:    If the data on the host side was previously up to date it will remain
1743:    so, i.e. data on both the device and the host is up to date.
1744:    Accessing data on the host side e.g. with VecGetArray() does not
1745:    incur a device to host data transfer.

1747:    Input Parameter:
1748: +  v - the vector
1749: -  a - the CUSP device vector.  This pointer is invalid after
1750:        VecCUSPRestoreArrayRead() returns.

1752:    Fortran note:
1753:    This function is not currently available from Fortran.

1755:    Level: intermediate

1757: .seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1758: @*/
1759: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1760: {
1763:   return(0);
1764: }

1766: /*@C
1767:    VecCUSPGetArrayWrite - Provides write access to the CUSP device vector inside a vector.

1769:    The data pointed to by the device vector is uninitialized.  The user
1770:    must not read this data.  Furthermore, the entire array needs to be
1771:    filled by the user to obtain well-defined behaviour.  The device
1772:    memory will be allocated by this function if it hasn't been allocated
1773:    previously.  This is analogous to intent(out) in Fortran.

1775:    The CUSP device vector needs to be released with
1776:    VecCUSPRestoreArrayWrite().  When the pointer is released the host
1777:    data of the vector is marked as out of data.  Subsequent access of
1778:    the host data with e.g. VecGetArray() incurs a device to host data
1779:    transfer.


1782:    Input Parameter:
1783: .  v - the vector

1785:    Output Parameter:
1786: .  a - the CUDA pointer

1788:    Fortran note:
1789:    This function is not currently available from Fortran.

1791:    Level: intermediate

1793: .seealso: VecCUSPRestoreArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1794: @*/
1795: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1796: {

1801:   *a   = 0;
1802:   VecCUSPAllocateCheck(v);
1803:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1804:   return(0);
1805: }

1807: /*@C
1808:    VecCUSPRestoreArrayWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayWrite().

1810:    Data on the host will be marked as out of date.  Subsequent access of
1811:    the data on the host side e.g. with VecGetArray() will incur a device
1812:    to host data transfer.

1814:    Input Parameter:
1815: +  v - the vector
1816: -  a - the CUDA device pointer.  This pointer is invalid after
1817:        VecCUSPRestoreArrayWrite() returns.

1819:    Fortran note:
1820:    This function is not currently available from Fortran.

1822:    Level: intermediate

1824: .seealso: VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1825: @*/
1826: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
1827: {

1832:   v->valid_GPU_array = PETSC_CUSP_GPU;

1834:   PetscObjectStateIncrease((PetscObject)v);
1835:   return(0);
1836: }

1838: /*@C
1839:    VecCUSPGetCUDAArrayReadWrite - Provides access to the CUDA buffer inside a vector.

1841:    This function has semantics similar to VecGetArray():  the pointer
1842:    returned by this function points to a consistent view of the vector
1843:    data.  This may involve a copy operation of data from the host to the
1844:    device if the data on the device is out of date.  If the device
1845:    memory hasn't been allocated previously it will be allocated as part
1846:    of this function call.  VecCUSPGetCUDAArrayReadWrite() assumes that
1847:    the user will modify the vector data.  This is similar to
1848:    intent(inout) in fortran.

1850:    The CUDA device pointer has to be released by calling
1851:    VecCUSPRestoreCUDAArrayReadWrite().  Upon restoring the vector data
1852:    the data on the host will be marked as out of date.  A subsequent
1853:    access of the host data will thus incur a data transfer from the
1854:    device to the host.


1857:    Input Parameter:
1858: .  v - the vector

1860:    Output Parameter:
1861: .  a - the CUDA device pointer

1863:    Fortran note:
1864:    This function is not currently available from Fortran.

1866:    Level: advanced

1868: .seealso: VecCUSPRestoreCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1869: @*/
1870: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayReadWrite(Vec v, PetscScalar **a)
1871: {
1873:   CUSPARRAY      *cusparray;

1878:   VecCUSPGetArrayReadWrite(v, &cusparray);
1879:   *a   = thrust::raw_pointer_cast(cusparray->data());
1880:   return(0);
1881: }

1883: /*@C
1884:    VecCUSPRestoreCUDAArrayReadWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayReadWrite().

1886:    This marks the host data as out of date.  Subsequent access to the
1887:    vector data on the host side with for instance VecGetArray() incurs a
1888:    data transfer.

1890:    Input Parameter:
1891: +  v - the vector
1892: -  a - the CUDA device pointer.  This pointer is invalid after
1893:        VecCUSPRestoreCUDAArrayReadWrite() returns.

1895:    Fortran note:
1896:    This function is not currently available from Fortran.

1898:    Level: advanced

1900: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1901: @*/
1902: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayReadWrite(Vec v, PetscScalar **a)
1903: {

1908:   v->valid_GPU_array = PETSC_CUSP_GPU;
1909:   PetscObjectStateIncrease((PetscObject)v);
1910:   return(0);
1911: }

1913: /*@C
1914:    VecCUSPGetCUDAArrayRead - Provides read access to the CUDA buffer inside a vector.

1916:    This function is analogous to VecGetArrayRead():  The pointer
1917:    returned by this function points to a consistent view of the vector
1918:    data.  This may involve a copy operation of data from the host to the
1919:    device if the data on the device is out of date.  If the device
1920:    memory hasn't been allocated previously it will be allocated as part
1921:    of this function call.  VecCUSPGetCUDAArrayRead() assumes that the
1922:    user will not modify the vector data.  This is analgogous to
1923:    intent(in) in Fortran.

1925:    The CUDA device pointer has to be released by calling
1926:    VecCUSPRestoreCUDAArrayRead().  If the data on the host side was
1927:    previously up to date it will remain so, i.e. data on both the device
1928:    and the host is up to date.  Accessing data on the host side does not
1929:    incur a device to host data transfer.

1931:    Input Parameter:
1932: .  v - the vector

1934:    Output Parameter:
1935: .  a - the CUDA pointer.

1937:    Fortran note:
1938:    This function is not currently available from Fortran.

1940:    Level: advanced

1942: .seealso: VecCUSPRestoreCUDAArrayRead(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1943: @*/
1944: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayRead(Vec v, PetscScalar **a)
1945: {
1947:   CUSPARRAY      *cusparray;

1952:   VecCUSPGetArrayRead(v, &cusparray);
1953:   *a   = thrust::raw_pointer_cast(cusparray->data());
1954:   return(0);
1955: }

1957: /*@C
1958:    VecCUSPRestoreCUDAArrayRead - Restore a device vector previously acquired with VecCUSPGetCUDAArrayRead()

1960:    If the data on the host side was previously up to date it will remain
1961:    so, i.e. data on both the device and the host is up to date.
1962:    Accessing data on the host side e.g. with VecGetArray() does not
1963:    incur a device to host data transfer.

1965:    Input Parameter:
1966: +  v - the vector

1968: -  a - the CUDA device pointer.  This pointer is invalid after
1969:        VecCUSPRestoreCUDAArrayRead() returns.

1971:    Fortran note:
1972:    This function is not currently available from Fortran.

1974:    Level: advanced

1976:    Fortran note: This function is not currently available from Fortran.

1978: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1979: @*/
1980: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayRead(Vec v, PetscScalar **a)
1981: {
1984:   v->valid_GPU_array = PETSC_CUSP_BOTH;
1985:   return(0);
1986: }

1988: /*@C
1989:    VecCUSPGetCUDAArrayWrite - Provides write access to the CUDA buffer inside a vector.

1991:    The data pointed to by the device pointer is uninitialized.  The user
1992:    may not read from this data.  Furthermore, the entire array needs to
1993:    be filled by the user to obtain well-defined behaviour.  The device
1994:    memory will be allocated by this function if it hasn't been allocated
1995:    previously.  This is analogous to intent(out) in Fortran.

1997:    The device pointer needs to be released with
1998:    VecCUSPRestoreCUDAArrayWrite().  When the pointer is released the
1999:    host data of the vector is marked as out of data.  Subsequent access
2000:    of the host data with e.g. VecGetArray() incurs a device to host data
2001:    transfer.


2004:    Input Parameter:
2005: .  v - the vector

2007:    Output Parameter:
2008: .  a - the CUDA pointer

2010:    Fortran note:
2011:    This function is not currently available from Fortran.

2013:    Level: advanced

2015: .seealso: VecCUSPRestoreCUDAArrayWrite(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
2016: @*/
2017: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayWrite(Vec v, PetscScalar **a)
2018: {
2020:   CUSPARRAY      *cusparray;

2025:   VecCUSPGetArrayWrite(v, &cusparray);
2026:   *a   = thrust::raw_pointer_cast(cusparray->data());
2027:   return(0);
2028: }

2030: /*@C
2031:    VecCUSPRestoreCUDAArrayWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayWrite().

2033:    Data on the host will be marked as out of date.  Subsequent access of
2034:    the data on the host side e.g. with VecGetArray() will incur a device
2035:    to host data transfer.

2037:    Input Parameter:
2038: +  v - the vector

2040: -  a - the CUDA device pointer.  This pointer is invalid after
2041:        VecCUSPRestoreCUDAArrayWrite() returns.

2043:    Fortran note:
2044:    This function is not currently available from Fortran.

2046:    Level: advanced

2048: .seealso: VecCUSPGetCUDAArrayWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
2049: @*/
2050: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayWrite(Vec v, PetscScalar **a)
2051: {

2056:   v->valid_GPU_array = PETSC_CUSP_GPU;
2057:   PetscObjectStateIncrease((PetscObject)v);
2058:   return(0);
2059: }


2062: /*@C
2063:    VecCUSPPlaceArray - Allows one to replace the array in a vector with a
2064:    CUSPARRAY provided by the user. This is useful to avoid copying a
2065:    CUSPARRAY into a vector.

2067:    Not Collective

2069:    Input Parameters:
2070: +  vec - the vector
2071: -  array - the CUSPARRAY

2073:    Notes:
2074:    You can return to the original CUSPARRAY with a call to VecCUSPResetArray()
2075:    It is not possible to use VecCUSPPlaceArray() and VecPlaceArray() at the
2076:    same time on the same vector.

2078:    Level: developer

2080: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPReplaceArray()

2082: @*/
2083: PetscErrorCode VecCUSPPlaceArray(Vec vin,CUSPARRAY *a)
2084: {

2089:   VecCUSPCopyToGPU(vin);
2090:   if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUSPPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUSPResetArray()/VecResetArray()");
2091:   ((Vec_Seq*)vin->data)->unplacedarray  = (PetscScalar *) ((Vec_CUSP*)vin->spptr)->GPUarray; /* save previous CUDAARRAY so reset can bring it back */
2092:   ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2093:   vin->valid_GPU_array = PETSC_CUSP_GPU;
2094:   PetscObjectStateIncrease((PetscObject)vin);
2095:   return(0);
2096: }

2098: /*@C
2099:    VecCUSPReplaceArray - Allows one to replace the CUSPARRAY in a vector
2100:    with a CUSPARRAY provided by the user. This is useful to avoid copying
2101:    a CUSPARRAY into a vector.

2103:    Not Collective

2105:    Input Parameters:
2106: +  vec - the vector
2107: -  array - the CUSPARRAY

2109:    Notes:
2110:    This permanently replaces the CUSPARRAY and frees the memory associated
2111:    with the old CUSPARRAY.

2113:    The memory passed in CANNOT be freed by the user. It will be freed
2114:    when the vector is destroy.

2116:    Not supported from Fortran

2118:    Level: developer

2120: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPPlaceArray(), VecReplaceArray()

2122: @*/
2123: PetscErrorCode VecCUSPReplaceArray(Vec vin,CUSPARRAY *a)
2124: {

2129:   delete ((Vec_CUSP*)vin->spptr)->GPUarray;
2130:   ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2131:   vin->valid_GPU_array = PETSC_CUSP_GPU;
2132:   PetscObjectStateIncrease((PetscObject)vin);
2133:   return(0);
2134: }

2136: /*@C
2137:    VecCUSPResetArray - Resets a vector to use its default memory. Call this
2138:    after the use of VecCUSPPlaceArray().

2140:    Not Collective

2142:    Input Parameters:
2143: .  vec - the vector

2145:    Level: developer

2147: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUSPPlaceArray(), VecCUSPReplaceArray()

2149: @*/
2150: PetscErrorCode VecCUSPResetArray(Vec vin)
2151: {

2156:   VecCUSPCopyToGPU(vin);
2157:   ((Vec_CUSP*)vin->spptr)->GPUarray = (CUSPARRAY *) ((Vec_Seq*)vin->data)->unplacedarray;
2158:   ((Vec_Seq*)vin->data)->unplacedarray = 0;
2159:   vin->valid_GPU_array = PETSC_CUSP_GPU;
2160:   PetscObjectStateIncrease((PetscObject)vin);
2161:   return(0);
2162: }