Actual source code: sveccuda.cu
slepc-3.8.2 2017-12-01
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV implemented as a single Vec (CUDA version)
12: */
14: #include <slepc/private/bvimpl.h>
15: #include "../svecimpl.h"
17: #define BLOCKSIZE 64
/*
   B := alpha*A + beta*B

   A,B are nxk (ld=n), stored contiguously on the device.
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;

  PetscBLASIntCast(n_*k_,&m);
  if (beta!=(PetscScalar)1.0) {
    /* scale B in place first, then accumulate alpha*A with axpy */
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    PetscLogFlops(m);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  /* fix: capture the synchronization result; ierr was previously undeclared */
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  PetscLogFlops(2.0*m);
  return(0);
}
/*
   C := alpha*A*B + beta*C

   A (active columns of X) and C (active columns of Y) live in the single
   contiguous device array of each BV; B is the dense host matrix Q, copied
   to the device. If Q is NULL, compute C := alpha*A + beta*C instead.
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          m,n,k,ldq,mq;
  cublasStatus_t    cberr;
  cudaError_t       err;

  m = Y->n;
  n = Y->k-Y->l;
  k = X->k-X->l;
  if (!Y->n) return(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  /* when beta=0 the previous contents of Y need not be copied to the GPU */
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y->v,&d_py);
  } else {
    VecCUDAGetArrayReadWrite(y->v,&d_py);
  }
  /* skip constrained and leading inactive columns */
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    MatGetSize(Q,&ldq,&mq);
    MatDenseGetArray(Q,&q);
    /* copy the host-side coefficient matrix Q to the device */
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    d_B = d_q+Y->l*ldq+X->l;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    MatDenseRestoreArray(Q,&q);
    err = cudaFree(d_q);CHKERRCUDA(err);
    PetscLogFlops(2.0*m*n*k);
  } else {
    BVAXPY_BLAS_CUDA(Y,m,n,alpha,d_A,beta,d_C);
  }
  VecCUDARestoreArrayRead(x->v,&d_px);
  /* fix: the restore call must match the flavor of the Get call above;
     previously VecCUDARestoreArrayWrite was used unconditionally even when
     the array had been obtained with VecCUDAGetArrayReadWrite */
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y->v,&d_py);
  } else {
    VecCUDARestoreArrayReadWrite(y->v,&d_py);
  }
  return(0);
}
/*
   y := alpha*A*x + beta*y

   A is the set of active columns of X. The coefficients x are taken from the
   internal buffer of X when q is NULL, otherwise copied from the host array q
   into a temporary device buffer.
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cudaError_t       err;

  n = X->n;
  k = X->k-X->l;
  VecCUDAGetArrayRead(x->v,&d_px);
  /* when beta=0 the previous contents of y need not be copied to the GPU */
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArrayReadWrite(y,&d_py);
  }
  if (!q) {
    VecCUDAGetArrayReadWrite(X->buffer,&d_q);
  } else {
    /* copy the host coefficients to a temporary device buffer;
       fix: these CUDA calls were previously unchecked */
    err = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  }
  d_A = d_px+(X->nc+X->l)*X->n;   /* skip constrained and leading columns */
  d_x = d_q;
  d_y = d_py;
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArrayReadWrite(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArrayReadWrite(X->buffer,&d_q);
  } else {
    err = cudaFree(d_q);CHKERRCUDA(err);
  }
  PetscLogFlops(2.0*n*k);
  return(0);
}
/*
   A(:,s:e-1) := A*B(:,s:e-1)

   Computes the product into a device work array and copies the columns back
   into V. If there is not enough free GPU memory for the whole m x n result,
   the update proceeds in blocks of rows (remainder rows first, then full
   blocks of bs rows).
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,l,ldq,nq,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;

  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) return(0);
  MatGetSize(Q,&ldq,&nq);
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  /* fix: CUDA calls below were previously unchecked, and the in-loop memcpys
     checked an unassigned ierr instead of the memcpy result */
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  /* try to allocate the whole result matrix */
  err = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(err);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
    d_A = d_pv+(V->nc+V->l)*m;              /* active part of V */
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;   /* columns s..e-1 of the active block of Q */
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
    /* copy the result back into columns s..e-1 of V */
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
    /* blocked computation over chunks of bs rows;
       NOTE(review): if freemem < m*sizeof(PetscScalar) then bs becomes 0 and
       the loop below would not advance - assumed not to happen in practice */
    bs = freemem/(m*sizeof(PetscScalar));
    err = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(err);
    l = m % bs;
    if (l) {
      /* handle the remainder rows first so the main loop uses full blocks */
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;          /* shift down by l rows */
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  MatDenseRestoreArray(Q,&q);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);
  PetscLogFlops(2.0*m*n*k);
  return(0);
}
/*
   A(:,s:e-1) := A*B(:,s:e-1), where B is the conjugate transpose of Q
   (note CUBLAS_OP_C on the second gemm operand).
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,ldq,nq;
  cublasStatus_t cberr;
  cudaError_t    err;

  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) return(0);
  MatGetSize(Q,&ldq,&nq);
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  /* copy Q to the device and allocate a work array for the product;
     fix: these CUDA calls were previously unchecked */
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_pv+(V->nc+V->l)*m;   /* active part of V */
  d_B = d_q+V->l*ldq+s;        /* rows s.. of the active block of Q */
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
  /* write the result back into columns s..e-1 of V */
  for (j=0;j<n;j++) {
    err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  MatDenseRestoreArray(Q,&q);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);
  PetscLogFlops(2.0*m*n*k);
  return(0);
}
/*
   C := A'*B

   A = active columns of Y, B = active columns of X. The local product is
   computed on the GPU into a work array, copied to the host, and in the
   parallel case reduced with MPI_Allreduce into the dense matrix M.
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          ldm,m,n,k,j;
  PetscMPIInt       len;   /* fix: was PetscInt; PetscMPIIntCast writes a PetscMPIInt */
  cublasStatus_t    cberr;
  cudaError_t       err;

  m = Y->k-Y->l;
  n = X->k-X->l;
  k = X->n;
  MatGetSize(M,&ldm,NULL);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;   /* target submatrix inside M */
  if (x->mpi) {
    if (ldm==m) {
      /* result block is contiguous in M: reduce directly into C */
      BVAllocateWork_Private(X,m*n);
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm);CHKERRCUBLAS(cberr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      } else {
        /* no local rows: contribute zeros to the reduction */
        PetscMemzero(X->work,m*n*sizeof(PetscScalar));
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      /* reduce into a packed buffer CC, then scatter into the rows of M */
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      } else {
        PetscMemzero(X->work,m*n*sizeof(PetscScalar));
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) {
        PetscMemcpy(C+j*ldm,CC+j*m,m*sizeof(PetscScalar));
      }
    }
  } else {
    /* sequential: copy the packed result column by column into M */
    if (k) {
      BVAllocateWork_Private(X,m*n);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      for (j=0;j<n;j++) {
        PetscMemcpy(C+j*ldm,X->work+j*m,m*sizeof(PetscScalar));
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  err = cudaFree(d_work);CHKERRCUDA(err);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogFlops(2.0*m*n*k);
  return(0);
}
/*
   y := A'*x

   Dot products of the active columns of X against the vector y (or against
   B*y when an inner-product matrix is set on X). The result goes either to
   the host array q, or (when q is NULL) into the internal buffer vector
   X->buffer.
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode ierr;
  BV_SVEC *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt n,k,one=1,len;
  Vec z = y;
  cublasStatus_t cberr;
  n = X->n;                       /* local length of each column */
  k = X->k-X->l;                  /* number of active columns */
  if (X->matrix) {
    /* with an inner-product matrix, dot against z = B*y instead of y */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    /* result will be produced directly in the internal buffer vector */
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    /* temporary device buffer for the k results
       NOTE(review): this cudaMalloc is not error-checked, unlike others in this file */
    cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;   /* skip constrained and leading columns */
  d_x = d_py;
  if (x->mpi) {
    /* parallel: local gemv, then allreduce the k partial results */
    BVAllocateWork_Private(X,k);
    if (n) {
      cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,n,d_x,one,&szero,d_work,one);CHKERRCUBLAS(cberr);
      cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    } else {
      /* no local rows: contribute zeros to the reduction */
      PetscMemzero(X->work,k*sizeof(PetscScalar));
    }
    if (!q) {
      /* reduce directly into the host array of the buffer vector */
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else {
      cudaFree(d_work);
    }
    PetscMPIIntCast(k,&len);
    MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    /* sequential: one gemv; result stays on the device (buffer) or is
       copied back to the host array q */
    if (n) {
      cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,n,d_x,one,&szero,d_work,one);CHKERRCUBLAS(cberr);
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      cudaFree(d_work);
    }
  }
  /* NOTE(review): ierr is checked here without ever being assigned;
     presumably this should read ierr = WaitForGPU() - verify */
  WaitForGPU();CHKERRCUDA(ierr);
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogFlops(2.0*n*k);
  return(0);
}
/*
   m := A'*x (local part only, no parallel reduction)

   Same as BVDotVec but the k results are written straight to the host
   array m without an MPI reduction.
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;

  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    /* with an inner-product matrix, dot against z = B*y instead of y */
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;   /* skip constrained and leading columns */
  d_x = d_py;
  if (n) {
    err = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(err);
    cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,n,d_x,one,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    err = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* fix: free only inside the guard; previously d_y was freed even when
       n==0, in which case it had never been allocated (uninitialized ptr).
       NOTE(review): when n==0 the output array m is left untouched - same
       as the original behavior */
    err = cudaFree(d_y);CHKERRCUDA(err);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogFlops(2.0*n*k);
  return(0);
}
/*
   Scale either column j (j>=0) or all active columns (j<0) by alpha.
   alpha=0 is implemented as a memset; alpha=1 is a no-op.
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;   /* fix: was used in CHKERRCUDA without being declared */
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array, *d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cudaError_t    err;

  VecCUDAGetArrayReadWrite(ctx->v,&d_array);
  if (j<0) {
    /* all active columns */
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    n = (bv->k-bv->l)*bv->n;
  } else {
    /* single column j */
    d_A = d_array+(bv->nc+j)*bv->n;
    n = bv->n;
  }
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  } else if (alpha != (PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    PetscLogFlops(n);
  }
  VecCUDARestoreArrayReadWrite(ctx->v,&d_array);
  return(0);
}
/*
   W := A*V, computed column by column. Each active column of V and W is
   exposed through the internal work vectors cv[1] via VecCUDAPlaceArray,
   so that MatMult can operate on it directly on the device.
*/
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          col;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  for (col=0;col<V->k-V->l;col++) {
    /* alias the work vectors to column col of V and of W */
    VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+col)*V->n);
    VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+col)*W->n);
    MatMult(A,V->cv[1],W->cv[1]);
    VecCUDAResetArray(V->cv[1]);
    VecCUDAResetArray(W->cv[1]);
  }
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}
/*
   Copy the active columns of V into the corresponding columns of W with a
   single device-to-device memcpy (both BVs store columns contiguously).
*/
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_src;
  PetscScalar       *d_pw,*d_dst;
  cudaError_t       err;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  /* first active column of each BV, skipping constrained columns */
  d_src = d_pv+(V->nc+V->l)*V->n;
  d_dst = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_dst,d_src,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}
/*
   Resize the BV to m columns: allocate a replacement internal Vec of the
   proper size (same type/blocksize as the template vector bv->t) and,
   if requested, copy over min(m, old m) columns on the device.
*/
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_old;
  PetscScalar       *d_new;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       err;

  /* build the replacement vector */
  VecGetBlockSize(bv->t,&bs);
  VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
  VecSetType(vnew,((PetscObject)bv->t)->type_name);
  VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
  VecSetBlockSize(vnew,bs);
  PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
  if (((PetscObject)bv)->name) {
    /* name the new vector after the BV, as done on creation */
    PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);
    PetscObjectSetName((PetscObject)vnew,str);
  }
  if (copy) {
    /* device-to-device copy of the surviving columns */
    VecCUDAGetArrayRead(ctx->v,&d_old);
    VecCUDAGetArrayWrite(vnew,&d_new);
    err = cudaMemcpy(d_new,d_old,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    VecCUDARestoreArrayRead(ctx->v,&d_old);
    VecCUDARestoreArrayWrite(vnew,&d_new);
  }
  VecDestroy(&ctx->v);
  ctx->v = vnew;
  return(0);
}
/*
   Expose column j of the BV as a standalone Vec by aliasing one of the two
   internal work vectors (bv->cv) to the column's device data.
*/
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *d_pv;
  PetscInt    slot;

  slot = BVAvailableVec;   /* pick the free work vector */
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  /* point the work vector at the start of column j (constrained cols first) */
  VecCUDAPlaceArray(bv->cv[slot],d_pv+(bv->nc+j)*bv->n);
  return(0);
}
/*
   Release the work vector that was handed out by BVGetColumn for column j.
*/
PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC  *ctx = (BV_SVEC*)bv->data;
  PetscInt slot;

  /* determine which of the two work vectors holds column j */
  if (j==bv->ci[0]) slot = 0;
  else slot = 1;
  VecCUDAResetArray(bv->cv[slot]);
  VecCUDARestoreArrayReadWrite(ctx->v,NULL);
  return(0);
}