Actual source code: sveccuda.cu (slepc-3.14.1, released 2020-12-08)

/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* for thrust::transform() used below */
#endif

#define BLOCKSIZE 64

/*
   B := alpha*A + beta*B

   A,B are nxk (stored by columns, ld=n)
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscBLASInt   m=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n_*k_,&m);
  PetscLogGpuTimeBegin();
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    PetscLogGpuFlops(1.0*m);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*m);
  return(0);
}
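
/*
   Note: a single SCAL+AXPY pair suffices for all k columns because the n-by-k
   block is stored by columns with leading dimension equal to the number of
   rows, hence it occupies one contiguous buffer that can be viewed as a
   vector of n*k scalars
*/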

/*
   C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cudaError_t       cerr;
  cublasHandle_t    cublasv2handle;

  if (!Y->n) return(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y->v,&d_py);
  } else {
    VecCUDAGetArray(y->v,&d_py);
  }
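  /* the nc constraint columns and the l leading (locked) columns are skipped;
     each column has local length n, so the active block starts at offset
     (nc+l)*n into the contiguous column-major storage */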
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    PetscBLASIntCast(Y->n,&m);
    PetscBLASIntCast(Y->k-Y->l,&n);
    PetscBLASIntCast(X->k-X->l,&k);
    PetscCUBLASGetHandle(&cublasv2handle);
    MatGetSize(Q,&ldq,&mq);
    PetscBLASIntCast(ldq,&ldq_);
    MatDenseGetArray(Q,&q);
    cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));
    d_B = d_q+Y->l*ldq+X->l;
    PetscLogGpuTimeBegin();
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    PetscLogGpuTimeEnd();
    MatDenseRestoreArray(Q,&q);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
    PetscLogGpuFlops(2.0*m*n*k);
  } else {
    BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);
  }
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {  /* restore with the variant matching the get above */
    VecCUDARestoreArrayWrite(y->v,&d_py);
  } else {
    VecCUDARestoreArray(y->v,&d_py);
  }
  return(0);
}
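
/*
   Usage sketch (illustrative, not part of the library): builds a CUDA BV and
   computes Y = X*Q through the public BVMult() interface, which dispatches to
   BVMult_Svec_CUDA() above when the BV is of type BVSVEC backed by VECCUDA
   vectors. The function name and the guard macro SVECCUDA_USAGE_EXAMPLES are
   made up for this example; sizes are arbitrary.
*/
#if defined(SVECCUDA_USAGE_EXAMPLES)
static PetscErrorCode BVMultUsageSketch(MPI_Comm comm)
{
  PetscErrorCode ierr;
  Vec            t;
  BV             X,Y;
  Mat            Q;
  PetscInt       n=100,kx=8,ky=6;

  ierr = VecCreate(comm,&t);CHKERRQ(ierr);              /* template vector on the GPU */
  ierr = VecSetSizes(t,PETSC_DECIDE,n);CHKERRQ(ierr);
  ierr = VecSetType(t,VECCUDA);CHKERRQ(ierr);
  ierr = BVCreate(comm,&X);CHKERRQ(ierr);
  ierr = BVSetSizesFromVec(X,t,kx);CHKERRQ(ierr);
  ierr = BVSetType(X,BVSVEC);CHKERRQ(ierr);
  ierr = BVSetRandom(X);CHKERRQ(ierr);
  ierr = BVDuplicateResize(X,ky,&Y);CHKERRQ(ierr);
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,kx,ky,NULL,&Q);CHKERRQ(ierr);
  /* ... fill Q ... */
  ierr = BVMult(Y,1.0,0.0,X,Q);CHKERRQ(ierr);           /* Y := X*Q, runs on the GPU */
  ierr = MatDestroy(&Q);CHKERRQ(ierr);
  ierr = BVDestroy(&Y);CHKERRQ(ierr);
  ierr = BVDestroy(&X);CHKERRQ(ierr);
  ierr = VecDestroy(&t);CHKERRQ(ierr);
  return(0);
}
#endif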

/*
   y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n=0,k=0,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArray(y,&d_py);
  }
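  /* if q is NULL the coefficients are taken from the internal X->buffer
     vector, which already resides on the GPU; otherwise the host array q is
     copied to a temporary device buffer first */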
  if (!q) {
    VecCUDAGetArray(X->buffer,&d_q);
  } else {
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  PetscLogGpuTimeBegin();
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArray(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArray(X->buffer,&d_q);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
   A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m=0,n=0,k=0,l,ldq_=0,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  if (!V->n) return(0);
  PetscBLASIntCast(V->n,&m);
  PetscBLASIntCast(e-s,&n);
  PetscBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,&ldq,&nq);
  PetscBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  /* try to allocate a work buffer for the whole result matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    /* not enough free memory: proceed by blocks of rows */
    bs = freemem/(m*sizeof(PetscScalar));
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
  }
  PetscLogGpuTimeEnd();
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  MatDenseRestoreArray(Q,&q);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}
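
/*
   Illustrative sketch (not from the library): the core of the fallback path
   above, rewritten as a standalone helper. It overwrites columns s..s+n-1 of
   the m-by-k column-major matrix d_V (leading dimension m) with
   d_V*Q(:,s:s+n-1), one block of at most bs rows at a time, so only a bs-by-n
   scratch buffer is needed instead of a full m-by-n one. The helper name and
   the guard macro SVECCUDA_USAGE_EXAMPLES are made up.
*/
#if defined(SVECCUDA_USAGE_EXAMPLES)
static PetscErrorCode BlockedGemmSketch(cublasHandle_t handle,PetscScalar *d_V,PetscBLASInt m,PetscBLASInt k,const PetscScalar *d_Q,PetscBLASInt ldq,PetscBLASInt n,PetscBLASInt s,PetscBLASInt bs)
{
  PetscScalar    sone=1.0,szero=0.0,*d_work;
  PetscBLASInt   l,rows,j;
  cublasStatus_t cberr;
  cudaError_t    cerr;

  cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  for (l=0;l<m;l+=bs) {
    rows = PetscMin(bs,m-l);
    /* rows-by-n tile of V*Q(:,s:s+n-1), starting at row l */
    cberr = cublasXgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,rows,n,k,&sone,d_V+l,m,d_Q+s*ldq,ldq,&szero,d_work,rows);CHKERRCUBLAS(cberr);
    /* copy the tile back into columns s..s+n-1 of V; only rows already
       consumed by this tile's GEMM are overwritten, so in-place is safe */
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_V+(s+j)*m+l,d_work+j*rows,rows*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  }
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  return(0);
}
#endif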

/*
   A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m=0,n=0,k=0,ldq_=0;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  if (!V->n) return(0);
  PetscBLASIntCast(V->n,&m);
  PetscBLASIntCast(e-s,&n);
  PetscBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,&ldq,&nq);
  PetscBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  PetscLogGpuTimeBegin();
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  PetscLogGpuTimeEnd();
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  MatDenseRestoreArray(Q,&q);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}

/*
   C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(Y->k-Y->l,&m);
  PetscBLASIntCast(X->k-X->l,&n);
  PetscBLASIntCast(X->n,&k);
  MatGetSize(M,&ldm,NULL);
  PetscBLASIntCast(ldm,&ldm_);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
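  /* in parallel each process first computes its local m-by-n block with a
     GEMM on the GPU, then the partial results are summed across processes
     with an Allreduce on the host; if ldm==m the reduction can write straight
     into M, otherwise it goes to a packed buffer CC that is copied into M
     column by column */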
  if (x->mpi) {
    if (ldm==m) {
      BVAllocateWork_Private(X,m*n);
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,CC+j*m,m);
      }
    }
  } else {
    if (k) {
      BVAllocateWork_Private(X,m*n);
      PetscLogGpuTimeBegin();
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,X->work+j*m,m);
      }
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  cudaError_t                     cerr;
  thrust::device_ptr<PetscScalar> ptr;

  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s",ex);
  }
  return(0);
}
#endif
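
/*
   In the complex case, the GEMMs below compute the 1-by-k row w = x'*A
   (' = conjugate transpose), whose entries are
     w_j = sum_i conj(x_i)*A(i,j) = conj((A'*x)_j),
   so the desired y = A'*x is recovered by conjugating the result entrywise
   with ConjugateCudaArray()
*/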

/*
   y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
    } else {
      PetscArrayzero(X->work,k);
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    PetscMPIIntCast(k,&len);
    MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
   y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n=0,k=0,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ConjugateCudaArray(d_y,k);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    PetscLogGpuTimeEnd();
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    PetscLogGpuToCpu(k*sizeof(PetscScalar));
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
   Scale a single column (j>=0) or all the active columns (j<0)
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  VecCUDAGetArray(ctx->v,&d_array);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    PetscBLASIntCast(bv->n,&n);
  }
  if (alpha == (PetscScalar)0.0) {
    cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  } else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    PetscLogGpuTimeBegin();
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*n);
  }
  VecCUDARestoreArray(ctx->v,&d_array);
  return(0);
}

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;
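
  /* if V->vmm is set, compute W = A*V as a single Mat-Mat product; otherwise
     apply A column by column, placing each slice of the storage of V and W in
     the work vectors cv[1] */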
  if (V->vmm) {
    BVGetMat(V,&Vmat);
    BVGetMat(W,&Wmat);
    MatProductCreateWithMat(A,Vmat,NULL,Wmat);
    MatProductSetType(Wmat,MATPRODUCT_AB);
    MatProductSetFromOptions(Wmat);
    MatProductSymbolic(Wmat);
    MatProductNumeric(Wmat);
    MatProductClear(Wmat);
    BVRestoreMat(V,&Vmat);
    BVRestoreMat(W,&Wmat);
  } else {
    VecCUDAGetArrayRead(v->v,&d_pv);
    VecCUDAGetArrayWrite(w->v,&d_pw);
    for (j=0;j<V->k-V->l;j++) {
      VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);
      VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
      MatMult(A,V->cv[1],W->cv[1]);
      VecCUDAResetArray(V->cv[1]);
      VecCUDAResetArray(W->cv[1]);
    }
    VecCUDARestoreArrayRead(v->v,&d_pv);
    VecCUDARestoreArrayWrite(w->v,&d_pw);
  }
  return(0);
}

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       cerr;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC     *v = (BV_SVEC*)V->data;
  PetscScalar *d_pv;
  cudaError_t cerr;

  VecCUDAGetArray(v->v,&d_pv);
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArray(v->v,&d_pv);
  return(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       cerr;

  VecGetBlockSize(bv->t,&bs);
  VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
  VecSetType(vnew,((PetscObject)bv->t)->type_name);
  VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
  VecSetBlockSize(vnew,bs);
  PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
  if (((PetscObject)bv)->name) {
    PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);
    PetscObjectSetName((PetscObject)vnew,str);
  }
  if (copy) {
    VecCUDAGetArrayRead(ctx->v,&d_pv);
    VecCUDAGetArrayWrite(vnew,&d_pnew);
    cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    VecCUDARestoreArrayRead(ctx->v,&d_pv);
    VecCUDARestoreArrayWrite(vnew,&d_pnew);
  }
  VecDestroy(&ctx->v);
  ctx->v = vnew;
  return(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *d_pv;
  PetscInt    l;

  l = BVAvailableVec;
  VecCUDAGetArray(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  return(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC  *ctx = (BV_SVEC*)bv->data;
  PetscInt l;

  l = (j==bv->ci[0])? 0: 1;
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArray(ctx->v,NULL);
  return(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  /* force sync flag to PETSC_CUDA_BOTH: if either split BV was modified, a
     dummy get/restore pair marks its data as up to date on the device, and a
     final get/restore on the parent propagates the change */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArray(v,(PetscScalar**)&d_pv);
    VecCUDARestoreArray(v,(PetscScalar**)&d_pv);
  }
  return(0);
}

PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *vv,*aa;
  PetscBool   create=PETSC_FALSE;
  PetscInt    m,cols;

  m = bv->k-bv->l;
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    MatDenseCUDAGetArray(bv->Aget,&aa);
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    MatGetSize(bv->Aget,NULL,&cols);
    if (cols!=m) {
      MatDestroy(&bv->Aget);
      create=PETSC_TRUE;
    }
  }
  VecCUDAGetArray(ctx->v,&vv);
  if (create) {
    MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget); /* pass a pointer to avoid allocation of storage */
    MatDenseCUDAReplaceArray(bv->Aget,NULL);  /* replace with a null pointer, the value after BVRestoreMat */
    PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);
  }
  MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);  /* set the actual pointer */
  *A = bv->Aget;
  return(0);
}
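
/*
   Usage sketch (illustrative, not from the library): BVGetMat()/BVRestoreMat()
   must be used as a pair; the Mat is a zero-copy view of the active columns
   of the BV, so the BV should not be accessed through other interfaces while
   the view is held. The helper name and guard macro are made up; whether a
   given Mat operation is supported on the CUDA dense view depends on the
   PETSc build.
*/
#if defined(SVECCUDA_USAGE_EXAMPLES)
static PetscErrorCode BVGetMatUsageSketch(BV bv)
{
  PetscErrorCode ierr;
  Mat            A;
  PetscReal      nrm;

  ierr = BVGetMat(bv,&A);CHKERRQ(ierr);               /* A aliases the active columns */
  ierr = MatNorm(A,NORM_FROBENIUS,&nrm);CHKERRQ(ierr);
  ierr = BVRestoreMat(bv,&A);CHKERRQ(ierr);           /* A becomes invalid here */
  return(0);
}
#endif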

PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *vv,*aa;

  MatDenseCUDAGetArray(bv->Aget,&aa);
  vv = aa-(bv->nc+bv->l)*bv->n;
  MatDenseCUDAResetArray(bv->Aget);
  VecCUDARestoreArray(ctx->v,&vv);
  *A = NULL;
  return(0);
}