Actual source code: densecuda.cu
petsc-3.12.0 2019-09-29
1: /*
2: Defines the matrix operations for sequential dense with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsccublas.h>
9: /* cublas definitions are here */
10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
12: #if defined(PETSC_USE_COMPLEX)
13: #if defined(PETSC_USE_REAL_SINGLE)
14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
24: #else /* complex double */
25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
35: #endif
36: #else /* real single */
37: #if defined(PETSC_USE_REAL_SINGLE)
38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
48: #else /* real double */
49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
59: #endif
60: #endif
 62: /* copied and pasted from the CUBLAS implementation */
63: #define CHKERRCUSOLVER(err) do {if (PetscUnlikely(err)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSOLVER error %d",err);} while(0)
64: static PetscErrorCode PetscCUSOLVERDnDestroyHandle();
65: static PetscErrorCode PetscCUSOLVERDnGetHandle_Private(cusolverDnHandle_t **handle)
66: {
67: static cusolverDnHandle_t cusolverdnhandle = NULL;
68: cusolverStatus_t cerr;
69: PetscErrorCode ierr;
72: if (!cusolverdnhandle) {
73: cerr = cusolverDnCreate(&cusolverdnhandle);CHKERRCUSOLVER(cerr);
74: PetscRegisterFinalize(PetscCUSOLVERDnDestroyHandle);
75: }
76: *handle = &cusolverdnhandle;
77: return(0);
78: }
80: PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnInitializeHandle(void)
81: {
82: cusolverDnHandle_t *p_cusolverdnhandle;
83: PetscErrorCode ierr;
86: PetscCUSOLVERDnGetHandle_Private(&p_cusolverdnhandle);
87: return(0);
88: }
90: PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *handle)
91: {
92: cusolverDnHandle_t *p_cusolverdnhandle;
93: PetscErrorCode ierr;
96: PetscCUSOLVERDnGetHandle_Private(&p_cusolverdnhandle);
97: *handle = *p_cusolverdnhandle;
98: return(0);
99: }
101: PetscErrorCode PetscCUSOLVERDnDestroyHandle()
102: {
103: cusolverDnHandle_t *p_cusolverdnhandle;
104: cusolverStatus_t cerr;
108: PetscCUSOLVERDnGetHandle_Private(&p_cusolverdnhandle);
109: cerr = cusolverDnDestroy(*p_cusolverdnhandle);CHKERRCUSOLVER(cerr);
110: *p_cusolverdnhandle = NULL; /* Ensures proper reinitialization */
111: return(0);
112: }
114: typedef struct {
115: PetscScalar *d_v; /* pointer to the matrix on the GPU */
116: /* factorization support */
117: int *d_fact_ipiv; /* device pivots */
118: PetscScalar *d_fact_work; /* device workspace */
119: int fact_lwork;
120: int *d_fact_info; /* device info */
121: /* workspace */
122: Vec workvec;
123: } Mat_SeqDenseCUDA;
125: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
126: {
127: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
128: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
129: PetscErrorCode ierr;
130: cudaError_t cerr;
134: PetscInfo3(A,"%s matrix %d x %d\n",A->valid_GPU_matrix == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
135: if (A->valid_GPU_matrix == PETSC_OFFLOAD_GPU) {
136: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
137: if (cA->lda > A->rmap->n) {
138: PetscInt j,m = A->rmap->n;
140: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
141: cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
142: }
143: } else {
144: cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
145: }
146: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
147: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
149: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
150: }
151: return(0);
152: }
154: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
155: {
156: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
157: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
158: PetscBool copy;
159: PetscErrorCode ierr;
160: cudaError_t cerr;
164: if (A->pinnedtocpu) return(0);
165: if (!dA->d_v) {
166: cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
167: }
168: copy = (PetscBool)(A->valid_GPU_matrix == PETSC_OFFLOAD_CPU || A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED);
169: PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
170: if (copy) {
171: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
172: if (cA->lda > A->rmap->n) {
173: PetscInt j,m = A->rmap->n;
175: for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
176: cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
177: }
178: } else {
179: cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
180: }
181: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
182: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
184: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
185: }
186: return(0);
187: }
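
   Both copy routines above fall back to one cudaMemcpy per column when the leading dimension
   exceeds the local row count (the loops marked "TODO: it can be done better"). A single strided
   transfer would cover the same case; this is a sketch of the host-to-device direction using the
   fields of MatSeqDenseCUDACopyToGPU(), not what PETSc actually does here:

     /* sketch: one strided copy instead of the per-column loop; host and device buffers share the
        leading dimension cA->lda, and only A->rmap->n rows of each column are live */
     cerr = cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),   /* dst, dst pitch (bytes)    */
                         cA->v,cA->lda*sizeof(PetscScalar),     /* src, src pitch (bytes)    */
                         A->rmap->n*sizeof(PetscScalar),        /* width of a column (bytes) */
                         A->cmap->n,                            /* number of columns         */
                         cudaMemcpyHostToDevice);CHKERRCUDA(cerr);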
189: PetscErrorCode MatDenseCUDAGetArrayWrite(Mat A, PetscScalar **a)
190: {
191: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
195: if (!dA->d_v) {
196: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
197: cudaError_t cerr;
199: cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
200: }
201: *a = dA->d_v;
202: return(0);
203: }
205: PetscErrorCode MatDenseCUDARestoreArrayWrite(Mat A, PetscScalar **a)
206: {
208: *a = NULL;
209: A->valid_GPU_matrix = PETSC_OFFLOAD_GPU;
210: return(0);
211: }
213: PetscErrorCode MatDenseCUDAGetArrayRead(Mat A, const PetscScalar **a)
214: {
215: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
216: PetscErrorCode ierr;
220: MatSeqDenseCUDACopyToGPU(A);
221: *a = dA->d_v;
222: return(0);
223: }
225: PetscErrorCode MatDenseCUDARestoreArrayRead(Mat A, const PetscScalar **a)
226: {
228: *a = NULL;
229: return(0);
230: }
232: PetscErrorCode MatDenseCUDAGetArray(Mat A, PetscScalar **a)
233: {
234: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
235: PetscErrorCode ierr;
239: MatSeqDenseCUDACopyToGPU(A);
240: *a = dA->d_v;
241: return(0);
242: }
244: PetscErrorCode MatDenseCUDARestoreArray(Mat A, PetscScalar **a)
245: {
247: *a = NULL;
248: A->valid_GPU_matrix = PETSC_OFFLOAD_GPU;
249: return(0);
250: }
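
   The accessors above differ only in how they move data and update the offload flag: the Read and
   plain Get variants call MatSeqDenseCUDACopyToGPU() first, while the Write variants skip the copy
   and mark the device copy as the valid one on restore. A hedged sketch of filling a matrix directly
   on the GPU; fill_kernel and MatFillIdentity_SketchCUDA are made-up names, and a real-scalar build
   is assumed:

     /* sketch: hypothetical kernel that writes an identity matrix in column-major storage */
     __global__ void fill_kernel(PetscScalar *a,int lda,int m,int n)
     {
       int i = blockIdx.x*blockDim.x + threadIdx.x;
       int j = blockIdx.y*blockDim.y + threadIdx.y;
       if (i < m && j < n) a[i + (size_t)j*lda] = (i == j) ? 1.0 : 0.0; /* real scalars assumed */
     }

     static PetscErrorCode MatFillIdentity_SketchCUDA(Mat A)
     {
       PetscScalar *d_a;
       PetscInt    lda,m,n;

       MatGetSize(A,&m,&n);
       MatDenseGetLDA(A,&lda);
       MatDenseCUDAGetArrayWrite(A,&d_a);            /* old contents irrelevant, no copy-up */
       dim3 block(16,16),grid(((int)m+15)/16,((int)n+15)/16);
       fill_kernel<<<grid,block>>>(d_a,(int)lda,(int)m,(int)n);
       MatDenseCUDARestoreArrayWrite(A,&d_a);        /* marks PETSC_OFFLOAD_GPU */
       return 0;
     }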
252: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
253: {
254: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
255: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
256: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
257: PetscScalar *da;
258: PetscErrorCode ierr;
259: cudaError_t ccer;
260: cusolverStatus_t cerr;
261: cusolverDnHandle_t handle;
262: int n,lda;
263: #if defined(PETSC_USE_DEBUG)
264: int info;
265: #endif
268: if (!A->rmap->n || !A->cmap->n) return(0);
269: PetscCUSOLVERDnGetHandle(&handle);
270: PetscMPIIntCast(A->cmap->n,&n);
271: PetscMPIIntCast(a->lda,&lda);
272: if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
273: else if (A->factortype == MAT_FACTOR_CHOLESKY) {
274: if (!dA->d_fact_ipiv) { /* spd */
275: int il;
277: MatDenseCUDAGetArray(A,&da);
278: cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
279: if (il > dA->fact_lwork) {
280: dA->fact_lwork = il;
282: ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
283: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
284: }
285: PetscLogGpuTimeBegin();
286: cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
287: ierr = WaitForGPU();CHKERRCUDA(ierr);
288: PetscLogGpuTimeEnd();
289: MatDenseCUDARestoreArray(A,&da);
290: /* TODO (write cuda kernel) */
291: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
292: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
293: }
294: #if defined(PETSC_USE_DEBUG)
295: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
296: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
297: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
298: #endif
299: PetscLogGpuFlops(1.0*n*n*n/3.0);
300: A->ops->solve = NULL;
301: A->ops->solvetranspose = NULL;
302: A->ops->matsolve = NULL;
303: A->factortype = MAT_FACTOR_NONE;
305: PetscFree(A->solvertype);
306: return(0);
307: #else
308: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
309: #endif
310: }
312: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
313: {
314: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
315: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
316: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
317: const PetscScalar *da;
318: PetscScalar *dx;
319: cusolverDnHandle_t handle;
320: PetscBool iscuda;
321: int nrhs,n,lda,ldx;
322: #if defined(PETSC_USE_DEBUG)
323: int info;
324: cudaError_t ccer;
325: #endif
326: cusolverStatus_t cerr;
327: PetscErrorCode ierr;
330: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
331: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
332: PetscObjectTypeCompareAny((PetscObject)X,&iscuda,VECSEQCUDA,VECMPICUDA,"");
333: MatCopy(B,X,SAME_NONZERO_PATTERN);
334: MatDenseCUDAGetArrayRead(A,&da);
335: /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
336: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
337: if (!iscuda) {
338: MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
339: }
340: MatDenseCUDAGetArray(X,&dx);
341: PetscMPIIntCast(A->rmap->n,&n);
342: PetscMPIIntCast(X->cmap->n,&nrhs);
343: PetscMPIIntCast(a->lda,&lda);
344: PetscMPIIntCast(x->lda,&ldx);
345: PetscCUSOLVERDnGetHandle(&handle);
346: PetscLogGpuTimeBegin();
347: if (A->factortype == MAT_FACTOR_LU) {
348: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
349: cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
350: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
351: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
352: if (!dA->d_fact_ipiv) { /* spd */
353: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
354: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
355: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
356: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
357: ierr = WaitForGPU();CHKERRCUDA(ierr);
358: PetscLogGpuTimeEnd();
359: MatDenseCUDARestoreArrayRead(A,&da);
360: MatDenseCUDARestoreArray(X,&dx);
361: if (!iscuda) {
362: MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
363: }
364: #if defined(PETSC_USE_DEBUG)
365: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
366: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
367: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
368: #endif
369: PetscLogGpuFlops(nrhs*(2.0*n*n - n));
370: return(0);
371: }
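
   MatMatSolve() has no CUDA-aware dispatch of its own, which is why the routine above converts a
   plain MATSEQDENSE right-hand side to MATSEQDENSECUDA in place and back. A hedged sketch of the
   user-level sequence that reaches it; A, B and X are assumed to be already created and assembled
   dense matrices of compatible sizes, and error checking is omitted:

     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatLUFactor(A,NULL,NULL,&info);   /* in-place getrf on the device; see MatLUFactor_SeqDenseCUDA below */
     MatMatSolve(A,B,X);               /* one getrs call with nrhs = number of columns of B */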
373: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
374: {
375: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
376: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
377: const PetscScalar *da;
378: PetscScalar *y;
379: cusolverDnHandle_t handle;
380: int one = 1,n,lda;
381: #if defined(PETSC_USE_DEBUG)
382: int info;
383: cudaError_t ccer;
384: #endif
385: cusolverStatus_t cerr;
386: PetscBool iscuda;
387: PetscErrorCode ierr;
390: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
391: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
392: PetscMPIIntCast(A->rmap->n,&n);
393: /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
394: PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
395: if (iscuda) {
396: VecCopy(xx,yy);
397: VecCUDAGetArray(yy,&y);
398: } else {
399: if (!dA->workvec) {
400: MatCreateVecs(A,&dA->workvec,NULL);
401: }
402: VecCopy(xx,dA->workvec);
403: VecCUDAGetArray(dA->workvec,&y);
404: }
405: MatDenseCUDAGetArrayRead(A,&da);
406: PetscMPIIntCast(a->lda,&lda);
407: PetscCUSOLVERDnGetHandle(&handle);
408: PetscLogGpuTimeBegin();
409: if (A->factortype == MAT_FACTOR_LU) {
410: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
411: cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
412: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
413: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
414: if (!dA->d_fact_ipiv) { /* spd */
415: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
416: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
417: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
418: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
419: ierr = WaitForGPU();CHKERRCUDA(ierr);
420: PetscLogGpuTimeEnd();
421: if (iscuda) {
422: VecCUDARestoreArray(yy,&y);
423: } else {
424: VecCUDARestoreArray(dA->workvec,&y);
425: VecCopy(dA->workvec,yy);
426: }
427: MatDenseCUDARestoreArrayRead(A,&da);
428: #if defined(PETSC_USE_DEBUG)
429: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
430: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
431: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
432: #endif
433: PetscLogGpuFlops(2.0*n*n - n);
434: return(0);
435: }
437: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
438: {
439: PetscErrorCode ierr;
442: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
443: return(0);
444: }
446: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
447: {
448: PetscErrorCode ierr;
451: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
452: return(0);
453: }
455: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
456: {
457: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
458: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
459: PetscScalar *da;
460: int m,n,lda;
461: #if defined(PETSC_USE_DEBUG)
462: int info;
463: #endif
464: cusolverStatus_t cerr;
465: cusolverDnHandle_t handle;
466: cudaError_t ccer;
467: PetscErrorCode ierr;
470: if (!A->rmap->n || !A->cmap->n) return(0);
471: PetscCUSOLVERDnGetHandle(&handle);
472: MatDenseCUDAGetArray(A,&da);
473: PetscMPIIntCast(A->cmap->n,&n);
474: PetscMPIIntCast(A->rmap->n,&m);
475: PetscMPIIntCast(a->lda,&lda);
476: PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
477: if (!dA->d_fact_ipiv) {
478: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
479: }
480: if (!dA->fact_lwork) {
481: cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
482: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
483: }
484: if (!dA->d_fact_info) {
485: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
486: }
487: PetscLogGpuTimeBegin();
488: cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
489: ierr = WaitForGPU();CHKERRCUDA(ierr);
490: PetscLogGpuTimeEnd();
491: MatDenseCUDARestoreArray(A,&da);
492: #if defined(PETSC_USE_DEBUG)
493: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
494: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
495: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
496: #endif
497: A->factortype = MAT_FACTOR_LU;
498: PetscLogGpuFlops(2.0*n*n*m/3.0);
500: A->ops->solve = MatSolve_SeqDenseCUDA;
501: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
502: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
504: PetscFree(A->solvertype);
505: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
506: return(0);
507: }
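
   A hedged sketch of the in-place path this routine implements: after MatLUFactor() the matrix
   carries the device factors and the CUDA solve callbacks installed above, so subsequent solves stay
   on the GPU. A is assumed assembled and b, x created with MatCreateVecs(); error checking omitted:

     MatFactorInfo info;

     MatFactorInfoInitialize(&info);
     MatLUFactor(A,NULL,NULL,&info);   /* this routine: getrf on the device (NULL orderings are ignored for dense) */
     MatSolve(A,b,x);                  /* cusolverDnXgetrs with CUBLAS_OP_N */
     MatSolveTranspose(A,b,x);         /* cusolverDnXgetrs with CUBLAS_OP_T */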
509: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
510: {
511: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
512: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
513: PetscScalar *da;
514: int n,lda;
515: #if defined(PETSC_USE_DEBUG)
516: int info;
517: #endif
518: cusolverStatus_t cerr;
519: cusolverDnHandle_t handle;
520: cudaError_t ccer;
521: PetscErrorCode ierr;
524: if (!A->rmap->n || !A->cmap->n) return(0);
525: PetscCUSOLVERDnGetHandle(&handle);
526: PetscMPIIntCast(A->rmap->n,&n);
527: PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
528: if (A->spd) {
529: MatDenseCUDAGetArray(A,&da);
530: PetscMPIIntCast(a->lda,&lda);
531: if (!dA->fact_lwork) {
532: cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
533: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
534: }
535: if (!dA->d_fact_info) {
536: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
537: }
538: PetscLogGpuTimeBegin();
539: cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
540: ierr = WaitForGPU();CHKERRCUDA(ierr);
541: PetscLogGpuTimeEnd();
543: MatDenseCUDARestoreArray(A,&da);
544: #if defined(PETSC_USE_DEBUG)
545: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
546: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
547: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
548: #endif
549: A->factortype = MAT_FACTOR_CHOLESKY;
550: PetscLogGpuFlops(1.0*n*n*n/3.0);
551: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
552: #if 0
553: /* at the time of writing this interface (CUDA 10.0), cusolverDn does not implement the *sytrs and *hetr* routines.
554:    The code below should work, and can be activated once the *sytrs routines become available */
555: if (!dA->d_fact_ipiv) {
556: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
557: }
558: if (!dA->fact_lwork) {
559: cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
560: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
561: }
562: if (!dA->d_fact_info) {
563: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
564: }
565: PetscLogGpuTimeBegin();
566: cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
567: PetscLogGpuTimeEnd();
568: #endif
570: A->ops->solve = MatSolve_SeqDenseCUDA;
571: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
572: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
574: PetscFree(A->solvertype);
575: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
576: return(0);
577: }
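
   Only the symmetric positive definite branch is implemented here (potrf/potrs); the Bunch-Kaufman
   sytrf path is kept in the #if 0 block above until cuSOLVER ships a matching sytrs. A hedged usage
   sketch; A is assumed assembled and SPD, b and x are matching vectors, error checking omitted:

     MatFactorInfo info;

     MatSetOption(A,MAT_SPD,PETSC_TRUE);   /* required: the non-SPD branch raises PETSC_ERR_SUP */
     MatFactorInfoInitialize(&info);
     MatCholeskyFactor(A,NULL,&info);      /* potrf on the device */
     MatSolve(A,b,x);                      /* potrs on the device */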
579: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
580: static PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA, PetscBool tB)
581: {
582: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
583: Mat_SeqDense *b = (Mat_SeqDense*)B->data;
584: Mat_SeqDense *c = (Mat_SeqDense*)C->data;
585: const PetscScalar *da,*db;
586: PetscScalar *dc;
587: PetscScalar one=1.0,zero=0.0;
588: int m,n,k,alda,blda,clda;
589: PetscErrorCode ierr;
590: cublasHandle_t cublasv2handle;
591: cublasStatus_t berr;
594: PetscMPIIntCast(C->rmap->n,&m);
595: PetscMPIIntCast(C->cmap->n,&n);
596: if (tA) {
597: PetscMPIIntCast(A->rmap->n,&k);
598: } else {
599: PetscMPIIntCast(A->cmap->n,&k);
600: }
601: if (!m || !n || !k) return(0);
602: PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
603: MatDenseCUDAGetArrayRead(A,&da);
604: MatDenseCUDAGetArrayRead(B,&db);
605: MatDenseCUDAGetArrayWrite(C,&dc);
606: PetscMPIIntCast(a->lda,&alda);
607: PetscMPIIntCast(b->lda,&blda);
608: PetscMPIIntCast(c->lda,&clda);
609: PetscCUBLASGetHandle(&cublasv2handle);
610: PetscLogGpuTimeBegin();
611: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
612: m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
613: ierr = WaitForGPU();CHKERRCUDA(ierr);
614: PetscLogGpuTimeEnd();
615: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
616: MatDenseCUDARestoreArrayRead(A,&da);
617: MatDenseCUDARestoreArrayRead(B,&db);
618: MatDenseCUDARestoreArrayWrite(C,&dc);
619: return(0);
620: }
622: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
623: {
627: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
628: return(0);
629: }
631: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
632: {
636: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
637: return(0);
638: }
640: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
641: {
645: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
646: return(0);
647: }
649: /* zz = op(A)*xx + yy
650: if yy == NULL, only MatMult */
651: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
652: {
653: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
654: const PetscScalar *xarray,*da;
655: PetscScalar *zarray;
656: PetscScalar one=1.0,zero=0.0;
657: int m, n, lda; /* Use PetscMPIInt as it is typedef'ed to int */
658: cublasHandle_t cublasv2handle;
659: cublasStatus_t berr;
660: PetscErrorCode ierr;
663: if (yy && yy != zz) { /* mult add */
664: VecCopy_SeqCUDA(yy,zz);
665: }
666: if (!A->rmap->n || !A->cmap->n) {
667: if (!yy) { /* mult only */
668: VecSet_SeqCUDA(zz,0.0);
669: }
670: return(0);
671: }
672: PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
673: PetscMPIIntCast(A->rmap->n,&m);
674: PetscMPIIntCast(A->cmap->n,&n);
675: PetscMPIIntCast(mat->lda,&lda);
676: PetscCUBLASGetHandle(&cublasv2handle);
677: MatDenseCUDAGetArrayRead(A,&da);
678: VecCUDAGetArrayRead(xx,&xarray);
679: VecCUDAGetArray(zz,&zarray);
680: PetscLogGpuTimeBegin();
681: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
682: m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
683: PetscLogGpuTimeEnd();
684: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
685: VecCUDARestoreArrayRead(xx,&xarray);
686: VecCUDARestoreArray(zz,&zarray);
687: MatDenseCUDARestoreArrayRead(A,&da);
688: return(0);
689: }
691: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
692: {
696: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
697: return(0);
698: }
700: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
701: {
705: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
706: return(0);
707: }
709: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
710: {
714: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
715: return(0);
716: }
718: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
719: {
723: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
724: return(0);
725: }
727: PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar *array[])
728: {
729: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
733: MatSeqDenseCUDACopyFromGPU(A);
734: *array = mat->v;
735: return(0);
736: }
738: PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
739: {
740: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
744: MatSeqDenseCUDACopyFromGPU(A);
745: *array = mat->v;
746: A->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
747: return(0);
748: }
750: PetscErrorCode MatDenseRestoreArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
751: {
753: return(0);
754: }
756: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
757: {
758: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
759: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
760: const PetscScalar *dx;
761: PetscScalar *dy;
762: int j,N,m,ldax,lday,one = 1;
763: cublasHandle_t cublasv2handle;
764: cublasStatus_t berr;
765: PetscErrorCode ierr;
768: if (!X->rmap->n || !X->cmap->n) return(0);
769: PetscCUBLASGetHandle(&cublasv2handle);
770: MatDenseCUDAGetArrayRead(X,&dx);
771: if (alpha != 0.0) {
772: MatDenseCUDAGetArray(Y,&dy);
773: } else {
774: MatDenseCUDAGetArrayWrite(Y,&dy);
775: }
776: PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
777: PetscMPIIntCast(X->rmap->n,&m);
778: PetscMPIIntCast(x->lda,&ldax);
779: PetscMPIIntCast(y->lda,&lday);
780: PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
781: PetscLogGpuTimeBegin();
782: if (ldax>m || lday>m) {
783: for (j=0; j<X->cmap->n; j++) {
784: berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
785: }
786: } else {
787: berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
788: }
789: ierr = WaitForGPU();CHKERRCUDA(ierr);
790: PetscLogGpuTimeEnd();
791: PetscLogGpuFlops(PetscMax(2.*N-1,0));
792: MatDenseCUDARestoreArrayRead(X,&dx);
793: if (alpha != 0.0) {
794: MatDenseCUDARestoreArray(Y,&dy);
795: } else {
796: MatDenseCUDARestoreArrayWrite(Y,&dy);
797: }
798: return(0);
799: }
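
   When either matrix is stored with a leading dimension larger than its row count, the routine
   above falls back to one cublasXaxpy per column; otherwise the whole buffer is treated as a single
   long vector. A hedged one-line usage sketch, X and Y assumed to be like-sized MATSEQDENSECUDA
   matrices:

     MatAXPY(Y,2.0,X,SAME_NONZERO_PATTERN);   /* Y <- Y + 2.0*X; the structure flag is not significant for dense */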
801: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
802: {
803: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
804: cudaError_t cerr;
805: PetscErrorCode ierr;
808: if (dA) {
809: cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr);
810: cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
811: cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
812: cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
813: VecDestroy(&dA->workvec);
814: }
815: PetscFree(A->spptr);
816: return(0);
817: }
819: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
820: {
821: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
825: /* prevent copying the data back if we own the data pointer */
826: if (!a->user_alloc) { A->valid_GPU_matrix = PETSC_OFFLOAD_CPU; }
827: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
828: MatDestroy_SeqDense(A);
829: return(0);
830: }
832: PetscErrorCode MatSeqDenseSetPreallocation_SeqDenseCUDA(Mat B,PetscScalar *data)
833: {
834: Mat_SeqDense *b;
835: Mat_SeqDenseCUDA *dB;
836: cudaError_t cerr;
837: PetscErrorCode ierr;
840: PetscLayoutSetUp(B->rmap);
841: PetscLayoutSetUp(B->cmap);
842: b = (Mat_SeqDense*)B->data;
843: b->Mmax = B->rmap->n;
844: b->Nmax = B->cmap->n;
845: if (b->lda <= 0 || b->changelda) b->lda = B->rmap->n;
846: if (b->lda < B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Invalid lda %D < %D",b->lda,B->rmap->n);
848: PetscIntMultError(b->lda,b->Nmax,NULL);
850: MatReset_SeqDenseCUDA(B);
851: PetscNewLog(B,&dB);
852: B->spptr = dB;
853: cerr = cudaMalloc((void**)&dB->d_v,b->lda*b->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
855: if (!data) { /* petsc-allocated storage */
856: if (!b->user_alloc) { PetscFree(b->v); }
857: PetscCalloc1((size_t)b->lda*b->Nmax,&b->v);
858: PetscLogObjectMemory((PetscObject)B,b->lda*b->Nmax*sizeof(PetscScalar));
859: b->user_alloc = PETSC_FALSE;
860: } else { /* user-allocated storage */
861: if (!b->user_alloc) { PetscFree(b->v); }
862: b->v = data;
863: b->user_alloc = PETSC_TRUE;
864: }
865: B->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
866: B->preallocated = PETSC_TRUE;
867: B->assembled = PETSC_TRUE;
868: return(0);
869: }
871: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
872: {
876: MatCreate(PetscObjectComm((PetscObject)A),B);
877: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
878: MatSetType(*B,((PetscObject)A)->type_name);
879: MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
880: if (cpvalues == MAT_COPY_VALUES && A->valid_GPU_matrix != PETSC_OFFLOAD_CPU) {
881: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
882: const PetscScalar *da;
883: PetscScalar *db;
884: cudaError_t cerr;
886: MatDenseCUDAGetArrayRead(A,&da);
887: MatDenseCUDAGetArrayWrite(*B,&db);
888: if (a->lda > A->rmap->n) {
889: PetscInt j,m = A->rmap->n;
891: for (j=0; j<A->cmap->n; j++) { /* it can be done better */
892: cerr = cudaMemcpy(db+j*m,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
893: }
894: } else {
895: cerr = cudaMemcpy(db,da,a->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
896: }
897: MatDenseCUDARestoreArrayRead(A,&da);
898: MatDenseCUDARestoreArrayWrite(*B,&db);
899: (*B)->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
900: }
901: return(0);
902: }
904: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
905: {
909: MatCreate(PetscObjectComm((PetscObject)A),fact);
910: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
911: MatSetType(*fact,MATSEQDENSECUDA);
912: if (ftype == MAT_FACTOR_LU) {
913: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
914: } else {
915: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
916: }
917: (*fact)->factortype = ftype;
919: PetscFree((*fact)->solvertype);
920: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
921: return(0);
922: }
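
   This registers the out-of-place factorization path for the MATSOLVERCUDA solver type: the
   symbolic phase reuses the host SeqDense routines, while the numeric factorization and solves are
   intended to run on the device through the lufactor/choleskyfactor callbacks installed above. A
   hedged sketch of the usual calling sequence; A is an assembled MATSEQDENSECUDA matrix, b and x
   matching vectors, error checking omitted:

     Mat           F;
     IS            rperm,cperm;
     MatFactorInfo info;

     MatGetFactor(A,MATSOLVERCUDA,MAT_FACTOR_LU,&F);
     MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);
     MatFactorInfoInitialize(&info);
     MatLUFactorSymbolic(F,A,rperm,cperm,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);
     ISDestroy(&rperm);
     ISDestroy(&cperm);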
924: static PetscErrorCode MatPinToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
925: {
929: A->pinnedtocpu = flg;
930: if (!flg) {
931: PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDenseCUDA);
932: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C", MatDenseGetArray_SeqDenseCUDA);
933: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseCUDA);
934: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C", MatDenseRestoreArray_SeqDenseCUDA);
936: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
937: A->ops->mult = MatMult_SeqDenseCUDA;
938: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
939: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
940: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
941: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
942: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
943: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
944: A->ops->axpy = MatAXPY_SeqDenseCUDA;
945: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
946: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
947: } else {
948: /* make sure we have an up-to-date copy on the CPU */
949: MatSeqDenseCUDACopyFromGPU(A);
950: PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDense);
951: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C", MatDenseGetArray_SeqDense);
952: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense);
953: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C", MatDenseRestoreArray_SeqDense);
955: A->ops->duplicate = MatDuplicate_SeqDense;
956: A->ops->mult = MatMult_SeqDense;
957: A->ops->multadd = MatMultAdd_SeqDense;
958: A->ops->multtranspose = MatMultTranspose_SeqDense;
959: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
960: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
961: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
962: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
963: A->ops->axpy = MatAXPY_SeqDense;
964: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
965: A->ops->lufactor = MatLUFactor_SeqDense;
966: }
967: return(0);
968: }
970: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
971: {
972: Mat B;
973: PetscErrorCode ierr;
976: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
977: /* TODO these cases should be optimized */
978: MatConvert_Basic(M,type,reuse,newmat);
979: return(0);
980: }
982: B = *newmat;
983: MatPinToCPU_SeqDenseCUDA(B,PETSC_TRUE);
984: MatReset_SeqDenseCUDA(B);
985: PetscFree(B->defaultvectype);
986: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
987: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
988: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
990: B->ops->pintocpu = NULL;
991: B->ops->destroy = MatDestroy_SeqDense;
992: B->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
993: return(0);
994: }
996: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
997: {
998: Mat_SeqDenseCUDA *dB;
999: Mat B;
1000: PetscErrorCode ierr;
1003: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1004: /* TODO these cases should be optimized */
1005: MatConvert_Basic(M,type,reuse,newmat);
1006: return(0);
1007: }
1009: B = *newmat;
1010: PetscFree(B->defaultvectype);
1011: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1012: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1013: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",MatConvert_SeqDenseCUDA_SeqDense);
1015: MatReset_SeqDenseCUDA(B);
1016: PetscNewLog(B,&dB);
1017: B->spptr = dB;
1019: B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1021: MatPinToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1022: B->ops->pintocpu = MatPinToCPU_SeqDenseCUDA;
1023: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1024: return(0);
1025: }
1027: /*MC
1028: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1030: Options Database Keys:
1031: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
1033: Level: beginner
1035: .seealso: MatCreateSeqDenseCUDA()
1037: M*/
1038: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1039: {
1043: MatCreate_SeqDense(B);
1044: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1045: return(0);
1046: }
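
   MatCreate_SeqDenseCUDA() builds the host SeqDense object and then converts it in place, so the
   type is created through the usual MatCreate()/MatSetType() sequence. A hedged end-to-end sketch;
   values are inserted on the host and migrate to the device lazily on first GPU use (error checking
   omitted):

     Mat      A;
     PetscInt i,j,m = 4,n = 4;

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQDENSECUDA);
     MatSeqDenseSetPreallocation(A,NULL);    /* PETSc-owned host storage; device buffer allocated on demand */
     for (i=0; i<m; i++) {
       for (j=0; j<n; j++) {
         MatSetValue(A,i,j,(PetscScalar)(i==j ? 2.0 : -1.0),INSERT_VALUES);
       }
     }
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
     /* ... MatMult(), MatLUFactor(), etc. now execute on the GPU ... */
     MatDestroy(&A);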