Actual source code: densecuda.cu

petsc-3.12.0 2019-09-29
/*
     Defines the matrix operations for sequential dense with CUDA
*/
#include <petscpkg_version.h>
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <../src/mat/impls/dense/seq/dense.h>
#include <petsccublas.h>

/* cublas definitions are here */
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
#else /* complex double */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
#endif
#else /* real single */
#if defined(PETSC_USE_REAL_SINGLE)
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
#else /* real double */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
#endif
#endif
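/*
   Illustrative note (not part of the original source): the cusolverDnX* macros
   above provide a precision-independent front end over cuSOLVER's typed
   routines. A call written once against the X interface, e.g.

     cusolverStatus_t cerr;
     int              lwork;
     cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&lwork);CHKERRCUSOLVER(cerr);

   expands to cusolverDnSpotrf_bufferSize, cusolverDnDpotrf_bufferSize,
   cusolverDnCpotrf_bufferSize or cusolverDnZpotrf_bufferSize depending on how
   PETSc was configured (real/complex, single/double precision).
*/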

/* copied and pasted from the CUBLAS implementation */
#define CHKERRCUSOLVER(err) do {if (PetscUnlikely(err)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSOLVER error %d",err);} while(0)
static PetscErrorCode PetscCUSOLVERDnDestroyHandle();
static PetscErrorCode PetscCUSOLVERDnGetHandle_Private(cusolverDnHandle_t **handle)
{
  static cusolverDnHandle_t cusolverdnhandle = NULL;
  cusolverStatus_t          cerr;
  PetscErrorCode            ierr;

  PetscFunctionBegin;
  if (!cusolverdnhandle) {
    cerr = cusolverDnCreate(&cusolverdnhandle);CHKERRCUSOLVER(cerr);
    ierr = PetscRegisterFinalize(PetscCUSOLVERDnDestroyHandle);CHKERRQ(ierr);
  }
  *handle = &cusolverdnhandle;
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnInitializeHandle(void)
{
  cusolverDnHandle_t *p_cusolverdnhandle;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscCUSOLVERDnGetHandle_Private(&p_cusolverdnhandle);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *handle)
{
  cusolverDnHandle_t *p_cusolverdnhandle;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscCUSOLVERDnGetHandle_Private(&p_cusolverdnhandle);CHKERRQ(ierr);
  *handle = *p_cusolverdnhandle;
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscCUSOLVERDnDestroyHandle()
{
  cusolverDnHandle_t *p_cusolverdnhandle;
  cusolverStatus_t   cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscCUSOLVERDnGetHandle_Private(&p_cusolverdnhandle);CHKERRQ(ierr);
  cerr = cusolverDnDestroy(*p_cusolverdnhandle);CHKERRCUSOLVER(cerr);
  *p_cusolverdnhandle = NULL;  /* ensures proper reinitialization */
  PetscFunctionReturn(0);
}
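/*
   Usage sketch (illustrative, not part of the original source): any routine in
   this file that needs cuSOLVER obtains the shared handle as follows; the
   handle is created lazily on first use and destroyed automatically at
   PetscFinalize() through PetscRegisterFinalize().

     cusolverDnHandle_t handle;
     PetscErrorCode     ierr;

     ierr = PetscCUSOLVERDnGetHandle(&handle);CHKERRQ(ierr);
     (use handle with cusolverDn* calls)
*/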

typedef struct {
  PetscScalar *d_v;         /* pointer to the matrix on the GPU */
  /* factorization support */
  int         *d_fact_ipiv; /* device pivots */
  PetscScalar *d_fact_work; /* device workspace */
  int         fact_lwork;
  int         *d_fact_info; /* device info */
  /* workspace */
  Vec         workvec;
} Mat_SeqDenseCUDA;

PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;
  cudaError_t      cerr;

  PetscFunctionBegin;
  ierr = PetscInfo3(A,"%s matrix %d x %d\n",A->valid_GPU_matrix == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);CHKERRQ(ierr);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_GPU) {
    ierr = PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    if (cA->lda > A->rmap->n) {
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
        cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      }
    } else {
      cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);CHKERRQ(ierr);

    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscBool        copy;
  PetscErrorCode   ierr;
  cudaError_t      cerr;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (!dA->d_v) {
    cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  copy = (PetscBool)(A->valid_GPU_matrix == PETSC_OFFLOAD_CPU || A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED);
  ierr = PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);CHKERRQ(ierr);
  if (copy) {
    ierr = PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (cA->lda > A->rmap->n) {
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
        cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      }
    } else {
      cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }
    ierr = PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);CHKERRQ(ierr);

    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
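/*
   Illustrative note (not part of the original source): A->valid_GPU_matrix
   tracks where the up-to-date values live, and the two copy routines above
   move data only when needed:

     PETSC_OFFLOAD_CPU / PETSC_OFFLOAD_UNALLOCATED  --CopyToGPU-->   PETSC_OFFLOAD_BOTH
     PETSC_OFFLOAD_GPU                              --CopyFromGPU--> PETSC_OFFLOAD_BOTH

   In the PETSC_OFFLOAD_BOTH state both routines are no-ops until one side is
   modified again (see the array accessors below, which downgrade the state).
*/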

PetscErrorCode MatDenseCUDAGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;

  PetscFunctionBegin;
  if (!dA->d_v) {
    Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
    cudaError_t  cerr;

    cerr = cudaMalloc((void**)&dA->d_v,cA->lda*cA->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  *a = dA->d_v;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseCUDARestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  *a = NULL;
  A->valid_GPU_matrix = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseCUDAGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  ierr = MatSeqDenseCUDACopyToGPU(A);CHKERRQ(ierr);
  *a   = dA->d_v;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseCUDARestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  *a = NULL;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseCUDAGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  ierr = MatSeqDenseCUDACopyToGPU(A);CHKERRQ(ierr);
  *a   = dA->d_v;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseCUDARestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  *a = NULL;
  A->valid_GPU_matrix = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
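/*
   Usage sketch (illustrative, not part of the original source): the accessors
   above follow the usual PETSc Get/Restore pairing. The Write variant skips
   the host-to-device copy, the Read variant leaves the offload state alone,
   and the read-write variant marks the GPU copy as the valid one on restore:

     PetscScalar    *d;
     PetscErrorCode ierr;

     ierr = MatDenseCUDAGetArray(A,&d);CHKERRQ(ierr);     (device values now current)
     (launch kernels on d)
     ierr = MatDenseCUDARestoreArray(A,&d);CHKERRQ(ierr); (A marked PETSC_OFFLOAD_GPU)
*/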

PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
{
#if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  PetscErrorCode     ierr;
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  int                n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif

  PetscFunctionBegin;
  if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscCUSOLVERDnGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->cmap->n,&n);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(a->lda,&lda);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
  else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    if (!dA->d_fact_ipiv) { /* spd */
      int il;

      ierr = MatDenseCUDAGetArray(A,&da);CHKERRQ(ierr);
      cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
      if (il > dA->fact_lwork) {
        dA->fact_lwork = il;

        ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
        ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
      }
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
      ierr = WaitForGPU();CHKERRCUDA(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatDenseCUDARestoreArray(A,&da);CHKERRQ(ierr);
      /* TODO (write cuda kernel) */
      ierr = MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);CHKERRQ(ierr);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
  }
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  ierr = PetscLogGpuFlops(1.0*n*n*n/3.0);CHKERRQ(ierr);
  A->ops->solve          = NULL;
  A->ops->solvetranspose = NULL;
  A->ops->matsolve       = NULL;
  A->factortype          = MAT_FACTOR_NONE;

  ierr = PetscFree(A->solvertype);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
#endif
}

static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDense       *x = (Mat_SeqDense*)X->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscScalar        *dx;
  cusolverDnHandle_t handle;
  PetscBool          iscuda;
  int                nrhs,n,lda,ldx;
#if defined(PETSC_USE_DEBUG)
  int                info;
  cudaError_t        ccer;
#endif
  cusolverStatus_t   cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  ierr = MatCopy(B,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArrayRead(A,&da);CHKERRQ(ierr);
  /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);CHKERRQ(ierr);
  if (!iscuda) {
    ierr = MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArray(X,&dx);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->rmap->n,&n);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(X->cmap->n,&nrhs);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(a->lda,&lda);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(x->lda,&ldx);CHKERRQ(ierr);
  ierr = PetscCUSOLVERDnGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_LU) {
    ierr = PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);CHKERRQ(ierr);
    cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    ierr = PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);CHKERRQ(ierr);
    if (!dA->d_fact_ipiv) { /* spd */
      /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
      cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(A,&da);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArray(X,&dx);CHKERRQ(ierr);
  if (!iscuda) {
    ierr = MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);CHKERRQ(ierr);
  }
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  ierr = PetscLogGpuFlops(nrhs*(2.0*n*n - n));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscScalar        *y;
  cusolverDnHandle_t handle;
  int                one = 1,n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
  cudaError_t        ccer;
#endif
  cusolverStatus_t   cerr;
  PetscBool          iscuda;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  ierr = PetscMPIIntCast(A->rmap->n,&n);CHKERRQ(ierr);
  /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
  ierr = PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
  if (iscuda) {
    ierr = VecCopy(xx,yy);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(yy,&y);CHKERRQ(ierr);
  } else {
    if (!dA->workvec) {
      ierr = MatCreateVecs(A,&dA->workvec,NULL);CHKERRQ(ierr);
    }
    ierr = VecCopy(xx,dA->workvec);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(dA->workvec,&y);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(A,&da);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(a->lda,&lda);CHKERRQ(ierr);
  ierr = PetscCUSOLVERDnGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_LU) {
    ierr = PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);CHKERRQ(ierr);
    cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    ierr = PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);CHKERRQ(ierr);
    if (!dA->d_fact_ipiv) { /* spd */
      /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
      cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  if (iscuda) {
    ierr = VecCUDARestoreArray(yy,&y);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(dA->workvec,&y);CHKERRQ(ierr);
    ierr = VecCopy(dA->workvec,yy);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDARestoreArrayRead(A,&da);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  ierr = PetscLogGpuFlops(2.0*n*n - n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  int                m,n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  cudaError_t        ccer;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscCUSOLVERDnGetHandle(&handle);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArray(A,&da);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->cmap->n,&n);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->rmap->n,&m);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(a->lda,&lda);CHKERRQ(ierr);
  ierr = PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);CHKERRQ(ierr);
  if (!dA->d_fact_ipiv) {
    ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
  }
  if (!dA->fact_lwork) {
    cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
    ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
  }
  if (!dA->d_fact_info) {
    ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArray(A,&da);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  A->factortype = MAT_FACTOR_LU;
  ierr = PetscLogGpuFlops(2.0*n*n*m/3.0);CHKERRQ(ierr);

  A->ops->solve          = MatSolve_SeqDenseCUDA;
  A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
  A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

  ierr = PetscFree(A->solvertype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
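/*
   Usage sketch (illustrative, not part of the original source): with the
   factorization support above, a MATSEQDENSECUDA matrix can be factored and
   used to solve in the standard PETSc way, e.g.

     Mat           A,F;
     Vec           b,x;
     MatFactorInfo info;

     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatGetFactor(A,MATSOLVERCUDA,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
     ierr = MatLUFactorSymbolic(F,A,NULL,NULL,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);
*/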

static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  int                n,lda;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  cudaError_t        ccer;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscCUSOLVERDnGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->rmap->n,&n);CHKERRQ(ierr);
  ierr = PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);CHKERRQ(ierr);
  if (A->spd) {
    ierr = MatDenseCUDAGetArray(A,&da);CHKERRQ(ierr);
    ierr = PetscMPIIntCast(a->lda,&lda);CHKERRQ(ierr);
    if (!dA->fact_lwork) {
      cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
      ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
    }
    if (!dA->d_fact_info) {
      ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    ierr = MatDenseCUDARestoreArray(A,&da);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
    ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
    if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
    else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
    A->factortype = MAT_FACTOR_CHOLESKY;
    ierr = PetscLogGpuFlops(1.0*n*n*n/3.0);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
#if 0
    /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines
       The code below should work, and it can be activated when *sytrs routines will be available */
    if (!dA->d_fact_ipiv) {
      ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
    }
    if (!dA->fact_lwork) {
      cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
      ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
    }
    if (!dA->d_fact_info) {
      ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif

  A->ops->solve          = MatSolve_SeqDenseCUDA;
  A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
  A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

  ierr = PetscFree(A->solvertype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* GEMM kernel: C = op(A)*op(B); tA, tB flag transposition */
static PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
{
  Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
  Mat_SeqDense      *b = (Mat_SeqDense*)B->data;
  Mat_SeqDense      *c = (Mat_SeqDense*)C->data;
  const PetscScalar *da,*db;
  PetscScalar       *dc;
  PetscScalar       one=1.0,zero=0.0;
  int               m,n,k,alda,blda,clda;
  PetscErrorCode    ierr;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;

  PetscFunctionBegin;
  ierr = PetscMPIIntCast(C->rmap->n,&m);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(C->cmap->n,&n);CHKERRQ(ierr);
  if (tA) {
    ierr = PetscMPIIntCast(A->rmap->n,&k);CHKERRQ(ierr);
  } else {
    ierr = PetscMPIIntCast(A->cmap->n,&k);CHKERRQ(ierr);
  }
  if (!m || !n || !k) PetscFunctionReturn(0);
  ierr = PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArrayRead(A,&da);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArrayRead(B,&db);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArrayWrite(C,&dc);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(a->lda,&alda);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(b->lda,&blda);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(c->lda,&clda);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
                     m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(A,&da);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&db);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayWrite(C,&dc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
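/*
   Illustrative note (not part of the original source): these numeric kernels
   are what MatMatMult(), MatTransposeMatMult() and MatMatTransposeMult()
   dispatch to for MATSEQDENSECUDA operands, mapping C = op(A)*op(B) onto a
   single cublasXgemm call, e.g.

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
*/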

/* zz = op(A)*xx + yy
   if yy == NULL, only MatMult */
static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
{
  Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
  const PetscScalar *xarray,*da;
  PetscScalar       *zarray;
  PetscScalar       one=1.0,zero=0.0;
  int               m,n,lda; /* Use PetscMPIInt as it is typedef'ed to int */
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (yy && yy != zz) { /* mult add */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
  }
  if (!A->rmap->n || !A->cmap->n) {
    if (!yy) { /* mult only */
      ierr = VecSet_SeqCUDA(zz,0.0);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  ierr = PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->rmap->n,&m);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(A->cmap->n,&n);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(mat->lda,&lda);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArrayRead(A,&da);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
                     m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(A,&da);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar *array[])
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqDenseCUDACopyFromGPU(A);CHKERRQ(ierr);
  *array = mat->v;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqDenseCUDACopyFromGPU(A);CHKERRQ(ierr);
  *array = mat->v;
  A->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

PetscErrorCode MatDenseRestoreArray_SeqDenseCUDA(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
{
  Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
  Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
  const PetscScalar *dx;
  PetscScalar       *dy;
  int               j,N,m,ldax,lday,one = 1;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (!X->rmap->n || !X->cmap->n) PetscFunctionReturn(0);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = MatDenseCUDAGetArrayRead(X,&dx);CHKERRQ(ierr);
  if (alpha != 0.0) {
    ierr = MatDenseCUDAGetArray(Y,&dy);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(Y,&dy);CHKERRQ(ierr);
  }
  ierr = PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(X->rmap->n,&m);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(x->lda,&ldax);CHKERRQ(ierr);
  ierr = PetscMPIIntCast(y->lda,&lday);CHKERRQ(ierr);
  ierr = PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (ldax>m || lday>m) {
    for (j=0; j<X->cmap->n; j++) {
      berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
    }
  } else {
    berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(PetscMax(2.*N-1,0));CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(X,&dx);CHKERRQ(ierr);
  if (alpha != 0.0) {
    ierr = MatDenseCUDARestoreArray(Y,&dy);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(Y,&dy);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (dA) {
    cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
    ierr = VecDestroy(&dA->workvec);CHKERRQ(ierr);
  }
  ierr = PetscFree(A->spptr);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* avoid copying the data back to the CPU if we own the data pointer: the matrix is being destroyed */
  if (!a->user_alloc) { A->valid_GPU_matrix = PETSC_OFFLOAD_CPU; }
  ierr = MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  ierr = MatDestroy_SeqDense(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqDenseSetPreallocation_SeqDenseCUDA(Mat B,PetscScalar *data)
{
  Mat_SeqDense     *b;
  Mat_SeqDenseCUDA *dB;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  b       = (Mat_SeqDense*)B->data;
  b->Mmax = B->rmap->n;
  b->Nmax = B->cmap->n;
  if (b->lda <= 0 || b->changelda) b->lda = B->rmap->n;
  if (b->lda < B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Invalid lda %D < %D",b->lda,B->rmap->n);

  ierr = PetscIntMultError(b->lda,b->Nmax,NULL);CHKERRQ(ierr);

  ierr = MatReset_SeqDenseCUDA(B);CHKERRQ(ierr);
  ierr = PetscNewLog(B,&dB);CHKERRQ(ierr);
  B->spptr = dB;
  cerr     = cudaMalloc((void**)&dB->d_v,b->lda*b->Nmax*sizeof(PetscScalar));CHKERRCUDA(cerr);

  if (!data) { /* petsc-allocated storage */
    if (!b->user_alloc) { ierr = PetscFree(b->v);CHKERRQ(ierr); }
    ierr = PetscCalloc1((size_t)b->lda*b->Nmax,&b->v);CHKERRQ(ierr);
    ierr = PetscLogObjectMemory((PetscObject)B,b->lda*b->Nmax*sizeof(PetscScalar));CHKERRQ(ierr);
    b->user_alloc       = PETSC_FALSE;
  } else { /* user-allocated storage */
    if (!b->user_alloc) { ierr = PetscFree(b->v);CHKERRQ(ierr); }
    b->v                = data;
    b->user_alloc       = PETSC_TRUE;
  }
  B->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
  B->preallocated     = PETSC_TRUE;
  B->assembled        = PETSC_TRUE;
  PetscFunctionReturn(0);
}
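/*
   Usage sketch (illustrative, not part of the original source): as with
   MATSEQDENSE, preallocation accepts either PETSc-managed storage (pass NULL)
   or a user-provided column-major array:

     Mat B;

     ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
     ierr = MatSetType(B,MATSEQDENSECUDA);CHKERRQ(ierr);
     ierr = MatSeqDenseSetPreallocation(B,NULL);CHKERRQ(ierr);  (or pass an m*n host array)
*/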

PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
  ierr = MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);CHKERRQ(ierr);
  if (cpvalues == MAT_COPY_VALUES && A->valid_GPU_matrix != PETSC_OFFLOAD_CPU) {
    Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
    const PetscScalar *da;
    PetscScalar       *db;
    cudaError_t       cerr;

    ierr = MatDenseCUDAGetArrayRead(A,&da);CHKERRQ(ierr);
    ierr = MatDenseCUDAGetArrayWrite(*B,&db);CHKERRQ(ierr);
    if (a->lda > A->rmap->n) {
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* it can be done better */
        cerr = cudaMemcpy(db+j*m,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    } else {
      cerr = cudaMemcpy(db,da,a->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
    ierr = MatDenseCUDARestoreArrayRead(A,&da);CHKERRQ(ierr);
    ierr = MatDenseCUDARestoreArrayWrite(*B,&db);CHKERRQ(ierr);
    (*B)->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),fact);CHKERRQ(ierr);
  ierr = MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(*fact,MATSEQDENSECUDA);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU) {
    (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
  } else {
    (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
  }
  (*fact)->factortype = ftype;

  ierr = PetscFree((*fact)->solvertype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatPinToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  A->pinnedtocpu = flg;
  if (!flg) {
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDenseCUDA);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",           MatDenseGetArray_SeqDenseCUDA);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",       MatDenseGetArrayRead_SeqDenseCUDA);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C",       MatDenseRestoreArray_SeqDenseCUDA);CHKERRQ(ierr);

    A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
    A->ops->mult                    = MatMult_SeqDenseCUDA;
    A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
    A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
    A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
    A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
    A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
    A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
    A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
    A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
    A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
  } else {
    /* make sure we have an up-to-date copy on the CPU */
    ierr = MatSeqDenseCUDACopyFromGPU(A);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqDenseSetPreallocation_C",MatSeqDenseSetPreallocation_SeqDense);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",           MatDenseGetArray_SeqDense);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",       MatDenseGetArray_SeqDense);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreArray_C",       MatDenseRestoreArray_SeqDense);CHKERRQ(ierr);

    A->ops->duplicate               = MatDuplicate_SeqDense;
    A->ops->mult                    = MatMult_SeqDense;
    A->ops->multadd                 = MatMultAdd_SeqDense;
    A->ops->multtranspose           = MatMultTranspose_SeqDense;
    A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
    A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
    A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
    A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
    A->ops->axpy                    = MatAXPY_SeqDense;
    A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
    A->ops->lufactor                = MatLUFactor_SeqDense;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat            B;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    ierr = MatConvert_Basic(M,type,reuse,newmat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  B    = *newmat;
  ierr = MatPinToCPU_SeqDenseCUDA(B,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MatReset_SeqDenseCUDA(B);CHKERRQ(ierr);
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECSTANDARD,&B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);CHKERRQ(ierr);

  B->ops->pintocpu    = NULL;
  B->ops->destroy     = MatDestroy_SeqDense;
  B->valid_GPU_matrix = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat_SeqDenseCUDA *dB;
  Mat              B;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    ierr = MatConvert_Basic(M,type,reuse,newmat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  B    = *newmat;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",MatConvert_SeqDenseCUDA_SeqDense);CHKERRQ(ierr);

  ierr = MatReset_SeqDenseCUDA(B);CHKERRQ(ierr);
  ierr = PetscNewLog(B,&dB);CHKERRQ(ierr);
  B->spptr = dB;

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = MatPinToCPU_SeqDenseCUDA(B,PETSC_FALSE);CHKERRQ(ierr);
  B->ops->pintocpu = MatPinToCPU_SeqDenseCUDA;
  B->ops->destroy  = MatDestroy_SeqDenseCUDA;
  PetscFunctionReturn(0);
}
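/*
   Usage sketch (illustrative, not part of the original source): an existing
   host matrix can be moved to the GPU with MatConvert():

     Mat A;  (type MATSEQDENSE)

     ierr = MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
*/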

/*MC
   MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

   Options Database Keys:
. -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

  Level: beginner

.seealso: MatCreateSeqDenseCUDA()

M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqDense(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
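/*
   Usage sketch (illustrative, not part of the original source): the type can
   also be selected at runtime from the options database:

     Mat B;

     ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
     ierr = MatSetFromOptions(B);CHKERRQ(ierr);   (run with -mat_type seqdensecuda)
     ierr = MatSetUp(B);CHKERRQ(ierr);
*/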