Actual source code: aijcusparse.cu

petsc-3.4.2 2013-07-02
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */

  6: #include "petscconf.h"
 7:  #include ../src/mat/impls/aij/seq/aij.h
  8: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 9:  #include ../src/vec/vec/impls/dvecimpl.h
 10: #include "petsc-private/vecimpl.h"
 11: #undef VecType
 12:  #include cusparsematimpl.h

 14: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 16: /* this is such a hack ... but I don't know of another way to pass this variable
 17:    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
 18:    SpMV. Essentially, I need to use the same stream variable in two different
 19:    data structures. I do this by creating a single instance of that stream
 20:    and reuse it. */
 21: cudaStream_t theBodyStream=0;

 23: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 24: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 25: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 27: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 28: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 29: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 31: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 32: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 33: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 34: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 35: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
 36: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 37: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 38: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 39: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 43: PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
 44: {
 46:   *type = MATSOLVERCUSPARSE;
 47:   return(0);
 48: }

 50: /*MC
 51:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 52:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 53:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 54:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 55:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 56:   algorithms are not recommended. This class does NOT support direct solver operations.

 58:   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE

 60:   Consult CUSPARSE documentation for more information about the matrix storage formats
 61:   which correspond to the options database keys below.

 63:    Options Database Keys:
 64: .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

 66:   Level: beginner

 68: .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 69: M*/

 73: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
 74: {
 76:   PetscInt       n = A->rmap->n;

 79:   MatCreate(PetscObjectComm((PetscObject)A),B);
 80:   MatSetSizes(*B,n,n,n,n);
 81:   MatSetType(*B,MATSEQAIJCUSPARSE);

 83:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
 84:     MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);
 85:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
 86:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
 87:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
 88:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
 89:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
 90:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

 92:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
 93:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);
 94:   (*B)->factortype = ftype;
 95:   return(0);
 96: }

100: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
101: {
102:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

105:   switch (op) {
106:   case MAT_CUSPARSE_MULT:
107:     cusparseMat->format = format;
108:     break;
109:   case MAT_CUSPARSE_SOLVE:
110:     cusparseMatSolveStorageFormat = format;
111:     break;
112:   case MAT_CUSPARSE_ALL:
113:     cusparseMat->format           = format;
114:     cusparseMatSolveStorageFormat = format;
115:     break;
116:   default:
117:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
118:   }
119:   return(0);
120: }

122: /*@
123:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
124:    operation. Only the MatMult operation can use different GPU storage formats
125:    for MPIAIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
126:    to build/install PETSc to use this package.

128:    Not Collective

130:    Input Parameters:
131: +  A - Matrix of type SEQAIJCUSPARSE
132: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
133: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

135:    Output Parameter:

137:    Level: intermediate

139: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
140: @*/
143: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
144: {

149:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
150:   return(0);
151: }

155: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
156: {
157:   PetscErrorCode           ierr;
158:   MatCUSPARSEStorageFormat format;
159:   PetscBool                flg;

162:   PetscOptionsHead("SeqAIJCUSPARSE options");
163:   PetscObjectOptionsBegin((PetscObject)A);
164:   if (A->factortype==MAT_FACTOR_NONE) {
165:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
166:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
167:     if (flg) {
168:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
169:     }
170:   } else {
171:     PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
172:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
173:     if (flg) {
174:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);
175:     }
176:   }
177:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
178:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
179:   if (flg) {
180:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
181:   }
182:   PetscOptionsEnd();
183:   return(0);

185: }

189: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
190: {

194:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
195:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
196:   return(0);
197: }

201: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
202: {

206:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
207:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
208:   return(0);
209: }

213: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
214: {

218:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
219:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
220:   return(0);
221: }

225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226: {

230:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
231:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232:   return(0);
233: }

237: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238: {
239:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
240:   PetscInt                     n                   = A->rmap->n;
241:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242:   GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
243:   cusparseStatus_t             stat;
244:   const PetscInt               *ai = a->i,*aj = a->j,*vi;
245:   const MatScalar              *aa = a->a,*v;
246:   PetscInt                     *AiLo, *AjLo;
247:   PetscScalar                  *AALo;
248:   PetscInt                     i,nz, nzLower, offset, rowOffset;
249:   PetscErrorCode               ierr;

252:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
253:     try {
254:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255:       nzLower=n+ai[n]-ai[1];

257:       /* Allocate Space for the lower triangular matrix */
258:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
259:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
260:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

262:       /* Fill the lower triangular matrix */
263:       AiLo[0]  = (PetscInt) 0;
264:       AiLo[n]  = nzLower;
265:       AjLo[0]  = (PetscInt) 0;
266:       AALo[0]  = (MatScalar) 1.0;
267:       v        = aa;
268:       vi       = aj;
269:       offset   = 1;
270:       rowOffset= 1;
271:       for (i=1; i<n; i++) {
272:         nz = ai[i+1] - ai[i];
273:         /* additional 1 for the term on the diagonal */
274:         AiLo[i]    = rowOffset;
275:         rowOffset += nz+1;

277:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
278:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));

280:         offset      += nz;
281:         AjLo[offset] = (PetscInt) i;
282:         AALo[offset] = (MatScalar) 1.0;
283:         offset      += 1;

285:         v  += nz;
286:         vi += nz;
287:       }
288:       cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

290:       stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
291:       cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
292:       stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

294:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;

296:       cudaFreeHost(AiLo);CHKERRCUSP(ierr);
297:       cudaFreeHost(AjLo);CHKERRCUSP(ierr);
298:       cudaFreeHost(AALo);CHKERRCUSP(ierr);
299:     } catch(char *ex) {
300:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
301:     }
302:   }
303:   return(0);
304: }

308: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
309: {
310:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
311:   PetscInt                     n                   = A->rmap->n;
312:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
313:   GPU_Matrix_Ifc               *cusparseMat        = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
314:   cusparseStatus_t             stat;
315:   const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
316:   const MatScalar              *aa = a->a,*v;
317:   PetscInt                     *AiUp, *AjUp;
318:   PetscScalar                  *AAUp;
319:   PetscInt                     i,nz, nzUpper, offset;
320:   PetscErrorCode               ierr;

323:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
324:     try {
325:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
326:       nzUpper = adiag[0]-adiag[n];

328:       /* Allocate Space for the upper triangular matrix */
329:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
330:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
331:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

333:       /* Fill the upper triangular matrix */
334:       AiUp[0]=(PetscInt) 0;
335:       AiUp[n]=nzUpper;
336:       offset = nzUpper;
337:       for (i=n-1; i>=0; i--) {
338:         v  = aa + adiag[i+1] + 1;
339:         vi = aj + adiag[i+1] + 1;

341:         /* number of elements NOT on the diagonal */
342:         nz = adiag[i] - adiag[i+1]-1;

344:         /* decrement the offset */
345:         offset -= (nz+1);

347:         /* first, set the diagonal elements */
348:         AjUp[offset] = (PetscInt) i;
349:         AAUp[offset] = 1./v[nz];
350:         AiUp[i]      = AiUp[i+1] - (nz+1);

352:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
353:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));
354:       }
355:       cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);

357:       stat = cusparseMat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
358:       cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
359:       stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);

361:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;

363:       cudaFreeHost(AiUp);CHKERRCUSP(ierr);
364:       cudaFreeHost(AjUp);CHKERRCUSP(ierr);
365:       cudaFreeHost(AAUp);CHKERRCUSP(ierr);
366:     } catch(char *ex) {
367:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
368:     }
369:   }
370:   return(0);
371: }

375: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
376: {
377:   PetscErrorCode               ierr;
378:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
379:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
380:   IS                           isrow = a->row,iscol = a->icol;
381:   PetscBool                    row_identity,col_identity;
382:   const PetscInt               *r,*c;
383:   PetscInt                     n = A->rmap->n;

386:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
387:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

389:   cusparseTriFactors->tempvec = new CUSPARRAY;
390:   cusparseTriFactors->tempvec->resize(n);

392:   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
393:   /*lower triangular indices */
394:   ISGetIndices(isrow,&r);
395:   ISIdentity(isrow,&row_identity);
396:   if (!row_identity) {
397:     cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
398:   }
399:   ISRestoreIndices(isrow,&r);

401:   /*upper triangular indices */
402:   ISGetIndices(iscol,&c);
403:   ISIdentity(iscol,&col_identity);
404:   if (!col_identity) {
405:     cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
406:   }
407:   ISRestoreIndices(iscol,&c);
408:   return(0);
409: }

413: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
414: {
415:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
416:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
417:   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
418:   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
419:   cusparseStatus_t             stat;
420:   PetscErrorCode               ierr;
421:   PetscInt                     *AiUp, *AjUp;
422:   PetscScalar                  *AAUp;
423:   PetscScalar                  *AALo;
424:   PetscInt                     nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
425:   Mat_SeqSBAIJ                 *b = (Mat_SeqSBAIJ*)A->data;
426:   const PetscInt               *ai = b->i,*aj = b->j,*vj;
427:   const MatScalar              *aa = b->a,*v;

430:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
431:     try {
432:       /* Allocate Space for the upper triangular matrix */
433:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
434:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
435:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
436:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

438:       /* Fill the upper triangular matrix */
439:       AiUp[0]=(PetscInt) 0;
440:       AiUp[n]=nzUpper;
441:       offset = 0;
442:       for (i=0; i<n; i++) {
443:         /* set the pointers */
444:         v  = aa + ai[i];
445:         vj = aj + ai[i];
446:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

448:         /* first, set the diagonal elements */
449:         AjUp[offset] = (PetscInt) i;
450:         AAUp[offset] = 1.0/v[nz];
451:         AiUp[i]      = offset;
452:         AALo[offset] = 1.0/v[nz];

454:         offset+=1;
455:         if (nz>0) {
456:           PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));
457:           PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));
458:           for (j=offset; j<offset+nz; j++) {
459:             AAUp[j] = -AAUp[j];
460:             AALo[j] = AAUp[j]/v[nz];
461:           }
462:           offset+=nz;
463:         }
464:       }

466:       /* Build the upper triangular piece */
467:       cusparseMatUp = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
468:       stat = cusparseMatUp->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
469:       cusparseMatUp->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
470:       stat = cusparseMatUp->solveAnalysis();CHKERRCUSP(stat);
471:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMatUp;

473:       /* Build the lower triangular piece */
474:       cusparseMatLo = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
475:       stat = cusparseMatLo->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
476:       cusparseMatLo->setMatrix(A->rmap->n, A->cmap->n, a->nz, AiUp, AjUp, AALo);CHKERRCUSP(ierr);
477:       stat = cusparseMatLo->solveAnalysis(CUSPARSE_OPERATION_TRANSPOSE);CHKERRCUSP(stat);
478:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMatLo;

480:       /* set this flag ... for performance logging */
481:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->isSymmOrHerm = PETSC_TRUE;

483:       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
484:       cudaFreeHost(AiUp);CHKERRCUSP(ierr);
485:       cudaFreeHost(AjUp);CHKERRCUSP(ierr);
486:       cudaFreeHost(AAUp);CHKERRCUSP(ierr);
487:       cudaFreeHost(AALo);CHKERRCUSP(ierr);
488:     } catch(char *ex) {
489:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
490:     }
491:   }
492:   return(0);
493: }

497: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
498: {
499:   PetscErrorCode               ierr;
500:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
501:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
502:   IS                           ip = a->row;
503:   const PetscInt               *rip;
504:   PetscBool                    perm_identity;
505:   PetscInt                     n = A->rmap->n;

508:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
509:   cusparseTriFactors->tempvec = new CUSPARRAY;
510:   cusparseTriFactors->tempvec->resize(n);
511:   /*lower triangular indices */
512:   ISGetIndices(ip,&rip);
513:   ISIdentity(ip,&perm_identity);
514:   if (!perm_identity) {
515:     cusparseTriFactors->loTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
516:     cusparseTriFactors->upTriFactorPtr->setOrdIndices(rip, n);CHKERRCUSP(ierr);
517:   }
518:   ISRestoreIndices(ip,&rip);
519:   return(0);
520: }

524: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
525: {
526:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
527:   IS             isrow = b->row,iscol = b->col;
528:   PetscBool      row_identity,col_identity;

532:   MatLUFactorNumeric_SeqAIJ(B,A,info);
533:   /* determine which version of MatSolve needs to be used. */
534:   ISIdentity(isrow,&row_identity);
535:   ISIdentity(iscol,&col_identity);
536:   if (row_identity && col_identity) {
537:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
538:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
539:   } else {
540:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
541:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
542:   }

544:   /* get the triangular factors */
545:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
546:   return(0);
547: }

551: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
552: {
553:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
554:   IS             ip = b->row;
555:   PetscBool      perm_identity;

559:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

561:   /* determine which version of MatSolve needs to be used. */
562:   ISIdentity(ip,&perm_identity);
563:   if (perm_identity) {
564:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
565:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
566:   } else {
567:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
568:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
569:   }

571:   /* get the triangular factors */
572:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
573:   return(0);
574: }

578: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
579: {
580:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
581:   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
582:   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
583:   cusparseStatus_t             stat;

586:   stat = cusparseMatLo->initializeCusparseMatTranspose(MAT_cusparseHandle,
587:                                                        CUSPARSE_MATRIX_TYPE_TRIANGULAR,
588:                                                        CUSPARSE_FILL_MODE_UPPER,
589:                                                        CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
590:   stat = cusparseMatLo->solveAnalysisTranspose();CHKERRCUSP(stat);

592:   stat = cusparseMatUp->initializeCusparseMatTranspose(MAT_cusparseHandle,
593:                                                        CUSPARSE_MATRIX_TYPE_TRIANGULAR,
594:                                                        CUSPARSE_FILL_MODE_LOWER,
595:                                                        CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
596:   stat = cusparseMatUp->solveAnalysisTranspose();CHKERRCUSP(stat);
597:   return(0);
598: }

602: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
603: {
604:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
605:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
606:   cusparseStatus_t   stat;
607:   PetscErrorCode     ierr;

610:   if (cusparseMat->isSymmOrHerm) {
611:     stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
612:   } else {
613:     stat = cusparseMat->mat->initializeCusparseMatTranspose(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
614:   }
615:   cusparseMat->mat->setMatrixTranspose(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);CHKERRCUSP(ierr);
616:   return(0);
617: }

621: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
622: {
623:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
624:   CUSPARRAY                    *xGPU, *bGPU;
625:   cusparseStatus_t             stat;
626:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
627:   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
628:   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
629:   CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;
630:   PetscErrorCode               ierr;

633:   /* Analyze the matrix ... on the fly */
634:   if (!cusparseTriFactors->hasTranspose) {
635:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
636:     cusparseTriFactors->hasTranspose=PETSC_TRUE;
637:   }

639:   /* Get the GPU pointers */
640:   VecCUSPGetArrayWrite(xx,&xGPU);
641:   VecCUSPGetArrayRead(bb,&bGPU);

643:   /* solve with reordering */
644:   cusparseMatUp->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
645:   stat = cusparseMatUp->solveTranspose(xGPU, tempGPU);CHKERRCUSP(stat);
646:   stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat);
647:   cusparseMatLo->reorderOut(xGPU);CHKERRCUSP(ierr);

649:   /* restore */
650:   VecCUSPRestoreArrayRead(bb,&bGPU);
651:   VecCUSPRestoreArrayWrite(xx,&xGPU);
652:   WaitForGPU();CHKERRCUSP(ierr);

654:   if (cusparseTriFactors->isSymmOrHerm) {
655:     PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);
656:   } else {
657:     PetscLogFlops(2.0*a->nz - A->cmap->n);
658:   }
659:   return(0);
660: }

664: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
665: {
666:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
667:   CUSPARRAY                    *xGPU,*bGPU;
668:   cusparseStatus_t             stat;
669:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
670:   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
671:   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
672:   CUSPARRAY                    *tempGPU            = (CUSPARRAY*) cusparseTriFactors->tempvec;
673:   PetscErrorCode               ierr;

676:   /* Analyze the matrix ... on the fly */
677:   if (!cusparseTriFactors->hasTranspose) {
678:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
679:     cusparseTriFactors->hasTranspose=PETSC_TRUE;
680:   }

682:   /* Get the GPU pointers */
683:   VecCUSPGetArrayWrite(xx,&xGPU);
684:   VecCUSPGetArrayRead(bb,&bGPU);

686:   /* solve */
687:   stat = cusparseMatUp->solveTranspose(bGPU, tempGPU);CHKERRCUSP(stat);
688:   stat = cusparseMatLo->solveTranspose(tempGPU, xGPU);CHKERRCUSP(stat);

690:   /* restore */
691:   VecCUSPRestoreArrayRead(bb,&bGPU);
692:   VecCUSPRestoreArrayWrite(xx,&xGPU);
693:   WaitForGPU();CHKERRCUSP(ierr);
694:   if (cusparseTriFactors->isSymmOrHerm) {
695:     PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);
696:   } else {
697:     PetscLogFlops(2.0*a->nz - A->cmap->n);
698:   }
699:   return(0);
700: }

704: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
705: {
706:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
707:   CUSPARRAY                    *xGPU,*bGPU;
708:   cusparseStatus_t             stat;
709:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
710:   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
711:   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
712:   CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
713:   PetscErrorCode               ierr;

716:   /* Get the GPU pointers */
717:   VecCUSPGetArrayWrite(xx,&xGPU);
718:   VecCUSPGetArrayRead(bb,&bGPU);

720:   /* solve with reordering */
721:   cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
722:   stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
723:   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
724:   cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

726:   VecCUSPRestoreArrayRead(bb,&bGPU);
727:   VecCUSPRestoreArrayWrite(xx,&xGPU);
728:   WaitForGPU();CHKERRCUSP(ierr);
729:   if (cusparseTriFactors->isSymmOrHerm) {
730:     PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);
731:   } else {
732:     PetscLogFlops(2.0*a->nz - A->cmap->n);
733:   }
734:   return(0);
735: }

739: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
740: {
741:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
742:   CUSPARRAY                    *xGPU,*bGPU;
743:   cusparseStatus_t             stat;
744:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
745:   GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
746:   GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
747:   CUSPARRAY                    *tempGPU            = (CUSPARRAY*)cusparseTriFactors->tempvec;
748:   PetscErrorCode               ierr;

751:   /* Get the GPU pointers */
752:   VecCUSPGetArrayWrite(xx,&xGPU);
753:   VecCUSPGetArrayRead(bb,&bGPU);

755:   /* solve */
756:   stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
757:   stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

759:   VecCUSPRestoreArrayRead(bb,&bGPU);
760:   VecCUSPRestoreArrayWrite(xx,&xGPU);
761:   WaitForGPU();CHKERRCUSP(ierr);
762:   if (cusparseTriFactors->isSymmOrHerm) {
763:     PetscLogFlops(4.0*a->nz - 3.0*A->cmap->n);
764:   } else {
765:     PetscLogFlops(2.0*a->nz - A->cmap->n);
766:   }
767:   return(0);
768: }

772: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
773: {

775:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
776:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
777:   PetscInt           m = A->rmap->n,*ii,*ridx;
778:   PetscBool          symmetryTest=PETSC_FALSE, hermitianTest=PETSC_FALSE;
779:   PetscBool          symmetryOptionIsSet=PETSC_FALSE, symmetryOptionTest=PETSC_FALSE;
780:   PetscErrorCode     ierr;

783:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
784:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
785:     /*
786:       It may be possible to reuse nonzero structure with new matrix values but
787:       for simplicity and insured correctness we delete and build a new matrix on
788:       the GPU. Likely a very small performance hit.
789:     */
790:     if (cusparseMat->mat) {
791:       try {
792:         delete cusparseMat->mat;
793:         if (cusparseMat->tempvec) delete cusparseMat->tempvec;

795:       } catch(char *ex) {
796:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
797:       }
798:     }
799:     try {
800:       cusparseMat->nonzerorow=0;
801:       for (int j = 0; j<m; j++) cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

803:       if (a->compressedrow.use) {
804:         m    = a->compressedrow.nrows;
805:         ii   = a->compressedrow.i;
806:         ridx = a->compressedrow.rindex;
807:       } else {
808:         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
809:         int k=0;
810:         PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);
811:         PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);
812:         ii[0]=0;
813:         for (int j = 0; j<m; j++) {
814:           if ((a->i[j+1]-a->i[j])>0) {
815:             ii[k]  = a->i[j];
816:             ridx[k]= j;
817:             k++;
818:           }
819:         }
820:         ii[cusparseMat->nonzerorow] = a->nz;

822:         m = cusparseMat->nonzerorow;
823:       }

825:       /* Build our matrix ... first determine the GPU storage type */
826:       cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

828:       /* Create the streams and events (if desired).  */
829:       PetscMPIInt size;
830:       MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
831:       cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

833:       MatIsSymmetricKnown(A,&symmetryOptionIsSet,&symmetryOptionTest);
834:       if ((symmetryOptionIsSet && !symmetryOptionTest) || !symmetryOptionIsSet) {
835:         /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
836:         cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
837:         cusparseMat->isSymmOrHerm = PETSC_FALSE;
838:       } else {
839:         MatIsSymmetric(A,0.0,&symmetryTest);
840:         if (symmetryTest) {
841:           cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_SYMMETRIC, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
842:           cusparseMat->isSymmOrHerm = PETSC_TRUE;
843:         } else {
844:           MatIsHermitian(A,0.0,&hermitianTest);
845:           if (hermitianTest) {
846:             cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_HERMITIAN, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
847:             cusparseMat->isSymmOrHerm = PETSC_TRUE;
848:           } else {
849:             /* CUSPARSE_FILL_MODE_UPPER and CUSPARSE_DIAG_TYPE_NON_UNIT are irrelevant here */
850:             cusparseStatus_t stat = cusparseMat->mat->initializeCusparseMat(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
851:             cusparseMat->isSymmOrHerm = PETSC_FALSE;
852:           }
853:         }
854:       }

856:       /* lastly, build the matrix */
857:       cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
858:       cusparseMat->mat->setCPRowIndices(ridx, m);
859:       if (!a->compressedrow.use) {
860:         PetscFree(ii);
861:         PetscFree(ridx);
862:       }
863:       cusparseMat->tempvec = new CUSPARRAY;
864:       cusparseMat->tempvec->resize(m);
865:     } catch(char *ex) {
866:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
867:     }
868:     WaitForGPU();CHKERRCUSP(ierr);

870:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;

872:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
873:   }
874:   return(0);
875: }

879: static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
880: {

884:   if (right) {
885:     VecCreate(PetscObjectComm((PetscObject)mat),right);
886:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
887:     VecSetBlockSize(*right,mat->rmap->bs);
888:     VecSetType(*right,VECSEQCUSP);
889:     PetscLayoutReference(mat->cmap,&(*right)->map);
890:   }
891:   if (left) {
892:     VecCreate(PetscObjectComm((PetscObject)mat),left);
893:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
894:     VecSetBlockSize(*left,mat->rmap->bs);
895:     VecSetType(*left,VECSEQCUSP);
896:     PetscLayoutReference(mat->rmap,&(*left)->map);
897:   }
898:   return(0);
899: }

903: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
904: {
905:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
906:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
907:   CUSPARRAY          *xarray,*yarray;
908:   PetscErrorCode     ierr;

911:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
912:      MatSeqAIJCUSPARSECopyToGPU(A); */
913:   VecCUSPGetArrayRead(xx,&xarray);
914:   VecCUSPGetArrayWrite(yy,&yarray);
915:   try {
916:     cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
917:   } catch (char *ex) {
918:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
919:   }
920:   VecCUSPRestoreArrayRead(xx,&xarray);
921:   VecCUSPRestoreArrayWrite(yy,&yarray);
922:   if (!cusparseMat->mat->hasNonZeroStream()) {
923:     WaitForGPU();CHKERRCUSP(ierr);
924:   }
925:   if (cusparseMat->isSymmOrHerm) {
926:     PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);
927:   } else {
928:     PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);
929:   }
930:   return(0);
931: }

935: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
936: {
937:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
938:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
939:   CUSPARRAY          *xarray,*yarray;
940:   PetscErrorCode     ierr;

943:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
944:      MatSeqAIJCUSPARSECopyToGPU(A); */
945:   if (!cusparseMat->hasTranspose) {
946:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
947:     cusparseMat->hasTranspose=PETSC_TRUE;
948:   }
949:   VecCUSPGetArrayRead(xx,&xarray);
950:   VecCUSPGetArrayWrite(yy,&yarray);
951:   try {
952:     cusparseMat->mat->multiplyTranspose(xarray, yarray);CHKERRCUSP(ierr);
953:   } catch (char *ex) {
954:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
955:   }
956:   VecCUSPRestoreArrayRead(xx,&xarray);
957:   VecCUSPRestoreArrayWrite(yy,&yarray);
958:   if (!cusparseMat->mat->hasNonZeroStream()) {
959:     WaitForGPU();CHKERRCUSP(ierr);
960:   }
961:   if (cusparseMat->isSymmOrHerm) {
962:     PetscLogFlops(4.0*a->nz - 3.0*cusparseMat->nonzerorow);
963:   } else {
964:     PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);
965:   }
966:   return(0);
967: }

971: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
972: {
973:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
974:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
975:   CUSPARRAY          *xarray,*yarray,*zarray;
976:   PetscErrorCode     ierr;

979:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
980:      MatSeqAIJCUSPARSECopyToGPU(A); */
981:   try {
982:     VecCopy_SeqCUSP(yy,zz);
983:     VecCUSPGetArrayRead(xx,&xarray);
984:     VecCUSPGetArrayRead(yy,&yarray);
985:     VecCUSPGetArrayWrite(zz,&zarray);

987:     /* multiply add */
988:     cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

990:     VecCUSPRestoreArrayRead(xx,&xarray);
991:     VecCUSPRestoreArrayRead(yy,&yarray);
992:     VecCUSPRestoreArrayWrite(zz,&zarray);

994:   } catch(char *ex) {
995:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
996:   }
997:   WaitForGPU();CHKERRCUSP(ierr);
998:   if (cusparseMat->isSymmOrHerm) {
999:     PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);
1000:   } else {
1001:     PetscLogFlops(2.0*a->nz);
1002:   }
1003:   return(0);
1004: }

1008: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1009: {
1010:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1011:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
1012:   CUSPARRAY          *xarray,*yarray,*zarray;
1013:   PetscErrorCode     ierr;

1016:   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1017:      MatSeqAIJCUSPARSECopyToGPU(A); */
1018:   if (!cusparseMat->hasTranspose) {
1019:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1020:     cusparseMat->hasTranspose=PETSC_TRUE;
1021:   }
1022:   try {
1023:     VecCopy_SeqCUSP(yy,zz);
1024:     VecCUSPGetArrayRead(xx,&xarray);
1025:     VecCUSPGetArrayRead(yy,&yarray);
1026:     VecCUSPGetArrayWrite(zz,&zarray);

1028:     /* multiply add with matrix transpose */
1029:     cusparseMat->mat->multiplyAddTranspose(xarray, yarray);CHKERRCUSP(ierr);

1031:     VecCUSPRestoreArrayRead(xx,&xarray);
1032:     VecCUSPRestoreArrayRead(yy,&yarray);
1033:     VecCUSPRestoreArrayWrite(zz,&zarray);

1035:   } catch(char *ex) {
1036:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1037:   }
1038:   WaitForGPU();CHKERRCUSP(ierr);
1039:   if (cusparseMat->isSymmOrHerm) {
1040:     PetscLogFlops(4.0*a->nz - 2.0*cusparseMat->nonzerorow);
1041:   } else {
1042:     PetscLogFlops(2.0*a->nz);
1043:   }
1044:   return(0);
1045: }

1049: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1050: {

1054:   MatAssemblyEnd_SeqAIJ(A,mode);
1055:   if (A->factortype==MAT_FACTOR_NONE) {
1056:     MatSeqAIJCUSPARSECopyToGPU(A);
1057:   }
1058:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1059:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1060:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1061:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1062:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1063:   return(0);
1064: }

1066: /* --------------------------------------------------------------------------------*/
1069: /*@
1070:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1071:    (the default parallel PETSc format). This matrix will ultimately pushed down
1072:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1073:    assembly performance the user should preallocate the matrix storage by setting
1074:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1075:    performance during matrix assembly can be increased by more than a factor of 50.

1077:    Collective on MPI_Comm

1079:    Input Parameters:
1080: +  comm - MPI communicator, set to PETSC_COMM_SELF
1081: .  m - number of rows
1082: .  n - number of columns
1083: .  nz - number of nonzeros per row (same for all rows)
1084: -  nnz - array containing the number of nonzeros in the various rows
1085:          (possibly different for each row) or NULL

1087:    Output Parameter:
1088: .  A - the matrix

1090:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1091:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1092:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1094:    Notes:
1095:    If nnz is given then nz is ignored

1097:    The AIJ format (also called the Yale sparse matrix format or
1098:    compressed row storage), is fully compatible with standard Fortran 77
1099:    storage.  That is, the stored row and column indices can begin at
1100:    either one (as in Fortran) or zero.  See the users' manual for details.

1102:    Specify the preallocated storage with either nz or nnz (not both).
1103:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1104:    allocation.  For large problems you MUST preallocate memory or you
1105:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1107:    By default, this format uses inodes (identical nodes) when possible, to
1108:    improve numerical efficiency of matrix-vector products and solves. We
1109:    search for consecutive rows with the same nonzero structure, thereby
1110:    reusing matrix information to achieve increased efficiency.

1112:    Level: intermediate

1114: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1115: @*/
1116: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1117: {

1121:   MatCreate(comm,A);
1122:   MatSetSizes(*A,m,n,m,n);
1123:   MatSetType(*A,MATSEQAIJCUSPARSE);
1124:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1125:   return(0);
1126: }

1130: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1131: {
1132:   PetscErrorCode     ierr;
1133:   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

1136:   if (A->factortype==MAT_FACTOR_NONE) {
1137:     try {
1138:       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1139:         delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
1140:       }
1141:       if (cusparseMat->tempvec!=0) delete cusparseMat->tempvec;
1142:       delete cusparseMat;
1143:       A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1144:     } catch(char *ex) {
1145:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1146:     }
1147:   } else {
1148:     /* The triangular factors */
1149:     try {
1150:       Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1151:       GPU_Matrix_Ifc               *cusparseMatLo      = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
1152:       GPU_Matrix_Ifc               *cusparseMatUp      = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
1153:       delete (GPU_Matrix_Ifc*) cusparseMatLo;
1154:       delete (GPU_Matrix_Ifc*) cusparseMatUp;
1155:       delete (CUSPARRAY*) cusparseTriFactors->tempvec;
1156:       delete cusparseTriFactors;
1157:     } catch(char *ex) {
1158:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1159:     }
1160:   }
1161:   if (MAT_cusparseHandle) {
1162:     cusparseStatus_t stat;
1163:     stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);

1165:     MAT_cusparseHandle=0;
1166:   }
1167:   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1168:   A->spptr = 0;

1170:   MatDestroy_SeqAIJ(A);
1171:   return(0);
1172: }

1176: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1177: {

1181:   MatCreate_SeqAIJ(B);
1182:   if (B->factortype==MAT_FACTOR_NONE) {
1183:     /* you cannot check the inode.use flag here since the matrix was just created.
1184:        now build a GPU matrix data structure */
1185:     B->spptr = new Mat_SeqAIJCUSPARSE;
1186:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1187:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec      = 0;
1188:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1189:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->hasTranspose = PETSC_FALSE;
1190:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->isSymmOrHerm = PETSC_FALSE;
1191:   } else {
1192:     /* NEXT, set the pointers to the triangular factors */
1193:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1194:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1195:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1196:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
1197:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
1198:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->hasTranspose   = PETSC_FALSE;
1199:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->isSymmOrHerm   = PETSC_FALSE;
1200:   }
1201:   /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
1202:   if (!MAT_cusparseHandle) {
1203:     cusparseStatus_t stat;
1204:     stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
1205:   }
1206:   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1207:      default cusparse tri solve. Note the difference with the implementation in
1208:      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1209:   PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);

1211:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1212:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1213:   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
1214:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1215:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1216:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1217:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1218:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

1220:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1222:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

1224:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1225:   return(0);
1226: }

1228: /*M
1229:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1231:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1232:    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1233:    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
1234:    Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and
1235:    the different GPU storage formats.

1237:    Options Database Keys:
1238: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1239: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1240: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.
1241: -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). Only available with 'txpetscgpu' package.

1243:   Level: beginner

1245: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1246: M*/