Actual source code: aijcusparse.cu
petsc-3.12.0 2019-09-29
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  return(0);
}
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  return(0);
}
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  return(0);
}
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSE;
  return(0);
}
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for
  sequential matrices on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or
  aijcusp. The currently supported algorithms are ILU(k) and ICC(k). Typically, deeper
  factorizations (larger k) result in poorer performance in the triangular solves. Full LU
  and Cholesky decompositions can be solved through the CUSPARSE triangular solve
  algorithm; however, the performance can be quite poor, so these algorithms are not
  recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
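/* A minimal usage sketch (assuming a standard KSP/PC solve driven by the options
   database): selecting this solver at runtime looks like
     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   so that the factorization is built on the CPU while the triangular solves run in CUSPARSE. */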
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  (*B)->factortype = ftype;
  MatSetSizes(*B,n,n,n,n);
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
  return(0);
}
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A      - Matrix of type SEQAIJCUSPARSE
. op     - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  return(0);
}
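/* A short usage sketch (assuming A is an already-created MATSEQAIJCUSPARSE matrix):
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);
   selects the ELL format for SpMV only; the equivalent runtime option, registered below in
   MatSetFromOptions_SeqAIJCUSPARSE, is -mat_cusparse_mult_storage_format ell. */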
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
    }
  }
  PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                   "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
  if (flg) {
    MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
  }
  PetscOptionsTail();
  return(0);
}
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  if (!n) return(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        PetscArraycpy(&(AjLo[offset]), vi, nz);
        PetscArraycpy(&(AALo[offset]), v, nz);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      cudaFreeHost(AALo);CHKERRCUDA(ierr);
      PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
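/* Layout illustration (hypothetical 2x2 factor with a single subdiagonal entry l10): the
   loop above produces AiLo = {0,1,3}, AjLo = {0, 0,1}, AALo = {1, l10,1}, i.e. each row
   stores its strictly lower entries followed by an explicit unit diagonal entry. */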
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  if (!n) return(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        PetscArraycpy(&(AjUp[offset+1]), vi, nz);
        PetscArraycpy(&(AAUp[offset+1]), v, nz);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
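/* Note (an assumption about the CPU factor layout): PETSc's CPU LU factorization stores
   the diagonal of U inverted in the factored AIJ arrays, so the (MatScalar)1./v[nz] above
   recovers the true diagonal entry for CUSPARSE, consistent with the
   CUSPARSE_DIAG_TYPE_NON_UNIT descriptor on this factor. */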
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ISGetIndices(isrow,&r);
  ISIdentity(isrow,&row_identity);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ISRestoreIndices(isrow,&r);

  /* upper triangular indices */
  ISGetIndices(iscol,&c);
  ISIdentity(iscol,&col_identity);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  if (!row_identity && !col_identity) {
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  } else if (!row_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  } else if (!col_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  ISRestoreIndices(iscol,&c);
  return(0);
}
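/* Note: because rpermIndices/cpermIndices are created only for non-identity orderings,
   the *_NaturalOrdering solve variants installed in MatLUFactorNumeric_SeqAIJCUSPARSE can
   skip the permutation copies entirely. */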
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  if (!n) return(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          PetscArraycpy(&(AjUp[offset]), vj, nz);
          PetscArraycpy(&(AAUp[offset]), v, nz);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
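/* Note: for ICC only the upper-triangular pattern (AiUp/AjUp) is assembled; the "lower"
   factor above reuses that pattern with rescaled values (AALo) and is applied as the
   transpose of an upper-triangular matrix via solveOp = CUSPARSE_OPERATION_TRANSPOSE. */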
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  /* lower triangular indices */
  ISGetIndices(ip,&rip);
  ISIdentity(ip,&perm_identity);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ISInvertPermutation(ip,PETSC_DECIDE,&iip);
    ISGetIndices(iip,&irip);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ISRestoreIndices(iip,&irip);
    ISDestroy(&iip);
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  }
  ISRestoreIndices(ip,&rip);
  return(0);
}
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         isrow = b->row,iscol = b->col;
  PetscBool  row_identity,col_identity;

  MatLUFactorNumeric_SeqAIJ(B,A,info);
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(isrow,&row_identity);
  ISIdentity(iscol,&col_identity);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
  return(0);
}
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

  /* determine which version of MatSolve needs to be used. */
  ISIdentity(ip,&perm_identity);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  /* allocate space for the transpose multiply structure */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in the constructor (MatSeqAIJCUSPARSECopyToGPU) */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose of the matrix, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp = new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT = new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  return(0);
}
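/* The transpose built above is cached in cusparsestruct->matTranspose, so
   MatMultTranspose_SeqAIJCUSPARSE (below) pays the csr2csc conversion cost only on its
   first call. */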
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
1226: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1227: {
1228: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1229: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1230: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1231: PetscInt m = A->rmap->n,*ii,*ridx;
1232: PetscErrorCode ierr;
1233: cusparseStatus_t stat;
1234: cudaError_t err;
1237: if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1238: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1239: if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1240: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1241: /* copy values only */
1242: matrix->values->assign(a->a, a->a+a->nz);
1243: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1244: } else {
1245: MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1246: try {
1247: cusparsestruct->nonzerorow=0;
1248: for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1250: if (a->compressedrow.use) {
1251: m = a->compressedrow.nrows;
1252: ii = a->compressedrow.i;
1253: ridx = a->compressedrow.rindex;
1254: } else {
1255: /* Forcing compressed row on the GPU */
1256: int k=0;
1257: PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1258: PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1259: ii[0]=0;
1260: for (int j = 0; j<m; j++) {
1261: if ((a->i[j+1]-a->i[j])>0) {
1262: ii[k] = a->i[j];
1263: ridx[k]= j;
1264: k++;
1265: }
1266: }
1267: ii[cusparsestruct->nonzerorow] = a->nz;
1268: m = cusparsestruct->nonzerorow;
1269: }
1271: /* allocate space for the triangular factor information */
1272: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1273: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1274: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1275: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1277: err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
1278: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1279: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1280: err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1281: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1282: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1283: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1285: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1286: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1287: /* set the matrix */
1288: CsrMatrix *matrix= new CsrMatrix;
1289: matrix->num_rows = m;
1290: matrix->num_cols = A->cmap->n;
1291: matrix->num_entries = a->nz;
1292: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1293: matrix->row_offsets->assign(ii, ii + m+1);
1295: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1296: matrix->column_indices->assign(a->j, a->j+a->nz);
1298: matrix->values = new THRUSTARRAY(a->nz);
1299: matrix->values->assign(a->a, a->a+a->nz);
1301: /* assign the pointer */
1302: matstruct->mat = matrix;
1304: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1305: CsrMatrix *matrix= new CsrMatrix;
1306: matrix->num_rows = m;
1307: matrix->num_cols = A->cmap->n;
1308: matrix->num_entries = a->nz;
1309: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1310: matrix->row_offsets->assign(ii, ii + m+1);
1312: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1313: matrix->column_indices->assign(a->j, a->j+a->nz);
1315: matrix->values = new THRUSTARRAY(a->nz);
1316: matrix->values->assign(a->a, a->a+a->nz);
1318: cusparseHybMat_t hybMat;
1319: stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1320: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1321: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
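            /* CUSPARSE_HYB_PARTITION_MAX forces a pure ELL layout, while
               CUSPARSE_HYB_PARTITION_AUTO lets CUSPARSE choose the ELL/COO
               split itself; the 0 passed to csr2hyb below is the user ELL
               width, which is only consulted for the USER partition type. */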
1322: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1323: matstruct->descr, matrix->values->data().get(),
1324: matrix->row_offsets->data().get(),
1325: matrix->column_indices->data().get(),
1326: hybMat, 0, partition);CHKERRCUDA(stat);
1327: /* assign the pointer */
1328: matstruct->mat = hybMat;
1330: if (matrix) {
1331: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1332: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1333: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1334: delete (CsrMatrix*)matrix;
1335: }
1336: }
1338: /* assign the compressed row indices */
1339: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1340: matstruct->cprowIndices->assign(ridx,ridx+m);
1341: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1343: /* assign the pointer */
1344: cusparsestruct->mat = matstruct;
1346: if (!a->compressedrow.use) {
1347: PetscFree(ii);
1348: PetscFree(ridx);
1349: }
1350: cusparsestruct->workVector = new THRUSTARRAY(m);
1351: } catch(char *ex) {
1352: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1353: }
1354: cusparsestruct->nonzerostate = A->nonzerostate;
1355: }
1356: WaitForGPU();CHKERRCUDA(ierr);
1357: A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1358: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1359: }
1360: return(0);
1361: }
1363: struct VecCUDAPlusEquals
1364: {
1365: template <typename Tuple>
1366: __host__ __device__
1367: void operator()(Tuple t)
1368: {
1369: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1370: }
1371: };
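/* A minimal sketch (editorial addition) of how VecCUDAPlusEquals is applied in
   MatMultAdd/MatMultTransposeAdd below: tuple element 0 comes from the
   compressed work vector w, element 1 is the entry of the full vector z
   selected through the row-index map rows, so the functor performs
   z[rows[i]] += w[i].  The names w, z and rows are hypothetical stand-ins for
   workVector, zarray and cprowIndices.

     thrust::device_ptr<PetscScalar> z = thrust::device_pointer_cast(zarray);
     thrust::for_each(
       thrust::make_zip_iterator(thrust::make_tuple(
         w.begin(), thrust::make_permutation_iterator(z, rows.begin()))),
       thrust::make_zip_iterator(thrust::make_tuple(
         w.begin(), thrust::make_permutation_iterator(z, rows.begin()))) + w.size(),
       VecCUDAPlusEquals());
*/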
1373: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1374: {
1378: MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1379: return(0);
1380: }
1382: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1383: {
1384: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1385: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1386: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1387: const PetscScalar *xarray;
1388: PetscScalar *yarray;
1389: PetscErrorCode ierr;
1390: cusparseStatus_t stat;
1393: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1394: MatSeqAIJCUSPARSECopyToGPU(A);
1395: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1396: if (!matstructT) {
1397: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1398: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1399: }
1400: VecCUDAGetArrayRead(xx,&xarray);
1401: VecCUDAGetArrayWrite(yy,&yarray);
1402: if (yy->map->n) {
1403: PetscInt n = yy->map->n;
1404: cudaError_t err;
1405: err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
1406: }
1408: PetscLogGpuTimeBegin();
1409: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1410: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1411: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1412: mat->num_rows, mat->num_cols,
1413: mat->num_entries, matstructT->alpha, matstructT->descr,
1414: mat->values->data().get(), mat->row_offsets->data().get(),
1415: mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1416: yarray);CHKERRCUDA(stat);
1417: } else {
1418: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1419: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1420: matstructT->alpha, matstructT->descr, hybMat,
1421: xarray, matstructT->beta_zero,
1422: yarray);CHKERRCUDA(stat);
1423: }
1424: VecCUDARestoreArrayRead(xx,&xarray);
1425: VecCUDARestoreArrayWrite(yy,&yarray);
1426: WaitForGPU();CHKERRCUDA(ierr);
1427: PetscLogGpuTimeEnd();
1428: PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1429: return(0);
1430: }
1433: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1434: {
1435: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1436: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1437: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1438: const PetscScalar *xarray;
1439: PetscScalar *zarray,*dptr,*beta;
1440: PetscErrorCode ierr;
1441: cusparseStatus_t stat;
1442: PetscBool cmpr; /* true when the compressed-row structure spans all rows, i.e. no zero rows were elided */
1445: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1446: MatSeqAIJCUSPARSECopyToGPU(A);
1447: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1448: try {
1449: cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1450: VecCUDAGetArrayRead(xx,&xarray);
1451: if (yy && !cmpr) { /* MatMultAdd with compressed storage -> need read-write access to an up-to-date zz vector */
1452: VecCUDAGetArray(zz,&zarray);
1453: } else {
1454: VecCUDAGetArrayWrite(zz,&zarray);
1455: }
1456: dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1457: beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
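      /* dptr is where cusparse puts the raw product: directly into zz when no
         zero rows were elided, otherwise into the smaller workVector, which is
         scattered into zz further below.  beta_one (accumulate) is only safe
         when the product goes straight into an up-to-date zz holding yy
         (yy == zz); otherwise we overwrite with beta_zero and add yy later. */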
1459: PetscLogGpuTimeBegin();
1460: /* csr_spmv is multiply add */
1461: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1462: /* here we need to be careful to set the number of rows in the multiply to the
1463: number of compressed rows in the matrix ... which is equivalent to the
1464: size of the workVector */
1465: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1466: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1467: mat->num_rows, mat->num_cols,
1468: mat->num_entries, matstruct->alpha, matstruct->descr,
1469: mat->values->data().get(), mat->row_offsets->data().get(),
1470: mat->column_indices->data().get(), xarray, beta,
1471: dptr);CHKERRCUDA(stat);
1472: } else {
1473: if (cusparsestruct->workVector->size()) {
1474: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1475: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1476: matstruct->alpha, matstruct->descr, hybMat,
1477: xarray, beta,
1478: dptr);CHKERRCUDA(stat);
1479: }
1480: }
1481: PetscLogGpuTimeEnd();
1483: if (yy) {
1484: if (dptr != zarray) {
1485: VecCopy_SeqCUDA(yy,zz);
1486: } else if (zz != yy) {
1487: VecAXPY_SeqCUDA(zz,1.0,yy);
1488: }
1489: } else if (dptr != zarray) {
1490: VecSet_SeqCUDA(zz,0);
1491: }
1492: /* scatter the data from the temporary into the full vector with a += operation */
1493: PetscLogGpuTimeBegin();
1494: if (dptr != zarray) {
1495: thrust::device_ptr<PetscScalar> zptr;
1497: zptr = thrust::device_pointer_cast(zarray);
1498: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1499: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1500: VecCUDAPlusEquals());
1501: }
1502: PetscLogGpuTimeEnd();
1503: VecCUDARestoreArrayRead(xx,&xarray);
1504: if (yy && !cmpr) {
1505: VecCUDARestoreArray(zz,&zarray);
1506: } else {
1507: VecCUDARestoreArrayWrite(zz,&zarray);
1508: }
1509: } catch(char *ex) {
1510: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1511: }
1512: WaitForGPU();CHKERRCUDA(ierr);
1513: PetscLogGpuFlops(2.0*a->nz);
1514: return(0);
1515: }
1517: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1518: {
1519: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1520: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1521: Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1522: const PetscScalar *xarray;
1523: PetscScalar *zarray,*dptr,*beta;
1524: PetscErrorCode ierr;
1525: cusparseStatus_t stat;
1528: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1529: MatSeqAIJCUSPARSECopyToGPU(A);
1530: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1531: if (!matstructT) {
1532: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1533: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1534: }
1536: try {
1537: VecCopy_SeqCUDA(yy,zz);
1538: VecCUDAGetArrayRead(xx,&xarray);
1539: VecCUDAGetArray(zz,&zarray);
1540: dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
1541: beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;
1543: PetscLogGpuTimeBegin();
1544: /* multiply add with matrix transpose */
1545: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1546: CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1547: /* here we need to be careful to set the number of rows in the multiply to the
1548: number of compressed rows in the matrix ... which is equivalent to the
1549: size of the workVector */
1550: stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1551: mat->num_rows, mat->num_cols,
1552: mat->num_entries, matstructT->alpha, matstructT->descr,
1553: mat->values->data().get(), mat->row_offsets->data().get(),
1554: mat->column_indices->data().get(), xarray, beta,
1555: dptr);CHKERRCUDA(stat);
1556: } else {
1557: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1558: if (cusparsestruct->workVector->size()) {
1559: stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1560: matstructT->alpha, matstructT->descr, hybMat,
1561: xarray, beta,
1562: dptr);CHKERRCUDA(stat);
1563: }
1564: }
1565: PetscLogGpuTimeEnd();
1567: if (dptr != zarray) {
1568: VecCopy_SeqCUDA(yy,zz);
1569: } else if (zz != yy) {
1570: VecAXPY_SeqCUDA(zz,1.0,yy);
1571: }
1572: /* scatter the data from the temporary into the full vector with a += operation */
1573: PetscLogGpuTimeBegin();
1574: if (dptr != zarray) {
1575: thrust::device_ptr<PetscScalar> zptr;
1577: zptr = thrust::device_pointer_cast(zarray);
1580: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1581: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1582: VecCUDAPlusEquals());
1583: }
1584: PetscLogGpuTimeEnd();
1585: VecCUDARestoreArrayRead(xx,&xarray);
1586: VecCUDARestoreArray(zz,&zarray);
1587: } catch(char *ex) {
1588: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1589: }
1590: WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
1591: PetscLogGpuFlops(2.0*a->nz);
1592: return(0);
1593: }
1595: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1596: {
1600: MatAssemblyEnd_SeqAIJ(A,mode);
1601: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1602: if (A->factortype == MAT_FACTOR_NONE) {
1603: MatSeqAIJCUSPARSECopyToGPU(A);
1604: }
1605: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1606: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1607: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1608: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1609: return(0);
1610: }
1612: /* --------------------------------------------------------------------------------*/
1613: /*@
1614: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1615: (the default sequential PETSc format). This matrix will ultimately be pushed
1616: down to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
1617: assembly performance the user should preallocate the matrix storage by setting
1618: the parameter nz (or the array nnz). By setting these parameters accurately,
1619: performance during matrix assembly can be increased by more than a factor of 50.
1621: Collective
1623: Input Parameters:
1624: + comm - MPI communicator, set to PETSC_COMM_SELF
1625: . m - number of rows
1626: . n - number of columns
1627: . nz - number of nonzeros per row (same for all rows)
1628: - nnz - array containing the number of nonzeros in the various rows
1629: (possibly different for each row) or NULL
1631: Output Parameter:
1632: . A - the matrix
1634: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1635: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1636: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1638: Notes:
1639: If nnz is given then nz is ignored
1641: The AIJ format (also called the Yale sparse matrix format or
1642: compressed row storage) is fully compatible with standard Fortran 77
1643: storage. That is, the stored row and column indices can begin at
1644: either one (as in Fortran) or zero. See the users' manual for details.
1646: Specify the preallocated storage with either nz or nnz (not both).
1647: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1648: allocation. For large problems you MUST preallocate memory or you
1649: will get TERRIBLE performance, see the users' manual chapter on matrices.
1651: By default, this format uses inodes (identical nodes) when possible, to
1652: improve numerical efficiency of matrix-vector products and solves. We
1653: search for consecutive rows with the same nonzero structure, thereby
1654: reusing matrix information to achieve increased efficiency.
1656: Level: intermediate
1658: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1659: @*/
1660: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1661: {
1665: MatCreate(comm,A);
1666: MatSetSizes(*A,m,n,m,n);
1667: MatSetType(*A,MATSEQAIJCUSPARSE);
1668: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1669: return(0);
1670: }
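/* A minimal usage sketch (editorial addition, not part of the original source);
   it assumes a CUDA-enabled PETSc build.  The program creates a tridiagonal
   10x10 matrix with 3 nonzeros preallocated per row and applies it once with
   MatMult(), which dispatches to MatMult_SeqAIJCUSPARSE() above.  MatCreateVecs()
   returns CUDA vectors here because the matrix sets VECCUDA as its default
   vector type:

     #include <petscmat.h>
     int main(int argc,char **argv)
     {
       Mat            A;
       Vec            x,y;
       PetscInt       i,n=10,col[3];
       PetscScalar    v[3] = {-1.0,2.0,-1.0};
       PetscErrorCode ierr;

       ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
       ierr = MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,n,n,3,NULL,&A);CHKERRQ(ierr);
       for (i=0; i<n; i++) {
         col[0] = i-1; col[1] = i; col[2] = i+1;
         if (i == 0)        {ierr = MatSetValues(A,1,&i,2,&col[1],&v[1],INSERT_VALUES);CHKERRQ(ierr);}
         else if (i == n-1) {ierr = MatSetValues(A,1,&i,2,col,v,INSERT_VALUES);CHKERRQ(ierr);}
         else               {ierr = MatSetValues(A,1,&i,3,col,v,INSERT_VALUES);CHKERRQ(ierr);}
       }
       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
       ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);
       ierr = VecSet(x,1.0);CHKERRQ(ierr);
       ierr = MatMult(A,x,y);CHKERRQ(ierr);
       ierr = VecDestroy(&x);CHKERRQ(ierr);
       ierr = VecDestroy(&y);CHKERRQ(ierr);
       ierr = MatDestroy(&A);CHKERRQ(ierr);
       ierr = PetscFinalize();
       return ierr;
     }
*/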
1672: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1673: {
1674: PetscErrorCode ierr;
1677: if (A->factortype==MAT_FACTOR_NONE) {
1678: if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1679: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1680: }
1681: } else {
1682: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1683: }
1684: MatDestroy_SeqAIJ(A);
1685: return(0);
1686: }
1688: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1689: {
1691: Mat C;
1692: cusparseStatus_t stat;
1693: cusparseHandle_t handle=0;
1696: MatDuplicate_SeqAIJ(A,cpvalues,B);
1697: C = *B;
1698: PetscFree(C->defaultvectype);
1699: PetscStrallocpy(VECCUDA,&C->defaultvectype);
1701: /* inject CUSPARSE-specific stuff */
1702: if (C->factortype==MAT_FACTOR_NONE) {
1703: /* you cannot check the inode.use flag here since the matrix was just created.
1704: now build a GPU matrix data structure */
1705: C->spptr = new Mat_SeqAIJCUSPARSE;
1706: ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0;
1707: ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1708: ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0;
1709: ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR;
1710: ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0;
1711: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1712: ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle;
1713: ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1714: } else {
1715: /* NEXT, set the pointers to the triangular factors */
1716: C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1717: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0;
1718: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0;
1719: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1720: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1721: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0;
1722: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0;
1723: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0;
1724: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0;
1725: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1726: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle;
1727: ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0;
1728: }
1730: C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1731: C->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1732: C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1733: C->ops->mult = MatMult_SeqAIJCUSPARSE;
1734: C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1735: C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1736: C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1737: C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1739: PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);
1741: C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1743: PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1744: return(0);
1745: }
1747: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1748: {
1750: cusparseStatus_t stat;
1751: cusparseHandle_t handle=0;
1754: PetscFree(B->defaultvectype);
1755: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1757: if (B->factortype==MAT_FACTOR_NONE) {
1758: /* you cannot check the inode.use flag here since the matrix was just created.
1759: now build a GPU matrix data structure */
1760: B->spptr = new Mat_SeqAIJCUSPARSE;
1761: ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0;
1762: ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1763: ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0;
1764: ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR;
1765: ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0;
1766: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1767: ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle;
1768: ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1769: } else {
1770: /* NEXT, set the pointers to the triangular factors */
1771: B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1772: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
1773: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
1774: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1775: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1776: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0;
1777: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0;
1778: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0;
1779: stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1780: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle;
1781: ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0;
1782: }
1784: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
1785: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
1786: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
1787: B->ops->mult = MatMult_SeqAIJCUSPARSE;
1788: B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1789: B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1790: B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1791: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
1793: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
1795: B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1797: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1798: return(0);
1799: }
1801: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1802: {
1806: MatCreate_SeqAIJ(B);
1807: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1808: return(0);
1809: }
1811: /*MC
1812: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1814: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
1815: CSR, ELL, or hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
1816: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
1818: Options Database Keys:
1819: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1820: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1821: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1823: Level: beginner
1825: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1826: M*/
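/* Example (editorial addition): for a code that calls MatSetFromOptions(), the
   type and storage format can be selected at run time, e.g.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format hyb

   where ./app stands for any PETSc program; hyb can be replaced by csr or ell
   as listed above. */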
1828: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1831: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1832: {
1836: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1837: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1838: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1839: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1840: return(0);
1841: }
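/* Example (editorial addition): once registered, the CUSPARSE factorizations
   can be requested from the command line of any PETSc program,

     ./app -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically with PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE)
   before PCSetUp(); ./app stands for a hypothetical driver using a KSP/PC. */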
1844: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1845: {
1846: cusparseStatus_t stat;
1847: cusparseHandle_t handle;
1850: if (*cusparsestruct) {
1851: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1852: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1853: delete (*cusparsestruct)->workVector;
1854: if ((handle = (*cusparsestruct)->handle)) {
1855: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1856: }
1857: delete *cusparsestruct;
1858: *cusparsestruct = 0;
1859: }
1860: return(0);
1861: }
1863: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1864: {
1866: if (*mat) {
1867: delete (*mat)->values;
1868: delete (*mat)->column_indices;
1869: delete (*mat)->row_offsets;
1870: delete *mat;
1871: *mat = 0;
1872: }
1873: return(0);
1874: }
1876: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1877: {
1878: cusparseStatus_t stat;
1879: PetscErrorCode ierr;
1882: if (*trifactor) {
1883: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1884: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1885: CsrMatrix_Destroy(&(*trifactor)->csrMat);
1886: delete *trifactor;
1887: *trifactor = 0;
1888: }
1889: return(0);
1890: }
1892: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1893: {
1894: CsrMatrix *mat;
1895: cusparseStatus_t stat;
1896: cudaError_t err;
1899: if (*matstruct) {
1900: if ((*matstruct)->mat) {
1901: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1902: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1903: stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1904: } else {
1905: mat = (CsrMatrix*)(*matstruct)->mat;
1906: CsrMatrix_Destroy(&mat);
1907: }
1908: }
1909: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1910: delete (*matstruct)->cprowIndices;
1911: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1912: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1913: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1914: delete *matstruct;
1915: *matstruct = 0;
1916: }
1917: return(0);
1918: }
1920: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1921: {
1922: cusparseHandle_t handle;
1923: cusparseStatus_t stat;
1926: if (*trifactors) {
1927: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1928: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1929: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1930: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1931: delete (*trifactors)->rpermIndices;
1932: delete (*trifactors)->cpermIndices;
1933: delete (*trifactors)->workVector;
1934: if ((handle = (*trifactors)->handle)) {
1935: stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1936: }
1937: delete *trifactors;
1938: *trifactors = 0;
1939: }
1940: return(0);
1941: }