/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/
#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/seq/aij.h>
#include <petscbt.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h>

#if defined(PETSC_HAVE_TXPETSCGPU)
const char *const MatCUSPStorageFormats[] = {"CSR","DIA","ELL","MatCUSPStorageFormat","MAT_CUSP_",0};
/* This is a hack, but we do not have another way to pass this variable
   from one GPU_Matrix_Ifc class to another. It is necessary for the parallel
   SpMV: the same stream variable must be used in two different data
   structures, so we create a single instance of the stream and reuse it. */
cudaStream_t theCUSPBodyStream=0;
#endif
#include <algorithm>
#include <vector>
#include <string>
#include <thrust/sort.h>
#include <thrust/fill.h>
PetscErrorCode MatCUSPSetFormat_SeqAIJCUSP(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
{
#if defined(PETSC_HAVE_TXPETSCGPU)
  Mat_SeqAIJCUSP *cuspMat = (Mat_SeqAIJCUSP*)A->spptr;
#endif

#if defined(PETSC_HAVE_TXPETSCGPU)
  switch (op) {
  case MAT_CUSP_MULT:
  case MAT_CUSP_ALL:
    cuspMat->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPFormatOperation. Only MAT_CUSP_MULT and MAT_CUSP_ALL are currently supported.",op);
  }
#endif
  return(0);
}
/*@
   MatCUSPSetFormat - Sets the storage format of CUSP matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for AIJCUSP matrices. This requires the txpetscgpu package; use --download-txpetscgpu
   when configuring PETSc to enable these capabilities. If txpetscgpu is not enabled,
   this function is a no-op and the matrix remains in the original CSR format
   for usage on the GPU.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSP
.  op - MatCUSPFormatOperation. SEQAIJCUSP matrices support MAT_CUSP_MULT and MAT_CUSP_ALL. MPIAIJCUSP matrices support MAT_CUSP_MULT_DIAG, MAT_CUSP_MULT_OFFDIAG, and MAT_CUSP_ALL.
-  format - MatCUSPStorageFormat (one of MAT_CUSP_CSR, MAT_CUSP_DIA, MAT_CUSP_ELL)

   Level: intermediate

.seealso: MatCUSPStorageFormat, MatCUSPFormatOperation
@*/
PetscErrorCode MatCUSPSetFormat(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
{
#if defined(PETSC_HAVE_TXPETSCGPU)
  PetscTryMethod(A, "MatCUSPSetFormat_C",(Mat,MatCUSPFormatOperation,MatCUSPStorageFormat),(A,op,format));
#endif
  return(0);
}
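
/* A minimal usage sketch (hypothetical caller, not part of this file): select a GPU
   storage format after the matrix type is set. ExampleSetFormats() and its choice of
   formats are illustrative only, and DIA/ELL require the txpetscgpu package. */
#if 0
static PetscErrorCode ExampleSetFormats(Mat A)
{
  PetscErrorCode ierr;

  /* Use ELLPACK storage for MatMult only; other operations keep their format */
  ierr = MatCUSPSetFormat(A,MAT_CUSP_MULT,MAT_CUSP_ELL);CHKERRQ(ierr);
  /* Or switch all supported operations to diagonal storage */
  ierr = MatCUSPSetFormat(A,MAT_CUSP_ALL,MAT_CUSP_DIA);CHKERRQ(ierr);
  return 0;
}
#endif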
#if defined(PETSC_HAVE_TXPETSCGPU)

PetscErrorCode MatSetFromOptions_SeqAIJCUSP(Mat A)
{
  PetscErrorCode       ierr;
  MatCUSPStorageFormat format;
  PetscBool            flg;

  PetscObjectOptionsBegin((PetscObject)A);
  PetscOptionsHead("SeqAIJCUSP options");
  PetscOptionsEnum("-mat_cusp_mult_storage_format","sets storage format of (seq)aijcusp gpu matrices for SpMV",
                   "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
  if (flg) {
    MatCUSPSetFormat(A,MAT_CUSP_MULT,format);
  }
  PetscOptionsEnum("-mat_cusp_storage_format","sets storage format of (seq)aijcusp gpu matrices for SpMV",
                   "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
  if (flg) {
    MatCUSPSetFormat(A,MAT_CUSP_ALL,format);
  }
  PetscOptionsEnd();
  return(0);
}
#endif
PetscErrorCode MatCUSPCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP*)A->spptr;
  Mat_SeqAIJ     *a          = (Mat_SeqAIJ*)A->data;
  PetscInt       m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    PetscLogEventBegin(MAT_CUSPCopyToGPU,A,0,0,0);
    /*
      It may be possible to reuse the nonzero structure with new matrix values,
      but for simplicity and ensured correctness we delete and build a new matrix
      on the GPU. Likely a very small performance hit.
    */
    if (cuspstruct->mat) {
      try {
        delete cuspstruct->mat;
        if (cuspstruct->tempvec) delete cuspstruct->tempvec;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
    }
    try {
      cuspstruct->nonzerorow = 0;
      for (int j = 0; j<m; j++) cuspstruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

#if defined(PETSC_HAVE_TXPETSCGPU)
      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Force compressed rows on the GPU ... only relevant for CSR storage */
        int k = 0;
        PetscMalloc((cuspstruct->nonzerorow+1)*sizeof(PetscInt), &ii);
        PetscMalloc((cuspstruct->nonzerorow)*sizeof(PetscInt), &ridx);
        ii[0] = 0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]   = a->i[j];
            ridx[k] = j;
            k++;
          }
        }
        ii[cuspstruct->nonzerorow] = a->nz;

        m = cuspstruct->nonzerorow;
      }
      /* Build our matrix ... first determine the GPU storage type */
      cuspstruct->mat = GPU_Matrix_Factory::getNew(MatCUSPStorageFormats[cuspstruct->format]);

      /* Create the streams and events (if desired). */
      PetscMPIInt size;
      MPI_Comm_size(PETSC_COMM_WORLD,&size);
      cuspstruct->mat->buildStreamsAndEvents(size, &theCUSPBodyStream);CHKERRCUSP(ierr);

      /* Lastly, build the matrix */
      cuspstruct->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cuspstruct->mat->setCPRowIndices(ridx, m);
      /*
        INODES: Determine the inode data structure for the GPU.
        This only really matters for the CSR format.
      */
      if (a->inode.use) {
        PetscInt *temp;
        PetscMalloc((a->inode.node_count+1)*sizeof(PetscInt), &temp);
        temp[0] = 0;
        PetscInt nodeMax = 0, nnzPerRowMax = 0;
        for (int i = 0; i<a->inode.node_count; i++) {
          temp[i+1] = a->inode.size[i]+temp[i];
          if (a->inode.size[i] > nodeMax) nodeMax = a->inode.size[i];
        }
        /* Determine the maximum number of nonzeros in a row. */
        cuspstruct->nonzerorow = 0;
        for (int j = 0; j<A->rmap->n; j++) {
          cuspstruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
          if (a->i[j+1]-a->i[j] > nnzPerRowMax) nnzPerRowMax = a->i[j+1]-a->i[j];
        }
        /* Set the inode data ... only relevant for CSR really */
        cuspstruct->mat->setInodeData(temp, a->inode.node_count+1, nnzPerRowMax, nodeMax, a->inode.node_count);
        PetscFree(temp);
      }
      if (!a->compressedrow.use) {
        PetscFree(ii);
        PetscFree(ridx);
      }

#else
      cuspstruct->mat = new CUSPMATRIX;
      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
        cuspstruct->mat->resize(m,A->cmap->n,a->nz);
        cuspstruct->mat->row_offsets.assign(ii,ii+m+1);
        cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
        cuspstruct->mat->values.assign(a->a,a->a+a->nz);
        cuspstruct->indices = new CUSPINTARRAYGPU;
        cuspstruct->indices->assign(ridx,ridx+m);
      } else {
        cuspstruct->mat->resize(m,A->cmap->n,a->nz);
        cuspstruct->mat->row_offsets.assign(a->i,a->i+m+1);
        cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
        cuspstruct->mat->values.assign(a->a,a->a+a->nz);
      }
#endif
      cuspstruct->tempvec = new CUSPARRAY;
      cuspstruct->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    PetscLogEventEnd(MAT_CUSPCopyToGPU,A,0,0,0);
  }
  return(0);
}
PetscErrorCode MatCUSPCopyFromGPU(Mat A, CUSPMATRIX *Agpu)
{
  Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP*) A->spptr;
  Mat_SeqAIJ     *a          = (Mat_SeqAIJ*) A->data;
  PetscInt       m           = A->rmap->n;
  PetscErrorCode ierr;

#if defined(PETSC_HAVE_TXPETSCGPU)
  CUSPMATRIX *mat;
  cuspstruct->mat->getCsrMatrix(&mat);CHKERRCUSP(ierr);
#else
  CUSPMATRIX *mat = (CUSPMATRIX*)cuspstruct->mat;
#endif

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
    try {
      mat = Agpu;
      if (a->compressedrow.use) {
        SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
      } else {
        PetscInt i;

        if (m+1 != (PetscInt) mat->row_offsets.size()) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", mat->row_offsets.size()-1, m);
        a->nz           = mat->values.size();
        a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
        A->preallocated = PETSC_TRUE;
        if (a->singlemalloc) {
          if (a->a) {PetscFree3(a->a,a->j,a->i);}
        } else {
          if (a->i) {PetscFree(a->i);}
          if (a->j) {PetscFree(a->j);}
          if (a->a) {PetscFree(a->a);}
        }
        PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);
        PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));

        a->singlemalloc = PETSC_TRUE;
        thrust::copy(mat->row_offsets.begin(), mat->row_offsets.end(), a->i);
        thrust::copy(mat->column_indices.begin(), mat->column_indices.end(), a->j);
        thrust::copy(mat->values.begin(), mat->values.end(), a->a);
        /* Set up row lengths */
        if (a->imax) {PetscFree2(a->imax,a->ilen);}
        PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);
        PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));
        for (i = 0; i < m; ++i) a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
        /* a->diag? */
      }
      cuspstruct->tempvec = new CUSPARRAY;
      cuspstruct->tempvec->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
    }
    /* This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying */
    MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  } else {
    SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
  }
  return(0);
}
PetscErrorCode MatGetVecs_SeqAIJCUSP(Mat mat, Vec *right, Vec *left)
{
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,mat->rmap->bs);
    VecSetType(*right,VECSEQCUSP);
    PetscLayoutReference(mat->cmap,&(*right)->map);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,mat->rmap->bs);
    VecSetType(*left,VECSEQCUSP);
    PetscLayoutReference(mat->rmap,&(*left)->map);
  }
  return(0);
}
PetscErrorCode MatMult_SeqAIJCUSP(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP*)A->spptr;
#if !defined(PETSC_HAVE_TXPETSCGPU)
  PetscBool usecprow = a->compressedrow.use;
#endif
  CUSPARRAY *xarray = NULL,*yarray = NULL;

  /* The call below should not be necessary, as it has been moved to MatAssemblyEnd_SeqAIJCUSP:
     MatCUSPCopyToGPU(A); */
  VecCUSPGetArrayRead(xx,&xarray);
  VecCUSPGetArrayWrite(yy,&yarray);
  try {
#if defined(PETSC_HAVE_TXPETSCGPU)
    cuspstruct->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
#else
    if (usecprow) { /* use compressed row format */
      cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
      VecSet_SeqCUSP(yy,0.0);
      thrust::copy(cuspstruct->tempvec->begin(),cuspstruct->tempvec->end(),thrust::make_permutation_iterator(yarray->begin(),cuspstruct->indices->begin()));
    } else { /* do not use compressed row format */
      cusp::multiply(*cuspstruct->mat,*xarray,*yarray);
    }
#endif
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  VecCUSPRestoreArrayRead(xx,&xarray);
  VecCUSPRestoreArrayWrite(yy,&yarray);
#if defined(PETSC_HAVE_TXPETSCGPU)
  if (!cuspstruct->mat->hasNonZeroStream()) {
    WaitForGPU();CHKERRCUSP(ierr);
  }
#else
  WaitForGPU();CHKERRCUSP(ierr);
#endif
  PetscLogFlops(2.0*a->nz - cuspstruct->nonzerorow);
  return(0);
}
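
/* Functor applied via thrust::for_each over a zip iterator in MatMultAdd below:
   adds the first tuple entry into the second, i.e. an elementwise z[i] += t[i]. */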
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
PetscErrorCode MatMultAdd_SeqAIJCUSP(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP*)A->spptr;
  CUSPARRAY      *xarray = NULL,*yarray = NULL,*zarray = NULL;

  /* The call below should not be necessary, as it has been moved to MatAssemblyEnd_SeqAIJCUSP:
     MatCUSPCopyToGPU(A); */
  try {
    VecCopy_SeqCUSP(yy,zz);
    VecCUSPGetArrayRead(xx,&xarray);
    VecCUSPGetArrayRead(yy,&yarray);
    VecCUSPGetArrayWrite(zz,&zarray);
#if defined(PETSC_HAVE_TXPETSCGPU)
    cuspstruct->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);
#else
    if (a->compressedrow.use) {
      cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(cuspstruct->tempvec->begin(),
                             thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->begin()))),
        thrust::make_zip_iterator(
          thrust::make_tuple(cuspstruct->tempvec->begin(),
                             thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->begin()))) + cuspstruct->tempvec->size(),
        VecCUSPPlusEquals());
    } else {
      cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(
                                    cuspstruct->tempvec->begin(),
                                    zarray->begin())),
        thrust::make_zip_iterator(thrust::make_tuple(
                                    cuspstruct->tempvec->end(),
                                    zarray->end())),
        VecCUSPPlusEquals());
    }
#endif
    VecCUSPRestoreArrayRead(xx,&xarray);
    VecCUSPRestoreArrayRead(yy,&yarray);
    VecCUSPRestoreArrayWrite(zz,&zarray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  WaitForGPU();CHKERRCUSP(ierr);
  PetscLogFlops(2.0*a->nz);
  return(0);
}
PetscErrorCode MatAssemblyEnd_SeqAIJCUSP(Mat A,MatAssemblyType mode)
{
  MatAssemblyEnd_SeqAIJ(A,mode);
  MatCUSPCopyToGPU(A);
  if (mode == MAT_FLUSH_ASSEMBLY) return(0);
  A->ops->mult    = MatMult_SeqAIJCUSP;
  A->ops->multadd = MatMultAdd_SeqAIJCUSP;
  return(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
   (the default sequential PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSP library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If nnz is given then nz is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()
@*/
PetscErrorCode MatCreateSeqAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,m,n,m,n);
  MatSetType(*A,MATSEQAIJCUSP);
  MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
  return(0);
}
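
/* A minimal creation sketch (hypothetical caller, not part of this file; the sizes and
   the per-row nonzero estimate are illustrative only): */
#if 0
static PetscErrorCode ExampleCreateMatrix(Mat *A)
{
  PetscErrorCode ierr;
  PetscInt       n = 100;

  /* n x n sequential AIJ matrix on the GPU, preallocating 5 nonzeros per row */
  ierr = MatCreateSeqAIJCUSP(PETSC_COMM_SELF,n,n,5,NULL,A);CHKERRQ(ierr);
  /* ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ... */
  return 0;
}
#endif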
#if defined(PETSC_HAVE_TXPETSCGPU)

PetscErrorCode MatDestroy_SeqAIJCUSP(Mat A)
{
  Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP*)A->spptr;

  if (A->factortype == MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc*)(cuspstruct->mat);
      }
      if (cuspstruct->tempvec != 0) delete cuspstruct->tempvec;
      delete cuspstruct;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
  }

  /* The next line is needed because MatDestroy() tries to PetscFree() spptr if it is nonzero;
     PetscFree() only works on memory allocated with PetscNew() or PetscMalloc(), and spptr was
     allocated with C++ new, so we zero it here. */
  A->spptr = 0;

  MatDestroy_SeqAIJ(A);
  return(0);
}
#else

PetscErrorCode MatDestroy_SeqAIJCUSP(Mat A)
{
  Mat_SeqAIJCUSP *cuspcontainer = (Mat_SeqAIJCUSP*)A->spptr;

  try {
    if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) delete (CUSPMATRIX*)(cuspcontainer->mat);
    delete cuspcontainer;
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  /* The next line is needed because MatDestroy() tries to PetscFree() spptr if it is nonzero;
     PetscFree() only works on memory allocated with PetscNew() or PetscMalloc(), and spptr was
     allocated with C++ new, so we zero it here. */
  A->spptr = 0;

  MatDestroy_SeqAIJ(A);
  return(0);
}

#endif
/*
PetscErrorCode MatCreateSeqAIJCUSPFromTriple(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt *i, PetscInt *j, PetscScalar *a, Mat *mat, PetscInt nz, PetscBool idx)
{
  CUSPMATRIX *gpucsr;

  if (idx) {
  }
  return(0);
}*/

extern PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat, PetscInt, PetscInt, PetscInt*,const PetscScalar*);

#if defined(PETSC_HAVE_TXPETSCGPU)
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat,MatFactorType,Mat*);
extern PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat,const MatSolverPackage*);
#endif
#if defined(PETSC_HAVE_TXPETSCGPU)

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSP(Mat B)
{
  PetscErrorCode       ierr;
  MatCUSPStorageFormat format = MAT_CUSP_CSR;

  MatCreate_SeqAIJ(B);
  B->ops->mult    = MatMult_SeqAIJCUSP;
  B->ops->multadd = MatMultAdd_SeqAIJCUSP;

  if (B->factortype == MAT_FACTOR_NONE) {
    /* You cannot check the inode.use flag here since the matrix was just created. */
    B->spptr = new Mat_SeqAIJCUSP;
    ((Mat_SeqAIJCUSP*)B->spptr)->mat     = 0;
    ((Mat_SeqAIJCUSP*)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSP*)B->spptr)->format  = format;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSP;
  B->ops->destroy        = MatDestroy_SeqAIJCUSP;
  B->ops->getvecs        = MatGetVecs_SeqAIJCUSP;
  B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJCUSP;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSP;

  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSP);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  /* Here we overload MatGetFactor_cusparse_C, which enables -pc_factor_mat_solver_package cusparse to work with
     -mat_type aijcusp. That is, an aijcusp matrix can call the cusparse tri solve.
     Note the difference with the implementation in MatCreate_SeqAIJCUSPARSE in ../seqcusparse/aijcusparse.cu */
  PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_cusparse_C",MatGetFactor_seqaij_cusparse);
  PetscObjectComposeFunction((PetscObject)B,"MatCUSPSetFormat_C", MatCUSPSetFormat_SeqAIJCUSP);
  return(0);
}
#else

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSP(Mat B)
{
  Mat_SeqAIJ *aij;

  MatCreate_SeqAIJ(B);
  aij             = (Mat_SeqAIJ*)B->data;
  aij->inode.use  = PETSC_FALSE;
  B->ops->mult    = MatMult_SeqAIJCUSP;
  B->ops->multadd = MatMultAdd_SeqAIJCUSP;
  B->spptr        = new Mat_SeqAIJCUSP;

  ((Mat_SeqAIJCUSP*)B->spptr)->mat     = 0;
  ((Mat_SeqAIJCUSP*)B->spptr)->tempvec = 0;
  ((Mat_SeqAIJCUSP*)B->spptr)->indices = 0;

  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSP;
  B->ops->destroy        = MatDestroy_SeqAIJCUSP;
  B->ops->getvecs        = MatGetVecs_SeqAIJCUSP;
  B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJCUSP;

  PetscObjectComposeFunction((PetscObject)B,"MatCUSPSetFormat_C", MatCUSPSetFormat_SeqAIJCUSP);
  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSP);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  return(0);
}

#endif
/*M
   MATSEQAIJCUSP - MATAIJCUSP = "aijcusp" = "seqaijcusp" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices are in CSR format by
   default. All matrix calculations are performed using the CUSP library.
   DIA and ELL formats are ONLY available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use different GPU storage formats
   with CUSP matrix types.

   Options Database Keys:
+  -mat_type aijcusp - sets the matrix type to "seqaijcusp" during a call to MatSetFromOptions()
.  -mat_cusp_storage_format csr - sets the storage format for all supported operations during a call to MatSetFromOptions(). Other options include dia (diagonal) and ell (ellpack); dia and ell are only available with the 'txpetscgpu' package.
-  -mat_cusp_mult_storage_format csr - sets the storage format used by MatMult during a call to MatSetFromOptions(). Other options include dia (diagonal) and ell (ellpack); dia and ell are only available with the 'txpetscgpu' package.

   Level: beginner

.seealso: MatCreateSeqAIJCUSP(), MATAIJCUSP, MatCreateAIJCUSP(), MatCUSPSetFormat(), MatCUSPStorageFormat, MatCUSPFormatOperation
M*/
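
/* Illustrative command lines exercising the options database keys above (the executable
   name is a placeholder; dia and ell require the txpetscgpu package):

     ./app -mat_type aijcusp -mat_cusp_storage_format ell
     ./app -mat_type aijcusp -mat_cusp_mult_storage_format dia
*/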