Actual source code: mpiaijcusparse.cu
petsc-3.4.2 2013-07-02
#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include "mpicusparsematimpl.h"
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  PetscLayoutSetBlockSize(B->rmap,1);
  PetscLayoutSetBlockSize(B->cmap,1);
  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent(B,b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent(B,b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);

  B->preallocated = PETSC_TRUE;
  return(0);
}
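
For reference, a minimal usage sketch (not part of this source file) of how the preallocation routine above is reached from user code; the global size and fill estimates are hypothetical. The "MatMPIAIJSetPreallocation_C" function composed in MatCreate_MPIAIJCUSPARSE below is what dispatches the call here.

   Mat A;
   MatCreate(PETSC_COMM_WORLD,&A);
   MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);  /* hypothetical 100x100 global matrix */
   MatSetType(A,MATMPIAIJCUSPARSE);
   MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);        /* dispatches to the routine above, which
                                                         creates the two MATSEQAIJCUSPARSE blocks */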
PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,mat->rmap->bs);
    VecSetType(*right,VECCUSP);
    VecSetLayout(*right,mat->cmap);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,mat->rmap->bs);
    VecSetType(*left,VECCUSP);
    VecSetLayout(*left,mat->rmap);
  }
  return(0);
}
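
A usage sketch for the routine above: MatGetVecs() (this is the PETSc 3.4 name; later releases renamed it MatCreateVecs()) hands back VECCUSP vectors whose layouts match the matrix column and row maps. The matrix A here is assumed to be MATMPIAIJCUSPARSE.

   Vec x,y;
   MatGetVecs(A,&x,&y);   /* x (right) is compatible with A*x; y (left) with y = A*x;
                             both come back as VECCUSP vectors */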
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different from the CPU version.
     In particular, the diagonal block multiplication kernel is
     launched in one stream. Then, in a separate stream, the
     DeviceToHost data transfers (with MPI messaging in between) and
     the HostToDevice transfers are launched. Once the data transfer
     stream is synchronized, to ensure the MPI messaging is complete,
     the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
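
From user code the overlapped sequence above is reached through the ordinary interface; a sketch, assuming A is a MATMPIAIJCUSPARSE matrix and x,y are VECCUSP vectors obtained as in the MatGetVecs() example:

   MatMult(A,x,y);   /* diagonal-block SpMV overlaps the off-diagonal scatter;
                        the off-diagonal contribution is then added in */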
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different from the CPU version.
     In particular, the diagonal block multiplication kernel is
     launched in one stream. Then, in a separate stream, the
     DeviceToHost data transfers (with MPI messaging in between) and
     the HostToDevice transfers are launched. Once the data transfer
     stream is synchronized, to ensure the MPI messaging is complete,
     the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->multtranspose)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
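
The programmatic route to these settings is MatCUSPARSESetFormat(); a sketch using the operations handled in the switch above (MAT_CUSPARSE_HYB is assumed to be the HYB enumerator provided with the txpetscgpu build):

   MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);        /* both blocks */
   MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_CSR);  /* diagonal block only */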
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscOptionsHead("MPIAIJCUSPARSE options");
  PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsEnd();
  return(0);
}
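
Equivalently, the options registered above can be supplied at run time and picked up by MatSetFromOptions(); an illustrative invocation (the application name is a placeholder):

   ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb

   /* ... then, in the application code, after MatCreate()/MatSetSizes(): */
   MatSetFromOptions(A);   /* reads the -mat_cusparse_* options registered above */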
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  MatDestroy_MPIAIJ(A);
  return(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE base
   matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated)
.  n - number of local columns (or PETSC_DECIDE to have it calculated)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given, then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz scalars or the nnz arrays
   (not both). Set d_nz and o_nz to PETSC_DEFAULT and d_nnz and o_nnz to NULL for
   PETSc to control dynamic memory allocation. For large problems you MUST
   preallocate memory or you will get TERRIBLE performance; see the users' manual
   chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
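
A self-contained sketch of the convenience constructor above; the global size and fill estimates are hypothetical:

   Mat A;
   /* 1000 x 1000 global, local split decided by PETSc; expect about 5 nonzeros
      per row in the diagonal block and 2 in the off-diagonal block */
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,1000,1000,5,NULL,2,NULL,&A);
   /* ... MatSetValues() / MatAssemblyBegin() / MatAssemblyEnd() as usual ... */
   MatDestroy(&A);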
/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in CSR format.
   All matrix calculations are performed using the NVIDIA CUSPARSE library. Use of the
   CUSPARSE library REQUIRES the 'txpetscgpu' package. The ELL and HYB formats are also
   available in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc
   to use different GPU storage formats with CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity (see the sketch after this block).

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
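
The "call both preallocation routines" advice above can be followed as in this sketch, which works for any communicator size (the sizes and fill estimates are placeholders):

   Mat A;
   MatCreate(comm,&A);
   MatSetSizes(A,m,n,M,N);
   MatSetType(A,MATAIJCUSPARSE);                 /* resolves to the seq or mpi variant */
   MatSeqAIJSetPreallocation(A,5,NULL);          /* ignored by the mpi variant */
   MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);   /* ignored by the seq variant */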