Actual source code: mpiaijcusp.cu

petsc-3.4.2 2013-07-02
  1: #include "petscconf.h"
  2: PETSC_CUDA_EXTERN_C_BEGIN
  3: #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
  4: PETSC_CUDA_EXTERN_C_END
 5:  #include "mpicuspmatimpl.h"

  9: PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSP(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
 10: {
 11:   Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
 12: #if defined(PETSC_HAVE_TXPETSCGPU)
 13:   Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)b->spptr;
 14: #endif
 16:   PetscInt       i;

 19:   PetscLayoutSetUp(B->rmap);
 20:   PetscLayoutSetUp(B->cmap);
 21:   if (d_nnz) {
 22:     for (i=0; i<B->rmap->n; i++) {
 23:       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
 24:     }
 25:   }
 26:   if (o_nnz) {
 27:     for (i=0; i<B->rmap->n; i++) {
 28:       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
 29:     }
 30:   }
 31:   if (!B->preallocated) {
 32:     /* Explicitly create 2 MATSEQAIJCUSP matrices. */
 33:     MatCreate(PETSC_COMM_SELF,&b->A);
 34:     MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
 35:     MatSetType(b->A,MATSEQAIJCUSP);
 36:     PetscLogObjectParent(B,b->A);
 37:     MatCreate(PETSC_COMM_SELF,&b->B);
 38:     MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
 39:     MatSetType(b->B,MATSEQAIJCUSP);
 40:     PetscLogObjectParent(B,b->B);
 41:   }
 42:   MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
 43:   MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
 44: #if defined(PETSC_HAVE_TXPETSCGPU)
 45:   MatCUSPSetFormat(b->A,MAT_CUSP_MULT,cuspStruct->diagGPUMatFormat);
 46:   MatCUSPSetFormat(b->B,MAT_CUSP_MULT,cuspStruct->offdiagGPUMatFormat);
 47: #endif
 48:   B->preallocated = PETSC_TRUE;
 49:   return(0);
 50: }
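
The routine above is what a user-level MatMPIAIJSetPreallocation() call dispatches to when the matrix type is MATMPIAIJCUSP. The following is a minimal illustrative sketch of such a call, not part of this file; the communicator, sizes, and nonzero counts are made up for the example:

    Mat            A;
    PetscErrorCode ierr;

    ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
    ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
    ierr = MatSetType(A,MATMPIAIJCUSP);CHKERRQ(ierr);
    /* at most 5 nonzeros per local row in the diagonal block and 2 in the off-diagonal
       block; per-row arrays (d_nnz/o_nnz) may be passed instead of NULL */
    ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);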

 54: PetscErrorCode  MatGetVecs_MPIAIJCUSP(Mat mat,Vec *right,Vec *left)
 55: {

 59:   if (right) {
 60:     VecCreate(PetscObjectComm((PetscObject)mat),right);
 61:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
 62:     VecSetBlockSize(*right,mat->rmap->bs);
 63:     VecSetType(*right,VECCUSP);
 64:     VecSetLayout(*right,mat->cmap);
 65:   }
 66:   if (left) {
 67:     VecCreate(PetscObjectComm((PetscObject)mat),left);
 68:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
 69:     VecSetBlockSize(*left,mat->rmap->bs);
 70:     VecSetType(*left,VECCUSP);
 71:     VecSetLayout(*left,mat->rmap);
 72:   }
 73:   return(0);
 74: }
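
MatGetVecs() on a MATMPIAIJCUSP matrix therefore returns VECCUSP vectors whose layouts match the matrix's column map (right) and row map (left). A minimal illustrative sketch, not part of this file, assuming 'A' is an assembled MATMPIAIJCUSP matrix:

    Vec            x,y;
    PetscErrorCode ierr;

    ierr = MatGetVecs(A,&x,&y);CHKERRQ(ierr);  /* x matches A's column layout, y its row layout; both are VECCUSP */
    ierr = VecSet(x,1.0);CHKERRQ(ierr);
    ierr = MatMult(A,x,y);CHKERRQ(ierr);       /* uses the GPU MatMult below when txpetscgpu is enabled */
    ierr = VecDestroy(&x);CHKERRQ(ierr);
    ierr = VecDestroy(&y);CHKERRQ(ierr);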


 77: #if defined(PETSC_HAVE_TXPETSCGPU)
 80: PetscErrorCode MatMult_MPIAIJCUSP(Mat A,Vec xx,Vec yy)
 81: {
 82:   /* This multiplication sequence differs from the CPU version.
 83:      In particular, the diagonal block
 84:      multiplication kernel is launched in one stream. Then,
 85:      in a separate stream, the DeviceToHost data transfers
 86:      (with MPI messaging in between) and the HostToDevice transfers are
 87:      launched. Once the data transfer stream is synchronized,
 88:      to ensure messaging is complete, the MatMultAdd kernel
 89:      is launched in the original (MatMult) stream to protect
 90:      against race conditions.

 92:      This sequence should only be called for GPU computation. */
 93:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
 95:   PetscInt       nt;

 98:   VecGetLocalSize(xx,&nt);
 99:   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
100:   VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
101:   (*a->A->ops->mult)(a->A,xx,yy);
102:   VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
103:   VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
104:   (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
105:   VecScatterFinalizeForGPU(a->Mvctx);
106:   return(0);
107: }
108: #endif
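
For comparison with the comment above, the generic CPU MatMult_MPIAIJ orders the same operations so that the scatter is posted before the local multiply. Roughly, as a sketch shown here only to highlight the difference in sequencing:

    VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);  /* post halo exchange        */
    (*a->A->ops->mult)(a->A,xx,yy);                                      /* diagonal-block multiply   */
    VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);    /* finish halo exchange      */
    (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);                           /* off-diagonal multiply-add */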

110: PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);

112: #if defined(PETSC_HAVE_TXPETSCGPU)

116: PetscErrorCode MatCUSPSetFormat_MPIAIJCUSP(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
117: {
118:   Mat_MPIAIJ     *a           = (Mat_MPIAIJ*)A->data;
119:   Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

122:   switch (op) {
123:   case MAT_CUSP_MULT_DIAG:
124:     cuspStruct->diagGPUMatFormat = format;
125:     break;
126:   case MAT_CUSP_MULT_OFFDIAG:
127:     cuspStruct->offdiagGPUMatFormat = format;
128:     break;
129:   case MAT_CUSP_ALL:
130:     cuspStruct->diagGPUMatFormat    = format;
131:     cuspStruct->offdiagGPUMatFormat = format;
132:     break;
133:   default:
134:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPFormatOperation. Only MAT_CUSP_MULT_DIAG, MAT_CUSP_MULT_OFFDIAG, and MAT_CUSP_ALL are currently supported.",op);
135:   }
136:   return(0);
137: }
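
A minimal illustrative sketch of the corresponding user-level call, not part of this file; it requires the 'txpetscgpu' package and assumes the ELLPACK enumerator of MatCUSPStorageFormat is MAT_CUSP_ELL:

    /* use ELLPACK storage for the diagonal block and keep CSR for the off-diagonal block */
    ierr = MatCUSPSetFormat(A,MAT_CUSP_MULT_DIAG,MAT_CUSP_ELL);CHKERRQ(ierr);
    ierr = MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,MAT_CUSP_CSR);CHKERRQ(ierr);
    /* or select one format for both blocks at once */
    ierr = MatCUSPSetFormat(A,MAT_CUSP_ALL,MAT_CUSP_ELL);CHKERRQ(ierr);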

141: PetscErrorCode MatSetFromOptions_MPIAIJCUSP(Mat A)
142: {
143:   MatCUSPStorageFormat format;
144:   PetscErrorCode       ierr;
145:   PetscBool            flg;

148:   PetscOptionsHead("MPIAIJCUSP options");
149:   PetscObjectOptionsBegin((PetscObject)A);
150:   if (A->factortype==MAT_FACTOR_NONE) {
151:     PetscOptionsEnum("-mat_cusp_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
152:                             "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
153:     if (flg) {
154:       MatCUSPSetFormat(A,MAT_CUSP_MULT_DIAG,format);
155:     }
156:     PetscOptionsEnum("-mat_cusp_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
157:                             "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
158:     if (flg) {
159:       MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,format);
160:     }
161:     PetscOptionsEnum("-mat_cusp_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
162:                             "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
163:     if (flg) {
164:       MatCUSPSetFormat(A,MAT_CUSP_ALL,format);
165:     }
166:   }
167:   PetscOptionsEnd();
168:   return(0);
169: }
170: #endif
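
These options take effect when MatSetFromOptions() is called on a matrix of this type, for example via the run-time option -mat_cusp_storage_format ell (ell/dia require the 'txpetscgpu' package). A minimal illustrative sketch, not part of this file, where 'A' is a MATMPIAIJCUSP matrix created as in the earlier sketch:

    /* equivalent to passing -mat_cusp_storage_format ell on the command line */
    ierr = PetscOptionsSetValue("-mat_cusp_storage_format","ell");CHKERRQ(ierr);
    ierr = MatSetFromOptions(A);CHKERRQ(ierr);   /* invokes MatSetFromOptions_MPIAIJCUSP above */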

174: PetscErrorCode MatDestroy_MPIAIJCUSP(Mat A)
175: {
177: #if defined(PETSC_HAVE_TXPETSCGPU)
178:   Mat_MPIAIJ     *a           = (Mat_MPIAIJ*)A->data;
179:   Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
180: #endif

183: #if defined(PETSC_HAVE_TXPETSCGPU)
184:   try {
185:     delete cuspStruct;
186:   } catch(char *ex) {
187:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", ex);
188:   }
189:   cuspStruct = 0;
190: #endif
191:   MatDestroy_MPIAIJ(A);
192:   return(0);
193: }

197: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSP(Mat A)
198: {
200: #if defined(PETSC_HAVE_TXPETSCGPU)
201:   Mat_MPIAIJ     *a;
202:   Mat_MPIAIJCUSP * cuspStruct;
203: #endif

206:   MatCreate_MPIAIJ(A);
207:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSP);
208:   A->ops->getvecs        = MatGetVecs_MPIAIJCUSP;
209:   A->ops->setvaluesbatch = MatSetValuesBatch_MPIAIJCUSP;

211: #if defined(PETSC_HAVE_TXPETSCGPU)
212:   a          = (Mat_MPIAIJ*)A->data;
213:   a->spptr   = new Mat_MPIAIJCUSP;
214:   cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

216:   cuspStruct->diagGPUMatFormat    = MAT_CUSP_CSR;
217:   cuspStruct->offdiagGPUMatFormat = MAT_CUSP_CSR;

219:   A->ops->mult           = MatMult_MPIAIJCUSP;
220:   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSP;
221:   A->ops->destroy        = MatDestroy_MPIAIJCUSP;

223:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPSetFormat_C", MatCUSPSetFormat_MPIAIJCUSP);
224: #endif
225:   PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSP);
226:   return(0);
227: }


230: /*@
 231:    MatCreateAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
 232:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
 233:    to NVidia GPUs and use the CUSP library for calculations. For good matrix
 234:    assembly performance the user should preallocate the matrix storage by setting
 235:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
 236:    parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.


239:    Collective on MPI_Comm

241:    Input Parameters:
 242: +  comm - MPI communicator
 243: .  m,n - number of local rows and columns (or PETSC_DECIDE)
 244: .  M,N - number of global rows and columns (or PETSC_DETERMINE)
 245: .  d_nz,o_nz - number of nonzeros per row in the diagonal and off-diagonal blocks (same for all local rows)
 246: -  d_nnz,o_nnz - arrays containing the number of nonzeros per row in the diagonal and off-diagonal blocks
 247:          (possibly different for each row) or NULL

249:    Output Parameter:
250: .  A - the matrix

252:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
253:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
 254:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

256:    Notes:
 257:    If d_nnz (or o_nnz) is given then d_nz (or o_nz) is ignored

259:    The AIJ format (also called the Yale sparse matrix format or
260:    compressed row storage), is fully compatible with standard Fortran 77
261:    storage.  That is, the stored row and column indices can begin at
262:    either one (as in Fortran) or zero.  See the users' manual for details.

 264:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
 265:    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
 266:    allocation.  For large problems you MUST preallocate memory or you
 267:    will get TERRIBLE performance, see the users' manual chapter on matrices.

269:    By default, this format uses inodes (identical nodes) when possible, to
270:    improve numerical efficiency of matrix-vector products and solves. We
271:    search for consecutive rows with the same nonzero structure, thereby
272:    reusing matrix information to achieve increased efficiency.

274:    Level: intermediate

276: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSP, MATAIJCUSP
277: @*/
280: PetscErrorCode  MatCreateAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
281: {
283:   PetscMPIInt    size;

286:   MatCreate(comm,A);
287:   MatSetSizes(*A,m,n,M,N);
288:   MPI_Comm_size(comm,&size);
289:   if (size > 1) {
290:     MatSetType(*A,MATMPIAIJCUSP);
291:     MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
292:   } else {
293:     MatSetType(*A,MATSEQAIJCUSP);
294:     MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
295:   }
296:   return(0);
297: }
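
A minimal illustrative sketch of calling this convenience routine directly, not part of this file; the global size and nonzero counts are made up for the example:

    Mat            A;
    PetscErrorCode ierr;

    /* 100 x 100 global matrix, at most 5 nonzeros per row in the diagonal block
       and 2 per row in the off-diagonal block on every process */
    ierr = MatCreateAIJCUSP(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);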

299: /*M
 300:    MATAIJCUSP - MATMPIAIJCUSP = "aijcusp" = "mpiaijcusp" - A matrix type to be used for sparse matrices.

 302:    A matrix type whose data resides on NVidia GPUs. These matrices are stored in CSR format by default.
 303:    All matrix calculations are performed using the CUSP library. DIA and ELL
 304:    formats are ONLY available when using the 'txpetscgpu' package. Use --download-txpetscgpu
 305:    when configuring PETSc to enable the other GPU storage formats with CUSP matrix types.

307:    This matrix type is identical to MATSEQAIJCUSP when constructed with a single process communicator,
308:    and MATMPIAIJCUSP otherwise.  As a result, for single process communicators,
309:    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
310:    for communicators controlling multiple processes.  It is recommended that you call both of
311:    the above preallocation routines for simplicity.

313:    Options Database Keys:
314: +  -mat_type mpiaijcusp - sets the matrix type to "mpiaijcusp" during a call to MatSetFromOptions()
 315: .  -mat_cusp_storage_format csr - sets the storage format of the diagonal and off-diagonal blocks during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) and ell (ellpack), which are only available with the 'txpetscgpu' package; this option itself also requires the 'txpetscgpu' package.
 316: .  -mat_cusp_mult_diag_storage_format csr - sets the storage format of the diagonal block during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) and ell (ellpack), which are only available with the 'txpetscgpu' package; this option itself also requires the 'txpetscgpu' package.
 317: -  -mat_cusp_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal block during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) and ell (ellpack), which are only available with the 'txpetscgpu' package; this option itself also requires the 'txpetscgpu' package.

319:   Level: beginner

321:  .seealso: MatCreateAIJCUSP(), MATSEQAIJCUSP, MatCreateSeqAIJCUSP(), MatCUSPSetFormat(), MatCUSPStorageFormat, MatCUSPFormatOperation
322: M*/
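
As the manual page above recommends, calling both preallocation routines lets the same code run unchanged on one process (MATSEQAIJCUSP) or many (MATMPIAIJCUSP); the routine that does not apply to the actual type is simply ignored. A minimal illustrative sketch, not part of this file:

    ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
    ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
    ierr = MatSetType(A,MATAIJCUSP);CHKERRQ(ierr);                   /* resolves to the seq or mpi variant by communicator size */
    ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);        /* used when running on one process   */
    ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr); /* used when running on many processes */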