Actual source code: cudavecimpl.h
petsc-3.12.3 2020-01-03
4: #include <petscvec.h>
5: #include <petsccublas.h>
6: #include <petsc/private/vecimpl.h>
8: #include <cublas_v2.h>
10: #define WaitForGPU() PetscCUDASynchronize ? cudaDeviceSynchronize() : 0
12: typedef struct {
13: PetscScalar *GPUarray; /* this always holds the GPU data */
14: PetscScalar *GPUarray_allocated; /* if the array was allocated by PETSc this is its pointer */
15: cudaStream_t stream; /* A stream for doing asynchronous data transfers */
16: PetscBool hostDataRegisteredAsPageLocked;
17: } Vec_CUDA;
20: #include <cuda_runtime.h>
22: PETSC_INTERN PetscErrorCode VecDotNorm2_SeqCUDA(Vec,Vec,PetscScalar*, PetscScalar*);
23: PETSC_INTERN PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec,Vec,Vec);
24: PETSC_INTERN PetscErrorCode VecWAXPY_SeqCUDA(Vec,PetscScalar,Vec,Vec);
25: PETSC_INTERN PetscErrorCode VecMDot_SeqCUDA(Vec,PetscInt,const Vec[],PetscScalar*);
26: PETSC_EXTERN PetscErrorCode VecSet_SeqCUDA(Vec,PetscScalar);
27: PETSC_INTERN PetscErrorCode VecMAXPY_SeqCUDA(Vec,PetscInt,const PetscScalar*,Vec*);
28: PETSC_INTERN PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec,PetscScalar,PetscScalar,PetscScalar,Vec,Vec);
29: PETSC_INTERN PetscErrorCode VecPointwiseMult_SeqCUDA(Vec,Vec,Vec);
30: PETSC_INTERN PetscErrorCode VecPlaceArray_SeqCUDA(Vec,const PetscScalar*);
31: PETSC_INTERN PetscErrorCode VecResetArray_SeqCUDA(Vec);
32: PETSC_INTERN PetscErrorCode VecReplaceArray_SeqCUDA(Vec,const PetscScalar*);
33: PETSC_INTERN PetscErrorCode VecDot_SeqCUDA(Vec,Vec,PetscScalar*);
34: PETSC_INTERN PetscErrorCode VecTDot_SeqCUDA(Vec,Vec,PetscScalar*);
35: PETSC_INTERN PetscErrorCode VecScale_SeqCUDA(Vec,PetscScalar);
36: PETSC_EXTERN PetscErrorCode VecCopy_SeqCUDA(Vec,Vec);
37: PETSC_INTERN PetscErrorCode VecSwap_SeqCUDA(Vec,Vec);
38: PETSC_EXTERN PetscErrorCode VecAXPY_SeqCUDA(Vec,PetscScalar,Vec);
39: PETSC_INTERN PetscErrorCode VecAXPBY_SeqCUDA(Vec,PetscScalar,PetscScalar,Vec);
40: PETSC_INTERN PetscErrorCode VecDuplicate_SeqCUDA(Vec,Vec*);
41: PETSC_INTERN PetscErrorCode VecConjugate_SeqCUDA(Vec xin);
42: PETSC_INTERN PetscErrorCode VecNorm_SeqCUDA(Vec,NormType,PetscReal*);
43: PETSC_INTERN PetscErrorCode VecCUDACopyToGPU(Vec);
44: PETSC_INTERN PetscErrorCode VecCUDAAllocateCheck(Vec);
45: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUDA(Vec);
46: PETSC_INTERN PetscErrorCode VecCreate_SeqCUDA_Private(Vec,const PetscScalar*);
47: PETSC_INTERN PetscErrorCode VecCreate_MPICUDA(Vec);
48: PETSC_INTERN PetscErrorCode VecCreate_MPICUDA_Private(Vec,PetscBool,PetscInt,const PetscScalar*);
49: PETSC_INTERN PetscErrorCode VecCreate_CUDA(Vec);
50: PETSC_INTERN PetscErrorCode VecDestroy_SeqCUDA(Vec);
51: PETSC_INTERN PetscErrorCode VecDestroy_MPICUDA(Vec);
52: PETSC_INTERN PetscErrorCode VecAYPX_SeqCUDA(Vec,PetscScalar,Vec);
53: PETSC_INTERN PetscErrorCode VecSetRandom_SeqCUDA(Vec,PetscRandom);
54: PETSC_INTERN PetscErrorCode VecGetLocalVector_SeqCUDA(Vec,Vec);
55: PETSC_INTERN PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec,Vec);
56: PETSC_INTERN PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin);
57: PETSC_INTERN PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r);
58: PETSC_INTERN PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v);
59: PETSC_INTERN PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin);
60: PETSC_INTERN PetscErrorCode VecCUDACopyToGPU_Public(Vec);
61: PETSC_INTERN PetscErrorCode VecCUDAAllocateCheck_Public(Vec);
62: PETSC_INTERN PetscErrorCode VecCUDACopyToGPUSome(Vec,PetscCUDAIndices,ScatterMode);
63: PETSC_INTERN PetscErrorCode VecCUDACopyFromGPUSome(Vec,PetscCUDAIndices,ScatterMode);
65: PETSC_INTERN PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt, PetscInt*,PetscInt, PetscInt*,PetscCUDAIndices*);
66: PETSC_INTERN PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt,PetscInt,PetscInt,PetscInt,PetscInt,PetscInt*,PetscInt*,PetscCUDAIndices*);
67: PETSC_INTERN PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices*);
68: PETSC_INTERN PetscErrorCode VecScatterCUDA_StoS(Vec,Vec,PetscCUDAIndices,InsertMode,ScatterMode);
70: typedef enum {VEC_SCATTER_CUDA_STOS, VEC_SCATTER_CUDA_PTOP} VecCUDAScatterType;
71: typedef enum {VEC_SCATTER_CUDA_GENERAL, VEC_SCATTER_CUDA_STRIDED} VecCUDASequentialScatterMode;
73: struct _p_VecScatterCUDAIndices_PtoP {
74: PetscInt ns;
75: PetscInt sendLowestIndex;
76: PetscInt nr;
77: PetscInt recvLowestIndex;
78: };
80: struct _p_VecScatterCUDAIndices_StoS {
81: /* from indices data */
82: PetscInt *fslots;
83: PetscInt fromFirst;
84: PetscInt fromStep;
85: VecCUDASequentialScatterMode fromMode;
87: /* to indices data */
88: PetscInt *tslots;
89: PetscInt toFirst;
90: PetscInt toStep;
91: VecCUDASequentialScatterMode toMode;
93: PetscInt n;
94: PetscInt MAX_BLOCKS;
95: PetscInt MAX_CORESIDENT_THREADS;
96: cudaStream_t stream;
97: };
99: struct _p_PetscCUDAIndices {
100: void * scatter;
101: VecCUDAScatterType scatterType;
102: };
104: /* complex single */
105: #if defined(PETSC_USE_COMPLEX)
106: #if defined(PETSC_USE_REAL_SINGLE)
107: #define cublasXaxpy(a,b,c,d,e,f,g) cublasCaxpy((a),(b),(cuComplex*)(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g))
108: #define cublasXscal(a,b,c,d,e) cublasCscal((a),(b),(cuComplex*)(c),(cuComplex*)(d),(e))
109: #define cublasXdotu(a,b,c,d,e,f,g) cublasCdotu((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g))
110: #define cublasXdot(a,b,c,d,e,f,g) cublasCdotc((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g))
111: #define cublasXswap(a,b,c,d,e,f) cublasCswap((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f))
112: #define cublasXnrm2(a,b,c,d,e) cublasScnrm2((a),(b),(cuComplex*)(c),(d),(e))
113: #define cublasIXamax(a,b,c,d,e) cublasIcamax((a),(b),(cuComplex*)(c),(d),(e))
114: #define cublasXasum(a,b,c,d,e) cublasScasum((a),(b),(cuComplex*)(c),(d),(e))
115: #define cublasXgemv(a,b,c,d,e,f,g,h,i,j,k,l) cublasCgemv((a),(b),(c),(d),(cuComplex*)(e),(cuComplex*)(f),(g),(cuComplex*)(h),(i),(cuComplex*)(j),(cuComplex*)(k),(l))
116: #define cublasXgemm(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cublasCgemm((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(cuComplex*)(h),(i),(cuComplex*)(j),(k),(cuComplex*)(l),(cuComplex*)(m),(n))
117: #else /* complex double */
118: #define cublasXaxpy(a,b,c,d,e,f,g) cublasZaxpy((a),(b),(cuDoubleComplex*)(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g))
119: #define cublasXscal(a,b,c,d,e) cublasZscal((a),(b),(cuDoubleComplex*)(c),(cuDoubleComplex*)(d),(e))
120: #define cublasXdotu(a,b,c,d,e,f,g) cublasZdotu((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g))
121: #define cublasXdot(a,b,c,d,e,f,g) cublasZdotc((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g))
122: #define cublasXswap(a,b,c,d,e,f) cublasZswap((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f))
123: #define cublasXnrm2(a,b,c,d,e) cublasDznrm2((a),(b),(cuDoubleComplex*)(c),(d),(e))
124: #define cublasIXamax(a,b,c,d,e) cublasIzamax((a),(b),(cuDoubleComplex*)(c),(d),(e))
125: #define cublasXasum(a,b,c,d,e) cublasDzasum((a),(b),(cuDoubleComplex*)(c),(d),(e))
126: #define cublasXgemv(a,b,c,d,e,f,g,h,i,j,k,l) cublasZgemv((a),(b),(c),(d),(cuDoubleComplex*)(e),(cuDoubleComplex*)(f),(g),(cuDoubleComplex*)(h),(i),(cuDoubleComplex*)(j),(cuDoubleComplex*)(k),(l))
127: #define cublasXgemm(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cublasZgemm((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(cuDoubleComplex*)(h),(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(cuDoubleComplex*)(m),(n))
128: #endif
129: #else /* real single */
130: #if defined(PETSC_USE_REAL_SINGLE)
131: #define cublasXaxpy cublasSaxpy
132: #define cublasXscal cublasSscal
133: #define cublasXdotu cublasSdot
134: #define cublasXdot cublasSdot
135: #define cublasXswap cublasSswap
136: #define cublasXnrm2 cublasSnrm2
137: #define cublasIXamax cublasIsamax
138: #define cublasXasum cublasSasum
139: #define cublasXgemv cublasSgemv
140: #define cublasXgemm cublasSgemm
141: #else /* real double */
142: #define cublasXaxpy cublasDaxpy
143: #define cublasXscal cublasDscal
144: #define cublasXdotu cublasDdot
145: #define cublasXdot cublasDdot
146: #define cublasXswap cublasDswap
147: #define cublasXnrm2 cublasDnrm2
148: #define cublasIXamax cublasIdamax
149: #define cublasXasum cublasDasum
150: #define cublasXgemv cublasDgemv
151: #define cublasXgemm cublasDgemm
152: #endif
153: #endif
155: #endif