Actual source code: landaucu.cu
petsc-3.14.3 2021-01-09
1: /*
2: Implements the Landau kernel
3: */
4: #include <petscconf.h>
5: #include <petsc/private/dmpleximpl.h>
6: #include <petsclandau.h>
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <petsc/private/kernels/petscaxpy.h>
11: #define PETSC_THREAD_SYNC __syncthreads()
12: #define PETSC_DEVICE_FUNC_DECL __device__
13: #include "../land_kernel.h"
15: // Macro to catch CUDA errors in CUDA runtime calls
16: #define CUDA_SAFE_CALL(call) \
17: do { \
18: cudaError_t err = call; \
19: if (cudaSuccess != err) { \
20: fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
21: __FILE__, __LINE__, cudaGetErrorString(err)); \
22: exit(EXIT_FAILURE); \
23: } \
24: } while (0)
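// Illustrative note (not part of the original source): the do { ... } while (0) wrapper lets the
// macro be used as a single statement, e.g. inside an if/else without braces. Typical usage wraps
// any runtime call that returns a cudaError_t:
//   PetscReal *d_buf;                                   /* hypothetical names, not from this file */
//   CUDA_SAFE_CALL(cudaMalloc((void **)&d_buf, n*sizeof(PetscReal)));
//   CUDA_SAFE_CALL(cudaMemcpy(d_buf, h_buf, n*sizeof(PetscReal), cudaMemcpyHostToDevice));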
25: // Macro to catch CUDA errors in kernel launches
26: #define CHECK_LAUNCH_ERROR() \
27: do { \
28: /* Check synchronous errors, i.e. pre-launch */ \
29: cudaError_t err = cudaGetLastError(); \
30: if (cudaSuccess != err) { \
31: fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
32: __FILE__, __LINE__, cudaGetErrorString(err)); \
33: exit(EXIT_FAILURE); \
34: } \
35: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
36: err = cudaDeviceSynchronize(); \
37: if (cudaSuccess != err) { \
38: fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
39: __FILE__, __LINE__, cudaGetErrorString( err)); \
40: exit(EXIT_FAILURE); \
41: } \
42: } while (0)
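// Illustrative note (not part of the original source): a kernel launch itself returns no error
// code, so this macro is placed immediately after each <<<...>>> launch below; cudaGetLastError()
// reports synchronous launch/configuration failures and cudaDeviceSynchronize() surfaces
// asynchronous faults from the kernel body, e.g.
//   some_kernel<<<grid,block,shmem>>>(args);   /* hypothetical launch */
//   CHECK_LAUNCH_ERROR();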
44: #define LANDAU_USE_SHARED_GPU_MEM
45: //
46: // The GPU Landau kernel
47: //
48: __global__
49: void landau_kernel(const PetscInt nip, const PetscInt dim, const PetscInt totDim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJj[],
50: const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
51: const PetscReal * const BB, const PetscReal * const DD, const PetscReal * const IPDataGlobal, const PetscReal wiGlobal[],
52: #if !defined(LANDAU_USE_SHARED_GPU_MEM)
53: PetscReal *g2arr, PetscReal *g3arr,
54: #endif
55: PetscBool quarter3DDomain, PetscScalar elemMats_out[])
56: {
57: const PetscInt Nq = blockDim.x, myelem = blockIdx.x;
58: #if defined(LANDAU_USE_SHARED_GPU_MEM)
59: extern __shared__ PetscReal g2_g3_qi[]; // Nq * { [NSubBlocks][Nf][dim] ; [NSubBlocks][Nf][dim][dim] }
60: PetscReal (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]) &g2_g3_qi[0];
61: PetscReal (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g2_g3_qi[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM];
62: #else
63: PetscReal (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]) &g2arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM ];
64: PetscReal (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g3arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*LANDAU_DIM];
65: #endif
66: const PetscInt myQi = threadIdx.x, mySubBlk = threadIdx.y, nSubBlks = blockDim.y;
67: const PetscInt jpidx = myQi + myelem * Nq;
68: const PetscInt subblocksz = nip/nSubBlks + !!(nip%nSubBlks), ip_start = mySubBlk*subblocksz, ip_end = (mySubBlk+1)*subblocksz > nip ? nip : (mySubBlk+1)*subblocksz; /* this could be wrong with very few global IPs */
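  /* Illustrative example (not in the original source): subblocksz is a ceiling division, so with
     nip=100 and nSubBlks=3 we get subblocksz=34 and ranges [0,34), [34,68), [68,100) once ip_end
     is clamped to nip. */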
69: PetscScalar *elemMat = &elemMats_out[myelem*totDim*totDim]; /* my output */
71: if (threadIdx.x==0 && threadIdx.y==0) {
72: memset(elemMat, 0, totDim*totDim*sizeof(PetscScalar));
73: }
74: __syncthreads();
75: landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, ip_start, ip_end, 1, jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* compact */
76: // landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, mySubBlk, nip, nSubBlks, jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* spread */
77: }
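// Launch geometry used by LandauCUDAJacobian below: one thread block per element
// (gridDim.x = numGCells), blockDim.x = Nq quadrature points and blockDim.y = num_sub_blocks
// sub-blocks that partition the inner loop over the nip global integration points; g2/g3 are
// per-(quadrature point, sub-block, species) scratch for the Landau tensors, kept either in
// dynamic shared memory or in a preallocated global buffer depending on LANDAU_USE_SHARED_GPU_MEM.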
78: static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[]);
79: __global__ void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[]);
80: PetscErrorCode LandauCUDAJacobian(DM plex, const PetscInt Nq, const PetscReal nu_alpha[],const PetscReal nu_beta[],
81: const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal * const IPDataGlobal,
82: const PetscReal wiGlobal[], const PetscReal invJj[], const PetscInt num_sub_blocks, const PetscLogEvent events[], PetscBool quarter3DDomain,
83: Mat JacP)
84: {
85: PetscErrorCode ierr;
86: PetscInt ii,ej,*Nbf,Nb,nip_dim2,cStart,cEnd,Nf,dim,numGCells,totDim,nip,szf=sizeof(PetscReal);
87: PetscReal *d_BB,*d_DD,*d_invJj,*d_wiGlobal,*d_nu_alpha,*d_nu_beta,*d_invMass,*d_Eq_m;
88: PetscScalar *elemMats,*d_elemMats;
89: PetscLogDouble flops;
90: PetscTabulation *Tf;
91: PetscDS prob;
92: PetscSection section, globalSection;
93: PetscReal *d_IPDataGlobal;
94: PetscBool cuda_assemble = PETSC_FALSE;
97: PetscLogEventBegin(events[3],0,0,0,0);
98: DMGetDimension(plex, &dim);
99: if (dim!=LANDAU_DIM) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM != dim");
100: DMPlexGetHeightStratum(plex,0,&cStart,&cEnd);
101: numGCells = cEnd - cStart;
102: nip = numGCells*Nq; /* length of inner global iteration */
103: DMGetDS(plex, &prob);
104: PetscDSGetNumFields(prob, &Nf);
105: PetscDSGetDimensions(prob, &Nbf); Nb = Nbf[0];
106: if (Nq != Nb) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Nq != Nb. %D %D",Nq,Nb);
107: PetscDSGetTotalDimension(prob, &totDim);
108: PetscDSGetTabulation(prob, &Tf);
109: DMGetLocalSection(plex, &section);
110: DMGetGlobalSection(plex, &globalSection);
111: // create data
112: CUDA_SAFE_CALL(cudaMalloc((void **)&d_IPDataGlobal, nip*(dim + Nf*(dim+1))*szf)); // kernel input
113: CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_alpha, Nf*szf)); // kernel input
114: CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_beta, Nf*szf)); // kernel input
115: CUDA_SAFE_CALL(cudaMalloc((void **)&d_invMass, Nf*szf)); // kernel input
116: CUDA_SAFE_CALL(cudaMalloc((void **)&d_Eq_m, Nf*szf)); // kernel input
117: CUDA_SAFE_CALL(cudaMemcpy(d_IPDataGlobal, IPDataGlobal, nip*(dim + Nf*(dim+1))*szf, cudaMemcpyHostToDevice));
118: CUDA_SAFE_CALL(cudaMemcpy(d_nu_alpha, nu_alpha, Nf*szf, cudaMemcpyHostToDevice));
119: CUDA_SAFE_CALL(cudaMemcpy(d_nu_beta, nu_beta, Nf*szf, cudaMemcpyHostToDevice));
120: CUDA_SAFE_CALL(cudaMemcpy(d_invMass, invMass, Nf*szf, cudaMemcpyHostToDevice));
121: CUDA_SAFE_CALL(cudaMemcpy(d_Eq_m, Eq_m, Nf*szf, cudaMemcpyHostToDevice));
122: CUDA_SAFE_CALL(cudaMalloc((void **)&d_BB, Nq*Nb*szf)); // kernel input
123: CUDA_SAFE_CALL(cudaMemcpy( d_BB, Tf[0]->T[0], Nq*Nb*szf, cudaMemcpyHostToDevice));
124: CUDA_SAFE_CALL(cudaMalloc((void **)&d_DD, Nq*Nb*dim*szf)); // kernel input
125: CUDA_SAFE_CALL(cudaMemcpy( d_DD, Tf[0]->T[1], Nq*Nb*dim*szf, cudaMemcpyHostToDevice));
126: CUDA_SAFE_CALL(cudaMalloc((void **)&d_wiGlobal, Nq*numGCells*szf)); // kernel input
127: CUDA_SAFE_CALL(cudaMemcpy( d_wiGlobal, wiGlobal, Nq*numGCells*szf, cudaMemcpyHostToDevice));
128: // collect geometry
129: flops = (PetscLogDouble)numGCells*(PetscLogDouble)Nq*(PetscLogDouble)(5.*dim*dim*Nf*Nf + 165.);
130: nip_dim2 = Nq*numGCells*dim*dim;
131: CUDA_SAFE_CALL(cudaMalloc((void **)&d_invJj, nip_dim2*szf)); // kernel input
132: CUDA_SAFE_CALL(cudaMemcpy(d_invJj, invJj, nip_dim2*szf, cudaMemcpyHostToDevice));
133: PetscLogEventEnd(events[3],0,0,0,0);
135: PetscLogEventBegin(events[4],0,0,0,0);
136: PetscLogGpuFlops(flops*nip);
137: {
138: dim3 dimBlock(Nq,num_sub_blocks);
139: CUDA_SAFE_CALL(cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar))); // kernel output
140: ii = LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM)*LANDAU_MAX_SUB_THREAD_BLOCKS;
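  /* Optional sanity check (a sketch, not in the original source): when LANDAU_USE_SHARED_GPU_MEM is
     defined, the launch below requests ii*szf bytes of dynamic shared memory per block, which must
     fit the device limit. Something along these lines could verify that before launching:
       int dev, maxShm;
       CUDA_SAFE_CALL(cudaGetDevice(&dev));
       CUDA_SAFE_CALL(cudaDeviceGetAttribute(&maxShm, cudaDevAttrMaxSharedMemoryPerBlock, dev));
       if ((size_t)(ii*szf) > (size_t)maxShm) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "dynamic shared memory request exceeds device limit");
  */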
141: #if defined(LANDAU_USE_SHARED_GPU_MEM)
142: // PetscPrintf(PETSC_COMM_SELF,"Call land_kernel with %D kB shared memory\n",ii*8/1024);
143: landau_kernel<<<numGCells,dimBlock,ii*szf>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
144: d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, quarter3DDomain, d_elemMats);
145: CHECK_LAUNCH_ERROR();
146: #else
147: PetscReal *d_g2g3;
148: CUDA_SAFE_CALL(cudaMalloc((void **)&d_g2g3, ii*szf*numGCells)); // kernel input
149: PetscReal *g2 = &d_g2g3[0];
150: PetscReal *g3 = &d_g2g3[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*numGCells];
151: landau_kernel<<<numGCells,dimBlock>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
152: d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, g2, g3, quarter3DDomain, d_elemMats);
153: CHECK_LAUNCH_ERROR();
154: CUDA_SAFE_CALL (cudaDeviceSynchronize());
155: CUDA_SAFE_CALL(cudaFree(d_g2g3));
156: #endif
157: }
158: PetscLogEventEnd(events[4],0,0,0,0);
159: // delete device data
160: PetscLogEventBegin(events[5],0,0,0,0);
161: CUDA_SAFE_CALL(cudaFree(d_IPDataGlobal));
162: CUDA_SAFE_CALL(cudaFree(d_invJj));
163: CUDA_SAFE_CALL(cudaFree(d_wiGlobal));
164: CUDA_SAFE_CALL(cudaFree(d_nu_alpha));
165: CUDA_SAFE_CALL(cudaFree(d_nu_beta));
166: CUDA_SAFE_CALL(cudaFree(d_invMass));
167: CUDA_SAFE_CALL(cudaFree(d_Eq_m));
168: CUDA_SAFE_CALL(cudaFree(d_BB));
169: CUDA_SAFE_CALL(cudaFree(d_DD));
170: PetscMalloc1(totDim*totDim*numGCells,&elemMats);
171: CUDA_SAFE_CALL(cudaMemcpy(elemMats, d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
172: CUDA_SAFE_CALL(cudaFree(d_elemMats));
173: PetscLogEventEnd(events[5],0,0,0,0);
175: PetscLogEventBegin(events[6],0,0,0,0);
176: if (!cuda_assemble) {
177: PetscScalar *elMat;
178: for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
179: DMPlexMatSetClosure(plex, section, globalSection, JacP, ej, elMat, ADD_VALUES);
180: if (ej==-1) {
181: int d,f;
182: printf("GPU Element matrix\n");
183: for (d = 0; d < totDim; ++d) {
184: for (f = 0; f < totDim; ++f) printf(" %17.10e", PetscRealPart(elMat[d*totDim + f]));
185: printf("\n");
186: }
187: exit(12);
188: }
189: }
190: } else {
191: PetscContainer container = NULL;
192: PetscObjectQuery((PetscObject)JacP,"coloring",(PetscObject*)&container);
193: if (!container) {
194: PetscLogEventBegin(events[8],0,0,0,0);
195: LandauCreateColoring(JacP, plex, &container);
196: PetscLogEventEnd(events[8],0,0,0,0);
197: }
198: LandauAssembleCuda(cStart, cEnd, totDim, plex, section, globalSection, JacP, elemMats, container, events);
199: }
200: PetscFree(elemMats);
201: PetscLogEventEnd(events[6],0,0,0,0);
203: return(0);
204: }
206: __global__
207: void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[])
208: {
209: const PetscInt myelem = (gridDim.x==1) ? threadIdx.x : blockIdx.x;
210: Mat_SeqAIJ a = mats[colors[myelem]]; /* copy to GPU */
211: const PetscScalar *v = el_mats[myelem];
212: const PetscInt *in = idx_arr[myelem], *im = idx_arr[myelem], n = nidx_arr[myelem], m = nidx_arr[myelem];
213: /* mat set values */
214: PetscInt *rp,k,low,high,t,row,nrow,i,col,l;
215: PetscInt *ai = a.i,*ailen = a.ilen;
216: PetscInt *aj = a.j,lastcol = -1;
217: MatScalar *ap=NULL,value=0.0,*aa = a.a;
218: for (k=0; k<m; k++) { /* loop over added rows */
219: row = im[k];
220: if (row < 0) continue;
221: rp = aj + ai[row];
222: ap = aa + ai[row];
223: nrow = ailen[row];
224: low = 0;
225: high = nrow;
226: for (l=0; l<n; l++) { /* loop over added columns */
227: /* if (in[l] < 0) { */
228: /* printf("\t\tin[l] < 0 ?????\n"); */
229: /* continue; */
230: /* } */
231: while (l<n && (value = v[l + k*n], PetscAbsScalar(value)==0.0)) l++;
232: if (l==n) break;
233: col = in[l];
234: if (col <= lastcol) low = 0;
235: else high = nrow;
236: lastcol = col;
237: while (high-low > 5) {
238: t = (low+high)/2;
239: if (rp[t] > col) high = t;
240: else low = t;
241: }
242: for (i=low; i<high; i++) {
243: // if (rp[i] > col) break;
244: if (rp[i] == col) {
245: ap[i] += value;
246: low = i + 1;
247: goto noinsert;
248: }
249: }
250: printf("\t\t\t ERROR in assemble_kernel\n");
251: noinsert:;
252: }
253: }
254: }
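// The loop above mirrors the MatSetValues insertion logic for a SeqAIJ matrix, run on the device
// with one thread per element: for each row it searches the CSR column indices
// a.j[a.i[row] .. a.i[row]+a.ilen[row]) by bisection (the `while (high-low > 5)` step) followed by
// a short linear scan, then accumulates the element-matrix value into the matching a.a slot; a
// missing (row,col) entry is an error here because the sparsity pattern was preallocated on the host.
// Illustrative CSR example (not from this file): for rows {0:[0,2], 1:[1]} one has
//   a.i = {0,2,3}, a.j = {0,2,1}, so row 1's values live in a.a[2..3).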
256: static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[])
257: {
258: PetscErrorCode ierr;
259: #define LANDAU_MAX_COLORS 16
260: #define LANDAU_MAX_ELEMS 512
261: Mat_SeqAIJ h_mats[LANDAU_MAX_COLORS], *jaca = (Mat_SeqAIJ *)JacP->data, *d_mats;
262: const PetscInt nelems = cEnd - cStart, nnz = jaca->i[JacP->rmap->n], N = JacP->rmap->n; /* serial */
263: const ISColoringValue *colors;
264: ISColoringValue *d_colors,colour;
265: PetscInt *h_idx_arr[LANDAU_MAX_ELEMS], h_nidx_arr[LANDAU_MAX_ELEMS], *d_nidx_arr, **d_idx_arr,nc,ej,j,cell;
266: PetscScalar *h_new_el_mats[LANDAU_MAX_ELEMS], *val_buf, **d_new_el_mats;
267: ISColoring iscoloring;
268: PetscContainerGetPointer(container,(void**)&iscoloring);
269: /* get colors */
270: ISColoringGetColors(iscoloring, &j, &nc, &colors);
271: if (nelems>LANDAU_MAX_ELEMS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many elements. %D > %D",nelems,LANDAU_MAX_ELEMS);
272: if (nc>LANDAU_MAX_COLORS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many colors. %D > %D",nc,LANDAU_MAX_COLORS);
273: /* colors for kernel */
274: CUDA_SAFE_CALL(cudaMalloc((void **)&d_colors, nelems*sizeof(ISColoringValue))); // kernel input
275: CUDA_SAFE_CALL(cudaMemcpy( d_colors, colors, nelems*sizeof(ISColoringValue), cudaMemcpyHostToDevice));
276: /* get indices and element matrices */
277: for (cell = cStart, ej = 0 ; cell < cEnd; ++cell, ++ej) {
278: PetscInt numindices,*indices;
279: PetscScalar *elMat = &elemMats[ej*totDim*totDim];
280: PetscScalar *valuesOrig = elMat;
281: DMPlexGetClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
282: h_nidx_arr[ej] = numindices;
283: CUDA_SAFE_CALL(cudaMalloc((void **)&h_idx_arr[ej], numindices*sizeof(PetscInt))); // kernel input
284: CUDA_SAFE_CALL(cudaMemcpy( h_idx_arr[ej], indices, numindices*sizeof(PetscInt), cudaMemcpyHostToDevice));
285: CUDA_SAFE_CALL(cudaMalloc((void **)&h_new_el_mats[ej], numindices*numindices*sizeof(PetscScalar))); // kernel input
286: CUDA_SAFE_CALL(cudaMemcpy( h_new_el_mats[ej], elMat, numindices*numindices*sizeof(PetscScalar), cudaMemcpyHostToDevice));
287: DMPlexRestoreClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
288: if (elMat != valuesOrig) {DMRestoreWorkArray(plex, numindices*numindices, MPIU_SCALAR, &elMat);}
289: }
290: CUDA_SAFE_CALL(cudaMalloc((void **)&d_nidx_arr, nelems*sizeof(PetscInt))); // kernel input
291: CUDA_SAFE_CALL(cudaMemcpy( d_nidx_arr, h_nidx_arr, nelems*sizeof(PetscInt), cudaMemcpyHostToDevice));
292: CUDA_SAFE_CALL(cudaMalloc((void **)&d_idx_arr, nelems*sizeof(PetscInt*))); // kernel input
293: CUDA_SAFE_CALL(cudaMemcpy( d_idx_arr, h_idx_arr, nelems*sizeof(PetscInt*), cudaMemcpyHostToDevice));
294: CUDA_SAFE_CALL(cudaMalloc((void **)&d_new_el_mats, nelems*sizeof(PetscScalar*))); // kernel input
295: CUDA_SAFE_CALL(cudaMemcpy( d_new_el_mats, h_new_el_mats,nelems*sizeof(PetscScalar*), cudaMemcpyHostToDevice));
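  /* Note: h_idx_arr[] and h_new_el_mats[] hold *device* pointers allocated in the loop above;
     copying these host-side pointer tables to d_idx_arr / d_new_el_mats gives assemble_kernel a
     device-resident array of device pointers that it indexes with the element id. */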
296: /* make matrix buffers */
297: for (colour=0; colour<nc; colour++) {
298: Mat_SeqAIJ *a = &h_mats[colour];
299: /* create on GPU and copy to GPU */
300: CUDA_SAFE_CALL(cudaMalloc((void **)&a->i, (N+1)*sizeof(PetscInt))); // kernel input
301: CUDA_SAFE_CALL(cudaMemcpy( a->i, jaca->i, (N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice));
302: CUDA_SAFE_CALL(cudaMalloc((void **)&a->ilen, (N)*sizeof(PetscInt))); // kernel input
303: CUDA_SAFE_CALL(cudaMemcpy( a->ilen, jaca->ilen,(N)*sizeof(PetscInt), cudaMemcpyHostToDevice));
304: CUDA_SAFE_CALL(cudaMalloc((void **)&a->j, (nnz)*sizeof(PetscInt))); // kernel input
305: CUDA_SAFE_CALL(cudaMemcpy( a->j, jaca->j, (nnz)*sizeof(PetscInt), cudaMemcpyHostToDevice));
306: CUDA_SAFE_CALL(cudaMalloc((void **)&a->a, (nnz)*sizeof(PetscScalar))); // kernel output
307: CUDA_SAFE_CALL(cudaMemset( a->a, 0, (nnz)*sizeof(PetscScalar)));
308: }
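  /* One value array per color: the coloring (from LandauCreateColoring) is built so that elements of
     the same color do not touch the same matrix entries, hence the one-thread-per-element kernel can
     accumulate into its color's buffer without atomics; the nc buffers are summed back into
     jaca->a on the host below. */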
309: CUDA_SAFE_CALL(cudaMalloc(&d_mats, nc*sizeof(Mat_SeqAIJ))); // kernel input
310: CUDA_SAFE_CALL(cudaMemcpy( d_mats, h_mats, nc*sizeof(Mat_SeqAIJ), cudaMemcpyHostToDevice));
311: /* do it */
312: assemble_kernel<<<nelems,1>>>(d_nidx_arr, d_idx_arr, d_new_el_mats, d_colors, d_mats);
313: CHECK_LAUNCH_ERROR();
314: /* cleanup */
315: CUDA_SAFE_CALL(cudaFree(d_colors));
316: CUDA_SAFE_CALL(cudaFree(d_nidx_arr));
317: for (ej = 0 ; ej < nelems; ++ej) {
318: CUDA_SAFE_CALL(cudaFree(h_idx_arr[ej]));
319: CUDA_SAFE_CALL(cudaFree(h_new_el_mats[ej]));
320: }
321: CUDA_SAFE_CALL(cudaFree(d_idx_arr));
322: CUDA_SAFE_CALL(cudaFree(d_new_el_mats));
323: /* copy & add Mat data back to CPU to JacP */
325: PetscLogEventBegin(events[2],0,0,0,0);
326: PetscMalloc1(nnz,&val_buf);
327: PetscMemzero(jaca->a,nnz*sizeof(PetscScalar));
328: for (colour=0; colour<nc; colour++) {
329: Mat_SeqAIJ *a = &h_mats[colour];
330: CUDA_SAFE_CALL(cudaMemcpy(val_buf, a->a, (nnz)*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
331: PetscKernelAXPY(jaca->a,1.0,val_buf,nnz);
332: }
333: PetscFree(val_buf);
334: PetscLogEventEnd(events[2],0,0,0,0);
336: for (colour=0; colour<nc; colour++) {
337: Mat_SeqAIJ *a = &h_mats[colour];
338: /* destroy mat */
339: CUDA_SAFE_CALL(cudaFree(a->i));
340: CUDA_SAFE_CALL(cudaFree(a->ilen));
341: CUDA_SAFE_CALL(cudaFree(a->j));
342: CUDA_SAFE_CALL(cudaFree(a->a));
343: }
344: CUDA_SAFE_CALL(cudaFree(d_mats));
345: return(0);
346: }