Actual source code: landaucu.cu

petsc-3.14.5 2021-03-03
/*
   Implements the Landau kernel
*/
#include <petscconf.h>
#include <petsc/private/dmpleximpl.h>
#include <petsclandau.h>
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <../src/mat/impls/aij/seq/aij.h>
#include <petsc/private/kernels/petscaxpy.h>

#define PETSC_THREAD_SYNC __syncthreads()
#define PETSC_DEVICE_FUNC_DECL __device__
#include "../land_kernel.h"

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err));        \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err));        \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err));        \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
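// Typical usage, as in the host code below: wrap each CUDA runtime call in
// CUDA_SAFE_CALL and follow each kernel launch with CHECK_LAUNCH_ERROR, e.g.
//   CUDA_SAFE_CALL(cudaMalloc((void **)&d_buf, n*sizeof(PetscReal)));
//   some_kernel<<<grid,block>>>(d_buf);
//   CHECK_LAUNCH_ERROR();
// (d_buf and some_kernel are illustrative placeholders, not symbols defined in this file.)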

#define LANDAU_USE_SHARED_GPU_MEM
//
// The GPU Landau kernel
//
__global__
void landau_kernel(const PetscInt nip, const PetscInt dim, const PetscInt totDim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJj[],
                   const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
                   const PetscReal * const BB, const PetscReal * const DD, const PetscReal * const IPDataGlobal, const PetscReal wiGlobal[],
#if !defined(LANDAU_USE_SHARED_GPU_MEM)
                   PetscReal *g2arr, PetscReal *g3arr,
#endif
                   PetscBool quarter3DDomain, PetscScalar elemMats_out[])
{
  const PetscInt  Nq = blockDim.x, myelem = blockIdx.x;
#if defined(LANDAU_USE_SHARED_GPU_MEM)
  extern __shared__ PetscReal g2_g3_qi[]; // Nq * { [NSubBlocks][Nf][dim] ; [NSubBlocks][Nf][dim][dim] }
  PetscReal       (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]         = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM])         &g2_g3_qi[0];
  PetscReal       (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g2_g3_qi[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM];
#else
  PetscReal       (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]         = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM])         &g2arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM       ];
  PetscReal       (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g3arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*LANDAU_DIM];
#endif
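  // Both g2 ([Nq][NSubBlocks][Nf][dim]) and g3 ([Nq][NSubBlocks][Nf][dim][dim]) are sized with
  // the LANDAU_MAX_* compile-time bounds. In the shared-memory build they are carved out of the
  // dynamically sized g2_g3_qi buffer, with g3 starting immediately after the g2 block; otherwise
  // each thread block uses its own myelem-th slice of the preallocated global g2arr/g3arr arrays.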
  const PetscInt  myQi = threadIdx.x, mySubBlk = threadIdx.y, nSubBlks = blockDim.y;
  const PetscInt  jpidx = myQi + myelem * Nq;
  const PetscInt  subblocksz = nip/nSubBlks + !!(nip%nSubBlks), ip_start = mySubBlk*subblocksz, ip_end = (mySubBlk+1)*subblocksz > nip ? nip : (mySubBlk+1)*subblocksz; /* this could be wrong with very few global IPs */
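  // subblocksz is ceil(nip/nSubBlks): the global integration points are split across the
  // blockDim.y sub-blocks, and ip_end is clamped to nip so the last sub-block stops at the end.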
  PetscScalar     *elemMat  = &elemMats_out[myelem*totDim*totDim]; /* my output */

  if (threadIdx.x==0 && threadIdx.y==0) {
    memset(elemMat, 0, totDim*totDim*sizeof(PetscScalar));
  }
  __syncthreads();
  landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, ip_start, ip_end, 1,        jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* compact */
  // landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, mySubBlk,    nip, nSubBlks, jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* spread */
}
static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[]);
__global__ void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[]);
PetscErrorCode LandauCUDAJacobian(DM plex, const PetscInt Nq, const PetscReal nu_alpha[],const PetscReal nu_beta[],
                                  const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal * const IPDataGlobal,
                                  const PetscReal wiGlobal[], const PetscReal invJj[], const PetscInt num_sub_blocks, const PetscLogEvent events[], PetscBool quarter3DDomain,
                                  Mat JacP)
{
  PetscErrorCode    ierr;
  PetscInt          ii,ej,*Nbf,Nb,nip_dim2,cStart,cEnd,Nf,dim,numGCells,totDim,nip,szf=sizeof(PetscReal);
  PetscReal         *d_BB,*d_DD,*d_invJj,*d_wiGlobal,*d_nu_alpha,*d_nu_beta,*d_invMass,*d_Eq_m;
  PetscScalar       *elemMats,*d_elemMats;
  PetscLogDouble    flops;
  PetscTabulation   *Tf;
  PetscDS           prob;
  PetscSection      section, globalSection;
  PetscReal        *d_IPDataGlobal;
  PetscBool         cuda_assemble = PETSC_FALSE;

  PetscLogEventBegin(events[3],0,0,0,0);
  DMGetDimension(plex, &dim);
  if (dim!=LANDAU_DIM) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM != dim");
  DMPlexGetHeightStratum(plex,0,&cStart,&cEnd);
  numGCells = cEnd - cStart;
  nip  = numGCells*Nq; /* length of inner global iteration */
  DMGetDS(plex, &prob);
  PetscDSGetNumFields(prob, &Nf);
  PetscDSGetDimensions(prob, &Nbf); Nb = Nbf[0];
  if (Nq != Nb) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Nq != Nb. %D  %D",Nq,Nb);
  PetscDSGetTotalDimension(prob, &totDim);
  PetscDSGetTabulation(prob, &Tf);
  DMGetLocalSection(plex, &section);
  DMGetGlobalSection(plex, &globalSection);
  // create data
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_IPDataGlobal, nip*(dim + Nf*(dim+1))*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_alpha, Nf*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_beta,  Nf*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_invMass,  Nf*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_Eq_m,     Nf*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(d_IPDataGlobal, IPDataGlobal, nip*(dim + Nf*(dim+1))*szf, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_nu_alpha, nu_alpha, Nf*szf,                             cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_nu_beta,  nu_beta,  Nf*szf,                             cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_invMass,  invMass,  Nf*szf,                             cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(d_Eq_m,     Eq_m,     Nf*szf,                             cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_BB,              Nq*Nb*szf));     // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_BB, Tf[0]->T[0], Nq*Nb*szf,   cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_DD,              Nq*Nb*dim*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_DD, Tf[0]->T[1], Nq*Nb*dim*szf,   cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_wiGlobal,           Nq*numGCells*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_wiGlobal, wiGlobal, Nq*numGCells*szf,   cudaMemcpyHostToDevice));
  // collect geometry
  flops = (PetscLogDouble)numGCells*(PetscLogDouble)Nq*(PetscLogDouble)(5.*dim*dim*Nf*Nf + 165.);
  nip_dim2 = Nq*numGCells*dim*dim;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_invJj, nip_dim2*szf)); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(d_invJj, invJj, nip_dim2*szf,       cudaMemcpyHostToDevice));
  PetscLogEventEnd(events[3],0,0,0,0);

  PetscLogEventBegin(events[4],0,0,0,0);
  PetscLogGpuFlops(flops*nip);
  {
    dim3 dimBlock(Nq,num_sub_blocks);
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar))); // kernel output
    ii = LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM)*LANDAU_MAX_SUB_THREAD_BLOCKS;
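    // ii is the number of PetscReal entries in the per-element g2+g3 scratch:
    // Nq*Nf*NSubBlocks*dim*(1+dim) at the compile-time maxima. The launch below uses one
    // thread block per grid cell (numGCells blocks) with Nq x num_sub_blocks threads each;
    // in the shared-memory build the scratch is passed as ii*szf bytes of dynamic shared memory.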
#if defined(LANDAU_USE_SHARED_GPU_MEM)
    // PetscPrintf(PETSC_COMM_SELF,"Call land_kernel with %D kB shared memory\n",ii*8/1024);
    landau_kernel<<<numGCells,dimBlock,ii*szf>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
                                                 d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, quarter3DDomain, d_elemMats);
    CHECK_LAUNCH_ERROR();
#else
    PetscReal  *d_g2g3;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_g2g3, ii*szf*numGCells)); // kernel scratch (written and read by the kernel)
    PetscReal  *g2 = &d_g2g3[0];
    PetscReal  *g3 = &d_g2g3[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*numGCells];
    landau_kernel<<<numGCells,dimBlock>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
                                          d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, g2, g3, quarter3DDomain, d_elemMats);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    CUDA_SAFE_CALL(cudaFree(d_g2g3));
#endif
  }
  PetscLogEventEnd(events[4],0,0,0,0);
  // delete device data
  PetscLogEventBegin(events[5],0,0,0,0);
  CUDA_SAFE_CALL(cudaFree(d_IPDataGlobal));
  CUDA_SAFE_CALL(cudaFree(d_invJj));
  CUDA_SAFE_CALL(cudaFree(d_wiGlobal));
  CUDA_SAFE_CALL(cudaFree(d_nu_alpha));
  CUDA_SAFE_CALL(cudaFree(d_nu_beta));
  CUDA_SAFE_CALL(cudaFree(d_invMass));
  CUDA_SAFE_CALL(cudaFree(d_Eq_m));
  CUDA_SAFE_CALL(cudaFree(d_BB));
  CUDA_SAFE_CALL(cudaFree(d_DD));
  PetscMalloc1(totDim*totDim*numGCells,&elemMats);
  CUDA_SAFE_CALL(cudaMemcpy(elemMats, d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  CUDA_SAFE_CALL(cudaFree(d_elemMats));
  PetscLogEventEnd(events[5],0,0,0,0);

  PetscLogEventBegin(events[6],0,0,0,0);
  if (!cuda_assemble) {
    PetscScalar *elMat;
    for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
      DMPlexMatSetClosure(plex, section, globalSection, JacP, ej, elMat, ADD_VALUES);
      if (ej==-1) { /* debug hook: never taken unless the condition is edited to select an element */
        int d,f;
        printf("GPU Element matrix\n");
        for (d = 0; d < totDim; ++d) {
          for (f = 0; f < totDim; ++f) printf(" %17.10e",  PetscRealPart(elMat[d*totDim + f]));
          printf("\n");
        }
        exit(12);
      }
    }
  } else {
    PetscContainer container = NULL;
    PetscObjectQuery((PetscObject)JacP,"coloring",(PetscObject*)&container);
    if (!container) {
      PetscLogEventBegin(events[8],0,0,0,0);
      LandauCreateColoring(JacP, plex, &container);
      PetscLogEventEnd(events[8],0,0,0,0);
    }
    LandauAssembleCuda(cStart, cEnd, totDim, plex, section, globalSection, JacP, elemMats, container, events);
  }
  PetscFree(elemMats);
  PetscLogEventEnd(events[6],0,0,0,0);

  return(0);
}

__global__
void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[])
{
  const PetscInt     myelem = (gridDim.x==1) ? threadIdx.x : blockIdx.x;
  Mat_SeqAIJ         a = mats[colors[myelem]]; /* local copy of the device Mat_SeqAIJ for this element's color */
  const PetscScalar *v = el_mats[myelem];
  const PetscInt    *in = idx_arr[myelem], *im = idx_arr[myelem], n = nidx_arr[myelem], m = nidx_arr[myelem];
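  // The element matrix is square over the cell closure, so the row set (im, m) and the
  // column set (in, n) are the same index array and the same length.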
  /* mat set values */
  PetscInt          *rp,k,low,high,t,row,nrow,i,col,l;
  PetscInt          *ai = a.i,*ailen = a.ilen;
  PetscInt          *aj = a.j,lastcol = -1;
  MatScalar         *ap=NULL,value=0.0,*aa = a.a;
  for (k=0; k<m; k++) { /* loop over added rows */
    row = im[k];
    if (row < 0) continue;
    rp   = aj + ai[row];
    ap = aa + ai[row];
    nrow = ailen[row];
    low  = 0;
    high = nrow;
    for (l=0; l<n; l++) { /* loop over added columns */
      /* if (in[l] < 0) { */
      /*   printf("\t\tin[l] < 0 ?????\n"); */
      /*   continue; */
      /* } */
      while (l<n && (value = v[l + k*n], PetscAbsScalar(value)==0.0)) l++;
      if (l==n) break;
      col = in[l];
      if (col <= lastcol) low = 0;
      else high = nrow;
      lastcol = col;
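      // Binary search for col among this row's column indices (the same scheme as the host
      // MatSetValues_SeqAIJ): halve [low,high) until at most 5 candidates remain, then
      // finish with the linear scan below.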
      while (high-low > 5) {
        t = (low+high)/2;
        if (rp[t] > col) high = t;
        else low = t;
      }
      for (i=low; i<high; i++) {
        // if (rp[i] > col) break;
        if (rp[i] == col) {
          ap[i] += value;
          low = i + 1;
          goto noinsert;
        }
      }
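      // Falling through means col is not in the preallocated sparsity pattern; the device
      // kernel cannot insert new nonzeros, so report it and drop the value.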
      printf("\t\t\t ERROR in assemble_kernel\n");
    noinsert:;
    }
  }
}

static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[])
{
  PetscErrorCode    ierr;
#define LANDAU_MAX_COLORS 16
#define LANDAU_MAX_ELEMS 512
  Mat_SeqAIJ             h_mats[LANDAU_MAX_COLORS], *jaca = (Mat_SeqAIJ *)JacP->data, *d_mats;
  const PetscInt         nelems = cEnd - cStart, nnz = jaca->i[JacP->rmap->n], N = JacP->rmap->n;  /* serial */
  const ISColoringValue *colors;
  ISColoringValue       *d_colors,colour;
  PetscInt              *h_idx_arr[LANDAU_MAX_ELEMS], h_nidx_arr[LANDAU_MAX_ELEMS], *d_nidx_arr, **d_idx_arr,nc,ej,j,cell;
  PetscScalar           *h_new_el_mats[LANDAU_MAX_ELEMS], *val_buf, **d_new_el_mats;
  ISColoring             iscoloring;
  PetscContainerGetPointer(container,(void**)&iscoloring);
  /* get colors */
  ISColoringGetColors(iscoloring, &j, &nc, &colors);
  if (nelems>LANDAU_MAX_ELEMS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many elements. %D > %D",nelems,LANDAU_MAX_ELEMS);
  if (nc>LANDAU_MAX_COLORS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many colors. %D > %D",nc,LANDAU_MAX_COLORS);
  /* colors for kernel */
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_colors,         nelems*sizeof(ISColoringValue))); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_colors, colors, nelems*sizeof(ISColoringValue), cudaMemcpyHostToDevice));
  /* get indices and element matrices */
  for (cell = cStart, ej = 0 ; cell < cEnd; ++cell, ++ej) {
    PetscInt numindices,*indices;
    PetscScalar *elMat = &elemMats[ej*totDim*totDim];
    PetscScalar *valuesOrig = elMat;
    DMPlexGetClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
    h_nidx_arr[ej] = numindices;
    CUDA_SAFE_CALL(cudaMalloc((void **)&h_idx_arr[ej],            numindices*sizeof(PetscInt))); // kernel input
    CUDA_SAFE_CALL(cudaMemcpy(          h_idx_arr[ej],   indices, numindices*sizeof(PetscInt), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMalloc((void **)&h_new_el_mats[ej],        numindices*numindices*sizeof(PetscScalar))); // kernel input
    CUDA_SAFE_CALL(cudaMemcpy(          h_new_el_mats[ej], elMat, numindices*numindices*sizeof(PetscScalar), cudaMemcpyHostToDevice));
    DMPlexRestoreClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
    if (elMat != valuesOrig) {DMRestoreWorkArray(plex, numindices*numindices, MPIU_SCALAR, &elMat);}
  }
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_nidx_arr,                  nelems*sizeof(PetscInt))); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_nidx_arr,    h_nidx_arr,   nelems*sizeof(PetscInt), cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_idx_arr,                   nelems*sizeof(PetscInt*))); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_idx_arr,     h_idx_arr,    nelems*sizeof(PetscInt*), cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_new_el_mats,               nelems*sizeof(PetscScalar*))); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy(          d_new_el_mats, h_new_el_mats,nelems*sizeof(PetscScalar*), cudaMemcpyHostToDevice));
  /* make matrix buffers */
  for (colour=0; colour<nc; colour++) {
    Mat_SeqAIJ *a = &h_mats[colour];
    /* create on GPU and copy to GPU */
    CUDA_SAFE_CALL(cudaMalloc((void **)&a->i,               (N+1)*sizeof(PetscInt))); // kernel input
    CUDA_SAFE_CALL(cudaMemcpy(          a->i,    jaca->i,   (N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMalloc((void **)&a->ilen,            (N)*sizeof(PetscInt))); // kernel input
    CUDA_SAFE_CALL(cudaMemcpy(          a->ilen, jaca->ilen,(N)*sizeof(PetscInt), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMalloc((void **)&a->j,               (nnz)*sizeof(PetscInt))); // kernel input
    CUDA_SAFE_CALL(cudaMemcpy(          a->j,    jaca->j,   (nnz)*sizeof(PetscInt), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMalloc((void **)&a->a,               (nnz)*sizeof(PetscScalar))); // kernel output
    CUDA_SAFE_CALL(cudaMemset(          a->a, 0,            (nnz)*sizeof(PetscScalar)));
  }
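  // Each color gets its own device copy of the sparsity pattern (i, ilen, j) and a zeroed
  // value array (a); assemble_kernel adds element ej into mats[colors[ej]], and the
  // per-color value arrays are summed back into JacP's host array below.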
  CUDA_SAFE_CALL(cudaMalloc(&d_mats,         nc*sizeof(Mat_SeqAIJ))); // kernel input
  CUDA_SAFE_CALL(cudaMemcpy( d_mats, h_mats, nc*sizeof(Mat_SeqAIJ), cudaMemcpyHostToDevice));
  /* launch one block per element; each block adds its element matrix into its color's value buffer */
  assemble_kernel<<<nelems,1>>>(d_nidx_arr, d_idx_arr, d_new_el_mats, d_colors, d_mats);
  CHECK_LAUNCH_ERROR();
  /* cleanup */
  CUDA_SAFE_CALL(cudaFree(d_colors));
  CUDA_SAFE_CALL(cudaFree(d_nidx_arr));
  for (ej = 0 ; ej < nelems; ++ej) { /* local element index, matching how h_idx_arr/h_new_el_mats were filled above */
    CUDA_SAFE_CALL(cudaFree(h_idx_arr[ej]));
    CUDA_SAFE_CALL(cudaFree(h_new_el_mats[ej]));
  }
  CUDA_SAFE_CALL(cudaFree(d_idx_arr));
  CUDA_SAFE_CALL(cudaFree(d_new_el_mats));
  /* copy Mat data back to the CPU and add it into JacP */

  PetscLogEventBegin(events[2],0,0,0,0);
  PetscMalloc1(nnz,&val_buf);
  PetscMemzero(jaca->a,nnz*sizeof(PetscScalar));
  for (colour=0; colour<nc; colour++) {
    Mat_SeqAIJ *a = &h_mats[colour];
    CUDA_SAFE_CALL(cudaMemcpy(val_buf, a->a, (nnz)*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscKernelAXPY(jaca->a,1.0,val_buf,nnz);
  }
  PetscFree(val_buf);
  PetscLogEventEnd(events[2],0,0,0,0);

  for (colour=0; colour<nc; colour++) {
    Mat_SeqAIJ *a = &h_mats[colour];
    /* destroy mat */
    CUDA_SAFE_CALL(cudaFree(a->i));
    CUDA_SAFE_CALL(cudaFree(a->ilen));
    CUDA_SAFE_CALL(cudaFree(a->j));
    CUDA_SAFE_CALL(cudaFree(a->a));
  }
  CUDA_SAFE_CALL(cudaFree(d_mats));
  return(0);
}