Actual source code: landaucu.cu

petsc-3.14.2 2020-12-03
  1: /*
  2:    Implements the Landau kernel
  3: */
  4: #include <petscconf.h>
  5: #include <petsc/private/dmpleximpl.h>
  6: #include <petsclandau.h>
  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <petsc/private/kernels/petscaxpy.h>

 10: #define PETSC_THREAD_SYNC __syncthreads()
 11: #define PETSC_DEVICE_FUNC_DECL __device__
 12: #include "../land_kernel.h"

 14: // Macro to catch CUDA errors in CUDA runtime calls
 15: #define CUDA_SAFE_CALL(call)                                          \
 16: do {                                                                  \
 17:     cudaError_t err = call;                                           \
 18:     if (cudaSuccess != err) {                                         \
 19:         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 20:                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 21:         exit(EXIT_FAILURE);                                           \
 22:     }                                                                 \
 23: } while (0)
 24: // Macro to catch CUDA errors in kernel launches
 25: #define CHECK_LAUNCH_ERROR()                                          \
 26: do {                                                                  \
 27:     /* Check synchronous errors, i.e. pre-launch */                   \
 28:     cudaError_t err = cudaGetLastError();                             \
 29:     if (cudaSuccess != err) {                                         \
 30:         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 31:                  __FILE__, __LINE__, cudaGetErrorString(err) );       \
 32:         exit(EXIT_FAILURE);                                           \
 33:     }                                                                 \
 34:     /* Check asynchronous errors, i.e. kernel failed (ULF: unspecified launch failure) */ \
 35:     err = cudaDeviceSynchronize();                                    \
 36:     if (cudaSuccess != err) {                                         \
 37:         fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 38:                  __FILE__, __LINE__, cudaGetErrorString( err) );      \
 39:         exit(EXIT_FAILURE);                                           \
 40:     }                                                                 \
 41: } while (0)

 43: #define LANDAU_USE_SHARED_GPU_MEM
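     // With LANDAU_USE_SHARED_GPU_MEM defined, the per-element g2/g3 scratch lives in dynamic
     // shared memory; without it the kernel takes preallocated global arrays g2arr/g3arr as
     // extra arguments instead.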
 44: //
 45: // The GPU Landau kernel
 46: //
 47: __global__
 48: void landau_kernel(const PetscInt nip, const PetscInt dim, const PetscInt totDim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJj[],
 49:                  const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
 50:                  const PetscReal * const BB, const PetscReal * const DD, const PetscReal * const IPDataGlobal, const PetscReal wiGlobal[],
 51: #if !defined(LANDAU_USE_SHARED_GPU_MEM)
 52:                  PetscReal *g2arr, PetscReal *g3arr,
 53: #endif
 54:                  PetscBool quarter3DDomain, PetscScalar elemMats_out[])
 55: {
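       /* one thread block per element: blockIdx.x is the element, threadIdx.x the quadrature
          point within it, and threadIdx.y the sub-block of global integration points assigned
          to this thread */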
 56:   const PetscInt  Nq = blockDim.x, myelem = blockIdx.x;
 57: #if defined(LANDAU_USE_SHARED_GPU_MEM)
 58:   extern __shared__ PetscReal g2_g3_qi[]; // Nq * { [NSubBlocks][Nf][dim] ; [NSubBlocks][Nf][dim][dim] }
 59:   PetscReal       (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]         = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM])         &g2_g3_qi[0];
 60:   PetscReal       (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g2_g3_qi[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM];
 61: #else
 62:   PetscReal       (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]         = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM])         &g2arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM       ];
 63:   PetscReal       (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g3arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*LANDAU_DIM];
 64: #endif
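       /* g2 (dim entries) and g3 (dim x dim entries) are per-quadrature-point scratch tensors,
          one set per species and per integration-point sub-block; landau_inner_integral()
          (from land_kernel.h) accumulates into them and reduces them into the element matrix */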
 65:   const PetscInt  myQi = threadIdx.x, mySubBlk = threadIdx.y, nSubBlks = blockDim.y;
 66:   const PetscInt  jpidx = myQi + myelem * Nq;
 67:   const PetscInt  subblocksz = nip/nSubBlks + !!(nip%nSubBlks), ip_start = mySubBlk*subblocksz, ip_end = (mySubBlk+1)*subblocksz > nip ? nip : (mySubBlk+1)*subblocksz; /* this could be wrong with very few global IPs */
 68:   PetscScalar     *elemMat  = &elemMats_out[myelem*totDim*totDim]; /* my output */

 70:   if (threadIdx.x==0 && threadIdx.y==0) {
 71:     memset(elemMat, 0, totDim*totDim*sizeof(PetscScalar));
 72:   }
 73:   __syncthreads();
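       /* each (quadrature point, sub-block) thread processes its contiguous slice
          [ip_start, ip_end) of the nip global integration points; the commented-out "spread"
          variant below instead strides the points by nSubBlks starting at mySubBlk */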
 74:   landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, ip_start, ip_end, 1,        jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* compact */
 75:   // landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, mySubBlk,    nip, nSubBlks, jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* spread */
 76: }
 77: static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[]);
 78: __global__ void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[]);
 79: PetscErrorCode LandauCUDAJacobian( DM plex, const PetscInt Nq, const PetscReal nu_alpha[],const PetscReal nu_beta[],
 80:                                  const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal * const IPDataGlobal,
 81:                                  const PetscReal wiGlobal[], const PetscReal invJj[], const PetscInt num_sub_blocks, const PetscLogEvent events[], PetscBool quarter3DDomain,
 82:                                  Mat JacP)
 83: {
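       /* Outline: copy quadrature data, tabulation and geometry to the device, launch
          landau_kernel with one block per cell to build all element matrices, copy them back to
          the host, then assemble into JacP with DMPlexMatSetClosure (default) or with the
          CUDA assembly path guarded by cuda_assemble */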
 84:   PetscErrorCode    ierr;
 85:   PetscInt          ii,ej,*Nbf,Nb,nip_dim2,cStart,cEnd,Nf,dim,numGCells,totDim,nip,szf=sizeof(PetscReal);
 86:   PetscReal         *d_BB,*d_DD,*d_invJj,*d_wiGlobal,*d_nu_alpha,*d_nu_beta,*d_invMass,*d_Eq_m;
 87:   PetscScalar       *elemMats,*d_elemMats;
 88:   PetscLogDouble    flops;
 89:   PetscTabulation   *Tf;
 90:   PetscDS           prob;
 91:   PetscSection      section, globalSection;
 92:   PetscReal        *d_IPDataGlobal;
 93:   PetscBool         cuda_assemble = PETSC_FALSE;

 96:   PetscLogEventBegin(events[3],0,0,0,0);
 97:   DMGetDimension(plex, &dim);
 98:   if (dim!=LANDAU_DIM) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM != dim");
 99:   DMPlexGetHeightStratum(plex,0,&cStart,&cEnd);
100:   numGCells = cEnd - cStart;
101:   nip  = numGCells*Nq; /* length of inner global iteration */
102:   DMGetDS(plex, &prob);
103:   PetscDSGetNumFields(prob, &Nf);
104:   PetscDSGetDimensions(prob, &Nbf); Nb = Nbf[0];
105:   if (Nq != Nb) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Nq != Nb. %D  %D",Nq,Nb);
106:   PetscDSGetTotalDimension(prob, &totDim);
107:   PetscDSGetTabulation(prob, &Tf);
108:   DMGetLocalSection(plex, &section);
109:   DMGetGlobalSection(plex, &globalSection);
110:   // create data
111:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_IPDataGlobal, nip*(dim + Nf*(dim+1))*szf )); // kernel input
112:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_alpha, Nf*szf)); // kernel input
113:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_beta,  Nf*szf)); // kernel input
114:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_invMass,  Nf*szf)); // kernel input
115:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_Eq_m,     Nf*szf)); // kernel input
116:   CUDA_SAFE_CALL(cudaMemcpy(d_IPDataGlobal, IPDataGlobal, nip*(dim + Nf*(dim+1))*szf, cudaMemcpyHostToDevice));
117:   CUDA_SAFE_CALL(cudaMemcpy(d_nu_alpha, nu_alpha, Nf*szf,                             cudaMemcpyHostToDevice));
118:   CUDA_SAFE_CALL(cudaMemcpy(d_nu_beta,  nu_beta,  Nf*szf,                             cudaMemcpyHostToDevice));
119:   CUDA_SAFE_CALL(cudaMemcpy(d_invMass,  invMass,  Nf*szf,                             cudaMemcpyHostToDevice));
120:   CUDA_SAFE_CALL(cudaMemcpy(d_Eq_m,     Eq_m,     Nf*szf,                             cudaMemcpyHostToDevice));
121:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_BB,              Nq*Nb*szf));     // kernel input
122:   CUDA_SAFE_CALL(cudaMemcpy(          d_BB, Tf[0]->T[0], Nq*Nb*szf,   cudaMemcpyHostToDevice));
123:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_DD,              Nq*Nb*dim*szf)); // kernel input
124:   CUDA_SAFE_CALL(cudaMemcpy(          d_DD, Tf[0]->T[1], Nq*Nb*dim*szf,   cudaMemcpyHostToDevice));
125:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_wiGlobal,           Nq*numGCells*szf)); // kernel input
126:   CUDA_SAFE_CALL(cudaMemcpy(          d_wiGlobal, wiGlobal, Nq*numGCells*szf,   cudaMemcpyHostToDevice));
127:   // collect geometry
128:   flops = (PetscLogDouble)numGCells*(PetscLogDouble)Nq*(PetscLogDouble)(5.*dim*dim*Nf*Nf + 165.);
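       /* the Landau operator is a double sum over integration points: flops estimates one pass
          over the nip outer points, and the total logged below is flops*nip */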
129:   nip_dim2 = Nq*numGCells*dim*dim;
130:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_invJj, nip_dim2*szf)); // kernel input
131:   CUDA_SAFE_CALL(cudaMemcpy(d_invJj, invJj, nip_dim2*szf,       cudaMemcpyHostToDevice));
132:   PetscLogEventEnd(events[3],0,0,0,0);

134:   PetscLogEventBegin(events[4],0,0,0,0);
135:   PetscLogGpuFlops(flops*nip);
136:   {
137:     dim3 dimBlock(Nq,num_sub_blocks);
138:     CUDA_SAFE_CALL(cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar))); // kernel output
139:     ii = LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM)*LANDAU_MAX_SUB_THREAD_BLOCKS;
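         /* ii = number of reals needed for one element's g2/g3 scratch; used as the dynamic
            shared-memory size in the shared-memory build, or to size the global scratch array
            d_g2g3 otherwise */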
140: #if defined(LANDAU_USE_SHARED_GPU_MEM)
141:     // PetscPrintf(PETSC_COMM_SELF,"Call land_kernel with %D kB shared memory\n",ii*8/1024);
142:     landau_kernel<<<numGCells,dimBlock,ii*szf>>>( nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
143:                                                 d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, quarter3DDomain, d_elemMats);
144:     CHECK_LAUNCH_ERROR();
145: #else
146:     PetscReal  *d_g2g3;
147:     CUDA_SAFE_CALL(cudaMalloc((void **)&d_g2g3, ii*szf*numGCells)); // kernel input
148:     PetscReal  *g2 = &d_g2g3[0];
149:     PetscReal  *g3 = &d_g2g3[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*numGCells];
150:     landau_kernel<<<numGCells,dimBlock>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
151:                                         d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, g2, g3, quarter3DDomain, d_elemMats);
152:     CHECK_LAUNCH_ERROR();
153:     CUDA_SAFE_CALL (cudaDeviceSynchronize());
154:     CUDA_SAFE_CALL(cudaFree(d_g2g3));
155: #endif
156:   }
157:   PetscLogEventEnd(events[4],0,0,0,0);
158:   // delete device data
159:   PetscLogEventBegin(events[5],0,0,0,0);
160:   CUDA_SAFE_CALL(cudaFree(d_IPDataGlobal));
161:   CUDA_SAFE_CALL(cudaFree(d_invJj));
162:   CUDA_SAFE_CALL(cudaFree(d_wiGlobal));
163:   CUDA_SAFE_CALL(cudaFree(d_nu_alpha));
164:   CUDA_SAFE_CALL(cudaFree(d_nu_beta));
165:   CUDA_SAFE_CALL(cudaFree(d_invMass));
166:   CUDA_SAFE_CALL(cudaFree(d_Eq_m));
167:   CUDA_SAFE_CALL(cudaFree(d_BB));
168:   CUDA_SAFE_CALL(cudaFree(d_DD));
169:   PetscMalloc1(totDim*totDim*numGCells,&elemMats);
170:   CUDA_SAFE_CALL(cudaMemcpy(elemMats, d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
171:   CUDA_SAFE_CALL(cudaFree(d_elemMats));
172:   PetscLogEventEnd(events[5],0,0,0,0);

174:   PetscLogEventBegin(events[6],0,0,0,0);
175:   if (!cuda_assemble) {
176:     PetscScalar *elMat;
177:     for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
178:       DMPlexMatSetClosure(plex, section, globalSection, JacP, ej, elMat, ADD_VALUES);
179:       if (ej==-1) { /* debug hook: change -1 to a valid element index to dump that element matrix */
180:         int d,f;
181:         printf("GPU Element matrix\n");
182:         for (d = 0; d < totDim; ++d){
183:           for (f = 0; f < totDim; ++f) printf(" %17.10e",  PetscRealPart(elMat[d*totDim + f]));
184:           printf("\n");
185:         }
186:         exit(12);
187:       }
188:     }
189:   } else {
190:     PetscContainer container = NULL;
191:     PetscObjectQuery((PetscObject)JacP,"coloring",(PetscObject*)&container);
192:     if (!container) {
193:       PetscLogEventBegin(events[8],0,0,0,0);
194:       LandauCreateColoring(JacP, plex, &container);
195:       PetscLogEventEnd(events[8],0,0,0,0);
196:     }
197:     LandauAssembleCuda(cStart, cEnd, totDim, plex, section, globalSection, JacP, elemMats, container, events);
198:   }
199:   PetscFree(elemMats);
200:   PetscLogEventEnd(events[6],0,0,0,0);

202:   return(0);
203: }

205: __global__
206: void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[])
207: {
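       /* device analogue of MatSetValues(...,ADD_VALUES) for a single element: add this
          element's values into the CSR arrays of the Mat_SeqAIJ buffer for its colour;
          elements of the same colour share no rows, so the additions are race-free */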
208:   const PetscInt     myelem = (gridDim.x==1) ? threadIdx.x : blockIdx.x;
209:   Mat_SeqAIJ         a = mats[colors[myelem]]; /* by-value copy of this colour's (device-resident) matrix header */
210:   const PetscScalar *v = el_mats[myelem];
211:   const PetscInt    *in = idx_arr[myelem], *im = idx_arr[myelem], n = nidx_arr[myelem], m = nidx_arr[myelem];
212:   /* mat set values */
213:   PetscInt          *rp,k,low,high,t,row,nrow,i,col,l;
214:   PetscInt          *ai = a.i,*ailen = a.ilen;
215:   PetscInt          *aj = a.j,lastcol = -1;
216:   MatScalar         *ap=NULL,value=0.0,*aa = a.a;
217:   for (k=0; k<m; k++) { /* loop over added rows */
218:     row = im[k];
219:     if (row < 0) continue;
220:     rp   = aj + ai[row];
221:     ap = aa + ai[row];
222:     nrow = ailen[row];
223:     low  = 0;
224:     high = nrow;
225:     for (l=0; l<n; l++) { /* loop over added columns */
226:       /* if (in[l] < 0) { */
227:       /*         printf("\t\tin[l] < 0 ?????\n"); */
228:       /*         continue; */
229:       /* } */
230:       while (l<n && (value = v[l + k*n], PetscAbsScalar(value)==0.0)) l++;
231:       if (l==n) break;
232:       col = in[l];
233:       if (col <= lastcol) low = 0;
234:       else high = nrow;
235:       lastcol = col;
236:       while (high-low > 5) {
237:         t = (low+high)/2;
238:         if (rp[t] > col) high = t;
239:         else low = t;
240:       }
241:       for (i=low; i<high; i++) {
242:         // if (rp[i] > col) break;
243:         if (rp[i] == col) {
244:           ap[i] += value;
245:           low = i + 1;
246:           goto noinsert;
247:         }
248:       }
249:       printf("\t\t\t ERROR in assemble_kernel\n");
250:     noinsert:;
251:     }
252:   }
253: }

255: static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[])
256: {
257:   PetscErrorCode    ierr;
258: #define LANDAU_MAX_COLORS 16
259: #define LANDAU_MAX_ELEMS 512
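       /* Strategy: copy each element's closure indices and element matrix to the device, make
          one zeroed copy of the Jacobian's CSR value array per colour, let assemble_kernel() add
          every element into its colour's copy, then sum the copies back into jaca->a on the host */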
260:   Mat_SeqAIJ             h_mats[LANDAU_MAX_COLORS], *jaca = (Mat_SeqAIJ *)JacP->data, *d_mats;
261:   const PetscInt         nelems = cEnd - cStart, nnz = jaca->i[JacP->rmap->n], N = JacP->rmap->n;  /* serial */
262:   const ISColoringValue *colors;
263:   ISColoringValue       *d_colors,colour;
264:   PetscInt              *h_idx_arr[LANDAU_MAX_ELEMS], h_nidx_arr[LANDAU_MAX_ELEMS], *d_nidx_arr, **d_idx_arr,nc,ej,j,cell;
265:   PetscScalar           *h_new_el_mats[LANDAU_MAX_ELEMS], *val_buf, **d_new_el_mats;
266:   ISColoring             iscoloring;
267:   PetscContainerGetPointer(container,(void**)&iscoloring);
268:   /* get colors */
269:   ISColoringGetColors(iscoloring, &j, &nc, &colors);
270:   if (nelems>LANDAU_MAX_ELEMS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many elements. %D > %D",nelems,LANDAU_MAX_ELEMS);
271:   if (nc>LANDAU_MAX_COLORS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many colors. %D > %D",nc,LANDAU_MAX_COLORS);
272:   /* colors for kernel */
273:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_colors,         nelems*sizeof(ISColoringValue))); // kernel input
274:   CUDA_SAFE_CALL(cudaMemcpy(          d_colors, colors, nelems*sizeof(ISColoringValue), cudaMemcpyHostToDevice));
275:   /* get indices and element matrices */
276:   for (cell = cStart, ej = 0 ; cell < cEnd; ++cell, ++ej) {
277:     PetscInt numindices,*indices;
278:     PetscScalar *elMat = &elemMats[ej*totDim*totDim];
279:     PetscScalar *valuesOrig = elMat;
280:     DMPlexGetClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
281:     h_nidx_arr[ej] = numindices;
282:     CUDA_SAFE_CALL(cudaMalloc((void **)&h_idx_arr[ej],            numindices*sizeof(PetscInt))); // kernel input
283:     CUDA_SAFE_CALL(cudaMemcpy(          h_idx_arr[ej],   indices, numindices*sizeof(PetscInt), cudaMemcpyHostToDevice));
284:     CUDA_SAFE_CALL(cudaMalloc((void **)&h_new_el_mats[ej],        numindices*numindices*sizeof(PetscScalar))); // kernel input
285:     CUDA_SAFE_CALL(cudaMemcpy(          h_new_el_mats[ej], elMat, numindices*numindices*sizeof(PetscScalar), cudaMemcpyHostToDevice));
286:     DMPlexRestoreClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
287:     if (elMat != valuesOrig) {DMRestoreWorkArray(plex, numindices*numindices, MPIU_SCALAR, &elMat);}
288:   }
289:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_nidx_arr,                  nelems*sizeof(PetscInt))); // kernel input
290:   CUDA_SAFE_CALL(cudaMemcpy(          d_nidx_arr,    h_nidx_arr,   nelems*sizeof(PetscInt), cudaMemcpyHostToDevice));
291:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_idx_arr,                   nelems*sizeof(PetscInt*))); // kernel input
292:   CUDA_SAFE_CALL(cudaMemcpy(          d_idx_arr,     h_idx_arr,    nelems*sizeof(PetscInt*), cudaMemcpyHostToDevice));
293:   CUDA_SAFE_CALL(cudaMalloc((void **)&d_new_el_mats,               nelems*sizeof(PetscScalar*))); // kernel input
294:   CUDA_SAFE_CALL(cudaMemcpy(          d_new_el_mats, h_new_el_mats,nelems*sizeof(PetscScalar*), cudaMemcpyHostToDevice));
295:   /* make matrix buffers */
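       /* one independent value array per colour means concurrently assembled elements never
          update the same entry, so no atomics are needed */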
296:   for (colour=0; colour<nc; colour++) {
297:     Mat_SeqAIJ *a = &h_mats[colour];
298:     /* create on GPU and copy to GPU */
299:     CUDA_SAFE_CALL(cudaMalloc((void **)&a->i,               (N+1)*sizeof(PetscInt))); // kernel input
300:     CUDA_SAFE_CALL(cudaMemcpy(          a->i,    jaca->i,   (N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice));
301:     CUDA_SAFE_CALL(cudaMalloc((void **)&a->ilen,            (N)*sizeof(PetscInt))); // kernel input
302:     CUDA_SAFE_CALL(cudaMemcpy(          a->ilen, jaca->ilen,(N)*sizeof(PetscInt), cudaMemcpyHostToDevice));
303:     CUDA_SAFE_CALL(cudaMalloc((void **)&a->j,               (nnz)*sizeof(PetscInt))); // kernel input
304:     CUDA_SAFE_CALL(cudaMemcpy(          a->j,    jaca->j,   (nnz)*sizeof(PetscInt), cudaMemcpyHostToDevice));
305:     CUDA_SAFE_CALL(cudaMalloc((void **)&a->a,               (nnz)*sizeof(PetscScalar))); // kernel output
306:     CUDA_SAFE_CALL(cudaMemset(          a->a, 0,            (nnz)*sizeof(PetscScalar)));
307:   }
308:   CUDA_SAFE_CALL(cudaMalloc(&d_mats,         nc*sizeof(Mat_SeqAIJ))); // kernel input
309:   CUDA_SAFE_CALL(cudaMemcpy( d_mats, h_mats, nc*sizeof(Mat_SeqAIJ), cudaMemcpyHostToDevice));
310:   /* launch one single-thread block per element; each element adds its matrix into its colour's value buffer */
311:   assemble_kernel<<<nelems,1>>>(d_nidx_arr, d_idx_arr, d_new_el_mats, d_colors, d_mats);
312:   CHECK_LAUNCH_ERROR();
313:   /* cleanup */
314:   CUDA_SAFE_CALL(cudaFree(d_colors));
315:   CUDA_SAFE_CALL(cudaFree(d_nidx_arr));
316:   for (ej = 0 ; ej < nelems; ++ej) { /* the per-element device arrays were filled with indices 0..nelems-1 */
317:     CUDA_SAFE_CALL(cudaFree(h_idx_arr[ej]));
318:     CUDA_SAFE_CALL(cudaFree(h_new_el_mats[ej]));
319:   }
320:   CUDA_SAFE_CALL(cudaFree(d_idx_arr));
321:   CUDA_SAFE_CALL(cudaFree(d_new_el_mats));
322:   /* copy & add Mat data back to CPU to JacP */

324:   PetscLogEventBegin(events[2],0,0,0,0);
325:   PetscMalloc1(nnz,&val_buf);
326:   PetscMemzero(jaca->a,nnz*sizeof(PetscScalar));
327:   for (colour=0; colour<nc; colour++) {
328:     Mat_SeqAIJ *a = &h_mats[colour];
329:     CUDA_SAFE_CALL(cudaMemcpy(val_buf, a->a, (nnz)*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
330:     PetscKernelAXPY(jaca->a,1.0,val_buf,nnz);
331:   }
332:   PetscFree(val_buf);
333:   PetscLogEventEnd(events[2],0,0,0,0);

335:   for (colour=0; colour<nc; colour++) {
336:     Mat_SeqAIJ *a = &h_mats[colour];
337:     /* destroy mat */
338:     CUDA_SAFE_CALL(cudaFree(a->i));
339:     CUDA_SAFE_CALL(cudaFree(a->ilen));
340:     CUDA_SAFE_CALL(cudaFree(a->j));
341:     CUDA_SAFE_CALL(cudaFree(a->a));
342:   }
343:   CUDA_SAFE_CALL(cudaFree(d_mats));
344:   return(0);
345: }