Actual source code: bvec1.c

petsc-3.4.2 2013-07-02
/*
   Defines the BLAS based vector operations. Code shared by parallel
   and sequential vectors.
*/
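
/*
   Minimal usage sketch (not part of this file; error checking omitted for brevity).
   For a sequential (VECSEQ) vector the public operations dispatch to the VecXXX_Seq
   routines defined below, e.g. VecDot() calls VecDot_Seq() and VecAXPY() calls
   VecAXPY_Seq(); the parallel implementations reuse the same routines for the local
   part of the vector.

     Vec         x,y;
     PetscScalar dot;

     VecCreateSeq(PETSC_COMM_SELF,100,&x);
     VecDuplicate(x,&y);
     VecSet(x,1.0);
     VecSet(y,2.0);
     VecDot(x,y,&dot);
     VecAXPY(y,3.0,x);
     VecDestroy(&x);
     VecDestroy(&y);
*/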

#include <../src/vec/vec/impls/dvecimpl.h>          /*I "petscvec.h" I*/
#include <petscblaslapack.h>
#include <petscthreadcomm.h>

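/* Each operation below has two variants: when PETSC_THREADCOMM_ACTIVE is defined, a
   VecXXX_kernel() routine works on one thread's slice of the local array (the ranges
   stored in xin->map->trstarts) and the corresponding VecXXX_Seq() driver launches it
   through the thread communicator; otherwise VecXXX_Seq() operates on the whole local
   array directly. */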
#if defined(PETSC_THREADCOMM_ACTIVE)
PetscErrorCode VecDot_kernel(PetscInt thread_id,Vec xin,Vec yin,PetscThreadCommReduction red)
{
  PetscErrorCode    ierr;
  PetscInt          *trstarts=xin->map->trstarts;
  PetscInt          start,end,n;
  PetscBLASInt      one = 1,bn;
  const PetscScalar *xa,*ya;
  PetscScalar       z_loc;

  ierr  = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr  = VecGetArrayRead(yin,&ya);CHKERRQ(ierr);
  start = trstarts[thread_id];
  end   = trstarts[thread_id+1];
  n     = end-start;
  ierr  = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  /* arguments ya, xa are reversed because BLAS complex conjugates the first argument, PETSc the second */
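  /* i.e. VecDot(x,y) = sum_i x[i]*conj(y[i]), while BLASdot_ (cdotc/zdotc) conjugates
     its first array argument, so BLASdot_(...,ya,...,xa,...) yields the PETSc convention */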
  PetscStackCallBLAS("BLASdot",z_loc = BLASdot_(&bn,ya+start,&one,xa+start,&one));

  ierr = PetscThreadReductionKernelPost(thread_id,red,(void*)&z_loc);CHKERRQ(ierr);

  ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(yin,&ya);CHKERRQ(ierr);
  return 0;
}

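/* Threaded VecDot driver: register a scalar sum reduction, run VecDot_kernel() on
   each thread's slice, then combine the per-thread partial dot products. */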
PetscErrorCode VecDot_Seq(Vec xin,Vec yin,PetscScalar *z)
{
  PetscErrorCode           ierr;
  PetscThreadCommReduction red;

  PetscFunctionBegin;
  ierr = PetscThreadReductionBegin(PetscObjectComm((PetscObject)xin),THREADCOMM_SUM,PETSC_SCALAR,1,&red);CHKERRQ(ierr);
  ierr = PetscThreadCommRunKernel3(PetscObjectComm((PetscObject)xin),(PetscThreadKernel)VecDot_kernel,xin,yin,red);CHKERRQ(ierr);
  ierr = PetscThreadReductionEnd(red,z);CHKERRQ(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#else
PetscErrorCode VecDot_Seq(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *ya,*xa;
  PetscBLASInt      one = 1,bn;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr = VecGetArrayRead(yin,&ya);CHKERRQ(ierr);
  /* arguments ya, xa are reversed because BLAS complex conjugates the first argument, PETSc the second */
  PetscStackCallBLAS("BLASdot",*z = BLASdot_(&bn,ya,&one,xa,&one));
  ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(yin,&ya);CHKERRQ(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#endif

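/* VecTDot is the unconjugated ("transpose") dot product sum_i x[i]*y[i], so the
   unconjugated BLASdotu_ is called and no argument swap is needed. */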
#if defined(PETSC_THREADCOMM_ACTIVE)
PetscErrorCode VecTDot_kernel(PetscInt thread_id,Vec xin,Vec yin,PetscThreadCommReduction red)
{
  PetscErrorCode    ierr;
  PetscInt          *trstarts=xin->map->trstarts;
  PetscInt          start,end,n;
  PetscBLASInt      one=1,bn;
  const PetscScalar *xa,*ya;
  PetscScalar       z_loc;

  ierr  = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr  = VecGetArrayRead(yin,&ya);CHKERRQ(ierr);
  start = trstarts[thread_id];
  end   = trstarts[thread_id+1];
  n     = end-start;
  ierr  = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  PetscStackCallBLAS("BLASdot",z_loc = BLASdotu_(&bn,xa+start,&one,ya+start,&one));

  ierr = PetscThreadReductionKernelPost(thread_id,red,(void*)&z_loc);CHKERRQ(ierr);

  ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(yin,&ya);CHKERRQ(ierr);
  return 0;
}

PetscErrorCode VecTDot_Seq(Vec xin,Vec yin,PetscScalar *z)
{
  PetscErrorCode           ierr;
  PetscThreadCommReduction red;

  PetscFunctionBegin;
  ierr = PetscThreadReductionBegin(PetscObjectComm((PetscObject)xin),THREADCOMM_SUM,PETSC_SCALAR,1,&red);CHKERRQ(ierr);
  ierr = PetscThreadCommRunKernel3(PetscObjectComm((PetscObject)xin),(PetscThreadKernel)VecTDot_kernel,xin,yin,red);CHKERRQ(ierr);
  ierr = PetscThreadReductionEnd(red,z);CHKERRQ(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#else
PetscErrorCode VecTDot_Seq(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *ya,*xa;
  PetscBLASInt      one = 1,bn;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr = VecGetArrayRead(yin,&ya);CHKERRQ(ierr);
  PetscStackCallBLAS("BLASdot",*z = BLASdotu_(&bn,xa,&one,ya,&one));
  ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(yin,&ya);CHKERRQ(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#endif

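/* VecScale: x <- alpha*x. alpha == 0 is delegated to VecSet_Seq() and alpha == 1 is a
   no-op; otherwise BLASscal_ is applied. The threaded driver copies alpha into
   threadcomm-managed scalar storage (PetscThreadCommGetScalars) so the kernel is
   handed a stable address rather than a pointer to a local variable. */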
#if defined(PETSC_THREADCOMM_ACTIVE)
PetscErrorCode VecScale_kernel(PetscInt thread_id,Vec xin,PetscScalar *alpha_p)
{
  PetscErrorCode ierr;
  PetscScalar    *xx;
  PetscInt       start,end,n;
  PetscBLASInt   one=1,bn;
  PetscInt       *trstarts=xin->map->trstarts;
  PetscScalar    alpha=*alpha_p;

  start = trstarts[thread_id];
  end   = trstarts[thread_id+1];
  n     = end-start;
  ierr  = VecGetArray(xin,&xx);CHKERRQ(ierr);
  ierr  = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  PetscStackCallBLAS("BLASscal",BLASscal_(&bn,&alpha,xx+start,&one));
  ierr  = VecRestoreArray(xin,&xx);CHKERRQ(ierr);
  return 0;
}

PetscErrorCode VecScale_Seq(Vec xin,PetscScalar alpha)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (alpha == (PetscScalar)0.0) {
    ierr = VecSet_Seq(xin,alpha);CHKERRQ(ierr);
  } else if (alpha != (PetscScalar)1.0) {
    PetscScalar *scalar;
    ierr = PetscThreadCommGetScalars(PetscObjectComm((PetscObject)xin),&scalar,NULL,NULL);CHKERRQ(ierr);
    *scalar = alpha;
    ierr = PetscThreadCommRunKernel2(PetscObjectComm((PetscObject)xin),(PetscThreadKernel)VecScale_kernel,xin,scalar);CHKERRQ(ierr);
  }
  ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#else
PetscErrorCode VecScale_Seq(Vec xin, PetscScalar alpha)
{
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  if (alpha == (PetscScalar)0.0) {
    ierr = VecSet_Seq(xin,alpha);CHKERRQ(ierr);
  } else if (alpha != (PetscScalar)1.0) {
    PetscScalar a = alpha,*xarray;
    ierr = VecGetArray(xin,&xarray);CHKERRQ(ierr);
    PetscStackCallBLAS("BLASscal",BLASscal_(&bn,&a,xarray,&one));
    ierr = VecRestoreArray(xin,&xarray);CHKERRQ(ierr);
  }
  ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

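/* VecAXPY: y <- alpha*x + y via BLASaxpy_. alpha == 0 is skipped entirely; alpha == 1
   is left to the BLAS (see the comment inside the drivers below). */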
#if defined(PETSC_THREADCOMM_ACTIVE)
PetscErrorCode VecAXPY_kernel(PetscInt thread_id,Vec yin,PetscScalar *alpha_p,Vec xin)
{
  PetscErrorCode    ierr;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one=1,bn;
  PetscInt          *trstarts=yin->map->trstarts,start,end,n;
  PetscScalar       alpha = *alpha_p;

  start = trstarts[thread_id];
  end   = trstarts[thread_id+1];
  n     = end - start;
  ierr  = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  ierr  = VecGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr  = VecGetArray(yin,&yarray);CHKERRQ(ierr);
  PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&bn,&alpha,xarray+start,&one,yarray+start,&one));
  ierr  = VecRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr  = VecRestoreArray(yin,&yarray);CHKERRQ(ierr);

  return 0;
}

PetscErrorCode VecAXPY_Seq(Vec yin,PetscScalar alpha,Vec xin)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* assume that the BLAS handles alpha == 1.0 efficiently since we have no fast code for it */
  if (alpha != (PetscScalar)0.0) {
    PetscScalar *scalar;
    ierr = PetscThreadCommGetScalars(PetscObjectComm((PetscObject)yin),&scalar,NULL,NULL);CHKERRQ(ierr);
    *scalar = alpha;
    ierr = PetscThreadCommRunKernel3(PetscObjectComm((PetscObject)yin),(PetscThreadKernel)VecAXPY_kernel,yin,scalar,xin);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#else
PetscErrorCode VecAXPY_Seq(Vec yin,PetscScalar alpha,Vec xin)
{
  PetscErrorCode    ierr;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
  /* assume that the BLAS handles alpha == 1.0 efficiently since we have no fast code for it */
  if (alpha != (PetscScalar)0.0) {
    ierr = VecGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&yarray);CHKERRQ(ierr);
    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&bn,&alpha,xarray,&one,yarray,&one));
    ierr = VecRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#endif

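/* VecAXPBY: y <- alpha*x + beta*y. The degenerate cases fall back on VecScale_Seq(),
   VecAXPY_Seq() or VecAYPX_Seq(); the general case is handled with an explicit loop. */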
#if defined(PETSC_THREADCOMM_ACTIVE)
PetscErrorCode VecAXPBY_kernel(PetscInt thread_id,Vec yin,PetscScalar *alpha_p,PetscScalar *beta_p,Vec xin)
{
  PetscErrorCode    ierr;
  const PetscScalar *xx;
  PetscScalar       *yy;
  PetscInt          *trstarts=yin->map->trstarts,i;
  PetscScalar       a=*alpha_p,b=*beta_p;

  ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
  ierr = VecGetArray(yin,&yy);CHKERRQ(ierr);

  if (b == (PetscScalar)0.0) {
    for (i=trstarts[thread_id]; i < trstarts[thread_id+1]; i++) yy[i] = a*xx[i];
  } else {
    for (i=trstarts[thread_id]; i < trstarts[thread_id+1]; i++) yy[i] = a*xx[i] + b*yy[i];
  }
  ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
  ierr = VecRestoreArray(yin,&yy);CHKERRQ(ierr);
  return 0;
}

PetscErrorCode VecAXPBY_Seq(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (alpha == (PetscScalar)0.0) {
    ierr = VecScale_Seq(yin,beta);CHKERRQ(ierr);
  } else if (beta == (PetscScalar)1.0) {
    ierr = VecAXPY_Seq(yin,alpha,xin);CHKERRQ(ierr);
  } else if (alpha == (PetscScalar)1.0) {
    ierr = VecAYPX_Seq(yin,beta,xin);CHKERRQ(ierr);
  } else {
    PetscScalar *scal1,*scal2;
    ierr = PetscThreadCommGetScalars(PetscObjectComm((PetscObject)yin),&scal1,&scal2,NULL);CHKERRQ(ierr);
    *scal1 = alpha; *scal2 = beta;
    ierr = PetscThreadCommRunKernel4(PetscObjectComm((PetscObject)yin),(PetscThreadKernel)VecAXPBY_kernel,yin,scal1,scal2,xin);CHKERRQ(ierr);
    if (beta == (PetscScalar)0.0) {
      ierr = PetscLogFlops(yin->map->n);CHKERRQ(ierr);
    } else {
      ierr = PetscLogFlops(3.0*yin->map->n);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
#else
PetscErrorCode VecAXPBY_Seq(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode    ierr;
  PetscInt          n = yin->map->n,i;
  const PetscScalar *xx;
  PetscScalar       *yy,a = alpha,b = beta;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) {
    ierr = VecScale_Seq(yin,beta);CHKERRQ(ierr);
  } else if (b == (PetscScalar)1.0) {
    ierr = VecAXPY_Seq(yin,alpha,xin);CHKERRQ(ierr);
  } else if (a == (PetscScalar)1.0) {
    ierr = VecAYPX_Seq(yin,beta,xin);CHKERRQ(ierr);
  } else if (b == (PetscScalar)0.0) {
    ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&yy);CHKERRQ(ierr);

    for (i=0; i<n; i++) yy[i] = a*xx[i];

    ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&yy);CHKERRQ(ierr);
    ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  } else {
    ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&yy);CHKERRQ(ierr);

    for (i=0; i<n; i++) yy[i] = a*xx[i] + b*yy[i];

    ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&yy);CHKERRQ(ierr);
    ierr = PetscLogFlops(3.0*xin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#endif

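/* VecAXPBYPCZ: z <- alpha*x + beta*y + gamma*z, with specialized loops (and flop
   counts) for the common cases alpha == 1, gamma == 1 and gamma == 0. */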
#if defined(PETSC_THREADCOMM_ACTIVE)
PetscErrorCode VecAXPBYPCZ_kernel(PetscInt thread_id,Vec zin,PetscScalar *alpha_p,PetscScalar *beta_p,PetscScalar *gamma_p,Vec xin,Vec yin)
{
  PetscErrorCode    ierr;
  const PetscScalar *xx,*yy;
  PetscScalar       *zz;
  PetscInt          *trstarts=zin->map->trstarts,i;
  PetscScalar       alpha=*alpha_p,beta=*beta_p,gamma=*gamma_p;

  ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
  ierr = VecGetArrayRead(yin,&yy);CHKERRQ(ierr);
  ierr = VecGetArray(zin,&zz);CHKERRQ(ierr);

  if (alpha == (PetscScalar)1.0) {
    for (i=trstarts[thread_id]; i < trstarts[thread_id+1]; i++) zz[i] = xx[i] + beta*yy[i] + gamma*zz[i];
  } else if (gamma == (PetscScalar)1.0) {
    for (i=trstarts[thread_id]; i < trstarts[thread_id+1]; i++) zz[i] = alpha*xx[i] + beta*yy[i] + zz[i];
  } else if (gamma == (PetscScalar)0.0) {
    for (i=trstarts[thread_id]; i < trstarts[thread_id+1]; i++) zz[i] = alpha*xx[i] + beta*yy[i];
  } else {
    for (i=trstarts[thread_id]; i < trstarts[thread_id+1]; i++) zz[i] = alpha*xx[i] + beta*yy[i] + gamma*zz[i];
  }
  ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(yin,&yy);CHKERRQ(ierr);
  ierr = VecRestoreArray(zin,&zz);CHKERRQ(ierr);
  return 0;
}

PetscErrorCode VecAXPBYPCZ_Seq(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscScalar    *scal1,*scal2,*scal3;

  PetscFunctionBegin;
  ierr = PetscThreadCommGetScalars(PetscObjectComm((PetscObject)zin),&scal1,&scal2,&scal3);CHKERRQ(ierr);
  *scal1 = alpha; *scal2 = beta; *scal3 = gamma;
  ierr = PetscThreadCommRunKernel6(PetscObjectComm((PetscObject)zin),(PetscThreadKernel)VecAXPBYPCZ_kernel,zin,scal1,scal2,scal3,xin,yin);CHKERRQ(ierr);
  if (alpha == (PetscScalar)1.0) {
    ierr = PetscLogFlops(4.0*zin->map->n);CHKERRQ(ierr);
  } else if (gamma == (PetscScalar)1.0) {
    ierr = PetscLogFlops(4.0*zin->map->n);CHKERRQ(ierr);
  } else if (gamma == (PetscScalar)0.0) {
    ierr = PetscLogFlops(3.0*zin->map->n);CHKERRQ(ierr);
  } else {
    ierr = PetscLogFlops(5.0*zin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#else
PetscErrorCode VecAXPBYPCZ_Seq(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode    ierr;
  PetscInt          n = zin->map->n,i;
  const PetscScalar *yy,*xx;
  PetscScalar       *zz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
  ierr = VecGetArrayRead(yin,&yy);CHKERRQ(ierr);
  ierr = VecGetArray(zin,&zz);CHKERRQ(ierr);
  if (alpha == (PetscScalar)1.0) {
    for (i=0; i<n; i++) zz[i] = xx[i] + beta*yy[i] + gamma*zz[i];
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else if (gamma == (PetscScalar)1.0) {
    for (i=0; i<n; i++) zz[i] = alpha*xx[i] + beta*yy[i] + zz[i];
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else if (gamma == (PetscScalar)0.0) {
    for (i=0; i<n; i++) zz[i] = alpha*xx[i] + beta*yy[i];
    ierr = PetscLogFlops(3.0*n);CHKERRQ(ierr);
  } else {
    for (i=0; i<n; i++) zz[i] = alpha*xx[i] + beta*yy[i] + gamma*zz[i];
    ierr = PetscLogFlops(5.0*n);CHKERRQ(ierr);
  }
  ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(yin,&yy);CHKERRQ(ierr);
  ierr = VecRestoreArray(zin,&zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif