Actual source code: sfpack.c

petsc-3.12.2 2019-11-22
Report Typos and Errors

  2:  #include <../src/vec/is/sf/impls/basic/sfpack.h>
  3:  #include <../src/vec/is/sf/impls/basic/sfbasic.h>

  5: #if defined(PETSC_HAVE_CUDA)
  6: #include <cuda_runtime.h>
  7: #endif
  8: /*
  9:  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 10:  * therefore we pack data types manually. This file defines packing routines for the standard data types.
 11:  */

 13: #define CPPJoin2(a,b)         a ##_## b
 14: #define CPPJoin3(a,b,c)       a ##_## b ##_## c
 15: #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
 16: #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)

 18: #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
 19: #define IGNORE(statement)     do {} while(0)

 21: #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
 22: #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
 23: #define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
 24: #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)

 26: #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */

 28: /* DEF_PackFunc - macro defining a Pack routine

 30:    Arguments of the macro:
 31:    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
 32:    .BS        Block size for vectorization. It is a factor of bs.
 33:    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

 35:    Arguments of the Pack routine:
 36:    +count     Number of indices in idx[]
 37:    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
 38:    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
 39:    .opt       Pack optimization plans. NULL means no plan at all.
 40:    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
 41:    -packed    Address of the packed data for each rank
 42:  */
 43: #define DEF_PackFunc(Type,BS,EQ) \
 44:   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
 45:   {                                                                                                          \
 47:     const Type     *u = (const Type*)unpacked,*u2;                                                           \
 48:     Type           *p = (Type*)packed,*p2;                                                                   \
 49:     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
 50:     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
 51:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
 53:     if (!idx) {PetscArraycpy(p,u,MBS*count);}  /* Indices are contiguous */             \
 54:     else if (!opt) { /* No optimizations available */                                                        \
 55:       for (i=0; i<count; i++)                                                                                \
 56:         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
 57:           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
 58:             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
 59:     } else {                                                                                                 \
 60:       for (r=0; r<opt->n; r++) {                                                                             \
 61:         p2  = p + opt->offset[r]*MBS;                                                                        \
 62:         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
 63:           idx2 = idx + opt->offset[r];                                                                       \
 64:           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
 65:             for (j=0; j<M; j++)                                                                              \
 66:               for (k=0; k<BS; k++)                                                                           \
 67:                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
 68:         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
 69:           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
 70:             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
 71:             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
 72:             PetscArraycpy(p2,u2,l);                                                     \
 73:             p2  += l;                                                                                        \
 74:           }                                                                                                  \
 75:         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
 76:           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
 77:           step = opt->stride_step[r];                                                                        \
 78:           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
 79:             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
 80:         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
 81:       }                                                                                                      \
 82:     }                                                                                                        \
 83:     return(0);                                                                                  \
 84:   }

 86: /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine

 88:    Arguments:
 89:   +action     Unpack or Fetch
 90:   .Type       Type of the data
 91:   .BS         Block size for vectorization
 92:   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
 93:   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
 94:   .CType      Type with or without the const qualifier, i.e., const Type or Type
 95:   .Cvoid      void with or without the const qualifier, i.e., const void or void

 97:   Notes:
 98:    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
 99:    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
100:    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
101:    sometimes we input 'const', sometimes we have to input empty.

103:    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
104:  */
105: #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
106:   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
107:   {                                                                                                          \
109:     Type           *u = (Type*)unpacked,*u2;                                                                 \
110:     CType          *p = (CType*)packed,*p2;                                                                  \
111:     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
112:     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
113:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
115:     if (!idx) {                                                                                              \
116:       FILTER(Type *v);                                                                                       \
117:       FILTER(PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
118:       FILTER(PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
119:              PetscArraycpy(u,p,count*MBS);                                              \
120:       FILTER(PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
121:       FILTER(PetscFree(v);CHKERRQ(ierr));                                                             \
122:     } else if (!opt) { /* No optimizations available */                                                      \
123:       for (i=0; i<count; i++)                                                                                \
124:         for (j=0; j<M; j++)                                                                                  \
125:           for (k=0; k<BS; k++) {                                                                             \
126:             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
127:                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
128:             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
129:           }                                                                                                  \
130:     } else {                                                                                                 \
131:       for (r=0; r<opt->n; r++) {                                                                             \
132:         p2 = p + opt->offset[r]*MBS;                                                                         \
133:         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
134:           idx2 = idx + opt->offset[r];                                                                       \
135:           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
136:             for (j=0; j<M; j++)                                                                              \
137:               for (k=0; k<BS; k++) {                                                                         \
138:                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
139:                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
140:                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
141:               }                                                                                              \
142:         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
143:           FILTER(Type *v);                                                                                   \
144:           FILTER(PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
145:           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
146:             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
147:             l  = opt->copy_length[i]*MBS;                                                                    \
148:             FILTER(PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
149:                    PetscArraycpy(u2,p2,l);                                              \
150:             FILTER(PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
151:             p2 += l;                                                                                         \
152:           }                                                                                                  \
153:           FILTER(PetscFree(v);CHKERRQ(ierr));                                                         \
154:         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
155:           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
156:           step = opt->stride_step[r];                                                                        \
157:           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
158:             for (j=0; j<M; j++)                                                                              \
159:               for (k=0; k<BS; k++) {                                                                         \
160:                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
161:                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
162:                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
163:               }                                                                                              \
164:         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
165:       }                                                                                                      \
166:     }                                                                                                        \
167:     return(0);                                                                                  \
168:   }

170: /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc

172:    Arguments:
173:   +action     Unpack or Fetch
174:   .opname     Name of the Op, such as Add, Mult, LAND, etc.
175:   .Type       Type of the data
176:   .BS         Block size for vectorization
177:   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
178:   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
179:   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
180:   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
181:   .CType      Type with or without the const qualifier, i.e., const Type or Type
182:   -Cvoid      void with or without the const qualifier, i.e., const void or void
183:  */
184: #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
185:   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
186:   {                                                                                                          \
187:     Type           *u = (Type*)unpacked,*u2,t;                                                               \
188:     CType          *p = (CType*)packed,*p2;                                                                  \
189:     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
190:     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
191:     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
193:     if (!idx) {                                                                                              \
194:       for (i=0; i<count; i++)                                                                                \
195:         for (j=0; j<M; j++)                                                                                  \
196:           for (k=0; k<BS; k++) {                                                                             \
197:             t    = u[i*MBS+j*BS+k];                                                                          \
198:             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
199:             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
200:           }                                                                                                  \
201:     } else if (!opt) { /* No optimizations available */                                                      \
202:       for (i=0; i<count; i++)                                                                                \
203:         for (j=0; j<M; j++)                                                                                  \
204:           for (k=0; k<BS; k++) {                                                                             \
205:               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
206:               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
207:               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
208:           }                                                                                                  \
209:     } else {                                                                                                 \
210:       for (r=0; r<opt->n; r++) {                                                                             \
211:         p2 = p + opt->offset[r]*MBS;                                                                         \
212:         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
213:           idx2 = idx + opt->offset[r];                                                                       \
214:           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
215:             for (j=0; j<M; j++)                                                                              \
216:               for (k=0; k<BS; k++) {                                                                         \
217:                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
218:                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
219:                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
220:               }                                                                                              \
221:         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
222:           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
223:             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
224:             l  = opt->copy_length[i]*MBS;                                                                    \
225:             for (j=0; j<l; j++) {                                                                            \
226:               t    = u2[j];                                                                                  \
227:               APPLY (u2[j],t,op,p2[j]);                                                                      \
228:               FILTER(p2[j] = t);                                                                             \
229:             }                                                                                                \
230:             p2 += l;                                                                                         \
231:           }                                                                                                  \
232:         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
233:           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
234:           step = opt->stride_step[r];                                                                        \
235:           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
236:             for (j=0; j<M; j++)                                                                              \
237:               for (k=0; k<BS; k++) {                                                                         \
238:                 t    = u2[i*step*MBS+j*BS+k];                                                                \
239:                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
240:                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
241:               }                                                                                              \
242:         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
243:       }                                                                                                      \
244:     }                                                                                                        \
245:     return(0);                                                                                  \
246:   }

248: /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine

250:    Arguments:
251:   +Action     Unpack or Fetch
252:   .locname    Max or Min
253:   .type1      Type of the first data in a pair type
254:   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
255:   .op         > or <
256:   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
257:   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
258:   -Cvoid      void with or without the const qualifier, i.e., const void or void
259:  */
260: #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
261:   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
262:     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
263:     CType                 *p = (CType*)packed;                                                               \
264:     PetscInt              i,j;                                                                               \
265:     for (i=0; i<count; i++) {                                                                                \
266:       FILTER(PairType(Type1,Type2) v);                                                                       \
267:       j = idx? idx[i] : i;                                                                                   \
268:       FILTER(v = u[j]);                                                                                      \
269:       if (p[i].a op u[j].a) {                                                                                \
270:         u[j] = p[i];                                                                                         \
271:       } else if (p[i].a == u[j].a) {                                                                         \
272:         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
273:       }                                                                                                      \
274:       FILTER(p[i] = v);                                                                                      \
275:     }                                                                                                        \
276:     return(0);                                                                                  \
277:   }

279: /* Pack, Unpack/Fetch ops */
280: #define DEF_Pack(Type,BS,EQ)                                                                   \
281:   DEF_PackFunc(Type,BS,EQ)                                                                     \
282:   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
283:   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
284:   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
285:     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);                            \
286:     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                            \
287:     link->h_FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                            \
288:   }

290: /* Add, Mult ops */
291: #define DEF_Add(Type,BS,EQ)                                                                    \
292:   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
293:   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
294:   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
295:   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
296:   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
297:     link->h_UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                            \
298:     link->h_UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                            \
299:     link->h_FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                            \
300:     link->h_FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                            \
301:   }

303: /* Max, Min ops */
304: #define DEF_Cmp(Type,BS,EQ)                                                                    \
305:   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
306:   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
307:   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
308:   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
309:   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
310:     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                            \
311:     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                            \
312:     link->h_FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                            \
313:     link->h_FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                            \
314:   }

316: /* Logical ops.
317:   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
318:   the compilation warning "empty macro arguments are undefined in ISO C90"
319:  */
320: #define DEF_Log(Type,BS,EQ)                                                                    \
321:   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
322:   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
323:   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
324:   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
325:   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
326:   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
327:   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
328:     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                              \
329:     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                              \
330:     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                              \
331:     link->h_FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                              \
332:     link->h_FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                              \
333:     link->h_FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                              \
334:   }

336: /* Bitwise ops */
337: #define DEF_Bit(Type,BS,EQ)                                                                    \
338:   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
339:   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
340:   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
341:   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
342:   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
343:   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
344:   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
345:     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                              \
346:     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                              \
347:     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                              \
348:     link->h_FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                              \
349:     link->h_FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                              \
350:     link->h_FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                              \
351:   }

353: /* Maxloc, Minloc */
354: #define DEF_Xloc(Type1,Type2)                                                                  \
355:   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
356:   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
357:   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
358:   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
359:   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
360:     link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);             \
361:     link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);             \
362:     link->h_FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);             \
363:     link->h_FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);             \
364:   }

366: #define DEF_IntegerType(Type,BS,EQ)                                                            \
367:   DEF_Pack(Type,BS,EQ)                                                                         \
368:   DEF_Add(Type,BS,EQ)                                                                          \
369:   DEF_Cmp(Type,BS,EQ)                                                                          \
370:   DEF_Log(Type,BS,EQ)                                                                          \
371:   DEF_Bit(Type,BS,EQ)                                                                          \
372:   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
373:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
374:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
375:     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
376:     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
377:     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
378:   }

380: #define DEF_RealType(Type,BS,EQ)                                                               \
381:   DEF_Pack(Type,BS,EQ)                                                                         \
382:   DEF_Add(Type,BS,EQ)                                                                          \
383:   DEF_Cmp(Type,BS,EQ)                                                                          \
384:   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
385:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
386:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
387:     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
388:   }

390: #if defined(PETSC_HAVE_COMPLEX)
391: #define DEF_ComplexType(Type,BS,EQ)                                                            \
392:   DEF_Pack(Type,BS,EQ)                                                                         \
393:   DEF_Add(Type,BS,EQ)                                                                          \
394:   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
395:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
396:     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
397:   }
398: #endif

400: #define DEF_DumbType(Type,BS,EQ)                                                               \
401:   DEF_Pack(Type,BS,EQ)                                                                         \
402:   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
403:     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
404:   }

406: /* Maxloc, Minloc */
407: #define DEF_PairType(Type1,Type2)                                                              \
408:   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
409:   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
410:   DEF_Xloc(Type1,Type2)                                                                        \
411:   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
412:     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
413:     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
414:   }

416: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
417: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

425: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426: DEF_IntegerType(int,1,1)
427: DEF_IntegerType(int,2,1)
428: DEF_IntegerType(int,4,1)
429: DEF_IntegerType(int,8,1)
430: DEF_IntegerType(int,1,0)
431: DEF_IntegerType(int,2,0)
432: DEF_IntegerType(int,4,0)
433: DEF_IntegerType(int,8,0)
434: #endif

436: /* The typedefs are used to get a typename without space that CPPJoin can handle */
437: typedef signed char SignedChar;
438: DEF_IntegerType(SignedChar,1,1)
439: DEF_IntegerType(SignedChar,2,1)
440: DEF_IntegerType(SignedChar,4,1)
441: DEF_IntegerType(SignedChar,8,1)
442: DEF_IntegerType(SignedChar,1,0)
443: DEF_IntegerType(SignedChar,2,0)
444: DEF_IntegerType(SignedChar,4,0)
445: DEF_IntegerType(SignedChar,8,0)

447: typedef unsigned char UnsignedChar;
448: DEF_IntegerType(UnsignedChar,1,1)
449: DEF_IntegerType(UnsignedChar,2,1)
450: DEF_IntegerType(UnsignedChar,4,1)
451: DEF_IntegerType(UnsignedChar,8,1)
452: DEF_IntegerType(UnsignedChar,1,0)
453: DEF_IntegerType(UnsignedChar,2,0)
454: DEF_IntegerType(UnsignedChar,4,0)
455: DEF_IntegerType(UnsignedChar,8,0)

457: DEF_RealType(PetscReal,1,1)
458: DEF_RealType(PetscReal,2,1)
459: DEF_RealType(PetscReal,4,1)
460: DEF_RealType(PetscReal,8,1)
461: DEF_RealType(PetscReal,1,0)
462: DEF_RealType(PetscReal,2,0)
463: DEF_RealType(PetscReal,4,0)
464: DEF_RealType(PetscReal,8,0)

466: #if defined(PETSC_HAVE_COMPLEX)
467: DEF_ComplexType(PetscComplex,1,1)
468: DEF_ComplexType(PetscComplex,2,1)
469: DEF_ComplexType(PetscComplex,4,1)
470: DEF_ComplexType(PetscComplex,8,1)
471: DEF_ComplexType(PetscComplex,1,0)
472: DEF_ComplexType(PetscComplex,2,0)
473: DEF_ComplexType(PetscComplex,4,0)
474: DEF_ComplexType(PetscComplex,8,0)
475: #endif

477: DEF_PairType(int,int)
478: DEF_PairType(PetscInt,PetscInt)

480: /* If we don't know the basic type, we treat it as a stream of chars or ints */
481: DEF_DumbType(char,1,1)
482: DEF_DumbType(char,2,1)
483: DEF_DumbType(char,4,1)
484: DEF_DumbType(char,1,0)
485: DEF_DumbType(char,2,0)
486: DEF_DumbType(char,4,0)

488: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489: DEF_DumbType(DumbInt,1,1)
490: DEF_DumbType(DumbInt,2,1)
491: DEF_DumbType(DumbInt,4,1)
492: DEF_DumbType(DumbInt,8,1)
493: DEF_DumbType(DumbInt,1,0)
494: DEF_DumbType(DumbInt,2,0)
495: DEF_DumbType(DumbInt,4,0)
496: DEF_DumbType(DumbInt,8,0)

498: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
499: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
500: {
501:   int ierr;
502:   MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
503:   MPI_Type_commit(newtype); if (ierr) return ierr;
504:   return MPI_SUCCESS;
505: }
506: #endif

508: PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
509: {
510:   PetscErrorCode    ierr;
511:   PetscSFPack       link,*p;
512:   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;

515:   /* Look for types in cache */
516:   for (p=&bas->inuse; (link=*p); p=&link->next) {
517:     PetscBool match;
518:     MPIPetsc_Type_compare(unit,link->unit,&match);
519:     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
520:       switch (cmode) {
521:       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
522:       case PETSC_USE_POINTER: break;
523:       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
524:       }
525:       *mylink = link;
526:       return(0);
527:     }
528:   }
529:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
530:   return(0);
531: }

533: PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
534: {
535:   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;

538:   (*link)->rkey = NULL;
539:   (*link)->lkey = NULL;
540:   (*link)->next = bas->avail;
541:   bas->avail    = *link;
542:   *link         = NULL;
543:   return(0);
544: }

546: /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
547: PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack *avail)
548: {
549:   PetscErrorCode    ierr;
550:   PetscSFPack       link=*avail,next;
551:   PetscInt          i;

554:   for (; link; link=next) {
555:     next = link->next;
556:     if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
557:     for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558:       if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
559:     }
560:     PetscFree(link->reqs);
561:     PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);
562:     PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);
563:     PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);

565: #if defined(PETSC_HAVE_CUDA)
566:     PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);
567:     PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);
568:     PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);
569:     if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
570: #endif
571:     PetscFree(link);
572:   }
573:   *avail = NULL;
574:   return(0);
575: }

577: /* Error out on unsupported overlapped communications */
578: PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
579: {
580:   PetscErrorCode    ierr;
581:   PetscSFPack       link,*p;
582:   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
583:   PetscBool         match;

586:   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
587:      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
588:   */
589:   if (rkey || lkey) {
590:     for (p=&bas->inuse; (link=*p); p=&link->next) {
591:       MPIPetsc_Type_compare(unit,link->unit,&match);
592:       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
593:     }
594:   }
595:   return(0);
596: }

598: PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
599: {
601:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
602:   PetscBool      is2Int,is2PetscInt;
603:   PetscMPIInt    ni,na,nd,combiner;
604: #if defined(PETSC_HAVE_COMPLEX)
605:   PetscInt       nPetscComplex=0;
606: #endif

609:   MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
610:   MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
611:   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
612:   MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
613:   MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
614:   MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
615: #if defined(PETSC_HAVE_COMPLEX)
616:   MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
617: #endif
618:   MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
619:   MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
620:   /* TODO: shaell we also handle Fortran MPI_2REAL? */
621:   MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
622:   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
623:   link->bs = 1; /* default */

625:   if (is2Int) {
626:     PackInit_PairType_int_int(link);
627:     link->bs        = 1;
628:     link->unitbytes = 2*sizeof(int);
629:     link->basicunit = MPI_2INT;
630:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
631:     PackInit_PairType_PetscInt_PetscInt(link);
632:     link->bs        = 1;
633:     link->unitbytes = 2*sizeof(PetscInt);
634:     link->basicunit = MPIU_2INT;
635:   } else if (nPetscReal) {
636:     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
637:     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
638:     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
639:     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
640:     link->bs        = nPetscReal;
641:     link->unitbytes = nPetscReal*sizeof(PetscReal);
642:     link->basicunit = MPIU_REAL;
643:   } else if (nPetscInt) {
644:     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
645:     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
646:     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
647:     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
648:     link->bs        = nPetscInt;
649:     link->unitbytes = nPetscInt*sizeof(PetscInt);
650:     link->basicunit = MPIU_INT;
651: #if defined(PETSC_USE_64BIT_INDICES)
652:   } else if (nInt) {
653:     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
654:     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
655:     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
656:     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
657:     link->bs        = nInt;
658:     link->unitbytes = nInt*sizeof(int);
659:     link->basicunit = MPI_INT;
660: #endif
661:   } else if (nSignedChar) {
662:     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
663:     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
664:     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
665:     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
666:     link->bs        = nSignedChar;
667:     link->unitbytes = nSignedChar*sizeof(SignedChar);
668:     link->basicunit = MPI_SIGNED_CHAR;
669:   }  else if (nUnsignedChar) {
670:     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
671:     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
672:     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
673:     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
674:     link->bs        = nUnsignedChar;
675:     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
676:     link->basicunit = MPI_UNSIGNED_CHAR;
677: #if defined(PETSC_HAVE_COMPLEX)
678:   } else if (nPetscComplex) {
679:     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
680:     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
681:     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
682:     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
683:     link->bs        = nPetscComplex;
684:     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
685:     link->basicunit = MPIU_COMPLEX;
686: #endif
687:   } else {
688:     MPI_Aint lb,nbyte;
689:     MPI_Type_get_extent(unit,&lb,&nbyte);
690:     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
691:     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
692:       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
693:       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
694:       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
695:       link->bs        = nbyte;
696:       link->unitbytes = nbyte;
697:       link->basicunit = MPI_BYTE;
698:     } else {
699:       nInt = nbyte / sizeof(int);
700:       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
701:       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
702:       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
703:       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
704:       link->bs        = nInt;
705:       link->unitbytes = nbyte;
706:       link->basicunit = MPI_INT;
707:     }
708:   }

710:   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
711:   else {MPI_Type_dup(unit,&link->unit);}
712:   return(0);
713: }

715: PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
716: {
718:   *UnpackAndOp = NULL;
719:   if (mtype == PETSC_MEMTYPE_HOST) {
720:     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
721:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
722:     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
723:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
724:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
725:     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
726:     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
727:     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
728:     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
729:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
730:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
731:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
732:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
733:   }
734: #if defined(PETSC_HAVE_CUDA)
735:   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
736:     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
737:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
738:     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
739:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
740:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
741:     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
742:     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
743:     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
744:     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
745:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
746:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
747:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
748:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
749:   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
750:     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
751:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
752:     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
753:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
754:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
755:     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
756:     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
757:     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
758:     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
759:     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
760:     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
761:     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
762:     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
763:   }
764: #endif
765:   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);

767:   return(0);
768: }

770: PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
771: {
773:   *FetchAndOp = NULL;
774:   if (mtype == PETSC_MEMTYPE_HOST) {
775:     if (op == MPIU_REPLACE)                   *FetchAndOp = link->h_FetchAndInsert;
776:     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
777:     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
778:     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
779:     else if (op == MPI_MAXLOC)                *FetchAndOp = link->h_FetchAndMaxloc;
780:     else if (op == MPI_MINLOC)                *FetchAndOp = link->h_FetchAndMinloc;
781:     else if (op == MPI_PROD)                  *FetchAndOp = link->h_FetchAndMult;
782:     else if (op == MPI_LAND)                  *FetchAndOp = link->h_FetchAndLAND;
783:     else if (op == MPI_BAND)                  *FetchAndOp = link->h_FetchAndBAND;
784:     else if (op == MPI_LOR)                   *FetchAndOp = link->h_FetchAndLOR;
785:     else if (op == MPI_BOR)                   *FetchAndOp = link->h_FetchAndBOR;
786:     else if (op == MPI_LXOR)                  *FetchAndOp = link->h_FetchAndLXOR;
787:     else if (op == MPI_BXOR)                  *FetchAndOp = link->h_FetchAndBXOR;
788:     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
789:   }
790: #if defined(PETSC_HAVE_CUDA)
791:   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
792:     if (op == MPIU_REPLACE)                   *FetchAndOp = link->d_FetchAndInsert;
793:     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
794:     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
795:     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
796:     else if (op == MPI_MAXLOC)                *FetchAndOp = link->d_FetchAndMaxloc;
797:     else if (op == MPI_MINLOC)                *FetchAndOp = link->d_FetchAndMinloc;
798:     else if (op == MPI_PROD)                  *FetchAndOp = link->d_FetchAndMult;
799:     else if (op == MPI_LAND)                  *FetchAndOp = link->d_FetchAndLAND;
800:     else if (op == MPI_BAND)                  *FetchAndOp = link->d_FetchAndBAND;
801:     else if (op == MPI_LOR)                   *FetchAndOp = link->d_FetchAndLOR;
802:     else if (op == MPI_BOR)                   *FetchAndOp = link->d_FetchAndBOR;
803:     else if (op == MPI_LXOR)                  *FetchAndOp = link->d_FetchAndLXOR;
804:     else if (op == MPI_BXOR)                  *FetchAndOp = link->d_FetchAndBXOR;
805:     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
806:   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
807:     if (op == MPIU_REPLACE)                   *FetchAndOp = link->da_FetchAndInsert;
808:     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
809:     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
810:     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
811:     else if (op == MPI_MAXLOC)                *FetchAndOp = link->da_FetchAndMaxloc;
812:     else if (op == MPI_MINLOC)                *FetchAndOp = link->da_FetchAndMinloc;
813:     else if (op == MPI_PROD)                  *FetchAndOp = link->da_FetchAndMult;
814:     else if (op == MPI_LAND)                  *FetchAndOp = link->da_FetchAndLAND;
815:     else if (op == MPI_BAND)                  *FetchAndOp = link->da_FetchAndBAND;
816:     else if (op == MPI_LOR)                   *FetchAndOp = link->da_FetchAndLOR;
817:     else if (op == MPI_BOR)                   *FetchAndOp = link->da_FetchAndBOR;
818:     else if (op == MPI_LXOR)                  *FetchAndOp = link->da_FetchAndLXOR;
819:     else if (op == MPI_BXOR)                  *FetchAndOp = link->da_FetchAndBXOR;
820:     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
821:   }
822: #endif
823:   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
824:   return(0);
825: }

827: /*
828:   Create pack/unpack optimization plans based on indice patterns available

830:    Input Parameters:
831:   +  n       - Number of target ranks
832:   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
833:   -  idx     - [*]   Array storing indices

835:    Output Parameters:
836:   +  opt    - Optimization plans. Maybe NULL if no optimization can be built.
837: */
838: PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
839: {
841:   PetscInt       i,j,k,n_copies,tot_copies=0,step;
842:   PetscBool      strided,optimized=PETSC_FALSE;
843:   PetscSFPackOpt opt;

846:   if (!n) {
847:     *out = NULL;
848:     return(0);
849:   }

851:   PetscCalloc1(1,&opt);
852:   PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);
853:   PetscArraycpy(opt->offset,offset,n+1);
854:   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/

856:   opt->n = n;

858:   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
859:   for (i=0; i<n; i++) { /* for each target processor */
860:     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
861:     n_copies = 1;
862:     for (j=offset[i]; j<offset[i+1]-1; j++) {
863:       if (idx[j]+1 != idx[j+1]) n_copies++;
864:     }
865:     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
866:        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
867:      */
868:     if ((offset[i+1]-offset[i])/n_copies >= 32) {
869:       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
870:       optimized    = PETSC_TRUE;
871:       tot_copies  += n_copies;
872:     }
873:   }

875:   /* Setup memcpy plan for each contiguous piece */
876:   k    = 0; /* k-th copy */
877:   PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);
878:   for (i=0; i<n; i++) { /* for each target processor */
879:     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
880:       n_copies           = 1;
881:       opt->copy_start[k] = offset[i] - offset[0];
882:       for (j=offset[i]; j<offset[i+1]-1; j++) {
883:         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
884:           n_copies++;
885:           opt->copy_start[k+1] = j-offset[0]+1;
886:           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
887:           k++;
888:         }
889:       }
890:       /* Set copy length of the last copy for this target */
891:       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
892:       k++;
893:     }
894:     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
895:     opt->copy_offset[i+1] = k;
896:   }

898:   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
899:   for (i=0; i<n; i++) { /* for each remote */
900:     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
901:       strided = PETSC_TRUE;
902:       step    = idx[offset[i]+1] - idx[offset[i]];
903:       for (j=offset[i]; j<offset[i+1]-1; j++) {
904:         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
905:       }
906:       if (strided) {
907:         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
908:         opt->stride_step[i]  = step;
909:         opt->stride_n[i]     = offset[i+1] - offset[i];
910:         optimized            = PETSC_TRUE;
911:       }
912:     }
913:   }
914:   /* If no rank gets optimized, free arrays to save memory */
915:   if (!optimized) {
916:     PetscFree3(opt->type,opt->offset,opt->copy_offset);
917:     PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);
918:     PetscFree(opt);
919:     *out = NULL;
920:   } else *out = opt;
921:   return(0);
922: }

924: PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out)
925: {
927:   PetscSFPackOpt opt = *out;

930:   if (opt) {
931:     PetscFree3(opt->type,opt->offset,opt->copy_offset);
932:     PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);
933:     PetscFree(opt);
934:     *out = NULL;
935:   }
936:   return(0);
937: }