Actual source code: inode.c
petsc-3.7.1 2016-05-15
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
10: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
14: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
17: n = A->cmap->n;
18: m = A->rmap->n;
19: ns_row = a->inode.size;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24: for (; count+1 < n; count++,i++) ;
25: if (count < n) {
26: i++;
27: }
28: *size = i;
29: return(0);
30: }
31: PetscMalloc1(n+1,&ns_col);
33: /* Use the same row structure wherever feasible. */
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35: ns_col[i] = ns_row[i];
36: }
38: /* if m < n; pad up the remainder with inode_limit */
39: for (; count+1 < n; count++,i++) {
40: ns_col[i] = 1;
41: }
42: /* The last node is the odd ball. padd it up with the remaining rows; */
43: if (count < n) {
44: ns_col[i] = n - count;
45: i++;
46: } else if (count > n) {
47: /* Adjust for the over estimation */
48: ns_col[i-1] += n - count;
49: }
50: *size = i;
51: *ns = ns_col;
52: return(0);
53: }
56: /*
57: This builds symmetric version of nonzero structure,
58: */
61: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62: {
63: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
65: PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67: const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc1(nslim_col+1,&tns);
81: PetscMalloc1(n+1,&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1) {
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87: }
88: /* allocate space for row pointers */
89: PetscMalloc1(nslim_row+1,&ia);
90: *iia = ia;
91: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
92: PetscMalloc1(nslim_row+1,&work);
94: /* determine the number of columns in each row */
95: ia[0] = oshift;
96: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
98: j = aj + ai[row] + ishift;
99: jmax = aj + ai[row+1] + ishift;
100: col = *j++ + ishift;
101: i2 = tvc[col];
102: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103: ia[i1+1]++;
104: ia[i2+1]++;
105: i2++; /* Start col of next node */
106: while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107: i2 = tvc[col];
108: }
109: if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
110: }
112: /* shift ia[i] to point to next row */
113: for (i1=1; i1<nslim_row+1; i1++) {
114: row = ia[i1-1];
115: ia[i1] += row;
116: work[i1-1] = row - oshift;
117: }
119: /* allocate space for column pointers */
120: nz = ia[nslim_row] + (!ishift);
121: PetscMalloc1(nz,&ja);
122: *jja = ja;
124: /* loop over lower triangular part putting into ja */
125: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126: j = aj + ai[row] + ishift;
127: jmax = aj + ai[row+1] + ishift;
128: col = *j++ + ishift;
129: i2 = tvc[col];
130: while (i2<i1 && j<jmax) {
131: ja[work[i2]++] = i1 + oshift;
132: ja[work[i1]++] = i2 + oshift;
133: ++i2;
134: while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135: i2 = tvc[col];
136: }
137: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
139: }
140: PetscFree(work);
141: PetscFree(tns);
142: PetscFree(tvc);
143: return(0);
144: }
146: /*
147: This builds nonsymmetric version of nonzero structure,
148: */
151: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152: {
153: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
155: PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156: PetscInt *tns,*tvc,nsz,i1,i2;
157: const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
160: nslim_row = a->inode.node_count;
161: n = A->cmap->n;
163: /* Create The column_inode for this matrix */
164: Mat_CreateColInode(A,&nslim_col,&ns_col);
166: /* allocate space for reformated column_inode structure */
167: PetscMalloc1(nslim_col +1,&tns);
168: PetscMalloc1(n + 1,&tvc);
169: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171: for (i1=0,col=0; i1<nslim_col; ++i1) {
172: nsz = ns_col[i1];
173: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174: }
175: /* allocate space for row pointers */
176: PetscMalloc1(nslim_row+1,&ia);
177: *iia = ia;
178: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
179: PetscMalloc1(nslim_row+1,&work);
181: /* determine the number of columns in each row */
182: ia[0] = oshift;
183: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184: j = aj + ai[row] + ishift;
185: col = *j++ + ishift;
186: i2 = tvc[col];
187: nz = ai[row+1] - ai[row];
188: while (nz-- > 0) { /* off-diagonal elemets */
189: ia[i1+1]++;
190: i2++; /* Start col of next node */
191: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192: if (nz > 0) i2 = tvc[col];
193: }
194: }
196: /* shift ia[i] to point to next row */
197: for (i1=1; i1<nslim_row+1; i1++) {
198: row = ia[i1-1];
199: ia[i1] += row;
200: work[i1-1] = row - oshift;
201: }
203: /* allocate space for column pointers */
204: nz = ia[nslim_row] + (!ishift);
205: PetscMalloc1(nz,&ja);
206: *jja = ja;
208: /* loop over matrix putting into ja */
209: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210: j = aj + ai[row] + ishift;
211: col = *j++ + ishift;
212: i2 = tvc[col];
213: nz = ai[row+1] - ai[row];
214: while (nz-- > 0) {
215: ja[work[i1]++] = i2 + oshift;
216: ++i2;
217: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218: if (nz > 0) i2 = tvc[col];
219: }
220: }
221: PetscFree(ns_col);
222: PetscFree(work);
223: PetscFree(tns);
224: PetscFree(tvc);
225: return(0);
226: }
230: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
231: {
232: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
236: *n = a->inode.node_count;
237: if (!ia) return(0);
238: if (!blockcompressed) {
239: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
240: } else if (symmetric) {
241: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
242: } else {
243: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
244: }
245: return(0);
246: }
250: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
251: {
255: if (!ia) return(0);
257: if (!blockcompressed) {
258: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
259: } else {
260: PetscFree(*ia);
261: PetscFree(*ja);
262: }
263: return(0);
264: }
266: /* ----------------------------------------------------------- */
270: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271: {
272: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
274: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
278: nslim_row = a->inode.node_count;
279: n = A->cmap->n;
281: /* Create The column_inode for this matrix */
282: Mat_CreateColInode(A,&nslim_col,&ns_col);
284: /* allocate space for reformated column_inode structure */
285: PetscMalloc1(nslim_col + 1,&tns);
286: PetscMalloc1(n + 1,&tvc);
287: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
289: for (i1=0,col=0; i1<nslim_col; ++i1) {
290: nsz = ns_col[i1];
291: for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292: }
293: /* allocate space for column pointers */
294: PetscMalloc1(nslim_col+1,&ia);
295: *iia = ia;
296: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
297: PetscMalloc1(nslim_col+1,&work);
299: /* determine the number of columns in each row */
300: ia[0] = oshift;
301: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302: j = aj + ai[row] + ishift;
303: col = *j++ + ishift;
304: i2 = tvc[col];
305: nz = ai[row+1] - ai[row];
306: while (nz-- > 0) { /* off-diagonal elemets */
307: /* ia[i1+1]++; */
308: ia[i2+1]++;
309: i2++;
310: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311: if (nz > 0) i2 = tvc[col];
312: }
313: }
315: /* shift ia[i] to point to next col */
316: for (i1=1; i1<nslim_col+1; i1++) {
317: col = ia[i1-1];
318: ia[i1] += col;
319: work[i1-1] = col - oshift;
320: }
322: /* allocate space for column pointers */
323: nz = ia[nslim_col] + (!ishift);
324: PetscMalloc1(nz,&ja);
325: *jja = ja;
327: /* loop over matrix putting into ja */
328: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329: j = aj + ai[row] + ishift;
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338: if (nz > 0) i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
351: {
355: Mat_CreateColInode(A,n,NULL);
356: if (!ia) return(0);
358: if (!blockcompressed) {
359: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
360: } else if (symmetric) {
361: /* Since the indices are symmetric it does'nt matter */
362: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
363: } else {
364: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
365: }
366: return(0);
367: }
371: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
372: {
376: if (!ia) return(0);
377: if (!blockcompressed) {
378: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
379: } else {
380: PetscFree(*ia);
381: PetscFree(*ja);
382: }
383: return(0);
384: }
386: /* ----------------------------------------------------------- */
390: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391: {
392: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
393: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394: PetscScalar *y;
395: const PetscScalar *x;
396: const MatScalar *v1,*v2,*v3,*v4,*v5;
397: PetscErrorCode ierr;
398: PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399: const PetscInt *idx,*ns,*ii;
401: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403: #endif
406: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407: node_max = a->inode.node_count;
408: ns = a->inode.size; /* Node Size array */
409: VecGetArrayRead(xx,&x);
410: VecGetArray(yy,&y);
411: idx = a->j;
412: v1 = a->a;
413: ii = a->i;
415: for (i = 0,row = 0; i< node_max; ++i) {
416: nsz = ns[i];
417: n = ii[1] - ii[0];
418: nonzerorow += (n>0)*nsz;
419: ii += nsz;
420: PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
421: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
422: sz = n; /* No of non zeros in this row */
423: /* Switch on the size of Node */
424: switch (nsz) { /* Each loop in 'case' is unrolled */
425: case 1:
426: sum1 = 0.;
428: for (n = 0; n< sz-1; n+=2) {
429: i1 = idx[0]; /* The instructions are ordered to */
430: i2 = idx[1]; /* make the compiler's job easy */
431: idx += 2;
432: tmp0 = x[i1];
433: tmp1 = x[i2];
434: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435: }
437: if (n == sz-1) { /* Take care of the last nonzero */
438: tmp0 = x[*idx++];
439: sum1 += *v1++ *tmp0;
440: }
441: y[row++]=sum1;
442: break;
443: case 2:
444: sum1 = 0.;
445: sum2 = 0.;
446: v2 = v1 + n;
448: for (n = 0; n< sz-1; n+=2) {
449: i1 = idx[0];
450: i2 = idx[1];
451: idx += 2;
452: tmp0 = x[i1];
453: tmp1 = x[i2];
454: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456: }
457: if (n == sz-1) {
458: tmp0 = x[*idx++];
459: sum1 += *v1++ * tmp0;
460: sum2 += *v2++ * tmp0;
461: }
462: y[row++]=sum1;
463: y[row++]=sum2;
464: v1 =v2; /* Since the next block to be processed starts there*/
465: idx +=sz;
466: break;
467: case 3:
468: sum1 = 0.;
469: sum2 = 0.;
470: sum3 = 0.;
471: v2 = v1 + n;
472: v3 = v2 + n;
474: for (n = 0; n< sz-1; n+=2) {
475: i1 = idx[0];
476: i2 = idx[1];
477: idx += 2;
478: tmp0 = x[i1];
479: tmp1 = x[i2];
480: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483: }
484: if (n == sz-1) {
485: tmp0 = x[*idx++];
486: sum1 += *v1++ * tmp0;
487: sum2 += *v2++ * tmp0;
488: sum3 += *v3++ * tmp0;
489: }
490: y[row++]=sum1;
491: y[row++]=sum2;
492: y[row++]=sum3;
493: v1 =v3; /* Since the next block to be processed starts there*/
494: idx +=2*sz;
495: break;
496: case 4:
497: sum1 = 0.;
498: sum2 = 0.;
499: sum3 = 0.;
500: sum4 = 0.;
501: v2 = v1 + n;
502: v3 = v2 + n;
503: v4 = v3 + n;
505: for (n = 0; n< sz-1; n+=2) {
506: i1 = idx[0];
507: i2 = idx[1];
508: idx += 2;
509: tmp0 = x[i1];
510: tmp1 = x[i2];
511: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515: }
516: if (n == sz-1) {
517: tmp0 = x[*idx++];
518: sum1 += *v1++ * tmp0;
519: sum2 += *v2++ * tmp0;
520: sum3 += *v3++ * tmp0;
521: sum4 += *v4++ * tmp0;
522: }
523: y[row++]=sum1;
524: y[row++]=sum2;
525: y[row++]=sum3;
526: y[row++]=sum4;
527: v1 =v4; /* Since the next block to be processed starts there*/
528: idx +=3*sz;
529: break;
530: case 5:
531: sum1 = 0.;
532: sum2 = 0.;
533: sum3 = 0.;
534: sum4 = 0.;
535: sum5 = 0.;
536: v2 = v1 + n;
537: v3 = v2 + n;
538: v4 = v3 + n;
539: v5 = v4 + n;
541: for (n = 0; n<sz-1; n+=2) {
542: i1 = idx[0];
543: i2 = idx[1];
544: idx += 2;
545: tmp0 = x[i1];
546: tmp1 = x[i2];
547: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552: }
553: if (n == sz-1) {
554: tmp0 = x[*idx++];
555: sum1 += *v1++ * tmp0;
556: sum2 += *v2++ * tmp0;
557: sum3 += *v3++ * tmp0;
558: sum4 += *v4++ * tmp0;
559: sum5 += *v5++ * tmp0;
560: }
561: y[row++]=sum1;
562: y[row++]=sum2;
563: y[row++]=sum3;
564: y[row++]=sum4;
565: y[row++]=sum5;
566: v1 =v5; /* Since the next block to be processed starts there */
567: idx +=4*sz;
568: break;
569: default:
570: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571: }
572: }
573: VecRestoreArrayRead(xx,&x);
574: VecRestoreArray(yy,&y);
575: PetscLogFlops(2.0*a->nz - nonzerorow);
576: return(0);
577: }
578: /* ----------------------------------------------------------- */
579: /* Almost same code as the MatMult_SeqAIJ_Inode() */
582: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583: {
584: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
585: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586: const MatScalar *v1,*v2,*v3,*v4,*v5;
587: const PetscScalar *x;
588: PetscScalar *y,*z,*zt;
589: PetscErrorCode ierr;
590: PetscInt i1,i2,n,i,row,node_max,nsz,sz;
591: const PetscInt *idx,*ns,*ii;
594: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595: node_max = a->inode.node_count;
596: ns = a->inode.size; /* Node Size array */
598: VecGetArrayRead(xx,&x);
599: VecGetArrayPair(zz,yy,&z,&y);
600: zt = z;
602: idx = a->j;
603: v1 = a->a;
604: ii = a->i;
606: for (i = 0,row = 0; i< node_max; ++i) {
607: nsz = ns[i];
608: n = ii[1] - ii[0];
609: ii += nsz;
610: sz = n; /* No of non zeros in this row */
611: /* Switch on the size of Node */
612: switch (nsz) { /* Each loop in 'case' is unrolled */
613: case 1:
614: sum1 = *zt++;
616: for (n = 0; n< sz-1; n+=2) {
617: i1 = idx[0]; /* The instructions are ordered to */
618: i2 = idx[1]; /* make the compiler's job easy */
619: idx += 2;
620: tmp0 = x[i1];
621: tmp1 = x[i2];
622: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623: }
625: if (n == sz-1) { /* Take care of the last nonzero */
626: tmp0 = x[*idx++];
627: sum1 += *v1++ * tmp0;
628: }
629: y[row++]=sum1;
630: break;
631: case 2:
632: sum1 = *zt++;
633: sum2 = *zt++;
634: v2 = v1 + n;
636: for (n = 0; n< sz-1; n+=2) {
637: i1 = idx[0];
638: i2 = idx[1];
639: idx += 2;
640: tmp0 = x[i1];
641: tmp1 = x[i2];
642: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644: }
645: if (n == sz-1) {
646: tmp0 = x[*idx++];
647: sum1 += *v1++ * tmp0;
648: sum2 += *v2++ * tmp0;
649: }
650: y[row++]=sum1;
651: y[row++]=sum2;
652: v1 =v2; /* Since the next block to be processed starts there*/
653: idx +=sz;
654: break;
655: case 3:
656: sum1 = *zt++;
657: sum2 = *zt++;
658: sum3 = *zt++;
659: v2 = v1 + n;
660: v3 = v2 + n;
662: for (n = 0; n< sz-1; n+=2) {
663: i1 = idx[0];
664: i2 = idx[1];
665: idx += 2;
666: tmp0 = x[i1];
667: tmp1 = x[i2];
668: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671: }
672: if (n == sz-1) {
673: tmp0 = x[*idx++];
674: sum1 += *v1++ * tmp0;
675: sum2 += *v2++ * tmp0;
676: sum3 += *v3++ * tmp0;
677: }
678: y[row++]=sum1;
679: y[row++]=sum2;
680: y[row++]=sum3;
681: v1 =v3; /* Since the next block to be processed starts there*/
682: idx +=2*sz;
683: break;
684: case 4:
685: sum1 = *zt++;
686: sum2 = *zt++;
687: sum3 = *zt++;
688: sum4 = *zt++;
689: v2 = v1 + n;
690: v3 = v2 + n;
691: v4 = v3 + n;
693: for (n = 0; n< sz-1; n+=2) {
694: i1 = idx[0];
695: i2 = idx[1];
696: idx += 2;
697: tmp0 = x[i1];
698: tmp1 = x[i2];
699: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703: }
704: if (n == sz-1) {
705: tmp0 = x[*idx++];
706: sum1 += *v1++ * tmp0;
707: sum2 += *v2++ * tmp0;
708: sum3 += *v3++ * tmp0;
709: sum4 += *v4++ * tmp0;
710: }
711: y[row++]=sum1;
712: y[row++]=sum2;
713: y[row++]=sum3;
714: y[row++]=sum4;
715: v1 =v4; /* Since the next block to be processed starts there*/
716: idx +=3*sz;
717: break;
718: case 5:
719: sum1 = *zt++;
720: sum2 = *zt++;
721: sum3 = *zt++;
722: sum4 = *zt++;
723: sum5 = *zt++;
724: v2 = v1 + n;
725: v3 = v2 + n;
726: v4 = v3 + n;
727: v5 = v4 + n;
729: for (n = 0; n<sz-1; n+=2) {
730: i1 = idx[0];
731: i2 = idx[1];
732: idx += 2;
733: tmp0 = x[i1];
734: tmp1 = x[i2];
735: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740: }
741: if (n == sz-1) {
742: tmp0 = x[*idx++];
743: sum1 += *v1++ * tmp0;
744: sum2 += *v2++ * tmp0;
745: sum3 += *v3++ * tmp0;
746: sum4 += *v4++ * tmp0;
747: sum5 += *v5++ * tmp0;
748: }
749: y[row++]=sum1;
750: y[row++]=sum2;
751: y[row++]=sum3;
752: y[row++]=sum4;
753: y[row++]=sum5;
754: v1 =v5; /* Since the next block to be processed starts there */
755: idx +=4*sz;
756: break;
757: default:
758: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759: }
760: }
761: VecRestoreArrayRead(xx,&x);
762: VecRestoreArrayPair(zz,yy,&z,&y);
763: PetscLogFlops(2.0*a->nz);
764: return(0);
765: }
767: /* ----------------------------------------------------------- */
770: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771: {
772: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
773: IS iscol = a->col,isrow = a->row;
774: PetscErrorCode ierr;
775: const PetscInt *r,*c,*rout,*cout;
776: PetscInt i,j,n = A->rmap->n,nz;
777: PetscInt node_max,*ns,row,nsz,aii,i0,i1;
778: const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
780: PetscScalar sum1,sum2,sum3,sum4,sum5;
781: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782: const PetscScalar *b;
785: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786: node_max = a->inode.node_count;
787: ns = a->inode.size; /* Node Size array */
789: VecGetArrayRead(bb,&b);
790: VecGetArray(xx,&x);
791: tmp = a->solve_work;
793: ISGetIndices(isrow,&rout); r = rout;
794: ISGetIndices(iscol,&cout); c = cout + (n-1);
796: /* forward solve the lower triangular */
797: tmps = tmp;
798: aa = a_a;
799: aj = a_j;
800: ad = a->diag;
802: for (i = 0,row = 0; i< node_max; ++i) {
803: nsz = ns[i];
804: aii = ai[row];
805: v1 = aa + aii;
806: vi = aj + aii;
807: nz = ad[row]- aii;
808: if (i < node_max-1) {
809: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810: * but our indexing to determine it's size could. */
811: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815: }
817: switch (nsz) { /* Each loop in 'case' is unrolled */
818: case 1:
819: sum1 = b[*r++];
820: for (j=0; j<nz-1; j+=2) {
821: i0 = vi[0];
822: i1 = vi[1];
823: vi +=2;
824: tmp0 = tmps[i0];
825: tmp1 = tmps[i1];
826: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827: }
828: if (j == nz-1) {
829: tmp0 = tmps[*vi++];
830: sum1 -= *v1++ *tmp0;
831: }
832: tmp[row++]=sum1;
833: break;
834: case 2:
835: sum1 = b[*r++];
836: sum2 = b[*r++];
837: v2 = aa + ai[row+1];
839: for (j=0; j<nz-1; j+=2) {
840: i0 = vi[0];
841: i1 = vi[1];
842: vi +=2;
843: tmp0 = tmps[i0];
844: tmp1 = tmps[i1];
845: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847: }
848: if (j == nz-1) {
849: tmp0 = tmps[*vi++];
850: sum1 -= *v1++ *tmp0;
851: sum2 -= *v2++ *tmp0;
852: }
853: sum2 -= *v2++ *sum1;
854: tmp[row++]=sum1;
855: tmp[row++]=sum2;
856: break;
857: case 3:
858: sum1 = b[*r++];
859: sum2 = b[*r++];
860: sum3 = b[*r++];
861: v2 = aa + ai[row+1];
862: v3 = aa + ai[row+2];
864: for (j=0; j<nz-1; j+=2) {
865: i0 = vi[0];
866: i1 = vi[1];
867: vi +=2;
868: tmp0 = tmps[i0];
869: tmp1 = tmps[i1];
870: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873: }
874: if (j == nz-1) {
875: tmp0 = tmps[*vi++];
876: sum1 -= *v1++ *tmp0;
877: sum2 -= *v2++ *tmp0;
878: sum3 -= *v3++ *tmp0;
879: }
880: sum2 -= *v2++ * sum1;
881: sum3 -= *v3++ * sum1;
882: sum3 -= *v3++ * sum2;
884: tmp[row++]=sum1;
885: tmp[row++]=sum2;
886: tmp[row++]=sum3;
887: break;
889: case 4:
890: sum1 = b[*r++];
891: sum2 = b[*r++];
892: sum3 = b[*r++];
893: sum4 = b[*r++];
894: v2 = aa + ai[row+1];
895: v3 = aa + ai[row+2];
896: v4 = aa + ai[row+3];
898: for (j=0; j<nz-1; j+=2) {
899: i0 = vi[0];
900: i1 = vi[1];
901: vi +=2;
902: tmp0 = tmps[i0];
903: tmp1 = tmps[i1];
904: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908: }
909: if (j == nz-1) {
910: tmp0 = tmps[*vi++];
911: sum1 -= *v1++ *tmp0;
912: sum2 -= *v2++ *tmp0;
913: sum3 -= *v3++ *tmp0;
914: sum4 -= *v4++ *tmp0;
915: }
916: sum2 -= *v2++ * sum1;
917: sum3 -= *v3++ * sum1;
918: sum4 -= *v4++ * sum1;
919: sum3 -= *v3++ * sum2;
920: sum4 -= *v4++ * sum2;
921: sum4 -= *v4++ * sum3;
923: tmp[row++]=sum1;
924: tmp[row++]=sum2;
925: tmp[row++]=sum3;
926: tmp[row++]=sum4;
927: break;
928: case 5:
929: sum1 = b[*r++];
930: sum2 = b[*r++];
931: sum3 = b[*r++];
932: sum4 = b[*r++];
933: sum5 = b[*r++];
934: v2 = aa + ai[row+1];
935: v3 = aa + ai[row+2];
936: v4 = aa + ai[row+3];
937: v5 = aa + ai[row+4];
939: for (j=0; j<nz-1; j+=2) {
940: i0 = vi[0];
941: i1 = vi[1];
942: vi +=2;
943: tmp0 = tmps[i0];
944: tmp1 = tmps[i1];
945: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950: }
951: if (j == nz-1) {
952: tmp0 = tmps[*vi++];
953: sum1 -= *v1++ *tmp0;
954: sum2 -= *v2++ *tmp0;
955: sum3 -= *v3++ *tmp0;
956: sum4 -= *v4++ *tmp0;
957: sum5 -= *v5++ *tmp0;
958: }
960: sum2 -= *v2++ * sum1;
961: sum3 -= *v3++ * sum1;
962: sum4 -= *v4++ * sum1;
963: sum5 -= *v5++ * sum1;
964: sum3 -= *v3++ * sum2;
965: sum4 -= *v4++ * sum2;
966: sum5 -= *v5++ * sum2;
967: sum4 -= *v4++ * sum3;
968: sum5 -= *v5++ * sum3;
969: sum5 -= *v5++ * sum4;
971: tmp[row++]=sum1;
972: tmp[row++]=sum2;
973: tmp[row++]=sum3;
974: tmp[row++]=sum4;
975: tmp[row++]=sum5;
976: break;
977: default:
978: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979: }
980: }
981: /* backward solve the upper triangular */
982: for (i=node_max -1,row = n-1; i>=0; i--) {
983: nsz = ns[i];
984: aii = ai[row+1] -1;
985: v1 = aa + aii;
986: vi = aj + aii;
987: nz = aii- ad[row];
988: switch (nsz) { /* Each loop in 'case' is unrolled */
989: case 1:
990: sum1 = tmp[row];
992: for (j=nz; j>1; j-=2) {
993: vi -=2;
994: i0 = vi[2];
995: i1 = vi[1];
996: tmp0 = tmps[i0];
997: tmp1 = tmps[i1];
998: v1 -= 2;
999: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000: }
1001: if (j==1) {
1002: tmp0 = tmps[*vi--];
1003: sum1 -= *v1-- * tmp0;
1004: }
1005: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006: break;
1007: case 2:
1008: sum1 = tmp[row];
1009: sum2 = tmp[row -1];
1010: v2 = aa + ai[row]-1;
1011: for (j=nz; j>1; j-=2) {
1012: vi -=2;
1013: i0 = vi[2];
1014: i1 = vi[1];
1015: tmp0 = tmps[i0];
1016: tmp1 = tmps[i1];
1017: v1 -= 2;
1018: v2 -= 2;
1019: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021: }
1022: if (j==1) {
1023: tmp0 = tmps[*vi--];
1024: sum1 -= *v1-- * tmp0;
1025: sum2 -= *v2-- * tmp0;
1026: }
1028: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029: sum2 -= *v2-- * tmp0;
1030: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031: break;
1032: case 3:
1033: sum1 = tmp[row];
1034: sum2 = tmp[row -1];
1035: sum3 = tmp[row -2];
1036: v2 = aa + ai[row]-1;
1037: v3 = aa + ai[row -1]-1;
1038: for (j=nz; j>1; j-=2) {
1039: vi -=2;
1040: i0 = vi[2];
1041: i1 = vi[1];
1042: tmp0 = tmps[i0];
1043: tmp1 = tmps[i1];
1044: v1 -= 2;
1045: v2 -= 2;
1046: v3 -= 2;
1047: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050: }
1051: if (j==1) {
1052: tmp0 = tmps[*vi--];
1053: sum1 -= *v1-- * tmp0;
1054: sum2 -= *v2-- * tmp0;
1055: sum3 -= *v3-- * tmp0;
1056: }
1057: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058: sum2 -= *v2-- * tmp0;
1059: sum3 -= *v3-- * tmp0;
1060: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061: sum3 -= *v3-- * tmp0;
1062: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1064: break;
1065: case 4:
1066: sum1 = tmp[row];
1067: sum2 = tmp[row -1];
1068: sum3 = tmp[row -2];
1069: sum4 = tmp[row -3];
1070: v2 = aa + ai[row]-1;
1071: v3 = aa + ai[row -1]-1;
1072: v4 = aa + ai[row -2]-1;
1074: for (j=nz; j>1; j-=2) {
1075: vi -=2;
1076: i0 = vi[2];
1077: i1 = vi[1];
1078: tmp0 = tmps[i0];
1079: tmp1 = tmps[i1];
1080: v1 -= 2;
1081: v2 -= 2;
1082: v3 -= 2;
1083: v4 -= 2;
1084: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088: }
1089: if (j==1) {
1090: tmp0 = tmps[*vi--];
1091: sum1 -= *v1-- * tmp0;
1092: sum2 -= *v2-- * tmp0;
1093: sum3 -= *v3-- * tmp0;
1094: sum4 -= *v4-- * tmp0;
1095: }
1097: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098: sum2 -= *v2-- * tmp0;
1099: sum3 -= *v3-- * tmp0;
1100: sum4 -= *v4-- * tmp0;
1101: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102: sum3 -= *v3-- * tmp0;
1103: sum4 -= *v4-- * tmp0;
1104: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105: sum4 -= *v4-- * tmp0;
1106: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107: break;
1108: case 5:
1109: sum1 = tmp[row];
1110: sum2 = tmp[row -1];
1111: sum3 = tmp[row -2];
1112: sum4 = tmp[row -3];
1113: sum5 = tmp[row -4];
1114: v2 = aa + ai[row]-1;
1115: v3 = aa + ai[row -1]-1;
1116: v4 = aa + ai[row -2]-1;
1117: v5 = aa + ai[row -3]-1;
1118: for (j=nz; j>1; j-=2) {
1119: vi -= 2;
1120: i0 = vi[2];
1121: i1 = vi[1];
1122: tmp0 = tmps[i0];
1123: tmp1 = tmps[i1];
1124: v1 -= 2;
1125: v2 -= 2;
1126: v3 -= 2;
1127: v4 -= 2;
1128: v5 -= 2;
1129: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134: }
1135: if (j==1) {
1136: tmp0 = tmps[*vi--];
1137: sum1 -= *v1-- * tmp0;
1138: sum2 -= *v2-- * tmp0;
1139: sum3 -= *v3-- * tmp0;
1140: sum4 -= *v4-- * tmp0;
1141: sum5 -= *v5-- * tmp0;
1142: }
1144: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145: sum2 -= *v2-- * tmp0;
1146: sum3 -= *v3-- * tmp0;
1147: sum4 -= *v4-- * tmp0;
1148: sum5 -= *v5-- * tmp0;
1149: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150: sum3 -= *v3-- * tmp0;
1151: sum4 -= *v4-- * tmp0;
1152: sum5 -= *v5-- * tmp0;
1153: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154: sum4 -= *v4-- * tmp0;
1155: sum5 -= *v5-- * tmp0;
1156: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157: sum5 -= *v5-- * tmp0;
1158: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159: break;
1160: default:
1161: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162: }
1163: }
1164: ISRestoreIndices(isrow,&rout);
1165: ISRestoreIndices(iscol,&cout);
1166: VecRestoreArrayRead(bb,&b);
1167: VecRestoreArray(xx,&x);
1168: PetscLogFlops(2.0*a->nz - A->cmap->n);
1169: return(0);
1170: }
1174: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175: {
1176: Mat C =B;
1177: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178: IS isrow = b->row,isicol = b->icol;
1179: PetscErrorCode ierr;
1180: const PetscInt *r,*ic,*ics;
1181: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182: PetscInt i,j,k,nz,nzL,row,*pj;
1183: const PetscInt *ajtmp,*bjtmp;
1184: MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185: const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186: FactorShiftCtx sctx;
1187: const PetscInt *ddiag;
1188: PetscReal rs;
1189: MatScalar d;
1190: PetscInt inod,nodesz,node_max,col;
1191: const PetscInt *ns;
1192: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1195: /* MatPivotSetUp(): initialize shift context sctx */
1196: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1198: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199: ddiag = a->diag;
1200: sctx.shift_top = info->zeropivot;
1201: for (i=0; i<n; i++) {
1202: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203: d = (aa)[ddiag[i]];
1204: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205: v = aa+ai[i];
1206: nz = ai[i+1] - ai[i];
1207: for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208: if (rs>sctx.shift_top) sctx.shift_top = rs;
1209: }
1210: sctx.shift_top *= 1.1;
1211: sctx.nshift_max = 5;
1212: sctx.shift_lo = 0.;
1213: sctx.shift_hi = 1.;
1214: }
1216: ISGetIndices(isrow,&r);
1217: ISGetIndices(isicol,&ic);
1219: PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);
1220: ics = ic;
1222: node_max = a->inode.node_count;
1223: ns = a->inode.size;
1224: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1226: /* If max inode size > 4, split it into two inodes.*/
1227: /* also map the inode sizes according to the ordering */
1228: PetscMalloc1(n+1,&tmp_vec1);
1229: for (i=0,j=0; i<node_max; ++i,++j) {
1230: if (ns[i] > 4) {
1231: tmp_vec1[j] = 4;
1232: ++j;
1233: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234: } else {
1235: tmp_vec1[j] = ns[i];
1236: }
1237: }
1238: /* Use the correct node_max */
1239: node_max = j;
1241: /* Now reorder the inode info based on mat re-ordering info */
1242: /* First create a row -> inode_size_array_index map */
1243: PetscMalloc1(n+1,&nsmap);
1244: PetscMalloc1(node_max+1,&tmp_vec2);
1245: for (i=0,row=0; i<node_max; i++) {
1246: nodesz = tmp_vec1[i];
1247: for (j=0; j<nodesz; j++,row++) {
1248: nsmap[row] = i;
1249: }
1250: }
1251: /* Using nsmap, create a reordered ns structure */
1252: for (i=0,j=0; i< node_max; i++) {
1253: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1254: tmp_vec2[i] = nodesz;
1255: j += nodesz;
1256: }
1257: PetscFree(nsmap);
1258: PetscFree(tmp_vec1);
1260: /* Now use the correct ns */
1261: ns = tmp_vec2;
1263: do {
1264: sctx.newshift = PETSC_FALSE;
1265: /* Now loop over each block-row, and do the factorization */
1266: for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267: nodesz = ns[inod];
1269: switch (nodesz) {
1270: case 1:
1271: /*----------*/
1272: /* zero rtmp1 */
1273: /* L part */
1274: nz = bi[i+1] - bi[i];
1275: bjtmp = bj + bi[i];
1276: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1278: /* U part */
1279: nz = bdiag[i]-bdiag[i+1];
1280: bjtmp = bj + bdiag[i+1]+1;
1281: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1283: /* load in initial (unfactored row) */
1284: nz = ai[r[i]+1] - ai[r[i]];
1285: ajtmp = aj + ai[r[i]];
1286: v = aa + ai[r[i]];
1287: for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1289: /* ZeropivotApply() */
1290: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1292: /* elimination */
1293: bjtmp = bj + bi[i];
1294: row = *bjtmp++;
1295: nzL = bi[i+1] - bi[i];
1296: for (k=0; k < nzL; k++) {
1297: pc = rtmp1 + row;
1298: if (*pc != 0.0) {
1299: pv = b->a + bdiag[row];
1300: mul1 = *pc * (*pv);
1301: *pc = mul1;
1302: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303: pv = b->a + bdiag[row+1]+1;
1304: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306: PetscLogFlops(1+2*nz);
1307: }
1308: row = *bjtmp++;
1309: }
1311: /* finished row so stick it into b->a */
1312: rs = 0.0;
1313: /* L part */
1314: pv = b->a + bi[i];
1315: pj = b->j + bi[i];
1316: nz = bi[i+1] - bi[i];
1317: for (j=0; j<nz; j++) {
1318: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319: }
1321: /* U part */
1322: pv = b->a + bdiag[i+1]+1;
1323: pj = b->j + bdiag[i+1]+1;
1324: nz = bdiag[i] - bdiag[i+1]-1;
1325: for (j=0; j<nz; j++) {
1326: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327: }
1329: /* Check zero pivot */
1330: sctx.rs = rs;
1331: sctx.pv = rtmp1[i];
1332: MatPivotCheck(B,A,info,&sctx,i);
1333: if (sctx.newshift) break;
1335: /* Mark diagonal and invert diagonal for simplier triangular solves */
1336: pv = b->a + bdiag[i];
1337: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338: break;
1340: case 2:
1341: /*----------*/
1342: /* zero rtmp1 and rtmp2 */
1343: /* L part */
1344: nz = bi[i+1] - bi[i];
1345: bjtmp = bj + bi[i];
1346: for (j=0; j<nz; j++) {
1347: col = bjtmp[j];
1348: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349: }
1351: /* U part */
1352: nz = bdiag[i]-bdiag[i+1];
1353: bjtmp = bj + bdiag[i+1]+1;
1354: for (j=0; j<nz; j++) {
1355: col = bjtmp[j];
1356: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357: }
1359: /* load in initial (unfactored row) */
1360: nz = ai[r[i]+1] - ai[r[i]];
1361: ajtmp = aj + ai[r[i]];
1362: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363: for (j=0; j<nz; j++) {
1364: col = ics[ajtmp[j]];
1365: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366: }
1367: /* ZeropivotApply(): shift the diagonal of the matrix */
1368: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1370: /* elimination */
1371: bjtmp = bj + bi[i];
1372: row = *bjtmp++; /* pivot row */
1373: nzL = bi[i+1] - bi[i];
1374: for (k=0; k < nzL; k++) {
1375: pc1 = rtmp1 + row;
1376: pc2 = rtmp2 + row;
1377: if (*pc1 != 0.0 || *pc2 != 0.0) {
1378: pv = b->a + bdiag[row];
1379: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380: *pc1 = mul1; *pc2 = mul2;
1382: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1383: pv = b->a + bdiag[row+1]+1;
1384: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385: for (j=0; j<nz; j++) {
1386: col = pj[j];
1387: rtmp1[col] -= mul1 * pv[j];
1388: rtmp2[col] -= mul2 * pv[j];
1389: }
1390: PetscLogFlops(2+4*nz);
1391: }
1392: row = *bjtmp++;
1393: }
1395: /* finished row i; check zero pivot, then stick row i into b->a */
1396: rs = 0.0;
1397: /* L part */
1398: pc1 = b->a + bi[i];
1399: pj = b->j + bi[i];
1400: nz = bi[i+1] - bi[i];
1401: for (j=0; j<nz; j++) {
1402: col = pj[j];
1403: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404: }
1405: /* U part */
1406: pc1 = b->a + bdiag[i+1]+1;
1407: pj = b->j + bdiag[i+1]+1;
1408: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409: for (j=0; j<nz; j++) {
1410: col = pj[j];
1411: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412: }
1414: sctx.rs = rs;
1415: sctx.pv = rtmp1[i];
1416: MatPivotCheck(B,A,info,&sctx,i);
1417: if (sctx.newshift) break;
1418: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1419: *pc1 = 1.0/sctx.pv;
1421: /* Now take care of diagonal 2x2 block. */
1422: pc2 = rtmp2 + i;
1423: if (*pc2 != 0.0) {
1424: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425: *pc2 = mul1; /* insert L entry */
1426: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1427: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428: for (j=0; j<nz; j++) {
1429: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430: }
1431: PetscLogFlops(1+2*nz);
1432: }
1434: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435: rs = 0.0;
1436: /* L part */
1437: pc2 = b->a + bi[i+1];
1438: pj = b->j + bi[i+1];
1439: nz = bi[i+2] - bi[i+1];
1440: for (j=0; j<nz; j++) {
1441: col = pj[j];
1442: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443: }
1444: /* U part */
1445: pc2 = b->a + bdiag[i+2]+1;
1446: pj = b->j + bdiag[i+2]+1;
1447: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448: for (j=0; j<nz; j++) {
1449: col = pj[j];
1450: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451: }
1453: sctx.rs = rs;
1454: sctx.pv = rtmp2[i+1];
1455: MatPivotCheck(B,A,info,&sctx,i+1);
1456: if (sctx.newshift) break;
1457: pc2 = b->a + bdiag[i+1];
1458: *pc2 = 1.0/sctx.pv;
1459: break;
1461: case 3:
1462: /*----------*/
1463: /* zero rtmp */
1464: /* L part */
1465: nz = bi[i+1] - bi[i];
1466: bjtmp = bj + bi[i];
1467: for (j=0; j<nz; j++) {
1468: col = bjtmp[j];
1469: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470: }
1472: /* U part */
1473: nz = bdiag[i]-bdiag[i+1];
1474: bjtmp = bj + bdiag[i+1]+1;
1475: for (j=0; j<nz; j++) {
1476: col = bjtmp[j];
1477: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478: }
1480: /* load in initial (unfactored row) */
1481: nz = ai[r[i]+1] - ai[r[i]];
1482: ajtmp = aj + ai[r[i]];
1483: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484: for (j=0; j<nz; j++) {
1485: col = ics[ajtmp[j]];
1486: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487: }
1488: /* ZeropivotApply(): shift the diagonal of the matrix */
1489: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1491: /* elimination */
1492: bjtmp = bj + bi[i];
1493: row = *bjtmp++; /* pivot row */
1494: nzL = bi[i+1] - bi[i];
1495: for (k=0; k < nzL; k++) {
1496: pc1 = rtmp1 + row;
1497: pc2 = rtmp2 + row;
1498: pc3 = rtmp3 + row;
1499: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500: pv = b->a + bdiag[row];
1501: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1504: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1505: pv = b->a + bdiag[row+1]+1;
1506: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507: for (j=0; j<nz; j++) {
1508: col = pj[j];
1509: rtmp1[col] -= mul1 * pv[j];
1510: rtmp2[col] -= mul2 * pv[j];
1511: rtmp3[col] -= mul3 * pv[j];
1512: }
1513: PetscLogFlops(3+6*nz);
1514: }
1515: row = *bjtmp++;
1516: }
1518: /* finished row i; check zero pivot, then stick row i into b->a */
1519: rs = 0.0;
1520: /* L part */
1521: pc1 = b->a + bi[i];
1522: pj = b->j + bi[i];
1523: nz = bi[i+1] - bi[i];
1524: for (j=0; j<nz; j++) {
1525: col = pj[j];
1526: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527: }
1528: /* U part */
1529: pc1 = b->a + bdiag[i+1]+1;
1530: pj = b->j + bdiag[i+1]+1;
1531: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532: for (j=0; j<nz; j++) {
1533: col = pj[j];
1534: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535: }
1537: sctx.rs = rs;
1538: sctx.pv = rtmp1[i];
1539: MatPivotCheck(B,A,info,&sctx,i);
1540: if (sctx.newshift) break;
1541: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1542: *pc1 = 1.0/sctx.pv;
1544: /* Now take care of 1st column of diagonal 3x3 block. */
1545: pc2 = rtmp2 + i;
1546: pc3 = rtmp3 + i;
1547: if (*pc2 != 0.0 || *pc3 != 0.0) {
1548: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1551: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552: for (j=0; j<nz; j++) {
1553: col = pj[j];
1554: rtmp2[col] -= mul2 * rtmp1[col];
1555: rtmp3[col] -= mul3 * rtmp1[col];
1556: }
1557: PetscLogFlops(2+4*nz);
1558: }
1560: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561: rs = 0.0;
1562: /* L part */
1563: pc2 = b->a + bi[i+1];
1564: pj = b->j + bi[i+1];
1565: nz = bi[i+2] - bi[i+1];
1566: for (j=0; j<nz; j++) {
1567: col = pj[j];
1568: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569: }
1570: /* U part */
1571: pc2 = b->a + bdiag[i+2]+1;
1572: pj = b->j + bdiag[i+2]+1;
1573: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574: for (j=0; j<nz; j++) {
1575: col = pj[j];
1576: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577: }
1579: sctx.rs = rs;
1580: sctx.pv = rtmp2[i+1];
1581: MatPivotCheck(B,A,info,&sctx,i+1);
1582: if (sctx.newshift) break;
1583: pc2 = b->a + bdiag[i+1];
1584: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1586: /* Now take care of 2nd column of diagonal 3x3 block. */
1587: pc3 = rtmp3 + i+1;
1588: if (*pc3 != 0.0) {
1589: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1591: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592: for (j=0; j<nz; j++) {
1593: col = pj[j];
1594: rtmp3[col] -= mul3 * rtmp2[col];
1595: }
1596: PetscLogFlops(1+2*nz);
1597: }
1599: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600: rs = 0.0;
1601: /* L part */
1602: pc3 = b->a + bi[i+2];
1603: pj = b->j + bi[i+2];
1604: nz = bi[i+3] - bi[i+2];
1605: for (j=0; j<nz; j++) {
1606: col = pj[j];
1607: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608: }
1609: /* U part */
1610: pc3 = b->a + bdiag[i+3]+1;
1611: pj = b->j + bdiag[i+3]+1;
1612: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613: for (j=0; j<nz; j++) {
1614: col = pj[j];
1615: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616: }
1618: sctx.rs = rs;
1619: sctx.pv = rtmp3[i+2];
1620: MatPivotCheck(B,A,info,&sctx,i+2);
1621: if (sctx.newshift) break;
1622: pc3 = b->a + bdiag[i+2];
1623: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624: break;
1625: case 4:
1626: /*----------*/
1627: /* zero rtmp */
1628: /* L part */
1629: nz = bi[i+1] - bi[i];
1630: bjtmp = bj + bi[i];
1631: for (j=0; j<nz; j++) {
1632: col = bjtmp[j];
1633: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634: }
1636: /* U part */
1637: nz = bdiag[i]-bdiag[i+1];
1638: bjtmp = bj + bdiag[i+1]+1;
1639: for (j=0; j<nz; j++) {
1640: col = bjtmp[j];
1641: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642: }
1644: /* load in initial (unfactored row) */
1645: nz = ai[r[i]+1] - ai[r[i]];
1646: ajtmp = aj + ai[r[i]];
1647: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648: for (j=0; j<nz; j++) {
1649: col = ics[ajtmp[j]];
1650: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651: }
1652: /* ZeropivotApply(): shift the diagonal of the matrix */
1653: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1655: /* elimination */
1656: bjtmp = bj + bi[i];
1657: row = *bjtmp++; /* pivot row */
1658: nzL = bi[i+1] - bi[i];
1659: for (k=0; k < nzL; k++) {
1660: pc1 = rtmp1 + row;
1661: pc2 = rtmp2 + row;
1662: pc3 = rtmp3 + row;
1663: pc4 = rtmp4 + row;
1664: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665: pv = b->a + bdiag[row];
1666: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1669: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670: pv = b->a + bdiag[row+1]+1;
1671: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672: for (j=0; j<nz; j++) {
1673: col = pj[j];
1674: rtmp1[col] -= mul1 * pv[j];
1675: rtmp2[col] -= mul2 * pv[j];
1676: rtmp3[col] -= mul3 * pv[j];
1677: rtmp4[col] -= mul4 * pv[j];
1678: }
1679: PetscLogFlops(4+8*nz);
1680: }
1681: row = *bjtmp++;
1682: }
1684: /* finished row i; check zero pivot, then stick row i into b->a */
1685: rs = 0.0;
1686: /* L part */
1687: pc1 = b->a + bi[i];
1688: pj = b->j + bi[i];
1689: nz = bi[i+1] - bi[i];
1690: for (j=0; j<nz; j++) {
1691: col = pj[j];
1692: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693: }
1694: /* U part */
1695: pc1 = b->a + bdiag[i+1]+1;
1696: pj = b->j + bdiag[i+1]+1;
1697: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698: for (j=0; j<nz; j++) {
1699: col = pj[j];
1700: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701: }
1703: sctx.rs = rs;
1704: sctx.pv = rtmp1[i];
1705: MatPivotCheck(B,A,info,&sctx,i);
1706: if (sctx.newshift) break;
1707: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1708: *pc1 = 1.0/sctx.pv;
1710: /* Now take care of 1st column of diagonal 4x4 block. */
1711: pc2 = rtmp2 + i;
1712: pc3 = rtmp3 + i;
1713: pc4 = rtmp4 + i;
1714: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717: mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1719: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720: for (j=0; j<nz; j++) {
1721: col = pj[j];
1722: rtmp2[col] -= mul2 * rtmp1[col];
1723: rtmp3[col] -= mul3 * rtmp1[col];
1724: rtmp4[col] -= mul4 * rtmp1[col];
1725: }
1726: PetscLogFlops(3+6*nz);
1727: }
1729: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730: rs = 0.0;
1731: /* L part */
1732: pc2 = b->a + bi[i+1];
1733: pj = b->j + bi[i+1];
1734: nz = bi[i+2] - bi[i+1];
1735: for (j=0; j<nz; j++) {
1736: col = pj[j];
1737: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738: }
1739: /* U part */
1740: pc2 = b->a + bdiag[i+2]+1;
1741: pj = b->j + bdiag[i+2]+1;
1742: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743: for (j=0; j<nz; j++) {
1744: col = pj[j];
1745: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746: }
1748: sctx.rs = rs;
1749: sctx.pv = rtmp2[i+1];
1750: MatPivotCheck(B,A,info,&sctx,i+1);
1751: if (sctx.newshift) break;
1752: pc2 = b->a + bdiag[i+1];
1753: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1755: /* Now take care of 2nd column of diagonal 4x4 block. */
1756: pc3 = rtmp3 + i+1;
1757: pc4 = rtmp4 + i+1;
1758: if (*pc3 != 0.0 || *pc4 != 0.0) {
1759: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760: mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1762: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763: for (j=0; j<nz; j++) {
1764: col = pj[j];
1765: rtmp3[col] -= mul3 * rtmp2[col];
1766: rtmp4[col] -= mul4 * rtmp2[col];
1767: }
1768: PetscLogFlops(4*nz);
1769: }
1771: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772: rs = 0.0;
1773: /* L part */
1774: pc3 = b->a + bi[i+2];
1775: pj = b->j + bi[i+2];
1776: nz = bi[i+3] - bi[i+2];
1777: for (j=0; j<nz; j++) {
1778: col = pj[j];
1779: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780: }
1781: /* U part */
1782: pc3 = b->a + bdiag[i+3]+1;
1783: pj = b->j + bdiag[i+3]+1;
1784: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785: for (j=0; j<nz; j++) {
1786: col = pj[j];
1787: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788: }
1790: sctx.rs = rs;
1791: sctx.pv = rtmp3[i+2];
1792: MatPivotCheck(B,A,info,&sctx,i+2);
1793: if (sctx.newshift) break;
1794: pc3 = b->a + bdiag[i+2];
1795: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1797: /* Now take care of 3rd column of diagonal 4x4 block. */
1798: pc4 = rtmp4 + i+2;
1799: if (*pc4 != 0.0) {
1800: mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801: pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1802: nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803: for (j=0; j<nz; j++) {
1804: col = pj[j];
1805: rtmp4[col] -= mul4 * rtmp3[col];
1806: }
1807: PetscLogFlops(1+2*nz);
1808: }
1810: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811: rs = 0.0;
1812: /* L part */
1813: pc4 = b->a + bi[i+3];
1814: pj = b->j + bi[i+3];
1815: nz = bi[i+4] - bi[i+3];
1816: for (j=0; j<nz; j++) {
1817: col = pj[j];
1818: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819: }
1820: /* U part */
1821: pc4 = b->a + bdiag[i+4]+1;
1822: pj = b->j + bdiag[i+4]+1;
1823: nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824: for (j=0; j<nz; j++) {
1825: col = pj[j];
1826: pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827: }
1829: sctx.rs = rs;
1830: sctx.pv = rtmp4[i+3];
1831: MatPivotCheck(B,A,info,&sctx,i+3);
1832: if (sctx.newshift) break;
1833: pc4 = b->a + bdiag[i+3];
1834: *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835: break;
1837: default:
1838: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839: }
1840: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841: i += nodesz; /* Update the row */
1842: }
1844: /* MatPivotRefine() */
1845: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846: /*
1847: * if no shift in this attempt & shifting & started shifting & can refine,
1848: * then try lower shift
1849: */
1850: sctx.shift_hi = sctx.shift_fraction;
1851: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1853: sctx.newshift = PETSC_TRUE;
1854: sctx.nshift++;
1855: }
1856: } while (sctx.newshift);
1858: PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);
1859: PetscFree(tmp_vec2);
1860: ISRestoreIndices(isicol,&ic);
1861: ISRestoreIndices(isrow,&r);
1863: if (b->inode.size) {
1864: C->ops->solve = MatSolve_SeqAIJ_Inode;
1865: } else {
1866: C->ops->solve = MatSolve_SeqAIJ;
1867: }
1868: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1869: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1870: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871: C->ops->matsolve = MatMatSolve_SeqAIJ;
1872: C->assembled = PETSC_TRUE;
1873: C->preallocated = PETSC_TRUE;
1875: PetscLogFlops(C->cmap->n);
1877: /* MatShiftView(A,info,&sctx) */
1878: if (sctx.nshift) {
1879: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1881: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1883: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1885: }
1886: }
1887: return(0);
1888: }
1892: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893: {
1894: Mat C = B;
1895: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1897: PetscErrorCode ierr;
1898: const PetscInt *r,*ic,*c,*ics;
1899: PetscInt n = A->rmap->n,*bi = b->i;
1900: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901: PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1902: PetscInt *ai = a->i,*aj = a->j;
1903: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904: PetscScalar mul1,mul2,mul3,tmp;
1905: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907: PetscReal rs=0.0;
1908: FactorShiftCtx sctx;
1911: sctx.shift_top = 0;
1912: sctx.nshift_max = 0;
1913: sctx.shift_lo = 0;
1914: sctx.shift_hi = 0;
1915: sctx.shift_fraction = 0;
1917: /* if both shift schemes are chosen by user, only use info->shiftpd */
1918: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919: sctx.shift_top = 0;
1920: for (i=0; i<n; i++) {
1921: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922: rs = 0.0;
1923: ajtmp = aj + ai[i];
1924: rtmp1 = aa + ai[i];
1925: nz = ai[i+1] - ai[i];
1926: for (j=0; j<nz; j++) {
1927: if (*ajtmp != i) {
1928: rs += PetscAbsScalar(*rtmp1++);
1929: } else {
1930: rs -= PetscRealPart(*rtmp1++);
1931: }
1932: ajtmp++;
1933: }
1934: if (rs>sctx.shift_top) sctx.shift_top = rs;
1935: }
1936: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937: sctx.shift_top *= 1.1;
1938: sctx.nshift_max = 5;
1939: sctx.shift_lo = 0.;
1940: sctx.shift_hi = 1.;
1941: }
1942: sctx.shift_amount = 0;
1943: sctx.nshift = 0;
1945: ISGetIndices(isrow,&r);
1946: ISGetIndices(iscol,&c);
1947: ISGetIndices(isicol,&ic);
1948: PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);
1949: ics = ic;
1951: node_max = a->inode.node_count;
1952: ns = a->inode.size;
1953: if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1955: /* If max inode size > 3, split it into two inodes.*/
1956: /* also map the inode sizes according to the ordering */
1957: PetscMalloc1(n+1,&tmp_vec1);
1958: for (i=0,j=0; i<node_max; ++i,++j) {
1959: if (ns[i]>3) {
1960: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1961: ++j;
1962: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963: } else {
1964: tmp_vec1[j] = ns[i];
1965: }
1966: }
1967: /* Use the correct node_max */
1968: node_max = j;
1970: /* Now reorder the inode info based on mat re-ordering info */
1971: /* First create a row -> inode_size_array_index map */
1972: PetscMalloc1(n+1,&nsmap);
1973: PetscMalloc1(node_max+1,&tmp_vec2);
1974: for (i=0,row=0; i<node_max; i++) {
1975: nodesz = tmp_vec1[i];
1976: for (j=0; j<nodesz; j++,row++) {
1977: nsmap[row] = i;
1978: }
1979: }
1980: /* Using nsmap, create a reordered ns structure */
1981: for (i=0,j=0; i< node_max; i++) {
1982: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1983: tmp_vec2[i] = nodesz;
1984: j += nodesz;
1985: }
1986: PetscFree(nsmap);
1987: PetscFree(tmp_vec1);
1988: /* Now use the correct ns */
1989: ns = tmp_vec2;
1991: do {
1992: sctx.newshift = PETSC_FALSE;
1993: /* Now loop over each block-row, and do the factorization */
1994: for (i=0,row=0; i<node_max; i++) {
1995: nodesz = ns[i];
1996: nz = bi[row+1] - bi[row];
1997: bjtmp = bj + bi[row];
1999: switch (nodesz) {
2000: case 1:
2001: for (j=0; j<nz; j++) {
2002: idx = bjtmp[j];
2003: rtmp11[idx] = 0.0;
2004: }
2006: /* load in initial (unfactored row) */
2007: idx = r[row];
2008: nz_tmp = ai[idx+1] - ai[idx];
2009: ajtmp = aj + ai[idx];
2010: v1 = aa + ai[idx];
2012: for (j=0; j<nz_tmp; j++) {
2013: idx = ics[ajtmp[j]];
2014: rtmp11[idx] = v1[j];
2015: }
2016: rtmp11[ics[r[row]]] += sctx.shift_amount;
2018: prow = *bjtmp++;
2019: while (prow < row) {
2020: pc1 = rtmp11 + prow;
2021: if (*pc1 != 0.0) {
2022: pv = ba + bd[prow];
2023: pj = nbj + bd[prow];
2024: mul1 = *pc1 * *pv++;
2025: *pc1 = mul1;
2026: nz_tmp = bi[prow+1] - bd[prow] - 1;
2027: PetscLogFlops(1+2*nz_tmp);
2028: for (j=0; j<nz_tmp; j++) {
2029: tmp = pv[j];
2030: idx = pj[j];
2031: rtmp11[idx] -= mul1 * tmp;
2032: }
2033: }
2034: prow = *bjtmp++;
2035: }
2036: pj = bj + bi[row];
2037: pc1 = ba + bi[row];
2039: sctx.pv = rtmp11[row];
2040: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041: rs = 0.0;
2042: for (j=0; j<nz; j++) {
2043: idx = pj[j];
2044: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046: }
2047: sctx.rs = rs;
2048: MatPivotCheck(B,A,info,&sctx,row);
2049: if (sctx.newshift) goto endofwhile;
2050: break;
2052: case 2:
2053: for (j=0; j<nz; j++) {
2054: idx = bjtmp[j];
2055: rtmp11[idx] = 0.0;
2056: rtmp22[idx] = 0.0;
2057: }
2059: /* load in initial (unfactored row) */
2060: idx = r[row];
2061: nz_tmp = ai[idx+1] - ai[idx];
2062: ajtmp = aj + ai[idx];
2063: v1 = aa + ai[idx];
2064: v2 = aa + ai[idx+1];
2065: for (j=0; j<nz_tmp; j++) {
2066: idx = ics[ajtmp[j]];
2067: rtmp11[idx] = v1[j];
2068: rtmp22[idx] = v2[j];
2069: }
2070: rtmp11[ics[r[row]]] += sctx.shift_amount;
2071: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2073: prow = *bjtmp++;
2074: while (prow < row) {
2075: pc1 = rtmp11 + prow;
2076: pc2 = rtmp22 + prow;
2077: if (*pc1 != 0.0 || *pc2 != 0.0) {
2078: pv = ba + bd[prow];
2079: pj = nbj + bd[prow];
2080: mul1 = *pc1 * *pv;
2081: mul2 = *pc2 * *pv;
2082: ++pv;
2083: *pc1 = mul1;
2084: *pc2 = mul2;
2086: nz_tmp = bi[prow+1] - bd[prow] - 1;
2087: for (j=0; j<nz_tmp; j++) {
2088: tmp = pv[j];
2089: idx = pj[j];
2090: rtmp11[idx] -= mul1 * tmp;
2091: rtmp22[idx] -= mul2 * tmp;
2092: }
2093: PetscLogFlops(2+4*nz_tmp);
2094: }
2095: prow = *bjtmp++;
2096: }
2098: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099: pc1 = rtmp11 + prow;
2100: pc2 = rtmp22 + prow;
2102: sctx.pv = *pc1;
2103: pj = bj + bi[prow];
2104: rs = 0.0;
2105: for (j=0; j<nz; j++) {
2106: idx = pj[j];
2107: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108: }
2109: sctx.rs = rs;
2110: MatPivotCheck(B,A,info,&sctx,row);
2111: if (sctx.newshift) goto endofwhile;
2113: if (*pc2 != 0.0) {
2114: pj = nbj + bd[prow];
2115: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116: *pc2 = mul2;
2117: nz_tmp = bi[prow+1] - bd[prow] - 1;
2118: for (j=0; j<nz_tmp; j++) {
2119: idx = pj[j];
2120: tmp = rtmp11[idx];
2121: rtmp22[idx] -= mul2 * tmp;
2122: }
2123: PetscLogFlops(1+2*nz_tmp);
2124: }
2126: pj = bj + bi[row];
2127: pc1 = ba + bi[row];
2128: pc2 = ba + bi[row+1];
2130: sctx.pv = rtmp22[row+1];
2131: rs = 0.0;
2132: rtmp11[row] = 1.0/rtmp11[row];
2133: rtmp22[row+1] = 1.0/rtmp22[row+1];
2134: /* copy row entries from dense representation to sparse */
2135: for (j=0; j<nz; j++) {
2136: idx = pj[j];
2137: pc1[j] = rtmp11[idx];
2138: pc2[j] = rtmp22[idx];
2139: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140: }
2141: sctx.rs = rs;
2142: MatPivotCheck(B,A,info,&sctx,row+1);
2143: if (sctx.newshift) goto endofwhile;
2144: break;
2146: case 3:
2147: for (j=0; j<nz; j++) {
2148: idx = bjtmp[j];
2149: rtmp11[idx] = 0.0;
2150: rtmp22[idx] = 0.0;
2151: rtmp33[idx] = 0.0;
2152: }
2153: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154: idx = r[row];
2155: nz_tmp = ai[idx+1] - ai[idx];
2156: ajtmp = aj + ai[idx];
2157: v1 = aa + ai[idx];
2158: v2 = aa + ai[idx+1];
2159: v3 = aa + ai[idx+2];
2160: for (j=0; j<nz_tmp; j++) {
2161: idx = ics[ajtmp[j]];
2162: rtmp11[idx] = v1[j];
2163: rtmp22[idx] = v2[j];
2164: rtmp33[idx] = v3[j];
2165: }
2166: rtmp11[ics[r[row]]] += sctx.shift_amount;
2167: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2170: /* loop over all pivot row blocks above this row block */
2171: prow = *bjtmp++;
2172: while (prow < row) {
2173: pc1 = rtmp11 + prow;
2174: pc2 = rtmp22 + prow;
2175: pc3 = rtmp33 + prow;
2176: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177: pv = ba + bd[prow];
2178: pj = nbj + bd[prow];
2179: mul1 = *pc1 * *pv;
2180: mul2 = *pc2 * *pv;
2181: mul3 = *pc3 * *pv;
2182: ++pv;
2183: *pc1 = mul1;
2184: *pc2 = mul2;
2185: *pc3 = mul3;
2187: nz_tmp = bi[prow+1] - bd[prow] - 1;
2188: /* update this row based on pivot row */
2189: for (j=0; j<nz_tmp; j++) {
2190: tmp = pv[j];
2191: idx = pj[j];
2192: rtmp11[idx] -= mul1 * tmp;
2193: rtmp22[idx] -= mul2 * tmp;
2194: rtmp33[idx] -= mul3 * tmp;
2195: }
2196: PetscLogFlops(3+6*nz_tmp);
2197: }
2198: prow = *bjtmp++;
2199: }
2201: /* Now take care of diagonal 3x3 block in this set of rows */
2202: /* note: prow = row here */
2203: pc1 = rtmp11 + prow;
2204: pc2 = rtmp22 + prow;
2205: pc3 = rtmp33 + prow;
2207: sctx.pv = *pc1;
2208: pj = bj + bi[prow];
2209: rs = 0.0;
2210: for (j=0; j<nz; j++) {
2211: idx = pj[j];
2212: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213: }
2214: sctx.rs = rs;
2215: MatPivotCheck(B,A,info,&sctx,row);
2216: if (sctx.newshift) goto endofwhile;
2218: if (*pc2 != 0.0 || *pc3 != 0.0) {
2219: mul2 = (*pc2)/(*pc1);
2220: mul3 = (*pc3)/(*pc1);
2221: *pc2 = mul2;
2222: *pc3 = mul3;
2223: nz_tmp = bi[prow+1] - bd[prow] - 1;
2224: pj = nbj + bd[prow];
2225: for (j=0; j<nz_tmp; j++) {
2226: idx = pj[j];
2227: tmp = rtmp11[idx];
2228: rtmp22[idx] -= mul2 * tmp;
2229: rtmp33[idx] -= mul3 * tmp;
2230: }
2231: PetscLogFlops(2+4*nz_tmp);
2232: }
2233: ++prow;
2235: pc2 = rtmp22 + prow;
2236: pc3 = rtmp33 + prow;
2237: sctx.pv = *pc2;
2238: pj = bj + bi[prow];
2239: rs = 0.0;
2240: for (j=0; j<nz; j++) {
2241: idx = pj[j];
2242: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243: }
2244: sctx.rs = rs;
2245: MatPivotCheck(B,A,info,&sctx,row+1);
2246: if (sctx.newshift) goto endofwhile;
2248: if (*pc3 != 0.0) {
2249: mul3 = (*pc3)/(*pc2);
2250: *pc3 = mul3;
2251: pj = nbj + bd[prow];
2252: nz_tmp = bi[prow+1] - bd[prow] - 1;
2253: for (j=0; j<nz_tmp; j++) {
2254: idx = pj[j];
2255: tmp = rtmp22[idx];
2256: rtmp33[idx] -= mul3 * tmp;
2257: }
2258: PetscLogFlops(1+2*nz_tmp);
2259: }
2261: pj = bj + bi[row];
2262: pc1 = ba + bi[row];
2263: pc2 = ba + bi[row+1];
2264: pc3 = ba + bi[row+2];
2266: sctx.pv = rtmp33[row+2];
2267: rs = 0.0;
2268: rtmp11[row] = 1.0/rtmp11[row];
2269: rtmp22[row+1] = 1.0/rtmp22[row+1];
2270: rtmp33[row+2] = 1.0/rtmp33[row+2];
2271: /* copy row entries from dense representation to sparse */
2272: for (j=0; j<nz; j++) {
2273: idx = pj[j];
2274: pc1[j] = rtmp11[idx];
2275: pc2[j] = rtmp22[idx];
2276: pc3[j] = rtmp33[idx];
2277: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278: }
2280: sctx.rs = rs;
2281: MatPivotCheck(B,A,info,&sctx,row+2);
2282: if (sctx.newshift) goto endofwhile;
2283: break;
2285: default:
2286: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287: }
2288: row += nodesz; /* Update the row */
2289: }
2290: endofwhile:;
2291: } while (sctx.newshift);
2292: PetscFree3(rtmp11,rtmp22,rtmp33);
2293: PetscFree(tmp_vec2);
2294: ISRestoreIndices(isicol,&ic);
2295: ISRestoreIndices(isrow,&r);
2296: ISRestoreIndices(iscol,&c);
2298: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299: /* do not set solve add, since MatSolve_Inode + Add is faster */
2300: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2301: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302: C->assembled = PETSC_TRUE;
2303: C->preallocated = PETSC_TRUE;
2304: if (sctx.nshift) {
2305: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
2307: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
2309: }
2310: }
2311: PetscLogFlops(C->cmap->n);
2312: MatSeqAIJCheckInode(C);
2313: return(0);
2314: }
2317: /* ----------------------------------------------------------- */
2320: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321: {
2322: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2323: IS iscol = a->col,isrow = a->row;
2324: PetscErrorCode ierr;
2325: const PetscInt *r,*c,*rout,*cout;
2326: PetscInt i,j,n = A->rmap->n;
2327: PetscInt node_max,row,nsz,aii,i0,i1,nz;
2328: const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2330: PetscScalar sum1,sum2,sum3,sum4,sum5;
2331: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332: const PetscScalar *b;
2335: if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336: node_max = a->inode.node_count;
2337: ns = a->inode.size; /* Node Size array */
2339: VecGetArrayRead(bb,&b);
2340: VecGetArray(xx,&x);
2341: tmp = a->solve_work;
2343: ISGetIndices(isrow,&rout); r = rout;
2344: ISGetIndices(iscol,&cout); c = cout;
2346: /* forward solve the lower triangular */
2347: tmps = tmp;
2348: aa = a_a;
2349: aj = a_j;
2350: ad = a->diag;
2352: for (i = 0,row = 0; i< node_max; ++i) {
2353: nsz = ns[i];
2354: aii = ai[row];
2355: v1 = aa + aii;
2356: vi = aj + aii;
2357: nz = ai[row+1]- ai[row];
2359: if (i < node_max-1) {
2360: /* Prefetch the indices for the next block */
2361: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362: /* Prefetch the data for the next block */
2363: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364: }
2366: switch (nsz) { /* Each loop in 'case' is unrolled */
2367: case 1:
2368: sum1 = b[r[row]];
2369: for (j=0; j<nz-1; j+=2) {
2370: i0 = vi[j];
2371: i1 = vi[j+1];
2372: tmp0 = tmps[i0];
2373: tmp1 = tmps[i1];
2374: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375: }
2376: if (j == nz-1) {
2377: tmp0 = tmps[vi[j]];
2378: sum1 -= v1[j]*tmp0;
2379: }
2380: tmp[row++]=sum1;
2381: break;
2382: case 2:
2383: sum1 = b[r[row]];
2384: sum2 = b[r[row+1]];
2385: v2 = aa + ai[row+1];
2387: for (j=0; j<nz-1; j+=2) {
2388: i0 = vi[j];
2389: i1 = vi[j+1];
2390: tmp0 = tmps[i0];
2391: tmp1 = tmps[i1];
2392: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394: }
2395: if (j == nz-1) {
2396: tmp0 = tmps[vi[j]];
2397: sum1 -= v1[j] *tmp0;
2398: sum2 -= v2[j] *tmp0;
2399: }
2400: sum2 -= v2[nz] * sum1;
2401: tmp[row++]=sum1;
2402: tmp[row++]=sum2;
2403: break;
2404: case 3:
2405: sum1 = b[r[row]];
2406: sum2 = b[r[row+1]];
2407: sum3 = b[r[row+2]];
2408: v2 = aa + ai[row+1];
2409: v3 = aa + ai[row+2];
2411: for (j=0; j<nz-1; j+=2) {
2412: i0 = vi[j];
2413: i1 = vi[j+1];
2414: tmp0 = tmps[i0];
2415: tmp1 = tmps[i1];
2416: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419: }
2420: if (j == nz-1) {
2421: tmp0 = tmps[vi[j]];
2422: sum1 -= v1[j] *tmp0;
2423: sum2 -= v2[j] *tmp0;
2424: sum3 -= v3[j] *tmp0;
2425: }
2426: sum2 -= v2[nz] * sum1;
2427: sum3 -= v3[nz] * sum1;
2428: sum3 -= v3[nz+1] * sum2;
2429: tmp[row++]=sum1;
2430: tmp[row++]=sum2;
2431: tmp[row++]=sum3;
2432: break;
2434: case 4:
2435: sum1 = b[r[row]];
2436: sum2 = b[r[row+1]];
2437: sum3 = b[r[row+2]];
2438: sum4 = b[r[row+3]];
2439: v2 = aa + ai[row+1];
2440: v3 = aa + ai[row+2];
2441: v4 = aa + ai[row+3];
2443: for (j=0; j<nz-1; j+=2) {
2444: i0 = vi[j];
2445: i1 = vi[j+1];
2446: tmp0 = tmps[i0];
2447: tmp1 = tmps[i1];
2448: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452: }
2453: if (j == nz-1) {
2454: tmp0 = tmps[vi[j]];
2455: sum1 -= v1[j] *tmp0;
2456: sum2 -= v2[j] *tmp0;
2457: sum3 -= v3[j] *tmp0;
2458: sum4 -= v4[j] *tmp0;
2459: }
2460: sum2 -= v2[nz] * sum1;
2461: sum3 -= v3[nz] * sum1;
2462: sum4 -= v4[nz] * sum1;
2463: sum3 -= v3[nz+1] * sum2;
2464: sum4 -= v4[nz+1] * sum2;
2465: sum4 -= v4[nz+2] * sum3;
2467: tmp[row++]=sum1;
2468: tmp[row++]=sum2;
2469: tmp[row++]=sum3;
2470: tmp[row++]=sum4;
2471: break;
2472: case 5:
2473: sum1 = b[r[row]];
2474: sum2 = b[r[row+1]];
2475: sum3 = b[r[row+2]];
2476: sum4 = b[r[row+3]];
2477: sum5 = b[r[row+4]];
2478: v2 = aa + ai[row+1];
2479: v3 = aa + ai[row+2];
2480: v4 = aa + ai[row+3];
2481: v5 = aa + ai[row+4];
2483: for (j=0; j<nz-1; j+=2) {
2484: i0 = vi[j];
2485: i1 = vi[j+1];
2486: tmp0 = tmps[i0];
2487: tmp1 = tmps[i1];
2488: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493: }
2494: if (j == nz-1) {
2495: tmp0 = tmps[vi[j]];
2496: sum1 -= v1[j] *tmp0;
2497: sum2 -= v2[j] *tmp0;
2498: sum3 -= v3[j] *tmp0;
2499: sum4 -= v4[j] *tmp0;
2500: sum5 -= v5[j] *tmp0;
2501: }
2503: sum2 -= v2[nz] * sum1;
2504: sum3 -= v3[nz] * sum1;
2505: sum4 -= v4[nz] * sum1;
2506: sum5 -= v5[nz] * sum1;
2507: sum3 -= v3[nz+1] * sum2;
2508: sum4 -= v4[nz+1] * sum2;
2509: sum5 -= v5[nz+1] * sum2;
2510: sum4 -= v4[nz+2] * sum3;
2511: sum5 -= v5[nz+2] * sum3;
2512: sum5 -= v5[nz+3] * sum4;
2514: tmp[row++]=sum1;
2515: tmp[row++]=sum2;
2516: tmp[row++]=sum3;
2517: tmp[row++]=sum4;
2518: tmp[row++]=sum5;
2519: break;
2520: default:
2521: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522: }
2523: }
2524: /* backward solve the upper triangular */
2525: for (i=node_max -1,row = n-1; i>=0; i--) {
2526: nsz = ns[i];
2527: aii = ad[row+1] + 1;
2528: v1 = aa + aii;
2529: vi = aj + aii;
2530: nz = ad[row]- ad[row+1] - 1;
2532: if (i > 0) {
2533: /* Prefetch the indices for the next block */
2534: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535: /* Prefetch the data for the next block */
2536: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537: }
2539: switch (nsz) { /* Each loop in 'case' is unrolled */
2540: case 1:
2541: sum1 = tmp[row];
2543: for (j=0; j<nz-1; j+=2) {
2544: i0 = vi[j];
2545: i1 = vi[j+1];
2546: tmp0 = tmps[i0];
2547: tmp1 = tmps[i1];
2548: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549: }
2550: if (j == nz-1) {
2551: tmp0 = tmps[vi[j]];
2552: sum1 -= v1[j]*tmp0;
2553: }
2554: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555: break;
2556: case 2:
2557: sum1 = tmp[row];
2558: sum2 = tmp[row-1];
2559: v2 = aa + ad[row] + 1;
2560: for (j=0; j<nz-1; j+=2) {
2561: i0 = vi[j];
2562: i1 = vi[j+1];
2563: tmp0 = tmps[i0];
2564: tmp1 = tmps[i1];
2565: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567: }
2568: if (j == nz-1) {
2569: tmp0 = tmps[vi[j]];
2570: sum1 -= v1[j]* tmp0;
2571: sum2 -= v2[j+1]* tmp0;
2572: }
2574: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575: sum2 -= v2[0] * tmp0;
2576: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577: break;
2578: case 3:
2579: sum1 = tmp[row];
2580: sum2 = tmp[row -1];
2581: sum3 = tmp[row -2];
2582: v2 = aa + ad[row] + 1;
2583: v3 = aa + ad[row -1] + 1;
2584: for (j=0; j<nz-1; j+=2) {
2585: i0 = vi[j];
2586: i1 = vi[j+1];
2587: tmp0 = tmps[i0];
2588: tmp1 = tmps[i1];
2589: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592: }
2593: if (j== nz-1) {
2594: tmp0 = tmps[vi[j]];
2595: sum1 -= v1[j] * tmp0;
2596: sum2 -= v2[j+1] * tmp0;
2597: sum3 -= v3[j+2] * tmp0;
2598: }
2599: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600: sum2 -= v2[0]* tmp0;
2601: sum3 -= v3[1] * tmp0;
2602: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603: sum3 -= v3[0]* tmp0;
2604: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2606: break;
2607: case 4:
2608: sum1 = tmp[row];
2609: sum2 = tmp[row -1];
2610: sum3 = tmp[row -2];
2611: sum4 = tmp[row -3];
2612: v2 = aa + ad[row]+1;
2613: v3 = aa + ad[row -1]+1;
2614: v4 = aa + ad[row -2]+1;
2616: for (j=0; j<nz-1; j+=2) {
2617: i0 = vi[j];
2618: i1 = vi[j+1];
2619: tmp0 = tmps[i0];
2620: tmp1 = tmps[i1];
2621: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2622: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625: }
2626: if (j== nz-1) {
2627: tmp0 = tmps[vi[j]];
2628: sum1 -= v1[j] * tmp0;
2629: sum2 -= v2[j+1] * tmp0;
2630: sum3 -= v3[j+2] * tmp0;
2631: sum4 -= v4[j+3] * tmp0;
2632: }
2634: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635: sum2 -= v2[0] * tmp0;
2636: sum3 -= v3[1] * tmp0;
2637: sum4 -= v4[2] * tmp0;
2638: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639: sum3 -= v3[0] * tmp0;
2640: sum4 -= v4[1] * tmp0;
2641: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642: sum4 -= v4[0] * tmp0;
2643: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644: break;
2645: case 5:
2646: sum1 = tmp[row];
2647: sum2 = tmp[row -1];
2648: sum3 = tmp[row -2];
2649: sum4 = tmp[row -3];
2650: sum5 = tmp[row -4];
2651: v2 = aa + ad[row]+1;
2652: v3 = aa + ad[row -1]+1;
2653: v4 = aa + ad[row -2]+1;
2654: v5 = aa + ad[row -3]+1;
2655: for (j=0; j<nz-1; j+=2) {
2656: i0 = vi[j];
2657: i1 = vi[j+1];
2658: tmp0 = tmps[i0];
2659: tmp1 = tmps[i1];
2660: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665: }
2666: if (j==nz-1) {
2667: tmp0 = tmps[vi[j]];
2668: sum1 -= v1[j] * tmp0;
2669: sum2 -= v2[j+1] * tmp0;
2670: sum3 -= v3[j+2] * tmp0;
2671: sum4 -= v4[j+3] * tmp0;
2672: sum5 -= v5[j+4] * tmp0;
2673: }
2675: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676: sum2 -= v2[0] * tmp0;
2677: sum3 -= v3[1] * tmp0;
2678: sum4 -= v4[2] * tmp0;
2679: sum5 -= v5[3] * tmp0;
2680: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681: sum3 -= v3[0] * tmp0;
2682: sum4 -= v4[1] * tmp0;
2683: sum5 -= v5[2] * tmp0;
2684: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685: sum4 -= v4[0] * tmp0;
2686: sum5 -= v5[1] * tmp0;
2687: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688: sum5 -= v5[0] * tmp0;
2689: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690: break;
2691: default:
2692: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693: }
2694: }
2695: ISRestoreIndices(isrow,&rout);
2696: ISRestoreIndices(iscol,&cout);
2697: VecRestoreArrayRead(bb,&b);
2698: VecRestoreArray(xx,&x);
2699: PetscLogFlops(2.0*a->nz - A->cmap->n);
2700: return(0);
2701: }
2704: /*
2705: Makes a longer coloring[] array and calls the usual code with that
2706: */
2709: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710: {
2711: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2712: PetscErrorCode ierr;
2713: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714: PetscInt *colorused,i;
2715: ISColoringValue *newcolor;
2718: PetscMalloc1(n+1,&newcolor);
2719: /* loop over inodes, marking a color for each column*/
2720: row = 0;
2721: for (i=0; i<m; i++) {
2722: for (j=0; j<ns[i]; j++) {
2723: newcolor[row++] = coloring[i] + j*ncolors;
2724: }
2725: }
2727: /* eliminate unneeded colors */
2728: PetscCalloc1(5*ncolors,&colorused);
2729: for (i=0; i<n; i++) {
2730: colorused[newcolor[i]] = 1;
2731: }
2733: for (i=1; i<5*ncolors; i++) {
2734: colorused[i] += colorused[i-1];
2735: }
2736: ncolors = colorused[5*ncolors-1];
2737: for (i=0; i<n; i++) {
2738: newcolor[i] = colorused[newcolor[i]]-1;
2739: }
2740: PetscFree(colorused);
2741: ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);
2742: PetscFree(coloring);
2743: return(0);
2744: }
2746: #include <petsc/private/kernels/blockinvert.h>
2750: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751: {
2752: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2753: PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754: MatScalar *ibdiag,*bdiag,work[25],*t;
2755: PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756: const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757: const PetscScalar *xb, *b;
2758: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2759: PetscErrorCode ierr;
2760: PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761: PetscInt sz,k,ipvt[5];
2762: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
2763: const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2766: allowzeropivot = PetscNot(A->erroriffailure);
2767: if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2768: if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2769: if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2771: if (!a->inode.ibdiagvalid) {
2772: if (!a->inode.ibdiag) {
2773: /* calculate space needed for diagonal blocks */
2774: for (i=0; i<m; i++) {
2775: cnt += sizes[i]*sizes[i];
2776: }
2777: a->inode.bdiagsize = cnt;
2779: PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);
2780: }
2782: /* copy over the diagonal blocks and invert them */
2783: ibdiag = a->inode.ibdiag;
2784: bdiag = a->inode.bdiag;
2785: cnt = 0;
2786: for (i=0, row = 0; i<m; i++) {
2787: for (j=0; j<sizes[i]; j++) {
2788: for (k=0; k<sizes[i]; k++) {
2789: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2790: }
2791: }
2792: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2794: switch (sizes[i]) {
2795: case 1:
2796: /* Create matrix data structure */
2797: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2798: if (allowzeropivot) {
2799: zeropivotdetected = PETSC_TRUE;
2800: PetscInfo1(A,"Zero pivot, row %D\n",row);
2801: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2802: }
2803: ibdiag[cnt] = 1.0/ibdiag[cnt];
2804: break;
2805: case 2:
2806: PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2807: break;
2808: case 3:
2809: PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2810: break;
2811: case 4:
2812: PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);
2813: break;
2814: case 5:
2815: PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
2816: break;
2817: default:
2818: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2819: }
2820: if (zeropivotdetected) {
2821: A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2822: }
2824: cnt += sizes[i]*sizes[i];
2825: row += sizes[i];
2826: }
2827: a->inode.ibdiagvalid = PETSC_TRUE;
2828: }
2829: ibdiag = a->inode.ibdiag;
2830: bdiag = a->inode.bdiag;
2831: t = a->inode.ssor_work;
2833: VecGetArray(xx,&x);
2834: VecGetArrayRead(bb,&b);
2835: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836: if (flag & SOR_ZERO_INITIAL_GUESS) {
2837: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2839: for (i=0, row=0; i<m; i++) {
2840: sz = diag[row] - ii[row];
2841: v1 = a->a + ii[row];
2842: idx = a->j + ii[row];
2844: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2845: switch (sizes[i]) {
2846: case 1:
2848: sum1 = b[row];
2849: for (n = 0; n<sz-1; n+=2) {
2850: i1 = idx[0];
2851: i2 = idx[1];
2852: idx += 2;
2853: tmp0 = x[i1];
2854: tmp1 = x[i2];
2855: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2856: }
2858: if (n == sz-1) {
2859: tmp0 = x[*idx];
2860: sum1 -= *v1 * tmp0;
2861: }
2862: t[row] = sum1;
2863: x[row++] = sum1*(*ibdiag++);
2864: break;
2865: case 2:
2866: v2 = a->a + ii[row+1];
2867: sum1 = b[row];
2868: sum2 = b[row+1];
2869: for (n = 0; n<sz-1; n+=2) {
2870: i1 = idx[0];
2871: i2 = idx[1];
2872: idx += 2;
2873: tmp0 = x[i1];
2874: tmp1 = x[i2];
2875: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2876: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2877: }
2879: if (n == sz-1) {
2880: tmp0 = x[*idx];
2881: sum1 -= v1[0] * tmp0;
2882: sum2 -= v2[0] * tmp0;
2883: }
2884: t[row] = sum1;
2885: t[row+1] = sum2;
2886: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2887: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2888: ibdiag += 4;
2889: break;
2890: case 3:
2891: v2 = a->a + ii[row+1];
2892: v3 = a->a + ii[row+2];
2893: sum1 = b[row];
2894: sum2 = b[row+1];
2895: sum3 = b[row+2];
2896: for (n = 0; n<sz-1; n+=2) {
2897: i1 = idx[0];
2898: i2 = idx[1];
2899: idx += 2;
2900: tmp0 = x[i1];
2901: tmp1 = x[i2];
2902: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2903: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2904: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2905: }
2907: if (n == sz-1) {
2908: tmp0 = x[*idx];
2909: sum1 -= v1[0] * tmp0;
2910: sum2 -= v2[0] * tmp0;
2911: sum3 -= v3[0] * tmp0;
2912: }
2913: t[row] = sum1;
2914: t[row+1] = sum2;
2915: t[row+2] = sum3;
2916: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2917: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2918: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2919: ibdiag += 9;
2920: break;
2921: case 4:
2922: v2 = a->a + ii[row+1];
2923: v3 = a->a + ii[row+2];
2924: v4 = a->a + ii[row+3];
2925: sum1 = b[row];
2926: sum2 = b[row+1];
2927: sum3 = b[row+2];
2928: sum4 = b[row+3];
2929: for (n = 0; n<sz-1; n+=2) {
2930: i1 = idx[0];
2931: i2 = idx[1];
2932: idx += 2;
2933: tmp0 = x[i1];
2934: tmp1 = x[i2];
2935: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2936: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2937: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2938: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2939: }
2941: if (n == sz-1) {
2942: tmp0 = x[*idx];
2943: sum1 -= v1[0] * tmp0;
2944: sum2 -= v2[0] * tmp0;
2945: sum3 -= v3[0] * tmp0;
2946: sum4 -= v4[0] * tmp0;
2947: }
2948: t[row] = sum1;
2949: t[row+1] = sum2;
2950: t[row+2] = sum3;
2951: t[row+3] = sum4;
2952: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2953: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2954: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2955: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2956: ibdiag += 16;
2957: break;
2958: case 5:
2959: v2 = a->a + ii[row+1];
2960: v3 = a->a + ii[row+2];
2961: v4 = a->a + ii[row+3];
2962: v5 = a->a + ii[row+4];
2963: sum1 = b[row];
2964: sum2 = b[row+1];
2965: sum3 = b[row+2];
2966: sum4 = b[row+3];
2967: sum5 = b[row+4];
2968: for (n = 0; n<sz-1; n+=2) {
2969: i1 = idx[0];
2970: i2 = idx[1];
2971: idx += 2;
2972: tmp0 = x[i1];
2973: tmp1 = x[i2];
2974: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2975: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2976: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2977: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2978: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2979: }
2981: if (n == sz-1) {
2982: tmp0 = x[*idx];
2983: sum1 -= v1[0] * tmp0;
2984: sum2 -= v2[0] * tmp0;
2985: sum3 -= v3[0] * tmp0;
2986: sum4 -= v4[0] * tmp0;
2987: sum5 -= v5[0] * tmp0;
2988: }
2989: t[row] = sum1;
2990: t[row+1] = sum2;
2991: t[row+2] = sum3;
2992: t[row+3] = sum4;
2993: t[row+4] = sum5;
2994: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2995: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2996: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2997: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2998: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2999: ibdiag += 25;
3000: break;
3001: default:
3002: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3003: }
3004: }
3006: xb = t;
3007: PetscLogFlops(a->nz);
3008: } else xb = b;
3009: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3011: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3012: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3013: ibdiag -= sizes[i]*sizes[i];
3014: sz = ii[row+1] - diag[row] - 1;
3015: v1 = a->a + diag[row] + 1;
3016: idx = a->j + diag[row] + 1;
3018: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3019: switch (sizes[i]) {
3020: case 1:
3022: sum1 = xb[row];
3023: for (n = 0; n<sz-1; n+=2) {
3024: i1 = idx[0];
3025: i2 = idx[1];
3026: idx += 2;
3027: tmp0 = x[i1];
3028: tmp1 = x[i2];
3029: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3030: }
3032: if (n == sz-1) {
3033: tmp0 = x[*idx];
3034: sum1 -= *v1*tmp0;
3035: }
3036: x[row--] = sum1*(*ibdiag);
3037: break;
3039: case 2:
3041: sum1 = xb[row];
3042: sum2 = xb[row-1];
3043: /* note that sum1 is associated with the second of the two rows */
3044: v2 = a->a + diag[row-1] + 2;
3045: for (n = 0; n<sz-1; n+=2) {
3046: i1 = idx[0];
3047: i2 = idx[1];
3048: idx += 2;
3049: tmp0 = x[i1];
3050: tmp1 = x[i2];
3051: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3052: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3053: }
3055: if (n == sz-1) {
3056: tmp0 = x[*idx];
3057: sum1 -= *v1*tmp0;
3058: sum2 -= *v2*tmp0;
3059: }
3060: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3061: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3062: break;
3063: case 3:
3065: sum1 = xb[row];
3066: sum2 = xb[row-1];
3067: sum3 = xb[row-2];
3068: v2 = a->a + diag[row-1] + 2;
3069: v3 = a->a + diag[row-2] + 3;
3070: for (n = 0; n<sz-1; n+=2) {
3071: i1 = idx[0];
3072: i2 = idx[1];
3073: idx += 2;
3074: tmp0 = x[i1];
3075: tmp1 = x[i2];
3076: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3077: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3078: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3079: }
3081: if (n == sz-1) {
3082: tmp0 = x[*idx];
3083: sum1 -= *v1*tmp0;
3084: sum2 -= *v2*tmp0;
3085: sum3 -= *v3*tmp0;
3086: }
3087: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3088: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3089: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3090: break;
3091: case 4:
3093: sum1 = xb[row];
3094: sum2 = xb[row-1];
3095: sum3 = xb[row-2];
3096: sum4 = xb[row-3];
3097: v2 = a->a + diag[row-1] + 2;
3098: v3 = a->a + diag[row-2] + 3;
3099: v4 = a->a + diag[row-3] + 4;
3100: for (n = 0; n<sz-1; n+=2) {
3101: i1 = idx[0];
3102: i2 = idx[1];
3103: idx += 2;
3104: tmp0 = x[i1];
3105: tmp1 = x[i2];
3106: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3107: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3108: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3109: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3110: }
3112: if (n == sz-1) {
3113: tmp0 = x[*idx];
3114: sum1 -= *v1*tmp0;
3115: sum2 -= *v2*tmp0;
3116: sum3 -= *v3*tmp0;
3117: sum4 -= *v4*tmp0;
3118: }
3119: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3120: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3121: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3122: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3123: break;
3124: case 5:
3126: sum1 = xb[row];
3127: sum2 = xb[row-1];
3128: sum3 = xb[row-2];
3129: sum4 = xb[row-3];
3130: sum5 = xb[row-4];
3131: v2 = a->a + diag[row-1] + 2;
3132: v3 = a->a + diag[row-2] + 3;
3133: v4 = a->a + diag[row-3] + 4;
3134: v5 = a->a + diag[row-4] + 5;
3135: for (n = 0; n<sz-1; n+=2) {
3136: i1 = idx[0];
3137: i2 = idx[1];
3138: idx += 2;
3139: tmp0 = x[i1];
3140: tmp1 = x[i2];
3141: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3142: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3143: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3144: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3145: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3146: }
3148: if (n == sz-1) {
3149: tmp0 = x[*idx];
3150: sum1 -= *v1*tmp0;
3151: sum2 -= *v2*tmp0;
3152: sum3 -= *v3*tmp0;
3153: sum4 -= *v4*tmp0;
3154: sum5 -= *v5*tmp0;
3155: }
3156: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3157: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3158: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3159: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3160: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3161: break;
3162: default:
3163: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3164: }
3165: }
3167: PetscLogFlops(a->nz);
3168: }
3169: its--;
3170: }
3171: while (its--) {
3173: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3174: for (i=0, row=0, ibdiag = a->inode.ibdiag;
3175: i<m;
3176: row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3178: sz = diag[row] - ii[row];
3179: v1 = a->a + ii[row];
3180: idx = a->j + ii[row];
3181: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3182: switch (sizes[i]) {
3183: case 1:
3184: sum1 = b[row];
3185: for (n = 0; n<sz-1; n+=2) {
3186: i1 = idx[0];
3187: i2 = idx[1];
3188: idx += 2;
3189: tmp0 = x[i1];
3190: tmp1 = x[i2];
3191: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3192: }
3193: if (n == sz-1) {
3194: tmp0 = x[*idx++];
3195: sum1 -= *v1 * tmp0;
3196: v1++;
3197: }
3198: t[row] = sum1;
3199: sz = ii[row+1] - diag[row] - 1;
3200: idx = a->j + diag[row] + 1;
3201: v1 += 1;
3202: for (n = 0; n<sz-1; n+=2) {
3203: i1 = idx[0];
3204: i2 = idx[1];
3205: idx += 2;
3206: tmp0 = x[i1];
3207: tmp1 = x[i2];
3208: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3209: }
3210: if (n == sz-1) {
3211: tmp0 = x[*idx++];
3212: sum1 -= *v1 * tmp0;
3213: }
3214: /* in MatSOR_SeqAIJ this line would be
3215: *
3216: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3217: *
3218: * but omega == 1, so this becomes
3219: *
3220: * x[row] = sum1*(*ibdiag++);
3221: *
3222: */
3223: x[row] = sum1*(*ibdiag);
3224: break;
3225: case 2:
3226: v2 = a->a + ii[row+1];
3227: sum1 = b[row];
3228: sum2 = b[row+1];
3229: for (n = 0; n<sz-1; n+=2) {
3230: i1 = idx[0];
3231: i2 = idx[1];
3232: idx += 2;
3233: tmp0 = x[i1];
3234: tmp1 = x[i2];
3235: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3236: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3237: }
3238: if (n == sz-1) {
3239: tmp0 = x[*idx++];
3240: sum1 -= v1[0] * tmp0;
3241: sum2 -= v2[0] * tmp0;
3242: v1++; v2++;
3243: }
3244: t[row] = sum1;
3245: t[row+1] = sum2;
3246: sz = ii[row+1] - diag[row] - 2;
3247: idx = a->j + diag[row] + 2;
3248: v1 += 2;
3249: v2 += 2;
3250: for (n = 0; n<sz-1; n+=2) {
3251: i1 = idx[0];
3252: i2 = idx[1];
3253: idx += 2;
3254: tmp0 = x[i1];
3255: tmp1 = x[i2];
3256: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3257: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3258: }
3259: if (n == sz-1) {
3260: tmp0 = x[*idx];
3261: sum1 -= v1[0] * tmp0;
3262: sum2 -= v2[0] * tmp0;
3263: }
3264: x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3265: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3266: break;
3267: case 3:
3268: v2 = a->a + ii[row+1];
3269: v3 = a->a + ii[row+2];
3270: sum1 = b[row];
3271: sum2 = b[row+1];
3272: sum3 = b[row+2];
3273: for (n = 0; n<sz-1; n+=2) {
3274: i1 = idx[0];
3275: i2 = idx[1];
3276: idx += 2;
3277: tmp0 = x[i1];
3278: tmp1 = x[i2];
3279: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3280: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3281: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3282: }
3283: if (n == sz-1) {
3284: tmp0 = x[*idx++];
3285: sum1 -= v1[0] * tmp0;
3286: sum2 -= v2[0] * tmp0;
3287: sum3 -= v3[0] * tmp0;
3288: v1++; v2++; v3++;
3289: }
3290: t[row] = sum1;
3291: t[row+1] = sum2;
3292: t[row+2] = sum3;
3293: sz = ii[row+1] - diag[row] - 3;
3294: idx = a->j + diag[row] + 3;
3295: v1 += 3;
3296: v2 += 3;
3297: v3 += 3;
3298: for (n = 0; n<sz-1; n+=2) {
3299: i1 = idx[0];
3300: i2 = idx[1];
3301: idx += 2;
3302: tmp0 = x[i1];
3303: tmp1 = x[i2];
3304: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3305: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3306: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3307: }
3308: if (n == sz-1) {
3309: tmp0 = x[*idx];
3310: sum1 -= v1[0] * tmp0;
3311: sum2 -= v2[0] * tmp0;
3312: sum3 -= v3[0] * tmp0;
3313: }
3314: x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3315: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3316: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3317: break;
3318: case 4:
3319: v2 = a->a + ii[row+1];
3320: v3 = a->a + ii[row+2];
3321: v4 = a->a + ii[row+3];
3322: sum1 = b[row];
3323: sum2 = b[row+1];
3324: sum3 = b[row+2];
3325: sum4 = b[row+3];
3326: for (n = 0; n<sz-1; n+=2) {
3327: i1 = idx[0];
3328: i2 = idx[1];
3329: idx += 2;
3330: tmp0 = x[i1];
3331: tmp1 = x[i2];
3332: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3333: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3334: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3335: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3336: }
3337: if (n == sz-1) {
3338: tmp0 = x[*idx++];
3339: sum1 -= v1[0] * tmp0;
3340: sum2 -= v2[0] * tmp0;
3341: sum3 -= v3[0] * tmp0;
3342: sum4 -= v4[0] * tmp0;
3343: v1++; v2++; v3++; v4++;
3344: }
3345: t[row] = sum1;
3346: t[row+1] = sum2;
3347: t[row+2] = sum3;
3348: t[row+3] = sum4;
3349: sz = ii[row+1] - diag[row] - 4;
3350: idx = a->j + diag[row] + 4;
3351: v1 += 4;
3352: v2 += 4;
3353: v3 += 4;
3354: v4 += 4;
3355: for (n = 0; n<sz-1; n+=2) {
3356: i1 = idx[0];
3357: i2 = idx[1];
3358: idx += 2;
3359: tmp0 = x[i1];
3360: tmp1 = x[i2];
3361: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3363: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3364: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3365: }
3366: if (n == sz-1) {
3367: tmp0 = x[*idx];
3368: sum1 -= v1[0] * tmp0;
3369: sum2 -= v2[0] * tmp0;
3370: sum3 -= v3[0] * tmp0;
3371: sum4 -= v4[0] * tmp0;
3372: }
3373: x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3374: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3375: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3376: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3377: break;
3378: case 5:
3379: v2 = a->a + ii[row+1];
3380: v3 = a->a + ii[row+2];
3381: v4 = a->a + ii[row+3];
3382: v5 = a->a + ii[row+4];
3383: sum1 = b[row];
3384: sum2 = b[row+1];
3385: sum3 = b[row+2];
3386: sum4 = b[row+3];
3387: sum5 = b[row+4];
3388: for (n = 0; n<sz-1; n+=2) {
3389: i1 = idx[0];
3390: i2 = idx[1];
3391: idx += 2;
3392: tmp0 = x[i1];
3393: tmp1 = x[i2];
3394: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3395: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3396: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3397: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3398: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3399: }
3400: if (n == sz-1) {
3401: tmp0 = x[*idx++];
3402: sum1 -= v1[0] * tmp0;
3403: sum2 -= v2[0] * tmp0;
3404: sum3 -= v3[0] * tmp0;
3405: sum4 -= v4[0] * tmp0;
3406: sum5 -= v5[0] * tmp0;
3407: v1++; v2++; v3++; v4++; v5++;
3408: }
3409: t[row] = sum1;
3410: t[row+1] = sum2;
3411: t[row+2] = sum3;
3412: t[row+3] = sum4;
3413: t[row+4] = sum5;
3414: sz = ii[row+1] - diag[row] - 5;
3415: idx = a->j + diag[row] + 5;
3416: v1 += 5;
3417: v2 += 5;
3418: v3 += 5;
3419: v4 += 5;
3420: v5 += 5;
3421: for (n = 0; n<sz-1; n+=2) {
3422: i1 = idx[0];
3423: i2 = idx[1];
3424: idx += 2;
3425: tmp0 = x[i1];
3426: tmp1 = x[i2];
3427: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3428: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3429: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3430: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3431: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3432: }
3433: if (n == sz-1) {
3434: tmp0 = x[*idx];
3435: sum1 -= v1[0] * tmp0;
3436: sum2 -= v2[0] * tmp0;
3437: sum3 -= v3[0] * tmp0;
3438: sum4 -= v4[0] * tmp0;
3439: sum5 -= v5[0] * tmp0;
3440: }
3441: x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3442: x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3443: x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3444: x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3445: x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3446: break;
3447: default:
3448: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3449: }
3450: }
3451: xb = t;
3452: PetscLogFlops(2.0*a->nz); /* undercounts diag inverse */
3453: } else xb = b;
3455: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3457: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3458: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3459: ibdiag -= sizes[i]*sizes[i];
3461: /* set RHS */
3462: if (xb == b) {
3463: /* whole (old way) */
3464: sz = ii[row+1] - ii[row];
3465: idx = a->j + ii[row];
3466: switch (sizes[i]) {
3467: case 5:
3468: v5 = a->a + ii[row-4];
3469: case 4: /* fall through */
3470: v4 = a->a + ii[row-3];
3471: case 3:
3472: v3 = a->a + ii[row-2];
3473: case 2:
3474: v2 = a->a + ii[row-1];
3475: case 1:
3476: v1 = a->a + ii[row];
3477: break;
3478: default:
3479: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3480: }
3481: } else {
3482: /* upper, no diag */
3483: sz = ii[row+1] - diag[row] - 1;
3484: idx = a->j + diag[row] + 1;
3485: switch (sizes[i]) {
3486: case 5:
3487: v5 = a->a + diag[row-4] + 5;
3488: case 4: /* fall through */
3489: v4 = a->a + diag[row-3] + 4;
3490: case 3:
3491: v3 = a->a + diag[row-2] + 3;
3492: case 2:
3493: v2 = a->a + diag[row-1] + 2;
3494: case 1:
3495: v1 = a->a + diag[row] + 1;
3496: }
3497: }
3498: /* set sum */
3499: switch (sizes[i]) {
3500: case 5:
3501: sum5 = xb[row-4];
3502: case 4: /* fall through */
3503: sum4 = xb[row-3];
3504: case 3:
3505: sum3 = xb[row-2];
3506: case 2:
3507: sum2 = xb[row-1];
3508: case 1:
3509: /* note that sum1 is associated with the last row */
3510: sum1 = xb[row];
3511: }
3512: /* do sums */
3513: for (n = 0; n<sz-1; n+=2) {
3514: i1 = idx[0];
3515: i2 = idx[1];
3516: idx += 2;
3517: tmp0 = x[i1];
3518: tmp1 = x[i2];
3519: switch (sizes[i]) {
3520: case 5:
3521: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3522: case 4: /* fall through */
3523: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3524: case 3:
3525: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3526: case 2:
3527: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3528: case 1:
3529: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3530: }
3531: }
3532: /* ragged edge */
3533: if (n == sz-1) {
3534: tmp0 = x[*idx];
3535: switch (sizes[i]) {
3536: case 5:
3537: sum5 -= *v5*tmp0;
3538: case 4: /* fall through */
3539: sum4 -= *v4*tmp0;
3540: case 3:
3541: sum3 -= *v3*tmp0;
3542: case 2:
3543: sum2 -= *v2*tmp0;
3544: case 1:
3545: sum1 -= *v1*tmp0;
3546: }
3547: }
3548: /* update */
3549: if (xb == b) {
3550: /* whole (old way) w/ diag */
3551: switch (sizes[i]) {
3552: case 5:
3553: x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3554: x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3555: x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3556: x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3557: x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3558: break;
3559: case 4:
3560: x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3561: x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3562: x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3563: x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3564: break;
3565: case 3:
3566: x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3567: x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3568: x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3569: break;
3570: case 2:
3571: x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3572: x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3573: break;
3574: case 1:
3575: x[row--] += sum1*(*ibdiag);
3576: break;
3577: }
3578: } else {
3579: /* no diag so set = */
3580: switch (sizes[i]) {
3581: case 5:
3582: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3583: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3584: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3585: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3586: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3587: break;
3588: case 4:
3589: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3590: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3591: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3592: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3593: break;
3594: case 3:
3595: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3596: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3597: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3598: break;
3599: case 2:
3600: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3601: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3602: break;
3603: case 1:
3604: x[row--] = sum1*(*ibdiag);
3605: break;
3606: }
3607: }
3608: }
3609: if (xb == b) {
3610: PetscLogFlops(2.0*a->nz);
3611: } else {
3612: PetscLogFlops(a->nz); /* assumes 1/2 in upper, undercounts diag inverse */
3613: }
3614: }
3615: }
3616: if (flag & SOR_EISENSTAT) {
3617: /*
3618: Apply (U + D)^-1 where D is now the block diagonal
3619: */
3620: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3621: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3622: ibdiag -= sizes[i]*sizes[i];
3623: sz = ii[row+1] - diag[row] - 1;
3624: v1 = a->a + diag[row] + 1;
3625: idx = a->j + diag[row] + 1;
3626: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3627: switch (sizes[i]) {
3628: case 1:
3630: sum1 = b[row];
3631: for (n = 0; n<sz-1; n+=2) {
3632: i1 = idx[0];
3633: i2 = idx[1];
3634: idx += 2;
3635: tmp0 = x[i1];
3636: tmp1 = x[i2];
3637: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3638: }
3640: if (n == sz-1) {
3641: tmp0 = x[*idx];
3642: sum1 -= *v1*tmp0;
3643: }
3644: x[row] = sum1*(*ibdiag);row--;
3645: break;
3647: case 2:
3649: sum1 = b[row];
3650: sum2 = b[row-1];
3651: /* note that sum1 is associated with the second of the two rows */
3652: v2 = a->a + diag[row-1] + 2;
3653: for (n = 0; n<sz-1; n+=2) {
3654: i1 = idx[0];
3655: i2 = idx[1];
3656: idx += 2;
3657: tmp0 = x[i1];
3658: tmp1 = x[i2];
3659: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3660: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3661: }
3663: if (n == sz-1) {
3664: tmp0 = x[*idx];
3665: sum1 -= *v1*tmp0;
3666: sum2 -= *v2*tmp0;
3667: }
3668: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3669: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3670: row -= 2;
3671: break;
3672: case 3:
3674: sum1 = b[row];
3675: sum2 = b[row-1];
3676: sum3 = b[row-2];
3677: v2 = a->a + diag[row-1] + 2;
3678: v3 = a->a + diag[row-2] + 3;
3679: for (n = 0; n<sz-1; n+=2) {
3680: i1 = idx[0];
3681: i2 = idx[1];
3682: idx += 2;
3683: tmp0 = x[i1];
3684: tmp1 = x[i2];
3685: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3686: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3687: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3688: }
3690: if (n == sz-1) {
3691: tmp0 = x[*idx];
3692: sum1 -= *v1*tmp0;
3693: sum2 -= *v2*tmp0;
3694: sum3 -= *v3*tmp0;
3695: }
3696: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3697: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3698: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3699: row -= 3;
3700: break;
3701: case 4:
3703: sum1 = b[row];
3704: sum2 = b[row-1];
3705: sum3 = b[row-2];
3706: sum4 = b[row-3];
3707: v2 = a->a + diag[row-1] + 2;
3708: v3 = a->a + diag[row-2] + 3;
3709: v4 = a->a + diag[row-3] + 4;
3710: for (n = 0; n<sz-1; n+=2) {
3711: i1 = idx[0];
3712: i2 = idx[1];
3713: idx += 2;
3714: tmp0 = x[i1];
3715: tmp1 = x[i2];
3716: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3717: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3718: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3719: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3720: }
3722: if (n == sz-1) {
3723: tmp0 = x[*idx];
3724: sum1 -= *v1*tmp0;
3725: sum2 -= *v2*tmp0;
3726: sum3 -= *v3*tmp0;
3727: sum4 -= *v4*tmp0;
3728: }
3729: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3730: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3731: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3732: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3733: row -= 4;
3734: break;
3735: case 5:
3737: sum1 = b[row];
3738: sum2 = b[row-1];
3739: sum3 = b[row-2];
3740: sum4 = b[row-3];
3741: sum5 = b[row-4];
3742: v2 = a->a + diag[row-1] + 2;
3743: v3 = a->a + diag[row-2] + 3;
3744: v4 = a->a + diag[row-3] + 4;
3745: v5 = a->a + diag[row-4] + 5;
3746: for (n = 0; n<sz-1; n+=2) {
3747: i1 = idx[0];
3748: i2 = idx[1];
3749: idx += 2;
3750: tmp0 = x[i1];
3751: tmp1 = x[i2];
3752: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3753: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3754: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3755: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3756: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3757: }
3759: if (n == sz-1) {
3760: tmp0 = x[*idx];
3761: sum1 -= *v1*tmp0;
3762: sum2 -= *v2*tmp0;
3763: sum3 -= *v3*tmp0;
3764: sum4 -= *v4*tmp0;
3765: sum5 -= *v5*tmp0;
3766: }
3767: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3768: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3769: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3770: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3771: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3772: row -= 5;
3773: break;
3774: default:
3775: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3776: }
3777: }
3778: PetscLogFlops(a->nz);
3780: /*
3781: t = b - D x where D is the block diagonal
3782: */
3783: cnt = 0;
3784: for (i=0, row=0; i<m; i++) {
3785: switch (sizes[i]) {
3786: case 1:
3787: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3788: break;
3789: case 2:
3790: x1 = x[row]; x2 = x[row+1];
3791: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3792: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3793: t[row] = b[row] - tmp1;
3794: t[row+1] = b[row+1] - tmp2; row += 2;
3795: cnt += 4;
3796: break;
3797: case 3:
3798: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3799: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3800: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3801: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3802: t[row] = b[row] - tmp1;
3803: t[row+1] = b[row+1] - tmp2;
3804: t[row+2] = b[row+2] - tmp3; row += 3;
3805: cnt += 9;
3806: break;
3807: case 4:
3808: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3809: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3810: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3811: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3812: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3813: t[row] = b[row] - tmp1;
3814: t[row+1] = b[row+1] - tmp2;
3815: t[row+2] = b[row+2] - tmp3;
3816: t[row+3] = b[row+3] - tmp4; row += 4;
3817: cnt += 16;
3818: break;
3819: case 5:
3820: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3821: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3822: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3823: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3824: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3825: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3826: t[row] = b[row] - tmp1;
3827: t[row+1] = b[row+1] - tmp2;
3828: t[row+2] = b[row+2] - tmp3;
3829: t[row+3] = b[row+3] - tmp4;
3830: t[row+4] = b[row+4] - tmp5;row += 5;
3831: cnt += 25;
3832: break;
3833: default:
3834: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3835: }
3836: }
3837: PetscLogFlops(m);
3841: /*
3842: Apply (L + D)^-1 where D is the block diagonal
3843: */
3844: for (i=0, row=0; i<m; i++) {
3845: sz = diag[row] - ii[row];
3846: v1 = a->a + ii[row];
3847: idx = a->j + ii[row];
3848: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3849: switch (sizes[i]) {
3850: case 1:
3852: sum1 = t[row];
3853: for (n = 0; n<sz-1; n+=2) {
3854: i1 = idx[0];
3855: i2 = idx[1];
3856: idx += 2;
3857: tmp0 = t[i1];
3858: tmp1 = t[i2];
3859: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3860: }
3862: if (n == sz-1) {
3863: tmp0 = t[*idx];
3864: sum1 -= *v1 * tmp0;
3865: }
3866: x[row] += t[row] = sum1*(*ibdiag++); row++;
3867: break;
3868: case 2:
3869: v2 = a->a + ii[row+1];
3870: sum1 = t[row];
3871: sum2 = t[row+1];
3872: for (n = 0; n<sz-1; n+=2) {
3873: i1 = idx[0];
3874: i2 = idx[1];
3875: idx += 2;
3876: tmp0 = t[i1];
3877: tmp1 = t[i2];
3878: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3879: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3880: }
3882: if (n == sz-1) {
3883: tmp0 = t[*idx];
3884: sum1 -= v1[0] * tmp0;
3885: sum2 -= v2[0] * tmp0;
3886: }
3887: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3888: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3889: ibdiag += 4; row += 2;
3890: break;
3891: case 3:
3892: v2 = a->a + ii[row+1];
3893: v3 = a->a + ii[row+2];
3894: sum1 = t[row];
3895: sum2 = t[row+1];
3896: sum3 = t[row+2];
3897: for (n = 0; n<sz-1; n+=2) {
3898: i1 = idx[0];
3899: i2 = idx[1];
3900: idx += 2;
3901: tmp0 = t[i1];
3902: tmp1 = t[i2];
3903: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3904: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3905: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3906: }
3908: if (n == sz-1) {
3909: tmp0 = t[*idx];
3910: sum1 -= v1[0] * tmp0;
3911: sum2 -= v2[0] * tmp0;
3912: sum3 -= v3[0] * tmp0;
3913: }
3914: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3915: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3916: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3917: ibdiag += 9; row += 3;
3918: break;
3919: case 4:
3920: v2 = a->a + ii[row+1];
3921: v3 = a->a + ii[row+2];
3922: v4 = a->a + ii[row+3];
3923: sum1 = t[row];
3924: sum2 = t[row+1];
3925: sum3 = t[row+2];
3926: sum4 = t[row+3];
3927: for (n = 0; n<sz-1; n+=2) {
3928: i1 = idx[0];
3929: i2 = idx[1];
3930: idx += 2;
3931: tmp0 = t[i1];
3932: tmp1 = t[i2];
3933: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3934: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3935: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3936: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3937: }
3939: if (n == sz-1) {
3940: tmp0 = t[*idx];
3941: sum1 -= v1[0] * tmp0;
3942: sum2 -= v2[0] * tmp0;
3943: sum3 -= v3[0] * tmp0;
3944: sum4 -= v4[0] * tmp0;
3945: }
3946: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3947: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3948: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3949: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3950: ibdiag += 16; row += 4;
3951: break;
3952: case 5:
3953: v2 = a->a + ii[row+1];
3954: v3 = a->a + ii[row+2];
3955: v4 = a->a + ii[row+3];
3956: v5 = a->a + ii[row+4];
3957: sum1 = t[row];
3958: sum2 = t[row+1];
3959: sum3 = t[row+2];
3960: sum4 = t[row+3];
3961: sum5 = t[row+4];
3962: for (n = 0; n<sz-1; n+=2) {
3963: i1 = idx[0];
3964: i2 = idx[1];
3965: idx += 2;
3966: tmp0 = t[i1];
3967: tmp1 = t[i2];
3968: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3969: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3970: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3971: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3972: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3973: }
3975: if (n == sz-1) {
3976: tmp0 = t[*idx];
3977: sum1 -= v1[0] * tmp0;
3978: sum2 -= v2[0] * tmp0;
3979: sum3 -= v3[0] * tmp0;
3980: sum4 -= v4[0] * tmp0;
3981: sum5 -= v5[0] * tmp0;
3982: }
3983: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3984: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3985: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3986: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3987: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3988: ibdiag += 25; row += 5;
3989: break;
3990: default:
3991: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3992: }
3993: }
3994: PetscLogFlops(a->nz);
3995: }
3996: VecRestoreArray(xx,&x);
3997: VecRestoreArrayRead(bb,&b);
3998: return(0);
3999: }
4003: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4004: {
4005: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4006: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4007: const MatScalar *bdiag = a->inode.bdiag;
4008: const PetscScalar *b;
4009: PetscErrorCode ierr;
4010: PetscInt m = a->inode.node_count,cnt = 0,i,row;
4011: const PetscInt *sizes = a->inode.size;
4014: VecGetArray(xx,&x);
4015: VecGetArrayRead(bb,&b);
4016: cnt = 0;
4017: for (i=0, row=0; i<m; i++) {
4018: switch (sizes[i]) {
4019: case 1:
4020: x[row] = b[row]*bdiag[cnt++];row++;
4021: break;
4022: case 2:
4023: x1 = b[row]; x2 = b[row+1];
4024: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4025: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4026: x[row++] = tmp1;
4027: x[row++] = tmp2;
4028: cnt += 4;
4029: break;
4030: case 3:
4031: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
4032: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4033: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4034: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4035: x[row++] = tmp1;
4036: x[row++] = tmp2;
4037: x[row++] = tmp3;
4038: cnt += 9;
4039: break;
4040: case 4:
4041: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4042: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4043: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4044: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4045: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4046: x[row++] = tmp1;
4047: x[row++] = tmp2;
4048: x[row++] = tmp3;
4049: x[row++] = tmp4;
4050: cnt += 16;
4051: break;
4052: case 5:
4053: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4054: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4055: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4056: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4057: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4058: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4059: x[row++] = tmp1;
4060: x[row++] = tmp2;
4061: x[row++] = tmp3;
4062: x[row++] = tmp4;
4063: x[row++] = tmp5;
4064: cnt += 25;
4065: break;
4066: default:
4067: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4068: }
4069: }
4070: PetscLogFlops(2*cnt);
4071: VecRestoreArray(xx,&x);
4072: VecRestoreArrayRead(bb,&b);
4073: return(0);
4074: }
4076: /*
4077: samestructure indicates that the matrix has not changed its nonzero structure so we
4078: do not need to recompute the inodes
4079: */
4082: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4083: {
4084: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4086: PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4087: PetscBool flag;
4088: const PetscInt *idx,*idy,*ii;
4091: if (!a->inode.use) return(0);
4092: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) return(0);
4094: m = A->rmap->n;
4095: if (a->inode.size) ns = a->inode.size;
4096: else {
4097: PetscMalloc1(m+1,&ns);
4098: }
4100: i = 0;
4101: node_count = 0;
4102: idx = a->j;
4103: ii = a->i;
4104: while (i < m) { /* For each row */
4105: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4106: /* Limits the number of elements in a node to 'a->inode.limit' */
4107: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4108: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4109: if (nzy != nzx) break;
4110: idy += nzx; /* Same nonzero pattern */
4111: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
4112: if (!flag) break;
4113: }
4114: ns[node_count++] = blk_size;
4115: idx += blk_size*nzx;
4116: i = j;
4117: }
4118: /* If not enough inodes found,, do not use inode version of the routines */
4119: if (!m || node_count > .8*m) {
4120: PetscFree(ns);
4122: a->inode.node_count = 0;
4123: a->inode.size = NULL;
4124: a->inode.use = PETSC_FALSE;
4125: A->ops->mult = MatMult_SeqAIJ;
4126: A->ops->sor = MatSOR_SeqAIJ;
4127: A->ops->multadd = MatMultAdd_SeqAIJ;
4128: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4129: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4130: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4131: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4132: A->ops->coloringpatch = 0;
4133: A->ops->multdiagonalblock = 0;
4135: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4136: } else {
4137: if (!A->factortype) {
4138: A->ops->mult = MatMult_SeqAIJ_Inode;
4139: A->ops->sor = MatSOR_SeqAIJ_Inode;
4140: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4141: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4142: if (A->rmap->n == A->cmap->n) {
4143: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4144: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4145: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4146: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4147: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4148: }
4149: } else {
4150: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4151: }
4152: a->inode.node_count = node_count;
4153: a->inode.size = ns;
4154: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4155: }
4156: a->inode.checked = PETSC_TRUE;
4157: a->inode.mat_nonzerostate = A->nonzerostate;
4158: return(0);
4159: }
4163: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4164: {
4165: Mat B =*C;
4166: Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4168: PetscInt m=A->rmap->n;
4171: c->inode.use = a->inode.use;
4172: c->inode.limit = a->inode.limit;
4173: c->inode.max_limit = a->inode.max_limit;
4174: if (a->inode.size) {
4175: PetscMalloc1(m+1,&c->inode.size);
4176: c->inode.node_count = a->inode.node_count;
4177: PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));
4178: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4179: if (!B->factortype) {
4180: B->ops->mult = MatMult_SeqAIJ_Inode;
4181: B->ops->sor = MatSOR_SeqAIJ_Inode;
4182: B->ops->multadd = MatMultAdd_SeqAIJ_Inode;
4183: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4184: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4185: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4186: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4187: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4188: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4189: } else {
4190: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4191: }
4192: } else {
4193: c->inode.size = 0;
4194: c->inode.node_count = 0;
4195: }
4196: c->inode.ibdiagvalid = PETSC_FALSE;
4197: c->inode.ibdiag = 0;
4198: c->inode.bdiag = 0;
4199: return(0);
4200: }
4204: PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4205: {
4206: PetscInt k;
4207: const PetscInt *vi;
4210: vi = aj + ai[row];
4211: for (k=0; k<nzl; k++) cols[k] = vi[k];
4212: vi = aj + adiag[row];
4213: cols[nzl] = vi[0];
4214: vi = aj + adiag[row+1]+1;
4215: for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4216: return(0);
4217: }
4218: /*
4219: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4220: Modified from MatSeqAIJCheckInode().
4222: Input Parameters:
4223: . Mat A - ILU or LU matrix factor
4225: */
4228: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4229: {
4230: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4232: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4233: PetscInt *cols1,*cols2,*ns;
4234: const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4235: PetscBool flag;
4238: if (!a->inode.use) return(0);
4239: if (a->inode.checked) return(0);
4241: m = A->rmap->n;
4242: if (a->inode.size) ns = a->inode.size;
4243: else {
4244: PetscMalloc1(m+1,&ns);
4245: }
4247: i = 0;
4248: node_count = 0;
4249: PetscMalloc2(m,&cols1,m,&cols2);
4250: while (i < m) { /* For each row */
4251: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4252: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4253: nzx = nzl1 + nzu1 + 1;
4254: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4256: /* Limits the number of elements in a node to 'a->inode.limit' */
4257: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4258: nzl2 = ai[j+1] - ai[j];
4259: nzu2 = adiag[j] - adiag[j+1] - 1;
4260: nzy = nzl2 + nzu2 + 1;
4261: if (nzy != nzx) break;
4262: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
4263: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
4264: if (!flag) break;
4265: }
4266: ns[node_count++] = blk_size;
4267: i = j;
4268: }
4269: PetscFree2(cols1,cols2);
4270: /* If not enough inodes found,, do not use inode version of the routines */
4271: if (!m || node_count > .8*m) {
4272: PetscFree(ns);
4274: a->inode.node_count = 0;
4275: a->inode.size = NULL;
4276: a->inode.use = PETSC_FALSE;
4278: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
4279: } else {
4280: A->ops->mult = 0;
4281: A->ops->sor = 0;
4282: A->ops->multadd = 0;
4283: A->ops->getrowij = 0;
4284: A->ops->restorerowij = 0;
4285: A->ops->getcolumnij = 0;
4286: A->ops->restorecolumnij = 0;
4287: A->ops->coloringpatch = 0;
4288: A->ops->multdiagonalblock = 0;
4289: a->inode.node_count = node_count;
4290: a->inode.size = ns;
4292: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
4293: }
4294: a->inode.checked = PETSC_TRUE;
4295: return(0);
4296: }
4300: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4301: {
4302: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4305: a->inode.ibdiagvalid = PETSC_FALSE;
4306: return(0);
4307: }
4309: /*
4310: This is really ugly. if inodes are used this replaces the
4311: permutations with ones that correspond to rows/cols of the matrix
4312: rather then inode blocks
4313: */
4316: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4317: {
4321: PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4322: return(0);
4323: }
4327: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4328: {
4329: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4331: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4332: const PetscInt *ridx,*cidx;
4333: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4334: PetscInt nslim_col,*ns_col;
4335: IS ris = *rperm,cis = *cperm;
4338: if (!a->inode.size) return(0); /* no inodes so return */
4339: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
4341: Mat_CreateColInode(A,&nslim_col,&ns_col);
4342: PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);
4343: PetscMalloc2(m,&permr,n,&permc);
4345: ISGetIndices(ris,&ridx);
4346: ISGetIndices(cis,&cidx);
4348: /* Form the inode structure for the rows of permuted matric using inv perm*/
4349: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4351: /* Construct the permutations for rows*/
4352: for (i=0,row = 0; i<nslim_row; ++i) {
4353: indx = ridx[i];
4354: start_val = tns[indx];
4355: end_val = tns[indx + 1];
4356: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4357: }
4359: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4360: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4362: /* Construct permutations for columns */
4363: for (i=0,col=0; i<nslim_col; ++i) {
4364: indx = cidx[i];
4365: start_val = tns[indx];
4366: end_val = tns[indx + 1];
4367: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4368: }
4370: ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);
4371: ISSetPermutation(*rperm);
4372: ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);
4373: ISSetPermutation(*cperm);
4375: ISRestoreIndices(ris,&ridx);
4376: ISRestoreIndices(cis,&cidx);
4378: PetscFree(ns_col);
4379: PetscFree2(permr,permc);
4380: ISDestroy(&cis);
4381: ISDestroy(&ris);
4382: PetscFree(tns);
4383: return(0);
4384: }
4388: /*@C
4389: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4391: Not Collective
4393: Input Parameter:
4394: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4396: Output Parameter:
4397: + node_count - no of inodes present in the matrix.
4398: . sizes - an array of size node_count,with sizes of each inode.
4399: - limit - the max size used to generate the inodes.
4401: Level: advanced
4403: Notes: This routine returns some internal storage information
4404: of the matrix, it is intended to be used by advanced users.
4405: It should be called after the matrix is assembled.
4406: The contents of the sizes[] array should not be changed.
4407: NULL may be passed for information not requested.
4409: .keywords: matrix, seqaij, get, inode
4411: .seealso: MatGetInfo()
4412: @*/
4413: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4414: {
4415: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4418: if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4419: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);
4420: if (f) {
4421: (*f)(A,node_count,sizes,limit);
4422: }
4423: return(0);
4424: }
4428: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4429: {
4430: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4433: if (node_count) *node_count = a->inode.node_count;
4434: if (sizes) *sizes = a->inode.size;
4435: if (limit) *limit = a->inode.limit;
4436: return(0);
4437: }