Actual source code: baijsolvnat.c
petsc-3.10.2 2018-10-09
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
6: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
7: {
8: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
9: PetscErrorCode ierr;
10: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
11: PetscInt i,nz,idx,idt,m;
12: const MatScalar *aa=a->a,*v;
13: PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
14: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
15: PetscScalar *x;
16: const PetscScalar *b;
19: VecGetArrayRead(bb,&b);
20: VecGetArray(xx,&x);
22: /* forward solve the lower triangular */
23: idx = 0;
24: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx];
25: x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx];
26: x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
28: for (i=1; i<n; i++) {
29: v = aa + bs2*ai[i];
30: vi = aj + ai[i];
31: nz = ai[i+1] - ai[i];
32: idt = bs*i;
33: s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt];
34: s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt];
35: s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
36: for (m=0; m<nz; m++) {
37: idx = bs*vi[m];
38: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
39: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
40: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
43: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
44: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
45: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
46: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
47: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
48: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
49: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
50: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
51: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
52: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
53: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
54: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
55: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
56: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
57: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
59: v += bs2;
60: }
61: x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5;
62: x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10;
63: x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
65: }
66: /* backward solve the upper triangular */
67: for (i=n-1; i>=0; i--) {
68: v = aa + bs2*(adiag[i+1]+1);
69: vi = aj + adiag[i+1]+1;
70: nz = adiag[i] - adiag[i+1] - 1;
71: idt = bs*i;
72: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
73: s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt];
74: s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
76: for (m=0; m<nz; m++) {
77: idx = bs*vi[m];
78: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
79: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
80: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
82: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
83: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
84: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
85: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
86: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
87: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
88: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
89: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
90: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
91: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
92: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
93: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
94: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
95: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
96: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
98: v += bs2;
99: }
101: x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
102: x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
103: x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
104: x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
105: x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
106: x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
107: x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
108: x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
109: x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
110: x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
111: x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
112: x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
113: x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
114: x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
115: x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
117: }
119: VecRestoreArrayRead(bb,&b);
120: VecRestoreArray(xx,&x);
121: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
122: return(0);
123: }
125: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
126: /* Default MatSolve for block size 15 */
128: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
129: {
130: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
131: PetscErrorCode ierr;
132: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
133: PetscInt i,k,nz,idx,idt,m;
134: const MatScalar *aa=a->a,*v;
135: PetscScalar s[15];
136: PetscScalar *x,xv;
137: const PetscScalar *b;
140: VecGetArrayRead(bb,&b);
141: VecGetArray(xx,&x);
143: /* forward solve the lower triangular */
144: for (i=0; i<n; i++) {
145: v = aa + bs2*ai[i];
146: vi = aj + ai[i];
147: nz = ai[i+1] - ai[i];
148: idt = bs*i;
149: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
150: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
151: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
152: for (m=0; m<nz; m++) {
153: idx = bs*vi[m];
154: for (k=0; k<15; k++) {
155: xv = x[k + idx];
156: x[idt] -= v[0]*xv;
157: x[1+idt] -= v[1]*xv;
158: x[2+idt] -= v[2]*xv;
159: x[3+idt] -= v[3]*xv;
160: x[4+idt] -= v[4]*xv;
161: x[5+idt] -= v[5]*xv;
162: x[6+idt] -= v[6]*xv;
163: x[7+idt] -= v[7]*xv;
164: x[8+idt] -= v[8]*xv;
165: x[9+idt] -= v[9]*xv;
166: x[10+idt] -= v[10]*xv;
167: x[11+idt] -= v[11]*xv;
168: x[12+idt] -= v[12]*xv;
169: x[13+idt] -= v[13]*xv;
170: x[14+idt] -= v[14]*xv;
171: v += 15;
172: }
173: }
174: }
175: /* backward solve the upper triangular */
176: for (i=n-1; i>=0; i--) {
177: v = aa + bs2*(adiag[i+1]+1);
178: vi = aj + adiag[i+1]+1;
179: nz = adiag[i] - adiag[i+1] - 1;
180: idt = bs*i;
181: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
182: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
183: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
185: for (m=0; m<nz; m++) {
186: idx = bs*vi[m];
187: for (k=0; k<15; k++) {
188: xv = x[k + idx];
189: s[0] -= v[0]*xv;
190: s[1] -= v[1]*xv;
191: s[2] -= v[2]*xv;
192: s[3] -= v[3]*xv;
193: s[4] -= v[4]*xv;
194: s[5] -= v[5]*xv;
195: s[6] -= v[6]*xv;
196: s[7] -= v[7]*xv;
197: s[8] -= v[8]*xv;
198: s[9] -= v[9]*xv;
199: s[10] -= v[10]*xv;
200: s[11] -= v[11]*xv;
201: s[12] -= v[12]*xv;
202: s[13] -= v[13]*xv;
203: s[14] -= v[14]*xv;
204: v += 15;
205: }
206: }
207: PetscMemzero(x+idt,bs*sizeof(MatScalar));
208: for (k=0; k<15; k++) {
209: x[idt] += v[0]*s[k];
210: x[1+idt] += v[1]*s[k];
211: x[2+idt] += v[2]*s[k];
212: x[3+idt] += v[3]*s[k];
213: x[4+idt] += v[4]*s[k];
214: x[5+idt] += v[5]*s[k];
215: x[6+idt] += v[6]*s[k];
216: x[7+idt] += v[7]*s[k];
217: x[8+idt] += v[8]*s[k];
218: x[9+idt] += v[9]*s[k];
219: x[10+idt] += v[10]*s[k];
220: x[11+idt] += v[11]*s[k];
221: x[12+idt] += v[12]*s[k];
222: x[13+idt] += v[13]*s[k];
223: x[14+idt] += v[14]*s[k];
224: v += 15;
225: }
226: }
227: VecRestoreArrayRead(bb,&b);
228: VecRestoreArray(xx,&x);
229: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
230: return(0);
231: }
233: /* Block operations are done by accessing one column at at time */
234: /* Default MatSolve for block size 14 */
236: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)
237: {
238: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
239: PetscErrorCode ierr;
240: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
241: PetscInt i,k,nz,idx,idt,m;
242: const MatScalar *aa=a->a,*v;
243: PetscScalar s[14];
244: PetscScalar *x,xv;
245: const PetscScalar *b;
248: VecGetArrayRead(bb,&b);
249: VecGetArray(xx,&x);
251: /* forward solve the lower triangular */
252: for (i=0; i<n; i++) {
253: v = aa + bs2*ai[i];
254: vi = aj + ai[i];
255: nz = ai[i+1] - ai[i];
256: idt = bs*i;
257: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
258: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
259: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt];
260: for (m=0; m<nz; m++) {
261: idx = bs*vi[m];
262: for (k=0; k<bs; k++) {
263: xv = x[k + idx];
264: x[idt] -= v[0]*xv;
265: x[1+idt] -= v[1]*xv;
266: x[2+idt] -= v[2]*xv;
267: x[3+idt] -= v[3]*xv;
268: x[4+idt] -= v[4]*xv;
269: x[5+idt] -= v[5]*xv;
270: x[6+idt] -= v[6]*xv;
271: x[7+idt] -= v[7]*xv;
272: x[8+idt] -= v[8]*xv;
273: x[9+idt] -= v[9]*xv;
274: x[10+idt] -= v[10]*xv;
275: x[11+idt] -= v[11]*xv;
276: x[12+idt] -= v[12]*xv;
277: x[13+idt] -= v[13]*xv;
278: v += 14;
279: }
280: }
281: }
282: /* backward solve the upper triangular */
283: for (i=n-1; i>=0; i--) {
284: v = aa + bs2*(adiag[i+1]+1);
285: vi = aj + adiag[i+1]+1;
286: nz = adiag[i] - adiag[i+1] - 1;
287: idt = bs*i;
288: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
289: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
290: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt];
292: for (m=0; m<nz; m++) {
293: idx = bs*vi[m];
294: for (k=0; k<bs; k++) {
295: xv = x[k + idx];
296: s[0] -= v[0]*xv;
297: s[1] -= v[1]*xv;
298: s[2] -= v[2]*xv;
299: s[3] -= v[3]*xv;
300: s[4] -= v[4]*xv;
301: s[5] -= v[5]*xv;
302: s[6] -= v[6]*xv;
303: s[7] -= v[7]*xv;
304: s[8] -= v[8]*xv;
305: s[9] -= v[9]*xv;
306: s[10] -= v[10]*xv;
307: s[11] -= v[11]*xv;
308: s[12] -= v[12]*xv;
309: s[13] -= v[13]*xv;
310: v += 14;
311: }
312: }
313: PetscMemzero(x+idt,bs*sizeof(MatScalar));
314: for (k=0; k<bs; k++) {
315: x[idt] += v[0]*s[k];
316: x[1+idt] += v[1]*s[k];
317: x[2+idt] += v[2]*s[k];
318: x[3+idt] += v[3]*s[k];
319: x[4+idt] += v[4]*s[k];
320: x[5+idt] += v[5]*s[k];
321: x[6+idt] += v[6]*s[k];
322: x[7+idt] += v[7]*s[k];
323: x[8+idt] += v[8]*s[k];
324: x[9+idt] += v[9]*s[k];
325: x[10+idt] += v[10]*s[k];
326: x[11+idt] += v[11]*s[k];
327: x[12+idt] += v[12]*s[k];
328: x[13+idt] += v[13]*s[k];
329: v += 14;
330: }
331: }
332: VecRestoreArrayRead(bb,&b);
333: VecRestoreArray(xx,&x);
334: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
335: return(0);
336: }
338: /* Block operations are done by accessing one column at at time */
339: /* Default MatSolve for block size 13 */
341: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)
342: {
343: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
344: PetscErrorCode ierr;
345: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
346: PetscInt i,k,nz,idx,idt,m;
347: const MatScalar *aa=a->a,*v;
348: PetscScalar s[13];
349: PetscScalar *x,xv;
350: const PetscScalar *b;
353: VecGetArrayRead(bb,&b);
354: VecGetArray(xx,&x);
356: /* forward solve the lower triangular */
357: for (i=0; i<n; i++) {
358: v = aa + bs2*ai[i];
359: vi = aj + ai[i];
360: nz = ai[i+1] - ai[i];
361: idt = bs*i;
362: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
363: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
364: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt];
365: for (m=0; m<nz; m++) {
366: idx = bs*vi[m];
367: for (k=0; k<bs; k++) {
368: xv = x[k + idx];
369: x[idt] -= v[0]*xv;
370: x[1+idt] -= v[1]*xv;
371: x[2+idt] -= v[2]*xv;
372: x[3+idt] -= v[3]*xv;
373: x[4+idt] -= v[4]*xv;
374: x[5+idt] -= v[5]*xv;
375: x[6+idt] -= v[6]*xv;
376: x[7+idt] -= v[7]*xv;
377: x[8+idt] -= v[8]*xv;
378: x[9+idt] -= v[9]*xv;
379: x[10+idt] -= v[10]*xv;
380: x[11+idt] -= v[11]*xv;
381: x[12+idt] -= v[12]*xv;
382: v += 13;
383: }
384: }
385: }
386: /* backward solve the upper triangular */
387: for (i=n-1; i>=0; i--) {
388: v = aa + bs2*(adiag[i+1]+1);
389: vi = aj + adiag[i+1]+1;
390: nz = adiag[i] - adiag[i+1] - 1;
391: idt = bs*i;
392: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
393: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
394: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt];
396: for (m=0; m<nz; m++) {
397: idx = bs*vi[m];
398: for (k=0; k<bs; k++) {
399: xv = x[k + idx];
400: s[0] -= v[0]*xv;
401: s[1] -= v[1]*xv;
402: s[2] -= v[2]*xv;
403: s[3] -= v[3]*xv;
404: s[4] -= v[4]*xv;
405: s[5] -= v[5]*xv;
406: s[6] -= v[6]*xv;
407: s[7] -= v[7]*xv;
408: s[8] -= v[8]*xv;
409: s[9] -= v[9]*xv;
410: s[10] -= v[10]*xv;
411: s[11] -= v[11]*xv;
412: s[12] -= v[12]*xv;
413: v += 13;
414: }
415: }
416: PetscMemzero(x+idt,bs*sizeof(MatScalar));
417: for (k=0; k<bs; k++) {
418: x[idt] += v[0]*s[k];
419: x[1+idt] += v[1]*s[k];
420: x[2+idt] += v[2]*s[k];
421: x[3+idt] += v[3]*s[k];
422: x[4+idt] += v[4]*s[k];
423: x[5+idt] += v[5]*s[k];
424: x[6+idt] += v[6]*s[k];
425: x[7+idt] += v[7]*s[k];
426: x[8+idt] += v[8]*s[k];
427: x[9+idt] += v[9]*s[k];
428: x[10+idt] += v[10]*s[k];
429: x[11+idt] += v[11]*s[k];
430: x[12+idt] += v[12]*s[k];
431: v += 13;
432: }
433: }
434: VecRestoreArrayRead(bb,&b);
435: VecRestoreArray(xx,&x);
436: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
437: return(0);
438: }
440: /* Block operations are done by accessing one column at at time */
441: /* Default MatSolve for block size 12 */
443: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)
444: {
445: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
446: PetscErrorCode ierr;
447: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
448: PetscInt i,k,nz,idx,idt,m;
449: const MatScalar *aa=a->a,*v;
450: PetscScalar s[12];
451: PetscScalar *x,xv;
452: const PetscScalar *b;
455: VecGetArrayRead(bb,&b);
456: VecGetArray(xx,&x);
458: /* forward solve the lower triangular */
459: for (i=0; i<n; i++) {
460: v = aa + bs2*ai[i];
461: vi = aj + ai[i];
462: nz = ai[i+1] - ai[i];
463: idt = bs*i;
464: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
465: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
466: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt];
467: for (m=0; m<nz; m++) {
468: idx = bs*vi[m];
469: for (k=0; k<bs; k++) {
470: xv = x[k + idx];
471: x[idt] -= v[0]*xv;
472: x[1+idt] -= v[1]*xv;
473: x[2+idt] -= v[2]*xv;
474: x[3+idt] -= v[3]*xv;
475: x[4+idt] -= v[4]*xv;
476: x[5+idt] -= v[5]*xv;
477: x[6+idt] -= v[6]*xv;
478: x[7+idt] -= v[7]*xv;
479: x[8+idt] -= v[8]*xv;
480: x[9+idt] -= v[9]*xv;
481: x[10+idt] -= v[10]*xv;
482: x[11+idt] -= v[11]*xv;
483: v += 12;
484: }
485: }
486: }
487: /* backward solve the upper triangular */
488: for (i=n-1; i>=0; i--) {
489: v = aa + bs2*(adiag[i+1]+1);
490: vi = aj + adiag[i+1]+1;
491: nz = adiag[i] - adiag[i+1] - 1;
492: idt = bs*i;
493: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
494: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
495: s[10] = x[10+idt]; s[11] = x[11+idt];
497: for (m=0; m<nz; m++) {
498: idx = bs*vi[m];
499: for (k=0; k<bs; k++) {
500: xv = x[k + idx];
501: s[0] -= v[0]*xv;
502: s[1] -= v[1]*xv;
503: s[2] -= v[2]*xv;
504: s[3] -= v[3]*xv;
505: s[4] -= v[4]*xv;
506: s[5] -= v[5]*xv;
507: s[6] -= v[6]*xv;
508: s[7] -= v[7]*xv;
509: s[8] -= v[8]*xv;
510: s[9] -= v[9]*xv;
511: s[10] -= v[10]*xv;
512: s[11] -= v[11]*xv;
513: v += 12;
514: }
515: }
516: PetscMemzero(x+idt,bs*sizeof(MatScalar));
517: for (k=0; k<bs; k++) {
518: x[idt] += v[0]*s[k];
519: x[1+idt] += v[1]*s[k];
520: x[2+idt] += v[2]*s[k];
521: x[3+idt] += v[3]*s[k];
522: x[4+idt] += v[4]*s[k];
523: x[5+idt] += v[5]*s[k];
524: x[6+idt] += v[6]*s[k];
525: x[7+idt] += v[7]*s[k];
526: x[8+idt] += v[8]*s[k];
527: x[9+idt] += v[9]*s[k];
528: x[10+idt] += v[10]*s[k];
529: x[11+idt] += v[11]*s[k];
530: v += 12;
531: }
532: }
533: VecRestoreArrayRead(bb,&b);
534: VecRestoreArray(xx,&x);
535: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
536: return(0);
537: }
539: /* Block operations are done by accessing one column at at time */
540: /* Default MatSolve for block size 11 */
542: PetscErrorCode MatSolve_SeqBAIJ_11_NaturalOrdering(Mat A,Vec bb,Vec xx)
543: {
544: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
545: PetscErrorCode ierr;
546: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
547: PetscInt i,k,nz,idx,idt,m;
548: const MatScalar *aa=a->a,*v;
549: PetscScalar s[11];
550: PetscScalar *x,xv;
551: const PetscScalar *b;
554: VecGetArrayRead(bb,&b);
555: VecGetArray(xx,&x);
557: /* forward solve the lower triangular */
558: for (i=0; i<n; i++) {
559: v = aa + bs2*ai[i];
560: vi = aj + ai[i];
561: nz = ai[i+1] - ai[i];
562: idt = bs*i;
563: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
564: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
565: x[10+idt] = b[10+idt];
566: for (m=0; m<nz; m++) {
567: idx = bs*vi[m];
568: for (k=0; k<11; k++) {
569: xv = x[k + idx];
570: x[idt] -= v[0]*xv;
571: x[1+idt] -= v[1]*xv;
572: x[2+idt] -= v[2]*xv;
573: x[3+idt] -= v[3]*xv;
574: x[4+idt] -= v[4]*xv;
575: x[5+idt] -= v[5]*xv;
576: x[6+idt] -= v[6]*xv;
577: x[7+idt] -= v[7]*xv;
578: x[8+idt] -= v[8]*xv;
579: x[9+idt] -= v[9]*xv;
580: x[10+idt] -= v[10]*xv;
581: v += 11;
582: }
583: }
584: }
585: /* backward solve the upper triangular */
586: for (i=n-1; i>=0; i--) {
587: v = aa + bs2*(adiag[i+1]+1);
588: vi = aj + adiag[i+1]+1;
589: nz = adiag[i] - adiag[i+1] - 1;
590: idt = bs*i;
591: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
592: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
593: s[10] = x[10+idt];
595: for (m=0; m<nz; m++) {
596: idx = bs*vi[m];
597: for (k=0; k<11; k++) {
598: xv = x[k + idx];
599: s[0] -= v[0]*xv;
600: s[1] -= v[1]*xv;
601: s[2] -= v[2]*xv;
602: s[3] -= v[3]*xv;
603: s[4] -= v[4]*xv;
604: s[5] -= v[5]*xv;
605: s[6] -= v[6]*xv;
606: s[7] -= v[7]*xv;
607: s[8] -= v[8]*xv;
608: s[9] -= v[9]*xv;
609: s[10] -= v[10]*xv;
610: v += 11;
611: }
612: }
613: PetscMemzero(x+idt,bs*sizeof(MatScalar));
614: for (k=0; k<11; k++) {
615: x[idt] += v[0]*s[k];
616: x[1+idt] += v[1]*s[k];
617: x[2+idt] += v[2]*s[k];
618: x[3+idt] += v[3]*s[k];
619: x[4+idt] += v[4]*s[k];
620: x[5+idt] += v[5]*s[k];
621: x[6+idt] += v[6]*s[k];
622: x[7+idt] += v[7]*s[k];
623: x[8+idt] += v[8]*s[k];
624: x[9+idt] += v[9]*s[k];
625: x[10+idt] += v[10]*s[k];
626: v += 11;
627: }
628: }
629: VecRestoreArrayRead(bb,&b);
630: VecRestoreArray(xx,&x);
631: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
632: return(0);
633: }
635: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
636: {
637: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
638: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
639: PetscErrorCode ierr;
640: PetscInt i,nz,idx,idt,jdx;
641: const MatScalar *aa=a->a,*v;
642: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
643: const PetscScalar *b;
646: VecGetArrayRead(bb,&b);
647: VecGetArray(xx,&x);
648: /* forward solve the lower triangular */
649: idx = 0;
650: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
651: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
652: x[6] = b[6+idx];
653: for (i=1; i<n; i++) {
654: v = aa + 49*ai[i];
655: vi = aj + ai[i];
656: nz = diag[i] - ai[i];
657: idx = 7*i;
658: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
659: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
660: s7 = b[6+idx];
661: while (nz--) {
662: jdx = 7*(*vi++);
663: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
664: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
665: x7 = x[6+jdx];
666: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
667: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
668: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
669: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
670: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
671: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
672: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
673: v += 49;
674: }
675: x[idx] = s1;
676: x[1+idx] = s2;
677: x[2+idx] = s3;
678: x[3+idx] = s4;
679: x[4+idx] = s5;
680: x[5+idx] = s6;
681: x[6+idx] = s7;
682: }
683: /* backward solve the upper triangular */
684: for (i=n-1; i>=0; i--) {
685: v = aa + 49*diag[i] + 49;
686: vi = aj + diag[i] + 1;
687: nz = ai[i+1] - diag[i] - 1;
688: idt = 7*i;
689: s1 = x[idt]; s2 = x[1+idt];
690: s3 = x[2+idt]; s4 = x[3+idt];
691: s5 = x[4+idt]; s6 = x[5+idt];
692: s7 = x[6+idt];
693: while (nz--) {
694: idx = 7*(*vi++);
695: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
696: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
697: x7 = x[6+idx];
698: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
699: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
700: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
701: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
702: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
703: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
704: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
705: v += 49;
706: }
707: v = aa + 49*diag[i];
708: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
709: + v[28]*s5 + v[35]*s6 + v[42]*s7;
710: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
711: + v[29]*s5 + v[36]*s6 + v[43]*s7;
712: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
713: + v[30]*s5 + v[37]*s6 + v[44]*s7;
714: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
715: + v[31]*s5 + v[38]*s6 + v[45]*s7;
716: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
717: + v[32]*s5 + v[39]*s6 + v[46]*s7;
718: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
719: + v[33]*s5 + v[40]*s6 + v[47]*s7;
720: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
721: + v[34]*s5 + v[41]*s6 + v[48]*s7;
722: }
724: VecRestoreArrayRead(bb,&b);
725: VecRestoreArray(xx,&x);
726: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
727: return(0);
728: }
730: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
731: {
732: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
733: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
734: PetscErrorCode ierr;
735: PetscInt i,k,nz,idx,jdx,idt;
736: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
737: const MatScalar *aa=a->a,*v;
738: PetscScalar *x;
739: const PetscScalar *b;
740: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
743: VecGetArrayRead(bb,&b);
744: VecGetArray(xx,&x);
745: /* forward solve the lower triangular */
746: idx = 0;
747: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
748: x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
749: for (i=1; i<n; i++) {
750: v = aa + bs2*ai[i];
751: vi = aj + ai[i];
752: nz = ai[i+1] - ai[i];
753: idx = bs*i;
754: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
755: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
756: for (k=0; k<nz; k++) {
757: jdx = bs*vi[k];
758: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
759: x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
760: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
761: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
762: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
763: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
764: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
765: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
766: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
767: v += bs2;
768: }
770: x[idx] = s1;
771: x[1+idx] = s2;
772: x[2+idx] = s3;
773: x[3+idx] = s4;
774: x[4+idx] = s5;
775: x[5+idx] = s6;
776: x[6+idx] = s7;
777: }
779: /* backward solve the upper triangular */
780: for (i=n-1; i>=0; i--) {
781: v = aa + bs2*(adiag[i+1]+1);
782: vi = aj + adiag[i+1]+1;
783: nz = adiag[i] - adiag[i+1]-1;
784: idt = bs*i;
785: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
786: s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
787: for (k=0; k<nz; k++) {
788: idx = bs*vi[k];
789: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
790: x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
791: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
792: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
793: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
794: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
795: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
796: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
797: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
798: v += bs2;
799: }
800: /* x = inv_diagonal*x */
801: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
802: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
803: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
804: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
805: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
806: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
807: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
808: }
810: VecRestoreArrayRead(bb,&b);
811: VecRestoreArray(xx,&x);
812: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
813: return(0);
814: }
816: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
817: {
818: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
819: PetscInt i,nz,idx,idt,jdx;
820: PetscErrorCode ierr;
821: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
822: const MatScalar *aa =a->a,*v;
823: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
824: const PetscScalar *b;
827: VecGetArrayRead(bb,&b);
828: VecGetArray(xx,&x);
829: /* forward solve the lower triangular */
830: idx = 0;
831: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
832: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
833: for (i=1; i<n; i++) {
834: v = aa + 36*ai[i];
835: vi = aj + ai[i];
836: nz = diag[i] - ai[i];
837: idx = 6*i;
838: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
839: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
840: while (nz--) {
841: jdx = 6*(*vi++);
842: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
843: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
844: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
845: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
846: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
847: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
848: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
849: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
850: v += 36;
851: }
852: x[idx] = s1;
853: x[1+idx] = s2;
854: x[2+idx] = s3;
855: x[3+idx] = s4;
856: x[4+idx] = s5;
857: x[5+idx] = s6;
858: }
859: /* backward solve the upper triangular */
860: for (i=n-1; i>=0; i--) {
861: v = aa + 36*diag[i] + 36;
862: vi = aj + diag[i] + 1;
863: nz = ai[i+1] - diag[i] - 1;
864: idt = 6*i;
865: s1 = x[idt]; s2 = x[1+idt];
866: s3 = x[2+idt]; s4 = x[3+idt];
867: s5 = x[4+idt]; s6 = x[5+idt];
868: while (nz--) {
869: idx = 6*(*vi++);
870: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
871: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
872: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
873: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
874: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
875: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
876: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
877: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
878: v += 36;
879: }
880: v = aa + 36*diag[i];
881: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
882: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
883: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
884: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
885: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
886: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
887: }
889: VecRestoreArrayRead(bb,&b);
890: VecRestoreArray(xx,&x);
891: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
892: return(0);
893: }
895: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
896: {
897: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
898: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
899: PetscErrorCode ierr;
900: PetscInt i,k,nz,idx,jdx,idt;
901: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
902: const MatScalar *aa=a->a,*v;
903: PetscScalar *x;
904: const PetscScalar *b;
905: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
908: VecGetArrayRead(bb,&b);
909: VecGetArray(xx,&x);
910: /* forward solve the lower triangular */
911: idx = 0;
912: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
913: x[4] = b[4+idx];x[5] = b[5+idx];
914: for (i=1; i<n; i++) {
915: v = aa + bs2*ai[i];
916: vi = aj + ai[i];
917: nz = ai[i+1] - ai[i];
918: idx = bs*i;
919: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
920: s5 = b[4+idx];s6 = b[5+idx];
921: for (k=0; k<nz; k++) {
922: jdx = bs*vi[k];
923: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
924: x5 = x[4+jdx]; x6 = x[5+jdx];
925: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
926: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
927: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
928: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
929: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
930: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
931: v += bs2;
932: }
934: x[idx] = s1;
935: x[1+idx] = s2;
936: x[2+idx] = s3;
937: x[3+idx] = s4;
938: x[4+idx] = s5;
939: x[5+idx] = s6;
940: }
942: /* backward solve the upper triangular */
943: for (i=n-1; i>=0; i--) {
944: v = aa + bs2*(adiag[i+1]+1);
945: vi = aj + adiag[i+1]+1;
946: nz = adiag[i] - adiag[i+1]-1;
947: idt = bs*i;
948: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
949: s5 = x[4+idt];s6 = x[5+idt];
950: for (k=0; k<nz; k++) {
951: idx = bs*vi[k];
952: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
953: x5 = x[4+idx];x6 = x[5+idx];
954: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
955: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
956: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
957: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
958: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
959: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
960: v += bs2;
961: }
962: /* x = inv_diagonal*x */
963: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
964: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
965: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
966: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
967: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
968: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
969: }
971: VecRestoreArrayRead(bb,&b);
972: VecRestoreArray(xx,&x);
973: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
974: return(0);
975: }
977: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
978: {
979: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
980: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
981: PetscInt i,nz,idx,idt,jdx;
982: PetscErrorCode ierr;
983: const MatScalar *aa=a->a,*v;
984: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
985: const PetscScalar *b;
988: VecGetArrayRead(bb,&b);
989: VecGetArray(xx,&x);
990: /* forward solve the lower triangular */
991: idx = 0;
992: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
993: for (i=1; i<n; i++) {
994: v = aa + 25*ai[i];
995: vi = aj + ai[i];
996: nz = diag[i] - ai[i];
997: idx = 5*i;
998: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
999: while (nz--) {
1000: jdx = 5*(*vi++);
1001: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1002: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1003: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1004: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1005: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1006: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1007: v += 25;
1008: }
1009: x[idx] = s1;
1010: x[1+idx] = s2;
1011: x[2+idx] = s3;
1012: x[3+idx] = s4;
1013: x[4+idx] = s5;
1014: }
1015: /* backward solve the upper triangular */
1016: for (i=n-1; i>=0; i--) {
1017: v = aa + 25*diag[i] + 25;
1018: vi = aj + diag[i] + 1;
1019: nz = ai[i+1] - diag[i] - 1;
1020: idt = 5*i;
1021: s1 = x[idt]; s2 = x[1+idt];
1022: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1023: while (nz--) {
1024: idx = 5*(*vi++);
1025: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1026: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1027: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1028: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1029: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1030: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1031: v += 25;
1032: }
1033: v = aa + 25*diag[i];
1034: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1035: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1036: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1037: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1038: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1039: }
1041: VecRestoreArrayRead(bb,&b);
1042: VecRestoreArray(xx,&x);
1043: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1044: return(0);
1045: }
1047: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1048: {
1049: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1050: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1051: PetscInt i,k,nz,idx,idt,jdx;
1052: PetscErrorCode ierr;
1053: const MatScalar *aa=a->a,*v;
1054: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1055: const PetscScalar *b;
1058: VecGetArrayRead(bb,&b);
1059: VecGetArray(xx,&x);
1060: /* forward solve the lower triangular */
1061: idx = 0;
1062: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1063: for (i=1; i<n; i++) {
1064: v = aa + 25*ai[i];
1065: vi = aj + ai[i];
1066: nz = ai[i+1] - ai[i];
1067: idx = 5*i;
1068: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1069: for (k=0; k<nz; k++) {
1070: jdx = 5*vi[k];
1071: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1072: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1073: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1074: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1075: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1076: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1077: v += 25;
1078: }
1079: x[idx] = s1;
1080: x[1+idx] = s2;
1081: x[2+idx] = s3;
1082: x[3+idx] = s4;
1083: x[4+idx] = s5;
1084: }
1086: /* backward solve the upper triangular */
1087: for (i=n-1; i>=0; i--) {
1088: v = aa + 25*(adiag[i+1]+1);
1089: vi = aj + adiag[i+1]+1;
1090: nz = adiag[i] - adiag[i+1]-1;
1091: idt = 5*i;
1092: s1 = x[idt]; s2 = x[1+idt];
1093: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1094: for (k=0; k<nz; k++) {
1095: idx = 5*vi[k];
1096: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1097: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1098: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1099: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1100: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1101: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1102: v += 25;
1103: }
1104: /* x = inv_diagonal*x */
1105: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1106: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1107: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1108: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1109: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1110: }
1112: VecRestoreArrayRead(bb,&b);
1113: VecRestoreArray(xx,&x);
1114: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1115: return(0);
1116: }
1118: /*
1119: Special case where the matrix was ILU(0) factored in the natural
1120: ordering. This eliminates the need for the column and row permutation.
1121: */
1122: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1123: {
1124: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1125: PetscInt n =a->mbs;
1126: const PetscInt *ai=a->i,*aj=a->j;
1127: PetscErrorCode ierr;
1128: const PetscInt *diag = a->diag;
1129: const MatScalar *aa =a->a;
1130: PetscScalar *x;
1131: const PetscScalar *b;
1134: VecGetArrayRead(bb,&b);
1135: VecGetArray(xx,&x);
1137: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1138: {
1139: static PetscScalar w[2000]; /* very BAD need to fix */
1140: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1141: }
1142: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1143: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1144: #else
1145: {
1146: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
1147: const MatScalar *v;
1148: PetscInt jdx,idt,idx,nz,i,ai16;
1149: const PetscInt *vi;
1151: /* forward solve the lower triangular */
1152: idx = 0;
1153: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1154: for (i=1; i<n; i++) {
1155: v = aa + 16*ai[i];
1156: vi = aj + ai[i];
1157: nz = diag[i] - ai[i];
1158: idx += 4;
1159: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1160: while (nz--) {
1161: jdx = 4*(*vi++);
1162: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1163: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1164: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1165: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1166: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1167: v += 16;
1168: }
1169: x[idx] = s1;
1170: x[1+idx] = s2;
1171: x[2+idx] = s3;
1172: x[3+idx] = s4;
1173: }
1174: /* backward solve the upper triangular */
1175: idt = 4*(n-1);
1176: for (i=n-1; i>=0; i--) {
1177: ai16 = 16*diag[i];
1178: v = aa + ai16 + 16;
1179: vi = aj + diag[i] + 1;
1180: nz = ai[i+1] - diag[i] - 1;
1181: s1 = x[idt]; s2 = x[1+idt];
1182: s3 = x[2+idt];s4 = x[3+idt];
1183: while (nz--) {
1184: idx = 4*(*vi++);
1185: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
1186: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1187: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1188: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1189: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1190: v += 16;
1191: }
1192: v = aa + ai16;
1193: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
1194: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
1195: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1196: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1197: idt -= 4;
1198: }
1199: }
1200: #endif
1202: VecRestoreArrayRead(bb,&b);
1203: VecRestoreArray(xx,&x);
1204: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1205: return(0);
1206: }
1208: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1209: {
1210: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1211: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1212: PetscInt i,k,nz,idx,jdx,idt;
1213: PetscErrorCode ierr;
1214: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1215: const MatScalar *aa=a->a,*v;
1216: PetscScalar *x;
1217: const PetscScalar *b;
1218: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
1221: VecGetArrayRead(bb,&b);
1222: VecGetArray(xx,&x);
1223: /* forward solve the lower triangular */
1224: idx = 0;
1225: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1226: for (i=1; i<n; i++) {
1227: v = aa + bs2*ai[i];
1228: vi = aj + ai[i];
1229: nz = ai[i+1] - ai[i];
1230: idx = bs*i;
1231: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1232: for (k=0; k<nz; k++) {
1233: jdx = bs*vi[k];
1234: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1235: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1236: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1237: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1238: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1240: v += bs2;
1241: }
1243: x[idx] = s1;
1244: x[1+idx] = s2;
1245: x[2+idx] = s3;
1246: x[3+idx] = s4;
1247: }
1249: /* backward solve the upper triangular */
1250: for (i=n-1; i>=0; i--) {
1251: v = aa + bs2*(adiag[i+1]+1);
1252: vi = aj + adiag[i+1]+1;
1253: nz = adiag[i] - adiag[i+1]-1;
1254: idt = bs*i;
1255: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1257: for (k=0; k<nz; k++) {
1258: idx = bs*vi[k];
1259: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1260: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1261: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1262: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1263: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1265: v += bs2;
1266: }
1267: /* x = inv_diagonal*x */
1268: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
1269: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
1270: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1271: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1273: }
1275: VecRestoreArrayRead(bb,&b);
1276: VecRestoreArray(xx,&x);
1277: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1278: return(0);
1279: }
1281: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
1282: {
1283: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1284: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
1285: PetscErrorCode ierr;
1286: const MatScalar *aa=a->a;
1287: const PetscScalar *b;
1288: PetscScalar *x;
1291: VecGetArrayRead(bb,&b);
1292: VecGetArray(xx,&x);
1294: {
1295: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
1296: const MatScalar *v;
1297: MatScalar *t=(MatScalar*)x;
1298: PetscInt jdx,idt,idx,nz,i,ai16;
1299: const PetscInt *vi;
1301: /* forward solve the lower triangular */
1302: idx = 0;
1303: t[0] = (MatScalar)b[0];
1304: t[1] = (MatScalar)b[1];
1305: t[2] = (MatScalar)b[2];
1306: t[3] = (MatScalar)b[3];
1307: for (i=1; i<n; i++) {
1308: v = aa + 16*ai[i];
1309: vi = aj + ai[i];
1310: nz = diag[i] - ai[i];
1311: idx += 4;
1312: s1 = (MatScalar)b[idx];
1313: s2 = (MatScalar)b[1+idx];
1314: s3 = (MatScalar)b[2+idx];
1315: s4 = (MatScalar)b[3+idx];
1316: while (nz--) {
1317: jdx = 4*(*vi++);
1318: x1 = t[jdx];
1319: x2 = t[1+jdx];
1320: x3 = t[2+jdx];
1321: x4 = t[3+jdx];
1322: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1323: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1324: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1325: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1326: v += 16;
1327: }
1328: t[idx] = s1;
1329: t[1+idx] = s2;
1330: t[2+idx] = s3;
1331: t[3+idx] = s4;
1332: }
1333: /* backward solve the upper triangular */
1334: idt = 4*(n-1);
1335: for (i=n-1; i>=0; i--) {
1336: ai16 = 16*diag[i];
1337: v = aa + ai16 + 16;
1338: vi = aj + diag[i] + 1;
1339: nz = ai[i+1] - diag[i] - 1;
1340: s1 = t[idt];
1341: s2 = t[1+idt];
1342: s3 = t[2+idt];
1343: s4 = t[3+idt];
1344: while (nz--) {
1345: idx = 4*(*vi++);
1346: x1 = (MatScalar)x[idx];
1347: x2 = (MatScalar)x[1+idx];
1348: x3 = (MatScalar)x[2+idx];
1349: x4 = (MatScalar)x[3+idx];
1350: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1351: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1352: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1353: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1354: v += 16;
1355: }
1356: v = aa + ai16;
1357: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
1358: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
1359: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
1360: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
1361: idt -= 4;
1362: }
1363: }
1365: VecRestoreArrayRead(bb,&b);
1366: VecRestoreArray(xx,&x);
1367: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1368: return(0);
1369: }
1371: #if defined(PETSC_HAVE_SSE)
1373: #include PETSC_HAVE_SSE
1374: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
1375: {
1376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1377: unsigned short *aj=(unsigned short*)a->j;
1379: int *ai=a->i,n=a->mbs,*diag = a->diag;
1380: MatScalar *aa=a->a;
1381: PetscScalar *x,*b;
1384: SSE_SCOPE_BEGIN;
1385: /*
1386: Note: This code currently uses demotion of double
1387: to float when performing the mixed-mode computation.
1388: This may not be numerically reasonable for all applications.
1389: */
1390: PREFETCH_NTA(aa+16*ai[1]);
1392: VecGetArray(bb,&b);
1393: VecGetArray(xx,&x);
1394: {
1395: /* x will first be computed in single precision then promoted inplace to double */
1396: MatScalar *v,*t=(MatScalar*)x;
1397: int nz,i,idt,ai16;
1398: unsigned int jdx,idx;
1399: unsigned short *vi;
1400: /* Forward solve the lower triangular factor. */
1402: /* First block is the identity. */
1403: idx = 0;
1404: CONVERT_DOUBLE4_FLOAT4(t,b);
1405: v = aa + 16*((unsigned int)ai[1]);
1407: for (i=1; i<n; ) {
1408: PREFETCH_NTA(&v[8]);
1409: vi = aj + ai[i];
1410: nz = diag[i] - ai[i];
1411: idx += 4;
1413: /* Demote RHS from double to float. */
1414: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1415: LOAD_PS(&t[idx],XMM7);
1417: while (nz--) {
1418: PREFETCH_NTA(&v[16]);
1419: jdx = 4*((unsigned int)(*vi++));
1421: /* 4x4 Matrix-Vector product with negative accumulation: */
1422: SSE_INLINE_BEGIN_2(&t[jdx],v)
1423: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1425: /* First Column */
1426: SSE_COPY_PS(XMM0,XMM6)
1427: SSE_SHUFFLE(XMM0,XMM0,0x00)
1428: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1429: SSE_SUB_PS(XMM7,XMM0)
1431: /* Second Column */
1432: SSE_COPY_PS(XMM1,XMM6)
1433: SSE_SHUFFLE(XMM1,XMM1,0x55)
1434: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1435: SSE_SUB_PS(XMM7,XMM1)
1437: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1439: /* Third Column */
1440: SSE_COPY_PS(XMM2,XMM6)
1441: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1442: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1443: SSE_SUB_PS(XMM7,XMM2)
1445: /* Fourth Column */
1446: SSE_COPY_PS(XMM3,XMM6)
1447: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1448: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1449: SSE_SUB_PS(XMM7,XMM3)
1450: SSE_INLINE_END_2
1452: v += 16;
1453: }
1454: v = aa + 16*ai[++i];
1455: PREFETCH_NTA(v);
1456: STORE_PS(&t[idx],XMM7);
1457: }
1459: /* Backward solve the upper triangular factor.*/
1461: idt = 4*(n-1);
1462: ai16 = 16*diag[n-1];
1463: v = aa + ai16 + 16;
1464: for (i=n-1; i>=0; ) {
1465: PREFETCH_NTA(&v[8]);
1466: vi = aj + diag[i] + 1;
1467: nz = ai[i+1] - diag[i] - 1;
1469: LOAD_PS(&t[idt],XMM7);
1471: while (nz--) {
1472: PREFETCH_NTA(&v[16]);
1473: idx = 4*((unsigned int)(*vi++));
1475: /* 4x4 Matrix-Vector Product with negative accumulation: */
1476: SSE_INLINE_BEGIN_2(&t[idx],v)
1477: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1479: /* First Column */
1480: SSE_COPY_PS(XMM0,XMM6)
1481: SSE_SHUFFLE(XMM0,XMM0,0x00)
1482: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1483: SSE_SUB_PS(XMM7,XMM0)
1485: /* Second Column */
1486: SSE_COPY_PS(XMM1,XMM6)
1487: SSE_SHUFFLE(XMM1,XMM1,0x55)
1488: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1489: SSE_SUB_PS(XMM7,XMM1)
1491: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1493: /* Third Column */
1494: SSE_COPY_PS(XMM2,XMM6)
1495: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1496: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1497: SSE_SUB_PS(XMM7,XMM2)
1499: /* Fourth Column */
1500: SSE_COPY_PS(XMM3,XMM6)
1501: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1502: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1503: SSE_SUB_PS(XMM7,XMM3)
1504: SSE_INLINE_END_2
1505: v += 16;
1506: }
1507: v = aa + ai16;
1508: ai16 = 16*diag[--i];
1509: PREFETCH_NTA(aa+ai16+16);
1510: /*
1511: Scale the result by the diagonal 4x4 block,
1512: which was inverted as part of the factorization
1513: */
1514: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1515: /* First Column */
1516: SSE_COPY_PS(XMM0,XMM7)
1517: SSE_SHUFFLE(XMM0,XMM0,0x00)
1518: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1520: /* Second Column */
1521: SSE_COPY_PS(XMM1,XMM7)
1522: SSE_SHUFFLE(XMM1,XMM1,0x55)
1523: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1524: SSE_ADD_PS(XMM0,XMM1)
1526: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1528: /* Third Column */
1529: SSE_COPY_PS(XMM2,XMM7)
1530: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1531: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1532: SSE_ADD_PS(XMM0,XMM2)
1534: /* Fourth Column */
1535: SSE_COPY_PS(XMM3,XMM7)
1536: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1537: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1538: SSE_ADD_PS(XMM0,XMM3)
1540: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1541: SSE_INLINE_END_3
1543: v = aa + ai16 + 16;
1544: idt -= 4;
1545: }
1547: /* Convert t from single precision back to double precision (inplace)*/
1548: idt = 4*(n-1);
1549: for (i=n-1; i>=0; i--) {
1550: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1551: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1552: PetscScalar *xtemp=&x[idt];
1553: MatScalar *ttemp=&t[idt];
1554: xtemp[3] = (PetscScalar)ttemp[3];
1555: xtemp[2] = (PetscScalar)ttemp[2];
1556: xtemp[1] = (PetscScalar)ttemp[1];
1557: xtemp[0] = (PetscScalar)ttemp[0];
1558: idt -= 4;
1559: }
1561: } /* End of artificial scope. */
1562: VecRestoreArray(bb,&b);
1563: VecRestoreArray(xx,&x);
1564: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1565: SSE_SCOPE_END;
1566: return(0);
1567: }
1569: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1570: {
1571: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1572: int *aj=a->j;
1574: int *ai=a->i,n=a->mbs,*diag = a->diag;
1575: MatScalar *aa=a->a;
1576: PetscScalar *x,*b;
1579: SSE_SCOPE_BEGIN;
1580: /*
1581: Note: This code currently uses demotion of double
1582: to float when performing the mixed-mode computation.
1583: This may not be numerically reasonable for all applications.
1584: */
1585: PREFETCH_NTA(aa+16*ai[1]);
1587: VecGetArray(bb,&b);
1588: VecGetArray(xx,&x);
1589: {
1590: /* x will first be computed in single precision then promoted inplace to double */
1591: MatScalar *v,*t=(MatScalar*)x;
1592: int nz,i,idt,ai16;
1593: int jdx,idx;
1594: int *vi;
1595: /* Forward solve the lower triangular factor. */
1597: /* First block is the identity. */
1598: idx = 0;
1599: CONVERT_DOUBLE4_FLOAT4(t,b);
1600: v = aa + 16*ai[1];
1602: for (i=1; i<n; ) {
1603: PREFETCH_NTA(&v[8]);
1604: vi = aj + ai[i];
1605: nz = diag[i] - ai[i];
1606: idx += 4;
1608: /* Demote RHS from double to float. */
1609: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1610: LOAD_PS(&t[idx],XMM7);
1612: while (nz--) {
1613: PREFETCH_NTA(&v[16]);
1614: jdx = 4*(*vi++);
1615: /* jdx = *vi++; */
1617: /* 4x4 Matrix-Vector product with negative accumulation: */
1618: SSE_INLINE_BEGIN_2(&t[jdx],v)
1619: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1621: /* First Column */
1622: SSE_COPY_PS(XMM0,XMM6)
1623: SSE_SHUFFLE(XMM0,XMM0,0x00)
1624: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1625: SSE_SUB_PS(XMM7,XMM0)
1627: /* Second Column */
1628: SSE_COPY_PS(XMM1,XMM6)
1629: SSE_SHUFFLE(XMM1,XMM1,0x55)
1630: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1631: SSE_SUB_PS(XMM7,XMM1)
1633: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1635: /* Third Column */
1636: SSE_COPY_PS(XMM2,XMM6)
1637: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1638: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1639: SSE_SUB_PS(XMM7,XMM2)
1641: /* Fourth Column */
1642: SSE_COPY_PS(XMM3,XMM6)
1643: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1644: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1645: SSE_SUB_PS(XMM7,XMM3)
1646: SSE_INLINE_END_2
1648: v += 16;
1649: }
1650: v = aa + 16*ai[++i];
1651: PREFETCH_NTA(v);
1652: STORE_PS(&t[idx],XMM7);
1653: }
1655: /* Backward solve the upper triangular factor.*/
1657: idt = 4*(n-1);
1658: ai16 = 16*diag[n-1];
1659: v = aa + ai16 + 16;
1660: for (i=n-1; i>=0; ) {
1661: PREFETCH_NTA(&v[8]);
1662: vi = aj + diag[i] + 1;
1663: nz = ai[i+1] - diag[i] - 1;
1665: LOAD_PS(&t[idt],XMM7);
1667: while (nz--) {
1668: PREFETCH_NTA(&v[16]);
1669: idx = 4*(*vi++);
1670: /* idx = *vi++; */
1672: /* 4x4 Matrix-Vector Product with negative accumulation: */
1673: SSE_INLINE_BEGIN_2(&t[idx],v)
1674: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1676: /* First Column */
1677: SSE_COPY_PS(XMM0,XMM6)
1678: SSE_SHUFFLE(XMM0,XMM0,0x00)
1679: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1680: SSE_SUB_PS(XMM7,XMM0)
1682: /* Second Column */
1683: SSE_COPY_PS(XMM1,XMM6)
1684: SSE_SHUFFLE(XMM1,XMM1,0x55)
1685: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1686: SSE_SUB_PS(XMM7,XMM1)
1688: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1690: /* Third Column */
1691: SSE_COPY_PS(XMM2,XMM6)
1692: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1693: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1694: SSE_SUB_PS(XMM7,XMM2)
1696: /* Fourth Column */
1697: SSE_COPY_PS(XMM3,XMM6)
1698: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1699: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1700: SSE_SUB_PS(XMM7,XMM3)
1701: SSE_INLINE_END_2
1702: v += 16;
1703: }
1704: v = aa + ai16;
1705: ai16 = 16*diag[--i];
1706: PREFETCH_NTA(aa+ai16+16);
1707: /*
1708: Scale the result by the diagonal 4x4 block,
1709: which was inverted as part of the factorization
1710: */
1711: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1712: /* First Column */
1713: SSE_COPY_PS(XMM0,XMM7)
1714: SSE_SHUFFLE(XMM0,XMM0,0x00)
1715: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1717: /* Second Column */
1718: SSE_COPY_PS(XMM1,XMM7)
1719: SSE_SHUFFLE(XMM1,XMM1,0x55)
1720: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1721: SSE_ADD_PS(XMM0,XMM1)
1723: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1725: /* Third Column */
1726: SSE_COPY_PS(XMM2,XMM7)
1727: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1728: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1729: SSE_ADD_PS(XMM0,XMM2)
1731: /* Fourth Column */
1732: SSE_COPY_PS(XMM3,XMM7)
1733: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1734: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1735: SSE_ADD_PS(XMM0,XMM3)
1737: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1738: SSE_INLINE_END_3
1740: v = aa + ai16 + 16;
1741: idt -= 4;
1742: }
1744: /* Convert t from single precision back to double precision (inplace)*/
1745: idt = 4*(n-1);
1746: for (i=n-1; i>=0; i--) {
1747: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1748: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1749: PetscScalar *xtemp=&x[idt];
1750: MatScalar *ttemp=&t[idt];
1751: xtemp[3] = (PetscScalar)ttemp[3];
1752: xtemp[2] = (PetscScalar)ttemp[2];
1753: xtemp[1] = (PetscScalar)ttemp[1];
1754: xtemp[0] = (PetscScalar)ttemp[0];
1755: idt -= 4;
1756: }
1758: } /* End of artificial scope. */
1759: VecRestoreArray(bb,&b);
1760: VecRestoreArray(xx,&x);
1761: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1762: SSE_SCOPE_END;
1763: return(0);
1764: }
1766: #endif
1768: /*
1769: Special case where the matrix was ILU(0) factored in the natural
1770: ordering. This eliminates the need for the column and row permutation.
1771: */
1772: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1773: {
1774: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1775: const PetscInt n =a->mbs,*ai=a->i,*aj=a->j;
1776: PetscErrorCode ierr;
1777: const PetscInt *diag = a->diag,*vi;
1778: const MatScalar *aa =a->a,*v;
1779: PetscScalar *x,s1,s2,s3,x1,x2,x3;
1780: const PetscScalar *b;
1781: PetscInt jdx,idt,idx,nz,i;
1784: VecGetArrayRead(bb,&b);
1785: VecGetArray(xx,&x);
1787: /* forward solve the lower triangular */
1788: idx = 0;
1789: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
1790: for (i=1; i<n; i++) {
1791: v = aa + 9*ai[i];
1792: vi = aj + ai[i];
1793: nz = diag[i] - ai[i];
1794: idx += 3;
1795: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1796: while (nz--) {
1797: jdx = 3*(*vi++);
1798: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1799: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1800: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1801: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1802: v += 9;
1803: }
1804: x[idx] = s1;
1805: x[1+idx] = s2;
1806: x[2+idx] = s3;
1807: }
1808: /* backward solve the upper triangular */
1809: for (i=n-1; i>=0; i--) {
1810: v = aa + 9*diag[i] + 9;
1811: vi = aj + diag[i] + 1;
1812: nz = ai[i+1] - diag[i] - 1;
1813: idt = 3*i;
1814: s1 = x[idt]; s2 = x[1+idt];
1815: s3 = x[2+idt];
1816: while (nz--) {
1817: idx = 3*(*vi++);
1818: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
1819: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1820: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1821: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1822: v += 9;
1823: }
1824: v = aa + 9*diag[i];
1825: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1826: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1827: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1828: }
1830: VecRestoreArrayRead(bb,&b);
1831: VecRestoreArray(xx,&x);
1832: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1833: return(0);
1834: }
1836: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1837: {
1838: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1839: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1840: PetscErrorCode ierr;
1841: PetscInt i,k,nz,idx,jdx,idt;
1842: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1843: const MatScalar *aa=a->a,*v;
1844: PetscScalar *x;
1845: const PetscScalar *b;
1846: PetscScalar s1,s2,s3,x1,x2,x3;
1849: VecGetArrayRead(bb,&b);
1850: VecGetArray(xx,&x);
1851: /* forward solve the lower triangular */
1852: idx = 0;
1853: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1854: for (i=1; i<n; i++) {
1855: v = aa + bs2*ai[i];
1856: vi = aj + ai[i];
1857: nz = ai[i+1] - ai[i];
1858: idx = bs*i;
1859: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1860: for (k=0; k<nz; k++) {
1861: jdx = bs*vi[k];
1862: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1863: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1864: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1865: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1867: v += bs2;
1868: }
1870: x[idx] = s1;
1871: x[1+idx] = s2;
1872: x[2+idx] = s3;
1873: }
1875: /* backward solve the upper triangular */
1876: for (i=n-1; i>=0; i--) {
1877: v = aa + bs2*(adiag[i+1]+1);
1878: vi = aj + adiag[i+1]+1;
1879: nz = adiag[i] - adiag[i+1]-1;
1880: idt = bs*i;
1881: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];
1883: for (k=0; k<nz; k++) {
1884: idx = bs*vi[k];
1885: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1886: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1887: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1888: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1890: v += bs2;
1891: }
1892: /* x = inv_diagonal*x */
1893: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1894: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1895: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1897: }
1899: VecRestoreArrayRead(bb,&b);
1900: VecRestoreArray(xx,&x);
1901: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1902: return(0);
1903: }
1905: PetscErrorCode MatForwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1906: {
1907: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1908: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j;
1909: PetscErrorCode ierr;
1910: PetscInt i,k,nz,idx,jdx;
1911: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1912: const MatScalar *aa=a->a,*v;
1913: PetscScalar *x;
1914: const PetscScalar *b;
1915: PetscScalar s1,s2,s3,x1,x2,x3;
1918: VecGetArrayRead(bb,&b);
1919: VecGetArray(xx,&x);
1920: /* forward solve the lower triangular */
1921: idx = 0;
1922: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1923: for (i=1; i<n; i++) {
1924: v = aa + bs2*ai[i];
1925: vi = aj + ai[i];
1926: nz = ai[i+1] - ai[i];
1927: idx = bs*i;
1928: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1929: for (k=0; k<nz; k++) {
1930: jdx = bs*vi[k];
1931: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1932: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1933: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1934: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1936: v += bs2;
1937: }
1939: x[idx] = s1;
1940: x[1+idx] = s2;
1941: x[2+idx] = s3;
1942: }
1944: VecRestoreArrayRead(bb,&b);
1945: VecRestoreArray(xx,&x);
1946: PetscLogFlops(1.0*bs2*(a->nz) - bs*A->cmap->n);
1947: return(0);
1948: }
1951: PetscErrorCode MatBackwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1952: {
1953: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1954: const PetscInt n =a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1955: PetscErrorCode ierr;
1956: PetscInt i,k,nz,idx,idt;
1957: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
1958: const MatScalar *aa=a->a,*v;
1959: PetscScalar *x;
1960: const PetscScalar *b;
1961: PetscScalar s1,s2,s3,x1,x2,x3;
1964: VecGetArrayRead(bb,&b);
1965: VecGetArray(xx,&x);
1967: /* backward solve the upper triangular */
1968: for (i=n-1; i>=0; i--) {
1969: v = aa + bs2*(adiag[i+1]+1);
1970: vi = aj + adiag[i+1]+1;
1971: nz = adiag[i] - adiag[i+1]-1;
1972: idt = bs*i;
1973: s1 = b[idt]; s2 = b[1+idt];s3 = b[2+idt];
1975: for (k=0; k<nz; k++) {
1976: idx = bs*vi[k];
1977: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1978: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1979: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1980: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1982: v += bs2;
1983: }
1984: /* x = inv_diagonal*x */
1985: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1986: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1987: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1989: }
1991: VecRestoreArrayRead(bb,&b);
1992: VecRestoreArray(xx,&x);
1993: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1994: return(0);
1995: }
1997: /*
1998: Special case where the matrix was ILU(0) factored in the natural
1999: ordering. This eliminates the need for the column and row permutation.
2000: */
2001: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2002: {
2003: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2004: const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2005: PetscErrorCode ierr;
2006: const MatScalar *aa=a->a,*v;
2007: PetscScalar *x,s1,s2,x1,x2;
2008: const PetscScalar *b;
2009: PetscInt jdx,idt,idx,nz,i;
2012: VecGetArrayRead(bb,&b);
2013: VecGetArray(xx,&x);
2015: /* forward solve the lower triangular */
2016: idx = 0;
2017: x[0] = b[0]; x[1] = b[1];
2018: for (i=1; i<n; i++) {
2019: v = aa + 4*ai[i];
2020: vi = aj + ai[i];
2021: nz = diag[i] - ai[i];
2022: idx += 2;
2023: s1 = b[idx];s2 = b[1+idx];
2024: while (nz--) {
2025: jdx = 2*(*vi++);
2026: x1 = x[jdx];x2 = x[1+jdx];
2027: s1 -= v[0]*x1 + v[2]*x2;
2028: s2 -= v[1]*x1 + v[3]*x2;
2029: v += 4;
2030: }
2031: x[idx] = s1;
2032: x[1+idx] = s2;
2033: }
2034: /* backward solve the upper triangular */
2035: for (i=n-1; i>=0; i--) {
2036: v = aa + 4*diag[i] + 4;
2037: vi = aj + diag[i] + 1;
2038: nz = ai[i+1] - diag[i] - 1;
2039: idt = 2*i;
2040: s1 = x[idt]; s2 = x[1+idt];
2041: while (nz--) {
2042: idx = 2*(*vi++);
2043: x1 = x[idx]; x2 = x[1+idx];
2044: s1 -= v[0]*x1 + v[2]*x2;
2045: s2 -= v[1]*x1 + v[3]*x2;
2046: v += 4;
2047: }
2048: v = aa + 4*diag[i];
2049: x[idt] = v[0]*s1 + v[2]*s2;
2050: x[1+idt] = v[1]*s1 + v[3]*s2;
2051: }
2053: VecRestoreArrayRead(bb,&b);
2054: VecRestoreArray(xx,&x);
2055: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2056: return(0);
2057: }
2059: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2060: {
2061: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2062: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2063: PetscInt i,k,nz,idx,idt,jdx;
2064: PetscErrorCode ierr;
2065: const MatScalar *aa=a->a,*v;
2066: PetscScalar *x,s1,s2,x1,x2;
2067: const PetscScalar *b;
2070: VecGetArrayRead(bb,&b);
2071: VecGetArray(xx,&x);
2072: /* forward solve the lower triangular */
2073: idx = 0;
2074: x[0] = b[idx]; x[1] = b[1+idx];
2075: for (i=1; i<n; i++) {
2076: v = aa + 4*ai[i];
2077: vi = aj + ai[i];
2078: nz = ai[i+1] - ai[i];
2079: idx = 2*i;
2080: s1 = b[idx];s2 = b[1+idx];
2081: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2082: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2083: for (k=0; k<nz; k++) {
2084: jdx = 2*vi[k];
2085: x1 = x[jdx];x2 = x[1+jdx];
2086: s1 -= v[0]*x1 + v[2]*x2;
2087: s2 -= v[1]*x1 + v[3]*x2;
2088: v += 4;
2089: }
2090: x[idx] = s1;
2091: x[1+idx] = s2;
2092: }
2094: /* backward solve the upper triangular */
2095: for (i=n-1; i>=0; i--) {
2096: v = aa + 4*(adiag[i+1]+1);
2097: vi = aj + adiag[i+1]+1;
2098: nz = adiag[i] - adiag[i+1]-1;
2099: idt = 2*i;
2100: s1 = x[idt]; s2 = x[1+idt];
2101: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2102: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2103: for (k=0; k<nz; k++) {
2104: idx = 2*vi[k];
2105: x1 = x[idx]; x2 = x[1+idx];
2106: s1 -= v[0]*x1 + v[2]*x2;
2107: s2 -= v[1]*x1 + v[3]*x2;
2108: v += 4;
2109: }
2110: /* x = inv_diagonal*x */
2111: x[idt] = v[0]*s1 + v[2]*s2;
2112: x[1+idt] = v[1]*s1 + v[3]*s2;
2113: }
2115: VecRestoreArrayRead(bb,&b);
2116: VecRestoreArray(xx,&x);
2117: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2118: return(0);
2119: }
2121: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2122: {
2123: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2124: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j;
2125: PetscInt i,k,nz,idx,jdx;
2126: PetscErrorCode ierr;
2127: const MatScalar *aa=a->a,*v;
2128: PetscScalar *x,s1,s2,x1,x2;
2129: const PetscScalar *b;
2132: VecGetArrayRead(bb,&b);
2133: VecGetArray(xx,&x);
2134: /* forward solve the lower triangular */
2135: idx = 0;
2136: x[0] = b[idx]; x[1] = b[1+idx];
2137: for (i=1; i<n; i++) {
2138: v = aa + 4*ai[i];
2139: vi = aj + ai[i];
2140: nz = ai[i+1] - ai[i];
2141: idx = 2*i;
2142: s1 = b[idx];s2 = b[1+idx];
2143: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2144: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2145: for (k=0; k<nz; k++) {
2146: jdx = 2*vi[k];
2147: x1 = x[jdx];x2 = x[1+jdx];
2148: s1 -= v[0]*x1 + v[2]*x2;
2149: s2 -= v[1]*x1 + v[3]*x2;
2150: v += 4;
2151: }
2152: x[idx] = s1;
2153: x[1+idx] = s2;
2154: }
2157: VecRestoreArrayRead(bb,&b);
2158: VecRestoreArray(xx,&x);
2159: PetscLogFlops(4.0*(a->nz) - A->cmap->n);
2160: return(0);
2161: }
2163: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2164: {
2165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2166: const PetscInt n = a->mbs,*vi,*aj=a->j,*adiag=a->diag;
2167: PetscInt i,k,nz,idx,idt;
2168: PetscErrorCode ierr;
2169: const MatScalar *aa=a->a,*v;
2170: PetscScalar *x,s1,s2,x1,x2;
2171: const PetscScalar *b;
2174: VecGetArrayRead(bb,&b);
2175: VecGetArray(xx,&x);
2177: /* backward solve the upper triangular */
2178: for (i=n-1; i>=0; i--) {
2179: v = aa + 4*(adiag[i+1]+1);
2180: vi = aj + adiag[i+1]+1;
2181: nz = adiag[i] - adiag[i+1]-1;
2182: idt = 2*i;
2183: s1 = b[idt]; s2 = b[1+idt];
2184: PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2185: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2186: for (k=0; k<nz; k++) {
2187: idx = 2*vi[k];
2188: x1 = x[idx]; x2 = x[1+idx];
2189: s1 -= v[0]*x1 + v[2]*x2;
2190: s2 -= v[1]*x1 + v[3]*x2;
2191: v += 4;
2192: }
2193: /* x = inv_diagonal*x */
2194: x[idt] = v[0]*s1 + v[2]*s2;
2195: x[1+idt] = v[1]*s1 + v[3]*s2;
2196: }
2198: VecRestoreArrayRead(bb,&b);
2199: VecRestoreArray(xx,&x);
2200: PetscLogFlops(4.0*a->nz - A->cmap->n);
2201: return(0);
2202: }
2204: /*
2205: Special case where the matrix was ILU(0) factored in the natural
2206: ordering. This eliminates the need for the column and row permutation.
2207: */
2208: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2209: {
2210: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2211: const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2212: PetscErrorCode ierr;
2213: const MatScalar *aa=a->a,*v;
2214: PetscScalar *x;
2215: const PetscScalar *b;
2216: PetscScalar s1,x1;
2217: PetscInt jdx,idt,idx,nz,i;
2220: VecGetArrayRead(bb,&b);
2221: VecGetArray(xx,&x);
2223: /* forward solve the lower triangular */
2224: idx = 0;
2225: x[0] = b[0];
2226: for (i=1; i<n; i++) {
2227: v = aa + ai[i];
2228: vi = aj + ai[i];
2229: nz = diag[i] - ai[i];
2230: idx += 1;
2231: s1 = b[idx];
2232: while (nz--) {
2233: jdx = *vi++;
2234: x1 = x[jdx];
2235: s1 -= v[0]*x1;
2236: v += 1;
2237: }
2238: x[idx] = s1;
2239: }
2240: /* backward solve the upper triangular */
2241: for (i=n-1; i>=0; i--) {
2242: v = aa + diag[i] + 1;
2243: vi = aj + diag[i] + 1;
2244: nz = ai[i+1] - diag[i] - 1;
2245: idt = i;
2246: s1 = x[idt];
2247: while (nz--) {
2248: idx = *vi++;
2249: x1 = x[idx];
2250: s1 -= v[0]*x1;
2251: v += 1;
2252: }
2253: v = aa + diag[i];
2254: x[idt] = v[0]*s1;
2255: }
2256: VecRestoreArrayRead(bb,&b);
2257: VecRestoreArray(xx,&x);
2258: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
2259: return(0);
2260: }
2263: PetscErrorCode MatForwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2264: {
2265: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2266: PetscErrorCode ierr;
2267: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*vi;
2268: PetscScalar *x,sum;
2269: const PetscScalar *b;
2270: const MatScalar *aa = a->a,*v;
2271: PetscInt i,nz;
2274: if (!n) return(0);
2276: VecGetArrayRead(bb,&b);
2277: VecGetArray(xx,&x);
2279: /* forward solve the lower triangular */
2280: x[0] = b[0];
2281: v = aa;
2282: vi = aj;
2283: for (i=1; i<n; i++) {
2284: nz = ai[i+1] - ai[i];
2285: sum = b[i];
2286: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2287: v += nz;
2288: vi += nz;
2289: x[i] = sum;
2290: }
2291: PetscLogFlops(a->nz - A->cmap->n);
2292: VecRestoreArrayRead(bb,&b);
2293: VecRestoreArray(xx,&x);
2294: return(0);
2295: }
2297: PetscErrorCode MatBackwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2298: {
2299: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2300: PetscErrorCode ierr;
2301: const PetscInt n = a->mbs,*aj = a->j,*adiag = a->diag,*vi;
2302: PetscScalar *x,sum;
2303: const PetscScalar *b;
2304: const MatScalar *aa = a->a,*v;
2305: PetscInt i,nz;
2308: if (!n) return(0);
2310: VecGetArrayRead(bb,&b);
2311: VecGetArray(xx,&x);
2313: /* backward solve the upper triangular */
2314: for (i=n-1; i>=0; i--) {
2315: v = aa + adiag[i+1] + 1;
2316: vi = aj + adiag[i+1] + 1;
2317: nz = adiag[i] - adiag[i+1]-1;
2318: sum = b[i];
2319: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2320: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2321: }
2323: PetscLogFlops(2.0*a->nz - A->cmap->n);
2324: VecRestoreArrayRead(bb,&b);
2325: VecRestoreArray(xx,&x);
2326: return(0);
2327: }
2329: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2330: {
2331: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2332: PetscErrorCode ierr;
2333: const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
2334: PetscScalar *x,sum;
2335: const PetscScalar *b;
2336: const MatScalar *aa = a->a,*v;
2337: PetscInt i,nz;
2340: if (!n) return(0);
2342: VecGetArrayRead(bb,&b);
2343: VecGetArray(xx,&x);
2345: /* forward solve the lower triangular */
2346: x[0] = b[0];
2347: v = aa;
2348: vi = aj;
2349: for (i=1; i<n; i++) {
2350: nz = ai[i+1] - ai[i];
2351: sum = b[i];
2352: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2353: v += nz;
2354: vi += nz;
2355: x[i] = sum;
2356: }
2358: /* backward solve the upper triangular */
2359: for (i=n-1; i>=0; i--) {
2360: v = aa + adiag[i+1] + 1;
2361: vi = aj + adiag[i+1] + 1;
2362: nz = adiag[i] - adiag[i+1]-1;
2363: sum = x[i];
2364: PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2365: x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2366: }
2368: PetscLogFlops(2.0*a->nz - A->cmap->n);
2369: VecRestoreArrayRead(bb,&b);
2370: VecRestoreArray(xx,&x);
2371: return(0);
2372: }