Actual source code: baijsolvnat.c

petsc-3.10.2 2018-10-09
Report Typos and Errors
  1:  #include <../src/mat/impls/baij/seq/baij.h>
  2:  #include <petsc/private/kernels/blockinvert.h>

  4: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */

  6: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
  7: {
  8:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
  9:   PetscErrorCode    ierr;
 10:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
 11:   PetscInt          i,nz,idx,idt,m;
 12:   const MatScalar   *aa=a->a,*v;
 13:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
 14:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
 15:   PetscScalar       *x;
 16:   const PetscScalar *b;

 19:   VecGetArrayRead(bb,&b);
 20:   VecGetArray(xx,&x);

 22:   /* forward solve the lower triangular */
 23:   idx   = 0;
 24:   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
 25:   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
 26:   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

 28:   for (i=1; i<n; i++) {
 29:     v   = aa + bs2*ai[i];
 30:     vi  = aj + ai[i];
 31:     nz  = ai[i+1] - ai[i];
 32:     idt = bs*i;
 33:     s1  = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
 34:     s6  = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
 35:     s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
 36:     for (m=0; m<nz; m++) {
 37:       idx = bs*vi[m];
 38:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
 39:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
 40:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];


 43:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
 44:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
 45:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
 46:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
 47:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
 48:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
 49:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
 50:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
 51:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
 52:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
 53:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
 54:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
 55:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
 56:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
 57:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

 59:       v += bs2;
 60:     }
 61:     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
 62:     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
 63:     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;

 65:   }
 66:   /* backward solve the upper triangular */
 67:   for (i=n-1; i>=0; i--) {
 68:     v   = aa + bs2*(adiag[i+1]+1);
 69:     vi  = aj + adiag[i+1]+1;
 70:     nz  = adiag[i] - adiag[i+1] - 1;
 71:     idt = bs*i;
 72:     s1  = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
 73:     s6  = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
 74:     s11 = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];

 76:     for (m=0; m<nz; m++) {
 77:       idx = bs*vi[m];
 78:       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
 79:       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
 80:       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

 82:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
 83:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
 84:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
 85:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
 86:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
 87:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
 88:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
 89:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
 90:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
 91:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
 92:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
 93:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
 94:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
 95:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
 96:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

 98:       v += bs2;
 99:     }

101:     x[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
102:     x[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
103:     x[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
104:     x[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
105:     x[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
106:     x[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
107:     x[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
108:     x[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
109:     x[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
110:     x[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
111:     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
112:     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
113:     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
114:     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
115:     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

117:   }

119:   VecRestoreArrayRead(bb,&b);
120:   VecRestoreArray(xx,&x);
121:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
122:   return(0);
123: }

125: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
126: /* Default MatSolve for block size 15 */

128: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
129: {
130:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
131:   PetscErrorCode    ierr;
132:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
133:   PetscInt          i,k,nz,idx,idt,m;
134:   const MatScalar   *aa=a->a,*v;
135:   PetscScalar       s[15];
136:   PetscScalar       *x,xv;
137:   const PetscScalar *b;

140:   VecGetArrayRead(bb,&b);
141:   VecGetArray(xx,&x);

143:   /* forward solve the lower triangular */
144:   for (i=0; i<n; i++) {
145:     v         = aa + bs2*ai[i];
146:     vi        = aj + ai[i];
147:     nz        = ai[i+1] - ai[i];
148:     idt       = bs*i;
149:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
150:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
151:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
152:     for (m=0; m<nz; m++) {
153:       idx = bs*vi[m];
154:       for (k=0; k<15; k++) {
155:         xv         = x[k + idx];
156:         x[idt]    -= v[0]*xv;
157:         x[1+idt]  -= v[1]*xv;
158:         x[2+idt]  -= v[2]*xv;
159:         x[3+idt]  -= v[3]*xv;
160:         x[4+idt]  -= v[4]*xv;
161:         x[5+idt]  -= v[5]*xv;
162:         x[6+idt]  -= v[6]*xv;
163:         x[7+idt]  -= v[7]*xv;
164:         x[8+idt]  -= v[8]*xv;
165:         x[9+idt]  -= v[9]*xv;
166:         x[10+idt] -= v[10]*xv;
167:         x[11+idt] -= v[11]*xv;
168:         x[12+idt] -= v[12]*xv;
169:         x[13+idt] -= v[13]*xv;
170:         x[14+idt] -= v[14]*xv;
171:         v         += 15;
172:       }
173:     }
174:   }
175:   /* backward solve the upper triangular */
176:   for (i=n-1; i>=0; i--) {
177:     v     = aa + bs2*(adiag[i+1]+1);
178:     vi    = aj + adiag[i+1]+1;
179:     nz    = adiag[i] - adiag[i+1] - 1;
180:     idt   = bs*i;
181:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
182:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
183:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];

185:     for (m=0; m<nz; m++) {
186:       idx = bs*vi[m];
187:       for (k=0; k<15; k++) {
188:         xv     = x[k + idx];
189:         s[0]  -= v[0]*xv;
190:         s[1]  -= v[1]*xv;
191:         s[2]  -= v[2]*xv;
192:         s[3]  -= v[3]*xv;
193:         s[4]  -= v[4]*xv;
194:         s[5]  -= v[5]*xv;
195:         s[6]  -= v[6]*xv;
196:         s[7]  -= v[7]*xv;
197:         s[8]  -= v[8]*xv;
198:         s[9]  -= v[9]*xv;
199:         s[10] -= v[10]*xv;
200:         s[11] -= v[11]*xv;
201:         s[12] -= v[12]*xv;
202:         s[13] -= v[13]*xv;
203:         s[14] -= v[14]*xv;
204:         v     += 15;
205:       }
206:     }
207:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
208:     for (k=0; k<15; k++) {
209:       x[idt]    += v[0]*s[k];
210:       x[1+idt]  += v[1]*s[k];
211:       x[2+idt]  += v[2]*s[k];
212:       x[3+idt]  += v[3]*s[k];
213:       x[4+idt]  += v[4]*s[k];
214:       x[5+idt]  += v[5]*s[k];
215:       x[6+idt]  += v[6]*s[k];
216:       x[7+idt]  += v[7]*s[k];
217:       x[8+idt]  += v[8]*s[k];
218:       x[9+idt]  += v[9]*s[k];
219:       x[10+idt] += v[10]*s[k];
220:       x[11+idt] += v[11]*s[k];
221:       x[12+idt] += v[12]*s[k];
222:       x[13+idt] += v[13]*s[k];
223:       x[14+idt] += v[14]*s[k];
224:       v         += 15;
225:     }
226:   }
227:   VecRestoreArrayRead(bb,&b);
228:   VecRestoreArray(xx,&x);
229:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
230:   return(0);
231: }

233: /* Block operations are done by accessing one column at at time */
234: /* Default MatSolve for block size 14 */

236: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)
237: {
238:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
239:   PetscErrorCode    ierr;
240:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
241:   PetscInt          i,k,nz,idx,idt,m;
242:   const MatScalar   *aa=a->a,*v;
243:   PetscScalar       s[14];
244:   PetscScalar       *x,xv;
245:   const PetscScalar *b;

248:   VecGetArrayRead(bb,&b);
249:   VecGetArray(xx,&x);

251:   /* forward solve the lower triangular */
252:   for (i=0; i<n; i++) {
253:     v         = aa + bs2*ai[i];
254:     vi        = aj + ai[i];
255:     nz        = ai[i+1] - ai[i];
256:     idt       = bs*i;
257:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
258:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
259:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt];
260:     for (m=0; m<nz; m++) {
261:       idx = bs*vi[m];
262:       for (k=0; k<bs; k++) {
263:         xv         = x[k + idx];
264:         x[idt]    -= v[0]*xv;
265:         x[1+idt]  -= v[1]*xv;
266:         x[2+idt]  -= v[2]*xv;
267:         x[3+idt]  -= v[3]*xv;
268:         x[4+idt]  -= v[4]*xv;
269:         x[5+idt]  -= v[5]*xv;
270:         x[6+idt]  -= v[6]*xv;
271:         x[7+idt]  -= v[7]*xv;
272:         x[8+idt]  -= v[8]*xv;
273:         x[9+idt]  -= v[9]*xv;
274:         x[10+idt] -= v[10]*xv;
275:         x[11+idt] -= v[11]*xv;
276:         x[12+idt] -= v[12]*xv;
277:         x[13+idt] -= v[13]*xv;
278:         v         += 14;
279:       }
280:     }
281:   }
282:   /* backward solve the upper triangular */
283:   for (i=n-1; i>=0; i--) {
284:     v     = aa + bs2*(adiag[i+1]+1);
285:     vi    = aj + adiag[i+1]+1;
286:     nz    = adiag[i] - adiag[i+1] - 1;
287:     idt   = bs*i;
288:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
289:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
290:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt];

292:     for (m=0; m<nz; m++) {
293:       idx = bs*vi[m];
294:       for (k=0; k<bs; k++) {
295:         xv     = x[k + idx];
296:         s[0]  -= v[0]*xv;
297:         s[1]  -= v[1]*xv;
298:         s[2]  -= v[2]*xv;
299:         s[3]  -= v[3]*xv;
300:         s[4]  -= v[4]*xv;
301:         s[5]  -= v[5]*xv;
302:         s[6]  -= v[6]*xv;
303:         s[7]  -= v[7]*xv;
304:         s[8]  -= v[8]*xv;
305:         s[9]  -= v[9]*xv;
306:         s[10] -= v[10]*xv;
307:         s[11] -= v[11]*xv;
308:         s[12] -= v[12]*xv;
309:         s[13] -= v[13]*xv;
310:         v     += 14;
311:       }
312:     }
313:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
314:     for (k=0; k<bs; k++) {
315:       x[idt]    += v[0]*s[k];
316:       x[1+idt]  += v[1]*s[k];
317:       x[2+idt]  += v[2]*s[k];
318:       x[3+idt]  += v[3]*s[k];
319:       x[4+idt]  += v[4]*s[k];
320:       x[5+idt]  += v[5]*s[k];
321:       x[6+idt]  += v[6]*s[k];
322:       x[7+idt]  += v[7]*s[k];
323:       x[8+idt]  += v[8]*s[k];
324:       x[9+idt]  += v[9]*s[k];
325:       x[10+idt] += v[10]*s[k];
326:       x[11+idt] += v[11]*s[k];
327:       x[12+idt] += v[12]*s[k];
328:       x[13+idt] += v[13]*s[k];
329:       v         += 14;
330:     }
331:   }
332:   VecRestoreArrayRead(bb,&b);
333:   VecRestoreArray(xx,&x);
334:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
335:   return(0);
336: }

338: /* Block operations are done by accessing one column at at time */
339: /* Default MatSolve for block size 13 */

341: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)
342: {
343:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
344:   PetscErrorCode    ierr;
345:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
346:   PetscInt          i,k,nz,idx,idt,m;
347:   const MatScalar   *aa=a->a,*v;
348:   PetscScalar       s[13];
349:   PetscScalar       *x,xv;
350:   const PetscScalar *b;

353:   VecGetArrayRead(bb,&b);
354:   VecGetArray(xx,&x);

356:   /* forward solve the lower triangular */
357:   for (i=0; i<n; i++) {
358:     v         = aa + bs2*ai[i];
359:     vi        = aj + ai[i];
360:     nz        = ai[i+1] - ai[i];
361:     idt       = bs*i;
362:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
363:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
364:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt];
365:     for (m=0; m<nz; m++) {
366:       idx = bs*vi[m];
367:       for (k=0; k<bs; k++) {
368:         xv         = x[k + idx];
369:         x[idt]    -= v[0]*xv;
370:         x[1+idt]  -= v[1]*xv;
371:         x[2+idt]  -= v[2]*xv;
372:         x[3+idt]  -= v[3]*xv;
373:         x[4+idt]  -= v[4]*xv;
374:         x[5+idt]  -= v[5]*xv;
375:         x[6+idt]  -= v[6]*xv;
376:         x[7+idt]  -= v[7]*xv;
377:         x[8+idt]  -= v[8]*xv;
378:         x[9+idt]  -= v[9]*xv;
379:         x[10+idt] -= v[10]*xv;
380:         x[11+idt] -= v[11]*xv;
381:         x[12+idt] -= v[12]*xv;
382:         v         += 13;
383:       }
384:     }
385:   }
386:   /* backward solve the upper triangular */
387:   for (i=n-1; i>=0; i--) {
388:     v     = aa + bs2*(adiag[i+1]+1);
389:     vi    = aj + adiag[i+1]+1;
390:     nz    = adiag[i] - adiag[i+1] - 1;
391:     idt   = bs*i;
392:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
393:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
394:     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt];

396:     for (m=0; m<nz; m++) {
397:       idx = bs*vi[m];
398:       for (k=0; k<bs; k++) {
399:         xv     = x[k + idx];
400:         s[0]  -= v[0]*xv;
401:         s[1]  -= v[1]*xv;
402:         s[2]  -= v[2]*xv;
403:         s[3]  -= v[3]*xv;
404:         s[4]  -= v[4]*xv;
405:         s[5]  -= v[5]*xv;
406:         s[6]  -= v[6]*xv;
407:         s[7]  -= v[7]*xv;
408:         s[8]  -= v[8]*xv;
409:         s[9]  -= v[9]*xv;
410:         s[10] -= v[10]*xv;
411:         s[11] -= v[11]*xv;
412:         s[12] -= v[12]*xv;
413:         v     += 13;
414:       }
415:     }
416:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
417:     for (k=0; k<bs; k++) {
418:       x[idt]    += v[0]*s[k];
419:       x[1+idt]  += v[1]*s[k];
420:       x[2+idt]  += v[2]*s[k];
421:       x[3+idt]  += v[3]*s[k];
422:       x[4+idt]  += v[4]*s[k];
423:       x[5+idt]  += v[5]*s[k];
424:       x[6+idt]  += v[6]*s[k];
425:       x[7+idt]  += v[7]*s[k];
426:       x[8+idt]  += v[8]*s[k];
427:       x[9+idt]  += v[9]*s[k];
428:       x[10+idt] += v[10]*s[k];
429:       x[11+idt] += v[11]*s[k];
430:       x[12+idt] += v[12]*s[k];
431:       v         += 13;
432:     }
433:   }
434:   VecRestoreArrayRead(bb,&b);
435:   VecRestoreArray(xx,&x);
436:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
437:   return(0);
438: }

440: /* Block operations are done by accessing one column at at time */
441: /* Default MatSolve for block size 12 */

443: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)
444: {
445:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
446:   PetscErrorCode    ierr;
447:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
448:   PetscInt          i,k,nz,idx,idt,m;
449:   const MatScalar   *aa=a->a,*v;
450:   PetscScalar       s[12];
451:   PetscScalar       *x,xv;
452:   const PetscScalar *b;

455:   VecGetArrayRead(bb,&b);
456:   VecGetArray(xx,&x);

458:   /* forward solve the lower triangular */
459:   for (i=0; i<n; i++) {
460:     v         = aa + bs2*ai[i];
461:     vi        = aj + ai[i];
462:     nz        = ai[i+1] - ai[i];
463:     idt       = bs*i;
464:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
465:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
466:     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt];
467:     for (m=0; m<nz; m++) {
468:       idx = bs*vi[m];
469:       for (k=0; k<bs; k++) {
470:         xv         = x[k + idx];
471:         x[idt]    -= v[0]*xv;
472:         x[1+idt]  -= v[1]*xv;
473:         x[2+idt]  -= v[2]*xv;
474:         x[3+idt]  -= v[3]*xv;
475:         x[4+idt]  -= v[4]*xv;
476:         x[5+idt]  -= v[5]*xv;
477:         x[6+idt]  -= v[6]*xv;
478:         x[7+idt]  -= v[7]*xv;
479:         x[8+idt]  -= v[8]*xv;
480:         x[9+idt]  -= v[9]*xv;
481:         x[10+idt] -= v[10]*xv;
482:         x[11+idt] -= v[11]*xv;
483:         v         += 12;
484:       }
485:     }
486:   }
487:   /* backward solve the upper triangular */
488:   for (i=n-1; i>=0; i--) {
489:     v     = aa + bs2*(adiag[i+1]+1);
490:     vi    = aj + adiag[i+1]+1;
491:     nz    = adiag[i] - adiag[i+1] - 1;
492:     idt   = bs*i;
493:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
494:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
495:     s[10] = x[10+idt]; s[11] = x[11+idt];

497:     for (m=0; m<nz; m++) {
498:       idx = bs*vi[m];
499:       for (k=0; k<bs; k++) {
500:         xv     = x[k + idx];
501:         s[0]  -= v[0]*xv;
502:         s[1]  -= v[1]*xv;
503:         s[2]  -= v[2]*xv;
504:         s[3]  -= v[3]*xv;
505:         s[4]  -= v[4]*xv;
506:         s[5]  -= v[5]*xv;
507:         s[6]  -= v[6]*xv;
508:         s[7]  -= v[7]*xv;
509:         s[8]  -= v[8]*xv;
510:         s[9]  -= v[9]*xv;
511:         s[10] -= v[10]*xv;
512:         s[11] -= v[11]*xv;
513:         v     += 12;
514:       }
515:     }
516:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
517:     for (k=0; k<bs; k++) {
518:       x[idt]    += v[0]*s[k];
519:       x[1+idt]  += v[1]*s[k];
520:       x[2+idt]  += v[2]*s[k];
521:       x[3+idt]  += v[3]*s[k];
522:       x[4+idt]  += v[4]*s[k];
523:       x[5+idt]  += v[5]*s[k];
524:       x[6+idt]  += v[6]*s[k];
525:       x[7+idt]  += v[7]*s[k];
526:       x[8+idt]  += v[8]*s[k];
527:       x[9+idt]  += v[9]*s[k];
528:       x[10+idt] += v[10]*s[k];
529:       x[11+idt] += v[11]*s[k];
530:       v         += 12;
531:     }
532:   }
533:   VecRestoreArrayRead(bb,&b);
534:   VecRestoreArray(xx,&x);
535:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
536:   return(0);
537: }

539: /* Block operations are done by accessing one column at at time */
540: /* Default MatSolve for block size 11 */

542: PetscErrorCode MatSolve_SeqBAIJ_11_NaturalOrdering(Mat A,Vec bb,Vec xx)
543: {
544:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
545:   PetscErrorCode    ierr;
546:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
547:   PetscInt          i,k,nz,idx,idt,m;
548:   const MatScalar   *aa=a->a,*v;
549:   PetscScalar       s[11];
550:   PetscScalar       *x,xv;
551:   const PetscScalar *b;

554:   VecGetArrayRead(bb,&b);
555:   VecGetArray(xx,&x);

557:   /* forward solve the lower triangular */
558:   for (i=0; i<n; i++) {
559:     v         = aa + bs2*ai[i];
560:     vi        = aj + ai[i];
561:     nz        = ai[i+1] - ai[i];
562:     idt       = bs*i;
563:     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
564:     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
565:     x[10+idt] = b[10+idt];
566:     for (m=0; m<nz; m++) {
567:       idx = bs*vi[m];
568:       for (k=0; k<11; k++) {
569:         xv         = x[k + idx];
570:         x[idt]    -= v[0]*xv;
571:         x[1+idt]  -= v[1]*xv;
572:         x[2+idt]  -= v[2]*xv;
573:         x[3+idt]  -= v[3]*xv;
574:         x[4+idt]  -= v[4]*xv;
575:         x[5+idt]  -= v[5]*xv;
576:         x[6+idt]  -= v[6]*xv;
577:         x[7+idt]  -= v[7]*xv;
578:         x[8+idt]  -= v[8]*xv;
579:         x[9+idt]  -= v[9]*xv;
580:         x[10+idt] -= v[10]*xv;
581:         v         += 11;
582:       }
583:     }
584:   }
585:   /* backward solve the upper triangular */
586:   for (i=n-1; i>=0; i--) {
587:     v     = aa + bs2*(adiag[i+1]+1);
588:     vi    = aj + adiag[i+1]+1;
589:     nz    = adiag[i] - adiag[i+1] - 1;
590:     idt   = bs*i;
591:     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
592:     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
593:     s[10] = x[10+idt];

595:     for (m=0; m<nz; m++) {
596:       idx = bs*vi[m];
597:       for (k=0; k<11; k++) {
598:         xv     = x[k + idx];
599:         s[0]  -= v[0]*xv;
600:         s[1]  -= v[1]*xv;
601:         s[2]  -= v[2]*xv;
602:         s[3]  -= v[3]*xv;
603:         s[4]  -= v[4]*xv;
604:         s[5]  -= v[5]*xv;
605:         s[6]  -= v[6]*xv;
606:         s[7]  -= v[7]*xv;
607:         s[8]  -= v[8]*xv;
608:         s[9]  -= v[9]*xv;
609:         s[10] -= v[10]*xv;
610:         v     += 11;
611:       }
612:     }
613:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
614:     for (k=0; k<11; k++) {
615:       x[idt]    += v[0]*s[k];
616:       x[1+idt]  += v[1]*s[k];
617:       x[2+idt]  += v[2]*s[k];
618:       x[3+idt]  += v[3]*s[k];
619:       x[4+idt]  += v[4]*s[k];
620:       x[5+idt]  += v[5]*s[k];
621:       x[6+idt]  += v[6]*s[k];
622:       x[7+idt]  += v[7]*s[k];
623:       x[8+idt]  += v[8]*s[k];
624:       x[9+idt]  += v[9]*s[k];
625:       x[10+idt] += v[10]*s[k];
626:       v         += 11;
627:     }
628:   }
629:   VecRestoreArrayRead(bb,&b);
630:   VecRestoreArray(xx,&x);
631:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
632:   return(0);
633: }

635: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
636: {
637:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
638:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
639:   PetscErrorCode    ierr;
640:   PetscInt          i,nz,idx,idt,jdx;
641:   const MatScalar   *aa=a->a,*v;
642:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
643:   const PetscScalar *b;

646:   VecGetArrayRead(bb,&b);
647:   VecGetArray(xx,&x);
648:   /* forward solve the lower triangular */
649:   idx  = 0;
650:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
651:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
652:   x[6] = b[6+idx];
653:   for (i=1; i<n; i++) {
654:     v   =  aa + 49*ai[i];
655:     vi  =  aj + ai[i];
656:     nz  =  diag[i] - ai[i];
657:     idx =  7*i;
658:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
659:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
660:     s7  =  b[6+idx];
661:     while (nz--) {
662:       jdx = 7*(*vi++);
663:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
664:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
665:       x7  = x[6+jdx];
666:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
667:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
668:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
669:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
670:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
671:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
672:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
673:       v  += 49;
674:     }
675:     x[idx]   = s1;
676:     x[1+idx] = s2;
677:     x[2+idx] = s3;
678:     x[3+idx] = s4;
679:     x[4+idx] = s5;
680:     x[5+idx] = s6;
681:     x[6+idx] = s7;
682:   }
683:   /* backward solve the upper triangular */
684:   for (i=n-1; i>=0; i--) {
685:     v   = aa + 49*diag[i] + 49;
686:     vi  = aj + diag[i] + 1;
687:     nz  = ai[i+1] - diag[i] - 1;
688:     idt = 7*i;
689:     s1  = x[idt];   s2 = x[1+idt];
690:     s3  = x[2+idt]; s4 = x[3+idt];
691:     s5  = x[4+idt]; s6 = x[5+idt];
692:     s7  = x[6+idt];
693:     while (nz--) {
694:       idx = 7*(*vi++);
695:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
696:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
697:       x7  = x[6+idx];
698:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
699:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
700:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
701:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
702:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
703:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
704:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
705:       v  += 49;
706:     }
707:     v      = aa + 49*diag[i];
708:     x[idt] = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
709:              + v[28]*s5 + v[35]*s6 + v[42]*s7;
710:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
711:                + v[29]*s5 + v[36]*s6 + v[43]*s7;
712:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
713:                + v[30]*s5 + v[37]*s6 + v[44]*s7;
714:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
715:                + v[31]*s5 + v[38]*s6 + v[45]*s7;
716:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
717:                + v[32]*s5 + v[39]*s6 + v[46]*s7;
718:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
719:                + v[33]*s5 + v[40]*s6 + v[47]*s7;
720:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
721:                + v[34]*s5 + v[41]*s6 + v[48]*s7;
722:   }

724:   VecRestoreArrayRead(bb,&b);
725:   VecRestoreArray(xx,&x);
726:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
727:   return(0);
728: }

730: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
731: {
732:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
733:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
734:   PetscErrorCode    ierr;
735:   PetscInt          i,k,nz,idx,jdx,idt;
736:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
737:   const MatScalar   *aa=a->a,*v;
738:   PetscScalar       *x;
739:   const PetscScalar *b;
740:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

743:   VecGetArrayRead(bb,&b);
744:   VecGetArray(xx,&x);
745:   /* forward solve the lower triangular */
746:   idx  = 0;
747:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
748:   x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
749:   for (i=1; i<n; i++) {
750:     v   = aa + bs2*ai[i];
751:     vi  = aj + ai[i];
752:     nz  = ai[i+1] - ai[i];
753:     idx = bs*i;
754:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
755:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
756:     for (k=0; k<nz; k++) {
757:       jdx = bs*vi[k];
758:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
759:       x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
760:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
761:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
762:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
763:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
764:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
765:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
766:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
767:       v  +=  bs2;
768:     }

770:     x[idx]   = s1;
771:     x[1+idx] = s2;
772:     x[2+idx] = s3;
773:     x[3+idx] = s4;
774:     x[4+idx] = s5;
775:     x[5+idx] = s6;
776:     x[6+idx] = s7;
777:   }

779:   /* backward solve the upper triangular */
780:   for (i=n-1; i>=0; i--) {
781:     v   = aa + bs2*(adiag[i+1]+1);
782:     vi  = aj + adiag[i+1]+1;
783:     nz  = adiag[i] - adiag[i+1]-1;
784:     idt = bs*i;
785:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
786:     s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
787:     for (k=0; k<nz; k++) {
788:       idx = bs*vi[k];
789:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
790:       x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
791:       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
792:       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
793:       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
794:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
795:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
796:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
797:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
798:       v  +=  bs2;
799:     }
800:     /* x = inv_diagonal*x */
801:     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
802:     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
803:     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
804:     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
805:     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
806:     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
807:     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
808:   }

810:   VecRestoreArrayRead(bb,&b);
811:   VecRestoreArray(xx,&x);
812:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
813:   return(0);
814: }

816: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
817: {
818:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
819:   PetscInt          i,nz,idx,idt,jdx;
820:   PetscErrorCode    ierr;
821:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
822:   const MatScalar   *aa   =a->a,*v;
823:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
824:   const PetscScalar *b;

827:   VecGetArrayRead(bb,&b);
828:   VecGetArray(xx,&x);
829:   /* forward solve the lower triangular */
830:   idx  = 0;
831:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
832:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
833:   for (i=1; i<n; i++) {
834:     v   =  aa + 36*ai[i];
835:     vi  =  aj + ai[i];
836:     nz  =  diag[i] - ai[i];
837:     idx =  6*i;
838:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
839:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
840:     while (nz--) {
841:       jdx = 6*(*vi++);
842:       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
843:       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
844:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
845:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
846:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
847:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
848:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
849:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
850:       v  += 36;
851:     }
852:     x[idx]   = s1;
853:     x[1+idx] = s2;
854:     x[2+idx] = s3;
855:     x[3+idx] = s4;
856:     x[4+idx] = s5;
857:     x[5+idx] = s6;
858:   }
859:   /* backward solve the upper triangular */
860:   for (i=n-1; i>=0; i--) {
861:     v   = aa + 36*diag[i] + 36;
862:     vi  = aj + diag[i] + 1;
863:     nz  = ai[i+1] - diag[i] - 1;
864:     idt = 6*i;
865:     s1  = x[idt];   s2 = x[1+idt];
866:     s3  = x[2+idt]; s4 = x[3+idt];
867:     s5  = x[4+idt]; s6 = x[5+idt];
868:     while (nz--) {
869:       idx = 6*(*vi++);
870:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
871:       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
872:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
873:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
874:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
875:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
876:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
877:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
878:       v  += 36;
879:     }
880:     v        = aa + 36*diag[i];
881:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
882:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
883:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
884:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
885:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
886:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
887:   }

889:   VecRestoreArrayRead(bb,&b);
890:   VecRestoreArray(xx,&x);
891:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
892:   return(0);
893: }

895: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
896: {
897:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
898:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
899:   PetscErrorCode    ierr;
900:   PetscInt          i,k,nz,idx,jdx,idt;
901:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
902:   const MatScalar   *aa=a->a,*v;
903:   PetscScalar       *x;
904:   const PetscScalar *b;
905:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

908:   VecGetArrayRead(bb,&b);
909:   VecGetArray(xx,&x);
910:   /* forward solve the lower triangular */
911:   idx  = 0;
912:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
913:   x[4] = b[4+idx];x[5] = b[5+idx];
914:   for (i=1; i<n; i++) {
915:     v   = aa + bs2*ai[i];
916:     vi  = aj + ai[i];
917:     nz  = ai[i+1] - ai[i];
918:     idx = bs*i;
919:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
920:     s5  = b[4+idx];s6 = b[5+idx];
921:     for (k=0; k<nz; k++) {
922:       jdx = bs*vi[k];
923:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
924:       x5  = x[4+jdx]; x6 = x[5+jdx];
925:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
926:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
927:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
928:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
929:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
930:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
931:       v  +=  bs2;
932:     }

934:     x[idx]   = s1;
935:     x[1+idx] = s2;
936:     x[2+idx] = s3;
937:     x[3+idx] = s4;
938:     x[4+idx] = s5;
939:     x[5+idx] = s6;
940:   }

942:   /* backward solve the upper triangular */
943:   for (i=n-1; i>=0; i--) {
944:     v   = aa + bs2*(adiag[i+1]+1);
945:     vi  = aj + adiag[i+1]+1;
946:     nz  = adiag[i] - adiag[i+1]-1;
947:     idt = bs*i;
948:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
949:     s5  = x[4+idt];s6 = x[5+idt];
950:     for (k=0; k<nz; k++) {
951:       idx = bs*vi[k];
952:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
953:       x5  = x[4+idx];x6 = x[5+idx];
954:       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
955:       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
956:       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
957:       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
958:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
959:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
960:       v  +=  bs2;
961:     }
962:     /* x = inv_diagonal*x */
963:     x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
964:     x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
965:     x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
966:     x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
967:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
968:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
969:   }

971:   VecRestoreArrayRead(bb,&b);
972:   VecRestoreArray(xx,&x);
973:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
974:   return(0);
975: }

977: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
978: {
979:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
980:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
981:   PetscInt          i,nz,idx,idt,jdx;
982:   PetscErrorCode    ierr;
983:   const MatScalar   *aa=a->a,*v;
984:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
985:   const PetscScalar *b;

988:   VecGetArrayRead(bb,&b);
989:   VecGetArray(xx,&x);
990:   /* forward solve the lower triangular */
991:   idx  = 0;
992:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
993:   for (i=1; i<n; i++) {
994:     v   =  aa + 25*ai[i];
995:     vi  =  aj + ai[i];
996:     nz  =  diag[i] - ai[i];
997:     idx =  5*i;
998:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
999:     while (nz--) {
1000:       jdx = 5*(*vi++);
1001:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1002:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1003:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1004:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1005:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1006:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1007:       v  += 25;
1008:     }
1009:     x[idx]   = s1;
1010:     x[1+idx] = s2;
1011:     x[2+idx] = s3;
1012:     x[3+idx] = s4;
1013:     x[4+idx] = s5;
1014:   }
1015:   /* backward solve the upper triangular */
1016:   for (i=n-1; i>=0; i--) {
1017:     v   = aa + 25*diag[i] + 25;
1018:     vi  = aj + diag[i] + 1;
1019:     nz  = ai[i+1] - diag[i] - 1;
1020:     idt = 5*i;
1021:     s1  = x[idt];  s2 = x[1+idt];
1022:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1023:     while (nz--) {
1024:       idx = 5*(*vi++);
1025:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1026:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1027:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1028:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1029:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1030:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1031:       v  += 25;
1032:     }
1033:     v        = aa + 25*diag[i];
1034:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1035:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1036:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1037:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1038:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1039:   }

1041:   VecRestoreArrayRead(bb,&b);
1042:   VecRestoreArray(xx,&x);
1043:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1044:   return(0);
1045: }

1047: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1048: {
1049:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1050:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1051:   PetscInt          i,k,nz,idx,idt,jdx;
1052:   PetscErrorCode    ierr;
1053:   const MatScalar   *aa=a->a,*v;
1054:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1055:   const PetscScalar *b;

1058:   VecGetArrayRead(bb,&b);
1059:   VecGetArray(xx,&x);
1060:   /* forward solve the lower triangular */
1061:   idx  = 0;
1062:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1063:   for (i=1; i<n; i++) {
1064:     v   = aa + 25*ai[i];
1065:     vi  = aj + ai[i];
1066:     nz  = ai[i+1] - ai[i];
1067:     idx = 5*i;
1068:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1069:     for (k=0; k<nz; k++) {
1070:       jdx = 5*vi[k];
1071:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1072:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1073:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1074:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1075:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1076:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1077:       v  += 25;
1078:     }
1079:     x[idx]   = s1;
1080:     x[1+idx] = s2;
1081:     x[2+idx] = s3;
1082:     x[3+idx] = s4;
1083:     x[4+idx] = s5;
1084:   }

1086:   /* backward solve the upper triangular */
1087:   for (i=n-1; i>=0; i--) {
1088:     v   = aa + 25*(adiag[i+1]+1);
1089:     vi  = aj + adiag[i+1]+1;
1090:     nz  = adiag[i] - adiag[i+1]-1;
1091:     idt = 5*i;
1092:     s1  = x[idt];  s2 = x[1+idt];
1093:     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1094:     for (k=0; k<nz; k++) {
1095:       idx = 5*vi[k];
1096:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1097:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1098:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1099:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1100:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1101:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1102:       v  += 25;
1103:     }
1104:     /* x = inv_diagonal*x */
1105:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1106:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1107:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1108:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1109:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1110:   }

1112:   VecRestoreArrayRead(bb,&b);
1113:   VecRestoreArray(xx,&x);
1114:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1115:   return(0);
1116: }

1118: /*
1119:       Special case where the matrix was ILU(0) factored in the natural
1120:    ordering. This eliminates the need for the column and row permutation.
1121: */
1122: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1123: {
1124:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1125:   PetscInt          n  =a->mbs;
1126:   const PetscInt    *ai=a->i,*aj=a->j;
1127:   PetscErrorCode    ierr;
1128:   const PetscInt    *diag = a->diag;
1129:   const MatScalar   *aa   =a->a;
1130:   PetscScalar       *x;
1131:   const PetscScalar *b;

1134:   VecGetArrayRead(bb,&b);
1135:   VecGetArray(xx,&x);

1137: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1138:   {
1139:     static PetscScalar w[2000]; /* very BAD need to fix */
1140:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1141:   }
1142: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1143:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1144: #else
1145:   {
1146:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
1147:     const MatScalar *v;
1148:     PetscInt        jdx,idt,idx,nz,i,ai16;
1149:     const PetscInt  *vi;

1151:     /* forward solve the lower triangular */
1152:     idx  = 0;
1153:     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1154:     for (i=1; i<n; i++) {
1155:       v    =  aa      + 16*ai[i];
1156:       vi   =  aj      + ai[i];
1157:       nz   =  diag[i] - ai[i];
1158:       idx +=  4;
1159:       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1160:       while (nz--) {
1161:         jdx = 4*(*vi++);
1162:         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1163:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1164:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1165:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1166:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1167:         v  += 16;
1168:       }
1169:       x[idx]   = s1;
1170:       x[1+idx] = s2;
1171:       x[2+idx] = s3;
1172:       x[3+idx] = s4;
1173:     }
1174:     /* backward solve the upper triangular */
1175:     idt = 4*(n-1);
1176:     for (i=n-1; i>=0; i--) {
1177:       ai16 = 16*diag[i];
1178:       v    = aa + ai16 + 16;
1179:       vi   = aj + diag[i] + 1;
1180:       nz   = ai[i+1] - diag[i] - 1;
1181:       s1   = x[idt];  s2 = x[1+idt];
1182:       s3   = x[2+idt];s4 = x[3+idt];
1183:       while (nz--) {
1184:         idx = 4*(*vi++);
1185:         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1186:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1187:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1188:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1189:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1190:         v  += 16;
1191:       }
1192:       v        = aa + ai16;
1193:       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1194:       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1195:       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1196:       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1197:       idt     -= 4;
1198:     }
1199:   }
1200: #endif

1202:   VecRestoreArrayRead(bb,&b);
1203:   VecRestoreArray(xx,&x);
1204:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1205:   return(0);
1206: }

1208: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1209: {
1210:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1211:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1212:   PetscInt          i,k,nz,idx,jdx,idt;
1213:   PetscErrorCode    ierr;
1214:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1215:   const MatScalar   *aa=a->a,*v;
1216:   PetscScalar       *x;
1217:   const PetscScalar *b;
1218:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

1221:   VecGetArrayRead(bb,&b);
1222:   VecGetArray(xx,&x);
1223:   /* forward solve the lower triangular */
1224:   idx  = 0;
1225:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1226:   for (i=1; i<n; i++) {
1227:     v   = aa + bs2*ai[i];
1228:     vi  = aj + ai[i];
1229:     nz  = ai[i+1] - ai[i];
1230:     idx = bs*i;
1231:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1232:     for (k=0; k<nz; k++) {
1233:       jdx = bs*vi[k];
1234:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1235:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1236:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1237:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1238:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

1240:       v +=  bs2;
1241:     }

1243:     x[idx]   = s1;
1244:     x[1+idx] = s2;
1245:     x[2+idx] = s3;
1246:     x[3+idx] = s4;
1247:   }

1249:   /* backward solve the upper triangular */
1250:   for (i=n-1; i>=0; i--) {
1251:     v   = aa + bs2*(adiag[i+1]+1);
1252:     vi  = aj + adiag[i+1]+1;
1253:     nz  = adiag[i] - adiag[i+1]-1;
1254:     idt = bs*i;
1255:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];

1257:     for (k=0; k<nz; k++) {
1258:       idx = bs*vi[k];
1259:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1260:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1261:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1262:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1263:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

1265:       v +=  bs2;
1266:     }
1267:     /* x = inv_diagonal*x */
1268:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
1269:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
1270:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1271:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;

1273:   }

1275:   VecRestoreArrayRead(bb,&b);
1276:   VecRestoreArray(xx,&x);
1277:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1278:   return(0);
1279: }

1281: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
1282: {
1283:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1284:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
1285:   PetscErrorCode    ierr;
1286:   const MatScalar   *aa=a->a;
1287:   const PetscScalar *b;
1288:   PetscScalar       *x;

1291:   VecGetArrayRead(bb,&b);
1292:   VecGetArray(xx,&x);

1294:   {
1295:     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
1296:     const MatScalar *v;
1297:     MatScalar       *t=(MatScalar*)x;
1298:     PetscInt        jdx,idt,idx,nz,i,ai16;
1299:     const PetscInt  *vi;

1301:     /* forward solve the lower triangular */
1302:     idx  = 0;
1303:     t[0] = (MatScalar)b[0];
1304:     t[1] = (MatScalar)b[1];
1305:     t[2] = (MatScalar)b[2];
1306:     t[3] = (MatScalar)b[3];
1307:     for (i=1; i<n; i++) {
1308:       v    =  aa      + 16*ai[i];
1309:       vi   =  aj      + ai[i];
1310:       nz   =  diag[i] - ai[i];
1311:       idx +=  4;
1312:       s1   = (MatScalar)b[idx];
1313:       s2   = (MatScalar)b[1+idx];
1314:       s3   = (MatScalar)b[2+idx];
1315:       s4   = (MatScalar)b[3+idx];
1316:       while (nz--) {
1317:         jdx = 4*(*vi++);
1318:         x1  = t[jdx];
1319:         x2  = t[1+jdx];
1320:         x3  = t[2+jdx];
1321:         x4  = t[3+jdx];
1322:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1323:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1324:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1325:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1326:         v  += 16;
1327:       }
1328:       t[idx]   = s1;
1329:       t[1+idx] = s2;
1330:       t[2+idx] = s3;
1331:       t[3+idx] = s4;
1332:     }
1333:     /* backward solve the upper triangular */
1334:     idt = 4*(n-1);
1335:     for (i=n-1; i>=0; i--) {
1336:       ai16 = 16*diag[i];
1337:       v    = aa + ai16 + 16;
1338:       vi   = aj + diag[i] + 1;
1339:       nz   = ai[i+1] - diag[i] - 1;
1340:       s1   = t[idt];
1341:       s2   = t[1+idt];
1342:       s3   = t[2+idt];
1343:       s4   = t[3+idt];
1344:       while (nz--) {
1345:         idx = 4*(*vi++);
1346:         x1  = (MatScalar)x[idx];
1347:         x2  = (MatScalar)x[1+idx];
1348:         x3  = (MatScalar)x[2+idx];
1349:         x4  = (MatScalar)x[3+idx];
1350:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1351:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1352:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1353:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1354:         v  += 16;
1355:       }
1356:       v        = aa + ai16;
1357:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
1358:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
1359:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
1360:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
1361:       idt     -= 4;
1362:     }
1363:   }

1365:   VecRestoreArrayRead(bb,&b);
1366:   VecRestoreArray(xx,&x);
1367:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1368:   return(0);
1369: }

1371: #if defined(PETSC_HAVE_SSE)

1373: #include PETSC_HAVE_SSE
1374: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
1375: {
1376:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1377:   unsigned short *aj=(unsigned short*)a->j;
1379:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
1380:   MatScalar      *aa=a->a;
1381:   PetscScalar    *x,*b;

1384:   SSE_SCOPE_BEGIN;
1385:   /*
1386:      Note: This code currently uses demotion of double
1387:      to float when performing the mixed-mode computation.
1388:      This may not be numerically reasonable for all applications.
1389:   */
1390:   PREFETCH_NTA(aa+16*ai[1]);

1392:   VecGetArray(bb,&b);
1393:   VecGetArray(xx,&x);
1394:   {
1395:     /* x will first be computed in single precision then promoted inplace to double */
1396:     MatScalar      *v,*t=(MatScalar*)x;
1397:     int            nz,i,idt,ai16;
1398:     unsigned int   jdx,idx;
1399:     unsigned short *vi;
1400:     /* Forward solve the lower triangular factor. */

1402:     /* First block is the identity. */
1403:     idx = 0;
1404:     CONVERT_DOUBLE4_FLOAT4(t,b);
1405:     v =  aa + 16*((unsigned int)ai[1]);

1407:     for (i=1; i<n; ) {
1408:       PREFETCH_NTA(&v[8]);
1409:       vi   =  aj      + ai[i];
1410:       nz   =  diag[i] - ai[i];
1411:       idx +=  4;

1413:       /* Demote RHS from double to float. */
1414:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1415:       LOAD_PS(&t[idx],XMM7);

1417:       while (nz--) {
1418:         PREFETCH_NTA(&v[16]);
1419:         jdx = 4*((unsigned int)(*vi++));

1421:         /* 4x4 Matrix-Vector product with negative accumulation: */
1422:         SSE_INLINE_BEGIN_2(&t[jdx],v)
1423:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1425:         /* First Column */
1426:         SSE_COPY_PS(XMM0,XMM6)
1427:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1428:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1429:         SSE_SUB_PS(XMM7,XMM0)

1431:         /* Second Column */
1432:         SSE_COPY_PS(XMM1,XMM6)
1433:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1434:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1435:         SSE_SUB_PS(XMM7,XMM1)

1437:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1439:         /* Third Column */
1440:         SSE_COPY_PS(XMM2,XMM6)
1441:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1442:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1443:         SSE_SUB_PS(XMM7,XMM2)

1445:         /* Fourth Column */
1446:         SSE_COPY_PS(XMM3,XMM6)
1447:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1448:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1449:         SSE_SUB_PS(XMM7,XMM3)
1450:         SSE_INLINE_END_2

1452:         v += 16;
1453:       }
1454:       v =  aa + 16*ai[++i];
1455:       PREFETCH_NTA(v);
1456:       STORE_PS(&t[idx],XMM7);
1457:     }

1459:     /* Backward solve the upper triangular factor.*/

1461:     idt  = 4*(n-1);
1462:     ai16 = 16*diag[n-1];
1463:     v    = aa + ai16 + 16;
1464:     for (i=n-1; i>=0; ) {
1465:       PREFETCH_NTA(&v[8]);
1466:       vi = aj + diag[i] + 1;
1467:       nz = ai[i+1] - diag[i] - 1;

1469:       LOAD_PS(&t[idt],XMM7);

1471:       while (nz--) {
1472:         PREFETCH_NTA(&v[16]);
1473:         idx = 4*((unsigned int)(*vi++));

1475:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1476:         SSE_INLINE_BEGIN_2(&t[idx],v)
1477:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1479:         /* First Column */
1480:         SSE_COPY_PS(XMM0,XMM6)
1481:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1482:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1483:         SSE_SUB_PS(XMM7,XMM0)

1485:         /* Second Column */
1486:         SSE_COPY_PS(XMM1,XMM6)
1487:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1488:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1489:         SSE_SUB_PS(XMM7,XMM1)

1491:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1493:         /* Third Column */
1494:         SSE_COPY_PS(XMM2,XMM6)
1495:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1496:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1497:         SSE_SUB_PS(XMM7,XMM2)

1499:         /* Fourth Column */
1500:         SSE_COPY_PS(XMM3,XMM6)
1501:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1502:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1503:         SSE_SUB_PS(XMM7,XMM3)
1504:         SSE_INLINE_END_2
1505:         v += 16;
1506:       }
1507:       v    = aa + ai16;
1508:       ai16 = 16*diag[--i];
1509:       PREFETCH_NTA(aa+ai16+16);
1510:       /*
1511:          Scale the result by the diagonal 4x4 block,
1512:          which was inverted as part of the factorization
1513:       */
1514:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1515:       /* First Column */
1516:       SSE_COPY_PS(XMM0,XMM7)
1517:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1518:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1520:       /* Second Column */
1521:       SSE_COPY_PS(XMM1,XMM7)
1522:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1523:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1524:       SSE_ADD_PS(XMM0,XMM1)

1526:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1528:       /* Third Column */
1529:       SSE_COPY_PS(XMM2,XMM7)
1530:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1531:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1532:       SSE_ADD_PS(XMM0,XMM2)

1534:       /* Fourth Column */
1535:       SSE_COPY_PS(XMM3,XMM7)
1536:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1537:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1538:       SSE_ADD_PS(XMM0,XMM3)

1540:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1541:       SSE_INLINE_END_3

1543:       v    = aa + ai16 + 16;
1544:       idt -= 4;
1545:     }

1547:     /* Convert t from single precision back to double precision (inplace)*/
1548:     idt = 4*(n-1);
1549:     for (i=n-1; i>=0; i--) {
1550:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1551:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1552:       PetscScalar *xtemp=&x[idt];
1553:       MatScalar   *ttemp=&t[idt];
1554:       xtemp[3] = (PetscScalar)ttemp[3];
1555:       xtemp[2] = (PetscScalar)ttemp[2];
1556:       xtemp[1] = (PetscScalar)ttemp[1];
1557:       xtemp[0] = (PetscScalar)ttemp[0];
1558:       idt     -= 4;
1559:     }

1561:   } /* End of artificial scope. */
1562:   VecRestoreArray(bb,&b);
1563:   VecRestoreArray(xx,&x);
1564:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1565:   SSE_SCOPE_END;
1566:   return(0);
1567: }

1569: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1570: {
1571:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1572:   int            *aj=a->j;
1574:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
1575:   MatScalar      *aa=a->a;
1576:   PetscScalar    *x,*b;

1579:   SSE_SCOPE_BEGIN;
1580:   /*
1581:      Note: This code currently uses demotion of double
1582:      to float when performing the mixed-mode computation.
1583:      This may not be numerically reasonable for all applications.
1584:   */
1585:   PREFETCH_NTA(aa+16*ai[1]);

1587:   VecGetArray(bb,&b);
1588:   VecGetArray(xx,&x);
1589:   {
1590:     /* x will first be computed in single precision then promoted inplace to double */
1591:     MatScalar *v,*t=(MatScalar*)x;
1592:     int       nz,i,idt,ai16;
1593:     int       jdx,idx;
1594:     int       *vi;
1595:     /* Forward solve the lower triangular factor. */

1597:     /* First block is the identity. */
1598:     idx = 0;
1599:     CONVERT_DOUBLE4_FLOAT4(t,b);
1600:     v =  aa + 16*ai[1];

1602:     for (i=1; i<n; ) {
1603:       PREFETCH_NTA(&v[8]);
1604:       vi   =  aj      + ai[i];
1605:       nz   =  diag[i] - ai[i];
1606:       idx +=  4;

1608:       /* Demote RHS from double to float. */
1609:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
1610:       LOAD_PS(&t[idx],XMM7);

1612:       while (nz--) {
1613:         PREFETCH_NTA(&v[16]);
1614:         jdx = 4*(*vi++);
1615: /*          jdx = *vi++; */

1617:         /* 4x4 Matrix-Vector product with negative accumulation: */
1618:         SSE_INLINE_BEGIN_2(&t[jdx],v)
1619:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1621:         /* First Column */
1622:         SSE_COPY_PS(XMM0,XMM6)
1623:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1624:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1625:         SSE_SUB_PS(XMM7,XMM0)

1627:         /* Second Column */
1628:         SSE_COPY_PS(XMM1,XMM6)
1629:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1630:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1631:         SSE_SUB_PS(XMM7,XMM1)

1633:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1635:         /* Third Column */
1636:         SSE_COPY_PS(XMM2,XMM6)
1637:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1638:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1639:         SSE_SUB_PS(XMM7,XMM2)

1641:         /* Fourth Column */
1642:         SSE_COPY_PS(XMM3,XMM6)
1643:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1644:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1645:         SSE_SUB_PS(XMM7,XMM3)
1646:         SSE_INLINE_END_2

1648:         v += 16;
1649:       }
1650:       v =  aa + 16*ai[++i];
1651:       PREFETCH_NTA(v);
1652:       STORE_PS(&t[idx],XMM7);
1653:     }

1655:     /* Backward solve the upper triangular factor.*/

1657:     idt  = 4*(n-1);
1658:     ai16 = 16*diag[n-1];
1659:     v    = aa + ai16 + 16;
1660:     for (i=n-1; i>=0; ) {
1661:       PREFETCH_NTA(&v[8]);
1662:       vi = aj + diag[i] + 1;
1663:       nz = ai[i+1] - diag[i] - 1;

1665:       LOAD_PS(&t[idt],XMM7);

1667:       while (nz--) {
1668:         PREFETCH_NTA(&v[16]);
1669:         idx = 4*(*vi++);
1670: /*          idx = *vi++; */

1672:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1673:         SSE_INLINE_BEGIN_2(&t[idx],v)
1674:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1676:         /* First Column */
1677:         SSE_COPY_PS(XMM0,XMM6)
1678:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1679:         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1680:         SSE_SUB_PS(XMM7,XMM0)

1682:         /* Second Column */
1683:         SSE_COPY_PS(XMM1,XMM6)
1684:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1685:         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1686:         SSE_SUB_PS(XMM7,XMM1)

1688:         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

1690:         /* Third Column */
1691:         SSE_COPY_PS(XMM2,XMM6)
1692:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1693:         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1694:         SSE_SUB_PS(XMM7,XMM2)

1696:         /* Fourth Column */
1697:         SSE_COPY_PS(XMM3,XMM6)
1698:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1699:         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1700:         SSE_SUB_PS(XMM7,XMM3)
1701:         SSE_INLINE_END_2
1702:         v += 16;
1703:       }
1704:       v    = aa + ai16;
1705:       ai16 = 16*diag[--i];
1706:       PREFETCH_NTA(aa+ai16+16);
1707:       /*
1708:          Scale the result by the diagonal 4x4 block,
1709:          which was inverted as part of the factorization
1710:       */
1711:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
1712:       /* First Column */
1713:       SSE_COPY_PS(XMM0,XMM7)
1714:       SSE_SHUFFLE(XMM0,XMM0,0x00)
1715:       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1717:       /* Second Column */
1718:       SSE_COPY_PS(XMM1,XMM7)
1719:       SSE_SHUFFLE(XMM1,XMM1,0x55)
1720:       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1721:       SSE_ADD_PS(XMM0,XMM1)

1723:       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1725:       /* Third Column */
1726:       SSE_COPY_PS(XMM2,XMM7)
1727:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1728:       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1729:       SSE_ADD_PS(XMM0,XMM2)

1731:       /* Fourth Column */
1732:       SSE_COPY_PS(XMM3,XMM7)
1733:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1734:       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1735:       SSE_ADD_PS(XMM0,XMM3)

1737:       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1738:       SSE_INLINE_END_3

1740:       v    = aa + ai16 + 16;
1741:       idt -= 4;
1742:     }

1744:     /* Convert t from single precision back to double precision (inplace)*/
1745:     idt = 4*(n-1);
1746:     for (i=n-1; i>=0; i--) {
1747:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
1748:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
1749:       PetscScalar *xtemp=&x[idt];
1750:       MatScalar   *ttemp=&t[idt];
1751:       xtemp[3] = (PetscScalar)ttemp[3];
1752:       xtemp[2] = (PetscScalar)ttemp[2];
1753:       xtemp[1] = (PetscScalar)ttemp[1];
1754:       xtemp[0] = (PetscScalar)ttemp[0];
1755:       idt     -= 4;
1756:     }

1758:   } /* End of artificial scope. */
1759:   VecRestoreArray(bb,&b);
1760:   VecRestoreArray(xx,&x);
1761:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1762:   SSE_SCOPE_END;
1763:   return(0);
1764: }

1766: #endif

1768: /*
1769:       Special case where the matrix was ILU(0) factored in the natural
1770:    ordering. This eliminates the need for the column and row permutation.
1771: */
1772: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1773: {
1774:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1775:   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j;
1776:   PetscErrorCode    ierr;
1777:   const PetscInt    *diag = a->diag,*vi;
1778:   const MatScalar   *aa   =a->a,*v;
1779:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
1780:   const PetscScalar *b;
1781:   PetscInt          jdx,idt,idx,nz,i;

1784:   VecGetArrayRead(bb,&b);
1785:   VecGetArray(xx,&x);

1787:   /* forward solve the lower triangular */
1788:   idx  = 0;
1789:   x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
1790:   for (i=1; i<n; i++) {
1791:     v    =  aa      + 9*ai[i];
1792:     vi   =  aj      + ai[i];
1793:     nz   =  diag[i] - ai[i];
1794:     idx +=  3;
1795:     s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1796:     while (nz--) {
1797:       jdx = 3*(*vi++);
1798:       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1799:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1800:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1801:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1802:       v  += 9;
1803:     }
1804:     x[idx]   = s1;
1805:     x[1+idx] = s2;
1806:     x[2+idx] = s3;
1807:   }
1808:   /* backward solve the upper triangular */
1809:   for (i=n-1; i>=0; i--) {
1810:     v   = aa + 9*diag[i] + 9;
1811:     vi  = aj + diag[i] + 1;
1812:     nz  = ai[i+1] - diag[i] - 1;
1813:     idt = 3*i;
1814:     s1  = x[idt];  s2 = x[1+idt];
1815:     s3  = x[2+idt];
1816:     while (nz--) {
1817:       idx = 3*(*vi++);
1818:       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1819:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1820:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1821:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1822:       v  += 9;
1823:     }
1824:     v        = aa +  9*diag[i];
1825:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1826:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1827:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1828:   }

1830:   VecRestoreArrayRead(bb,&b);
1831:   VecRestoreArray(xx,&x);
1832:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1833:   return(0);
1834: }

1836: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1837: {
1838:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1839:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1840:   PetscErrorCode    ierr;
1841:   PetscInt          i,k,nz,idx,jdx,idt;
1842:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1843:   const MatScalar   *aa=a->a,*v;
1844:   PetscScalar       *x;
1845:   const PetscScalar *b;
1846:   PetscScalar       s1,s2,s3,x1,x2,x3;

1849:   VecGetArrayRead(bb,&b);
1850:   VecGetArray(xx,&x);
1851:   /* forward solve the lower triangular */
1852:   idx  = 0;
1853:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1854:   for (i=1; i<n; i++) {
1855:     v   = aa + bs2*ai[i];
1856:     vi  = aj + ai[i];
1857:     nz  = ai[i+1] - ai[i];
1858:     idx = bs*i;
1859:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1860:     for (k=0; k<nz; k++) {
1861:       jdx = bs*vi[k];
1862:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1863:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1864:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1865:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1867:       v +=  bs2;
1868:     }

1870:     x[idx]   = s1;
1871:     x[1+idx] = s2;
1872:     x[2+idx] = s3;
1873:   }

1875:   /* backward solve the upper triangular */
1876:   for (i=n-1; i>=0; i--) {
1877:     v   = aa + bs2*(adiag[i+1]+1);
1878:     vi  = aj + adiag[i+1]+1;
1879:     nz  = adiag[i] - adiag[i+1]-1;
1880:     idt = bs*i;
1881:     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];

1883:     for (k=0; k<nz; k++) {
1884:       idx = bs*vi[k];
1885:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1886:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1887:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1888:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1890:       v +=  bs2;
1891:     }
1892:     /* x = inv_diagonal*x */
1893:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1894:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1895:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

1897:   }

1899:   VecRestoreArrayRead(bb,&b);
1900:   VecRestoreArray(xx,&x);
1901:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1902:   return(0);
1903: }

1905: PetscErrorCode MatForwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1906: {
1907:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1908:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j;
1909:   PetscErrorCode    ierr;
1910:   PetscInt          i,k,nz,idx,jdx;
1911:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1912:   const MatScalar   *aa=a->a,*v;
1913:   PetscScalar       *x;
1914:   const PetscScalar *b;
1915:   PetscScalar       s1,s2,s3,x1,x2,x3;

1918:   VecGetArrayRead(bb,&b);
1919:   VecGetArray(xx,&x);
1920:   /* forward solve the lower triangular */
1921:   idx  = 0;
1922:   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
1923:   for (i=1; i<n; i++) {
1924:     v   = aa + bs2*ai[i];
1925:     vi  = aj + ai[i];
1926:     nz  = ai[i+1] - ai[i];
1927:     idx = bs*i;
1928:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
1929:     for (k=0; k<nz; k++) {
1930:       jdx = bs*vi[k];
1931:       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
1932:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1933:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1934:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1936:       v +=  bs2;
1937:     }

1939:     x[idx]   = s1;
1940:     x[1+idx] = s2;
1941:     x[2+idx] = s3;
1942:   }

1944:   VecRestoreArrayRead(bb,&b);
1945:   VecRestoreArray(xx,&x);
1946:   PetscLogFlops(1.0*bs2*(a->nz) - bs*A->cmap->n);
1947:   return(0);
1948: }


1951: PetscErrorCode MatBackwardSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1952: {
1953:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1954:   const PetscInt    n  =a->mbs,*vi,*aj=a->j,*adiag=a->diag;
1955:   PetscErrorCode    ierr;
1956:   PetscInt          i,k,nz,idx,idt;
1957:   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1958:   const MatScalar   *aa=a->a,*v;
1959:   PetscScalar       *x;
1960:   const PetscScalar *b;
1961:   PetscScalar       s1,s2,s3,x1,x2,x3;

1964:   VecGetArrayRead(bb,&b);
1965:   VecGetArray(xx,&x);

1967:   /* backward solve the upper triangular */
1968:   for (i=n-1; i>=0; i--) {
1969:     v   = aa + bs2*(adiag[i+1]+1);
1970:     vi  = aj + adiag[i+1]+1;
1971:     nz  = adiag[i] - adiag[i+1]-1;
1972:     idt = bs*i;
1973:     s1  = b[idt];  s2 = b[1+idt];s3 = b[2+idt];

1975:     for (k=0; k<nz; k++) {
1976:       idx = bs*vi[k];
1977:       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1978:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1979:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1980:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

1982:       v +=  bs2;
1983:     }
1984:     /* x = inv_diagonal*x */
1985:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1986:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1987:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

1989:   }

1991:   VecRestoreArrayRead(bb,&b);
1992:   VecRestoreArray(xx,&x);
1993:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1994:   return(0);
1995: }

1997: /*
1998:       Special case where the matrix was ILU(0) factored in the natural
1999:    ordering. This eliminates the need for the column and row permutation.
2000: */
2001: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2002: {
2003:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2004:   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2005:   PetscErrorCode    ierr;
2006:   const MatScalar   *aa=a->a,*v;
2007:   PetscScalar       *x,s1,s2,x1,x2;
2008:   const PetscScalar *b;
2009:   PetscInt          jdx,idt,idx,nz,i;

2012:   VecGetArrayRead(bb,&b);
2013:   VecGetArray(xx,&x);

2015:   /* forward solve the lower triangular */
2016:   idx  = 0;
2017:   x[0] = b[0]; x[1] = b[1];
2018:   for (i=1; i<n; i++) {
2019:     v    =  aa      + 4*ai[i];
2020:     vi   =  aj      + ai[i];
2021:     nz   =  diag[i] - ai[i];
2022:     idx +=  2;
2023:     s1   =  b[idx];s2 = b[1+idx];
2024:     while (nz--) {
2025:       jdx = 2*(*vi++);
2026:       x1  = x[jdx];x2 = x[1+jdx];
2027:       s1 -= v[0]*x1 + v[2]*x2;
2028:       s2 -= v[1]*x1 + v[3]*x2;
2029:       v  += 4;
2030:     }
2031:     x[idx]   = s1;
2032:     x[1+idx] = s2;
2033:   }
2034:   /* backward solve the upper triangular */
2035:   for (i=n-1; i>=0; i--) {
2036:     v   = aa + 4*diag[i] + 4;
2037:     vi  = aj + diag[i] + 1;
2038:     nz  = ai[i+1] - diag[i] - 1;
2039:     idt = 2*i;
2040:     s1  = x[idt];  s2 = x[1+idt];
2041:     while (nz--) {
2042:       idx = 2*(*vi++);
2043:       x1  = x[idx];   x2 = x[1+idx];
2044:       s1 -= v[0]*x1 + v[2]*x2;
2045:       s2 -= v[1]*x1 + v[3]*x2;
2046:       v  += 4;
2047:     }
2048:     v        = aa +  4*diag[i];
2049:     x[idt]   = v[0]*s1 + v[2]*s2;
2050:     x[1+idt] = v[1]*s1 + v[3]*s2;
2051:   }

2053:   VecRestoreArrayRead(bb,&b);
2054:   VecRestoreArray(xx,&x);
2055:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2056:   return(0);
2057: }

2059: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2060: {
2061:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2062:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2063:   PetscInt          i,k,nz,idx,idt,jdx;
2064:   PetscErrorCode    ierr;
2065:   const MatScalar   *aa=a->a,*v;
2066:   PetscScalar       *x,s1,s2,x1,x2;
2067:   const PetscScalar *b;

2070:   VecGetArrayRead(bb,&b);
2071:   VecGetArray(xx,&x);
2072:   /* forward solve the lower triangular */
2073:   idx  = 0;
2074:   x[0] = b[idx]; x[1] = b[1+idx];
2075:   for (i=1; i<n; i++) {
2076:     v   = aa + 4*ai[i];
2077:     vi  = aj + ai[i];
2078:     nz  = ai[i+1] - ai[i];
2079:     idx = 2*i;
2080:     s1  = b[idx];s2 = b[1+idx];
2081:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2082:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2083:     for (k=0; k<nz; k++) {
2084:       jdx = 2*vi[k];
2085:       x1  = x[jdx];x2 = x[1+jdx];
2086:       s1 -= v[0]*x1 + v[2]*x2;
2087:       s2 -= v[1]*x1 + v[3]*x2;
2088:       v  +=  4;
2089:     }
2090:     x[idx]   = s1;
2091:     x[1+idx] = s2;
2092:   }

2094:   /* backward solve the upper triangular */
2095:   for (i=n-1; i>=0; i--) {
2096:     v   = aa + 4*(adiag[i+1]+1);
2097:     vi  = aj + adiag[i+1]+1;
2098:     nz  = adiag[i] - adiag[i+1]-1;
2099:     idt = 2*i;
2100:     s1  = x[idt];  s2 = x[1+idt];
2101:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2102:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2103:     for (k=0; k<nz; k++) {
2104:       idx = 2*vi[k];
2105:       x1  = x[idx];   x2 = x[1+idx];
2106:       s1 -= v[0]*x1 + v[2]*x2;
2107:       s2 -= v[1]*x1 + v[3]*x2;
2108:       v  += 4;
2109:     }
2110:     /* x = inv_diagonal*x */
2111:     x[idt]   = v[0]*s1 + v[2]*s2;
2112:     x[1+idt] = v[1]*s1 + v[3]*s2;
2113:   }

2115:   VecRestoreArrayRead(bb,&b);
2116:   VecRestoreArray(xx,&x);
2117:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
2118:   return(0);
2119: }

2121: PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2122: {
2123:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2124:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j;
2125:   PetscInt          i,k,nz,idx,jdx;
2126:   PetscErrorCode    ierr;
2127:   const MatScalar   *aa=a->a,*v;
2128:   PetscScalar       *x,s1,s2,x1,x2;
2129:   const PetscScalar *b;

2132:   VecGetArrayRead(bb,&b);
2133:   VecGetArray(xx,&x);
2134:   /* forward solve the lower triangular */
2135:   idx  = 0;
2136:   x[0] = b[idx]; x[1] = b[1+idx];
2137:   for (i=1; i<n; i++) {
2138:     v   = aa + 4*ai[i];
2139:     vi  = aj + ai[i];
2140:     nz  = ai[i+1] - ai[i];
2141:     idx = 2*i;
2142:     s1  = b[idx];s2 = b[1+idx];
2143:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2144:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2145:     for (k=0; k<nz; k++) {
2146:       jdx = 2*vi[k];
2147:       x1  = x[jdx];x2 = x[1+jdx];
2148:       s1 -= v[0]*x1 + v[2]*x2;
2149:       s2 -= v[1]*x1 + v[3]*x2;
2150:       v  +=  4;
2151:     }
2152:     x[idx]   = s1;
2153:     x[1+idx] = s2;
2154:   }


2157:   VecRestoreArrayRead(bb,&b);
2158:   VecRestoreArray(xx,&x);
2159:   PetscLogFlops(4.0*(a->nz) - A->cmap->n);
2160:   return(0);
2161: }

2163: PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2164: {
2165:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2166:   const PetscInt    n  = a->mbs,*vi,*aj=a->j,*adiag=a->diag;
2167:   PetscInt          i,k,nz,idx,idt;
2168:   PetscErrorCode    ierr;
2169:   const MatScalar   *aa=a->a,*v;
2170:   PetscScalar       *x,s1,s2,x1,x2;
2171:   const PetscScalar *b;

2174:   VecGetArrayRead(bb,&b);
2175:   VecGetArray(xx,&x);

2177:   /* backward solve the upper triangular */
2178:   for (i=n-1; i>=0; i--) {
2179:     v   = aa + 4*(adiag[i+1]+1);
2180:     vi  = aj + adiag[i+1]+1;
2181:     nz  = adiag[i] - adiag[i+1]-1;
2182:     idt = 2*i;
2183:     s1  = b[idt];  s2 = b[1+idt];
2184:     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
2185:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
2186:     for (k=0; k<nz; k++) {
2187:       idx = 2*vi[k];
2188:       x1  = x[idx];   x2 = x[1+idx];
2189:       s1 -= v[0]*x1 + v[2]*x2;
2190:       s2 -= v[1]*x1 + v[3]*x2;
2191:       v  += 4;
2192:     }
2193:     /* x = inv_diagonal*x */
2194:     x[idt]   = v[0]*s1 + v[2]*s2;
2195:     x[1+idt] = v[1]*s1 + v[3]*s2;
2196:   }

2198:   VecRestoreArrayRead(bb,&b);
2199:   VecRestoreArray(xx,&x);
2200:   PetscLogFlops(4.0*a->nz - A->cmap->n);
2201:   return(0);
2202: }

2204: /*
2205:       Special case where the matrix was ILU(0) factored in the natural
2206:    ordering. This eliminates the need for the column and row permutation.
2207: */
2208: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2209: {
2210:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2211:   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2212:   PetscErrorCode    ierr;
2213:   const MatScalar   *aa=a->a,*v;
2214:   PetscScalar       *x;
2215:   const PetscScalar *b;
2216:   PetscScalar       s1,x1;
2217:   PetscInt          jdx,idt,idx,nz,i;

2220:   VecGetArrayRead(bb,&b);
2221:   VecGetArray(xx,&x);

2223:   /* forward solve the lower triangular */
2224:   idx  = 0;
2225:   x[0] = b[0];
2226:   for (i=1; i<n; i++) {
2227:     v    =  aa      + ai[i];
2228:     vi   =  aj      + ai[i];
2229:     nz   =  diag[i] - ai[i];
2230:     idx +=  1;
2231:     s1   =  b[idx];
2232:     while (nz--) {
2233:       jdx = *vi++;
2234:       x1  = x[jdx];
2235:       s1 -= v[0]*x1;
2236:       v  += 1;
2237:     }
2238:     x[idx] = s1;
2239:   }
2240:   /* backward solve the upper triangular */
2241:   for (i=n-1; i>=0; i--) {
2242:     v   = aa + diag[i] + 1;
2243:     vi  = aj + diag[i] + 1;
2244:     nz  = ai[i+1] - diag[i] - 1;
2245:     idt = i;
2246:     s1  = x[idt];
2247:     while (nz--) {
2248:       idx = *vi++;
2249:       x1  = x[idx];
2250:       s1 -= v[0]*x1;
2251:       v  += 1;
2252:     }
2253:     v      = aa +  diag[i];
2254:     x[idt] = v[0]*s1;
2255:   }
2256:   VecRestoreArrayRead(bb,&b);
2257:   VecRestoreArray(xx,&x);
2258:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
2259:   return(0);
2260: }


2263: PetscErrorCode MatForwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2264: {
2265:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2266:   PetscErrorCode    ierr;
2267:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*vi;
2268:   PetscScalar       *x,sum;
2269:   const PetscScalar *b;
2270:   const MatScalar   *aa = a->a,*v;
2271:   PetscInt          i,nz;

2274:   if (!n) return(0);

2276:   VecGetArrayRead(bb,&b);
2277:   VecGetArray(xx,&x);

2279:   /* forward solve the lower triangular */
2280:   x[0] = b[0];
2281:   v    = aa;
2282:   vi   = aj;
2283:   for (i=1; i<n; i++) {
2284:     nz  = ai[i+1] - ai[i];
2285:     sum = b[i];
2286:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2287:     v   += nz;
2288:     vi  += nz;
2289:     x[i] = sum;
2290:   }
2291:   PetscLogFlops(a->nz - A->cmap->n);
2292:   VecRestoreArrayRead(bb,&b);
2293:   VecRestoreArray(xx,&x);
2294:   return(0);
2295: }

2297: PetscErrorCode MatBackwardSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2298: {
2299:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2300:   PetscErrorCode    ierr;
2301:   const PetscInt    n = a->mbs,*aj = a->j,*adiag = a->diag,*vi;
2302:   PetscScalar       *x,sum;
2303:   const PetscScalar *b;
2304:   const MatScalar   *aa = a->a,*v;
2305:   PetscInt          i,nz;

2308:   if (!n) return(0);

2310:   VecGetArrayRead(bb,&b);
2311:   VecGetArray(xx,&x);

2313:   /* backward solve the upper triangular */
2314:   for (i=n-1; i>=0; i--) {
2315:     v   = aa + adiag[i+1] + 1;
2316:     vi  = aj + adiag[i+1] + 1;
2317:     nz  = adiag[i] - adiag[i+1]-1;
2318:     sum = b[i];
2319:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2320:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2321:   }

2323:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2324:   VecRestoreArrayRead(bb,&b);
2325:   VecRestoreArray(xx,&x);
2326:   return(0);
2327: }

2329: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2330: {
2331:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2332:   PetscErrorCode    ierr;
2333:   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
2334:   PetscScalar       *x,sum;
2335:   const PetscScalar *b;
2336:   const MatScalar   *aa = a->a,*v;
2337:   PetscInt          i,nz;

2340:   if (!n) return(0);

2342:   VecGetArrayRead(bb,&b);
2343:   VecGetArray(xx,&x);

2345:   /* forward solve the lower triangular */
2346:   x[0] = b[0];
2347:   v    = aa;
2348:   vi   = aj;
2349:   for (i=1; i<n; i++) {
2350:     nz  = ai[i+1] - ai[i];
2351:     sum = b[i];
2352:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2353:     v   += nz;
2354:     vi  += nz;
2355:     x[i] = sum;
2356:   }

2358:   /* backward solve the upper triangular */
2359:   for (i=n-1; i>=0; i--) {
2360:     v   = aa + adiag[i+1] + 1;
2361:     vi  = aj + adiag[i+1] + 1;
2362:     nz  = adiag[i] - adiag[i+1]-1;
2363:     sum = x[i];
2364:     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
2365:     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
2366:   }

2368:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2369:   VecRestoreArrayRead(bb,&b);
2370:   VecRestoreArray(xx,&x);
2371:   return(0);
2372: }