Actual source code: baijfact13.c

  1: /*
  2:     Factorization code for BAIJ format.
  3: */
  4: #include <../src/mat/impls/baij/seq/baij.h>
  5: #include <petsc/private/kernels/blockinvert.h>

  7: /*
  8:       Version for when blocks are 3 by 3
  9: */
 10: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 13:   IS              isrow = b->row, isicol = b->icol;
 14:   const PetscInt *r, *ic;
 15:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
 16:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
 17:   const PetscInt *diag_offset;
 18:   PetscInt        idx, *pj;
 19:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
 20:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
 21:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
 22:   MatScalar      *ba = b->a, *aa = a->a;
 23:   PetscReal       shift = info->shiftamount;
 24:   PetscBool       allowzeropivot, zeropivotdetected;

 26:   PetscFunctionBegin;
 27:   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
 28:   A->factortype = MAT_FACTOR_NONE;
 29:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
 30:   A->factortype = MAT_FACTOR_ILU;
 31:   PetscCall(ISGetIndices(isrow, &r));
 32:   PetscCall(ISGetIndices(isicol, &ic));
 33:   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
 34:   allowzeropivot = PetscNot(A->erroriffailure);

 36:   for (i = 0; i < n; i++) {
 37:     nz    = bi[i + 1] - bi[i];
 38:     ajtmp = bj + bi[i];
 39:     for (j = 0; j < nz; j++) {
 40:       x    = rtmp + 9 * ajtmp[j];
 41:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
 42:     }
 43:     /* load in initial (unfactored row) */
 44:     idx      = r[i];
 45:     nz       = ai[idx + 1] - ai[idx];
 46:     ajtmpold = aj + ai[idx];
 47:     v        = aa + 9 * ai[idx];
 48:     for (j = 0; j < nz; j++) {
 49:       x    = rtmp + 9 * ic[ajtmpold[j]];
 50:       x[0] = v[0];
 51:       x[1] = v[1];
 52:       x[2] = v[2];
 53:       x[3] = v[3];
 54:       x[4] = v[4];
 55:       x[5] = v[5];
 56:       x[6] = v[6];
 57:       x[7] = v[7];
 58:       x[8] = v[8];
 59:       v += 9;
 60:     }
 61:     row = *ajtmp++;
 62:     while (row < i) {
 63:       pc = rtmp + 9 * row;
 64:       p1 = pc[0];
 65:       p2 = pc[1];
 66:       p3 = pc[2];
 67:       p4 = pc[3];
 68:       p5 = pc[4];
 69:       p6 = pc[5];
 70:       p7 = pc[6];
 71:       p8 = pc[7];
 72:       p9 = pc[8];
 73:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
 74:         pv    = ba + 9 * diag_offset[row];
 75:         pj    = bj + diag_offset[row] + 1;
 76:         x1    = pv[0];
 77:         x2    = pv[1];
 78:         x3    = pv[2];
 79:         x4    = pv[3];
 80:         x5    = pv[4];
 81:         x6    = pv[5];
 82:         x7    = pv[6];
 83:         x8    = pv[7];
 84:         x9    = pv[8];
 85:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
 86:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
 87:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

 89:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
 90:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
 91:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

 93:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
 94:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
 95:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
 96:         nz         = bi[row + 1] - diag_offset[row] - 1;
 97:         pv += 9;
 98:         for (j = 0; j < nz; j++) {
 99:           x1 = pv[0];
100:           x2 = pv[1];
101:           x3 = pv[2];
102:           x4 = pv[3];
103:           x5 = pv[4];
104:           x6 = pv[5];
105:           x7 = pv[6];
106:           x8 = pv[7];
107:           x9 = pv[8];
108:           x  = rtmp + 9 * pj[j];
109:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
110:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
111:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

113:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
114:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
115:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

117:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
118:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
119:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
120:           pv += 9;
121:         }
122:         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
123:       }
124:       row = *ajtmp++;
125:     }
126:     /* finished row so stick it into b->a */
127:     pv = ba + 9 * bi[i];
128:     pj = bj + bi[i];
129:     nz = bi[i + 1] - bi[i];
130:     for (j = 0; j < nz; j++) {
131:       x     = rtmp + 9 * pj[j];
132:       pv[0] = x[0];
133:       pv[1] = x[1];
134:       pv[2] = x[2];
135:       pv[3] = x[3];
136:       pv[4] = x[4];
137:       pv[5] = x[5];
138:       pv[6] = x[6];
139:       pv[7] = x[7];
140:       pv[8] = x[8];
141:       pv += 9;
142:     }
143:     /* invert diagonal block */
144:     w = ba + 9 * diag_offset[i];
145:     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
146:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
147:   }

149:   PetscCall(PetscFree(rtmp));
150:   PetscCall(ISRestoreIndices(isicol, &ic));
151:   PetscCall(ISRestoreIndices(isrow, &r));

153:   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
154:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
155:   C->assembled           = PETSC_TRUE;

157:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
158:   PetscFunctionReturn(PETSC_SUCCESS);
159: }

161: /* MatLUFactorNumeric_SeqBAIJ_3 -
162:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
163:        PetscKernel_A_gets_A_times_B()
164:        PetscKernel_A_gets_A_minus_B_times_C()
165:        PetscKernel_A_gets_inverse_A()
166: */
167: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
168: {
169:   Mat             C = B;
170:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
171:   IS              isrow = b->row, isicol = b->icol;
172:   const PetscInt *r, *ic;
173:   PetscInt        i, j, k, nz, nzL, row;
174:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
175:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
176:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
177:   PetscInt        flg;
178:   PetscReal       shift = info->shiftamount;
179:   PetscBool       allowzeropivot, zeropivotdetected;

181:   PetscFunctionBegin;
182:   PetscCall(ISGetIndices(isrow, &r));
183:   PetscCall(ISGetIndices(isicol, &ic));
184:   allowzeropivot = PetscNot(A->erroriffailure);

186:   /* generate work space needed by the factorization */
187:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
188:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

190:   for (i = 0; i < n; i++) {
191:     /* zero rtmp */
192:     /* L part */
193:     nz    = bi[i + 1] - bi[i];
194:     bjtmp = bj + bi[i];
195:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

197:     /* U part */
198:     nz    = bdiag[i] - bdiag[i + 1];
199:     bjtmp = bj + bdiag[i + 1] + 1;
200:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

202:     /* load in initial (unfactored row) */
203:     nz    = ai[r[i] + 1] - ai[r[i]];
204:     ajtmp = aj + ai[r[i]];
205:     v     = aa + bs2 * ai[r[i]];
206:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

208:     /* elimination */
209:     bjtmp = bj + bi[i];
210:     nzL   = bi[i + 1] - bi[i];
211:     for (k = 0; k < nzL; k++) {
212:       row = bjtmp[k];
213:       pc  = rtmp + bs2 * row;
214:       for (flg = 0, j = 0; j < bs2; j++) {
215:         if (pc[j] != 0.0) {
216:           flg = 1;
217:           break;
218:         }
219:       }
220:       if (flg) {
221:         pv = b->a + bs2 * bdiag[row];
222:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
223:         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));

225:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
226:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
227:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
228:         for (j = 0; j < nz; j++) {
229:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
230:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
231:           v = rtmp + bs2 * pj[j];
232:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
233:           pv += bs2;
234:         }
235:         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
236:       }
237:     }

239:     /* finished row so stick it into b->a */
240:     /* L part */
241:     pv = b->a + bs2 * bi[i];
242:     pj = b->j + bi[i];
243:     nz = bi[i + 1] - bi[i];
244:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

246:     /* Mark diagonal and invert diagonal for simpler triangular solves */
247:     pv = b->a + bs2 * bdiag[i];
248:     pj = b->j + bdiag[i];
249:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
250:     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
251:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

253:     /* U part */
254:     pj = b->j + bdiag[i + 1] + 1;
255:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
256:     nz = bdiag[i] - bdiag[i + 1] - 1;
257:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
258:   }

260:   PetscCall(PetscFree2(rtmp, mwork));
261:   PetscCall(ISRestoreIndices(isicol, &ic));
262:   PetscCall(ISRestoreIndices(isrow, &r));

264:   C->ops->solve          = MatSolve_SeqBAIJ_3;
265:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
266:   C->assembled           = PETSC_TRUE;

268:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
269:   PetscFunctionReturn(PETSC_SUCCESS);
270: }

272: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
273: {
274:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
275:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
276:   PetscInt       *ajtmpold, *ajtmp, nz, row;
277:   const PetscInt *diag_offset;
278:   PetscInt       *ai = a->i, *aj = a->j, *pj;
279:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
280:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
281:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
282:   MatScalar      *ba = b->a, *aa = a->a;
283:   PetscReal       shift = info->shiftamount;
284:   PetscBool       allowzeropivot, zeropivotdetected;

286:   PetscFunctionBegin;
287:   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
288:   A->factortype = MAT_FACTOR_NONE;
289:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
290:   A->factortype = MAT_FACTOR_ILU;
291:   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
292:   allowzeropivot = PetscNot(A->erroriffailure);

294:   for (i = 0; i < n; i++) {
295:     nz    = bi[i + 1] - bi[i];
296:     ajtmp = bj + bi[i];
297:     for (j = 0; j < nz; j++) {
298:       x    = rtmp + 9 * ajtmp[j];
299:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
300:     }
301:     /* load in initial (unfactored row) */
302:     nz       = ai[i + 1] - ai[i];
303:     ajtmpold = aj + ai[i];
304:     v        = aa + 9 * ai[i];
305:     for (j = 0; j < nz; j++) {
306:       x    = rtmp + 9 * ajtmpold[j];
307:       x[0] = v[0];
308:       x[1] = v[1];
309:       x[2] = v[2];
310:       x[3] = v[3];
311:       x[4] = v[4];
312:       x[5] = v[5];
313:       x[6] = v[6];
314:       x[7] = v[7];
315:       x[8] = v[8];
316:       v += 9;
317:     }
318:     row = *ajtmp++;
319:     while (row < i) {
320:       pc = rtmp + 9 * row;
321:       p1 = pc[0];
322:       p2 = pc[1];
323:       p3 = pc[2];
324:       p4 = pc[3];
325:       p5 = pc[4];
326:       p6 = pc[5];
327:       p7 = pc[6];
328:       p8 = pc[7];
329:       p9 = pc[8];
330:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
331:         pv    = ba + 9 * diag_offset[row];
332:         pj    = bj + diag_offset[row] + 1;
333:         x1    = pv[0];
334:         x2    = pv[1];
335:         x3    = pv[2];
336:         x4    = pv[3];
337:         x5    = pv[4];
338:         x6    = pv[5];
339:         x7    = pv[6];
340:         x8    = pv[7];
341:         x9    = pv[8];
342:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
343:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
344:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

346:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
347:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
348:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

350:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
351:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
352:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;

354:         nz = bi[row + 1] - diag_offset[row] - 1;
355:         pv += 9;
356:         for (j = 0; j < nz; j++) {
357:           x1 = pv[0];
358:           x2 = pv[1];
359:           x3 = pv[2];
360:           x4 = pv[3];
361:           x5 = pv[4];
362:           x6 = pv[5];
363:           x7 = pv[6];
364:           x8 = pv[7];
365:           x9 = pv[8];
366:           x  = rtmp + 9 * pj[j];
367:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
368:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
369:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

371:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
372:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
373:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

375:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
376:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
377:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
378:           pv += 9;
379:         }
380:         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
381:       }
382:       row = *ajtmp++;
383:     }
384:     /* finished row so stick it into b->a */
385:     pv = ba + 9 * bi[i];
386:     pj = bj + bi[i];
387:     nz = bi[i + 1] - bi[i];
388:     for (j = 0; j < nz; j++) {
389:       x     = rtmp + 9 * pj[j];
390:       pv[0] = x[0];
391:       pv[1] = x[1];
392:       pv[2] = x[2];
393:       pv[3] = x[3];
394:       pv[4] = x[4];
395:       pv[5] = x[5];
396:       pv[6] = x[6];
397:       pv[7] = x[7];
398:       pv[8] = x[8];
399:       pv += 9;
400:     }
401:     /* invert diagonal block */
402:     w = ba + 9 * diag_offset[i];
403:     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
404:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
405:   }

407:   PetscCall(PetscFree(rtmp));

409:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
410:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
411:   C->assembled           = PETSC_TRUE;

413:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
414:   PetscFunctionReturn(PETSC_SUCCESS);
415: }

417: /*
418:   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
419:     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
420: */
421: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
422: {
423:   Mat             C = B;
424:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
425:   PetscInt        i, j, k, nz, nzL, row;
426:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
427:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
428:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
429:   PetscInt        flg;
430:   PetscReal       shift = info->shiftamount;
431:   PetscBool       allowzeropivot, zeropivotdetected;

433:   PetscFunctionBegin;
434:   allowzeropivot = PetscNot(A->erroriffailure);

436:   /* generate work space needed by the factorization */
437:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
438:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

440:   for (i = 0; i < n; i++) {
441:     /* zero rtmp */
442:     /* L part */
443:     nz    = bi[i + 1] - bi[i];
444:     bjtmp = bj + bi[i];
445:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

447:     /* U part */
448:     nz    = bdiag[i] - bdiag[i + 1];
449:     bjtmp = bj + bdiag[i + 1] + 1;
450:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

452:     /* load in initial (unfactored row) */
453:     nz    = ai[i + 1] - ai[i];
454:     ajtmp = aj + ai[i];
455:     v     = aa + bs2 * ai[i];
456:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

458:     /* elimination */
459:     bjtmp = bj + bi[i];
460:     nzL   = bi[i + 1] - bi[i];
461:     for (k = 0; k < nzL; k++) {
462:       row = bjtmp[k];
463:       pc  = rtmp + bs2 * row;
464:       for (flg = 0, j = 0; j < bs2; j++) {
465:         if (pc[j] != 0.0) {
466:           flg = 1;
467:           break;
468:         }
469:       }
470:       if (flg) {
471:         pv = b->a + bs2 * bdiag[row];
472:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
473:         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));

475:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
476:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
477:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
478:         for (j = 0; j < nz; j++) {
479:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
480:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
481:           v = rtmp + bs2 * pj[j];
482:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
483:           pv += bs2;
484:         }
485:         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
486:       }
487:     }

489:     /* finished row so stick it into b->a */
490:     /* L part */
491:     pv = b->a + bs2 * bi[i];
492:     pj = b->j + bi[i];
493:     nz = bi[i + 1] - bi[i];
494:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

496:     /* Mark diagonal and invert diagonal for simpler triangular solves */
497:     pv = b->a + bs2 * bdiag[i];
498:     pj = b->j + bdiag[i];
499:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
500:     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
501:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

503:     /* U part */
504:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
505:     pj = b->j + bdiag[i + 1] + 1;
506:     nz = bdiag[i] - bdiag[i + 1] - 1;
507:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
508:   }
509:   PetscCall(PetscFree2(rtmp, mwork));

511:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
512:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
513:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
514:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
515:   C->assembled           = PETSC_TRUE;

517:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
518:   PetscFunctionReturn(PETSC_SUCCESS);
519: }