/* baijfact9.c */
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: /*
8: Version for when blocks are 5 by 5
9: */
10: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_5_inplace(Mat C, Mat A, const MatFactorInfo *info)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13: IS isrow = b->row, isicol = b->icol;
14: const PetscInt *r, *ic;
15: PetscInt *bi = b->i, *bj = b->j, *ajtmpold, *ajtmp;
16: PetscInt i, j, n = a->mbs, nz, row, idx, ipvt[5];
17: const PetscInt *diag_offset;
18: PetscInt *ai = a->i, *aj = a->j, *pj;
19: MatScalar *w, *pv, *rtmp, *x, *pc;
20: const MatScalar *v, *aa = a->a;
21: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
22: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
23: MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14;
24: MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12;
25: MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
26: MatScalar *ba = b->a, work[25];
27: PetscReal shift = info->shiftamount;
28: PetscBool allowzeropivot, zeropivotdetected;
30: PetscFunctionBegin;
31: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
32: A->factortype = MAT_FACTOR_NONE;
33: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
34: A->factortype = MAT_FACTOR_ILU;
35: allowzeropivot = PetscNot(A->erroriffailure);
36: PetscCall(ISGetIndices(isrow, &r));
37: PetscCall(ISGetIndices(isicol, &ic));
38: PetscCall(PetscMalloc1(25 * (n + 1), &rtmp));
40: #define PETSC_USE_MEMZERO 1
41: #define PETSC_USE_MEMCPY 1
43: for (i = 0; i < n; i++) {
44: nz = bi[i + 1] - bi[i];
45: ajtmp = bj + bi[i];
46: for (j = 0; j < nz; j++) {
47: #if defined(PETSC_USE_MEMZERO)
48: PetscCall(PetscArrayzero(rtmp + 25 * ajtmp[j], 25));
49: #else
50: x = rtmp + 25 * ajtmp[j];
51: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
52: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
53: x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
54: #endif
55: }
56: /* load in initial (unfactored row) */
57: idx = r[i];
58: nz = ai[idx + 1] - ai[idx];
59: ajtmpold = aj + ai[idx];
60: v = aa + 25 * ai[idx];
61: for (j = 0; j < nz; j++) {
62: #if defined(PETSC_USE_MEMCPY)
63: PetscCall(PetscArraycpy(rtmp + 25 * ic[ajtmpold[j]], v, 25));
64: #else
65: x = rtmp + 25 * ic[ajtmpold[j]];
66: x[0] = v[0];
67: x[1] = v[1];
68: x[2] = v[2];
69: x[3] = v[3];
70: x[4] = v[4];
71: x[5] = v[5];
72: x[6] = v[6];
73: x[7] = v[7];
74: x[8] = v[8];
75: x[9] = v[9];
76: x[10] = v[10];
77: x[11] = v[11];
78: x[12] = v[12];
79: x[13] = v[13];
80: x[14] = v[14];
81: x[15] = v[15];
82: x[16] = v[16];
83: x[17] = v[17];
84: x[18] = v[18];
85: x[19] = v[19];
86: x[20] = v[20];
87: x[21] = v[21];
88: x[22] = v[22];
89: x[23] = v[23];
90: x[24] = v[24];
91: #endif
92: v += 25;
93: }
94: row = *ajtmp++;
95: while (row < i) {
96: pc = rtmp + 25 * row;
97: p1 = pc[0];
98: p2 = pc[1];
99: p3 = pc[2];
100: p4 = pc[3];
101: p5 = pc[4];
102: p6 = pc[5];
103: p7 = pc[6];
104: p8 = pc[7];
105: p9 = pc[8];
106: p10 = pc[9];
107: p11 = pc[10];
108: p12 = pc[11];
109: p13 = pc[12];
110: p14 = pc[13];
111: p15 = pc[14];
112: p16 = pc[15];
113: p17 = pc[16];
114: p18 = pc[17];
115: p19 = pc[18];
116: p20 = pc[19];
117: p21 = pc[20];
118: p22 = pc[21];
119: p23 = pc[22];
120: p24 = pc[23];
121: p25 = pc[24];
122: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
123: pv = ba + 25 * diag_offset[row];
124: pj = bj + diag_offset[row] + 1;
125: x1 = pv[0];
126: x2 = pv[1];
127: x3 = pv[2];
128: x4 = pv[3];
129: x5 = pv[4];
130: x6 = pv[5];
131: x7 = pv[6];
132: x8 = pv[7];
133: x9 = pv[8];
134: x10 = pv[9];
135: x11 = pv[10];
136: x12 = pv[11];
137: x13 = pv[12];
138: x14 = pv[13];
139: x15 = pv[14];
140: x16 = pv[15];
141: x17 = pv[16];
142: x18 = pv[17];
143: x19 = pv[18];
144: x20 = pv[19];
145: x21 = pv[20];
146: x22 = pv[21];
147: x23 = pv[22];
148: x24 = pv[23];
149: x25 = pv[24];
150: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
151: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
152: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
153: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
154: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
156: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
157: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
158: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
159: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
160: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
162: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
163: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
164: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
165: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
166: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
168: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
169: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
170: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
171: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
172: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
174: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
175: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
176: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
177: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
178: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
180: nz = bi[row + 1] - diag_offset[row] - 1;
181: pv += 25;
182: for (j = 0; j < nz; j++) {
183: x1 = pv[0];
184: x2 = pv[1];
185: x3 = pv[2];
186: x4 = pv[3];
187: x5 = pv[4];
188: x6 = pv[5];
189: x7 = pv[6];
190: x8 = pv[7];
191: x9 = pv[8];
192: x10 = pv[9];
193: x11 = pv[10];
194: x12 = pv[11];
195: x13 = pv[12];
196: x14 = pv[13];
197: x15 = pv[14];
198: x16 = pv[15];
199: x17 = pv[16];
200: x18 = pv[17];
201: x19 = pv[18];
202: x20 = pv[19];
203: x21 = pv[20];
204: x22 = pv[21];
205: x23 = pv[22];
206: x24 = pv[23];
207: x25 = pv[24];
208: x = rtmp + 25 * pj[j];
209: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
210: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
211: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
212: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
213: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
215: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
216: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
217: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
218: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
219: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
221: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
222: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
223: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
224: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
225: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
227: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
228: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
229: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
230: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
231: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
233: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
234: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
235: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
236: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
237: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
239: pv += 25;
240: }
241: PetscCall(PetscLogFlops(250.0 * nz + 225.0));
242: }
243: row = *ajtmp++;
244: }
245: /* finished row so stick it into b->a */
246: pv = ba + 25 * bi[i];
247: pj = bj + bi[i];
248: nz = bi[i + 1] - bi[i];
249: for (j = 0; j < nz; j++) {
250: #if defined(PETSC_USE_MEMCPY)
251: PetscCall(PetscArraycpy(pv, rtmp + 25 * pj[j], 25));
252: #else
253: x = rtmp + 25 * pj[j];
254: pv[0] = x[0];
255: pv[1] = x[1];
256: pv[2] = x[2];
257: pv[3] = x[3];
258: pv[4] = x[4];
259: pv[5] = x[5];
260: pv[6] = x[6];
261: pv[7] = x[7];
262: pv[8] = x[8];
263: pv[9] = x[9];
264: pv[10] = x[10];
265: pv[11] = x[11];
266: pv[12] = x[12];
267: pv[13] = x[13];
268: pv[14] = x[14];
269: pv[15] = x[15];
270: pv[16] = x[16];
271: pv[17] = x[17];
272: pv[18] = x[18];
273: pv[19] = x[19];
274: pv[20] = x[20];
275: pv[21] = x[21];
276: pv[22] = x[22];
277: pv[23] = x[23];
278: pv[24] = x[24];
279: #endif
280: pv += 25;
281: }
282: /* invert diagonal block */
283: w = ba + 25 * diag_offset[i];
284: PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
285: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
286: }
288: PetscCall(PetscFree(rtmp));
289: PetscCall(ISRestoreIndices(isicol, &ic));
290: PetscCall(ISRestoreIndices(isrow, &r));
292: C->ops->solve = MatSolve_SeqBAIJ_5_inplace;
293: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_inplace;
294: C->assembled = PETSC_TRUE;
296: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */
297: PetscFunctionReturn(PETSC_SUCCESS);
298: }
300: /* MatLUFactorNumeric_SeqBAIJ_5 -
301: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
302: PetscKernel_A_gets_A_times_B()
303: PetscKernel_A_gets_A_minus_B_times_C()
304: PetscKernel_A_gets_inverse_A()
305: */
307: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5(Mat B, Mat A, const MatFactorInfo *info)
308: {
309: Mat C = B;
310: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
311: IS isrow = b->row, isicol = b->icol;
312: const PetscInt *r, *ic;
313: PetscInt i, j, k, nz, nzL, row;
314: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
315: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
316: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a, work[25];
317: PetscInt flg, ipvt[5];
318: PetscReal shift = info->shiftamount;
319: PetscBool allowzeropivot, zeropivotdetected;
321: PetscFunctionBegin;
322: allowzeropivot = PetscNot(A->erroriffailure);
323: PetscCall(ISGetIndices(isrow, &r));
324: PetscCall(ISGetIndices(isicol, &ic));
326: /* generate work space needed by the factorization */
327: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
328: PetscCall(PetscArrayzero(rtmp, bs2 * n));
330: for (i = 0; i < n; i++) {
331: /* zero rtmp */
332: /* L part */
333: nz = bi[i + 1] - bi[i];
334: bjtmp = bj + bi[i];
335: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
337: /* U part */
338: nz = bdiag[i] - bdiag[i + 1];
339: bjtmp = bj + bdiag[i + 1] + 1;
340: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
342: /* load in initial (unfactored row) */
343: nz = ai[r[i] + 1] - ai[r[i]];
344: ajtmp = aj + ai[r[i]];
345: v = aa + bs2 * ai[r[i]];
346: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
348: /* elimination */
349: bjtmp = bj + bi[i];
350: nzL = bi[i + 1] - bi[i];
351: for (k = 0; k < nzL; k++) {
352: row = bjtmp[k];
353: pc = rtmp + bs2 * row;
354: for (flg = 0, j = 0; j < bs2; j++) {
355: if (pc[j] != 0.0) {
356: flg = 1;
357: break;
358: }
359: }
360: if (flg) {
361: pv = b->a + bs2 * bdiag[row];
362: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
363: PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork));
365: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
366: pv = b->a + bs2 * (bdiag[row + 1] + 1);
367: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
368: for (j = 0; j < nz; j++) {
369: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
370: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
371: v = rtmp + bs2 * pj[j];
372: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(v, pc, pv));
373: pv += bs2;
374: }
375: PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
376: }
377: }
379: /* finished row so stick it into b->a */
380: /* L part */
381: pv = b->a + bs2 * bi[i];
382: pj = b->j + bi[i];
383: nz = bi[i + 1] - bi[i];
384: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
386: /* Mark diagonal and invert diagonal for simpler triangular solves */
387: pv = b->a + bs2 * bdiag[i];
388: pj = b->j + bdiag[i];
389: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
390: PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
391: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
393: /* U part */
394: pv = b->a + bs2 * (bdiag[i + 1] + 1);
395: pj = b->j + bdiag[i + 1] + 1;
396: nz = bdiag[i] - bdiag[i + 1] - 1;
397: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
398: }
400: PetscCall(PetscFree2(rtmp, mwork));
401: PetscCall(ISRestoreIndices(isicol, &ic));
402: PetscCall(ISRestoreIndices(isrow, &r));
404: C->ops->solve = MatSolve_SeqBAIJ_5;
405: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
406: C->assembled = PETSC_TRUE;
408: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */
409: PetscFunctionReturn(PETSC_SUCCESS);
410: }
412: /*
413: Version for when blocks are 5 by 5 Using natural ordering
414: */
/*
  MatILUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace - same algorithm as
  MatILUFactorNumeric_SeqBAIJ_5_inplace but specialized for the natural
  ordering: no row permutation (r) or column map (ic) indirection is needed,
  and the diagonal markers come straight from b->diag.
*/
PetscErrorCode MatILUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j, ipvt[5];
  PetscInt    *ajtmpold, *ajtmp, nz, row;
  PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar    x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  MatScalar    x16, x17, x18, x19, x20, x21, x22, x23, x24, x25;
  MatScalar    p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15;
  MatScalar    p16, p17, p18, p19, p20, p21, p22, p23, p24, p25;
  MatScalar    m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
  MatScalar    m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
  MatScalar   *ba = b->a, *aa = a->a, work[25];
  PetscReal    shift = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  /* rtmp is a dense-in-columns working copy of the current block row (25 entries per block) */
  PetscCall(PetscMalloc1(25 * (n + 1), &rtmp));
  for (i = 0; i < n; i++) {
    /* zero the work-row entries at the sparsity pattern of factor row i */
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 25 * ajtmp[j];
      x[0]  = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
      x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
    }
    /* load in initial (unfactored row); natural ordering, so no permutation */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 25 * ai[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 25 * ajtmpold[j];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      x[16] = v[16];
      x[17] = v[17];
      x[18] = v[18];
      x[19] = v[19];
      x[20] = v[20];
      x[21] = v[21];
      x[22] = v[22];
      x[23] = v[23];
      x[24] = v[24];
      v += 25;
    }
    /* eliminate against each previously factored row < i in the pattern;
       terminates because the diagonal entry (== i) is present in the factor row */
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 25 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      p17 = pc[16];
      p18 = pc[17];
      p19 = pc[18];
      p20 = pc[19];
      p21 = pc[20];
      p22 = pc[21];
      p23 = pc[22];
      p24 = pc[23];
      p25 = pc[24];
      /* skip the whole update when the multiplier block is identically zero */
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
        /* pv points at the (already inverted) diagonal block of the pivot row */
        pv  = ba + 25 * diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];
        x2  = pv[1];
        x3  = pv[2];
        x4  = pv[3];
        x5  = pv[4];
        x6  = pv[5];
        x7  = pv[6];
        x8  = pv[7];
        x9  = pv[8];
        x10 = pv[9];
        x11 = pv[10];
        x12 = pv[11];
        x13 = pv[12];
        x14 = pv[13];
        x15 = pv[14];
        x16 = pv[15];
        x17 = pv[16];
        x18 = pv[17];
        x19 = pv[18];
        x20 = pv[19];
        x21 = pv[20];
        x22 = pv[21];
        x23 = pv[22];
        x24 = pv[23];
        x25 = pv[24];
        /* multiplier block m = p * x: unrolled 5x5 block product */
        pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
        pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
        pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
        pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
        pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;

        pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
        pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
        pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
        pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
        pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;

        pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
        pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
        pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
        pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
        pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;

        pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
        pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
        pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
        pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
        pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;

        pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
        pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
        pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
        pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
        pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;

        /* update the remainder of row i with the U part of the pivot row */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 25;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x17 = pv[16];
          x18 = pv[17];
          x19 = pv[18];
          x20 = pv[19];
          x21 = pv[20];
          x22 = pv[21];
          x23 = pv[22];
          x24 = pv[23];
          x25 = pv[24];
          x   = rtmp + 25 * pj[j];
          /* x -= m * pv[j]: unrolled 5x5 block multiply-subtract */
          x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
          x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
          x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
          x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
          x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;

          x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
          x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
          x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
          x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
          x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;

          x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
          x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
          x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
          x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
          x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;

          x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
          x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
          x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
          x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
          x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;

          x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
          x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
          x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
          x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
          x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
          pv += 25;
        }
        PetscCall(PetscLogFlops(250.0 * nz + 225.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 25 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 25 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv[16] = x[16];
      pv[17] = x[17];
      pv[18] = x[18];
      pv[19] = x[19];
      pv[20] = x[20];
      pv[21] = x[21];
      pv[22] = x[22];
      pv[23] = x[23];
      pv[24] = x[24];
      pv += 25;
    }
    /* invert diagonal block (stored inverted so the solves avoid divisions) */
    w = ba + 25 * diag_offset[i];
    PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_5_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}
676: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
677: {
678: Mat C = B;
679: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
680: PetscInt i, j, k, nz, nzL, row;
681: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
682: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
683: MatScalar *rtmp, *pc, *mwork, *v, *vv, *pv, *aa = a->a, work[25];
684: PetscInt flg, ipvt[5];
685: PetscReal shift = info->shiftamount;
686: PetscBool allowzeropivot, zeropivotdetected;
688: PetscFunctionBegin;
689: allowzeropivot = PetscNot(A->erroriffailure);
691: /* generate work space needed by the factorization */
692: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
693: PetscCall(PetscArrayzero(rtmp, bs2 * n));
695: for (i = 0; i < n; i++) {
696: /* zero rtmp */
697: /* L part */
698: nz = bi[i + 1] - bi[i];
699: bjtmp = bj + bi[i];
700: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
702: /* U part */
703: nz = bdiag[i] - bdiag[i + 1];
704: bjtmp = bj + bdiag[i + 1] + 1;
705: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
707: /* load in initial (unfactored row) */
708: nz = ai[i + 1] - ai[i];
709: ajtmp = aj + ai[i];
710: v = aa + bs2 * ai[i];
711: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
713: /* elimination */
714: bjtmp = bj + bi[i];
715: nzL = bi[i + 1] - bi[i];
716: for (k = 0; k < nzL; k++) {
717: row = bjtmp[k];
718: pc = rtmp + bs2 * row;
719: for (flg = 0, j = 0; j < bs2; j++) {
720: if (pc[j] != 0.0) {
721: flg = 1;
722: break;
723: }
724: }
725: if (flg) {
726: pv = b->a + bs2 * bdiag[row];
727: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
728: PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork));
730: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
731: pv = b->a + bs2 * (bdiag[row + 1] + 1);
732: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
733: for (j = 0; j < nz; j++) {
734: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
735: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
736: vv = rtmp + bs2 * pj[j];
737: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(vv, pc, pv));
738: pv += bs2;
739: }
740: PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
741: }
742: }
744: /* finished row so stick it into b->a */
745: /* L part */
746: pv = b->a + bs2 * bi[i];
747: pj = b->j + bi[i];
748: nz = bi[i + 1] - bi[i];
749: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
751: /* Mark diagonal and invert diagonal for simpler triangular solves */
752: pv = b->a + bs2 * bdiag[i];
753: pj = b->j + bdiag[i];
754: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
755: PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
756: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
758: /* U part */
759: pv = b->a + bs2 * (bdiag[i + 1] + 1);
760: pj = b->j + bdiag[i + 1] + 1;
761: nz = bdiag[i] - bdiag[i + 1] - 1;
762: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
763: }
764: PetscCall(PetscFree2(rtmp, mwork));
766: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
767: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
768: C->assembled = PETSC_TRUE;
770: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */
771: PetscFunctionReturn(PETSC_SUCCESS);
772: }
774: /*
775: Version for when blocks are 9 by 9
776: */
777: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
778: #include <immintrin.h>
779: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_9_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
780: {
781: Mat C = B;
782: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
783: PetscInt i, j, k, nz, nzL, row;
784: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
785: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
786: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
787: PetscInt flg;
788: PetscReal shift = info->shiftamount;
789: PetscBool allowzeropivot, zeropivotdetected;
791: PetscFunctionBegin;
792: allowzeropivot = PetscNot(A->erroriffailure);
794: /* generate work space needed by the factorization */
795: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
796: PetscCall(PetscArrayzero(rtmp, bs2 * n));
798: for (i = 0; i < n; i++) {
799: /* zero rtmp */
800: /* L part */
801: nz = bi[i + 1] - bi[i];
802: bjtmp = bj + bi[i];
803: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
805: /* U part */
806: nz = bdiag[i] - bdiag[i + 1];
807: bjtmp = bj + bdiag[i + 1] + 1;
808: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
810: /* load in initial (unfactored row) */
811: nz = ai[i + 1] - ai[i];
812: ajtmp = aj + ai[i];
813: v = aa + bs2 * ai[i];
814: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
816: /* elimination */
817: bjtmp = bj + bi[i];
818: nzL = bi[i + 1] - bi[i];
819: for (k = 0; k < nzL; k++) {
820: row = bjtmp[k];
821: pc = rtmp + bs2 * row;
822: for (flg = 0, j = 0; j < bs2; j++) {
823: if (pc[j] != 0.0) {
824: flg = 1;
825: break;
826: }
827: }
828: if (flg) {
829: pv = b->a + bs2 * bdiag[row];
830: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
831: PetscCall(PetscKernel_A_gets_A_times_B_9(pc, pv, mwork));
833: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
834: pv = b->a + bs2 * (bdiag[row + 1] + 1);
835: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
836: for (j = 0; j < nz; j++) {
837: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
838: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
839: v = rtmp + bs2 * pj[j];
840: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_9(v, pc, pv + 81 * j));
841: /* pv incremented in PetscKernel_A_gets_A_minus_B_times_C_9 */
842: }
843: PetscCall(PetscLogFlops(1458 * nz + 1377)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
844: }
845: }
847: /* finished row so stick it into b->a */
848: /* L part */
849: pv = b->a + bs2 * bi[i];
850: pj = b->j + bi[i];
851: nz = bi[i + 1] - bi[i];
852: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
854: /* Mark diagonal and invert diagonal for simpler triangular solves */
855: pv = b->a + bs2 * bdiag[i];
856: pj = b->j + bdiag[i];
857: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
858: PetscCall(PetscKernel_A_gets_inverse_A_9(pv, shift, allowzeropivot, &zeropivotdetected));
859: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
861: /* U part */
862: pv = b->a + bs2 * (bdiag[i + 1] + 1);
863: pj = b->j + bdiag[i + 1] + 1;
864: nz = bdiag[i] - bdiag[i + 1] - 1;
865: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
866: }
867: PetscCall(PetscFree2(rtmp, mwork));
869: C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering;
870: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
871: C->assembled = PETSC_TRUE;
873: PetscCall(PetscLogFlops(1.333333333333 * 9 * 9 * 9 * n)); /* from inverting diagonal blocks */
874: PetscFunctionReturn(PETSC_SUCCESS);
875: }
/*
  MatSolve_SeqBAIJ_9_NaturalOrdering - Triangular solve x = U^{-1} L^{-1} b for a
  SeqBAIJ matrix with 9x9 blocks that was factored in natural (identity) ordering.

  AVX2-vectorized: each 9-entry block vector is processed as two full __m256d
  registers (lanes 0-3 and 4-7) plus a third register whose only meaningful lane
  is entry 8; that last lane is handled with _mm256_maskload_pd/_mm256_maskstore_pd
  so no out-of-bounds memory is touched.  The indexing below (column j of a block
  at v[9*j .. 9*j+8]) shows the 9x9 blocks are stored column-major.

  Input Parameters:
    A  - the factored matrix (L and U stored in a->a; a->diag marks diagonals)
    bb - right-hand side vector

  Output Parameter:
    xx - solution vector

  Workspace: a->solve_work holds t (forward-solve result) followed by ls
  (a bs-sized scratch block for the back solve).
*/
PetscErrorCode MatSolve_SeqBAIJ_9_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
  PetscInt           i, k, n = a->mbs;
  PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, *s, *t, *ls;
  const PetscScalar *b;
  __m256d            a0, a1, a2, a3, a4, a5, w0, w1, w2, w3, s0, s1, s2, v0, v1, v2, v3;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  /* forward solve the lower triangular */
  PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */

  for (i = 1; i < n; i++) {
    v  = aa + bs2 * ai[i];   /* first L block of block-row i */
    vi = aj + ai[i];         /* its block-column indices */
    nz = ai[i + 1] - ai[i];  /* number of L blocks in this row */
    s  = t + bs * i;
    PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */

    /* NOTE(review): these declarations shadow the function-scope s0/s1/s2; the
       shadowed copies are the ones used throughout this forward-solve loop. */
    __m256d s0, s1, s2;
    s0 = _mm256_loadu_pd(s + 0);
    s1 = _mm256_loadu_pd(s + 4);
    /* mask enables only lane 0, i.e. entry 8 of the block vector.
       NOTE(review): 1LL << 63 formally overflows a signed shift (UB in ISO C);
       compilers in practice produce the intended sign-bit mask 0x8000000000000000. */
    s2 = _mm256_maskload_pd(s + 8, _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63));

    /* s -= L_block(i,vi[k]) * t[vi[k]] accumulated over all L blocks of row i.
       Each column j contributes v[9j..9j+8] scaled by component j of t[vi[k]];
       the w registers are recycled (w0..w3 then w0..) to keep all 9 broadcasts
       in flight.  Loads such as &v[8] straddle two columns: lanes 1-3 of the s2
       accumulator pick up don't-care values that the masked store discards. */
    for (k = 0; k < nz; k++) {
      /* column 0 */
      w0 = _mm256_set1_pd((t + bs * vi[k])[0]);
      a0 = _mm256_loadu_pd(&v[0]);
      s0 = _mm256_fnmadd_pd(a0, w0, s0);
      a1 = _mm256_loadu_pd(&v[4]);
      s1 = _mm256_fnmadd_pd(a1, w0, s1);
      a2 = _mm256_loadu_pd(&v[8]);
      s2 = _mm256_fnmadd_pd(a2, w0, s2);

      /* column 1 */
      w1 = _mm256_set1_pd((t + bs * vi[k])[1]);
      a3 = _mm256_loadu_pd(&v[9]);
      s0 = _mm256_fnmadd_pd(a3, w1, s0);
      a4 = _mm256_loadu_pd(&v[13]);
      s1 = _mm256_fnmadd_pd(a4, w1, s1);
      a5 = _mm256_loadu_pd(&v[17]);
      s2 = _mm256_fnmadd_pd(a5, w1, s2);

      /* column 2 */
      w2 = _mm256_set1_pd((t + bs * vi[k])[2]);
      a0 = _mm256_loadu_pd(&v[18]);
      s0 = _mm256_fnmadd_pd(a0, w2, s0);
      a1 = _mm256_loadu_pd(&v[22]);
      s1 = _mm256_fnmadd_pd(a1, w2, s1);
      a2 = _mm256_loadu_pd(&v[26]);
      s2 = _mm256_fnmadd_pd(a2, w2, s2);

      /* column 3 */
      w3 = _mm256_set1_pd((t + bs * vi[k])[3]);
      a3 = _mm256_loadu_pd(&v[27]);
      s0 = _mm256_fnmadd_pd(a3, w3, s0);
      a4 = _mm256_loadu_pd(&v[31]);
      s1 = _mm256_fnmadd_pd(a4, w3, s1);
      a5 = _mm256_loadu_pd(&v[35]);
      s2 = _mm256_fnmadd_pd(a5, w3, s2);

      /* column 4 */
      w0 = _mm256_set1_pd((t + bs * vi[k])[4]);
      a0 = _mm256_loadu_pd(&v[36]);
      s0 = _mm256_fnmadd_pd(a0, w0, s0);
      a1 = _mm256_loadu_pd(&v[40]);
      s1 = _mm256_fnmadd_pd(a1, w0, s1);
      a2 = _mm256_loadu_pd(&v[44]);
      s2 = _mm256_fnmadd_pd(a2, w0, s2);

      /* column 5 */
      w1 = _mm256_set1_pd((t + bs * vi[k])[5]);
      a3 = _mm256_loadu_pd(&v[45]);
      s0 = _mm256_fnmadd_pd(a3, w1, s0);
      a4 = _mm256_loadu_pd(&v[49]);
      s1 = _mm256_fnmadd_pd(a4, w1, s1);
      a5 = _mm256_loadu_pd(&v[53]);
      s2 = _mm256_fnmadd_pd(a5, w1, s2);

      /* column 6 */
      w2 = _mm256_set1_pd((t + bs * vi[k])[6]);
      a0 = _mm256_loadu_pd(&v[54]);
      s0 = _mm256_fnmadd_pd(a0, w2, s0);
      a1 = _mm256_loadu_pd(&v[58]);
      s1 = _mm256_fnmadd_pd(a1, w2, s1);
      a2 = _mm256_loadu_pd(&v[62]);
      s2 = _mm256_fnmadd_pd(a2, w2, s2);

      /* column 7 */
      w3 = _mm256_set1_pd((t + bs * vi[k])[7]);
      a3 = _mm256_loadu_pd(&v[63]);
      s0 = _mm256_fnmadd_pd(a3, w3, s0);
      a4 = _mm256_loadu_pd(&v[67]);
      s1 = _mm256_fnmadd_pd(a4, w3, s1);
      a5 = _mm256_loadu_pd(&v[71]);
      s2 = _mm256_fnmadd_pd(a5, w3, s2);

      /* column 8: last element loaded with a mask so the read stays inside the
         81-entry block */
      w0 = _mm256_set1_pd((t + bs * vi[k])[8]);
      a0 = _mm256_loadu_pd(&v[72]);
      s0 = _mm256_fnmadd_pd(a0, w0, s0);
      a1 = _mm256_loadu_pd(&v[76]);
      s1 = _mm256_fnmadd_pd(a1, w0, s1);
      a2 = _mm256_maskload_pd(v + 80, _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63));
      s2 = _mm256_fnmadd_pd(a2, w0, s2);
      v += bs2; /* advance to the next 9x9 block */
    }
    _mm256_storeu_pd(&s[0], s0);
    _mm256_storeu_pd(&s[4], s1);
    _mm256_maskstore_pd(&s[8], _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63), s2);
  }

  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n; /* scratch block after t in solve_work */
  for (i = n - 1; i >= 0; i--) {
    /* U blocks of row i sit just past the (i+1)-th diagonal marker */
    v  = aa + bs2 * (adiag[i + 1] + 1);
    vi = aj + adiag[i + 1] + 1;
    nz = adiag[i] - adiag[i + 1] - 1;
    PetscCall(PetscArraycpy(ls, t + i * bs, bs));

    s0 = _mm256_loadu_pd(ls + 0);
    s1 = _mm256_loadu_pd(ls + 4);
    s2 = _mm256_maskload_pd(ls + 8, _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63));

    /* ls -= U_block(i,vi[k]) * t[vi[k]]; same column-by-column schedule as the
       forward solve */
    for (k = 0; k < nz; k++) {
      w0 = _mm256_set1_pd((t + bs * vi[k])[0]);
      a0 = _mm256_loadu_pd(&v[0]);
      s0 = _mm256_fnmadd_pd(a0, w0, s0);
      a1 = _mm256_loadu_pd(&v[4]);
      s1 = _mm256_fnmadd_pd(a1, w0, s1);
      a2 = _mm256_loadu_pd(&v[8]);
      s2 = _mm256_fnmadd_pd(a2, w0, s2);

      /* v += 9; */
      w1 = _mm256_set1_pd((t + bs * vi[k])[1]);
      a3 = _mm256_loadu_pd(&v[9]);
      s0 = _mm256_fnmadd_pd(a3, w1, s0);
      a4 = _mm256_loadu_pd(&v[13]);
      s1 = _mm256_fnmadd_pd(a4, w1, s1);
      a5 = _mm256_loadu_pd(&v[17]);
      s2 = _mm256_fnmadd_pd(a5, w1, s2);

      /* v += 9; */
      w2 = _mm256_set1_pd((t + bs * vi[k])[2]);
      a0 = _mm256_loadu_pd(&v[18]);
      s0 = _mm256_fnmadd_pd(a0, w2, s0);
      a1 = _mm256_loadu_pd(&v[22]);
      s1 = _mm256_fnmadd_pd(a1, w2, s1);
      a2 = _mm256_loadu_pd(&v[26]);
      s2 = _mm256_fnmadd_pd(a2, w2, s2);

      /* v += 9; */
      w3 = _mm256_set1_pd((t + bs * vi[k])[3]);
      a3 = _mm256_loadu_pd(&v[27]);
      s0 = _mm256_fnmadd_pd(a3, w3, s0);
      a4 = _mm256_loadu_pd(&v[31]);
      s1 = _mm256_fnmadd_pd(a4, w3, s1);
      a5 = _mm256_loadu_pd(&v[35]);
      s2 = _mm256_fnmadd_pd(a5, w3, s2);

      /* v += 9; */
      w0 = _mm256_set1_pd((t + bs * vi[k])[4]);
      a0 = _mm256_loadu_pd(&v[36]);
      s0 = _mm256_fnmadd_pd(a0, w0, s0);
      a1 = _mm256_loadu_pd(&v[40]);
      s1 = _mm256_fnmadd_pd(a1, w0, s1);
      a2 = _mm256_loadu_pd(&v[44]);
      s2 = _mm256_fnmadd_pd(a2, w0, s2);

      /* v += 9; */
      w1 = _mm256_set1_pd((t + bs * vi[k])[5]);
      a3 = _mm256_loadu_pd(&v[45]);
      s0 = _mm256_fnmadd_pd(a3, w1, s0);
      a4 = _mm256_loadu_pd(&v[49]);
      s1 = _mm256_fnmadd_pd(a4, w1, s1);
      a5 = _mm256_loadu_pd(&v[53]);
      s2 = _mm256_fnmadd_pd(a5, w1, s2);

      /* v += 9; */
      w2 = _mm256_set1_pd((t + bs * vi[k])[6]);
      a0 = _mm256_loadu_pd(&v[54]);
      s0 = _mm256_fnmadd_pd(a0, w2, s0);
      a1 = _mm256_loadu_pd(&v[58]);
      s1 = _mm256_fnmadd_pd(a1, w2, s1);
      a2 = _mm256_loadu_pd(&v[62]);
      s2 = _mm256_fnmadd_pd(a2, w2, s2);

      /* v += 9; */
      w3 = _mm256_set1_pd((t + bs * vi[k])[7]);
      a3 = _mm256_loadu_pd(&v[63]);
      s0 = _mm256_fnmadd_pd(a3, w3, s0);
      a4 = _mm256_loadu_pd(&v[67]);
      s1 = _mm256_fnmadd_pd(a4, w3, s1);
      a5 = _mm256_loadu_pd(&v[71]);
      s2 = _mm256_fnmadd_pd(a5, w3, s2);

      /* v += 9; */
      w0 = _mm256_set1_pd((t + bs * vi[k])[8]);
      a0 = _mm256_loadu_pd(&v[72]);
      s0 = _mm256_fnmadd_pd(a0, w0, s0);
      a1 = _mm256_loadu_pd(&v[76]);
      s1 = _mm256_fnmadd_pd(a1, w0, s1);
      a2 = _mm256_maskload_pd(v + 80, _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63));
      s2 = _mm256_fnmadd_pd(a2, w0, s2);
      v += bs2;
    }

    _mm256_storeu_pd(&ls[0], s0);
    _mm256_storeu_pd(&ls[4], s1);
    _mm256_maskstore_pd(&ls[8], _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63), s2);

    /* t[i] = D^{-1} * ls : dense 9x9 times 9-vector.  The factorization stores
       the diagonal block already inverted (see the PetscKernel_A_gets_inverse_A_*
       call in the numeric factorization), so this is a multiply, not a solve. */
    w0 = _mm256_setzero_pd();
    w1 = _mm256_setzero_pd();
    w2 = _mm256_setzero_pd();

    /* first row */
    v0 = _mm256_set1_pd(ls[0]);
    a0 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[0]);
    w0 = _mm256_fmadd_pd(a0, v0, w0);
    a1 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[4]);
    w1 = _mm256_fmadd_pd(a1, v0, w1);
    a2 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[8]);
    w2 = _mm256_fmadd_pd(a2, v0, w2);

    /* second row */
    v1 = _mm256_set1_pd(ls[1]);
    a3 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[9]);
    w0 = _mm256_fmadd_pd(a3, v1, w0);
    a4 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[13]);
    w1 = _mm256_fmadd_pd(a4, v1, w1);
    a5 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[17]);
    w2 = _mm256_fmadd_pd(a5, v1, w2);

    /* third row */
    v2 = _mm256_set1_pd(ls[2]);
    a0 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[18]);
    w0 = _mm256_fmadd_pd(a0, v2, w0);
    a1 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[22]);
    w1 = _mm256_fmadd_pd(a1, v2, w1);
    a2 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[26]);
    w2 = _mm256_fmadd_pd(a2, v2, w2);

    /* fourth row */
    v3 = _mm256_set1_pd(ls[3]);
    a3 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[27]);
    w0 = _mm256_fmadd_pd(a3, v3, w0);
    a4 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[31]);
    w1 = _mm256_fmadd_pd(a4, v3, w1);
    a5 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[35]);
    w2 = _mm256_fmadd_pd(a5, v3, w2);

    /* fifth row */
    v0 = _mm256_set1_pd(ls[4]);
    a0 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[36]);
    w0 = _mm256_fmadd_pd(a0, v0, w0);
    a1 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[40]);
    w1 = _mm256_fmadd_pd(a1, v0, w1);
    a2 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[44]);
    w2 = _mm256_fmadd_pd(a2, v0, w2);

    /* sixth row */
    v1 = _mm256_set1_pd(ls[5]);
    a3 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[45]);
    w0 = _mm256_fmadd_pd(a3, v1, w0);
    a4 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[49]);
    w1 = _mm256_fmadd_pd(a4, v1, w1);
    a5 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[53]);
    w2 = _mm256_fmadd_pd(a5, v1, w2);

    /* seventh row */
    v2 = _mm256_set1_pd(ls[6]);
    a0 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[54]);
    w0 = _mm256_fmadd_pd(a0, v2, w0);
    a1 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[58]);
    w1 = _mm256_fmadd_pd(a1, v2, w1);
    a2 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[62]);
    w2 = _mm256_fmadd_pd(a2, v2, w2);

    /* eighth row */
    v3 = _mm256_set1_pd(ls[7]);
    a3 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[63]);
    w0 = _mm256_fmadd_pd(a3, v3, w0);
    a4 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[67]);
    w1 = _mm256_fmadd_pd(a4, v3, w1);
    a5 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[71]);
    w2 = _mm256_fmadd_pd(a5, v3, w2);

    /* ninth row */
    v0 = _mm256_set1_pd(ls[8]);
    a3 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[72]);
    w0 = _mm256_fmadd_pd(a3, v0, w0);
    a4 = _mm256_loadu_pd(&(aa + bs2 * adiag[i])[76]);
    w1 = _mm256_fmadd_pd(a4, v0, w1);
    a2 = _mm256_maskload_pd(&(aa + bs2 * adiag[i])[80], _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63));
    w2 = _mm256_fmadd_pd(a2, v0, w2);

    _mm256_storeu_pd(&(t + i * bs)[0], w0);
    _mm256_storeu_pd(&(t + i * bs)[4], w1);
    _mm256_maskstore_pd(&(t + i * bs)[8], _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63), w2);

    /* block i of t now holds the final solution block; copy it out */
    PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  /* 2*bs2*nz multiply-adds minus the bs*n entries counted twice at the diagonal */
  PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1183: #endif