DTRSM bug fix for MIPS P5600 and I6400

Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
This commit is contained in:
Kaustubh Raste 2016-05-17 15:48:02 +05:30
parent 8bf71e9e06
commit d7cbc7ac13
5 changed files with 610 additions and 587 deletions

View File

@ -126,22 +126,14 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
a -= 64;
b -= 32;
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8);
res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
src_a54 = __msa_cast_to_vector_double(*(a + 54));
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
@ -172,10 +164,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c6, b + 24);
ST_DP(res_c15, b + 30);
ST_DP(res_c14, b + 26);
src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
ST_DP(src_c3, c + 6);
ST_DP(src_c7, c_nxt1line + 6);
ST_DP(src_c11, c_nxt2line + 6);
@ -211,10 +201,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c12, b + 18);
ST_DP(res_c13, b + 22);
src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
ST_DP(src_c2, c + 4);
ST_DP(src_c6, c_nxt1line + 4);
ST_DP(src_c10, c_nxt2line + 4);
@ -286,10 +274,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
ST_DP(src_c1, c + 2);
ST_DP(src_c5, c_nxt1line + 2);
ST_DP(src_c9, c_nxt2line + 2);
@ -343,10 +329,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c1, b + 4);
ST_DP(res_c9, b + 6);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
ST_DP(src_c0, c);
ST_DP(src_c4, c_nxt1line);
@ -417,14 +401,10 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c7 -= src_a3 * src_b;
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
src_a56 = LD_DP(a - 8);
src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
@ -541,14 +521,10 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c1, b - 14);
ST_DP(res_c0, b - 16);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
@ -572,30 +548,19 @@ static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
if (bk > 0)
{
int i;
BLASLONG i;
FLOAT *aa = a, *bb = b;
FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0;
for (i = bk; i--; )
{
a0 = aa[0];
a1 = aa[1];
a2 = aa[2];
a3 = aa[3];
a4 = aa[4];
a5 = aa[5];
a6 = aa[6];
a7 = aa[7];
b0 = bb[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c2 -= a2 * b0;
c3 -= a3 * b0;
c4 -= a4 * b0;
c5 -= a5 * b0;
c6 -= a6 * b0;
c7 -= a7 * b0;
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
c2 -= aa[2] * bb[0];
c3 -= aa[3] * bb[0];
c4 -= aa[4] * bb[0];
c5 -= aa[5] * bb[0];
c6 -= aa[6] * bb[0];
c7 -= aa[7] * bb[0];
aa += 8;
bb += 1;
@ -720,7 +685,7 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 16, *bb = b + 16;
FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
for (i = bk; i--;)
@ -749,14 +714,13 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
}
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5);
res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5);
a -= 16;
b -= 16;
ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
src_a14 = LD_DP(a + 14);
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
@ -813,14 +777,10 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c4, b + 2);
ST_DP(res_c0, b + 0);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
@ -840,7 +800,7 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 16, *bb = b + 8;
FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0;
for (i = bk; i--;)
@ -861,10 +821,11 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
}
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
a -= 16;
b -= 8;
ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
src_a14 = LD_DP(a + 14);
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
@ -907,10 +868,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c1, b + 2);
ST_DP(res_c0, b + 0);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
@ -918,8 +877,7 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15;
FLOAT c0, c1, c2, c3;
FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;
c0 = *(c + 0);
c1 = *(c + 1);
@ -929,27 +887,23 @@ static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 16, *bb = b + 4;
FLOAT a0, a1, a2, a3, b0;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
a0 = aa[0];
a1 = aa[1];
a2 = aa[2];
a3 = aa[3];
b0 = bb[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c2 -= a2 * b0;
c3 -= a3 * b0;
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
c2 -= aa[2] * bb[0];
c3 -= aa[3] * bb[0];
aa += 4;
bb += 1;
}
}
a -= 16;
b -= 4;
a0 = *(a + 0);
a4 = *(a + 4);
a5 = *(a + 5);
@ -1003,35 +957,27 @@ static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 4, *bb = b + 8;
FLOAT a0, a1, b0, b1, b2, b3;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
a0 = aa[0];
a1 = aa[1];
b0 = bb[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = bb[1];
c0_nxt1 -= a0 * b1;
c1_nxt1 -= a1 * b1;
b2 = bb[2];
c0_nxt2 -= a0 * b2;
c1_nxt2 -= a1 * b2;
b3 = bb[3];
c0_nxt3 -= a0 * b3;
c1_nxt3 -= a1 * b3;
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
c0_nxt1 -= aa[0] * bb[1];
c1_nxt1 -= aa[1] * bb[1];
c0_nxt2 -= aa[0] * bb[2];
c1_nxt2 -= aa[1] * bb[2];
c0_nxt3 -= aa[0] * bb[3];
c1_nxt3 -= aa[1] * bb[3];
aa += 2;
bb += 4;
}
}
a -= 4;
b -= 8;
a0 = *(a + 0);
a2 = *(a + 2);
a3 = *(a + 3);
@ -1063,13 +1009,10 @@ static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
*(c + 0) = c0;
*(c + 1) = c1;
*(c + 0 + ldc) = c0_nxt1;
*(c + 1 + ldc) = c1_nxt1;
*(c + 0 + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
*(c + 0 + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
@ -1087,27 +1030,24 @@ static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 4, *bb = b + 4;
FLOAT a0, a1, b0, b1;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
a0 = aa[0];
a1 = aa[1];
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
b0 = bb[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = bb[1];
c0_nxt -= a0 * b1;
c1_nxt -= a1 * b1;
c0_nxt -= aa[0] * bb[1];
c1_nxt -= aa[1] * bb[1];
aa += 2;
bb += 2;
}
}
a -= 4;
b -= 4;
a0 = *(a + 0);
a2 = *(a + 2);
a3 = *(a + 3);
@ -1144,33 +1084,28 @@ static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
if (bk > 0)
{
BLASLONG i;
FLOAT a0, a1, b0;
FLOAT *aa = a + 4, *bb = b + 2;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
a0 = aa[0];
a1 = aa[1];
b0 = bb[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
aa += 2;
bb += 1;
}
}
a0 = *(a + 0);
a2 = *(a + 2);
a3 = *(a + 3);
a0 = *(a - 4);
a2 = *(a - 2);
a3 = *(a - 1);
c1 *= a3;
c0 -= c1 * a2;
c0 *= a0;
*(b + 0) = c0;
*(b + 1) = c1;
*(b - 2) = c0;
*(b - 1) = c1;
*(c + 0) = c0;
*(c + 1) = c1;
@ -1178,46 +1113,44 @@ static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT a0;
FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
FLOAT c0, c1, c2, c3;
a0 = *a;
c0 = *(c + 0);
c0_nxt1 = *(c + 1 * ldc);
c0_nxt2 = *(c + 2 * ldc);
c0_nxt3 = *(c + 3 * ldc);
c1 = *(c + 1 * ldc);
c2 = *(c + 2 * ldc);
c3 = *(c + 3 * ldc);
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 1, *bb = b + 4;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
c0_nxt1 -= aa[0] * bb[1];
c0_nxt2 -= aa[0] * bb[2];
c0_nxt3 -= aa[0] * bb[3];
c1 -= aa[0] * bb[1];
c2 -= aa[0] * bb[2];
c3 -= aa[0] * bb[3];
aa += 1;
bb += 4;
}
}
c0 *= a0;
c0_nxt1 *= a0;
c0_nxt2 *= a0;
c0_nxt3 *= a0;
c0 *= *(a - 1);
c1 *= *(a - 1);
c2 *= *(a - 1);
c3 *= *(a - 1);
*(c + 0 * ldc) = c0;
*(c + 1 * ldc) = c0_nxt1;
*(c + 2 * ldc) = c0_nxt2;
*(c + 3 * ldc) = c0_nxt3;
*(c + 1 * ldc) = c1;
*(c + 2 * ldc) = c2;
*(c + 3 * ldc) = c3;
*(b + 0) = c0;
*(b + 1) = c0_nxt1;
*(b + 2) = c0_nxt2;
*(b + 3) = c0_nxt3;
*(b - 4) = c0;
*(b - 3) = c1;
*(b - 2) = c2;
*(b - 1) = c3;
}
static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
@ -1247,7 +1180,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
bb = b + 4 * kk;
cc = c + (m - 1);
dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, ldc, k - kk);
dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk);
kk -= 1;
}
@ -1258,7 +1191,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
bb = b + 4 * kk;
cc = c + ((m & -2) - 2);
dsolve_2x4_ln_msa(aa - 4, bb - 8, cc, ldc, k - kk);
dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk);
kk -= 2;
}
@ -1269,7 +1202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
bb = b + 4 * kk;
cc = c + ((m & -4) - 4);
dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk);
dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk);
kk -= 4;
}
@ -1319,7 +1252,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -2) - 2) * k;
cc = c + ((m & -2) - 2);
dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk);
dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk);
kk -= 2;
}
@ -1329,7 +1262,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk);
dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk);
kk -= 4;
}
@ -1377,7 +1310,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -2) - 2) * k + kk * 2;
cc = c + ((m & -2) - 2);
dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk);
dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk);
kk -= 2;
}
@ -1387,7 +1320,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk);
dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk);
kk -= 4;
}

View File

@ -48,7 +48,7 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
@ -124,22 +124,14 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
b += 4;
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8);
res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
@ -205,10 +197,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c1, b + 4);
ST_DP(res_c9, b + 6);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
ST_DP(src_c0, c);
ST_DP(src_c4, c_nxt1line);
@ -265,10 +255,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c3, b + 12);
ST_DP(res_c11, b + 14);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
src_a36 = LD_DP(a + 36);
src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
@ -311,10 +299,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c5, b + 20);
ST_DP(res_c13, b + 22);
src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
src_a63 = __msa_cast_to_vector_double(*(a + 63));
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
@ -341,10 +327,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c7, b + 28);
ST_DP(res_c15, b + 30);
src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
ST_DP(src_c3, c + 6);
ST_DP(src_c7, c_nxt1line + 6);
@ -365,15 +349,21 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
v2f64 src_b, src_b0, src_b1;
for (i = bk; i--;)
LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(b);
a += 8;
b += 2;
for (i = (bk - 1); i--;)
{
LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(b);
LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
src_b1 = LD_DP(b);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -387,19 +377,33 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b1;
a += 8;
b += 2;
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
@ -480,10 +484,8 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c2, b + 4);
ST_DP(res_c3, b + 6);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c4, src_c5, c + ldc, 2);
@ -526,10 +528,8 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c6, b + 12);
ST_DP(res_c7, b + 14);
src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
ST_DP2(src_c2, src_c3, c + 4, 2);
ST_DP2(src_c6, src_c7, c + 4 + ldc, 2);
@ -539,8 +539,7 @@ static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
FLOAT a45, a46, a47, a54, a55, a63;
FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;
c0 = *(c + 0);
c1 = *(c + 1);
@ -551,31 +550,20 @@ static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
c6 = *(c + 6);
c7 = *(c + 7);
if (bk > 0)
if (bk)
{
int i;
FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0;
BLASLONG i;
for (i = bk; i--; )
{
a0 = a[0];
a1 = a[1];
a2 = a[2];
a3 = a[3];
a4 = a[4];
a5 = a[5];
a6 = a[6];
a7 = a[7];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c2 -= a2 * b0;
c3 -= a3 * b0;
c4 -= a4 * b0;
c5 -= a5 * b0;
c6 -= a6 * b0;
c7 -= a7 * b0;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
c2 -= a[2] * b[0];
c3 -= a[3] * b[0];
c4 -= a[4] * b[0];
c5 -= a[5] * b[0];
c6 -= a[6] * b[0];
c7 -= a[7] * b[0];
a += 8;
b += 1;
@ -694,7 +682,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
@ -725,14 +713,10 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
}
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5);
res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5);
ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
@ -788,15 +772,10 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c3, b + 12);
ST_DP(res_c7, b + 14);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
@ -813,7 +792,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP2(c, 2, src_c0, src_c1);
LD_DP2(c + ldc, 2, src_c2, src_c3);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0;
@ -836,10 +815,8 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
}
}
res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
@ -878,10 +855,8 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP(res_c2, b + 4);
ST_DP(res_c3, b + 6);
src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
@ -889,31 +864,23 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT c0, c1, c2, c3;
FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15;
FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3;
c0 = *(c + 0);
c1 = *(c + 1);
c2 = *(c + 2);
c3 = *(c + 3);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, a2, a3, b0;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
a2 = a[2];
a3 = a[3];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c2 -= a2 * b0;
c3 -= a3 * b0;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
c2 -= a[2] * b[0];
c3 -= a[3] * b[0];
a += 4;
b += 1;
@ -958,8 +925,7 @@ static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT a0, a1, a3;
FLOAT c0, c1, c0_nxt1, c1_nxt1;
FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1;
FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
c0 = *(c + 0);
@ -971,31 +937,20 @@ static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
c0_nxt3 = *(c + 3 * ldc);
c1_nxt3 = *(c + 1 + 3 * ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, b0, b1, b2, b3;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = b[1];
c0_nxt1 -= a0 * b1;
c1_nxt1 -= a1 * b1;
b2 = b[2];
c0_nxt2 -= a0 * b2;
c1_nxt2 -= a1 * b2;
b3 = b[3];
c0_nxt3 -= a0 * b3;
c1_nxt3 -= a1 * b3;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
c0_nxt1 -= a[0] * b[1];
c1_nxt1 -= a[1] * b[1];
c0_nxt2 -= a[0] * b[2];
c1_nxt2 -= a[1] * b[2];
c0_nxt3 -= a[0] * b[3];
c1_nxt3 -= a[1] * b[3];
a += 2;
b += 4;
@ -1033,21 +988,17 @@ static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
*(c + 0) = c0;
*(c + 1) = c1;
*(c + 0 + ldc) = c0_nxt1;
*(c + 1 + ldc) = c1_nxt1;
*(c + 0 + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
*(c + 0 + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT a0, a1, a3;
FLOAT c0, c1, c0_nxt, c1_nxt;
FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt;
c0 = *(c + 0);
c1 = *(c + 1);
@ -1055,23 +1006,17 @@ static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
c0_nxt = *(c + ldc);
c1_nxt = *(c + 1 + ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, b0, b1;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = b[1];
c0_nxt -= a0 * b1;
c1_nxt -= a1 * b1;
c0_nxt -= a[0] * b[1];
c1_nxt -= a[1] * b[1];
a += 2;
b += 2;
@ -1109,19 +1054,14 @@ static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
c0 = *(c + 0);
c1 = *(c + 1);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, b0;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
a += 2;
b += 1;
@ -1145,63 +1085,60 @@ static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT a0;
FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
FLOAT c0, c1, c2, c3;
c0 = *(c + 0);
c0_nxt1 = *(c + 1 * ldc);
c0_nxt2 = *(c + 2 * ldc);
c0_nxt3 = *(c + 3 * ldc);
c1 = *(c + 1 * ldc);
c2 = *(c + 2 * ldc);
c3 = *(c + 3 * ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= a[0] * b[0];
c0_nxt1 -= a[0] * b[1];
c0_nxt2 -= a[0] * b[2];
c0_nxt3 -= a[0] * b[3];
c1 -= a[0] * b[1];
c2 -= a[0] * b[2];
c3 -= a[0] * b[3];
a += 1;
b += 4;
}
}
a0 = *a;
c0 *= a0;
c0_nxt1 *= a0;
c0_nxt2 *= a0;
c0_nxt3 *= a0;
c0 *= *a;
c1 *= *a;
c2 *= *a;
c3 *= *a;
*(c + 0 * ldc) = c0;
*(c + 1 * ldc) = c0_nxt1;
*(c + 2 * ldc) = c0_nxt2;
*(c + 3 * ldc) = c0_nxt3;
*(c + 1 * ldc) = c1;
*(c + 2 * ldc) = c2;
*(c + 3 * ldc) = c3;
*(b + 0) = c0;
*(b + 1) = c0_nxt1;
*(b + 2) = c0_nxt2;
*(b + 3) = c0_nxt3;
*(b + 1) = c1;
*(b + 2) = c2;
*(b + 3) = c3;
}
static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT c0, c0_nxt;
FLOAT c0, c1;
c0 = *c;
c0_nxt = *(c + ldc);
c1 = *(c + ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= *a * b[0];
c0_nxt -= *a * b[1];
c1 -= *a * b[1];
a += 1;
b += 2;
@ -1209,18 +1146,18 @@ static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
}
c0 *= *a;
c0_nxt *= *a;
c1 *= *a;
*(b + 0) = c0;
*(b + 1) = c0_nxt;
*(b + 1) = c1;
*(c + 0) = c0;
*(c + ldc) = c0_nxt;
*(c + ldc) = c1;
}
static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
if (bk > 0)
if (bk)
{
BLASLONG i;

View File

@ -43,7 +43,7 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
@ -200,20 +200,26 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_b0, src_b1, src_b3;
v2f64 src_b0, src_b1, src_b3, src_b;
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
for (i = bk; i--;)
LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(b);
a += 8;
b += 2;
for (i = (bk - 1); i--;)
{
LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(b);
LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
src_b1 = LD_DP(b);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -227,9 +233,27 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b1;
a += 8;
b += 2;
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
}
src_b0 = LD_DP(b + 0);
@ -267,7 +291,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_a2, src_a3, src_b;
@ -311,7 +335,7 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
@ -405,7 +429,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_DP2(c, 2, src_c0, src_c1);
LD_DP2(c + ldc, 2, src_c2, src_c3);
if (bk > 0)
if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0;
@ -451,42 +475,33 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT b0, c0, c1, c2, c3;
FLOAT c0, c1, c2, c3;
c0 = *(c + 0);
c1 = *(c + 1);
c2 = *(c + 2);
c3 = *(c + 3);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, a2, a3;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
a2 = a[2];
a3 = a[3];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c2 -= a2 * b0;
c3 -= a3 * b0;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
c2 -= a[2] * b[0];
c3 -= a[3] * b[0];
a += 4;
b += 1;
}
}
b0 = *b;
c0 *= b0;
c1 *= b0;
c2 *= b0;
c3 *= b0;
c0 *= *b;
c1 *= *b;
c2 *= *b;
c3 *= *b;
*(a + 0) = c0;
*(a + 1) = c1;
@ -514,31 +529,20 @@ static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
c0_nxt3 = *(c + 0 + 3 * ldc);
c1_nxt3 = *(c + 1 + 3 * ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, b0, b1, b2, b3;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = b[1];
c0_nxt1 -= a0 * b1;
c1_nxt1 -= a1 * b1;
b2 = b[2];
c0_nxt2 -= a0 * b2;
c1_nxt2 -= a1 * b2;
b3 = b[3];
c0_nxt3 -= a0 * b3;
c1_nxt3 -= a1 * b3;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
c0_nxt1 -= a[0] * b[1];
c1_nxt1 -= a[1] * b[1];
c0_nxt2 -= a[0] * b[2];
c1_nxt2 -= a[1] * b[2];
c0_nxt3 -= a[0] * b[3];
c1_nxt3 -= a[1] * b[3];
a += 2;
b += 4;
@ -590,13 +594,12 @@ static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
*(a + 7) = c1_nxt3;
*(c + 0) = c0;
*(c + 1 * ldc) = c0_nxt1;
*(c + 2 * ldc) = c0_nxt2;
*(c + 3 * ldc) = c0_nxt3;
*(c + 1) = c1;
*(c + 1 * ldc) = c0_nxt1;
*(c + 1 + 1 * ldc) = c1_nxt1;
*(c + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
*(c + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
@ -606,27 +609,20 @@ static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
c0 = *(c + 0);
c1 = *(c + 1);
c0_nxt = *(c + 0 + ldc);
c1_nxt = *(c + 1 + ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, b0, b1;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = b[1];
c0_nxt -= a0 * b1;
c1_nxt -= a1 * b1;
c0_nxt -= a[0] * b[1];
c1_nxt -= a[1] * b[1];
a += 2;
b += 2;
@ -653,7 +649,7 @@ static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
*(c + 0) = c0;
*(c + 1) = c1;
*(c + ldc) = c0_nxt;
*(c + 0 + ldc) = c0_nxt;
*(c + 1 + ldc) = c1_nxt;
}
@ -664,19 +660,14 @@ static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
c0 = *(c + 0);
c1 = *(c + 1);
if (bk > 0)
if (bk)
{
BLASLONG i;
FLOAT a0, a1, b0;
for (i = bk; i--;)
{
a0 = a[0];
a1 = a[1];
b0 = b[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
c0 -= a[0] * b[0];
c1 -= a[1] * b[0];
a += 2;
b += 1;
@ -697,24 +688,23 @@ static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15;
FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
c0 = *(c + 0);
c0_nxt1 = *(c + 1 * ldc);
c0_nxt2 = *(c + 2 * ldc);
c0_nxt3 = *(c + 3 * ldc);
c1 = *(c + 1 * ldc);
c2 = *(c + 2 * ldc);
c3 = *(c + 3 * ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= a[0] * b[0];
c0_nxt1 -= a[0] * b[1];
c0_nxt2 -= a[0] * b[2];
c0_nxt3 -= a[0] * b[3];
c1 -= a[0] * b[1];
c2 -= a[0] * b[2];
c3 -= a[0] * b[3];
a += 1;
b += 4;
@ -734,44 +724,44 @@ static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
c0 *= b0;
c0_nxt1 -= c0 * b1;
c0_nxt1 *= b5;
c1 -= c0 * b1;
c1 *= b5;
c0_nxt2 -= c0 * b2;
c0_nxt2 -= c0_nxt1 * b6;
c0_nxt2 *= b10;
c2 -= c0 * b2;
c2 -= c1 * b6;
c2 *= b10;
c0_nxt3 -= c0 * b3;
c0_nxt3 -= c0_nxt1 * b7;
c0_nxt3 -= c0_nxt2 * b11;
c0_nxt3 *= b15;
c3 -= c0 * b3;
c3 -= c1 * b7;
c3 -= c2 * b11;
c3 *= b15;
*(a + 0) = c0;
*(a + 1) = c0_nxt1;
*(a + 2) = c0_nxt2;
*(a + 3) = c0_nxt3;
*(a + 1) = c1;
*(a + 2) = c2;
*(a + 3) = c3;
*(c + 0) = c0;
*(c + 1 * ldc) = c0_nxt1;
*(c + 2 * ldc) = c0_nxt2;
*(c + 3 * ldc) = c0_nxt3;
*(c + 1 * ldc) = c1;
*(c + 2 * ldc) = c2;
*(c + 3 * ldc) = c3;
}
static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT b0, b1, b3, c0, c0_nxt;
FLOAT b0, b1, b3, c0, c1;
c0 = *c;
c0_nxt = *(c + ldc);
c1 = *(c + ldc);
if (bk > 0)
if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= *a * b[0];
c0_nxt -= *a * b[1];
c1 -= *a * b[1];
a += 1;
b += 2;
@ -784,19 +774,19 @@ static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
c0 *= b0;
c0_nxt -= c0 * b1;
c0_nxt *= b3;
c1 -= c0 * b1;
c1 *= b3;
*(a + 0) = c0;
*(a + 1) = c0_nxt;
*(a + 1) = c1;
*(c + 0) = c0;
*(c + ldc) = c0_nxt;
*(c + ldc) = c1;
}
static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
if (bk > 0)
if (bk)
{
BLASLONG i;

View File

@ -200,7 +200,7 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk)
static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_b0, src_b2, src_b3;
@ -210,10 +210,40 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk
if (bk > 0)
{
v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
BLASLONG i;
FLOAT *pba = a, *pbb = b;
v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3;
v2f64 src_a4, src_a5, src_a6, src_a7;
LD_DP4(a + 16, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(b + 4);
LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pbb);
for (i = bk - 1; i--;)
{
pba += 8;
pbb += 2;
LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
src_b1 = LD_DP(pbb);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b1;
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -228,6 +258,9 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk
src_c7 -= src_a3 * src_b;
}
a -= 16;
b -= 4;
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b2 = LD_DP(b + 2);
@ -256,13 +289,57 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk
ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
}
static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3;
v2f64 src_b0;
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
v2f64 src_b1;
LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(bb);
aa += 8;
bb += 1;
for (i = (bk - 1); i--;)
{
LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7);
src_b1 = LD_DP(bb);
src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a2 * src_b0;
src_c3 -= src_a3 * src_b0;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b1;
aa += 8;
bb += 1;
}
src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a2 * src_b0;
src_c3 -= src_a3 * src_b0;
}
a -= 8;
b -= 1;
src_b0 = __msa_cast_to_vector_double(*b);
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
@ -289,7 +366,7 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 16, *bb = b + 16;
FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
for (i = bk; i--;)
@ -318,6 +395,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
}
}
a -= 16;
b -= 16;
src_b12 = LD_DP(b + 12);
src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
@ -376,7 +456,7 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk)
static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
@ -385,20 +465,31 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0;
LD_DP2(a + 8, 2, src_a0, src_a1);
src_b0 = LD_DP(b + 4);
for (i = bk; i--;)
{
LD_DP2(aa, 2, src_a0, src_a1);
src_b0 = LD_DP(bb);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c2 -= src_a0 * src_b;
src_c3 -= src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c2 -= src_a0 * src_b;
src_c3 -= src_a1 * src_b;
aa += 4;
bb += 2;
}
}
a -= 8;
b -= 4;
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b2 = LD_DP(b + 2);
@ -420,17 +511,36 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk
ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT b0, c0, c1, c2, c3;
b0 = *(b + 0);
c0 = *(c + 0);
c1 = *(c + 1);
c2 = *(c + 2);
c3 = *(c + 3);
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
c2 -= aa[2] * bb[0];
c3 -= aa[3] * bb[0];
aa += 4;
bb += 1;
}
}
a -= 4;
b0 = *(b - 1);
c0 *= b0;
c1 *= b0;
c2 *= b0;
@ -464,35 +574,27 @@ static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 8, *bb = b + 16;
FLOAT a0, a1, b0, b1, b2, b3;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
a0 = aa[0];
a1 = aa[1];
b0 = bb[0];
c0 -= a0 * b0;
c1 -= a1 * b0;
b1 = bb[1];
c0_nxt1 -= a0 * b1;
c1_nxt1 -= a1 * b1;
b2 = bb[2];
c0_nxt2 -= a0 * b2;
c1_nxt2 -= a1 * b2;
b3 = bb[3];
c0_nxt3 -= a0 * b3;
c1_nxt3 -= a1 * b3;
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
c0_nxt1 -= aa[0] * bb[1];
c1_nxt1 -= aa[1] * bb[1];
c0_nxt2 -= aa[0] * bb[2];
c1_nxt2 -= aa[1] * bb[2];
c0_nxt3 -= aa[0] * bb[3];
c1_nxt3 -= aa[1] * bb[3];
aa += 2;
bb += 4;
}
}
a -= 8;
b -= 16;
b0 = *b;
b4 = *(b + 4);
b5 = *(b + 5);
@ -539,44 +641,44 @@ static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
*(c + 0) = c0;
*(c + 1) = c1;
*(c + 0 + 1 * ldc) = c0_nxt1;
*(c + 1 + 1 * ldc) = c1_nxt1;
*(c + 0 + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
*(c + 0 + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk)
static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT b0, b2, b3;
FLOAT c0, c1, c0_nxt, c1_nxt;
FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
c0 = *(c + 0);
c1 = *(c + 1);
c0_nxt = *(c + 0 + ldc);
c1_nxt = *(c + 1 + ldc);
if (bk > 0)
{
FLOAT a0, a1, b0, b1;
BLASLONG i;
FLOAT *aa = a, *bb = b;
a0 = a[4];
a1 = a[5];
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
b0 = b[4];
c0 -= a0 * b0;
c1 -= a1 * b0;
c0_nxt -= aa[0] * bb[1];
c1_nxt -= aa[1] * bb[1];
b1 = b[5];
c0_nxt -= a0 * b1;
c1_nxt -= a1 * b1;
aa += 2;
bb += 2;
}
}
a -= 4;
b -= 4;
b3 = *(b + 3);
b2 = *(b + 2);
b0 = *b;
@ -601,20 +703,35 @@ static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk
*(c + 1 + ldc) = c1_nxt;
}
static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT b0, c0, c1;
c0 = *(c + 0);
c1 = *(c + 1);
b0 = *b;
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
c1 -= aa[1] * bb[0];
aa += 2;
bb += 1;
}
}
b0 = *(b - 1);
c0 *= b0;
c1 *= b0;
*(a + 0) = c0;
*(a + 1) = c1;
*(a - 2) = c0;
*(a - 1) = c1;
*(c + 0) = c0;
*(c + 1) = c1;
@ -622,31 +739,33 @@ static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3;
c0 = *(c + 0);
c0_nxt1 = *(c + 1 * ldc);
c0_nxt2 = *(c + 2 * ldc);
c0_nxt3 = *(c + 3 * ldc);
c1 = *(c + 1 * ldc);
c2 = *(c + 2 * ldc);
c3 = *(c + 3 * ldc);
if (bk > 0)
{
BLASLONG i;
FLOAT *aa = a + 4, *bb = b + 16;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
c0_nxt1 -= aa[0] * bb[1];
c0_nxt2 -= aa[0] * bb[2];
c0_nxt3 -= aa[0] * bb[3];
c1 -= aa[0] * bb[1];
c2 -= aa[0] * bb[2];
c3 -= aa[0] * bb[3];
aa += 1;
bb += 4;
}
}
a -= 4;
b -= 16;
b0 = *b;
b4 = *(b + 4);
b5 = *(b + 5);
@ -658,58 +777,86 @@ static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
b14 = *(b + 14);
b15 = *(b + 15);
c0_nxt3 *= b15;
c3 *= b15;
c0_nxt2 -= c0_nxt3 * b14;
c0_nxt2 *= b10;
c2 -= c3 * b14;
c2 *= b10;
c0_nxt1 -= c0_nxt3 * b13;
c0_nxt1 -= c0_nxt2 * b9;
c0_nxt1 *= b5;
c1 -= c3 * b13;
c1 -= c2 * b9;
c1 *= b5;
c0 -= c0_nxt3 * b12;
c0 -= c0_nxt2 * b8;
c0 -= c0_nxt1 * b4;
c0 -= c3 * b12;
c0 -= c2 * b8;
c0 -= c1 * b4;
c0 *= b0;
*(a + 0) = c0;
*(a + 1) = c0_nxt1;
*(a + 2) = c0_nxt2;
*(a + 3) = c0_nxt3;
*(a + 1) = c1;
*(a + 2) = c2;
*(a + 3) = c3;
*(c) = c0;
*(c + 1 * ldc) = c0_nxt1;
*(c + 2 * ldc) = c0_nxt2;
*(c + 3 * ldc) = c0_nxt3;
*(c + 0 * ldc) = c0;
*(c + 1 * ldc) = c1;
*(c + 2 * ldc) = c2;
*(c + 3 * ldc) = c3;
}
static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
FLOAT b0, b2, b3, c0, c0_nxt;
FLOAT b0, b2, b3, c0, c1;
c0 = *(c + 0);
c0_nxt = *(c + ldc);
c1 = *(c + ldc);
if (bk > 0)
{
c0 -= a[2] * b[4];
c0_nxt -= a[2] * b[5];
BLASLONG i;
FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= *aa * bb[0];
c1 -= *aa * bb[1];
aa += 1;
bb += 2;
}
}
a -= 2;
b -= 4;
b3 = *(b + 3);
b2 = *(b + 2);
b0 = *b;
c0_nxt *= b3;
c1 *= b3;
c0 -= c0_nxt * b2;
c0 -= c1 * b2;
c0 *= b0;
*(a + 0) = c0;
*(a + 1) = c0_nxt;
*(a + 1) = c1;
*(c + 0) = c0;
*(c + ldc) = c0_nxt;
*(c + ldc) = c1;
}
/* Scalar 1x1 solve step for the RT kernel.
 *
 * a  : read at a[0 .. bk-1]; the solved value is written to a[-1].
 * b  : read at b[0 .. bk-1]; b[-1] is the factor applied to the result
 *      (presumably the pre-inverted diagonal entry - confirm against the
 *      packing routine).
 * c  : single element, updated in place.
 * bk : number of a/b products subtracted from *c before scaling;
 *      nothing is subtracted when bk <= 0.
 */
static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    const FLOAT *pa = a;
    const FLOAT *pb = b;
    BLASLONG remaining;

    /* Subtract the products one at a time, updating *c on every step. */
    for (remaining = bk; remaining > 0; remaining--)
    {
        *c -= (*pa) * (*pb);
        pa++;
        pb++;
    }

    /* Scale by the element just before b, then mirror the result into the
       slot just before a. */
    *c *= b[-1];
    a[-1] = *c;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
@ -729,12 +876,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a;
c -= ldc;
b -= k;
bb = b + (kk - 1);
bb = b + kk;
cc = c;
for (i = (m >> 3); i--;)
{
dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc);
dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk);
aa += 8 * k;
cc += 8;
@ -744,7 +891,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (m & 4)
{
dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc);
dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk);
aa += 4 * k;
cc += 4;
@ -752,7 +899,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 2)
{
dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc);
dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk);
aa += 2 * k;
cc += 2;
@ -760,8 +907,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 1)
{
*cc *= *bb;
*(aa + kk - 1) = *cc;
dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk);
aa += k;
cc += 1;
@ -782,7 +928,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
for (i = (m >> 3); i--;)
{
dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk);
dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk);
aa += 8 * k;
cc += 8;
@ -792,7 +938,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (m & 4)
{
dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk);
dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk);
aa += 4 * k;
cc += 4;
@ -800,7 +946,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 2)
{
dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk);
dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk);
aa += 2 * k;
cc += 2;
@ -808,7 +954,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 1)
{
dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk);
dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk);
aa += k;
cc += 1;
}
}
@ -836,7 +985,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (m & 4)
{
dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk);
dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk);
aa += 4 * k;
cc += 4;
@ -844,7 +993,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 2)
{
dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk);
dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk);
aa += 2 * k;
cc += 2;
@ -852,7 +1001,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 1)
{
dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk);
dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk);
aa += k;
cc += 1;

View File

@ -76,4 +76,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of doubleword elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
                 Left half of doubleword elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'
   Note        : 'in0' and 'in1' are each expanded twice (once per output),
                 so pass expressions without side effects.
                 The macro expands to a brace-enclosed block of two
                 assignments.
*/
#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
}
/* v2f64 (double-precision) flavour of ILVRL_D2. */
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
#endif /* __MACROS_MSA_H__ */