diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9f0eb95a5..d0792bf85 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -126,22 +126,14 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 64; b -= 32; - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); - res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); - res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8); - res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); - res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); - res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); - res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); - res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); - res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); + ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); + ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); + ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); src_a54 = __msa_cast_to_vector_double(*(a + 54)); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -172,10 +164,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c6, 
b + 24); ST_DP(res_c15, b + 30); ST_DP(res_c14, b + 26); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); - src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); - src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); ST_DP(src_c3, c + 6); ST_DP(src_c7, c_nxt1line + 6); ST_DP(src_c11, c_nxt2line + 6); @@ -211,10 +201,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c12, b + 18); ST_DP(res_c13, b + 22); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); - src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); ST_DP(src_c2, c + 4); ST_DP(src_c6, c_nxt1line + 4); ST_DP(src_c10, c_nxt2line + 4); @@ -286,10 +274,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); - src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); ST_DP(src_c1, c + 2); ST_DP(src_c5, c_nxt1line + 2); ST_DP(src_c9, c_nxt2line + 2); @@ -343,10 +329,8 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b + 4); ST_DP(res_c9, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) 
__msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); - src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); ST_DP(src_c0, c); ST_DP(src_c4, c_nxt1line); @@ -417,14 +401,10 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c7 -= src_a3 * src_b; } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); src_a56 = LD_DP(a - 8); src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); @@ -541,14 +521,10 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b - 14); ST_DP(res_c0, b - 16); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + 
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); @@ -572,30 +548,19 @@ static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) if (bk > 0) { - int i; + BLASLONG i; FLOAT *aa = a, *bb = b; - FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; for (i = bk; i--; ) { - a0 = aa[0]; - a1 = aa[1]; - a2 = aa[2]; - a3 = aa[3]; - a4 = aa[4]; - a5 = aa[5]; - a6 = aa[6]; - a7 = aa[7]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; - c4 -= a4 * b0; - c5 -= a5 * b0; - c6 -= a6 * b0; - c7 -= a7 * b0; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; aa += 8; bb += 1; @@ -720,7 +685,7 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 16; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) @@ -749,14 +714,13 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + a -= 16; + b -= 16; + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); src_a14 = LD_DP(a + 14); 
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); @@ -813,14 +777,10 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c4, b + 2); ST_DP(res_c0, b + 0); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -840,7 +800,7 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 8; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0; for (i = bk; i--;) @@ -861,10 +821,11 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + a -= 16; + b -= 8; + + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); @@ -907,10 +868,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b + 2); ST_DP(res_c0, b + 0); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - 
src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -918,8 +877,7 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; - FLOAT c0, c1, c2, c3; + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); @@ -929,27 +887,23 @@ static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 4; - FLOAT a0, a1, a2, a3, b0; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - a2 = aa[2]; - a3 = aa[3]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; aa += 4; bb += 1; } } + a -= 16; + b -= 4; + a0 = *(a + 0); a4 = *(a + 4); a5 = *(a + 5); @@ -1003,35 +957,27 @@ static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 4, *bb = b + 8; - FLOAT a0, a1, b0, b1, b2, b3; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = bb[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = bb[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = bb[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } } + a 
-= 4; + b -= 8; + a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); @@ -1063,13 +1009,10 @@ static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; - *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; - *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } @@ -1087,27 +1030,24 @@ static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 4, *bb = b + 4; - FLOAT a0, a1, b0, b1; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = bb[1]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; aa += 2; bb += 2; } } + a -= 4; + b -= 4; + a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); @@ -1144,33 +1084,28 @@ static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) if (bk > 0) { BLASLONG i; - FLOAT a0, a1, b0; - FLOAT *aa = a + 4, *bb = b + 2; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; aa += 2; bb += 1; } } - a0 = *(a + 0); - a2 = *(a + 2); - a3 = *(a + 3); + a0 = *(a - 4); + a2 = *(a - 2); + a3 = *(a - 1); c1 *= a3; c0 -= c1 * a2; c0 *= a0; - *(b + 0) = c0; - *(b + 1) = c1; + *(b - 2) = c0; + *(b - 1) = c1; *(c + 0) = c0; *(c + 1) = c1; @@ -1178,46 +1113,44 @@ static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c0, c1, c2, c3; - a0 = *a; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); if (bk > 0) { 
BLASLONG i; - FLOAT *aa = a + 1, *bb = b + 4; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; - c0_nxt1 -= aa[0] * bb[1]; - c0_nxt2 -= aa[0] * bb[2]; - c0_nxt3 -= aa[0] * bb[3]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } } - c0 *= a0; - c0_nxt1 *= a0; - c0_nxt2 *= a0; - c0_nxt3 *= a0; + c0 *= *(a - 1); + c1 *= *(a - 1); + c2 *= *(a - 1); + c3 *= *(a - 1); *(c + 0 * ldc) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; - *(b + 0) = c0; - *(b + 1) = c0_nxt1; - *(b + 2) = c0_nxt2; - *(b + 3) = c0_nxt3; + *(b - 4) = c0; + *(b - 3) = c1; + *(b - 2) = c2; + *(b - 1) = c3; } static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) @@ -1247,7 +1180,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, bb = b + 4 * kk; cc = c + (m - 1); - dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, ldc, k - kk); + dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk); kk -= 1; } @@ -1258,7 +1191,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, bb = b + 4 * kk; cc = c + ((m & -2) - 2); - dsolve_2x4_ln_msa(aa - 4, bb - 8, cc, ldc, k - kk); + dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk); kk -= 2; } @@ -1269,7 +1202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, bb = b + 4 * kk; cc = c + ((m & -4) - 4); - dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk); + dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk); kk -= 4; } @@ -1319,7 +1252,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -2) - 2) * k; cc = c + ((m & -2) - 2); - dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk); + dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk); kk -= 2; } @@ -1329,7 +1262,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + 
((m & -4) - 4) * k; cc = c + ((m & -4) - 4); - dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk); + dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk); kk -= 4; } @@ -1377,7 +1310,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -2) - 2) * k + kk * 2; cc = c + ((m & -2) - 2); - dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk); + dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk); kk -= 2; } @@ -1387,7 +1320,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + ((m & -4) - 4) * k; cc = c + ((m & -4) - 4); - dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk); + dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk); kk -= 4; } diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index da35aa8f9..db902c0de 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -48,7 +48,7 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_b, src_b0, src_b1, src_b2, src_b3; @@ -124,22 +124,14 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b += 4; } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); - res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); - res_c9 = (v2f64) __msa_ilvl_d((v2i64) 
src_c12, (v2i64) src_c8); - res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); - res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); - res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); - res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); - res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); - res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); + ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); + ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); + ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); + ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -205,10 +197,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c1, b + 4); ST_DP(res_c9, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); - src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); ST_DP(src_c0, c); ST_DP(src_c4, c_nxt1line); @@ -265,10 +255,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c3, b + 12); ST_DP(res_c11, b + 14); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); - src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); + ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); src_a36 = LD_DP(a + 36); src_a37 = (v2f64) 
__msa_splati_d((v2i64) src_a36, 1); @@ -311,10 +299,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c5, b + 20); ST_DP(res_c13, b + 22); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); - src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); src_a63 = __msa_cast_to_vector_double(*(a + 63)); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); @@ -341,10 +327,8 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c7, b + 28); ST_DP(res_c15, b + 30); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); - src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); - src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); + ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); ST_DP(src_c3, c + 6); ST_DP(src_c7, c_nxt1line + 6); @@ -365,15 +349,21 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; - v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + v2f64 src_b, src_b0, src_b1; - for (i = bk; i--;) + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + a += 8; + b += 2; + + for (i = (bk - 1); i--;) { - LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); - src_b0 = LD_DP(b); + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -387,19 +377,33 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLO src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + a += 8; b += 2; } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -480,10 +484,8 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c2, b + 4); ST_DP(res_c3, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c4, src_c5, c + ldc, 2); @@ -526,10 +528,8 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO 
ST_DP(res_c6, b + 12); ST_DP(res_c7, b + 14); - src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ST_DP2(src_c2, src_c3, c + 4, 2); ST_DP2(src_c6, src_c7, c + 4 + ldc, 2); @@ -539,8 +539,7 @@ static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; - FLOAT a45, a46, a47, a54, a55, a63; - FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1); @@ -551,31 +550,20 @@ static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c6 = *(c + 6); c7 = *(c + 7); - if (bk > 0) + if (bk) { - int i; - FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; + BLASLONG i; for (i = bk; i--; ) { - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; - c4 -= a4 * b0; - c5 -= a5 * b0; - c6 -= a6 * b0; - c7 -= a7 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; a += 8; b += 1; @@ -694,7 +682,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; @@ -725,14 +713,10 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, 
(v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); - res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); - res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); - res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); - res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); + ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); + ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -788,15 +772,10 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c3, b + 12); ST_DP(res_c7, b + 14); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); - src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); - - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); - src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); - src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); + ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); + ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -813,7 +792,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0; @@ -836,10 +815,8 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, 
BLASLO } } - res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); - res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); - res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); - res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); + ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); @@ -878,10 +855,8 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP(res_c2, b + 4); ST_DP(res_c3, b + 6); - src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); - src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); - src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); - src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); + ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); @@ -889,31 +864,23 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - FLOAT c0, c1, c2, c3; - FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, a2, a3, b0; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; @@ -958,8 +925,7 @@ static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0, a1, a3; - FLOAT c0, c1, c0_nxt1, c1_nxt1; + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT 
c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); @@ -971,31 +937,20 @@ static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1, b2, b3; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = b[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = b[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = b[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -1033,21 +988,17 @@ static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; - *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; - *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0, a1, a3; - FLOAT c0, c1, c0_nxt, c1_nxt; + FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); @@ -1055,23 +1006,17 @@ static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = b[1]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -1109,19 +1054,14 @@ static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; 
- - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -1145,63 +1085,60 @@ static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT a0; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c0, c1, c2, c3; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; - c0_nxt1 -= a[0] * b[1]; - c0_nxt2 -= a[0] * b[2]; - c0_nxt3 -= a[0] * b[3]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; a += 1; b += 4; } } - a0 = *a; - - c0 *= a0; - c0_nxt1 *= a0; - c0_nxt2 *= a0; - c0_nxt3 *= a0; + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; *(c + 0 * ldc) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; *(b + 0) = c0; - *(b + 1) = c0_nxt1; - *(b + 2) = c0_nxt2; - *(b + 3) = c0_nxt3; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; } static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT c0, c0_nxt; + FLOAT c0, c1; c0 = *c; - c0_nxt = *(c + ldc); + c1 = *(c + ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= *a * b[0]; - c0_nxt -= *a * b[1]; + c1 -= *a * b[1]; a += 1; b += 2; @@ -1209,18 +1146,18 @@ static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } c0 *= *a; - c0_nxt *= *a; + c1 *= *a; *(b + 0) = c0; - *(b + 1) = c0_nxt; + *(b + 1) = c1; *(c + 0) = c0; - *(c + ldc) = c0_nxt; + *(c + ldc) = c1; } static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk > 0) + if (bk) { BLASLONG i; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index 
659f77266..518daad13 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -43,7 +43,7 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; @@ -200,20 +200,26 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; - v2f64 src_b0, src_b1, src_b3; + v2f64 src_b0, src_b1, src_b3, src_b; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; - v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; - for (i = bk; i--;) + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + a += 8; + b += 2; + + for (i = (bk - 1); i--;) { - LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); - src_b0 = LD_DP(b); + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -227,9 +233,27 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + a += 8; b += 2; } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * 
src_b; + src_c7 -= src_a3 * src_b; } src_b0 = LD_DP(b + 0); @@ -267,7 +291,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_a2, src_a3, src_b; @@ -311,7 +335,7 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; @@ -405,7 +429,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); - if (bk > 0) + if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0; @@ -451,42 +475,33 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - FLOAT b0, c0, c1, c2, c3; + FLOAT c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, a2, a3; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - c2 -= a2 * b0; - c3 -= a3 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; } } - b0 = *b; - - c0 *= b0; - c1 *= b0; - c2 *= b0; - c3 *= b0; + c0 *= *b; + c1 *= *b; + c2 *= *b; + c3 *= *b; *(a + 0) = c0; *(a + 1) = c1; @@ -514,31 +529,20 @@ static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1, b2, b3; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = b[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = b[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = 
b[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -590,13 +594,12 @@ static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(a + 7) = c1_nxt3; *(c + 0) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; - *(c + 1) = c1; + *(c + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } @@ -606,27 +609,20 @@ static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0 = *(c + 0); c1 = *(c + 1); - c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0, b1; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = b[1]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -653,7 +649,7 @@ static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + ldc) = c0_nxt; + *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } @@ -664,19 +660,14 @@ static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + if (bk) { BLASLONG i; - FLOAT a0, a1, b0; for (i = bk; i--;) { - a0 = a[0]; - a1 = a[1]; - - b0 = b[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -697,24 +688,23 @@ static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; - FLOAT c0, c0_nxt1, c0_nxt2, 
c0_nxt3; + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; - c0_nxt1 -= a[0] * b[1]; - c0_nxt2 -= a[0] * b[2]; - c0_nxt3 -= a[0] * b[3]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; a += 1; b += 4; @@ -734,44 +724,44 @@ static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0 *= b0; - c0_nxt1 -= c0 * b1; - c0_nxt1 *= b5; + c1 -= c0 * b1; + c1 *= b5; - c0_nxt2 -= c0 * b2; - c0_nxt2 -= c0_nxt1 * b6; - c0_nxt2 *= b10; + c2 -= c0 * b2; + c2 -= c1 * b6; + c2 *= b10; - c0_nxt3 -= c0 * b3; - c0_nxt3 -= c0_nxt1 * b7; - c0_nxt3 -= c0_nxt2 * b11; - c0_nxt3 *= b15; + c3 -= c0 * b3; + c3 -= c1 * b7; + c3 -= c2 * b11; + c3 *= b15; *(a + 0) = c0; - *(a + 1) = c0_nxt1; - *(a + 2) = c0_nxt2; - *(a + 3) = c0_nxt3; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; *(c + 0) = c0; - *(c + 1 * ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; } static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b1, b3, c0, c0_nxt; + FLOAT b0, b1, b3, c0, c1; c0 = *c; - c0_nxt = *(c + ldc); + c1 = *(c + ldc); - if (bk > 0) + if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= *a * b[0]; - c0_nxt -= *a * b[1]; + c1 -= *a * b[1]; a += 1; b += 2; @@ -784,19 +774,19 @@ static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0 *= b0; - c0_nxt -= c0 * b1; - c0_nxt *= b3; + c1 -= c0 * b1; + c1 *= b3; *(a + 0) = c0; - *(a + 1) = c0_nxt; + *(a + 1) = c1; *(c + 0) = c0; - *(c + ldc) = c0_nxt; + *(c + ldc) = c1; } static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk > 0) + if (bk) { BLASLONG i; diff --git 
a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index a90d5fec3..bef87d44d 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -200,7 +200,7 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } -static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_b0, src_b2, src_b3; @@ -210,10 +210,40 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk if (bk > 0) { - v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3; + v2f64 src_a4, src_a5, src_a6, src_a7; - LD_DP4(a + 16, 2, src_a0, src_a1, src_a2, src_a3); - src_b0 = LD_DP(b + 4); + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pbb); + + for (i = bk - 1; i--;) + { + pba += 8; + pbb += 2; + + LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(pbb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + } src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -228,6 +258,9 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk src_c7 -= src_a3 * src_b; } + a -= 16; + b -= 4; + src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) 
src_b0, 0); src_b2 = LD_DP(b + 2); @@ -256,13 +289,57 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); } -static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3; v2f64 src_b0; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_b1; + + LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(bb); + + aa += 8; + bb += 1; + + for (i = (bk - 1); i--;) + { + LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7); + src_b1 = LD_DP(bb); + + src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a2 * src_b0; + src_c3 -= src_a3 * src_b0; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b1; + + aa += 8; + bb += 1; + } + + src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a2 * src_b0; + src_c3 -= src_a3 * src_b0; + } + + a -= 8; + b -= 1; + src_b0 = __msa_cast_to_vector_double(*b); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); @@ -289,7 +366,7 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 16, *bb = b + 16; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) @@ -318,6 +395,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO } } + a -= 16; + b -= 16; + src_b12 = LD_DP(b + 12); src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); @@ -376,7 +456,7 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, 
BLASLO ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } -static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; @@ -385,20 +465,31 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk if (bk > 0) { + BLASLONG i; + FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0; - LD_DP2(a + 8, 2, src_a0, src_a1); - src_b0 = LD_DP(b + 4); + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + src_b0 = LD_DP(bb); - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); - src_c0 -= src_a0 * src_b; - src_c1 -= src_a1 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); - src_c2 -= src_a0 * src_b; - src_c3 -= src_a1 * src_b; + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + aa += 4; + bb += 2; + } } + a -= 8; + b -= 4; + src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); @@ -420,17 +511,36 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } -static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1, c2, c3; - b0 = *(b + 0); - c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + + aa += 4; + bb += 1; + } + } + + a -= 4; + + b0 = *(b - 1); + c0 *= b0; c1 *= b0; c2 *= b0; @@ -464,35 +574,27 @@ static void 
dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 8, *bb = b + 16; - FLOAT a0, a1, b0, b1, b2, b3; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { - a0 = aa[0]; - a1 = aa[1]; - - b0 = bb[0]; - c0 -= a0 * b0; - c1 -= a1 * b0; - - b1 = bb[1]; - c0_nxt1 -= a0 * b1; - c1_nxt1 -= a1 * b1; - - b2 = bb[2]; - c0_nxt2 -= a0 * b2; - c1_nxt2 -= a1 * b2; - - b3 = bb[3]; - c0_nxt3 -= a0 * b3; - c1_nxt3 -= a1 * b3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } } + a -= 8; + b -= 16; + b0 = *b; b4 = *(b + 4); b5 = *(b + 5); @@ -539,44 +641,44 @@ static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 0) = c0; *(c + 1) = c1; - *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; - *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; - *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } -static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) +static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b2, b3; - FLOAT c0, c1, c0_nxt, c1_nxt; + FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); - c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); if (bk > 0) { - FLOAT a0, a1, b0, b1; + BLASLONG i; + FLOAT *aa = a, *bb = b; - a0 = a[4]; - a1 = a[5]; + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - b0 = b[4]; - c0 -= a0 * b0; - c1 -= a1 * b0; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; - b1 = b[5]; - c0_nxt -= a0 * b1; - c1_nxt -= a1 * b1; + aa += 2; + bb += 2; + } } + a -= 4; + b -= 4; + b3 = *(b + 3); b2 = *(b + 2); b0 = *b; @@ -601,20 +703,35 @@ static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk *(c + 1 + ldc) = c1_nxt; } -static void 
dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - b0 = *b; + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + + aa += 2; + bb += 1; + } + } + + b0 = *(b - 1); c0 *= b0; c1 *= b0; - *(a + 0) = c0; - *(a + 1) = c1; + *(a - 2) = c0; + *(a - 1) = c1; *(c + 0) = c0; *(c + 1) = c1; @@ -622,31 +739,33 @@ static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; - FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3; c0 = *(c + 0); - c0_nxt1 = *(c + 1 * ldc); - c0_nxt2 = *(c + 2 * ldc); - c0_nxt3 = *(c + 3 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); if (bk > 0) { BLASLONG i; - FLOAT *aa = a + 4, *bb = b + 16; + FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; - c0_nxt1 -= aa[0] * bb[1]; - c0_nxt2 -= aa[0] * bb[2]; - c0_nxt3 -= aa[0] * bb[3]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } } + a -= 4; + b -= 16; + b0 = *b; b4 = *(b + 4); b5 = *(b + 5); @@ -658,58 +777,86 @@ static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO b14 = *(b + 14); b15 = *(b + 15); - c0_nxt3 *= b15; + c3 *= b15; - c0_nxt2 -= c0_nxt3 * b14; - c0_nxt2 *= b10; + c2 -= c3 * b14; + c2 *= b10; - c0_nxt1 -= c0_nxt3 * b13; - c0_nxt1 -= c0_nxt2 * b9; - c0_nxt1 *= b5; + c1 -= c3 * b13; + c1 -= c2 * b9; + c1 *= b5; - c0 -= c0_nxt3 * b12; - c0 -= c0_nxt2 * b8; - c0 -= c0_nxt1 * b4; + c0 -= c3 * b12; + c0 -= c2 * b8; + c0 -= c1 * b4; c0 *= b0; *(a + 0) = c0; - *(a + 1) = c0_nxt1; - *(a + 2) = c0_nxt2; - *(a + 3) = c0_nxt3; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; - *(c) = c0; - *(c + 1 * 
ldc) = c0_nxt1; - *(c + 2 * ldc) = c0_nxt2; - *(c + 3 * ldc) = c0_nxt3; + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; } static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - FLOAT b0, b2, b3, c0, c0_nxt; + FLOAT b0, b2, b3, c0, c1; c0 = *(c + 0); - c0_nxt = *(c + ldc); + c1 = *(c + ldc); if (bk > 0) { - c0 -= a[2] * b[4]; - c0_nxt -= a[2] * b[5]; + BLASLONG i; + FLOAT *aa = a, *bb = b; + + for (i = bk; i--;) + { + c0 -= *aa * bb[0]; + c1 -= *aa * bb[1]; + + aa += 1; + bb += 2; + } } + a -= 2; + b -= 4; + b3 = *(b + 3); b2 = *(b + 2); b0 = *b; - c0_nxt *= b3; + c1 *= b3; - c0 -= c0_nxt * b2; + c0 -= c1 * b2; c0 *= b0; *(a + 0) = c0; - *(a + 1) = c0_nxt; + *(a + 1) = c1; *(c + 0) = c0; - *(c + ldc) = c0_nxt; + *(c + ldc) = c1; +} + +static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = 0; i < bk; i++) + { + *c -= a[i] * b[i]; + } + } + + *c *= *(b - 1); + *(a - 1) = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, @@ -729,12 +876,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a; c -= ldc; b -= k; - bb = b + (kk - 1); + bb = b + kk; cc = c; for (i = (m >> 3); i--;) { - dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc); + dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk); aa += 8 * k; cc += 8; @@ -744,7 +891,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (m & 4) { - dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc); + dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk); aa += 4 * k; cc += 4; @@ -752,7 +899,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 2) { - dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc); + dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk); aa += 2 * k; cc += 2; @@ -760,8 +907,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m 
& 1) { - *cc *= *bb; - *(aa + kk - 1) = *cc; + dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk); aa += k; cc += 1; @@ -782,7 +928,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, for (i = (m >> 3); i--;) { - dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk); + dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk); aa += 8 * k; cc += 8; @@ -792,7 +938,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (m & 4) { - dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk); + dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk); aa += 4 * k; cc += 4; @@ -800,7 +946,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 2) { - dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk); + dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk); aa += 2 * k; cc += 2; @@ -808,7 +954,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 1) { - dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk); + dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk); + + aa += k; + cc += 1; } } @@ -836,7 +985,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (m & 4) { - dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk); + dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk); aa += 4 * k; cc += 4; @@ -844,7 +993,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 2) { - dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk); + dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk); aa += 2 * k; cc += 2; @@ -852,7 +1001,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (m & 1) { - dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk); + dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk); aa += k; cc += 1; diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index 3bcc59629..d3a4022d6 100644 --- 
a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -76,4 +76,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of doubleword elements from 'in0' and 'in1' are + interleaved and written to 'out0'; left half likewise to 'out1' +*/ +#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ + out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ +} +#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) + #endif /* __MACROS_MSA_H__ */