From c8a7860eb3ea70e4684d6ab82c2c2a432b33187d Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Mon, 30 May 2016 21:17:00 +0530 Subject: [PATCH] STRSM optimized Signed-off-by: Kaustubh Raste --- kernel/Makefile.L3 | 4 - kernel/mips/macros_msa.h | 6 +- kernel/mips/strsm_kernel_LN_8x8_msa.c | 993 +++++++------------- kernel/mips/strsm_kernel_LT_8x8_msa.c | 993 ++++++-------------- kernel/mips/strsm_kernel_RN_8x8_msa.c | 1246 ++++++++---------------- kernel/mips/strsm_kernel_RT_8x8_msa.c | 1254 +++++++++---------------- 6 files changed, 1447 insertions(+), 3049 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8e6827424..e55f153f5 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -12,10 +12,6 @@ ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif -ifeq ($(ARCH), MIPS) -USE_GEMM3M = 1 -endif - ifeq ($(ARCH), arm) USE_TRMM = 1 endif diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ae85220c6..0efca7860 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -42,6 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) +#define COPY_FLOAT_TO_VECTOR(a, b) \ + b = __msa_cast_to_vector_float(a); \ + b = (v4f32) __msa_splati_w((v4i32) b, 0); + + /* Description : Load 2 vectors of single precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -178,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \ out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \ } - #define TRANSPOSE4x4_SP_SP(...) 
TRANSPOSE4x4_W(v4f32, __VA_ARGS__) #endif /* __MACROS_MSA_H__ */ diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c index 3db7da3c4..516b9752f 100644 --- a/kernel/mips/strsm_kernel_LN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -30,6 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; @@ -47,107 +50,43 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 
* src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; aa += 8; bb += 8; - - for (k = (bk - 1); k--;) - { - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - - aa += 8; - bb += 8; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - 
LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); } a -= 64; @@ -169,25 +108,18 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c7 *= src_a63; res_c15 *= src_a63; - res_c6 -= res_c7 * src_a62; res_c14 -= res_c15 * src_a62; - res_c5 -= res_c7 * src_a61; res_c13 -= res_c15 * src_a61; - res_c4 -= res_c7 * src_a60; res_c12 -= res_c15 * src_a60; - res_c3 -= res_c7 * src_a59; res_c11 -= res_c15 * src_a59; - res_c2 -= res_c7 * src_a58; res_c10 -= res_c15 * src_a58; - res_c1 -= res_c7 * src_a57; res_c9 -= res_c15 * src_a57; - res_c0 -= res_c7 * src_a56; res_c8 -= res_c15 * src_a56; @@ -200,22 +132,16 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 *= src_a54; res_c14 *= src_a54; - res_c5 -= res_c6 * src_a53; res_c13 -= res_c14 * src_a53; - res_c4 -= res_c6 * src_a52; res_c12 -= res_c14 * src_a52; - res_c3 -= res_c6 * src_a51; res_c11 -= res_c14 * src_a51; - res_c2 -= res_c6 * src_a50; res_c10 -= res_c14 * src_a50; - res_c1 -= res_c6 * src_a49; res_c9 -= res_c14 * src_a49; - res_c0 -= res_c6 * src_a48; res_c8 -= res_c14 * src_a48; @@ -227,39 +153,29 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c5 *= src_a45; res_c13 *= src_a45; - res_c4 -= res_c5 * src_a44; res_c12 -= res_c13 * src_a44; - res_c3 -= res_c5 * src_a43; res_c11 -= res_c13 * src_a43; - res_c2 -= res_c5 * src_a42; res_c10 -= res_c13 * src_a42; - res_c1 -= res_c5 * src_a41; res_c9 -= res_c13 * src_a41; - res_c0 -= res_c5 * src_a40; res_c8 -= res_c13 * src_a40; src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - src_a36 = __msa_cast_to_vector_float(*(a + 36)); - src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + 
COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); res_c4 *= src_a36; res_c12 *= src_a36; - res_c3 -= res_c4 * src_a35; res_c11 -= res_c12 * src_a35; - res_c2 -= res_c4 * src_a34; res_c10 -= res_c12 * src_a34; - res_c1 -= res_c4 * src_a33; res_c9 -= res_c12 * src_a33; - res_c0 -= res_c4 * src_a32; res_c8 -= res_c12 * src_a32; @@ -285,13 +201,10 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c3 *= src_a27; res_c11 *= src_a27; - res_c2 -= res_c3 * src_a26; res_c10 -= res_c11 * src_a26; - res_c1 -= res_c3 * src_a25; res_c9 -= res_c11 * src_a25; - res_c0 -= res_c3 * src_a24; res_c8 -= res_c11 * src_a24; @@ -302,23 +215,17 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 *= src_a18; res_c10 *= src_a18; - res_c1 -= res_c2 * src_a17; res_c9 -= res_c10 * src_a17; - res_c0 -= res_c2 * src_a16; res_c8 -= res_c10 * src_a16; - src_a9 = __msa_cast_to_vector_float(*(a + 9)); - src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); - src_a8 = __msa_cast_to_vector_float(*(a + 8)); - src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); + COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c1 *= src_a9; res_c9 *= src_a9; - res_c0 -= res_c1 * src_a8; res_c8 -= res_c9 * src_a8; @@ -345,6 +252,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; @@ -356,65 +266,60 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, 
FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - 
src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; + aa += 8; + bb += 4; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; } a -= 64; @@ -469,8 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); - src_a36 = __msa_cast_to_vector_float(*(a + 36)); - src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -495,12 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c1 -= res_c2 * src_a17; res_c0 -= res_c2 * src_a16; - src_a9 = __msa_cast_to_vector_float(*(a + 9)); - src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); - src_a8 = __msa_cast_to_vector_float(*(a + 8)); - src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9); + COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c1 *= src_a9; res_c0 -= res_c1 * src_a8; @@ -523,6 +424,8 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, 
a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; @@ -546,69 +449,27 @@ static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6_nxt = *(c + 6 + ldc); c7_nxt = *(c + 7 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[16]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + c2_nxt -= aa[2] * bb[1]; + c3_nxt -= aa[3] * bb[1]; + c4_nxt -= aa[4] * bb[1]; + c5_nxt -= aa[5] * bb[1]; + c6_nxt -= aa[6] * bb[1]; + c7_nxt -= aa[7] * bb[1]; - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[2] * bb[0]; - res[3] = aa[3] * bb[0]; - res[4] = aa[4] * bb[0]; - res[5] = aa[5] * bb[0]; - res[6] = aa[6] * bb[0]; - res[7] = aa[7] * bb[0]; - res[8] = aa[0] * bb[1]; - res[9] = aa[1] * bb[1]; - res[10] = aa[2] * bb[1]; - res[11] = aa[3] * bb[1]; - res[12] = aa[4] * bb[1]; - res[13] = aa[5] * bb[1]; - res[14] = aa[6] * bb[1]; - res[15] = aa[7] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 2; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[2] * bb[0]; - res[3] += aa[3] * bb[0]; - res[4] += aa[4] * bb[0]; - res[5] += aa[5] * bb[0]; - res[6] += aa[6] * bb[0]; - res[7] += aa[7] * bb[0]; - res[8] += aa[0] * bb[1]; - res[9] += aa[1] * bb[1]; - res[10] += aa[2] * bb[1]; - res[11] += aa[3] * bb[1]; - res[12] += aa[4] * bb[1]; - res[13] += aa[5] * bb[1]; - res[14] += aa[6] * bb[1]; - res[15] += aa[7] * bb[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c4 -= res[4]; - c5 -= res[5]; - c6 -= res[6]; - c7 -= res[7]; - - c0_nxt -= res[8]; - c1_nxt -= res[9]; - c2_nxt -= res[10]; - c3_nxt -= res[11]; - c4_nxt -= res[12]; - c5_nxt -= res[13]; - c6_nxt -= res[14]; - c7_nxt -= res[15]; + aa += 8; + bb += 2; } a -= 64; @@ -768,6 +629,8 
@@ static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; @@ -782,44 +645,19 @@ static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c6 = *(c + 6); c7 = *(c + 7); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c4 -= aa[4] * bb[0]; + c5 -= aa[5] * bb[0]; + c6 -= aa[6] * bb[0]; + c7 -= aa[7] * bb[0]; - t0 = aa[0] * bb[0]; - t1 = aa[1] * bb[0]; - t2 = aa[2] * bb[0]; - t3 = aa[3] * bb[0]; - t4 = aa[4] * bb[0]; - t5 = aa[5] * bb[0]; - t6 = aa[6] * bb[0]; - t7 = aa[7] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 1; - - t0 += aa[0] * bb[0]; - t1 += aa[1] * bb[0]; - t2 += aa[2] * bb[0]; - t3 += aa[3] * bb[0]; - t4 += aa[4] * bb[0]; - t5 += aa[5] * bb[0]; - t6 += aa[6] * bb[0]; - t7 += aa[7] * bb[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; - c4 -= t4; - c5 -= t5; - c6 -= t6; - c7 -= t7; + aa += 8; + bb += 1; } a -= 64; @@ -927,6 +765,9 @@ static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; @@ -939,79 +780,35 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT 
*c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < bk; k++) + { src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 8; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 
= LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); + aa += 4; + bb += 8; } a -= 16; @@ -1028,12 +825,10 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a5 = __msa_cast_to_vector_float(*(a + 5)); - src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a4 = __msa_cast_to_vector_float(*(a + 4)); - src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); + COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c3 *= src_a15; res_c7 *= src_a15; @@ -1079,6 +874,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; v4f32 src_a13, src_a14, src_a15; @@ -1086,80 +884,48 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3; + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * 
src_b2; + src_c3 -= src_a0 * src_b3; + + aa += 4; + bb += 4; src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; - for (k = ((bk - 1) >> 1); k--;) - { - aa += 4; - bb += 4; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - aa += 4; - bb += 4; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - } - - if ((bk - 1) & 1) - { - aa += 4; - bb += 4; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; + aa += 4; + bb += 4; } - else + + if (bk & 1) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; } a -= 16; @@ -1174,12 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); - src_a5 = 
__msa_cast_to_vector_float(*(a + 5)); - src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a4 = __msa_cast_to_vector_float(*(a + 4)); - src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); - src_a0 = __msa_cast_to_vector_float(*(a + 0)); - src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5); + COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4); + COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0); res_c3 *= src_a15; res_c2 -= res_c3 * src_a14; @@ -1208,6 +971,8 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; @@ -1220,44 +985,19 @@ static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[8]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; + c2_nxt -= aa[2] * bb[1]; + c3_nxt -= aa[3] * bb[1]; - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[2] * bb[0]; - res[3] = aa[3] * bb[0]; - res[4] = aa[0] * bb[1]; - res[5] = aa[1] * bb[1]; - res[6] = aa[2] * bb[1]; - res[7] = aa[3] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 2; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[2] * bb[0]; - res[3] += aa[3] * bb[0]; - res[4] += aa[0] * bb[1]; - res[5] += aa[1] * bb[1]; - res[6] += aa[2] * bb[1]; - res[7] += aa[3] * bb[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c0_nxt -= res[4]; - c1_nxt -= res[5]; - c2_nxt -= res[6]; - c3_nxt -= res[7]; + aa += 4; + bb += 2; } a -= 16; @@ -1325,6 +1065,8 @@ static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, 
FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; c0 = *(c + 0); @@ -1332,32 +1074,15 @@ static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; - t0 = aa[0] * bb[0]; - t1 = aa[1] * bb[0]; - t2 = aa[2] * bb[0]; - t3 = aa[3] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 1; - - t0 += aa[0] * bb[0]; - t1 += aa[1] * bb[0]; - t2 += aa[2] * bb[0]; - t3 += aa[3] * bb[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + aa += 4; + bb += 1; } a -= 16; @@ -1401,6 +1126,8 @@ static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3; FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; FLOAT c0_nxt7, c1_nxt7; @@ -1422,68 +1149,27 @@ static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[16]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + c0_nxt4 -= aa[0] * bb[4]; + c1_nxt4 -= aa[1] * bb[4]; + c0_nxt5 -= aa[0] * bb[5]; + c1_nxt5 -= aa[1] * bb[5]; + c0_nxt6 -= aa[0] * bb[6]; + c1_nxt6 -= aa[1] * bb[6]; + c0_nxt7 -= aa[0] * bb[7]; + c1_nxt7 -= aa[1] * bb[7]; - res[0] = aa[0] * bb[0]; - 
res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - res[8] = aa[0] * bb[4]; - res[9] = aa[1] * bb[4]; - res[10] = aa[0] * bb[5]; - res[11] = aa[1] * bb[5]; - res[12] = aa[0] * bb[6]; - res[13] = aa[1] * bb[6]; - res[14] = aa[0] * bb[7]; - res[15] = aa[1] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 8; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - res[8] += aa[0] * bb[4]; - res[9] += aa[1] * bb[4]; - res[10] += aa[0] * bb[5]; - res[11] += aa[1] * bb[5]; - res[12] += aa[0] * bb[6]; - res[13] += aa[1] * bb[6]; - res[14] += aa[0] * bb[7]; - res[15] += aa[1] * bb[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + aa += 2; + bb += 8; } a -= 4; @@ -1557,6 +1243,8 @@ static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; @@ -1569,44 +1257,19 @@ static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[8]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; 
+ c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 4; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + aa += 2; + bb += 4; } a -= 4; @@ -1652,6 +1315,8 @@ static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); @@ -1659,32 +1324,15 @@ static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1, res2, res3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; - res0 = aa[0] * bb[0]; - res1 = aa[1] * bb[0]; - res2 = aa[0] * bb[1]; - res3 = aa[1] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 2; - - res0 += aa[0] * bb[0]; - res1 += aa[1] * bb[0]; - res2 += aa[0] * bb[1]; - res3 += aa[1] * bb[1]; - } - - c0 -= res0; - c1 -= res1; - c0_nxt -= res2; - c1_nxt -= res3; + aa += 2; + bb += 2; } a -= 4; @@ -1716,31 +1364,20 @@ static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1; c0 = 
*(c + 0); c1 = *(c + 1); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - res0 = aa[0] * bb[0]; - res1 = aa[1] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 1; - - res0 += aa[0] * bb[0]; - res1 += aa[1] * bb[0]; - } - - c0 -= res0; - c1 -= res1; + aa += 2; + bb += 1; } a -= 4; @@ -1764,9 +1401,11 @@ static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7; - c0 = *(c + 0 * ldc); + c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); @@ -1775,44 +1414,19 @@ static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - FLOAT *aa = a, *bb = b; - BLASLONG k; - FLOAT r0, r1, r2, r3, r4, r5, r6, r7; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + c4 -= aa[0] * bb[4]; + c5 -= aa[0] * bb[5]; + c6 -= aa[0] * bb[6]; + c7 -= aa[0] * bb[7]; - r0 = aa[0] * bb[0]; - r1 = aa[0] * bb[1]; - r2 = aa[0] * bb[2]; - r3 = aa[0] * bb[3]; - r4 = aa[0] * bb[4]; - r5 = aa[0] * bb[5]; - r6 = aa[0] * bb[6]; - r7 = aa[0] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 8; - - r0 += aa[0] * bb[0]; - r1 += aa[0] * bb[1]; - r2 += aa[0] * bb[2]; - r3 += aa[0] * bb[3]; - r4 += aa[0] * bb[4]; - r5 += aa[0] * bb[5]; - r6 += aa[0] * bb[6]; - r7 += aa[0] * bb[7]; - } - - c0 -= r0; - c1 -= r1; - c2 -= r2; - c3 -= r3; - c4 -= r4; - c5 -= r5; - c6 -= r6; - c7 -= r7; + aa += 1; + bb += 8; } a0 = *(a - 1); @@ -1845,16 +1459,34 @@ static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO *(c + 7 * ldc) = c7; } -static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +static void 
ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1, c2, c3; + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + a0 = *(a - 1); - c0 = *(c + 0 * ldc) * a0; - c1 = *(c + 1 * ldc) * a0; - c2 = *(c + 2 * ldc) * a0; - c3 = *(c + 3 * ldc) * a0; + c0 *= a0; + c1 *= a0; + c2 *= a0; + c3 *= a0; *(b - 4) = c0; *(b - 3) = c1; @@ -1867,14 +1499,28 @@ static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) *(c + 3 * ldc) = c3; } -static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1; + c0 = *c; + c1 = *(c + ldc); + + for (k = 0; k < bk; k++) + { + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + + aa += 1; + bb += 2; + } + a0 = *(a - 1); - c0 = *(c + 0 * ldc) * a0; - c1 = *(c + 1 * ldc) * a0; + c0 *= a0; + c1 *= a0; *(b - 2) = c0; *(b - 1) = c1; @@ -1883,8 +1529,15 @@ static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) *(c + 1 * ldc) = c1; } -static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c) +static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + + for (k = 0; k < bk; k++) + { + *c -= a[k] * b[k]; + } + *c *= *(a - 1); *(b - 1) = *c; } @@ -1965,7 +1618,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + (m - 1) * k + kk; cc = c + (m - 1); - ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc); + ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); kk -= 1; } @@ -2023,7 +1676,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + (m - 1) * k + kk; cc = c + (m - 1); - ssolve_1x2_ln_msa(aa, b 
+ 2 * kk, cc, ldc); + ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); kk -= 1; } @@ -2057,7 +1710,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, do { - ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, k -kk); + ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; @@ -2081,7 +1734,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, aa = a + (m - 1) * k + kk; cc = c + (m - 1); - ssolve_1x1_ln_msa(aa, b + kk, cc); + ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk)); kk -= 1; } diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c index 0c61d3618..fbce812e6 100644 --- a/kernel/mips/strsm_kernel_LT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; @@ -47,106 +49,43 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(a, 4, src_a0, 
src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; a += 8; b += 8; - - for (k = (bk - 1); k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - - a += 8; - b += 8; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); 
- LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); } TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, @@ -223,8 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - src_a31 = __msa_cast_to_vector_float(*(a + 31)); - src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); res_c3 *= src_a27; res_c11 *= src_a27; @@ -278,12 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c7 -= res_c5 * src_a47; res_c15 -= res_c13 * src_a47; - src_a54 = __msa_cast_to_vector_float(*(a + 54)); - src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); - src_a55 = __msa_cast_to_vector_float(*(a + 55)); - src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); - src_a63 = __msa_cast_to_vector_float(*(a + 63)); - src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); + COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); + COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); res_c6 *= src_a54; res_c14 *= src_a54; @@ -313,6 +248,8 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + 
BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; @@ -324,67 +261,28 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + for (k = 0; k < bk; k++) + { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; a += 8; b += 4; - - for (k = (bk - 1); k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - a += 8; - b += 4; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 
-= res7; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); } TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, @@ -436,8 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); - src_a31 = __msa_cast_to_vector_float(*(a + 31)); - src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; @@ -462,12 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a54 = __msa_cast_to_vector_float(*(a + 54)); - src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); - src_a55 = __msa_cast_to_vector_float(*(a + 55)); - src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); - src_a63 = __msa_cast_to_vector_float(*(a + 63)); - src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54); + COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55); + COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; @@ -490,6 +384,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63; @@ -513,67 +408,24 @@ static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6_nxt = *(c + 6 + ldc); c7_nxt = *(c + 7 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[16]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[2] * b[0]; - res[3] = a[3] * b[0]; - res[4] = 
a[4] * b[0]; - res[5] = a[5] * b[0]; - res[6] = a[6] * b[0]; - res[7] = a[7] * b[0]; - res[8] = a[0] * b[1]; - res[9] = a[1] * b[1]; - res[10] = a[2] * b[1]; - res[11] = a[3] * b[1]; - res[12] = a[4] * b[1]; - res[13] = a[5] * b[1]; - res[14] = a[6] * b[1]; - res[15] = a[7] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 8; - b += 2; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[2] * b[0]; - res[3] += a[3] * b[0]; - res[4] += a[4] * b[0]; - res[5] += a[5] * b[0]; - res[6] += a[6] * b[0]; - res[7] += a[7] * b[0]; - res[8] += a[0] * b[1]; - res[9] += a[1] * b[1]; - res[10] += a[2] * b[1]; - res[11] += a[3] * b[1]; - res[12] += a[4] * b[1]; - res[13] += a[5] * b[1]; - res[14] += a[6] * b[1]; - res[15] += a[7] * b[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c4 -= res[4]; - c5 -= res[5]; - c6 -= res[6]; - c7 -= res[7]; - c0_nxt -= res[8]; - c1_nxt -= res[9]; - c2_nxt -= res[10]; - c3_nxt -= res[11]; - c4_nxt -= res[12]; - c5_nxt -= res[13]; - c6_nxt -= res[14]; - c7_nxt -= res[15]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + c2_nxt -= a[2] * b[1]; + c3_nxt -= a[3] * b[1]; + c4_nxt -= a[4] * b[1]; + c5_nxt -= a[5] * b[1]; + c6_nxt -= a[6] * b[1]; + c7_nxt -= a[7] * b[1]; a += 8; b += 2; @@ -733,6 +585,7 @@ static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; @@ -746,43 +599,16 @@ static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c6 = *(c + 6); c7 = *(c + 7); - if (bk > 0) + for (k = 0; k < bk; 
k++) { - BLASLONG i; - FLOAT a0, a1, a2, a3, a4, a5, a6, a7; - - a0 = a[0] * b[0]; - a1 = a[1] * b[0]; - a2 = a[2] * b[0]; - a3 = a[3] * b[0]; - a4 = a[4] * b[0]; - a5 = a[5] * b[0]; - a6 = a[6] * b[0]; - a7 = a[7] * b[0]; - - for (i = (bk - 1); i--; ) - { - a += 8; - b += 1; - - a0 += a[0] * b[0]; - a1 += a[1] * b[0]; - a2 += a[2] * b[0]; - a3 += a[3] * b[0]; - a4 += a[4] * b[0]; - a5 += a[5] * b[0]; - a6 += a[6] * b[0]; - a7 += a[7] * b[0]; - } - - c0 -= a0; - c1 -= a1; - c2 -= a2; - c3 -= a3; - c4 -= a4; - c5 -= a5; - c6 -= a6; - c7 -= a7; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c4 -= a[4] * b[0]; + c5 -= a[5] * b[0]; + c6 -= a[6] * b[0]; + c7 -= a[7] * b[0]; a += 8; b += 1; @@ -890,6 +716,8 @@ static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; @@ -902,81 +730,76 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); +
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; a += 4; b += 8; - - for (k = (bk - 1); k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; } - else + + if (bk & 1) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, 
src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; + + a += 4; + b += 8; } TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, @@ -990,12 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a10 = __msa_cast_to_vector_float(*(a + 10)); - src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); - src_a11 = __msa_cast_to_vector_float(*(a + 11)); - src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); - src_a15 = __msa_cast_to_vector_float(*(a + 15)); - src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); + COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); + COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); res_c0 *= src_a0; res_c4 *= src_a0; @@ -1041,6 +861,8 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; v4f32 src_a10, src_a11, src_a15, src_a; @@ -1048,82 +870,51 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, 
res3; + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; a += 4; b += 4; - - for (k = (bk - 1) >> 1; k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - } - - if ((bk - 1) & 1) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; } - else + + if (bk & 1) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; } TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, @@ -1135,12 +926,9 @@ static void 
ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); - src_a10 = __msa_cast_to_vector_float(*(a + 10)); - src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); - src_a11 = __msa_cast_to_vector_float(*(a + 11)); - src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); - src_a15 = __msa_cast_to_vector_float(*(a + 15)); - src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10); + COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11); + COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; @@ -1169,6 +957,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; @@ -1181,43 +970,16 @@ static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[8]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[2] * b[0]; - res[3] = a[3] * b[0]; - res[4] = a[0] * b[1]; - res[5] = a[1] * b[1]; - res[6] = a[2] * b[1]; - res[7] = a[3] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 4; - b += 2; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[2] * b[0]; - res[3] += a[3] * b[0]; - res[4] += a[0] * b[1]; - res[5] += a[1] * b[1]; - res[6] += a[2] * b[1]; - res[7] += a[3] * b[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c2 -= res[2]; - c3 -= res[3]; - c0_nxt -= res[4]; - c1_nxt -= res[5]; - c2_nxt -= res[6]; - c3_nxt -= res[7]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; + c2_nxt 
-= a[2] * b[1]; + c3_nxt -= a[3] * b[1]; a += 4; b += 2; @@ -1285,6 +1047,7 @@ static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; c0 = *(c + 0); @@ -1292,31 +1055,12 @@ static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT t0, t1, t2, t3; - - t0 = a[0] * b[0]; - t1 = a[1] * b[0]; - t2 = a[2] * b[0]; - t3 = a[3] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 4; - b += 1; - - t0 += a[0] * b[0]; - t1 += a[1] * b[0]; - t2 += a[2] * b[0]; - t3 += a[3] * b[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; @@ -1360,6 +1104,7 @@ static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2; FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5; FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7; @@ -1381,67 +1126,24 @@ static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[16]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - res[8] = a[0] * b[4]; - res[9] = a[1] * b[4]; - res[10] = a[0] * b[5]; - res[11] = a[1] * b[5]; - res[12] = a[0] * b[6]; - res[13] = a[1] * b[6]; - res[14] = a[0] * b[7]; - res[15] = a[1] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 8; - - res[0] += a[0] * b[0]; - res[1] += a[1] * 
b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - res[8] += a[0] * b[4]; - res[9] += a[1] * b[4]; - res[10] += a[0] * b[5]; - res[11] += a[1] * b[5]; - res[12] += a[0] * b[6]; - res[13] += a[1] * b[6]; - res[14] += a[0] * b[7]; - res[15] += a[1] * b[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + c0_nxt4 -= a[0] * b[4]; + c1_nxt4 -= a[1] * b[4]; + c0_nxt5 -= a[0] * b[5]; + c1_nxt5 -= a[1] * b[5]; + c0_nxt6 -= a[0] * b[6]; + c1_nxt6 -= a[1] * b[6]; + c0_nxt7 -= a[0] * b[7]; + c1_nxt7 -= a[1] * b[7]; a += 2; b += 8; @@ -1512,6 +1214,7 @@ static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; @@ -1524,43 +1227,16 @@ static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[8]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 4; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - 
res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -1605,6 +1281,7 @@ static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); @@ -1612,32 +1289,12 @@ static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1, res2, res3; - - res0 = a[0] * b[0]; - res1 = a[1] * b[0]; - res2 = a[0] * b[1]; - res3 = a[1] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 2; - - res0 += a[0] * b[0]; - res1 += a[1] * b[0]; - res2 += a[0] * b[1]; - res3 += a[1] * b[1]; - } - - c0 -= res0; - c1 -= res1; - - c0_nxt -= res2; - c1_nxt -= res3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b += 2; @@ -1667,30 +1324,16 @@ static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[1] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 1; - - res0 += a[0] * b[0]; - res1 += a[1] * b[0]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -1710,69 +1353,64 @@ static void 
ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - if (bk > 0) + BLASLONG k; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT c0, c1, c2, c3, c4, c5, c6, c7; - - c0 = a[0] * b[0]; - c1 = a[0] * b[1]; - c2 = a[0] * b[2]; - c3 = a[0] * b[3]; - c4 = a[0] * b[4]; - c5 = a[0] * b[5]; - c6 = a[0] * b[6]; - c7 = a[0] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 8; - - c0 += a[0] * b[0]; - c1 += a[0] * b[1]; - c2 += a[0] * b[2]; - c3 += a[0] * b[3]; - c4 += a[0] * b[4]; - c5 += a[0] * b[5]; - c6 += a[0] * b[6]; - c7 += a[0] * b[7]; - } - - *(c + 0 * ldc) -= c0; - *(c + 1 * ldc) -= c1; - *(c + 2 * ldc) -= c2; - *(c + 3 * ldc) -= c3; - *(c + 4 * ldc) -= c4; - *(c + 5 * ldc) -= c5; - *(c + 6 * ldc) -= c6; - *(c + 7 * ldc) -= c7; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + c4 -= a[0] * b[4]; + c5 -= a[0] * b[5]; + c6 -= a[0] * b[6]; + c7 -= a[0] * b[7]; a += 1; b += 8; } - *c *= *a; - *(c + ldc) *= *a; - *(c + 2 * ldc) *= *a; - *(c + 3 * ldc) *= *a; - *(c + 4 * ldc) *= *a; - *(c + 5 * ldc) *= *a; - *(c + 6 * ldc) *= *a; - *(c + 7 * ldc) *= *a; + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + c4 *= *a; + c5 *= *a; + c6 *= *a; + c7 *= *a; - *b = *c; - *(b + 1) = *(c + ldc); - *(b + 2) = *(c + 2 * ldc); - *(b + 3) = *(c + 3 * ldc); - *(b + 4) = *(c + 4 * ldc); - *(b + 5) = *(c + 5 * ldc); - *(b + 6) = *(c + 6 * ldc); - *(b + 7) = *(c + 7 * ldc); + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = 
c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; } static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1, c2, c3; c0 = *(c + 0 * ldc); @@ -1780,31 +1418,13 @@ static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1, res2, res3; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - res2 = a[0] * b[2]; - res3 = a[0] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 4; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - res2 += a[0] * b[2]; - res3 += a[0] * b[3]; - } - - c0 -= res0; - c1 -= res1; - c2 -= res2; - c3 -= res3; a += 1; b += 4; } @@ -1827,30 +1447,16 @@ static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT c0, c1; c0 = *c; c1 = *(c + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 2; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; a += 1; b += 2; @@ -1865,22 +1471,11 @@ static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk) + BLASLONG k; + + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res; - - res = a[0] * b[0]; - - for (k = (bk - 1); k--;) - { - a++; - b++; - - res += a[0] * b[0]; - } - - *c -= res; + *c -= a[0] * b[0]; a++; b++; diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c index 04bca1b12..69d7b5f72 100644 --- a/kernel/mips/strsm_kernel_RN_8x8_msa.c +++ 
b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_a0, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; @@ -45,105 +47,43 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + 
src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; a += 8; b += 8; - - for (k = (bk - 1); k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - - a += 8; - b += 8; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); } src_b = LD_SP(b + 0); @@ -204,8 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, 
FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - src_b31 = __msa_cast_to_vector_float(*(b + 31)); - src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); src_c4 *= src_b18; src_c5 *= src_b18; @@ -245,12 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - src_b54 = __msa_cast_to_vector_float(*(b + 54)); - src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); - src_b55 = __msa_cast_to_vector_float(*(b + 55)); - src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); - src_b63 = __msa_cast_to_vector_float(*(b + 63)); - src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); + COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); + COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); src_c8 *= src_b36; src_c9 *= src_b36; @@ -291,108 +227,71 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; - v4f32 src_b10, src_b11, src_b15, src_b; + v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk) + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * 
src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + a += 8; + b += 4; LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; a += 8; b += 4; - - for (k = (bk - 1) / 2; k--;) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - a += 8; - b += 4; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - a += 8; - b += 4; - } - - if ((bk - 1) & 1) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - a += 8; - b += 4; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, 
src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + a += 8; + b += 4; } src_b = LD_SP(b + 0); @@ -401,12 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b10 = __msa_cast_to_vector_float(*(b + 10)); - src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); - src_b11 = __msa_cast_to_vector_float(*(b + 11)); - src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); - src_b15 = __msa_cast_to_vector_float(*(b + 15)); - src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); + COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); + COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); src_c0 *= src_b0; src_c1 *= src_b0; @@ -443,100 +339,62 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + v4f32 src_a0, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk) - { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1, res2, res3; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + for (k = 
0; k < (bk >> 1); k++) + { LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; a += 8; b += 2; - for (k = (bk - 1) / 2; k--;) - { - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; - a += 8; - b += 2; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - - a += 8; - b += 2; - } - - if ((bk - 1) & 1) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - - a += 8; - b += 2; - } - - LD_SP2(c, 4, src_c0, 
src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; + a += 8; + b += 2; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(a, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + a += 8; + b += 2; } - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); src_c0 *= src_b0; src_c1 *= src_b0; @@ -552,125 +410,94 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0; + BLASLONG k; + v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; - if (bk) + LD_SP2(c, 4, src_c0, src_c1); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - v4f32 src_a0, src_a1, res0, res1; - LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; a += 8; b += 1; - for (k = (bk - 1) >> 2; k--;) - { - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= 
src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - a += 8; - b += 1; + a += 8; + b += 1; - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - a += 8; - b += 1; + a += 8; + b += 1; - LD_SP2(a, 4, src_a0, src_a1); + LD_SP2(a, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - a += 8; - b += 1; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - a += 8; - b += 1; - } - - if ((bk - 1) & 3) - { - if ((bk - 1) & 2) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - a += 8; - b += 1; - - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - a += 8; - b += 1; - } - - if ((bk - 1) & 1) - { - LD_SP2(a, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*b); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - a += 8; - b += 1; - } - } - - LD_SP2(c, 4, src_c0, src_c1); - - src_c0 -= res0; - src_c1 -= res1; + a += 8; + b += 1; } - else + + if (bk & 3) { - LD_SP2(c, 4, src_c0, src_c1); + if (bk & 2) + { + LD_SP2(a, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= 
src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + } + + if (bk & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + a += 8; + b += 1; + } } - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c0 *= src_b0; src_c1 *= src_b0; @@ -681,12 +508,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; - v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; @@ -695,121 +523,35 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk) - { - BLASLONG k; - v4f32 src_a0, res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < bk; k++) + { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = 
src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; a += 4; b += 8; - - for (k = (bk - 1) / 2; k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - } - - if ((bk - 1) & 1) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(b + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - - a += 4; - b += 8; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = 
LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); } src_b = LD_SP(b + 0); @@ -832,8 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); - src_b31 = __msa_cast_to_vector_float(*(b + 31)); - src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31); src_b = LD_SP(b + 36); SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); @@ -843,12 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); - src_b54 = __msa_cast_to_vector_float(*(b + 54)); - src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); - src_b55 = __msa_cast_to_vector_float(*(b + 55)); - src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); - src_b63 = __msa_cast_to_vector_float(*(b + 63)); - src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54); + COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55); + COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -909,87 +647,58 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3; - v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b; + v4f32 
src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - v4f32 src_a0, res0, res1, res2, res3; + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; a += 4; b += 4; - - for (k = ((bk - 1) >> 1); k--;) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - } - - if ((bk - 1) & 1) - { - src_a0 = LD_SP(a); - - src_b = LD_SP(b + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - a += 4; - b += 4; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; } - else + + if (bk & 1) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = 
LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; + + a += 4; + b += 4; } src_b = LD_SP(b + 0); @@ -998,12 +707,9 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b10 = __msa_cast_to_vector_float(*(b + 10)); - src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); - src_b11 = __msa_cast_to_vector_float(*(b + 11)); - src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); - src_b15 = __msa_cast_to_vector_float(*(b + 15)); - src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10); + COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11); + COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -1029,82 +735,105 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0, src_b1, src_b3; + BLASLONG k; + v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - v4f32 src_a, res0, res1; + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 = src_a * src_b0; - res1 = src_a * src_b1; + src_c0 -= src_a * src_b0; + 
src_c1 -= src_a * src_b1; a += 4; b += 2; - for (k = ((bk - 1) >> 1); k--;) - { - src_a = LD_SP(a); - src_b0 = LD_SP(b); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - a += 4; - b += 2; + a += 4; + b += 2; - src_a = LD_SP(a); - src_b0 = LD_SP(b); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - a += 4; - b += 2; - } - - if ((bk - 1) & 1) - { - src_a = LD_SP(a); - src_b0 = LD_SP(b); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a * src_b0; - res1 += src_a * src_b1; - - a += 4; - b += 2; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - - src_c0 -= res0; - src_c1 -= res1; + a += 4; + b += 2; } - else + + if (bk & 3) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); + if (bk & 2) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + } + + if (bk & 1) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + 
src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + a += 4; + b += 2; + } } - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(b + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; @@ -1118,6 +847,7 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, c0, c1, c2, c3; c0 = *(c + 0); @@ -1125,31 +855,12 @@ static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2); c3 = *(c + 3); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT t0, t1, t2, t3; - - t0 = a[0] * b[0]; - t1 = a[1] * b[0]; - t2 = a[2] * b[0]; - t3 = a[3] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 4; - b += 1; - - t0 += a[0] * b[0]; - t1 += a[1] * b[0]; - t2 += a[2] * b[0]; - t3 += a[3] * b[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c2 -= a[2] * b[0]; + c3 -= a[3] * b[0]; a += 4; b += 1; @@ -1175,6 +886,7 @@ static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31; FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63; @@ -1199,67 +911,24 @@ static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 
* ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[16]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - res[8] = a[0] * b[4]; - res[9] = a[1] * b[4]; - res[10] = a[0] * b[5]; - res[11] = a[1] * b[5]; - res[12] = a[0] * b[6]; - res[13] = a[1] * b[6]; - res[14] = a[0] * b[7]; - res[15] = a[1] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 8; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - res[8] += a[0] * b[4]; - res[9] += a[1] * b[4]; - res[10] += a[0] * b[5]; - res[11] += a[1] * b[5]; - res[12] += a[0] * b[6]; - res[13] += a[1] * b[6]; - res[14] += a[0] * b[7]; - res[15] += a[1] * b[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; + c0_nxt4 -= a[0] * b[4]; + c1_nxt4 -= a[1] * b[4]; + c0_nxt5 -= a[0] * b[5]; + c1_nxt5 -= a[1] * b[5]; + c0_nxt6 -= a[0] * b[6]; + c1_nxt6 -= a[1] * b[6]; + c0_nxt7 -= a[0] * b[7]; + c1_nxt7 -= a[1] * b[7]; a += 2; b += 8; @@ -1447,6 +1116,7 @@ static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1; FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, 
c1_nxt2, c1_nxt3; @@ -1459,43 +1129,16 @@ static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[8]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - res[4] = a[0] * b[2]; - res[5] = a[1] * b[2]; - res[6] = a[0] * b[3]; - res[7] = a[1] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 4; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - res[4] += a[0] * b[2]; - res[5] += a[1] * b[2]; - res[6] += a[0] * b[3]; - res[7] += a[1] * b[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c1_nxt1 -= a[1] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c1_nxt2 -= a[1] * b[2]; + c0_nxt3 -= a[0] * b[3]; + c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; @@ -1557,6 +1200,7 @@ static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; c0 = *(c + 0); @@ -1564,31 +1208,12 @@ static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res[4]; - - res[0] = a[0] * b[0]; - res[1] = a[1] * b[0]; - res[2] = a[0] * b[1]; - res[3] = a[1] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 2; - - res[0] += a[0] * b[0]; - res[1] += a[1] * b[0]; - res[2] += a[0] * b[1]; - res[3] += a[1] * b[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt -= res[2]; - c1_nxt -= res[3]; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; + c0_nxt -= a[0] * b[1]; + c1_nxt -= a[1] * b[1]; a += 2; b 
+= 2; @@ -1620,30 +1245,16 @@ static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[1] * b[0]; - - for (k = (bk - 1); k--;) - { - a += 2; - b += 1; - - res0 += a[0] * b[0]; - res1 += a[1] * b[0]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[1] * b[0]; a += 2; b += 1; @@ -1663,6 +1274,7 @@ static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38; FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7; @@ -1676,43 +1288,16 @@ static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT t0, t1, t2, t3, t4, t5, t6, t7; - - t0 = a[0] * b[0]; - t1 = a[0] * b[1]; - t2 = a[0] * b[2]; - t3 = a[0] * b[3]; - t4 = a[0] * b[4]; - t5 = a[0] * b[5]; - t6 = a[0] * b[6]; - t7 = a[0] * b[7]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 8; - - t0 += a[0] * b[0]; - t1 += a[0] * b[1]; - t2 += a[0] * b[2]; - t3 += a[0] * b[3]; - t4 += a[0] * b[4]; - t5 += a[0] * b[5]; - t6 += a[0] * b[6]; - t7 += a[0] * b[7]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; - c4 -= t4; - c5 -= t5; - c6 -= t6; - c7 -= t7; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; + c4 -= a[0] * b[4]; + c5 -= a[0] * b[5]; + c6 -= a[0] * b[6]; + c7 -= a[0] * b[7]; a += 1; b += 8; @@ -1820,6 +1405,7 @@ static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLO static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; c0 = *(c + 0); @@ -1827,31 +1413,12 @@ static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1, res2, res3; - - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - res2 = a[0] * b[2]; - res3 = a[0] * b[3]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 4; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - res2 += a[0] * b[2]; - res3 += a[0] * b[3]; - } - - c0 -= res0; - c1 -= res1; - c2 -= res2; - c3 -= res3; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; + c2 -= a[0] * b[2]; + c3 -= a[0] * b[3]; a += 1; b += 4; @@ -1895,30 +1462,16 @@ static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; FLOAT b0, b1, b3, c0, c1; - c0 = *(c + 0); + c0 = *c; c1 = *(c + ldc); - if (bk) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res0, res1; - - res0 = a[0] * b[0]; - res1 = a[0] * b[1]; - - for (k = (bk - 1); k--;) - { - a += 1; - b += 2; - - res0 += a[0] * b[0]; - res1 += a[0] * b[1]; - } - - c0 -= res0; - c1 -= res1; + c0 -= a[0] * b[0]; + c1 -= a[0] * b[1]; a += 1; b += 2; @@ -1942,22 +1495,11 @@ static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk) + BLASLONG k; + + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT res; - - res = a[0] * b[0]; - - for (k = (bk - 1); k--;) - { - a++; - b++; - - res += a[0] * b[0]; - } - - *c -= res; + *c -= a[0] * b[0]; a++; b++; diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c index 25a8a0b6e..eefd3a665 100644 --- 
a/kernel/mips/strsm_kernel_RT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -30,6 +30,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; @@ -45,104 +48,43 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; - v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + for (k = 0; k < bk; k++) + { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 = src_a0 * src_b0; - res9 = src_a1 * src_b0; - 
res10 = src_a0 * src_b1; - res11 = src_a1 * src_b1; - res12 = src_a0 * src_b2; - res13 = src_a1 * src_b2; - res14 = src_a0 * src_b3; - res15 = src_a1 * src_b3; + src_c8 -= src_a0 * src_b0; + src_c9 -= src_a1 * src_b0; + src_c10 -= src_a0 * src_b1; + src_c11 -= src_a1 * src_b1; + src_c12 -= src_a0 * src_b2; + src_c13 -= src_a1 * src_b2; + src_c14 -= src_a0 * src_b3; + src_c15 -= src_a1 * src_b3; - for (k = (bk - 1); k--;) - { - aa += 8; - bb += 8; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res8 += src_a0 * src_b0; - res9 += src_a1 * src_b0; - res10 += src_a0 * src_b1; - res11 += src_a1 * src_b1; - res12 += src_a0 * src_b2; - res13 += src_a1 * src_b2; - res14 += src_a0 * src_b3; - res15 += src_a1 * src_b3; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - src_c8 -= res8; - src_c9 -= res9; - src_c10 -= res10; - src_c11 -= res11; - src_c12 -= res12; - src_c13 -= res13; - src_c14 -= res14; - src_c15 -= res15; - } - else - { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - LD_SP2(c_nxt4line, 4, src_c8, src_c9); - LD_SP2(c_nxt5line, 4, src_c10, src_c11); - LD_SP2(c_nxt6line, 4, 
src_c12, src_c13); - LD_SP2(c_nxt7line, 4, src_c14, src_c15); + aa += 8; + bb += 8; } b -= 64; @@ -216,8 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); - src_b36 = __msa_cast_to_vector_float(*(b + 36)); - src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); src_c8 *= src_b36; src_c9 *= src_b36; @@ -262,12 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); - src_b9 = __msa_cast_to_vector_float(*(b + 9)); - src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); - src_b8 = __msa_cast_to_vector_float(*(b + 8)); - src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); + COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c2 *= src_b9; src_c3 *= src_b9; @@ -285,6 +223,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12; v4f32 src_b13, src_b14, src_b15; @@ -292,103 +233,60 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 
src_a0, src_a1, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; + + aa += 8; + bb += 4; LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; - res4 = src_a0 * src_b2; - res5 = src_a1 * src_b2; - res6 = src_a0 * src_b3; - res7 = src_a1 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; - for (k = (bk - 1) / 2; k--;) - { - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - } - - if ((bk - 1) & 1) - { - aa += 8; - bb += 4; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += 
src_a0 * src_b1; - res3 += src_a1 * src_b1; - res4 += src_a0 * src_b2; - res5 += src_a1 * src_b2; - res6 += src_a0 * src_b3; - res7 += src_a1 * src_b3; - - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; + aa += 8; + bb += 4; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - LD_SP2(c_nxt2line, 4, src_c4, src_c5); - LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + src_c4 -= src_a0 * src_b2; + src_c5 -= src_a1 * src_b2; + src_c6 -= src_a0 * src_b3; + src_c7 -= src_a1 * src_b3; } a -= 32; @@ -400,12 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b5 = __msa_cast_to_vector_float(*(b + 5)); - src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b4 = __msa_cast_to_vector_float(*(b + 4)); - src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); + COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c7 *= src_b15; src_c6 *= src_b15; @@ -442,101 +337,63 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 
src_a0, src_a1, src_b1; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk > 0) + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_a1, src_b1, res0, res1, res2, res3; + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; + + aa += 8; + bb += 2; LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; - res2 = src_a0 * src_b1; - res3 = src_a1 * src_b1; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; - for (k = (bk - 1) >> 1; k--;) - { - aa += 8; - bb += 2; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - - aa += 8; - bb += 2; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - } - - if ((bk - 1) & 1) - { - aa += 8; - bb += 2; - - LD_SP2(aa, 4, src_a0, 
src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b1 = __msa_cast_to_vector_float(*(bb + 1)); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - res2 += src_a0 * src_b1; - res3 += src_a1 * src_b1; - } - - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; + aa += 8; + bb += 2; } - else + + if (bk & 1) { - LD_SP2(c, 4, src_c0, src_c1); - LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + src_c2 -= src_a0 * src_b1; + src_c3 -= src_a1 * src_b1; } a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - src_b2 = __msa_cast_to_vector_float(*(b + 2)); - src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); + COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); src_c2 *= src_b3; src_c3 *= src_b3; @@ -552,126 +409,95 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0; + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; - if (bk > 0) + LD_SP2(c, 4, src_c0, src_c1); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_a1, res0, res1; + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = 
__msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 = src_a0 * src_b0; - res1 = src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - for (k = (bk - 1) >> 2; k--;) - { - aa += 8; - bb += 1; + aa += 8; + bb += 1; - LD_SP2(aa, 4, src_a0, src_a1); + LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - aa += 8; - bb += 1; + aa += 8; + bb += 1; - LD_SP2(aa, 4, src_a0, src_a1); + LD_SP2(aa, 4, src_a0, src_a1); - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - } - - if ((bk - 1) & 3) - { - if ((bk - 1) & 2) - { - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); - - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - } - - if ((bk - 1) & 1) - { - aa += 8; - bb += 1; - - LD_SP2(aa, 4, src_a0, src_a1); 
- - src_b0 = __msa_cast_to_vector_float(*bb); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a0 * src_b0; - res1 += src_a1 * src_b0; - } - } - - LD_SP2(c, 4, src_c0, src_c1); - - src_c0 -= res0; - src_c1 -= res1; + aa += 8; + bb += 1; } - else + + if (bk & 3) { - LD_SP2(c, 4, src_c0, src_c1); + if (bk & 2) + { + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + + aa += 8; + bb += 1; + } + + if (bk & 1) + { + LD_SP2(aa, 4, src_a0, src_a1); + + COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0); + + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a1 * src_b0; + } } a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c0 *= src_b0; src_c1 *= src_b0; @@ -682,6 +508,9 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; @@ -696,79 +525,35 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; - if (bk > 0) - { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a0, src_b1, src_b2, src_b3; - v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + 
src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + for (k = 0; k < bk; k++) + { src_a0 = LD_SP(aa); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a0 * src_b0; - res1 = src_a0 * src_b1; - res2 = src_a0 * src_b2; - res3 = src_a0 * src_b3; + src_c0 -= src_a0 * src_b0; + src_c1 -= src_a0 * src_b1; + src_c2 -= src_a0 * src_b2; + src_c3 -= src_a0 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 = src_a0 * src_b0; - res5 = src_a0 * src_b1; - res6 = src_a0 * src_b2; - res7 = src_a0 * src_b3; + src_c4 -= src_a0 * src_b0; + src_c5 -= src_a0 * src_b1; + src_c6 -= src_a0 * src_b2; + src_c7 -= src_a0 * src_b3; - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 8; - - src_a0 = LD_SP(aa); - - src_b = LD_SP(bb + 0); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 += src_a0 * src_b0; - res1 += src_a0 * src_b1; - res2 += src_a0 * src_b2; - res3 += src_a0 * src_b3; - - src_b = LD_SP(bb + 4); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res4 += src_a0 * src_b0; - res5 += src_a0 * src_b1; - res6 += src_a0 * src_b2; - res7 += src_a0 * src_b3; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; - src_c4 -= res4; - src_c5 -= res5; - src_c6 -= res6; - src_c7 -= res7; - } - else - { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - src_c4 = LD_SP(c_nxt4line); - src_c5 = LD_SP(c_nxt5line); - src_c6 = LD_SP(c_nxt6line); - src_c7 = LD_SP(c_nxt7line); + aa += 4; + bb += 8; } a -= 32; @@ -794,8 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, 
src_b35); - src_b36 = __msa_cast_to_vector_float(*(b + 36)); - src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36); src_b = LD_SP(b + 24); SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); @@ -805,12 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); - src_b9 = __msa_cast_to_vector_float(*(b + 9)); - src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); - src_b8 = __msa_cast_to_vector_float(*(b + 8)); - src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9); + COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c7 *= src_b63; src_c6 -= src_c7 * src_b62; @@ -871,89 +652,60 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; v4f32 src_c0, src_c1, src_c2, src_c3, src_b; v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; - v4f32 src_b14, src_b15; + v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; - if (bk > 0) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + for (k = 0; k < (bk >> 1); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a, src_b1, src_b2, src_b3, res0, res1, res2, res3; + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; + + aa += 4; + bb += 4; src_a = LD_SP(aa); src_b = 
LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - res0 = src_a * src_b0; - res1 = src_a * src_b1; - res2 = src_a * src_b2; - res3 = src_a * src_b3; - for (k = ((bk - 1) >> 1); k--;) - { - aa += 4; - bb += 4; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; - src_a = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a * src_b0; - res1 += src_a * src_b1; - res2 += src_a * src_b2; - res3 += src_a * src_b3; - - aa += 4; - bb += 4; - - src_a = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a * src_b0; - res1 += src_a * src_b1; - res2 += src_a * src_b2; - res3 += src_a * src_b3; - } - - if ((bk - 1) & 1) - { - aa += 4; - bb += 4; - - src_a = LD_SP(aa); - - src_b = LD_SP(bb); - SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); - - res0 += src_a * src_b0; - res1 += src_a * src_b1; - res2 += src_a * src_b2; - res3 += src_a * src_b3; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); - - src_c0 -= res0; - src_c1 -= res1; - src_c2 -= res2; - src_c3 -= res3; + aa += 4; + bb += 4; } - else + + if (bk & 1) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - src_c2 = LD_SP(c_nxt2line); - src_c3 = LD_SP(c_nxt3line); + src_a = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + src_c2 -= src_a * src_b2; + src_c3 -= src_a * src_b3; } a -= 16; @@ -965,12 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); - src_b5 = __msa_cast_to_vector_float(*(b + 5)); - src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); - src_b4 = 
__msa_cast_to_vector_float(*(b + 4)); - src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5); + COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c3 *= src_b15; src_c2 -= src_c3 * src_b14; @@ -996,83 +745,106 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { - v4f32 src_c0, src_c1, src_b0, src_b2, src_b3; + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; - if (bk > 0) + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + for (k = 0; k < (bk >> 2); k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_a, src_b1, res0, res1; + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 = src_a * src_b0; - res1 = src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - for (k = ((bk - 1) >> 1); k--;) - { - aa += 4; - bb += 2; + aa += 4; + bb += 2; - src_a = LD_SP(aa); - src_b0 = LD_SP(bb); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - aa += 4; - bb += 2; + aa += 4; + bb += 2; - src_a = LD_SP(aa); - src_b0 = LD_SP(bb); - src_b1 = (v4f32) 
__msa_splati_w((v4i32) src_b0, 1); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - res0 += src_a * src_b0; - res1 += src_a * src_b1; - } + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; - if ((bk - 1) & 1) - { - aa += 4; - bb += 2; - - src_a = LD_SP(aa); - src_b0 = LD_SP(bb); - src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); - - res0 += src_a * src_b0; - res1 += src_a * src_b1; - } - - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); - - src_c0 -= res0; - src_c1 -= res1; + aa += 4; + bb += 2; } - else + + if (bk & 3) { - src_c0 = LD_SP(c); - src_c1 = LD_SP(c_nxt1line); + if (bk & 2) + { + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + + aa += 4; + bb += 2; + } + + if (bk & 1) + { + src_a = LD_SP(aa); + src_b0 = LD_SP(bb); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 -= src_a * src_b0; + src_c1 -= src_a * src_b1; + } } a -= 8; b -= 4; - src_b3 = __msa_cast_to_vector_float(*(b + 3)); - src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); - src_b2 = __msa_cast_to_vector_float(*(b + 2)); - src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); - src_b0 = __msa_cast_to_vector_float(*(b + 0)); - src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3); + COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2); + COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0); src_c1 *= src_b3; 
src_c0 -= src_c1 * src_b2; @@ -1086,6 +858,8 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, c0, c1, c2, c3; c0 = *(c + 0); @@ -1093,32 +867,15 @@ static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) c2 = *(c + 2); c3 = *(c + 3); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c2 -= aa[2] * bb[0]; + c3 -= aa[3] * bb[0]; - t0 = aa[0] * bb[0]; - t1 = aa[1] * bb[0]; - t2 = aa[2] * bb[0]; - t3 = aa[3] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 4; - bb += 1; - - t0 += aa[0] * bb[0]; - t1 += aa[1] * bb[0]; - t2 += aa[2] * bb[0]; - t3 += aa[3] * bb[0]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; + aa += 4; + bb += 1; } a -= 4; @@ -1144,6 +901,8 @@ static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7; @@ -1167,68 +926,27 @@ static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[16]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; + c0_nxt4 -= aa[0] * bb[4]; + c1_nxt4 -= aa[1] * bb[4]; + c0_nxt5 -= aa[0] * bb[5]; + c1_nxt5 -= aa[1] * bb[5]; + c0_nxt6 -= aa[0] * bb[6]; + c1_nxt6 -= 
aa[1] * bb[6]; + c0_nxt7 -= aa[0] * bb[7]; + c1_nxt7 -= aa[1] * bb[7]; - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - res[8] = aa[0] * bb[4]; - res[9] = aa[1] * bb[4]; - res[10] = aa[0] * bb[5]; - res[11] = aa[1] * bb[5]; - res[12] = aa[0] * bb[6]; - res[13] = aa[1] * bb[6]; - res[14] = aa[0] * bb[7]; - res[15] = aa[1] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 8; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - res[8] += aa[0] * bb[4]; - res[9] += aa[1] * bb[4]; - res[10] += aa[0] * bb[5]; - res[11] += aa[1] * bb[5]; - res[12] += aa[0] * bb[6]; - res[13] += aa[1] * bb[6]; - res[14] += aa[0] * bb[7]; - res[15] += aa[1] * bb[7]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; - c0_nxt4 -= res[8]; - c1_nxt4 -= res[9]; - c0_nxt5 -= res[10]; - c1_nxt5 -= res[11]; - c0_nxt6 -= res[12]; - c1_nxt6 -= res[13]; - c0_nxt7 -= res[14]; - c1_nxt7 -= res[15]; + aa += 2; + bb += 8; } a -= 16; @@ -1416,6 +1134,8 @@ static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; @@ -1428,44 +1148,19 @@ static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[8]; + c0 -= aa[0] * bb[0]; + 
c1 -= aa[1] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c1_nxt1 -= aa[1] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c1_nxt2 -= aa[1] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + c1_nxt3 -= aa[1] * bb[3]; - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - res[4] = aa[0] * bb[2]; - res[5] = aa[1] * bb[2]; - res[6] = aa[0] * bb[3]; - res[7] = aa[1] * bb[3]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 4; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - res[4] += aa[0] * bb[2]; - res[5] += aa[1] * bb[2]; - res[6] += aa[0] * bb[3]; - res[7] += aa[1] * bb[3]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt1 -= res[2]; - c1_nxt1 -= res[3]; - c0_nxt2 -= res[4]; - c1_nxt2 -= res[5]; - c0_nxt3 -= res[6]; - c1_nxt3 -= res[7]; + aa += 2; + bb += 4; } a -= 8; @@ -1515,6 +1210,8 @@ static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); @@ -1522,32 +1219,15 @@ static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res[4]; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; + c0_nxt -= aa[0] * bb[1]; + c1_nxt -= aa[1] * bb[1]; - res[0] = aa[0] * bb[0]; - res[1] = aa[1] * bb[0]; - res[2] = aa[0] * bb[1]; - res[3] = aa[1] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 2; - - res[0] += aa[0] * bb[0]; - res[1] += aa[1] * bb[0]; - res[2] += aa[0] * bb[1]; - res[3] += aa[1] * bb[1]; - } - - c0 -= res[0]; - c1 -= res[1]; - c0_nxt -= res[2]; - c1_nxt -= res[3]; + aa += 2; + bb += 2; } a -= 4; @@ -1579,31 +1259,20 @@ static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static 
void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1; + c0 -= aa[0] * bb[0]; + c1 -= aa[1] * bb[0]; - res0 = aa[0] * bb[0]; - res1 = aa[1] * bb[0]; - - for (k = (bk - 1); k--;) - { - aa += 2; - bb += 1; - - res0 += aa[0] * bb[0]; - res1 += aa[1] * bb[0]; - } - - c0 -= res0; - c1 -= res1; + aa += 2; + bb += 1; } a -= 2; @@ -1623,6 +1292,8 @@ static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; FLOAT b56, b57, b58, b59, b60, b61, b62, b63; @@ -1637,44 +1308,19 @@ static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; + c4 -= aa[0] * bb[4]; + c5 -= aa[0] * bb[5]; + c6 -= aa[0] * bb[6]; + c7 -= aa[0] * bb[7]; - t0 = aa[0] * bb[0]; - t1 = aa[0] * bb[1]; - t2 = aa[0] * bb[2]; - t3 = aa[0] * bb[3]; - t4 = aa[0] * bb[4]; - t5 = aa[0] * bb[5]; - t6 = aa[0] * bb[6]; - t7 = aa[0] * bb[7]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 8; - - t0 += aa[0] * bb[0]; - t1 += aa[0] * bb[1]; - t2 += aa[0] * bb[2]; - t3 += aa[0] * bb[3]; - t4 += aa[0] * bb[4]; - t5 += aa[0] * bb[5]; - t6 += aa[0] * bb[6]; - t7 += aa[0] * bb[7]; - } - - c0 -= t0; - c1 -= t1; - c2 -= t2; - c3 -= t3; - c4 -= t4; - c5 -= t5; - c6 -= t6; - c7 -= t7; + aa += 1; + bb += 8; } a -= 8; @@ -1782,6 +1428,8 @@ static void 
ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c2, c3; @@ -1790,32 +1438,15 @@ static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1, res2, res3; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; + c2 -= aa[0] * bb[2]; + c3 -= aa[0] * bb[3]; - res0 = aa[0] * bb[0]; - res1 = aa[0] * bb[1]; - res2 = aa[0] * bb[2]; - res3 = aa[0] * bb[3]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 4; - - res0 += aa[0] * bb[0]; - res1 += aa[0] * bb[1]; - res2 += aa[0] * bb[2]; - res3 += aa[0] * bb[3]; - } - - c0 -= res0; - c1 -= res1; - c2 -= res2; - c3 -= res3; + aa += 1; + bb += 4; } a -= 4; @@ -1850,31 +1481,20 @@ static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { + BLASLONG k; + FLOAT *aa = a, *bb = b; FLOAT b0, b2, b3, c0, c1; c0 = *(c + 0); c1 = *(c + ldc); - if (bk > 0) + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res0, res1; + c0 -= aa[0] * bb[0]; + c1 -= aa[0] * bb[1]; - res0 = aa[0] * bb[0]; - res1 = aa[0] * bb[1]; - - for (k = (bk - 1); k--;) - { - aa += 1; - bb += 2; - - res0 += aa[0] * bb[0]; - res1 += aa[0] * bb[1]; - } - - c0 -= res0; - c1 -= res1; + aa += 1; + bb += 2; } a -= 2; @@ -1898,23 +1518,11 @@ static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { - if (bk > 0) + BLASLONG k; + + for (k = 0; k < bk; k++) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - FLOAT res; - - res = *aa * *bb; - - for (k = (bk - 1); k--;) - { - aa++; - bb++; - - 
res += *aa * *bb; - } - - *c -= res; + *c -= a[k] * b[k]; } *c *= *(a - 1);