From ad9f3178705130d590cec55475b7039d3ae4c1ad Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Fri, 20 May 2016 10:59:03 +0530 Subject: [PATCH] STRSM optimization for MIPS P5600 and I6400 using MSA Signed-off-by: Kaustubh Raste --- CONTRIBUTORS.md | 1 + kernel/mips/KERNEL.P5600 | 8 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 6 +- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 6 +- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 2 +- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 2 +- kernel/mips/macros_msa.h | 24 + kernel/mips/strsm_kernel_LN_8x8_msa.c | 2133 ++++++++++++++++++++++++ kernel/mips/strsm_kernel_LT_8x8_msa.c | 2099 ++++++++++++++++++++++++ kernel/mips/strsm_kernel_RN_8x8_msa.c | 2162 +++++++++++++++++++++++++ kernel/mips/strsm_kernel_RT_8x8_msa.c | 2118 ++++++++++++++++++++++++ 11 files changed, 8549 insertions(+), 12 deletions(-) create mode 100644 kernel/mips/strsm_kernel_LN_8x8_msa.c create mode 100644 kernel/mips/strsm_kernel_LT_8x8_msa.c create mode 100644 kernel/mips/strsm_kernel_RN_8x8_msa.c create mode 100644 kernel/mips/strsm_kernel_RT_8x8_msa.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a13308f71..5ecf32b91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -160,3 +160,4 @@ In chronological order: * Kaustubh Raste * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA + * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index d7d49055f..802f0e0e5 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = 
../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index d0792bf85..dc21dab45 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, for (j = (n >> 2); j--;) { - kk = m; + kk = m + offset; if (m & 7) { @@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (n & 2) { - kk = m; + kk = m + offset; if (m & 7) { @@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (n & 1) { - kk = m; + kk = m + offset; if (m & 7) { diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index db902c0de..897fd313b 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, for (j = (n >> 2); j--;) { - kk = 0; + kk = offset; aa = a; cc = c; @@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, { if (n & 2) { - kk = 0; + kk = offset; aa = a; cc = c; @@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, if (n & 1) { - kk = 0; + kk = offset; aa = a; cc = c; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index 518daad13..44313241e 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, BLASLONG i, j, kk; FLOAT *aa, *cc; - kk = 0; + kk = -offset; for (j = (n >> 2); j--;) { diff --git 
a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index bef87d44d..49274e5bc 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, BLASLONG i, j, kk; FLOAT *aa, *cc, *bb; - kk = n; + kk = n - offset; c += n * ldc; b += n * k; diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index fad6dd6cd..ae85220c6 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) +/* Description : Indexed word element values are replicated to all + elements in output vector + Arguments : Inputs - in, stidx + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'stidx' element value from 'in' vector is replicated to all + elements in 'out0' vector + 'stidx + 1' element value from 'in' vector is replicated to all + elements in 'out1' vector + Valid index range for word operation is 0-3 +*/ +#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ + out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ +} + +#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ +{ \ + SPLATI_W2(RTYPE, in, 0, out0, out1); \ + SPLATI_W2(RTYPE, in, 2, out2, out3); \ +} +#define SPLATI_W4_SP(...) 
SPLATI_W4(v4f32, __VA_ARGS__) + /* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c new file mode 100644 index 000000000..3db7da3c4 --- /dev/null +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -0,0 +1,2133 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; + v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36; + v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; + v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; + v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + aa += 8; + bb 
+= 8; + + for (k = (bk - 1); k--;) + { + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + + aa += 8; + bb += 8; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + a -= 64; + b -= 64; + + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, + res_c12, res_c13, res_c14, res_c15); + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + 
TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, + res_c8, res_c9, res_c10, res_c11); + + src_a = LD_SP(a + 60); + SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); + src_a = LD_SP(a + 56); + SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); + + res_c7 *= src_a63; + res_c15 *= src_a63; + + res_c6 -= res_c7 * src_a62; + res_c14 -= res_c15 * src_a62; + + res_c5 -= res_c7 * src_a61; + res_c13 -= res_c15 * src_a61; + + res_c4 -= res_c7 * src_a60; + res_c12 -= res_c15 * src_a60; + + res_c3 -= res_c7 * src_a59; + res_c11 -= res_c15 * src_a59; + + res_c2 -= res_c7 * src_a58; + res_c10 -= res_c15 * src_a58; + + res_c1 -= res_c7 * src_a57; + res_c9 -= res_c15 * src_a57; + + res_c0 -= res_c7 * src_a56; + res_c8 -= res_c15 * src_a56; + + src_a = LD_SP(a + 48); + SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); + src_a52 = LD_SP(a + 52); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); + src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); + src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); + + res_c6 *= src_a54; + res_c14 *= src_a54; + + res_c5 -= res_c6 * src_a53; + res_c13 -= res_c14 * src_a53; + + res_c4 -= res_c6 * src_a52; + res_c12 -= res_c14 * src_a52; + + res_c3 -= res_c6 * src_a51; + res_c11 -= res_c14 * src_a51; + + res_c2 -= res_c6 * src_a50; + res_c10 -= res_c14 * src_a50; + + res_c1 -= res_c6 * src_a49; + res_c9 -= res_c14 * src_a49; + + res_c0 -= res_c6 * src_a48; + res_c8 -= res_c14 * src_a48; + + src_a = LD_SP(a + 40); + SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); + src_a44 = LD_SP(a + 44); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); + src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); + + res_c5 *= src_a45; + res_c13 *= src_a45; + + res_c4 -= res_c5 * src_a44; + res_c12 -= res_c13 * src_a44; + + res_c3 -= res_c5 * src_a43; + res_c11 -= res_c13 * src_a43; + + res_c2 -= res_c5 * src_a42; + res_c10 -= res_c13 * src_a42; + + res_c1 -= res_c5 * src_a41; + res_c9 -= res_c13 * src_a41; + + res_c0 -= 
res_c5 * src_a40; + res_c8 -= res_c13 * src_a40; + + src_a = LD_SP(a + 32); + SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); + src_a36 = __msa_cast_to_vector_float(*(a + 36)); + src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + + res_c4 *= src_a36; + res_c12 *= src_a36; + + res_c3 -= res_c4 * src_a35; + res_c11 -= res_c12 * src_a35; + + res_c2 -= res_c4 * src_a34; + res_c10 -= res_c12 * src_a34; + + res_c1 -= res_c4 * src_a33; + res_c9 -= res_c12 * src_a33; + + res_c0 -= res_c4 * src_a32; + res_c8 -= res_c12 * src_a32; + + ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); + ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); + + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, + src_c9, src_c11, src_c13, src_c15); + + ST_SP(src_c1, c + 4); + ST_SP(src_c3, c_nxt1line + 4); + ST_SP(src_c5, c_nxt2line + 4); + ST_SP(src_c7, c_nxt3line + 4); + ST_SP(src_c9, c_nxt4line + 4); + ST_SP(src_c11, c_nxt5line + 4); + ST_SP(src_c13, c_nxt6line + 4); + ST_SP(src_c15, c_nxt7line + 4); + + src_a = LD_SP(a + 24); + SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); + + res_c3 *= src_a27; + res_c11 *= src_a27; + + res_c2 -= res_c3 * src_a26; + res_c10 -= res_c11 * src_a26; + + res_c1 -= res_c3 * src_a25; + res_c9 -= res_c11 * src_a25; + + res_c0 -= res_c3 * src_a24; + res_c8 -= res_c11 * src_a24; + + src_a16 = LD_SP(a + 16); + src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); + src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); + src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); + + res_c2 *= src_a18; + res_c10 *= src_a18; + + res_c1 -= res_c2 * src_a17; + res_c9 -= res_c10 * src_a17; + + res_c0 -= res_c2 * src_a16; + res_c8 -= res_c10 * src_a16; + + src_a9 = __msa_cast_to_vector_float(*(a + 9)); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); + src_a8 = __msa_cast_to_vector_float(*(a + 8)); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + 
src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c1 *= src_a9; + res_c9 *= src_a9; + + res_c0 -= res_c1 * src_a8; + res_c8 -= res_c9 * src_a8; + + res_c0 *= src_a0; + res_c8 *= src_a0; + + ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); + ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, + src_c8, src_c10, src_c12, src_c14); + + ST_SP(src_c0, c); + ST_SP(src_c2, c_nxt1line); + ST_SP(src_c4, c_nxt2line); + ST_SP(src_c6, c_nxt3line); + ST_SP(src_c8, c_nxt4line); + ST_SP(src_c10, c_nxt5line); + ST_SP(src_c12, c_nxt6line); + ST_SP(src_c14, c_nxt7line); +} + +static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; + v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35; + v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; + v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; + v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + for (k = (bk - 1); k--;) + { 
+ aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + a -= 64; + b -= 32; + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 60); + SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); + src_a = LD_SP(a + 56); + SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); + + src_a = LD_SP(a + 48); + SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); + src_a52 = LD_SP(a + 52); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); + src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); + src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c5 -= res_c7 * src_a61; + res_c4 -= res_c7 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + res_c6 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c0 -= res_c6 * src_a48; + + src_a = LD_SP(a + 40); + SPLATI_W4_SP(src_a, src_a40, src_a41, 
src_a42, src_a43); + src_a44 = LD_SP(a + 44); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); + src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); + + res_c5 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c0 -= res_c5 * src_a40; + + src_a = LD_SP(a + 32); + SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); + src_a36 = __msa_cast_to_vector_float(*(a + 36)); + src_a36 = (v4f32) __msa_splati_w((v4i32) src_a36, 0); + + res_c4 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c2 -= res_c4 * src_a34; + res_c1 -= res_c4 * src_a33; + res_c0 -= res_c4 * src_a32; + + src_a = LD_SP(a + 24); + SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); + + res_c3 *= src_a27; + res_c2 -= res_c3 * src_a26; + res_c1 -= res_c3 * src_a25; + res_c0 -= res_c3 * src_a24; + + src_a16 = LD_SP(a + 16); + src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); + src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); + src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); + + res_c2 *= src_a18; + res_c1 -= res_c2 * src_a17; + res_c0 -= res_c2 * src_a16; + + src_a9 = __msa_cast_to_vector_float(*(a + 9)); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a9, 0); + src_a8 = __msa_cast_to_vector_float(*(a + 8)); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c1 *= src_a9; + res_c0 -= res_c1 * src_a8; + + res_c0 *= src_a0; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void 
ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + c4_nxt = *(c + 4 + ldc); + c5_nxt = *(c + 5 + ldc); + c6_nxt = *(c + 6 + ldc); + c7_nxt = *(c + 7 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[16]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[2] * bb[0]; + res[3] = aa[3] * bb[0]; + res[4] = aa[4] * bb[0]; + res[5] = aa[5] * bb[0]; + res[6] = aa[6] * bb[0]; + res[7] = aa[7] * bb[0]; + res[8] = aa[0] * bb[1]; + res[9] = aa[1] * bb[1]; + res[10] = aa[2] * bb[1]; + res[11] = aa[3] * bb[1]; + res[12] = aa[4] * bb[1]; + res[13] = aa[5] * bb[1]; + res[14] = aa[6] * bb[1]; + res[15] = aa[7] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 2; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[2] * bb[0]; + res[3] += aa[3] * bb[0]; + res[4] += aa[4] * bb[0]; + res[5] += aa[5] * bb[0]; + res[6] += aa[6] * bb[0]; + res[7] += aa[7] * bb[0]; + res[8] += aa[0] * bb[1]; + res[9] += aa[1] * bb[1]; + res[10] += aa[2] * bb[1]; + res[11] += aa[3] * bb[1]; + res[12] += aa[4] * bb[1]; + res[13] += aa[5] * bb[1]; + res[14] += aa[6] * bb[1]; + res[15] += aa[7] * bb[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c4 -= res[4]; + c5 -= res[5]; + c6 -= res[6]; + c7 -= res[7]; + + c0_nxt -= res[8]; + c1_nxt -= res[9]; + c2_nxt -= res[10]; + c3_nxt -= res[11]; + c4_nxt -= res[12]; + c5_nxt -= 
res[13]; + c6_nxt -= res[14]; + c7_nxt -= res[15]; + } + + a -= 64; + b -= 16; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + c7_nxt *= a63; + c6 -= c7 * a62; + c6_nxt -= c7_nxt * a62; + c5 -= c7 * a61; + c5_nxt -= c7_nxt * a61; + c4 -= c7 * a60; + c4_nxt -= c7_nxt * a60; + c3 -= c7 * a59; + c3_nxt -= c7_nxt * a59; + c2 -= c7 * a58; + c2_nxt -= c7_nxt * a58; + c1 -= c7 * a57; + c1_nxt -= c7_nxt * a57; + c0 -= c7 * a56; + c0_nxt -= c7_nxt * a56; + + c6 *= a54; + c6_nxt *= a54; + c5 -= c6 * a53; + c5_nxt -= c6_nxt * a53; + c4 -= c6 * a52; + c4_nxt -= c6_nxt * a52; + c3 -= c6 * a51; + c3_nxt -= c6_nxt * a51; + c2 -= c6 * a50; + c2_nxt -= c6_nxt * a50; + c1 -= c6 * a49; + c1_nxt -= c6_nxt * a49; + c0 -= c6 * a48; + c0_nxt -= c6_nxt * a48; + + c5 *= a45; + c5_nxt *= a45; + c4 -= c5 * a44; + c4_nxt -= c5_nxt * a44; + c3 -= c5 * a43; + c3_nxt -= c5_nxt * a43; + c2 -= c5 * a42; + c2_nxt -= c5_nxt * a42; + c1 -= c5 * a41; + c1_nxt -= c5_nxt * a41; + c0 -= c5 * a40; + c0_nxt -= c5_nxt * a40; + + c4 *= a36; + c4_nxt *= a36; + c3 -= c4 * a35; + c3_nxt -= c4_nxt * a35; + c2 -= c4 * a34; + c2_nxt -= c4_nxt * a34; + c1 -= c4 * a33; + c1_nxt -= c4_nxt * a33; + c0 -= c4 * a32; + c0_nxt -= c4_nxt * a32; + + c3 *= a27; + c3_nxt *= a27; + c2 -= c3 * a26; + c2_nxt -= c3_nxt * a26; + c1 -= c3 * a25; + c1_nxt -= c3_nxt * a25; + c0 -= c3 * a24; + 
c0_nxt -= c3_nxt * a24; + + c2 *= a18; + c2_nxt *= a18; + c1 -= c2 * a17; + c1_nxt -= c2_nxt * a17; + c0 -= c2 * a16; + c0_nxt -= c2_nxt * a16; + + c1 *= a9; + c1_nxt *= a9; + c0 -= c1 * a8; + c0_nxt -= c1_nxt * a8; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + *(b + 8) = c4; + *(b + 9) = c4_nxt; + *(b + 10) = c5; + *(b + 11) = c5_nxt; + *(b + 12) = c6; + *(b + 13) = c6_nxt; + *(b + 14) = c7; + *(b + 15) = c7_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; + *(c + 4 + ldc) = c4_nxt; + *(c + 5 + ldc) = c5_nxt; + *(c + 6 + ldc) = c6_nxt; + *(c + 7 + ldc) = c7_nxt; +} + +static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = aa[0] * bb[0]; + t1 = aa[1] * bb[0]; + t2 = aa[2] * bb[0]; + t3 = aa[3] * bb[0]; + t4 = aa[4] * bb[0]; + t5 = aa[5] * bb[0]; + t6 = aa[6] * bb[0]; + t7 = aa[7] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 1; + + t0 += aa[0] * bb[0]; + t1 += aa[1] * bb[0]; + t2 += aa[2] * bb[0]; + t3 += aa[3] * bb[0]; + t4 += aa[4] * bb[0]; + t5 += aa[5] * bb[0]; + t6 += aa[6] * bb[0]; + t7 += aa[7] * bb[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + c4 -= t4; + c5 -= t5; + c6 -= t6; + c7 -= t7; + } + 
+ a -= 64; + b -= 8; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + + c6 -= c7 * a62; + c6 *= a54; + + c5 -= c7 * a61; + c5 -= c6 * a53; + c5 *= a45; + + c4 -= c7 * a60; + c4 -= c6 * a52; + c4 -= c5 * a44; + c4 *= a36; + + c3 -= c7 * a59; + c3 -= c6 * a51; + c3 -= c5 * a43; + c3 -= c4 * a35; + c3 *= a27; + + c2 -= c7 * a58; + c2 -= c6 * a50; + c2 -= c5 * a42; + c2 -= c4 * a34; + c2 -= c3 * a26; + c2 *= a18; + + c1 -= c7 * a57; + c1 -= c6 * a49; + c1 -= c5 * a41; + c1 -= c4 * a33; + c1 -= c3 * a25; + c1 -= c2 * a17; + c1 *= a9; + + c0 -= c7 * a56; + c0 -= c6 * a48; + c0 -= c5 * a40; + c0 -= c4 * a32; + c0 -= c3 * a24; + c0 -= c2 * a16; + c0 -= c1 * a8; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; +} + +static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; + v4f32 src_a13, src_a14, src_a15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = 
c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 8; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + a -= 16; + b -= 32; + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, + 
res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 12); + SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); + src_a8 = LD_SP(a + 8); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a5 = __msa_cast_to_vector_float(*(a + 5)); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a4 = __msa_cast_to_vector_float(*(a + 4)); + src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c3 *= src_a15; + res_c7 *= src_a15; + res_c2 -= res_c3 * src_a14; + res_c6 -= res_c7 * src_a14; + res_c1 -= res_c3 * src_a13; + res_c5 -= res_c7 * src_a13; + res_c0 -= res_c3 * src_a12; + res_c4 -= res_c7 * src_a12; + + res_c2 *= src_a10; + res_c6 *= src_a10; + res_c1 -= res_c2 * src_a9; + res_c5 -= res_c6 * src_a9; + res_c0 -= res_c2 * src_a8; + res_c4 -= res_c6 * src_a8; + + res_c1 *= src_a5; + res_c5 *= src_a5; + res_c0 -= res_c1 * src_a4; + res_c4 -= res_c5 * src_a4; + + res_c0 *= src_a0; + res_c4 *= src_a0; + + ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); + ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c4, src_c5, src_c6, src_c7); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; + v4f32 src_a13, src_a14, src_a15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT 
*c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + for (k = ((bk - 1) >> 1); k--;) + { + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + } + + if ((bk - 1) & 1) + { + aa += 4; + bb += 4; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + a -= 16; + b -= 16; + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + + src_a = LD_SP(a + 12); + SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); + src_a8 = LD_SP(a + 8); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); + src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); + src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); + src_a5 = __msa_cast_to_vector_float(*(a + 5)); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a4 = __msa_cast_to_vector_float(*(a + 
4)); + src_a4 = (v4f32) __msa_splati_w((v4i32) src_a4, 0); + src_a0 = __msa_cast_to_vector_float(*(a + 0)); + src_a0 = (v4f32) __msa_splati_w((v4i32) src_a0, 0); + + res_c3 *= src_a15; + res_c2 -= res_c3 * src_a14; + res_c1 -= res_c3 * src_a13; + res_c0 -= res_c3 * src_a12; + + res_c2 *= src_a10; + res_c1 -= res_c2 * src_a9; + res_c0 -= res_c2 * src_a8; + + res_c1 *= src_a5; + res_c0 -= res_c1 * src_a4; + + res_c0 *= src_a0; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; + FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[8]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[2] * bb[0]; + res[3] = aa[3] * bb[0]; + res[4] = aa[0] * bb[1]; + res[5] = aa[1] * bb[1]; + res[6] = aa[2] * bb[1]; + res[7] = aa[3] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 2; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[2] * bb[0]; + res[3] += aa[3] * bb[0]; + res[4] += aa[0] * bb[1]; + res[5] += aa[1] * bb[1]; + res[6] += aa[2] * bb[1]; + res[7] += aa[3] * bb[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c0_nxt -= res[4]; + c1_nxt -= res[5]; + c2_nxt -= res[6]; + c3_nxt -= res[7]; + } + + a -= 16; + b -= 8; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + 
c3_nxt *= a15; + + c2 -= c3 * a14; + c2_nxt -= c3_nxt * a14; + + c2 *= a10; + c2_nxt *= a10; + + c1 -= c3 * a13; + c1_nxt -= c3_nxt * a13; + + c1 -= c2 * a9; + c1_nxt -= c2_nxt * a9; + + c1 *= a5; + c1_nxt *= a5; + + c0 -= c3 * a12; + c0_nxt -= c3_nxt * a12; + + c0 -= c2 * a8; + c0_nxt -= c2_nxt * a8; + + c0 -= c1 * a4; + c0_nxt -= c1_nxt * a4; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; +} + +static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3; + + t0 = aa[0] * bb[0]; + t1 = aa[1] * bb[0]; + t2 = aa[2] * bb[0]; + t3 = aa[3] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 1; + + t0 += aa[0] * bb[0]; + t1 += aa[1] * bb[0]; + t2 += aa[2] * bb[0]; + t3 += aa[3] * bb[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + } + + a -= 16; + b -= 4; + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + + c2 -= c3 * a14; + c2 *= a10; + + c1 -= c3 * a13; + c1 -= c2 * a9; + c1 *= a5; + + c0 -= c3 * a12; + c0 -= c2 * a8; + c0 -= c1 * a4; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3; + FLOAT 
c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + FLOAT c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[16]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + res[4] = aa[0] * bb[2]; + res[5] = aa[1] * bb[2]; + res[6] = aa[0] * bb[3]; + res[7] = aa[1] * bb[3]; + res[8] = aa[0] * bb[4]; + res[9] = aa[1] * bb[4]; + res[10] = aa[0] * bb[5]; + res[11] = aa[1] * bb[5]; + res[12] = aa[0] * bb[6]; + res[13] = aa[1] * bb[6]; + res[14] = aa[0] * bb[7]; + res[15] = aa[1] * bb[7]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 8; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + res[4] += aa[0] * bb[2]; + res[5] += aa[1] * bb[2]; + res[6] += aa[0] * bb[3]; + res[7] += aa[1] * bb[3]; + res[8] += aa[0] * bb[4]; + res[9] += aa[1] * bb[4]; + res[10] += aa[0] * bb[5]; + res[11] += aa[1] * bb[5]; + res[12] += aa[0] * bb[6]; + res[13] += aa[1] * bb[6]; + res[14] += aa[0] * bb[7]; + res[15] += aa[1] * bb[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 -= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + } + + a -= 4; + b -= 16; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= 
a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + c1_nxt4 *= a3; + c1_nxt5 *= a3; + c1_nxt6 *= a3; + c1_nxt7 *= a3; + + c0 -= c1 * a2; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt3 -= c1_nxt3 * a2; + c0_nxt4 -= c1_nxt4 * a2; + c0_nxt5 -= c1_nxt5 * a2; + c0_nxt6 -= c1_nxt6 * a2; + c0_nxt7 -= c1_nxt7 * a2; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + c0_nxt4 *= a0; + c0_nxt5 *= a0; + c0_nxt6 *= a0; + c0_nxt7 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c0_nxt4; + *(b + 5) = c0_nxt5; + *(b + 6) = c0_nxt6; + *(b + 7) = c0_nxt7; + *(b + 8) = c1; + *(b + 9) = c1_nxt1; + *(b + 10) = c1_nxt2; + *(b + 11) = c1_nxt3; + *(b + 12) = c1_nxt4; + *(b + 13) = c1_nxt5; + *(b + 14) = c1_nxt6; + *(b + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res[8]; + + res[0] = aa[0] * bb[0]; + res[1] = aa[1] * bb[0]; + res[2] = aa[0] * bb[1]; + res[3] = aa[1] * bb[1]; + res[4] = aa[0] * bb[2]; + res[5] = aa[1] * bb[2]; + res[6] = aa[0] * bb[3]; + res[7] = aa[1] * bb[3]; + + for 
(k = (bk - 1); k--;) + { + aa += 2; + bb += 4; + + res[0] += aa[0] * bb[0]; + res[1] += aa[1] * bb[0]; + res[2] += aa[0] * bb[1]; + res[3] += aa[1] * bb[1]; + res[4] += aa[0] * bb[2]; + res[5] += aa[1] * bb[2]; + res[6] += aa[0] * bb[3]; + res[7] += aa[1] * bb[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + } + + a -= 4; + b -= 8; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + + c0 -= c1 * a2; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt3 -= c1_nxt3 * a2; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1, res2, res3; + + res0 = aa[0] * bb[0]; + res1 = aa[1] * bb[0]; + res2 = aa[0] * bb[1]; + res3 = aa[1] * bb[1]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 2; + + res0 += aa[0] * bb[0]; + res1 += aa[1] * bb[0]; + res2 += aa[0] * bb[1]; + res3 += aa[1] * bb[1]; + } + + c0 -= res0; + c1 -= res1; + c0_nxt -= res2; + c1_nxt -= res3; + } + + a -= 4; + b -= 4; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c1_nxt *= a3; + + c0 -= c1 * a2; + c0_nxt -= c1_nxt * a2; + + c0 *= a0; + c0_nxt *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + 
*(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT res0, res1; + + res0 = aa[0] * bb[0]; + res1 = aa[1] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 2; + bb += 1; + + res0 += aa[0] * bb[0]; + res1 += aa[1] * bb[0]; + } + + c0 -= res0; + c1 -= res1; + } + + a -= 4; + b -= 2; + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + + c0 -= c1 * a2; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + if (bk > 0) + { + FLOAT *aa = a, *bb = b; + BLASLONG k; + FLOAT r0, r1, r2, r3, r4, r5, r6, r7; + + r0 = aa[0] * bb[0]; + r1 = aa[0] * bb[1]; + r2 = aa[0] * bb[2]; + r3 = aa[0] * bb[3]; + r4 = aa[0] * bb[4]; + r5 = aa[0] * bb[5]; + r6 = aa[0] * bb[6]; + r7 = aa[0] * bb[7]; + + for (k = (bk - 1); k--;) + { + aa += 1; + bb += 8; + + r0 += aa[0] * bb[0]; + r1 += aa[0] * bb[1]; + r2 += aa[0] * bb[2]; + r3 += aa[0] * bb[3]; + r4 += aa[0] * bb[4]; + r5 += aa[0] * bb[5]; + r6 += aa[0] * bb[6]; + r7 += aa[0] * bb[7]; + } + + c0 -= r0; + c1 -= r1; + c2 -= r2; + c3 -= r3; + c4 -= r4; + c5 -= r5; + c6 -= r6; + c7 -= r7; + } + + a0 = *(a - 1); + + c0 *= a0; + c1 *= a0; + c2 *= a0; + c3 *= a0; + c4 *= a0; + c5 *= a0; + c6 *= a0; + c7 *= a0; + + *(b - 8) = c0; + *(b - 7) = c1; + *(b - 6) = c2; + *(b - 5) = c3; + *(b - 4) = c4; + *(b - 3) = c5; + *(b - 2) = c6; + *(b - 1) = c7; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c 
+ 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT a0, c0, c1, c2, c3; + + a0 = *(a - 1); + + c0 = *(c + 0 * ldc) * a0; + c1 = *(c + 1 * ldc) * a0; + c2 = *(c + 2 * ldc) * a0; + c3 = *(c + 3 * ldc) * a0; + + *(b - 4) = c0; + *(b - 3) = c1; + *(b - 2) = c2; + *(b - 1) = c3; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT a0, c0, c1; + + a0 = *(a - 1); + + c0 = *(c + 0 * ldc) * a0; + c1 = *(c + 1 * ldc) * a0; + + *(b - 2) = c0; + *(b - 1) = c1; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c1; +} + +static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c) +{ + *c *= *(a - 1); + *(b - 1) = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + for (j = (n >> 3); j--;) + { + kk = m + offset; + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) 
* k + kk; + cc = c + (m - 1); + + ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 2) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, k -kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = m + offset; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + cc = c + (m - 1); + + ssolve_1x1_ln_msa(aa, b + kk, cc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & ~1) - 2) * k + 2 * kk; + cc = c + ((m & ~1) - 2); + + ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & ~3) - 4) * k + 4 * kk; + cc = c + ((m & ~3) - 4); + + ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk)); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) 
+ { + aa = a + ((m & ~7) - 8) * k; + cc = c + ((m & ~7) - 8); + + do + { + ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk)); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c new file mode 100644 index 000000000..0c61d3618 --- /dev/null +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -0,0 +1,2099 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 
* src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + a += 8; + b += 8; + + for (k = (bk - 1); k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + + a += 8; + b += 8; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, 
src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, + res_c8, res_c9, res_c10, res_c11); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, + res_c12, res_c13, res_c14, res_c15); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a = LD_SP(a + 4); + SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); + + res_c0 *= src_a0; + res_c8 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c9 -= res_c8 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c10 -= res_c8 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c11 -= res_c8 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c12 -= res_c8 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c13 -= res_c8 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c14 -= res_c8 * src_a6; + res_c7 -= res_c0 * src_a7; + res_c15 -= res_c8 * src_a7; + + src_a = LD_SP(a + 9); + SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); + src_a13 = LD_SP(a + 13); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); + src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); + src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); + + res_c1 *= src_a9; + res_c9 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c10 -= res_c9 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c11 -= res_c9 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c12 -= res_c9 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c13 -= res_c9 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c14 -= res_c9 * src_a14; + res_c7 -= res_c1 * src_a15; + res_c15 -= res_c9 * src_a15; + + src_a = LD_SP(a + 18); + SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); + src_a22 = LD_SP(a + 22); + src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); + src_a22 = 
(v4f32) __msa_splati_w((v4i32) src_a22, 0); + + res_c2 *= src_a18; + res_c10 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c11 -= res_c10 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c12 -= res_c10 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c13 -= res_c10 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c14 -= res_c10 * src_a22; + res_c7 -= res_c2 * src_a23; + res_c15 -= res_c10 * src_a23; + + src_a = LD_SP(a + 27); + SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); + src_a31 = __msa_cast_to_vector_float(*(a + 31)); + src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + + res_c3 *= src_a27; + res_c11 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c12 -= res_c11 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c13 -= res_c11 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c14 -= res_c11 * src_a30; + res_c7 -= res_c3 * src_a31; + res_c15 -= res_c11 * src_a31; + + ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); + ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, + src_c8, src_c10, src_c12, src_c14); + + ST_SP(src_c0, c); + ST_SP(src_c2, c_nxt1line); + ST_SP(src_c4, c_nxt2line); + ST_SP(src_c6, c_nxt3line); + ST_SP(src_c8, c_nxt4line); + ST_SP(src_c10, c_nxt5line); + ST_SP(src_c12, c_nxt6line); + ST_SP(src_c14, c_nxt7line); + + src_a = LD_SP(a + 36); + SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); + + res_c4 *= src_a36; + res_c12 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c13 -= res_c12 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c14 -= res_c12 * src_a38; + res_c7 -= res_c4 * src_a39; + res_c15 -= res_c12 * src_a39; + + src_a45 = LD_SP(a + 45); + src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); + src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); + + res_c5 *= src_a45; + res_c13 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c14 
-= res_c13 * src_a46; + res_c7 -= res_c5 * src_a47; + res_c15 -= res_c13 * src_a47; + + src_a54 = __msa_cast_to_vector_float(*(a + 54)); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); + src_a55 = __msa_cast_to_vector_float(*(a + 55)); + src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); + src_a63 = __msa_cast_to_vector_float(*(a + 63)); + src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + + res_c6 *= src_a54; + res_c14 *= src_a54; + res_c7 -= res_c6 * src_a55; + res_c15 -= res_c14 * src_a55; + + res_c7 *= src_a63; + res_c15 *= src_a63; + + ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); + ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); + + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, + src_c9, src_c11, src_c13, src_c15); + + ST_SP(src_c1, c + 4); + ST_SP(src_c3, c_nxt1line + 4); + ST_SP(src_c5, c_nxt2line + 4); + ST_SP(src_c7, c_nxt3line + 4); + ST_SP(src_c9, c_nxt4line + 4); + ST_SP(src_c11, c_nxt5line + 4); + ST_SP(src_c13, c_nxt6line + 4); + ST_SP(src_c15, c_nxt7line + 4); +} + +static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = 
LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + a += 8; + b += 4; + + for (k = (bk - 1); k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, + res_c0, res_c1, res_c2, res_c3); + TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a = LD_SP(a + 4); + SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + src_a = LD_SP(a + 9); + SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); + src_a13 = LD_SP(a + 13); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); + src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); + src_a13 = (v4f32) __msa_splati_w((v4i32) 
src_a13, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + src_a = LD_SP(a + 18); + SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); + src_a22 = LD_SP(a + 22); + src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); + src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + src_a = LD_SP(a + 27); + SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); + src_a31 = __msa_cast_to_vector_float(*(a + 31)); + src_a31 = (v4f32) __msa_splati_w((v4i32) src_a31, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + src_a = LD_SP(a + 36); + SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + src_a45 = LD_SP(a + 45); + src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); + src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); + src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + src_a54 = __msa_cast_to_vector_float(*(a + 54)); + src_a54 = (v4f32) __msa_splati_w((v4i32) src_a54, 0); + src_a55 = __msa_cast_to_vector_float(*(a + 55)); + src_a55 = (v4f32) __msa_splati_w((v4i32) src_a55, 0); + src_a63 = __msa_cast_to_vector_float(*(a + 63)); + src_a63 = (v4f32) __msa_splati_w((v4i32) src_a63, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + res_c7 *= src_a63; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + b += 16; + ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, 
src_c2, src_c4, src_c6); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c1, src_c3, src_c5, src_c7); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + c4_nxt = *(c + 4 + ldc); + c5_nxt = *(c + 5 + ldc); + c6_nxt = *(c + 6 + ldc); + c7_nxt = *(c + 7 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[16]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[2] * b[0]; + res[3] = a[3] * b[0]; + res[4] = a[4] * b[0]; + res[5] = a[5] * b[0]; + res[6] = a[6] * b[0]; + res[7] = a[7] * b[0]; + res[8] = a[0] * b[1]; + res[9] = a[1] * b[1]; + res[10] = a[2] * b[1]; + res[11] = a[3] * b[1]; + res[12] = a[4] * b[1]; + res[13] = a[5] * b[1]; + res[14] = a[6] * b[1]; + res[15] = a[7] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 8; + b += 2; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[2] * b[0]; + res[3] += a[3] * b[0]; + res[4] += a[4] * b[0]; + res[5] += a[5] * b[0]; + res[6] += a[6] * b[0]; + res[7] += a[7] * b[0]; + res[8] += a[0] * b[1]; + res[9] += a[1] * b[1]; + res[10] += a[2] * b[1]; + res[11] += a[3] * b[1]; + res[12] += a[4] * b[1]; + res[13] += a[5] * b[1]; + res[14] += a[6] * b[1]; + res[15] += a[7] * b[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= 
res[3]; + c4 -= res[4]; + c5 -= res[5]; + c6 -= res[6]; + c7 -= res[7]; + c0_nxt -= res[8]; + c1_nxt -= res[9]; + c2_nxt -= res[10]; + c3_nxt -= res[11]; + c4_nxt -= res[12]; + c5_nxt -= res[13]; + c6_nxt -= res[14]; + c7_nxt -= res[15]; + + a += 8; + b += 2; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + c0_nxt *= a0; + + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + c1 *= a9; + c1_nxt *= a9; + + c2 -= c0 * a2; + c2_nxt -= c0_nxt * a2; + c2 -= c1 * a10; + c2_nxt -= c1_nxt * a10; + c2 *= a18; + c2_nxt *= a18; + + c3 -= c0 * a3; + c3_nxt -= c0_nxt * a3; + c3 -= c1 * a11; + c3_nxt -= c1_nxt * a11; + c3 -= c2 * a19; + c3_nxt -= c2_nxt * a19; + c3 *= a27; + c3_nxt *= a27; + + c4 -= c0 * a4; + c4_nxt -= c0_nxt * a4; + c4 -= c1 * a12; + c4_nxt -= c1_nxt * a12; + c4 -= c2 * a20; + c4_nxt -= c2_nxt * a20; + c4 -= c3 * a28; + c4_nxt -= c3_nxt * a28; + c4 *= a36; + c4_nxt *= a36; + + c5 -= c0 * a5; + c5_nxt -= c0_nxt * a5; + c5 -= c1 * a13; + c5_nxt -= c1_nxt * a13; + c5 -= c2 * a21; + c5_nxt -= c2_nxt * a21; + c5 -= c3 * a29; + c5_nxt -= c3_nxt * a29; + c5 -= c4 * a37; + c5_nxt -= c4_nxt * a37; + c5 *= a45; + c5_nxt *= a45; + + c6 -= c0 * a6; + c6_nxt -= c0_nxt * a6; + c6 -= c1 * a14; + c6_nxt -= c1_nxt * a14; + c6 -= c2 * a22; + c6_nxt -= c2_nxt * a22; + c6 -= c3 * a30; + c6_nxt -= c3_nxt * a30; + c6 -= c4 * a38; + c6_nxt -= c4_nxt * a38; + c6 -= 
c5 * a46; + c6_nxt -= c5_nxt * a46; + c6 *= a54; + c6_nxt *= a54; + + c7 -= c0 * a7; + c7_nxt -= c0_nxt * a7; + c7 -= c1 * a15; + c7_nxt -= c1_nxt * a15; + c7 -= c2 * a23; + c7_nxt -= c2_nxt * a23; + c7 -= c3 * a31; + c7_nxt -= c3_nxt * a31; + c7 -= c4 * a39; + c7_nxt -= c4_nxt * a39; + c7 -= c5 * a47; + c7_nxt -= c5_nxt * a47; + c7 -= c6 * a55; + c7_nxt -= c6_nxt * a55; + c7 *= a63; + c7_nxt *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; + *(c + 4 + ldc) = c4_nxt; + *(c + 5 + ldc) = c5_nxt; + *(c + 6 + ldc) = c6_nxt; + *(c + 7 + ldc) = c7_nxt; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + *(b + 8) = c4; + *(b + 9) = c4_nxt; + *(b + 10) = c5; + *(b + 11) = c5_nxt; + *(b + 12) = c6; + *(b + 13) = c6_nxt; + *(b + 14) = c7; + *(b + 15) = c7_nxt; +} + +static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7; + + a0 = a[0] * b[0]; + a1 = a[1] * b[0]; + a2 = a[2] * b[0]; + a3 = a[3] * b[0]; + a4 = a[4] * b[0]; + a5 = a[5] * b[0]; + a6 = a[6] * b[0]; + a7 = a[7] * b[0]; + + for (i = (bk - 1); i--; ) + { + a += 8; + b += 1; + + a0 += a[0] * b[0]; + a1 += a[1] * b[0]; + a2 += a[2] * b[0]; + a3 += a[3] * b[0]; + a4 += a[4] * b[0]; + a5 += a[5] * b[0]; + a6 += a[6] * b[0]; + a7 += a[7] * b[0]; + } + + c0 -= a0; + c1 
-= a1; + c2 -= a2; + c3 -= a3; + c4 -= a4; + c5 -= a5; + c6 -= a6; + c7 -= a7; + + a += 8; + b += 1; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a9; + + c2 -= c0 * a2; + c2 -= c1 * a10; + c2 *= a18; + + c3 -= c0 * a3; + c3 -= c1 * a11; + c3 -= c2 * a19; + c3 *= a27; + + c4 -= c0 * a4; + c4 -= c1 * a12; + c4 -= c2 * a20; + c4 -= c3 * a28; + c4 *= a36; + + c5 -= c0 * a5; + c5 -= c1 * a13; + c5 -= c2 * a21; + c5 -= c3 * a29; + c5 -= c4 * a37; + c5 *= a45; + + c6 -= c0 * a6; + c6 -= c1 * a14; + c6 -= c2 * a22; + c6 -= c3 * a30; + c6 -= c4 * a38; + c6 -= c5 * a46; + c6 *= a54; + + c7 -= c0 * a7; + c7 -= c1 * a15; + c7 -= c2 * a23; + c7 -= c3 * a31; + c7 -= c4 * a39; + c7 -= c5 * a47; + c7 -= c6 * a55; + c7 *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; +} + +static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v4f32 src_a10, src_a11, 
src_a15, src_a; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + a += 4; + b += 8; + + for (k = (bk - 1); k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + 
TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, + res_c4, res_c5, res_c6, res_c7); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a5 = LD_SP(a + 5); + src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); + src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a10 = __msa_cast_to_vector_float(*(a + 10)); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); + src_a11 = __msa_cast_to_vector_float(*(a + 11)); + src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); + src_a15 = __msa_cast_to_vector_float(*(a + 15)); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + + res_c0 *= src_a0; + res_c4 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c5 -= res_c4 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c6 -= res_c4 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c7 -= res_c4 * src_a3; + + res_c1 *= src_a5; + res_c5 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c6 -= res_c5 * src_a6; + res_c3 -= res_c1 * src_a7; + res_c7 -= res_c5 * src_a7; + + res_c2 *= src_a10; + res_c6 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c7 -= res_c6 * src_a11; + + res_c3 *= src_a15; + res_c7 *= src_a15; + + ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); + ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, + src_c4, src_c5, src_c6, src_c7); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v4f32 src_a10, src_a11, src_a15, src_a; + FLOAT *c_nxt1line = 
c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 0) + { + BLASLONG k; + v4f32 src_b, src_b0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + a += 4; + b += 4; + + for (k = (bk - 1) >> 1; k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + if ((bk - 1) & 1) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, + res_c0, res_c1, res_c2, res_c3); + + src_a = LD_SP(a + 0); + SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); + src_a5 = LD_SP(a + 5); + src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); + src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); + src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); + src_a10 = __msa_cast_to_vector_float(*(a + 10)); + src_a10 = (v4f32) __msa_splati_w((v4i32) src_a10, 0); + src_a11 = 
__msa_cast_to_vector_float(*(a + 11)); + src_a11 = (v4f32) __msa_splati_w((v4i32) src_a11, 0); + src_a15 = __msa_cast_to_vector_float(*(a + 15)); + src_a15 = (v4f32) __msa_splati_w((v4i32) src_a15, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + + res_c3 *= src_a15; + + ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); + + TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, + src_c0, src_c1, src_c2, src_c3); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + c2_nxt = *(c + 2 + ldc); + c3_nxt = *(c + 3 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[8]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[2] * b[0]; + res[3] = a[3] * b[0]; + res[4] = a[0] * b[1]; + res[5] = a[1] * b[1]; + res[6] = a[2] * b[1]; + res[7] = a[3] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 4; + b += 2; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[2] * b[0]; + res[3] += a[3] * b[0]; + res[4] += a[0] * b[1]; + res[5] += a[1] * b[1]; + res[6] += a[2] * b[1]; + res[7] += a[3] * b[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c2 -= res[2]; + c3 -= res[3]; + c0_nxt -= res[4]; + c1_nxt -= res[5]; + c2_nxt -= res[6]; + c3_nxt -= res[7]; + + a += 4; + b += 2; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + c0_nxt *= a0; + + c1 -= c0 * a1; 
+ c1_nxt -= c0_nxt * a1; + + c1 *= a5; + c1_nxt *= a5; + + c2 -= c0 * a2; + c2_nxt -= c0_nxt * a2; + + c2 -= c1 * a6; + c2_nxt -= c1_nxt * a6; + + c2 *= a10; + c2_nxt *= a10; + + c3 -= c0 * a3; + c3_nxt -= c0_nxt * a3; + + c3 -= c1 * a7; + c3_nxt -= c1_nxt * a7; + + c3 -= c2 * a11; + c3_nxt -= c2_nxt * a11; + + c3 *= a15; + c3_nxt *= a15; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + *(b + 4) = c2; + *(b + 5) = c2_nxt; + *(b + 6) = c3; + *(b + 7) = c3_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; + *(c + 2 + ldc) = c2_nxt; + *(c + 3 + ldc) = c3_nxt; +} + +static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG k; + FLOAT t0, t1, t2, t3; + + t0 = a[0] * b[0]; + t1 = a[1] * b[0]; + t2 = a[2] * b[0]; + t3 = a[3] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 4; + b += 1; + + t0 += a[0] * b[0]; + t1 += a[1] * b[0]; + t2 += a[2] * b[0]; + t3 += a[3] * b[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + + a += 4; + b += 1; + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a5; + + c2 -= c0 * a2; + c2 -= c1 * a6; + c2 *= a10; + + c3 -= c0 * a3; + c3 -= c1 * a7; + c3 -= c2 * a11; + c3 *= a15; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2; + FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5; + FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7; + + 
c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[16]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + res[8] = a[0] * b[4]; + res[9] = a[1] * b[4]; + res[10] = a[0] * b[5]; + res[11] = a[1] * b[5]; + res[12] = a[0] * b[6]; + res[13] = a[1] * b[6]; + res[14] = a[0] * b[7]; + res[15] = a[1] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 8; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + res[8] += a[0] * b[4]; + res[9] += a[1] * b[4]; + res[10] += a[0] * b[5]; + res[11] += a[1] * b[5]; + res[12] += a[0] * b[6]; + res[13] += a[1] * b[6]; + res[14] += a[0] * b[7]; + res[15] += a[1] * b[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 -= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + + a += 2; + b += 8; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 = c0 * a0; + c1 = (c1 - c0 * a1) * a3; + + c0_nxt1 = c0_nxt1 * a0; + c1_nxt1 = (c1_nxt1 - c0_nxt1 * a1) * a3; + + c0_nxt2 = c0_nxt2 * a0; + c1_nxt2 = (c1_nxt2 - c0_nxt2 * a1) * a3; + + c0_nxt3 = c0_nxt3 * a0; + c1_nxt3 = (c1_nxt3 - 
c0_nxt3 * a1) * a3; + + c0_nxt4 = c0_nxt4 * a0; + c1_nxt4 = (c1_nxt4 - c0_nxt4 * a1) * a3; + + c0_nxt5 = c0_nxt5 * a0; + c1_nxt5 = (c1_nxt5 - c0_nxt5 * a1) * a3; + + c0_nxt6 = c0_nxt6 * a0; + c1_nxt6 = (c1_nxt6 - c0_nxt6 * a1) * a3; + + c0_nxt7 = c0_nxt7 * a0; + c1_nxt7 = (c1_nxt7 - c0_nxt7 * a1) * a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c0_nxt4; + *(b + 5) = c0_nxt5; + *(b + 6) = c0_nxt6; + *(b + 7) = c0_nxt7; + *(b + 8) = c1; + *(b + 9) = c1_nxt1; + *(b + 10) = c1_nxt2; + *(b + 11) = c1_nxt3; + *(b + 12) = c1_nxt4; + *(b + 13) = c1_nxt5; + *(b + 14) = c1_nxt6; + *(b + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res[8]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 4; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + 
res[7] += a[1] * b[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + + a += 2; + b += 4; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + c1 -= c0 * a1; + c1_nxt1 -= c0_nxt1 * a1; + c1_nxt2 -= c0_nxt2 * a1; + c1_nxt3 -= c0_nxt3 * a1; + c1 *= a3; + c1_nxt1 *= a3; + c1_nxt2 *= a3; + c1_nxt3 *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, res1, res2, res3; + + res0 = a[0] * b[0]; + res1 = a[1] * b[0]; + res2 = a[0] * b[1]; + res3 = a[1] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 2; + + res0 += a[0] * b[0]; + res1 += a[1] * b[0]; + res2 += a[0] * b[1]; + res3 += a[1] * b[1]; + } + + c0 -= res0; + c1 -= res1; + + c0_nxt -= res2; + c1_nxt -= res3; + + a += 2; + b += 2; + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c0_nxt *= a0; + c1 -= c0 * a1; + c1_nxt -= c0_nxt * a1; + c1 *= a3; + c1_nxt *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, 
res1; + + res0 = a[0] * b[0]; + res1 = a[1] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 1; + + res0 += a[0] * b[0]; + res1 += a[1] * b[0]; + } + + c0 -= res0; + c1 -= res1; + + a += 2; + b += 1; + } + + c0 *= *(a + 0); + + c1 -= c0 * *(a + 1); + c1 *= *(a + 3); + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG k; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = a[0] * b[0]; + c1 = a[0] * b[1]; + c2 = a[0] * b[2]; + c3 = a[0] * b[3]; + c4 = a[0] * b[4]; + c5 = a[0] * b[5]; + c6 = a[0] * b[6]; + c7 = a[0] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 8; + + c0 += a[0] * b[0]; + c1 += a[0] * b[1]; + c2 += a[0] * b[2]; + c3 += a[0] * b[3]; + c4 += a[0] * b[4]; + c5 += a[0] * b[5]; + c6 += a[0] * b[6]; + c7 += a[0] * b[7]; + } + + *(c + 0 * ldc) -= c0; + *(c + 1 * ldc) -= c1; + *(c + 2 * ldc) -= c2; + *(c + 3 * ldc) -= c3; + *(c + 4 * ldc) -= c4; + *(c + 5 * ldc) -= c5; + *(c + 6 * ldc) -= c6; + *(c + 7 * ldc) -= c7; + + a += 1; + b += 8; + } + + *c *= *a; + *(c + ldc) *= *a; + *(c + 2 * ldc) *= *a; + *(c + 3 * ldc) *= *a; + *(c + 4 * ldc) *= *a; + *(c + 5 * ldc) *= *a; + *(c + 6 * ldc) *= *a; + *(c + 7 * ldc) *= *a; + + *b = *c; + *(b + 1) = *(c + ldc); + *(b + 2) = *(c + 2 * ldc); + *(b + 3) = *(c + 3 * ldc); + *(b + 4) = *(c + 4 * ldc); + *(b + 5) = *(c + 5 * ldc); + *(b + 6) = *(c + 6 * ldc); + *(b + 7) = *(c + 7 * ldc); +} + +static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0 * ldc); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, res1, res2, res3; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + res2 = a[0] * b[2]; + res3 = a[0] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 4; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + res2 += 
a[0] * b[2]; + res3 += a[0] * b[3]; + } + + c0 -= res0; + c1 -= res1; + c2 -= res2; + c3 -= res3; + a += 1; + b += 4; + } + + c0 *= *a; + c1 *= *a; + c2 *= *a; + c3 *= *a; + + *c = c0; + *(c + ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + + *b = *c; + *(b + 1) = *(c + ldc); + *(b + 2) = *(c + 2 * ldc); + *(b + 3) = *(c + 3 * ldc); +} + +static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c1; + + c0 = *c; + c1 = *(c + ldc); + + if (bk > 0) + { + BLASLONG k; + FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 2; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + } + + c0 -= res0; + c1 -= res1; + + a += 1; + b += 2; + } + + *c = c0 * *a; + *(c + ldc) = c1 * *a; + + *b = *c; + *(b + 1) = *(c + ldc); +} + +static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk) + { + BLASLONG k; + FLOAT res; + + res = a[0] * b[0]; + + for (k = (bk - 1); k--;) + { + a++; + b++; + + res += a[0] * b[0]; + } + + *c -= res; + + a++; + b++; + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + for (j = (n >> 3); j--;) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x8_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x8_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 
8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x4_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 2) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x2_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = offset; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_lt_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_lt_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + ssolve_2x1_lt_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + ssolve_1x1_lt_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c new file mode 100644 index 000000000..04bca1b12 --- /dev/null +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -0,0 +1,2162 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; + v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; + v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; + v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + a += 8; + b += 8; + + for (k = (bk - 1); k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += 
src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + + a += 8; + b += 8; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); + + src_b = LD_SP(b + 9); + SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); + src_b13 = LD_SP(b + 13); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); + src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); + src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + 
src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + src_c8 -= src_c0 * src_b4; + src_c9 -= src_c1 * src_b4; + src_c10 -= src_c0 * src_b5; + src_c11 -= src_c1 * src_b5; + src_c12 -= src_c0 * src_b6; + src_c13 -= src_c1 * src_b6; + src_c14 -= src_c0 * src_b7; + src_c15 -= src_c1 * src_b7; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + + src_c2 *= src_b9; + src_c3 *= src_b9; + src_c4 -= src_c2 * src_b10; + src_c5 -= src_c3 * src_b10; + src_c6 -= src_c2 * src_b11; + src_c7 -= src_c3 * src_b11; + src_c8 -= src_c2 * src_b12; + src_c9 -= src_c3 * src_b12; + src_c10 -= src_c2 * src_b13; + src_c11 -= src_c3 * src_b13; + src_c12 -= src_c2 * src_b14; + src_c13 -= src_c3 * src_b14; + src_c14 -= src_c2 * src_b15; + src_c15 -= src_c3 * src_b15; + + ST_SP2(src_c2, src_c3, a + 8, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + + src_b = LD_SP(b + 18); + SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); + src_b22 = LD_SP(b + 22); + src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); + src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); + + src_b = LD_SP(b + 27); + SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); + src_b31 = __msa_cast_to_vector_float(*(b + 31)); + src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + + src_c4 *= src_b18; + src_c5 *= src_b18; + src_c6 -= src_c4 * src_b19; + src_c7 -= src_c5 * src_b19; + src_c8 -= src_c4 * src_b20; + src_c9 -= src_c5 * src_b20; + src_c10 -= src_c4 * src_b21; + src_c11 -= src_c5 * src_b21; + src_c12 -= src_c4 * src_b22; + src_c13 -= src_c5 * src_b22; + src_c14 -= src_c4 * src_b23; + src_c15 -= src_c5 * src_b23; + + ST_SP2(src_c4, src_c5, a + 16, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + + src_c6 *= src_b27; + src_c7 *= src_b27; + src_c8 -= src_c6 * src_b28; + src_c9 -= src_c7 * src_b28; + src_c10 -= src_c6 * src_b29; + src_c11 -= src_c7 * src_b29; + src_c12 -= src_c6 * src_b30; + src_c13 -= src_c7 * src_b30; + src_c14 -= 
src_c6 * src_b31; + src_c15 -= src_c7 * src_b31; + + ST_SP2(src_c6, src_c7, a + 24, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); + + src_b = LD_SP(b + 36); + SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); + + src_b45 = LD_SP(b + 45); + src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); + src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); + + src_b54 = __msa_cast_to_vector_float(*(b + 54)); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); + src_b55 = __msa_cast_to_vector_float(*(b + 55)); + src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); + src_b63 = __msa_cast_to_vector_float(*(b + 63)); + src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + + src_c8 *= src_b36; + src_c9 *= src_b36; + src_c10 -= src_c8 * src_b37; + src_c11 -= src_c9 * src_b37; + src_c12 -= src_c8 * src_b38; + src_c13 -= src_c9 * src_b38; + src_c14 -= src_c8 * src_b39; + src_c15 -= src_c9 * src_b39; + + ST_SP2(src_c8, src_c9, a + 32, 4); + ST_SP2(src_c8, src_c9, c_nxt4line, 4); + + src_c10 *= src_b45; + src_c11 *= src_b45; + src_c12 -= src_c10 * src_b46; + src_c13 -= src_c11 * src_b46; + src_c14 -= src_c10 * src_b47; + src_c15 -= src_c11 * src_b47; + + ST_SP2(src_c10, src_c11, a + 40, 4); + ST_SP2(src_c10, src_c11, c_nxt5line, 4); + + src_c12 *= src_b54; + src_c13 *= src_b54; + src_c14 -= src_c12 * src_b55; + src_c15 -= src_c13 * src_b55; + + ST_SP2(src_c12, src_c13, a + 48, 4); + ST_SP2(src_c12, src_c13, c_nxt6line, 4); + + src_c14 *= src_b63; + src_c15 *= src_b63; + + ST_SP2(src_c14, src_c15, a + 56, 4); + ST_SP2(src_c14, src_c15, c_nxt7line, 4); +} + +static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v4f32 src_b10, src_b11, src_b15, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * 
ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + a += 8; + b += 4; + + for (k = (bk - 1) / 2; k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + } + + if ((bk - 1) & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + a += 8; + b += 4; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, 
src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b5 = LD_SP(b + 5); + src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); + src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b10 = __msa_cast_to_vector_float(*(b + 10)); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); + src_b11 = __msa_cast_to_vector_float(*(b + 11)); + src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); + src_b15 = __msa_cast_to_vector_float(*(b + 15)); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + + src_c2 *= src_b5; + src_c3 *= src_b5; + src_c4 -= src_c2 * src_b6; + src_c5 -= src_c3 * src_b6; + src_c6 -= src_c2 * src_b7; + src_c7 -= src_c3 * src_b7; + + src_c4 *= src_b10; + src_c5 *= src_b10; + src_c6 -= src_c4 * src_b11; + src_c7 -= src_c5 * src_b11; + + src_c6 *= src_b15; + src_c7 *= src_b15; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1, res2, res3; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 
= src_a0 * src_b1; + res3 = src_a1 * src_b1; + + a += 8; + b += 2; + + for (k = (bk - 1) / 2; k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + a += 8; + b += 2; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + a += 8; + b += 2; + } + + if ((bk - 1) & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + a += 8; + b += 2; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + } + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + src_c2 *= src_b3; + src_c3 *= src_b3; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP2(src_c0, 
src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, src_a1, res0, res1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + + a += 8; + b += 1; + + for (k = (bk - 1) >> 2; k--;) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + } + + if ((bk - 1) & 3) + { + if ((bk - 1) & 2) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + } + + if ((bk - 1) & 1) + { + LD_SP2(a, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*b); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * 
src_b0; + res1 += src_a1 * src_b0; + + a += 8; + b += 1; + } + } + + LD_SP2(c, 4, src_c0, src_c1); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + } + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); +} + +static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; + v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; + v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; + v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; + v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, res0, res1, res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + a += 4; + b += 8; + + for (k = (bk - 1) / 2; k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, 
src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + } + + if ((bk - 1) & 1) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + + a += 4; + b += 8; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b = LD_SP(b + 4); + SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); + + src_b = LD_SP(b + 9); + SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); + src_b13 = LD_SP(b + 13); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); + src_b14 = (v4f32) 
__msa_splati_w((v4i32) src_b13, 1); + src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); + + src_b = LD_SP(b + 18); + SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); + src_b22 = LD_SP(b + 22); + src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); + src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); + + src_b = LD_SP(b + 27); + SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); + src_b31 = __msa_cast_to_vector_float(*(b + 31)); + src_b31 = (v4f32) __msa_splati_w((v4i32) src_b31, 0); + + src_b = LD_SP(b + 36); + SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); + + src_b45 = LD_SP(b + 45); + src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); + src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); + + src_b54 = __msa_cast_to_vector_float(*(b + 54)); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b54, 0); + src_b55 = __msa_cast_to_vector_float(*(b + 55)); + src_b55 = (v4f32) __msa_splati_w((v4i32) src_b55, 0); + src_b63 = __msa_cast_to_vector_float(*(b + 63)); + src_b63 = (v4f32) __msa_splati_w((v4i32) src_b63, 0); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c2 -= src_c0 * src_b2; + src_c3 -= src_c0 * src_b3; + src_c4 -= src_c0 * src_b4; + src_c5 -= src_c0 * src_b5; + src_c6 -= src_c0 * src_b6; + src_c7 -= src_c0 * src_b7; + + src_c1 *= src_b9; + src_c2 -= src_c1 * src_b10; + src_c3 -= src_c1 * src_b11; + src_c4 -= src_c1 * src_b12; + src_c5 -= src_c1 * src_b13; + src_c6 -= src_c1 * src_b14; + src_c7 -= src_c1 * src_b15; + + src_c2 *= src_b18; + src_c3 -= src_c2 * src_b19; + src_c4 -= src_c2 * src_b20; + src_c5 -= src_c2 * src_b21; + src_c6 -= src_c2 * src_b22; + src_c7 -= src_c2 * src_b23; + + src_c3 *= src_b27; + src_c4 -= src_c3 * src_b28; + src_c5 -= src_c3 * src_b29; + src_c6 -= src_c3 * src_b30; + src_c7 -= src_c3 * src_b31; + + src_c4 *= src_b36; + src_c5 -= src_c4 * src_b37; + src_c6 -= src_c4 * src_b38; + src_c7 -= src_c4 * src_b39; + + src_c5 *= src_b45; + 
src_c6 -= src_c5 * src_b46; + src_c7 -= src_c5 * src_b47; + + src_c6 *= src_b54; + src_c7 -= src_c6 * src_b55; + + src_c7 *= src_b63; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); + ST_SP(src_c4, c_nxt4line); + ST_SP(src_c5, c_nxt5line); + ST_SP(src_c6, c_nxt6line); + ST_SP(src_c7, c_nxt7line); +} + +static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3; + v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a0, res0, res1, res2, res3; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + a += 4; + b += 4; + + for (k = ((bk - 1) >> 1); k--;) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + if ((bk - 1) & 1) + { + src_a0 = LD_SP(a); + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + a += 4; + b += 4; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + + src_c0 -= 
res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + } + + src_b = LD_SP(b + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + src_b5 = LD_SP(b + 5); + src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); + src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b10 = __msa_cast_to_vector_float(*(b + 10)); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b10, 0); + src_b11 = __msa_cast_to_vector_float(*(b + 11)); + src_b11 = (v4f32) __msa_splati_w((v4i32) src_b11, 0); + src_b15 = __msa_cast_to_vector_float(*(b + 15)); + src_b15 = (v4f32) __msa_splati_w((v4i32) src_b15, 0); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c2 -= src_c0 * src_b2; + src_c3 -= src_c0 * src_b3; + + src_c1 *= src_b5; + src_c2 -= src_c1 * src_b6; + src_c3 -= src_c1 * src_b7; + + src_c2 *= src_b10; + src_c3 -= src_c2 * src_b11; + + src_c3 *= src_b15; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); + ST_SP(src_c2, c_nxt2line); + ST_SP(src_c3, c_nxt3line); +} + +static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0, src_b1, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk) + { + BLASLONG k; + v4f32 src_a, res0, res1; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a * src_b0; + res1 = src_a * src_b1; + + a += 4; + b += 2; + + for (k = ((bk - 1) >> 1); k--;) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + a += 4; + b += 2; + + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) 
__msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + a += 4; + b += 2; + } + + if ((bk - 1) & 1) + { + src_a = LD_SP(a); + src_b0 = LD_SP(b); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a * src_b0; + res1 += src_a * src_b1; + + a += 4; + b += 2; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + } + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(b + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + + src_c0 *= src_b0; + src_c1 -= src_c0 * src_b1; + src_c1 *= src_b3; + + ST_SP2(src_c0, src_c1, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); +} + +static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk) + { + BLASLONG k; + FLOAT t0, t1, t2, t3; + + t0 = a[0] * b[0]; + t1 = a[1] * b[0]; + t2 = a[2] * b[0]; + t3 = a[3] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 4; + b += 1; + + t0 += a[0] * b[0]; + t1 += a[1] * b[0]; + t2 += a[2] * b[0]; + t3 += a[3] * b[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + + a += 4; + b += 1; + } + + b0 = *(b + 0); + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; + FLOAT b18, b19, 
b20, b21, b22, b23, b27, b28, b29, b30, b31; + FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + FLOAT c0_nxt7, c1_nxt7; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * ldc); + c1_nxt4 = *(c + 1 + 4 * ldc); + c0_nxt5 = *(c + 0 + 5 * ldc); + c1_nxt5 = *(c + 1 + 5 * ldc); + c0_nxt6 = *(c + 0 + 6 * ldc); + c1_nxt6 = *(c + 1 + 6 * ldc); + c0_nxt7 = *(c + 0 + 7 * ldc); + c1_nxt7 = *(c + 1 + 7 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT res[16]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = a[0] * b[3]; + res[7] = a[1] * b[3]; + res[8] = a[0] * b[4]; + res[9] = a[1] * b[4]; + res[10] = a[0] * b[5]; + res[11] = a[1] * b[5]; + res[12] = a[0] * b[6]; + res[13] = a[1] * b[6]; + res[14] = a[0] * b[7]; + res[15] = a[1] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 8; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + res[8] += a[0] * b[4]; + res[9] += a[1] * b[4]; + res[10] += a[0] * b[5]; + res[11] += a[1] * b[5]; + res[12] += a[0] * b[6]; + res[13] += a[1] * b[6]; + res[14] += a[0] * b[7]; + res[15] += a[1] * b[7]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + c0_nxt4 -= res[8]; + c1_nxt4 -= res[9]; + c0_nxt5 -= res[10]; + c1_nxt5 -= res[11]; + c0_nxt6 -= res[12]; + c1_nxt6 -= res[13]; + c0_nxt7 -= res[14]; + c1_nxt7 -= res[15]; + + a += 2; + b 
+= 8; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b4 = *(b + 4); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b9 = *(b + 9); + b10 = *(b + 10); + b11 = *(b + 11); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + b18 = *(b + 18); + b19 = *(b + 19); + b20 = *(b + 20); + b21 = *(b + 21); + b22 = *(b + 22); + b23 = *(b + 23); + b27 = *(b + 27); + b28 = *(b + 28); + b29 = *(b + 29); + b30 = *(b + 30); + b31 = *(b + 31); + b36 = *(b + 36); + b37 = *(b + 37); + b38 = *(b + 38); + b39 = *(b + 39); + b45 = *(b + 45); + b46 = *(b + 46); + b47 = *(b + 47); + b54 = *(b + 54); + b55 = *(b + 55); + b63 = *(b + 63); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + + c0_nxt4 -= c0 * b4; + c1_nxt4 -= c1 * b4; + + c0_nxt5 -= c0 * b5; + c1_nxt5 -= c1 * b5; + + c0_nxt6 -= c0 * b6; + c1_nxt6 -= c1 * b6; + + c0_nxt7 -= c0 * b7; + c1_nxt7 -= c1 * b7; + + c0_nxt1 *= b9; + c1_nxt1 *= b9; + + c0_nxt2 -= c0_nxt1 * b10; + c1_nxt2 -= c1_nxt1 * b10; + + c0_nxt3 -= c0_nxt1 * b11; + c1_nxt3 -= c1_nxt1 * b11; + + c0_nxt4 -= c0_nxt1 * b12; + c1_nxt4 -= c1_nxt1 * b12; + + c0_nxt5 -= c0_nxt1 * b13; + c1_nxt5 -= c1_nxt1 * b13; + + c0_nxt6 -= c0_nxt1 * b14; + c1_nxt6 -= c1_nxt1 * b14; + + c0_nxt7 -= c0_nxt1 * b15; + c1_nxt7 -= c1_nxt1 * b15; + + c0_nxt2 *= b18; + c1_nxt2 *= b18; + + c0_nxt3 -= c0_nxt2 * b19; + c1_nxt3 -= c1_nxt2 * b19; + + c0_nxt4 -= c0_nxt2 * b20; + c1_nxt4 -= c1_nxt2 * b20; + + c0_nxt5 -= c0_nxt2 * b21; + c1_nxt5 -= c1_nxt2 * b21; + + c0_nxt6 -= c0_nxt2 * b22; + c1_nxt6 -= c1_nxt2 * b22; + + c0_nxt7 -= c0_nxt2 * b23; + c1_nxt7 -= c1_nxt2 * b23; + + c0_nxt3 *= b27; + c1_nxt3 *= b27; + + c0_nxt4 -= c0_nxt3 * b28; + c1_nxt4 -= c1_nxt3 * b28; + + c0_nxt5 -= c0_nxt3 * b29; + c1_nxt5 -= c1_nxt3 * b29; + + c0_nxt6 -= c0_nxt3 * b30; + c1_nxt6 -= c1_nxt3 * b30; + + c0_nxt7 -= c0_nxt3 * b31; + c1_nxt7 -= 
c1_nxt3 * b31; + + c0_nxt4 *= b36; + c1_nxt4 *= b36; + + c0_nxt5 -= c0_nxt4 * b37; + c1_nxt5 -= c1_nxt4 * b37; + + c0_nxt6 -= c0_nxt4 * b38; + c1_nxt6 -= c1_nxt4 * b38; + + c0_nxt7 -= c0_nxt4 * b39; + c1_nxt7 -= c1_nxt4 * b39; + + c0_nxt5 *= b45; + c1_nxt5 *= b45; + + c0_nxt6 -= c0_nxt5 * b46; + c1_nxt6 -= c1_nxt5 * b46; + + c0_nxt7 -= c0_nxt5 * b47; + c1_nxt7 -= c1_nxt5 * b47; + + c0_nxt6 *= b54; + c1_nxt6 *= b54; + + c0_nxt7 -= c0_nxt6 * b55; + c1_nxt7 -= c1_nxt6 * b55; + + c0_nxt7 *= b63; + c1_nxt7 *= b63; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + *(a + 8) = c0_nxt4; + *(a + 9) = c1_nxt4; + *(a + 10) = c0_nxt5; + *(a + 11) = c1_nxt5; + *(a + 12) = c0_nxt6; + *(a + 13) = c1_nxt6; + *(a + 14) = c0_nxt7; + *(a + 15) = c1_nxt7; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; + *(c + 0 + 4 * ldc) = c0_nxt4; + *(c + 1 + 4 * ldc) = c1_nxt4; + *(c + 0 + 5 * ldc) = c0_nxt5; + *(c + 1 + 5 * ldc) = c1_nxt5; + *(c + 0 + 6 * ldc) = c0_nxt6; + *(c + 1 + 6 * ldc) = c1_nxt6; + *(c + 0 + 7 * ldc) = c0_nxt7; + *(c + 1 + 7 * ldc) = c1_nxt7; +} + +static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1; + FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT res[8]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + res[4] = a[0] * b[2]; + res[5] = a[1] * b[2]; + res[6] = 
a[0] * b[3]; + res[7] = a[1] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 4; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] * b[1]; + res[4] += a[0] * b[2]; + res[5] += a[1] * b[2]; + res[6] += a[0] * b[3]; + res[7] += a[1] * b[3]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt1 -= res[2]; + c1_nxt1 -= res[3]; + c0_nxt2 -= res[4]; + c1_nxt2 -= res[5]; + c0_nxt3 -= res[6]; + c1_nxt3 -= res[7]; + + a += 2; + b += 4; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c1_nxt2 -= c1_nxt1 * b6; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c1_nxt3 -= c1_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c1_nxt3 -= c1_nxt2 * b11; + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk) + { + BLASLONG k; + FLOAT res[4]; + + res[0] = a[0] * b[0]; + res[1] = a[1] * b[0]; + res[2] = a[0] * b[1]; + res[3] = a[1] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 2; + + res[0] += a[0] * b[0]; + res[1] += a[1] * b[0]; + res[2] += a[0] * b[1]; + res[3] += a[1] 
* b[1]; + } + + c0 -= res[0]; + c1 -= res[1]; + c0_nxt -= res[2]; + c1_nxt -= res[3]; + + a += 2; + b += 2; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + c1 *= b0; + + c0_nxt -= c0 * b1; + c1_nxt -= c1 * b1; + + c0_nxt *= b3; + c1_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk) + { + BLASLONG k; + FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[1] * b[0]; + + for (k = (bk - 1); k--;) + { + a += 2; + b += 1; + + res0 += a[0] * b[0]; + res1 += a[1] * b[0]; + } + + c0 -= res0; + c1 -= res1; + + a += 2; + b += 1; + } + + b0 = *(b + 0); + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; + FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38; + FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + c4 = *(c + 4 * ldc); + c5 = *(c + 5 * ldc); + c6 = *(c + 6 * ldc); + c7 = *(c + 7 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = a[0] * b[0]; + t1 = a[0] * b[1]; + t2 = a[0] * b[2]; + t3 = a[0] * b[3]; + t4 = a[0] * b[4]; + t5 = a[0] * b[5]; + t6 = a[0] * b[6]; + t7 = a[0] * b[7]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 8; + + t0 += a[0] * b[0]; + t1 += a[0] * b[1]; + t2 += a[0] * b[2]; + t3 += a[0] * b[3]; + t4 += a[0] * b[4]; + t5 += a[0] * b[5]; + t6 += a[0] * b[6]; + t7 += a[0] * b[7]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + c4 -= t4; + c5 -= t5; + c6 
-= t6; + c7 -= t7; + + a += 1; + b += 8; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b4 = *(b + 4); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b9 = *(b + 9); + b10 = *(b + 10); + b11 = *(b + 11); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + b18 = *(b + 18); + b19 = *(b + 19); + b20 = *(b + 20); + b21 = *(b + 21); + b22 = *(b + 22); + b23 = *(b + 23); + b27 = *(b + 27); + b28 = *(b + 28); + b29 = *(b + 29); + b30 = *(b + 30); + b31 = *(b + 31); + b36 = *(b + 36); + b37 = *(b + 37); + b38 = *(b + 38); + b39 = *(b + 39); + b45 = *(b + 45); + b46 = *(b + 46); + b47 = *(b + 47); + b54 = *(b + 54); + b55 = *(b + 55); + b63 = *(b + 63); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b9; + + c2 -= c0 * b2; + c2 -= c1 * b10; + c2 *= b18; + + c3 -= c0 * b3; + c3 -= c1 * b11; + c3 -= c2 * b19; + c3 *= b27; + + c4 -= c0 * b4; + c4 -= c1 * b12; + c4 -= c2 * b20; + c4 -= c3 * b28; + c4 *= b36; + + c5 -= c0 * b5; + c5 -= c1 * b13; + c5 -= c2 * b21; + c5 -= c3 * b29; + c5 -= c4 * b37; + c5 *= b45; + + c6 -= c0 * b6; + c6 -= c1 * b14; + c6 -= c2 * b22; + c6 -= c3 * b30; + c6 -= c4 * b38; + c6 -= c5 * b46; + c6 *= b54; + + c7 -= c0 * b7; + c7 -= c1 * b15; + c7 -= c2 * b23; + c7 -= c3 * b31; + c7 -= c4 * b39; + c7 -= c5 * b47; + c7 -= c6 * b55; + c7 *= b63; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + *(a + 4) = c4; + *(a + 5) = c5; + *(a + 6) = c6; + *(a + 7) = c7; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; + *(c + 4 * ldc) = c4; + *(c + 5 * ldc) = c5; + *(c + 6 * ldc) = c6; + *(c + 7 * ldc) = c7; +} + +static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1 * ldc); + c2 = *(c + 2 * ldc); + c3 = *(c + 3 * ldc); + + if (bk) + { + BLASLONG k; + FLOAT res0, res1, res2, res3; + + res0 = a[0] * b[0]; + res1 = a[0] * 
b[1]; + res2 = a[0] * b[2]; + res3 = a[0] * b[3]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 4; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + res2 += a[0] * b[2]; + res3 += a[0] * b[3]; + } + + c0 -= res0; + c1 -= res1; + c2 -= res2; + c3 -= res3; + + a += 1; + b += 4; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b5; + + c2 -= c0 * b2; + c2 -= c1 * b6; + c2 *= b10; + + c3 -= c0 * b3; + c3 -= c1 * b7; + c3 -= c2 * b11; + c3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c1; + *(c + 2 * ldc) = c2; + *(c + 3 * ldc) = c3; +} + +static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + ldc); + + if (bk) + { + BLASLONG k; + FLOAT res0, res1; + + res0 = a[0] * b[0]; + res1 = a[0] * b[1]; + + for (k = (bk - 1); k--;) + { + a += 1; + b += 2; + + res0 += a[0] * b[0]; + res1 += a[0] * b[1]; + } + + c0 -= res0; + c1 -= res1; + + a += 1; + b += 2; + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + + c1 -= c0 * b1; + c1 *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + ldc) = c1; +} + +static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk) + { + BLASLONG k; + FLOAT res; + + res = a[0] * b[0]; + + for (k = (bk - 1); k--;) + { + a++; + b++; + + res += a[0] * b[0]; + } + + *c -= res; + + a++; + b++; + } + + *c *= *b; + *a = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + FLOAT *aa, *cc; + BLASLONG i, j, kk; + + kk = -offset; + + for (j = (n >> 3); j--;) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + 
} + + if (m & 7) + { + if (m & 4) + { + ssolve_4x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x8_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x8_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + kk += 8; + b += 8 * k; + c += 8 * ldc; + } + + if (n & 7) + { + if (n & 4) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x4_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + kk += 4; + } + + if (n & 2) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x2_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + kk += 2; + } + + if (n & 1) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + ssolve_8x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + ssolve_4x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + ssolve_2x1_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + ssolve_1x1_rn_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + } + } + + b += k; + c += ldc; + kk += 1; + } + } + + return 0; +} diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c new file mode 100644 index 000000000..25a8a0b6e --- /dev/null +++ 
b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -0,0 +1,2118 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; + v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; + v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; + v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; + v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + v4f32 res8, res9, res10, res11, res12, res13, res14, res15; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 = src_a0 * src_b0; + res9 = src_a1 * src_b0; + res10 = src_a0 * src_b1; + res11 = src_a1 * src_b1; + res12 = src_a0 * src_b2; + res13 = src_a1 * src_b2; + res14 = src_a0 * src_b3; + res15 = src_a1 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 8; + bb += 8; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, 
src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res8 += src_a0 * src_b0; + res9 += src_a1 * src_b0; + res10 += src_a0 * src_b1; + res11 += src_a1 * src_b1; + res12 += src_a0 * src_b2; + res13 += src_a1 * src_b2; + res14 += src_a0 * src_b3; + res15 += src_a1 * src_b3; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + src_c8 -= res8; + src_c9 -= res9; + src_c10 -= res10; + src_c11 -= res11; + src_c12 -= res12; + src_c13 -= res13; + src_c14 -= res14; + src_c15 -= res15; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + LD_SP2(c_nxt4line, 4, src_c8, src_c9); + LD_SP2(c_nxt5line, 4, src_c10, src_c11); + LD_SP2(c_nxt6line, 4, src_c12, src_c13); + LD_SP2(c_nxt7line, 4, src_c14, src_c15); + } + + b -= 64; + + src_b = LD_SP(b + 60); + SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); + src_b = LD_SP(b + 56); + SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); + + src_c15 *= src_b63; + src_c14 *= src_b63; + src_c13 -= src_c15 * src_b62; + src_c12 -= src_c14 * src_b62; + src_c11 -= src_c15 * src_b61; + src_c10 -= src_c14 * src_b61; + src_c9 -= src_c15 * src_b60; + src_c8 -= src_c14 * src_b60; + src_c7 -= src_c15 * src_b59; + src_c6 -= src_c14 * src_b59; + src_c5 -= 
src_c15 * src_b58; + src_c4 -= src_c14 * src_b58; + src_c3 -= src_c15 * src_b57; + src_c2 -= src_c14 * src_b57; + src_c1 -= src_c15 * src_b56; + src_c0 -= src_c14 * src_b56; + + src_b = LD_SP(b + 48); + SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); + src_b52 = LD_SP(b + 52); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); + src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); + src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); + + src_c12 *= src_b54; + src_c13 *= src_b54; + src_c10 -= src_c12 * src_b53; + src_c11 -= src_c13 * src_b53; + src_c8 -= src_c12 * src_b52; + src_c9 -= src_c13 * src_b52; + src_c6 -= src_c12 * src_b51; + src_c7 -= src_c13 * src_b51; + src_c4 -= src_c12 * src_b50; + src_c5 -= src_c13 * src_b50; + src_c2 -= src_c12 * src_b49; + src_c3 -= src_c13 * src_b49; + src_c0 -= src_c12 * src_b48; + src_c1 -= src_c13 * src_b48; + + ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4); + ST_SP2(src_c12, src_c13, c_nxt6line, 4); + ST_SP2(src_c14, src_c15, c_nxt7line, 4); + + src_b = LD_SP(b + 40); + SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); + src_b44 = LD_SP(b + 44); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); + src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); + + src_c10 *= src_b45; + src_c11 *= src_b45; + src_c8 -= src_c10 * src_b44; + src_c9 -= src_c11 * src_b44; + src_c6 -= src_c10 * src_b43; + src_c7 -= src_c11 * src_b43; + src_c4 -= src_c10 * src_b42; + src_c5 -= src_c11 * src_b42; + src_c2 -= src_c10 * src_b41; + src_c3 -= src_c11 * src_b41; + src_c0 -= src_c10 * src_b40; + src_c1 -= src_c11 * src_b40; + + src_b = LD_SP(b + 32); + SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); + src_b36 = __msa_cast_to_vector_float(*(b + 36)); + src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + + src_c8 *= src_b36; + src_c9 *= src_b36; + src_c6 -= src_c8 * src_b35; + src_c7 -= src_c9 * src_b35; + src_c4 -= src_c8 * src_b34; + src_c5 -= src_c9 * src_b34; + src_c2 -= src_c8 * src_b33; + src_c3 -= 
src_c9 * src_b33; + src_c0 -= src_c8 * src_b32; + src_c1 -= src_c9 * src_b32; + + ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4); + ST_SP2(src_c8, src_c9, c_nxt4line, 4); + ST_SP2(src_c10, src_c11, c_nxt5line, 4); + + src_b = LD_SP(b + 24); + SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); + + src_c6 *= src_b27; + src_c7 *= src_b27; + src_c4 -= src_c6 * src_b26; + src_c5 -= src_c7 * src_b26; + src_c2 -= src_c6 * src_b25; + src_c3 -= src_c7 * src_b25; + src_c0 -= src_c6 * src_b24; + src_c1 -= src_c7 * src_b24; + + src_b16 = LD_SP(b + 16); + src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); + src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); + src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); + + src_c4 *= src_b18; + src_c5 *= src_b18; + src_c2 -= src_c4 * src_b17; + src_c3 -= src_c5 * src_b17; + src_c0 -= src_c4 * src_b16; + src_c1 -= src_c5 * src_b16; + + ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); + + src_b9 = __msa_cast_to_vector_float(*(b + 9)); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0); + src_b8 = __msa_cast_to_vector_float(*(b + 8)); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c2 *= src_b9; + src_c3 *= src_b9; + src_c0 -= src_c2 * src_b8; + src_c1 -= src_c3 * src_b8; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12; + v4f32 src_b13, src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + if (bk > 
0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + res4 = src_a0 * src_b2; + res5 = src_a1 * src_b2; + res6 = src_a0 * src_b3; + res7 = src_a1 * src_b3; + + for (k = (bk - 1) / 2; k--;) + { + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + } + + if ((bk - 1) & 1) + { + aa += 8; + bb += 4; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + res4 += src_a0 * src_b2; + res5 += src_a1 * src_b2; + res6 += src_a0 * src_b3; + res7 += src_a1 * src_b3; + + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, 
src_c2, src_c3); + LD_SP2(c_nxt2line, 4, src_c4, src_c5); + LD_SP2(c_nxt3line, 4, src_c6, src_c7); + } + + a -= 32; + b -= 16; + + src_b = LD_SP(b + 12); + SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); + src_b8 = LD_SP(b + 8); + src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); + src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); + src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); + src_b5 = __msa_cast_to_vector_float(*(b + 5)); + src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); + src_b4 = __msa_cast_to_vector_float(*(b + 4)); + src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c7 *= src_b15; + src_c6 *= src_b15; + src_c5 -= src_c7 * src_b14; + src_c4 -= src_c6 * src_b14; + src_c3 -= src_c7 * src_b13; + src_c2 -= src_c6 * src_b13; + src_c1 -= src_c7 * src_b12; + src_c0 -= src_c6 * src_b12; + + src_c5 *= src_b10; + src_c4 *= src_b10; + src_c3 -= src_c5 * src_b9; + src_c2 -= src_c4 * src_b9; + src_c1 -= src_c5 * src_b8; + src_c0 -= src_c4 * src_b8; + + src_c3 *= src_b5; + src_c2 *= src_b5; + src_c1 -= src_c3 * src_b4; + src_c0 -= src_c2 * src_b4; + + src_c1 *= src_b0; + src_c0 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); + + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); + ST_SP2(src_c4, src_c5, c_nxt2line, 4); + ST_SP2(src_c6, src_c7, c_nxt3line, 4); +} + +static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; + FLOAT *c_nxt1line = c + ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, src_b1, res0, res1, res2, res3; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = 
(v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + res2 = src_a0 * src_b1; + res3 = src_a1 * src_b1; + + for (k = (bk - 1) >> 1; k--;) + { + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + } + + if ((bk - 1) & 1) + { + aa += 8; + bb += 2; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b1 = __msa_cast_to_vector_float(*(bb + 1)); + src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + res2 += src_a0 * src_b1; + res3 += src_a1 * src_b1; + } + + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + LD_SP2(c_nxt1line, 4, src_c2, src_c3); + } + + a -= 16; + b -= 4; + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + src_b2 = __msa_cast_to_vector_float(*(b + 2)); + src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + + src_c2 *= src_b3; + src_c3 *= src_b3; + src_c0 -= src_c2 * src_b2; + src_c1 -= 
src_c3 * src_b2; + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); + ST_SP2(src_c0, src_c1, c, 4); + ST_SP2(src_c2, src_c3, c_nxt1line, 4); +} + +static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_b0; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_a1, res0, res1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 = src_a0 * src_b0; + res1 = src_a1 * src_b0; + + for (k = (bk - 1) >> 2; k--;) + { + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + } + + if ((bk - 1) & 3) + { + if ((bk - 1) & 2) + { + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + + aa += 8; + bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + } + + if ((bk - 1) & 1) + { + aa += 8; + 
bb += 1; + + LD_SP2(aa, 4, src_a0, src_a1); + + src_b0 = __msa_cast_to_vector_float(*bb); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + res0 += src_a0 * src_b0; + res1 += src_a1 * src_b0; + } + } + + LD_SP2(c, 4, src_c0, src_c1); + + src_c0 -= res0; + src_c1 -= res1; + } + else + { + LD_SP2(c, 4, src_c0, src_c1); + } + + a -= 8; + b -= 1; + + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + ST_SP2(src_c0, src_c1, c, 4); +} + +static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; + v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; + v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; + v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; + v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + FLOAT *c_nxt4line = c + 4 * ldc; + FLOAT *c_nxt5line = c + 5 * ldc; + FLOAT *c_nxt6line = c + 6 * ldc; + FLOAT *c_nxt7line = c + 7 * ldc; + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + v4f32 src_a0, src_b1, src_b2, src_b3; + v4f32 res0, res1, res2, res3, res4, res5, res6, res7; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb + 0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 = src_a0 * src_b0; + res1 = src_a0 * src_b1; + res2 = src_a0 * src_b2; + res3 = src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 = src_a0 * src_b0; + res5 = src_a0 * src_b1; + res6 = src_a0 * src_b2; + res7 = src_a0 * src_b3; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 8; + + src_a0 = LD_SP(aa); + + src_b = LD_SP(bb + 
0); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res0 += src_a0 * src_b0; + res1 += src_a0 * src_b1; + res2 += src_a0 * src_b2; + res3 += src_a0 * src_b3; + + src_b = LD_SP(bb + 4); + SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); + res4 += src_a0 * src_b0; + res5 += src_a0 * src_b1; + res6 += src_a0 * src_b2; + res7 += src_a0 * src_b3; + } + + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + + src_c0 -= res0; + src_c1 -= res1; + src_c2 -= res2; + src_c3 -= res3; + src_c4 -= res4; + src_c5 -= res5; + src_c6 -= res6; + src_c7 -= res7; + } + else + { + src_c0 = LD_SP(c); + src_c1 = LD_SP(c_nxt1line); + src_c2 = LD_SP(c_nxt2line); + src_c3 = LD_SP(c_nxt3line); + src_c4 = LD_SP(c_nxt4line); + src_c5 = LD_SP(c_nxt5line); + src_c6 = LD_SP(c_nxt6line); + src_c7 = LD_SP(c_nxt7line); + } + + a -= 32; + b -= 64; + + src_b = LD_SP(b + 60); + SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); + src_b = LD_SP(b + 56); + SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); + + src_b = LD_SP(b + 48); + SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); + src_b52 = LD_SP(b + 52); + src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); + src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); + src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); + + src_b = LD_SP(b + 40); + SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); + src_b44 = LD_SP(b + 44); + src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); + src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); + + src_b = LD_SP(b + 32); + SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); + src_b36 = __msa_cast_to_vector_float(*(b + 36)); + src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0); + + src_b = LD_SP(b + 24); + SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); + + src_b16 = LD_SP(b + 16); + 
/* Tail of ssolve_4x8_rt_msa (definition starts above this chunk):
 * splat the remaining lower rows of the 8x8 triangular B block, then
 * back-substitute rows 7..0 of the 4-wide C columns.  Diagonal entries
 * of the packed B are multiplied, not divided by — presumably they are
 * pre-inverted by the trsm packing routine (TODO confirm against the
 * strsm copy kernels). */
    src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
    src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
    src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);

    src_b9 = __msa_cast_to_vector_float(*(b + 9));
    src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0);
    src_b8 = __msa_cast_to_vector_float(*(b + 8));
    src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
    src_b0 = __msa_cast_to_vector_float(*(b + 0));
    src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);

    /* RT solve: last row first, substituting each solved row into all
     * rows above it. */
    src_c7 *= src_b63;
    src_c6 -= src_c7 * src_b62;
    src_c5 -= src_c7 * src_b61;
    src_c4 -= src_c7 * src_b60;
    src_c3 -= src_c7 * src_b59;
    src_c2 -= src_c7 * src_b58;
    src_c1 -= src_c7 * src_b57;
    src_c0 -= src_c7 * src_b56;

    src_c6 *= src_b54;
    src_c5 -= src_c6 * src_b53;
    src_c4 -= src_c6 * src_b52;
    src_c3 -= src_c6 * src_b51;
    src_c2 -= src_c6 * src_b50;
    src_c1 -= src_c6 * src_b49;
    src_c0 -= src_c6 * src_b48;

    src_c5 *= src_b45;
    src_c4 -= src_c5 * src_b44;
    src_c3 -= src_c5 * src_b43;
    src_c2 -= src_c5 * src_b42;
    src_c1 -= src_c5 * src_b41;
    src_c0 -= src_c5 * src_b40;

    src_c4 *= src_b36;
    src_c3 -= src_c4 * src_b35;
    src_c2 -= src_c4 * src_b34;
    src_c1 -= src_c4 * src_b33;
    src_c0 -= src_c4 * src_b32;

    src_c3 *= src_b27;
    src_c2 -= src_c3 * src_b26;
    src_c1 -= src_c3 * src_b25;
    src_c0 -= src_c3 * src_b24;

    src_c2 *= src_b18;
    src_c1 -= src_c2 * src_b17;
    src_c0 -= src_c2 * src_b16;

    src_c1 *= src_b9;
    src_c0 -= src_c1 * src_b8;

    src_c0 *= src_b0;

    /* Write the solved tile back to the packed A buffer and to C. */
    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
    ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);

    ST_SP(src_c0, c);
    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
    ST_SP(src_c4, c_nxt4line);
    ST_SP(src_c5, c_nxt5line);
    ST_SP(src_c6, c_nxt6line);
    ST_SP(src_c7, c_nxt7line);
}

/* Solve a 4x4 tile for the right-transposed (RT) strsm case.
 * a/b point at the packed 4xbk A panel and 4-wide B panel; the 4x4
 * triangular block of B sits at b - 16 after the bk GEMM updates.
 * bk already-solved columns are first subtracted from C (GEMM step),
 * then the 4x4 upper-triangular system is solved bottom-up. */
static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v4f32 src_c0, src_c1, src_c2, src_c3, src_b;
    v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
    v4f32 src_b14, src_b15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        v4f32 src_a, src_b1, src_b2, src_b3, res0, res1, res2, res3;

        /* First GEMM step peeled off so the accumulators start defined. */
        src_a = LD_SP(aa);

        src_b = LD_SP(bb);
        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
        res0 = src_a * src_b0;
        res1 = src_a * src_b1;
        res2 = src_a * src_b2;
        res3 = src_a * src_b3;

        /* Manually 2x-unrolled accumulation over the remaining bk-1 steps. */
        for (k = ((bk - 1) >> 1); k--;)
        {
            aa += 4;
            bb += 4;

            src_a = LD_SP(aa);

            src_b = LD_SP(bb);
            SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);

            res0 += src_a * src_b0;
            res1 += src_a * src_b1;
            res2 += src_a * src_b2;
            res3 += src_a * src_b3;

            aa += 4;
            bb += 4;

            src_a = LD_SP(aa);

            src_b = LD_SP(bb);
            SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);

            res0 += src_a * src_b0;
            res1 += src_a * src_b1;
            res2 += src_a * src_b2;
            res3 += src_a * src_b3;
        }

        /* Odd leftover step of the unrolled loop. */
        if ((bk - 1) & 1)
        {
            aa += 4;
            bb += 4;

            src_a = LD_SP(aa);

            src_b = LD_SP(bb);
            SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);

            res0 += src_a * src_b0;
            res1 += src_a * src_b1;
            res2 += src_a * src_b2;
            res3 += src_a * src_b3;
        }

        src_c0 = LD_SP(c);
        src_c1 = LD_SP(c_nxt1line);
        src_c2 = LD_SP(c_nxt2line);
        src_c3 = LD_SP(c_nxt3line);

        src_c0 -= res0;
        src_c1 -= res1;
        src_c2 -= res2;
        src_c3 -= res3;
    }
    else
    {
        src_c0 = LD_SP(c);
        src_c1 = LD_SP(c_nxt1line);
        src_c2 = LD_SP(c_nxt2line);
        src_c3 = LD_SP(c_nxt3line);
    }

    /* Step back to the 4x4 A tile and the 4x4 triangular block of B. */
    a -= 16;
    b -= 16;

    src_b = LD_SP(b + 12);
    SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
    src_b8 = LD_SP(b + 8);
    src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
    src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
    src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
    src_b5 = __msa_cast_to_vector_float(*(b + 5));
    src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
    src_b4 = __msa_cast_to_vector_float(*(b + 4));
    src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0);
    src_b0 = __msa_cast_to_vector_float(*(b + 0));
    src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);

    /* Bottom-up substitution; diagonal factors are multiplied
     * (presumably pre-inverted during packing). */
    src_c3 *= src_b15;
    src_c2 -= src_c3 * src_b14;
    src_c1 -= src_c3 * src_b13;
    src_c0 -= src_c3 * src_b12;

    src_c2 *= src_b10;
    src_c1 -= src_c2 * src_b9;
    src_c0 -= src_c2 * src_b8;

    src_c1 *= src_b5;
    src_c0 -= src_c1 * src_b4;

    src_c0 *= src_b0;

    ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);

    ST_SP(src_c0, c);
    ST_SP(src_c1, c_nxt1line);
    ST_SP(src_c2, c_nxt2line);
    ST_SP(src_c3, c_nxt3line);
}

/* Solve a 4x2 tile (RT case): two C columns, 2x2 triangular B block. */
static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v4f32 src_c0, src_c1, src_b0, src_b2, src_b3;
    FLOAT *c_nxt1line = c + ldc;

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        v4f32 src_a, src_b1, res0, res1;

        /* Peeled first GEMM step. */
        src_a = LD_SP(aa);
        src_b0 = LD_SP(bb);
        src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
        src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);

        res0 = src_a * src_b0;
        res1 = src_a * src_b1;

        /* 2x-unrolled accumulation. */
        for (k = ((bk - 1) >> 1); k--;)
        {
            aa += 4;
            bb += 2;

            src_a = LD_SP(aa);
            src_b0 = LD_SP(bb);
            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);

            res0 += src_a * src_b0;
            res1 += src_a * src_b1;

            aa += 4;
            bb += 2;

            src_a = LD_SP(aa);
            src_b0 = LD_SP(bb);
            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);

            res0 += src_a * src_b0;
            res1 += src_a * src_b1;
        }

        if ((bk - 1) & 1)
        {
            aa += 4;
            bb += 2;

            src_a = LD_SP(aa);
            src_b0 = LD_SP(bb);
            src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
            src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);

            res0 += src_a * src_b0;
            res1 += src_a * src_b1;
        }

        src_c0 = LD_SP(c);
        src_c1 = LD_SP(c_nxt1line);

        src_c0 -= res0;
        src_c1 -= res1;
    }
    else
    {
        src_c0 = LD_SP(c);
        src_c1 =
LD_SP(c_nxt1line); + } + + a -= 8; + b -= 4; + + src_b3 = __msa_cast_to_vector_float(*(b + 3)); + src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0); + src_b2 = __msa_cast_to_vector_float(*(b + 2)); + src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0); + src_b0 = __msa_cast_to_vector_float(*(b + 0)); + src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); + + src_c1 *= src_b3; + src_c0 -= src_c1 * src_b2; + src_c0 *= src_b0; + + ST_SP2(src_c0, src_c1, a, 4); + + ST_SP(src_c0, c); + ST_SP(src_c1, c_nxt1line); +} + +static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG k; + FLOAT *aa = a, *bb = b; + FLOAT t0, t1, t2, t3; + + t0 = aa[0] * bb[0]; + t1 = aa[1] * bb[0]; + t2 = aa[2] * bb[0]; + t3 = aa[3] * bb[0]; + + for (k = (bk - 1); k--;) + { + aa += 4; + bb += 1; + + t0 += aa[0] * bb[0]; + t1 += aa[1] * bb[0]; + t2 += aa[2] * bb[0]; + t3 += aa[3] * bb[0]; + } + + c0 -= t0; + c1 -= t1; + c2 -= t2; + c3 -= t3; + } + + a -= 4; + b -= 1; + + b0 = *b; + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; + FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; + FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + c0_nxt4 = *(c + 0 + 4 * 
ldc);
    c1_nxt4 = *(c + 1 + 4 * ldc);
    c0_nxt5 = *(c + 0 + 5 * ldc);
    c1_nxt5 = *(c + 1 + 5 * ldc);
    c0_nxt6 = *(c + 0 + 6 * ldc);
    c1_nxt6 = *(c + 1 + 6 * ldc);
    c0_nxt7 = *(c + 0 + 7 * ldc);
    c1_nxt7 = *(c + 1 + 7 * ldc);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res[16];

        /* GEMM update: res[2*j], res[2*j+1] accumulate the two C rows
         * of column j over the bk already-solved columns. */
        res[0] = aa[0] * bb[0];
        res[1] = aa[1] * bb[0];
        res[2] = aa[0] * bb[1];
        res[3] = aa[1] * bb[1];
        res[4] = aa[0] * bb[2];
        res[5] = aa[1] * bb[2];
        res[6] = aa[0] * bb[3];
        res[7] = aa[1] * bb[3];
        res[8] = aa[0] * bb[4];
        res[9] = aa[1] * bb[4];
        res[10] = aa[0] * bb[5];
        res[11] = aa[1] * bb[5];
        res[12] = aa[0] * bb[6];
        res[13] = aa[1] * bb[6];
        res[14] = aa[0] * bb[7];
        res[15] = aa[1] * bb[7];

        for (k = (bk - 1); k--;)
        {
            aa += 2;
            bb += 8;

            res[0] += aa[0] * bb[0];
            res[1] += aa[1] * bb[0];
            res[2] += aa[0] * bb[1];
            res[3] += aa[1] * bb[1];
            res[4] += aa[0] * bb[2];
            res[5] += aa[1] * bb[2];
            res[6] += aa[0] * bb[3];
            res[7] += aa[1] * bb[3];
            res[8] += aa[0] * bb[4];
            res[9] += aa[1] * bb[4];
            res[10] += aa[0] * bb[5];
            res[11] += aa[1] * bb[5];
            res[12] += aa[0] * bb[6];
            res[13] += aa[1] * bb[6];
            res[14] += aa[0] * bb[7];
            res[15] += aa[1] * bb[7];
        }

        c0 -= res[0];
        c1 -= res[1];
        c0_nxt1 -= res[2];
        c1_nxt1 -= res[3];
        c0_nxt2 -= res[4];
        c1_nxt2 -= res[5];
        c0_nxt3 -= res[6];
        c1_nxt3 -= res[7];
        c0_nxt4 -= res[8];
        c1_nxt4 -= res[9];
        c0_nxt5 -= res[10];
        c1_nxt5 -= res[11];
        c0_nxt6 -= res[12];
        c1_nxt6 -= res[13];
        c0_nxt7 -= res[14];
        c1_nxt7 -= res[15];
    }

    /* Step back to the 2x8 A tile and the 8x8 triangular B block. */
    a -= 16;
    b -= 64;

    b0 = *(b + 0);
    b8 = *(b + 8);
    b9 = *(b + 9);
    b16 = *(b + 16);
    b17 = *(b + 17);
    b18 = *(b + 18);
    b24 = *(b + 24);
    b25 = *(b + 25);
    b26 = *(b + 26);
    b27 = *(b + 27);
    b32 = *(b + 32);
    b33 = *(b + 33);
    b34 = *(b + 34);
    b35 = *(b + 35);
    b36 = *(b + 36);
    b40 = *(b + 40);
    b41 = *(b + 41);
    b42 = *(b + 42);
    b43 = *(b + 43);
    b44 = *(b + 44);
    b45 = *(b + 45);
    b48 = *(b + 48);
    b49 = *(b + 49);
    b50 = *(b + 50);
    b51 = *(b + 51);
    b52 = *(b + 52);
    b53 = *(b + 53);
    b54 = *(b + 54);
    b56 = *(b + 56);
    b57 = *(b + 57);
    b58 = *(b + 58);
    b59 = *(b + 59);
    b60 = *(b + 60);
    b61 = *(b + 61);
    b62 = *(b + 62);
    b63 = *(b + 63);

    /* Bottom-up substitution over rows 7..0; each solved row is
     * eliminated from every row above it before that row is scaled
     * by its (pre-inverted) diagonal. */
    c0_nxt7 *= b63;
    c1_nxt7 *= b63;

    c0_nxt6 -= c0_nxt7 * b62;
    c1_nxt6 -= c1_nxt7 * b62;

    c0_nxt6 *= b54;
    c1_nxt6 *= b54;

    c0_nxt5 -= c0_nxt7 * b61;
    c1_nxt5 -= c1_nxt7 * b61;

    c0_nxt5 -= c0_nxt6 * b53;
    c1_nxt5 -= c1_nxt6 * b53;

    c0_nxt5 *= b45;
    c1_nxt5 *= b45;

    c0_nxt4 -= c0_nxt7 * b60;
    c1_nxt4 -= c1_nxt7 * b60;

    c0_nxt4 -= c0_nxt6 * b52;
    c1_nxt4 -= c1_nxt6 * b52;

    c0_nxt4 -= c0_nxt5 * b44;
    c1_nxt4 -= c1_nxt5 * b44;

    c0_nxt4 *= b36;
    c1_nxt4 *= b36;

    c0_nxt3 -= c0_nxt7 * b59;
    c1_nxt3 -= c1_nxt7 * b59;

    c0_nxt3 -= c0_nxt6 * b51;
    c1_nxt3 -= c1_nxt6 * b51;

    c0_nxt3 -= c0_nxt5 * b43;
    c1_nxt3 -= c1_nxt5 * b43;

    c0_nxt3 -= c0_nxt4 * b35;
    c1_nxt3 -= c1_nxt4 * b35;

    c0_nxt3 *= b27;
    c1_nxt3 *= b27;

    c0_nxt2 -= c0_nxt7 * b58;
    c1_nxt2 -= c1_nxt7 * b58;

    c0_nxt2 -= c0_nxt6 * b50;
    c1_nxt2 -= c1_nxt6 * b50;

    c0_nxt2 -= c0_nxt5 * b42;
    c1_nxt2 -= c1_nxt5 * b42;

    c0_nxt2 -= c0_nxt4 * b34;
    c1_nxt2 -= c1_nxt4 * b34;

    c0_nxt2 -= c0_nxt3 * b26;
    c1_nxt2 -= c1_nxt3 * b26;

    c0_nxt2 *= b18;
    c1_nxt2 *= b18;

    c0_nxt1 -= c0_nxt7 * b57;
    c1_nxt1 -= c1_nxt7 * b57;

    c0_nxt1 -= c0_nxt6 * b49;
    c1_nxt1 -= c1_nxt6 * b49;

    c0_nxt1 -= c0_nxt5 * b41;
    c1_nxt1 -= c1_nxt5 * b41;

    c0_nxt1 -= c0_nxt4 * b33;
    c1_nxt1 -= c1_nxt4 * b33;

    c0_nxt1 -= c0_nxt3 * b25;
    c1_nxt1 -= c1_nxt3 * b25;

    c0_nxt1 -= c0_nxt2 * b17;
    c1_nxt1 -= c1_nxt2 * b17;

    c0_nxt1 *= b9;
    c1_nxt1 *= b9;

    c0 -= c0_nxt7 * b56;
    c1 -= c1_nxt7 * b56;

    c0 -= c0_nxt6 * b48;
    c1 -= c1_nxt6 * b48;

    c0 -= c0_nxt5 * b40;
    c1 -= c1_nxt5 * b40;

    c0 -= c0_nxt4 * b32;
    c1 -= c1_nxt4 * b32;

    c0 -= c0_nxt3 * b24;
    c1 -= c1_nxt3 * b24;

    c0 -= c0_nxt2 * b16;
    c1 -= c1_nxt2 * b16;

    c0 -= c0_nxt1 * b8;
    c1 -= c1_nxt1 * b8;

    c0 *= b0;
    c1 *= b0;

    /* Store the solved tile back to the packed A buffer and to C. */
    *(a + 0) = c0;
    *(a + 1) = c1;
    *(a + 2) = c0_nxt1;
    *(a + 3) = c1_nxt1;
    *(a + 4) = c0_nxt2;
    *(a + 5) = c1_nxt2;
    *(a + 6) = c0_nxt3;
    *(a + 7) = c1_nxt3;
    *(a + 8) = c0_nxt4;
    *(a + 9) = c1_nxt4;
    *(a + 10) = c0_nxt5;
    *(a + 11) = c1_nxt5;
    *(a + 12) = c0_nxt6;
    *(a + 13) = c1_nxt6;
    *(a + 14) = c0_nxt7;
    *(a + 15) = c1_nxt7;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 0 + 1 * ldc) = c0_nxt1;
    *(c + 1 + 1 * ldc) = c1_nxt1;
    *(c + 0 + 2 * ldc) = c0_nxt2;
    *(c + 1 + 2 * ldc) = c1_nxt2;
    *(c + 0 + 3 * ldc) = c0_nxt3;
    *(c + 1 + 3 * ldc) = c1_nxt3;
    *(c + 0 + 4 * ldc) = c0_nxt4;
    *(c + 1 + 4 * ldc) = c1_nxt4;
    *(c + 0 + 5 * ldc) = c0_nxt5;
    *(c + 1 + 5 * ldc) = c1_nxt5;
    *(c + 0 + 6 * ldc) = c0_nxt6;
    *(c + 1 + 6 * ldc) = c1_nxt6;
    *(c + 0 + 7 * ldc) = c0_nxt7;
    *(c + 1 + 7 * ldc) = c1_nxt7;
}

/* Solve a 2x4 tile (RT case, scalar): two C rows against the 4x4
 * triangular block of B. */
static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
    FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c0_nxt1 = *(c + 0 + 1 * ldc);
    c1_nxt1 = *(c + 1 + 1 * ldc);
    c0_nxt2 = *(c + 0 + 2 * ldc);
    c1_nxt2 = *(c + 1 + 2 * ldc);
    c0_nxt3 = *(c + 0 + 3 * ldc);
    c1_nxt3 = *(c + 1 + 3 * ldc);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res[8];

        res[0] = aa[0] * bb[0];
        res[1] = aa[1] * bb[0];
        res[2] = aa[0] * bb[1];
        res[3] = aa[1] * bb[1];
        res[4] = aa[0] * bb[2];
        res[5] = aa[1] * bb[2];
        res[6] = aa[0] * bb[3];
        res[7] = aa[1] * bb[3];

        for (k = (bk - 1); k--;)
        {
            aa += 2;
            bb += 4;

            res[0] += aa[0] * bb[0];
            res[1] += aa[1] * bb[0];
            res[2] += aa[0] * bb[1];
            res[3] += aa[1] * bb[1];
            res[4] += aa[0] * bb[2];
            res[5] += aa[1] * bb[2];
            res[6] += aa[0] * bb[3];
            res[7] += aa[1] * bb[3];
        }

        c0 -= res[0];
        c1 -= res[1];
c0_nxt1 -= res[2];
        c1_nxt1 -= res[3];
        c0_nxt2 -= res[4];
        c1_nxt2 -= res[5];
        c0_nxt3 -= res[6];
        c1_nxt3 -= res[7];
    }

    /* Step back to the 2x4 A tile and the 4x4 triangular B block. */
    a -= 8;
    b -= 16;

    b0 = *b;
    b4 = *(b + 4);
    b5 = *(b + 5);
    b8 = *(b + 8);
    b9 = *(b + 9);
    b10 = *(b + 10);
    b12 = *(b + 12);
    b13 = *(b + 13);
    b14 = *(b + 14);
    b15 = *(b + 15);

    /* Bottom-up substitution, diagonals pre-inverted (multiplied). */
    c0_nxt3 *= b15;
    c1_nxt3 *= b15;

    c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10;
    c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10;

    c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5;
    c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5;

    c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0;
    c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0;

    *(a + 0) = c0;
    *(a + 1) = c1;
    *(a + 2) = c0_nxt1;
    *(a + 3) = c1_nxt1;
    *(a + 4) = c0_nxt2;
    *(a + 5) = c1_nxt2;
    *(a + 6) = c0_nxt3;
    *(a + 7) = c1_nxt3;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 0 + 1 * ldc) = c0_nxt1;
    *(c + 1 + 1 * ldc) = c1_nxt1;
    *(c + 0 + 2 * ldc) = c0_nxt2;
    *(c + 1 + 2 * ldc) = c1_nxt2;
    *(c + 0 + 3 * ldc) = c0_nxt3;
    *(c + 1 + 3 * ldc) = c1_nxt3;
}

/* Solve a 2x2 tile (RT case, scalar): 2x2 triangular B block. */
static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;

    c0 = *(c + 0);
    c1 = *(c + 1);
    c0_nxt = *(c + 0 + ldc);
    c1_nxt = *(c + 1 + ldc);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res[4];

        res[0] = aa[0] * bb[0];
        res[1] = aa[1] * bb[0];
        res[2] = aa[0] * bb[1];
        res[3] = aa[1] * bb[1];

        for (k = (bk - 1); k--;)
        {
            aa += 2;
            bb += 2;

            res[0] += aa[0] * bb[0];
            res[1] += aa[1] * bb[0];
            res[2] += aa[0] * bb[1];
            res[3] += aa[1] * bb[1];
        }

        c0 -= res[0];
        c1 -= res[1];
        c0_nxt -= res[2];
        c1_nxt -= res[3];
    }

    a -= 4;
    b -= 4;

    b3 = *(b + 3);
    b2 = *(b + 2);
    b0 = *b;

    /* Solve second column first, substitute into the first. */
    c0_nxt *= b3;
    c1_nxt *= b3;

    c0 -= c0_nxt * b2;
    c1 -= c1_nxt * b2;

    c0 *= b0;
    c1 *= b0;

    *(a + 0) = c0;
    *(a + 1) = c1;
    *(a + 2) = c0_nxt;
    *(a + 3) = c1_nxt;

    *(c + 0) = c0;
    *(c + 1) = c1;
    *(c + 0 + ldc) = c0_nxt;
    *(c + 1 + ldc) = c1_nxt;
}

/* Solve a 2x1 tile (RT case): single column, diagonal scale only. */
static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    FLOAT b0, c0, c1;

    c0 = *(c + 0);
    c1 = *(c + 1);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res0, res1;

        res0 = aa[0] * bb[0];
        res1 = aa[1] * bb[0];

        for (k = (bk - 1); k--;)
        {
            aa += 2;
            bb += 1;

            res0 += aa[0] * bb[0];
            res1 += aa[1] * bb[0];
        }

        c0 -= res0;
        c1 -= res1;
    }

    a -= 2;
    b -= 1;

    b0 = *b;  /* pre-inverted diagonal */

    c0 *= b0;
    c1 *= b0;

    *(a + 0) = c0;
    *(a + 1) = c1;

    *(c + 0) = c0;
    *(c + 1) = c1;
}

/* Solve a 1x8 tile (RT case, scalar): one C row against the 8x8
 * triangular block of B.  bNN names mirror flat indices in the
 * packed 8x8 B block. */
static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
    FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
    FLOAT b56, b57, b58, b59, b60, b61, b62, b63;
    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;

    c0 = *(c + 0);
    c1 = *(c + 1 * ldc);
    c2 = *(c + 2 * ldc);
    c3 = *(c + 3 * ldc);
    c4 = *(c + 4 * ldc);
    c5 = *(c + 5 * ldc);
    c6 = *(c + 6 * ldc);
    c7 = *(c + 7 * ldc);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT t0, t1, t2, t3, t4, t5, t6, t7;

        t0 = aa[0] * bb[0];
        t1 = aa[0] * bb[1];
        t2 = aa[0] * bb[2];
        t3 = aa[0] * bb[3];
        t4 = aa[0] * bb[4];
        t5 = aa[0] * bb[5];
        t6 = aa[0] * bb[6];
        t7 = aa[0] * bb[7];

        for (k = (bk - 1); k--;)
        {
            aa += 1;
            bb += 8;

            t0 += aa[0] * bb[0];
            t1 += aa[0] * bb[1];
            t2 += aa[0] * bb[2];
            t3 += aa[0] * bb[3];
            t4 += aa[0] * bb[4];
            t5 += aa[0] * bb[5];
            t6 += aa[0] * bb[6];
            t7 += aa[0] * bb[7];
        }

        c0 -= t0;
        c1 -= t1;
        c2 -= t2;
        c3 -= t3;
        c4 -= t4;
        c5 -= t5;
        c6 -= t6;
        c7 -= t7;
    }

    a -= 8;
    b -= 64;

    b0 = *(b + 0);
    b8 = *(b + 8);
    b9 = *(b + 9);
    b16 = *(b + 16);
    b17 = *(b + 17);
    b18 = *(b + 18);
    b24 = *(b + 24);
    b25 = *(b + 25);
    b26 = *(b + 26);
b27 = *(b + 27);
    b32 = *(b + 32);
    b33 = *(b + 33);
    b34 = *(b + 34);
    b35 = *(b + 35);
    b36 = *(b + 36);
    b40 = *(b + 40);
    b41 = *(b + 41);
    b42 = *(b + 42);
    b43 = *(b + 43);
    b44 = *(b + 44);
    b45 = *(b + 45);
    b48 = *(b + 48);
    b49 = *(b + 49);
    b50 = *(b + 50);
    b51 = *(b + 51);
    b52 = *(b + 52);
    b53 = *(b + 53);
    b54 = *(b + 54);
    b56 = *(b + 56);
    b57 = *(b + 57);
    b58 = *(b + 58);
    b59 = *(b + 59);
    b60 = *(b + 60);
    b61 = *(b + 61);
    b62 = *(b + 62);
    b63 = *(b + 63);

    /* Bottom-up substitution over elements 7..0; diagonals are
     * multiplied (presumably pre-inverted by the packing routine). */
    c7 *= b63;

    c6 -= c7 * b62;
    c6 *= b54;

    c5 -= c7 * b61;
    c5 -= c6 * b53;
    c5 *= b45;

    c4 -= c7 * b60;
    c4 -= c6 * b52;
    c4 -= c5 * b44;
    c4 *= b36;

    c3 -= c7 * b59;
    c3 -= c6 * b51;
    c3 -= c5 * b43;
    c3 -= c4 * b35;
    c3 *= b27;

    c2 -= c7 * b58;
    c2 -= c6 * b50;
    c2 -= c5 * b42;
    c2 -= c4 * b34;
    c2 -= c3 * b26;
    c2 *= b18;

    c1 -= c7 * b57;
    c1 -= c6 * b49;
    c1 -= c5 * b41;
    c1 -= c4 * b33;
    c1 -= c3 * b25;
    c1 -= c2 * b17;
    c1 *= b9;

    c0 -= c7 * b56;
    c0 -= c6 * b48;
    c0 -= c5 * b40;
    c0 -= c4 * b32;
    c0 -= c3 * b24;
    c0 -= c2 * b16;
    c0 -= c1 * b8;
    c0 *= b0;

    *(a + 0) = c0;
    *(a + 1) = c1;
    *(a + 2) = c2;
    *(a + 3) = c3;
    *(a + 4) = c4;
    *(a + 5) = c5;
    *(a + 6) = c6;
    *(a + 7) = c7;

    *(c + 0) = c0;
    *(c + 1 * ldc) = c1;
    *(c + 2 * ldc) = c2;
    *(c + 3 * ldc) = c3;
    *(c + 4 * ldc) = c4;
    *(c + 5 * ldc) = c5;
    *(c + 6 * ldc) = c6;
    *(c + 7 * ldc) = c7;
}

/* Solve a 1x4 tile (RT case, scalar): one C row, 4x4 triangular B. */
static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
    FLOAT c0, c1, c2, c3;

    c0 = *(c + 0);
    c1 = *(c + 1 * ldc);
    c2 = *(c + 2 * ldc);
    c3 = *(c + 3 * ldc);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res0, res1, res2, res3;

        res0 = aa[0] * bb[0];
        res1 = aa[0] * bb[1];
        res2 = aa[0] * bb[2];
        res3 = aa[0] * bb[3];

        for (k = (bk - 1); k--;)
        {
            aa += 1;
            bb += 4;

            res0 += aa[0] * bb[0];
            res1 += aa[0] * bb[1];
            res2 += aa[0] * bb[2];
            res3 += aa[0] * bb[3];
        }

        c0 -= res0;
        c1 -= res1;
        c2 -= res2;
        c3 -= res3;
    }

    a -= 4;
    b -= 16;

    b0 = *b;
    b4 = *(b + 4);
    b5 = *(b + 5);
    b8 = *(b + 8);
    b9 = *(b + 9);
    b10 = *(b + 10);
    b12 = *(b + 12);
    b13 = *(b + 13);
    b14 = *(b + 14);
    b15 = *(b + 15);

    /* Bottom-up substitution, diagonals pre-inverted. */
    c3 *= b15;
    c2 = (c2 - c3 * b14) * b10;
    c1 = ((c1 - c3 * b13) - c2 * b9) * b5;
    c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0;

    *(a + 0) = c0;
    *(a + 1) = c1;
    *(a + 2) = c2;
    *(a + 3) = c3;

    *(c) = c0;
    *(c + 1 * ldc) = c1;
    *(c + 2 * ldc) = c2;
    *(c + 3 * ldc) = c3;
}

/* Solve a 1x2 tile (RT case, scalar): one C row, 2x2 triangular B. */
static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    FLOAT b0, b2, b3, c0, c1;

    c0 = *(c + 0);
    c1 = *(c + ldc);

    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res0, res1;

        res0 = aa[0] * bb[0];
        res1 = aa[0] * bb[1];

        for (k = (bk - 1); k--;)
        {
            aa += 1;
            bb += 2;

            res0 += aa[0] * bb[0];
            res1 += aa[0] * bb[1];
        }

        c0 -= res0;
        c1 -= res1;
    }

    a -= 2;
    b -= 4;

    b3 = *(b + 3);
    b2 = *(b + 2);
    b0 = *b;

    c1 *= b3;

    c0 -= c1 * b2;
    c0 *= b0;

    *(a + 0) = c0;
    *(a + 1) = c1;

    *(c + 0) = c0;
    *(c + ldc) = c1;
}

/* Solve a 1x1 tile (RT case, scalar).  Note the pointer roles: the
 * (pre-inverted) diagonal is read through the FIRST argument
 * (*(a - 1)) and the solved value is written through the SECOND
 * (*(b - 1)) — the caller passes (b_panel, a_panel) accordingly. */
static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    if (bk > 0)
    {
        BLASLONG k;
        FLOAT *aa = a, *bb = b;
        FLOAT res;

        res = *aa * *bb;

        for (k = (bk - 1); k--;)
        {
            aa++;
            bb++;

            res += *aa * *bb;
        }

        *c -= res;
    }

    *c *= *(a - 1);
    *(b - 1) = *c;
}

/* RT driver: walk the n columns of B from right to left (c and b are
 * advanced to their ends first), solving 8-wide column panels with
 * 8/4/2/1-row kernels.  kk counts the columns to the right of the
 * current panel that are already solved and must be GEMM-subtracted;
 * it starts at n - offset per the trsm kernel offset convention. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
          FLOAT *c, BLASLONG ldc, BLASLONG offset)
{
    FLOAT *aa, *cc;
    BLASLONG i, j, kk;

    kk = n - offset;
    c += n * ldc;
    b += n * k;

    if (n & 7)
    {
        if (n & 1)
        {
            aa = a;
            b -= k;
            c -= ldc;
            cc = c;

            for (i = (m >> 3); i--;)
            {
                ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk));

                aa += 8 * k;
                cc += 8;
            }

            if (m & 7)
            {
                if (m & 4)
{
                    ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk));

                    aa += 4 * k;
                    cc += 4;
                }

                if (m & 2)
                {
                    ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk));

                    aa += 2 * k;
                    cc += 2;
                }

                if (m & 1)
                {
                    /* Argument order (b, aa) is intentional: the 1x1
                     * solver reads the diagonal via its first pointer
                     * and stores the result via its second (see
                     * ssolve_1x1_rt_msa). */
                    ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk));

                    aa += k;
                    cc += 1;
                }
            }

            kk -= 1;
        }

        if (n & 2)
        {
            aa = a;
            b -= 2 * k;
            c -= 2 * ldc;
            cc = c;

            for (i = (m >> 3); i--;)
            {
                ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));

                aa += 8 * k;
                cc += 8;
            }

            if (m & 7)
            {
                if (m & 4)
                {
                    ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk));

                    aa += 4 * k;
                    cc += 4;
                }

                if (m & 2)
                {
                    ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk));

                    aa += 2 * k;
                    cc += 2;
                }

                if (m & 1)
                {
                    ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk));

                    aa += k;
                    cc += 1;
                }
            }

            kk -= 2;
        }

        if (n & 4)
        {
            aa = a;
            b -= 4 * k;
            c -= 4 * ldc;
            cc = c;

            for (i = (m >> 3); i--;)
            {
                ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));

                aa += 8 * k;
                cc += 8;
            }

            if (m & 7)
            {
                if (m & 4)
                {
                    ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk));

                    aa += 4 * k;
                    cc += 4;
                }

                if (m & 2)
                {
                    ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk));

                    aa += 2 * k;
                    cc += 2;
                }

                if (m & 1)
                {
                    ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk));

                    aa += k;
                    cc += 1;
                }
            }

            kk -= 4;
        }
    }

    /* Main loop: full 8-column panels, right to left. */
    for (j = (n >> 3); j--;)
    {
        aa = a;
        b -= 8 * k;
        c -= 8 * ldc;
        cc = c;

        for (i = (m >> 3); i--;)
        {
            ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));

            aa += 8 * k;
            cc += 8;
        }

        if (m & 7)
        {
            if (m & 4)
            {
                ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk));

                aa += 4 * k;
                cc += 4;
            }

            if (m & 2)
            {
                ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk));

                aa += 2 * k;
                cc += 2;
            }

            if (m & 1)
            {
                ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk));

                aa += k;
                cc += 1;
            }
        }

        kk -= 8;
    }

    return 0;
}