diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 999413be2..a13308f71 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -157,3 +157,6 @@ In chronological order:
* Shivraj Patil
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
+
+* Kaustubh Raste
+ * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
index d21575251..0ac30d77c 100644
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -118,10 +118,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
+DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
+DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
+DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
new file mode 100644
index 000000000..9f0eb95a5
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
@@ -0,0 +1,1416 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* LN solve of one 8x4 tile: C -= A*B over bk steps, then back-substitute rows 7..0 of the packed 8x8 triangular block */
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
+    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
+    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
+    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
+    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
+    v2f64 src_a61, src_a62, src_a63;
+    FLOAT *c_nxt1line = c + ldc;     /* columns 1..3 of the C tile */
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); /* load the 8 rows of each C column as four v2f64 */
+    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+    if (bk > 0) /* rank-bk update C -= A(8 x bk) * B(bk x 4), software pipelined */
+    {
+        BLASLONG i;
+        FLOAT *pba = a, *pbb = b;
+        v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
+
+        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); /* prime the pipeline with step 0 */
+        LD_DP2(pbb, 2, src_b0, src_b1);
+
+        for (i = (bk - 1); i--;) /* last step peeled below so loads stay one step ahead */
+        {
+            pba += 8;
+            pbb += 4;
+
+            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17); /* prefetch next step's A column and B row */
+            LD_DP2(pbb, 2, src_b2, src_b3);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,0) */
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,1) */
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); /* broadcast b(k,2) */
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); /* broadcast b(k,3) */
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            src_a0 = src_a8; /* rotate prefetched operands into the working registers */
+            src_a1 = src_a9;
+            src_a2 = src_a16;
+            src_a3 = src_a17;
+            src_b0 = src_b2;
+            src_b1 = src_b3;
+        }
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final iteration */
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c8 -= src_a0 * src_b;
+        src_c9 -= src_a1 * src_b;
+        src_c10 -= src_a2 * src_b;
+        src_c11 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c12 -= src_a0 * src_b;
+        src_c13 -= src_a1 * src_b;
+        src_c14 -= src_a2 * src_b;
+        src_c15 -= src_a3 * src_b;
+    }
+
+    a -= 64; /* rewind to the packed 8x8 triangular block for this tile */
+    b -= 32; /* rewind to the packed 8x4 solution block */
+
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); /* transpose pairs: res_cK holds row K across two columns */
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); /* same transpose for columns 2..3 */
+    res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
+    res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
+    res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
+    res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
+    res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
+    res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
+    res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
+
+    src_a54 = __msa_cast_to_vector_double(*(a + 54)); /* diagonal entries (a0,a9,...,a63) are multiplied, i.e. presumably pre-inverted by the packing — NOTE(review): confirm */
+    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+    src_a62 = LD_DP(a + 62);
+    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
+    src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
+    src_a60 = LD_DP(a + 60);
+    src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
+    src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
+    src_a52 = LD_DP(a + 52);
+    src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
+    src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
+    src_a44 = LD_DP(a + 44);
+    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
+    src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
+    src_a36 = __msa_cast_to_vector_double(*(a + 36));
+    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+
+    res_c7 *= src_a63; /* back-substitution, rows 7..0; results streamed to packed b and to c as they complete */
+    res_c6 -= res_c7 * src_a62;
+    res_c6 *= src_a54;
+
+    res_c15 *= src_a63;
+    res_c14 -= res_c15 * src_a62;
+    res_c14 *= src_a54;
+
+    ST_DP(res_c7, b + 28); /* rows 6..7 done: store to packed b and the C tile */
+    ST_DP(res_c6, b + 24);
+    ST_DP(res_c15, b + 30);
+    ST_DP(res_c14, b + 26);
+    src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); /* transpose back to column layout for C */
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
+    src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
+    ST_DP(src_c3, c + 6);
+    ST_DP(src_c7, c_nxt1line + 6);
+    ST_DP(src_c11, c_nxt2line + 6);
+    ST_DP(src_c15, c_nxt3line + 6);
+
+    res_c5 -= res_c7 * src_a61; /* rows 4..5 */
+    res_c5 -= res_c6 * src_a53;
+    res_c5 *= src_a45;
+
+    res_c4 -= res_c7 * src_a60;
+    res_c4 -= res_c6 * src_a52;
+    res_c4 -= res_c5 * src_a44;
+    res_c4 *= src_a36;
+
+    res_c13 -= res_c15 * src_a61;
+    res_c13 -= res_c14 * src_a53;
+    res_c13 *= src_a45;
+
+    res_c12 -= res_c15 * src_a60;
+    res_c12 -= res_c14 * src_a52;
+    res_c12 -= res_c13 * src_a44;
+    res_c12 *= src_a36;
+
+    src_a56 = LD_DP(a + 56); /* row-7 multipliers for the remaining rows */
+    src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
+    src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
+    src_a58 = LD_DP(a + 58);
+    src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
+    src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
+
+    ST_DP(res_c4, b + 16);
+    ST_DP(res_c5, b + 20);
+    ST_DP(res_c12, b + 18);
+    ST_DP(res_c13, b + 22);
+
+    src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
+    src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
+    ST_DP(src_c2, c + 4);
+    ST_DP(src_c6, c_nxt1line + 4);
+    ST_DP(src_c10, c_nxt2line + 4);
+    ST_DP(src_c14, c_nxt3line + 4);
+
+    src_a50 = LD_DP(a + 50);
+    src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
+    src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
+    src_a42 = LD_DP(a + 42);
+    src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
+    src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
+    src_a34 = LD_DP(a + 34);
+    src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
+    src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
+    src_a26 = LD_DP(a + 26);
+    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
+    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
+    src_a18 = __msa_cast_to_vector_double(*(a + 18));
+    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+
+    res_c3 -= res_c7 * src_a59; /* eliminate row-7 contribution from rows 0..3 */
+    res_c2 -= res_c7 * src_a58;
+    res_c1 -= res_c7 * src_a57;
+    res_c0 -= res_c7 * src_a56;
+
+    res_c11 -= res_c15 * src_a59;
+    res_c10 -= res_c15 * src_a58;
+    res_c9 -= res_c15 * src_a57;
+    res_c8 -= res_c15 * src_a56;
+
+    res_c3 -= res_c6 * src_a51; /* rows 2..3 */
+    res_c3 -= res_c5 * src_a43;
+    res_c3 -= res_c4 * src_a35;
+    res_c3 *= src_a27;
+
+    res_c2 -= res_c6 * src_a50;
+    res_c2 -= res_c5 * src_a42;
+    res_c2 -= res_c4 * src_a34;
+    res_c2 -= res_c3 * src_a26;
+    res_c2 *= src_a18;
+
+    res_c11 -= res_c14 * src_a51;
+    res_c11 -= res_c13 * src_a43;
+    res_c11 -= res_c12 * src_a35;
+    res_c11 *= src_a27;
+
+    res_c10 -= res_c14 * src_a50;
+    res_c10 -= res_c13 * src_a42;
+    res_c10 -= res_c12 * src_a34;
+    res_c10 -= res_c11 * src_a26;
+    res_c10 *= src_a18;
+
+    src_a48 = LD_DP(a + 48);
+    src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
+    src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
+    src_a40 = LD_DP(a + 40);
+    src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
+    src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
+
+    ST_DP(res_c2, b + 8);
+    ST_DP(res_c3, b + 12);
+    ST_DP(res_c10, b + 10);
+    ST_DP(res_c11, b + 14);
+
+    src_a32 = LD_DP(a + 32);
+    src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
+    src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
+    src_a24 = LD_DP(a + 24);
+    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
+    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
+
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
+    src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
+    ST_DP(src_c1, c + 2);
+    ST_DP(src_c5, c_nxt1line + 2);
+    ST_DP(src_c9, c_nxt2line + 2);
+    ST_DP(src_c13, c_nxt3line + 2);
+
+    res_c1 -= res_c6 * src_a49; /* rows 0..1 */
+    res_c1 -= res_c5 * src_a41;
+    res_c1 -= res_c4 * src_a33;
+    res_c1 -= res_c3 * src_a25;
+
+    res_c0 -= res_c6 * src_a48;
+    res_c0 -= res_c5 * src_a40;
+    res_c0 -= res_c4 * src_a32;
+    res_c0 -= res_c3 * src_a24;
+
+    res_c9 -= res_c14 * src_a49;
+    res_c9 -= res_c13 * src_a41;
+    res_c9 -= res_c12 * src_a33;
+    res_c9 -= res_c11 * src_a25;
+
+    res_c8 -= res_c14 * src_a48;
+    res_c8 -= res_c13 * src_a40;
+    res_c8 -= res_c12 * src_a32;
+    res_c8 -= res_c11 * src_a24;
+
+    src_a16 = LD_DP(a + 16);
+    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
+    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
+    src_a8 = LD_DP(a + 8);
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
+    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+    src_a0 = __msa_cast_to_vector_double(*(a + 0));
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+    res_c1 -= res_c2 * src_a17;
+    res_c1 *= src_a9;
+
+    res_c9 -= res_c10 * src_a17;
+    res_c9 *= src_a9;
+
+    res_c0 -= res_c2 * src_a16;
+    res_c0 -= res_c1 * src_a8;
+    res_c0 *= src_a0;
+
+    res_c8 -= res_c10 * src_a16;
+    res_c8 -= res_c9 * src_a8;
+    res_c8 *= src_a0;
+
+    ST_DP(res_c0, b + 0);
+    ST_DP(res_c8, b + 2);
+    ST_DP(res_c1, b + 4);
+    ST_DP(res_c9, b + 6);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
+    src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
+
+    ST_DP(src_c0, c);
+    ST_DP(src_c4, c_nxt1line);
+    ST_DP(src_c8, c_nxt2line);
+    ST_DP(src_c12, c_nxt3line);
+}
+
+static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* LN solve of one 8x2 tile; the 8x8 triangular block lies just below a and is read at negative offsets */
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
+    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
+    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
+    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
+    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
+    v2f64 src_a61, src_a62, src_a63;
+
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); /* 8 rows of both C columns */
+    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+    if (bk > 0) /* rank-bk update C -= A(8 x bk) * B(bk x 2), software pipelined */
+    {
+        BLASLONG i;
+        FLOAT *pba = a, *pbb = b;
+        v2f64 src_b, src_b0, src_b1;
+
+        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); /* prime the pipeline */
+        src_b0 = LD_DP(pbb);
+
+        for (i = bk - 1; i--;) /* last step peeled below */
+        {
+            pba += 8;
+            pbb += 2;
+
+            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
+            src_b1 = LD_DP(pbb);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,0) */
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,1) */
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_a0 = src_a8; /* rotate prefetched operands */
+            src_a1 = src_a9;
+            src_a2 = src_a16;
+            src_a3 = src_a17;
+            src_b0 = src_b1;
+        }
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* peeled final iteration */
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+    }
+
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); /* transpose: res_cK = row K across both columns */
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+
+    src_a56 = LD_DP(a - 8); /* triangular block at a-64..a-1; a56..a63 is its last row */
+    src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
+    src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
+    src_a58 = LD_DP(a - 6);
+    src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
+    src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
+    src_a60 = LD_DP(a - 4);
+    src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
+    src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
+    src_a62 = LD_DP(a - 2);
+    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
+    src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
+
+    res_c7 *= src_a63; /* back-substitution, rows 7..0; diagonal presumably pre-inverted — NOTE(review): confirm */
+    res_c6 -= res_c7 * src_a62;
+    res_c5 -= res_c7 * src_a61;
+    res_c4 -= res_c7 * src_a60;
+    res_c3 -= res_c7 * src_a59;
+    res_c2 -= res_c7 * src_a58;
+    res_c1 -= res_c7 * src_a57;
+    res_c0 -= res_c7 * src_a56;
+
+    src_a48 = LD_DP(a - 16);
+    src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
+    src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
+    src_a50 = LD_DP(a - 14);
+    src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
+    src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
+    src_a52 = LD_DP(a - 12);
+    src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
+    src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
+    src_a54 = __msa_cast_to_vector_double(*(a - 10));
+    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+
+    src_a40 = LD_DP(a - 24);
+    src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
+    src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
+    src_a42 = LD_DP(a - 22);
+    src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
+    src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
+    src_a44 = LD_DP(a - 20);
+    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
+    src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
+
+    res_c6 *= src_a54; /* row 6 */
+    res_c5 -= res_c6 * src_a53;
+    res_c4 -= res_c6 * src_a52;
+    res_c3 -= res_c6 * src_a51;
+    res_c2 -= res_c6 * src_a50;
+    res_c1 -= res_c6 * src_a49;
+    res_c0 -= res_c6 * src_a48;
+
+    res_c5 *= src_a45; /* row 5 */
+    res_c4 -= res_c5 * src_a44;
+    res_c3 -= res_c5 * src_a43;
+    res_c2 -= res_c5 * src_a42;
+    res_c1 -= res_c5 * src_a41;
+    res_c0 -= res_c5 * src_a40;
+
+    ST_DP(res_c7, b - 2); /* packed b also sits one block past; store backwards */
+    ST_DP(res_c6, b - 4);
+    ST_DP(res_c5, b - 6);
+
+    src_a32 = LD_DP(a - 32);
+    src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
+    src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
+    src_a34 = LD_DP(a - 30);
+    src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
+    src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
+    src_a36 = __msa_cast_to_vector_double(*(a - 28));
+    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+
+    res_c4 *= src_a36; /* row 4 */
+    res_c3 -= res_c4 * src_a35;
+    res_c2 -= res_c4 * src_a34;
+    res_c1 -= res_c4 * src_a33;
+    res_c0 -= res_c4 * src_a32;
+
+    src_a24 = LD_DP(a - 40);
+    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
+    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
+    src_a26 = LD_DP(a - 38);
+    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
+    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
+    src_a16 = LD_DP(a - 48);
+    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
+    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
+    src_a18 = __msa_cast_to_vector_double(*(a - 46));
+    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+    src_a0 = __msa_cast_to_vector_double(*(a - 64));
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+    src_a8 = LD_DP(a - 56);
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
+    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+
+    res_c3 *= src_a27; /* rows 3..0 */
+    res_c2 -= res_c3 * src_a26;
+    res_c1 -= res_c3 * src_a25;
+    res_c0 -= res_c3 * src_a24;
+
+    res_c2 *= src_a18;
+    res_c1 -= res_c2 * src_a17;
+    res_c0 -= res_c2 * src_a16;
+
+    res_c1 *= src_a9;
+    res_c0 -= res_c1 * src_a8;
+
+    res_c0 *= src_a0;
+
+    ST_DP(res_c4, b - 8);
+    ST_DP(res_c3, b - 10);
+    ST_DP(res_c2, b - 12);
+    ST_DP(res_c1, b - 14);
+    ST_DP(res_c0, b - 16);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); /* transpose back to column layout for C */
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+}
+
+static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* scalar LN solve of one 8x1 tile: C -= A*B over bk steps, then back-substitute rows 7..0 */
+{
+    FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35;
+    FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53;
+    FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63;
+    FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c2 = *(c + 2);
+    c3 = *(c + 3);
+    c4 = *(c + 4);
+    c5 = *(c + 5);
+    c6 = *(c + 6);
+    c7 = *(c + 7);
+
+    if (bk > 0) /* rank-bk update: C -= A(8 x bk) * B(bk x 1) */
+    {
+        BLASLONG i; /* was int: use BLASLONG like the sibling kernels so large bk cannot truncate */
+        FLOAT *aa = a, *bb = b;
+        FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0;
+
+        for (i = bk; i--; )
+        {
+            a0 = aa[0];
+            a1 = aa[1];
+            a2 = aa[2];
+            a3 = aa[3];
+            a4 = aa[4];
+            a5 = aa[5];
+            a6 = aa[6];
+            a7 = aa[7];
+
+            b0 = bb[0];
+            c0 -= a0 * b0;
+            c1 -= a1 * b0;
+            c2 -= a2 * b0;
+            c3 -= a3 * b0;
+            c4 -= a4 * b0;
+            c5 -= a5 * b0;
+            c6 -= a6 * b0;
+            c7 -= a7 * b0;
+
+            aa += 8;
+            bb += 1;
+        }
+    }
+
+    a -= 64; /* rewind to the packed 8x8 triangular block */
+    b -= 8;  /* rewind to the packed 8x1 solution block */
+
+    a0 = *(a + 0); /* only the lower-triangle entries of the packed 8x8 block are read */
+    a8 = *(a + 8);
+    a9 = *(a + 9);
+    a16 = *(a + 16);
+    a17 = *(a + 17);
+    a18 = *(a + 18);
+    a24 = *(a + 24);
+    a25 = *(a + 25);
+    a26 = *(a + 26);
+    a27 = *(a + 27);
+    a32 = *(a + 32);
+    a33 = *(a + 33);
+    a34 = *(a + 34);
+    a35 = *(a + 35);
+    a36 = *(a + 36);
+    a40 = *(a + 40);
+    a41 = *(a + 41);
+    a42 = *(a + 42);
+    a43 = *(a + 43);
+    a44 = *(a + 44);
+    a45 = *(a + 45);
+    a48 = *(a + 48);
+    a49 = *(a + 49);
+    a50 = *(a + 50);
+    a51 = *(a + 51);
+    a52 = *(a + 52);
+    a53 = *(a + 53);
+    a54 = *(a + 54);
+    a56 = *(a + 56);
+    a57 = *(a + 57);
+    a58 = *(a + 58);
+    a59 = *(a + 59);
+    a60 = *(a + 60);
+    a61 = *(a + 61);
+    a62 = *(a + 62);
+    a63 = *(a + 63);
+
+    c7 *= a63; /* back-substitution from row 7 down; diagonal presumably pre-inverted — NOTE(review): confirm */
+
+    c6 -= c7 * a62;
+    c6 *= a54;
+
+    c5 -= c7 * a61;
+    c5 -= c6 * a53;
+    c5 *= a45;
+
+    c4 -= c7 * a60;
+    c4 -= c6 * a52;
+    c4 -= c5 * a44;
+    c4 *= a36;
+
+    c3 -= c7 * a59;
+    c3 -= c6 * a51;
+    c3 -= c5 * a43;
+    c3 -= c4 * a35;
+    c3 *= a27;
+
+    c2 -= c7 * a58;
+    c2 -= c6 * a50;
+    c2 -= c5 * a42;
+    c2 -= c4 * a34;
+    c2 -= c3 * a26;
+    c2 *= a18;
+
+    c1 -= c7 * a57;
+    c1 -= c6 * a49;
+    c1 -= c5 * a41;
+    c1 -= c4 * a33;
+    c1 -= c3 * a25;
+    c1 -= c2 * a17;
+    c1 *= a9;
+
+    c0 -= c7 * a56;
+    c0 -= c6 * a48;
+    c0 -= c5 * a40;
+    c0 -= c4 * a32;
+    c0 -= c3 * a24;
+    c0 -= c2 * a16;
+    c0 -= c1 * a8;
+    c0 *= a0;
+
+    *(b + 7) = c7; /* write solution back to the packed B block and to C */
+    *(b + 6) = c6;
+    *(b + 5) = c5;
+    *(b + 4) = c4;
+    *(b + 3) = c3;
+    *(b + 2) = c2;
+    *(b + 1) = c1;
+    *(b + 0) = c0;
+
+    *(c + 7) = c7;
+    *(c + 6) = c6;
+    *(c + 5) = c5;
+    *(c + 4) = c4;
+    *(c + 3) = c3;
+    *(c + 2) = c2;
+    *(c + 1) = c1;
+    *(c + 0) = c0;
+}
+
+static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* LN solve of one 4x4 tile; GEMM data follows the 4x4 triangular block (aa = a + 16, bb = b + 16) */
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
+    v2f64 src_a14, src_a15;
+
+    LD_DP2(c, 2, src_c0, src_c1); /* 4 rows of each of the 4 C columns */
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+    LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+    LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+    if (bk > 0) /* rank-bk update C -= A(4 x bk) * B(bk x 4) */
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 16, *bb = b + 16; /* skip over the triangular/solution blocks */
+        v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
+
+        for (i = bk; i--;)
+        {
+            LD_DP2(aa, 2, src_a0, src_a1);
+            LD_DP2(bb, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,0) */
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,1) */
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); /* broadcast b(k,2) */
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); /* broadcast b(k,3) */
+            src_c6 -= src_a0 * src_b;
+            src_c7 -= src_a1 * src_b;
+
+            aa += 4;
+            bb += 4;
+        }
+    }
+
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); /* transpose: res_cK = row K, columns 0..1 / 2..3 */
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5);
+
+    src_a14 = LD_DP(a + 14); /* broadcast the needed lower-triangle entries of the 4x4 block */
+    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+    src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+    src_a12 = LD_DP(a + 12);
+    src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+    src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+
+    src_a9 = LD_DP(a + 9);
+    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+
+    src_a8 = __msa_cast_to_vector_double(*(a + 8));
+    src_a0 = __msa_cast_to_vector_double(*(a + 0));
+
+    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+    src_a4 = LD_DP(a + 4);
+    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+    src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+
+    res_c3 *= src_a15; /* back-substitution rows 3..0, two column pairs in parallel; diagonal presumably pre-inverted — NOTE(review): confirm */
+    res_c7 *= src_a15;
+
+    res_c2 -= res_c3 * src_a14;
+    res_c6 -= res_c7 * src_a14;
+    res_c2 *= src_a10;
+    res_c6 *= src_a10;
+
+    res_c1 -= res_c3 * src_a13;
+    res_c5 -= res_c7 * src_a13;
+    res_c1 -= res_c2 * src_a9;
+    res_c5 -= res_c6 * src_a9;
+    res_c1 *= src_a5;
+    res_c5 *= src_a5;
+
+    res_c0 -= res_c3 * src_a12;
+    res_c4 -= res_c7 * src_a12;
+    res_c0 -= res_c2 * src_a8;
+    res_c4 -= res_c6 * src_a8;
+    res_c0 -= res_c1 * src_a4;
+    res_c4 -= res_c5 * src_a4;
+    res_c0 *= src_a0;
+    res_c4 *= src_a0;
+
+    ST_DP(res_c7, b + 14); /* store solution rows into the packed B block */
+    ST_DP(res_c3, b + 12);
+    ST_DP(res_c6, b + 10);
+    ST_DP(res_c2, b + 8);
+    ST_DP(res_c5, b + 6);
+    ST_DP(res_c1, b + 4);
+    ST_DP(res_c4, b + 2);
+    ST_DP(res_c0, b + 0);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); /* transpose back to column layout for C */
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+    ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+    ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+}
+
+static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* LN solve of one 4x2 tile; GEMM data follows the blocks (aa = a + 16, bb = b + 8) */
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
+    v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
+    v2f64 src_a14, src_a15;
+
+    LD_DP2(c, 2, src_c0, src_c1); /* 4 rows of both C columns */
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+    if (bk > 0) /* rank-bk update C -= A(4 x bk) * B(bk x 2) */
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 16, *bb = b + 8;
+        v2f64 src_a0, src_a1, src_b, src_b0;
+
+        for (i = bk; i--;)
+        {
+            LD_DP2(aa, 2, src_a0, src_a1);
+            src_b0 = LD_DP(bb);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,0) */
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); /* broadcast b(k,1) */
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            aa += 4;
+            bb += 2;
+        }
+    }
+
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); /* transpose: res_cK = row K across both columns */
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+
+    src_a14 = LD_DP(a + 14); /* broadcast lower-triangle entries of the 4x4 block */
+    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+    src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+    src_a12 = LD_DP(a + 12);
+    src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+    src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+
+    src_a9 = LD_DP(a + 9);
+    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+
+    src_a8 = __msa_cast_to_vector_double(*(a + 8));
+    src_a0 = __msa_cast_to_vector_double(*(a + 0));
+
+    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+
+    src_a4 = LD_DP(a + 4);
+    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+    src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+
+    res_c3 *= src_a15; /* back-substitution rows 3..0; diagonal presumably pre-inverted — NOTE(review): confirm */
+
+    res_c2 -= res_c3 * src_a14;
+    res_c2 *= src_a10;
+
+    res_c1 -= res_c3 * src_a13;
+    res_c1 -= res_c2 * src_a9;
+    res_c1 *= src_a5;
+
+    res_c0 -= res_c3 * src_a12;
+    res_c0 -= res_c2 * src_a8;
+    res_c0 -= res_c1 * src_a4;
+    res_c0 *= src_a0;
+
+    ST_DP(res_c3, b + 6); /* solution rows into packed b */
+    ST_DP(res_c2, b + 4);
+    ST_DP(res_c1, b + 2);
+    ST_DP(res_c0, b + 0);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); /* transpose back to column layout */
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+}
+
+static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) /* scalar LN solve of one 4x1 tile; GEMM data follows the blocks (aa = a + 16, bb = b + 4) */
+{
+    FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15;
+    FLOAT c0, c1, c2, c3;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c2 = *(c + 2);
+    c3 = *(c + 3);
+
+    if (bk > 0) /* rank-bk update: C -= A(4 x bk) * B(bk x 1) */
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 16, *bb = b + 4;
+        FLOAT a0, a1, a2, a3, b0;
+
+        for (i = bk; i--;)
+        {
+            a0 = aa[0];
+            a1 = aa[1];
+            a2 = aa[2];
+            a3 = aa[3];
+
+            b0 = bb[0];
+            c0 -= a0 * b0;
+            c1 -= a1 * b0;
+            c2 -= a2 * b0;
+            c3 -= a3 * b0;
+
+            aa += 4;
+            bb += 1;
+        }
+    }
+
+    a0 = *(a + 0); /* lower-triangle entries of the packed 4x4 block */
+    a4 = *(a + 4);
+    a5 = *(a + 5);
+    a8 = *(a + 8);
+    a9 = *(a + 9);
+    a10 = *(a + 10);
+    a12 = *(a + 12);
+    a13 = *(a + 13);
+    a14 = *(a + 14);
+    a15 = *(a + 15);
+
+    c3 *= a15; /* back-substitution rows 3..0; diagonal presumably pre-inverted — NOTE(review): confirm */
+
+    c2 -= c3 * a14;
+    c2 *= a10;
+
+    c1 -= c3 * a13;
+    c1 -= c2 * a9;
+    c1 *= a5;
+
+    c0 -= c3 * a12;
+    c0 -= c2 * a8;
+    c0 -= c1 * a4;
+    c0 *= a0;
+
+    *(b + 0) = c0; /* write solution to the packed B block and to C */
+    *(b + 1) = c1;
+    *(b + 2) = c2;
+    *(b + 3) = c3;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+    *(c + 2) = c2;
+    *(c + 3) = c3;
+}
+
+static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* scalar LN solve of one 2x4 tile: same 2x2 triangular solve applied to all four C columns */
+{
+    FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1;
+    FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c0_nxt1 = *(c + 0 + ldc);
+    c1_nxt1 = *(c + 1 + ldc);
+    c0_nxt2 = *(c + 0 + 2 * ldc);
+    c1_nxt2 = *(c + 1 + 2 * ldc);
+    c0_nxt3 = *(c + 0 + 3 * ldc);
+    c1_nxt3 = *(c + 1 + 3 * ldc);
+
+    if (bk > 0) /* rank-bk update: C -= A(2 x bk) * B(bk x 4); GEMM data follows the blocks */
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 4, *bb = b + 8;
+        FLOAT a0, a1, b0, b1, b2, b3;
+
+        for (i = bk; i--;)
+        {
+            a0 = aa[0];
+            a1 = aa[1];
+
+            b0 = bb[0];
+            c0 -= a0 * b0;
+            c1 -= a1 * b0;
+
+            b1 = bb[1];
+            c0_nxt1 -= a0 * b1;
+            c1_nxt1 -= a1 * b1;
+
+            b2 = bb[2];
+            c0_nxt2 -= a0 * b2;
+            c1_nxt2 -= a1 * b2;
+
+            b3 = bb[3];
+            c0_nxt3 -= a0 * b3;
+            c1_nxt3 -= a1 * b3;
+
+            aa += 2;
+            bb += 4;
+        }
+    }
+
+    a0 = *(a + 0); /* 2x2 triangular block: a0, a2 row 0 is solved after row 1 */
+    a2 = *(a + 2);
+    a3 = *(a + 3);
+
+    c1 *= a3; /* back-substitution row 1 then row 0; diagonal presumably pre-inverted — NOTE(review): confirm */
+    c0 -= c1 * a2;
+    c0 *= a0;
+
+    c1_nxt1 *= a3;
+    c0_nxt1 -= c1_nxt1 * a2;
+    c0_nxt1 *= a0;
+
+    c1_nxt2 *= a3;
+    c0_nxt2 -= c1_nxt2 * a2;
+    c0_nxt2 *= a0;
+
+    c1_nxt3 *= a3;
+    c0_nxt3 -= c1_nxt3 * a2;
+    c0_nxt3 *= a0;
+
+    *(b + 0) = c0; /* packed b stores row-major: all four column values of row 0, then row 1 */
+    *(b + 1) = c0_nxt1;
+    *(b + 2) = c0_nxt2;
+    *(b + 3) = c0_nxt3;
+    *(b + 4) = c1;
+    *(b + 5) = c1_nxt1;
+    *(b + 6) = c1_nxt2;
+    *(b + 7) = c1_nxt3;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+
+    *(c + 0 + ldc) = c0_nxt1;
+    *(c + 1 + ldc) = c1_nxt1;
+
+    *(c + 0 + 2 * ldc) = c0_nxt2;
+    *(c + 1 + 2 * ldc) = c1_nxt2;
+
+    *(c + 0 + 3 * ldc) = c0_nxt3;
+    *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+/* LN solve of a 2x2 block of C (two columns).  Same layout conventions as
+   dsolve_2x4_ln_msa: the 2x2 triangular tile is at a[0..3] with
+   (presumably pre-inverted -- confirm) diagonal entries, and the
+   already-solved trailing panel starts at a + 4 / b + 4.  */
+static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT inv_d0, off21, inv_d1;
+    FLOAT r0, s0;   /* column 0: rows 0 and 1 */
+    FLOAT r1, s1;   /* column 1: rows 0 and 1 */
+
+    r0 = c[0];
+    s0 = c[1];
+
+    r1 = c[ldc + 0];
+    s1 = c[ldc + 1];
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a + 4, *pb = b + 4;
+
+        /* Subtract the contribution of the bk already-solved rows. */
+        for (i = 0; i < bk; i++)
+        {
+            FLOAT x0 = pa[0];
+            FLOAT x1 = pa[1];
+
+            r0 -= x0 * pb[0];
+            s0 -= x1 * pb[0];
+
+            r1 -= x0 * pb[1];
+            s1 -= x1 * pb[1];
+
+            pa += 2;
+            pb += 2;
+        }
+    }
+
+    inv_d0 = a[0];
+    off21  = a[2];
+    inv_d1 = a[3];
+
+    /* Backward substitution, bottom row first. */
+    s0 *= inv_d1;
+    r0 = (r0 - s0 * off21) * inv_d0;
+
+    s1 *= inv_d1;
+    r1 = (r1 - s1 * off21) * inv_d0;
+
+    b[0] = r0;
+    b[1] = r1;
+    b[2] = s0;
+    b[3] = s1;
+
+    c[0] = r0;
+    c[1] = s0;
+
+    c[ldc + 0] = r1;
+    c[ldc + 1] = s1;
+}
+
+/* LN solve of a 2x1 block (single column).  Triangular 2x2 tile at
+   a[0..3] with (presumably pre-inverted -- confirm) diagonal; trailing
+   GEMM panel at a + 4 / b + 2.  */
+static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    FLOAT inv_d0, off21, inv_d1, r0, r1;
+
+    r0 = c[0];
+    r1 = c[1];
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a + 4, *pb = b + 2;
+
+        /* Subtract the contribution of the bk already-solved rows. */
+        for (i = 0; i < bk; i++)
+        {
+            r0 -= pa[0] * pb[0];
+            r1 -= pa[1] * pb[0];
+
+            pa += 2;
+            pb += 1;
+        }
+    }
+
+    inv_d0 = a[0];
+    off21  = a[2];
+    inv_d1 = a[3];
+
+    /* Backward substitution: solve the bottom row, then the top. */
+    r1 *= inv_d1;
+    r0 = (r0 - r1 * off21) * inv_d0;
+
+    b[0] = r0;
+    b[1] = r1;
+
+    c[0] = r0;
+    c[1] = r1;
+}
+
+/* LN solve of a 1x4 block: a single row against four columns.  The lone
+   factor entry a[0] is multiplied, so it is presumably the pre-inverted
+   diagonal -- confirm against the packing routine.  */
+static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT inv_d, r0, r1, r2, r3;
+
+    inv_d = a[0];
+    r0 = c[0];
+    r1 = c[1 * ldc];
+    r2 = c[2 * ldc];
+    r3 = c[3 * ldc];
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pa = a + 1, *pb = b + 4;
+
+        /* Subtract the contribution of the bk already-solved rows. */
+        for (i = 0; i < bk; i++)
+        {
+            FLOAT x0 = pa[0];
+
+            r0 -= x0 * pb[0];
+            r1 -= x0 * pb[1];
+            r2 -= x0 * pb[2];
+            r3 -= x0 * pb[3];
+
+            pa += 1;
+            pb += 4;
+        }
+    }
+
+    r0 *= inv_d;
+    r1 *= inv_d;
+    r2 *= inv_d;
+    r3 *= inv_d;
+
+    c[0 * ldc] = r0;
+    c[1 * ldc] = r1;
+    c[2 * ldc] = r2;
+    c[3 * ldc] = r3;
+
+    b[0] = r0;
+    b[1] = r1;
+    b[2] = r2;
+    b[3] = r3;
+}
+
+/* LN solve of a 1x2 block: scale both columns of the single row by the
+   (presumably pre-inverted -- confirm) diagonal entry and copy the result
+   into the packed B panel.  NOTE(review): unlike dsolve_1x4_ln_msa this
+   helper performs no trailing GEMM update; the caller passes no bk.  */
+static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
+{
+    FLOAT inv_d = a[0];
+    FLOAT x0 = c[0] * inv_d;
+    FLOAT x1 = c[ldc] * inv_d;
+
+    c[0] = x0;
+    c[ldc] = x1;
+
+    b[0] = x0;
+    b[1] = x1;
+}
+
+/* DTRSM kernel driver for the LN case (left side, lower triangular, no
+   transpose).  'a' is the packed triangular panel, 'b' the packed
+   right-hand-side panel, 'c' the result matrix with leading dimension
+   ldc.  Columns are processed in panels of 4, then 2, then 1; rows are
+   solved bottom-up (LN order) in chunks of 1/2/4 remainder first, then
+   8, via the dsolve_* helpers.  'kk' counts the rows still unsolved, so
+   each helper receives (k - kk) as the width of the trailing GEMM update
+   of already-solved rows.  'dummy1' (alpha in the generic trsm kernel
+   signature) and 'offset' are unused here.  */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG kk, i, j;
+    FLOAT *aa, *bb, *cc;
+
+    /* Panels of 4 columns of B/C. */
+    for (j = (n >> 2); j--;)
+    {
+        kk = m;
+
+        /* Row remainder (m not a multiple of 8) is solved first, since
+           LN proceeds from the bottom row of the matrix upwards. */
+        if (m & 7)
+        {
+            if (m & 1)
+            {
+                /* Bottom 1 row; aa/bb point one element past the tile,
+                   hence the -1/-4 adjustments in the call. */
+                aa = a + (m - 1) * k + kk;
+                bb = b + 4 * kk;
+                cc = c + (m - 1);
+
+                dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, ldc, k - kk);
+
+                kk -= 1;
+            }
+
+            if (m & 2)
+            {
+                aa = a + ((m & -2) - 2) * k + 2 * kk;
+                bb = b + 4 * kk;
+                cc = c + ((m & -2) - 2);
+
+                dsolve_2x4_ln_msa(aa - 4, bb - 8, cc, ldc, k - kk);
+
+                kk -= 2;
+            }
+
+            if (m & 4)
+            {
+                aa = a + ((m & -4) - 4) * k + 4 * kk;
+                bb = b + 4 * kk;
+                cc = c + ((m & -4) - 4);
+
+                dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk);
+
+                kk -= 4;
+            }
+        }
+
+        /* Full 8-row tiles, walking upwards through the matrix. */
+        i = (m >> 3);
+        if (i > 0)
+        {
+            aa = a + ((m & -8) - 8) * k;
+            cc = c + ((m & -8) - 8);
+
+            do
+            {
+                dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk);
+
+                aa -= 8 * k;
+                cc -= 8;
+                kk -= 8;
+                i --;
+            } while (i > 0);
+        }
+
+        b += 4 * k;
+        c += 4 * ldc;
+    }
+
+    /* Column remainder: a 2-column panel, then a single column. */
+    if (n & 3)
+    {
+        if (n & 2)
+        {
+            kk = m;
+
+            if (m & 7)
+            {
+                if (m & 1)
+                {
+                    aa = a + ((m & -1) - 1) * k;
+                    cc = c + ((m & -1) - 1);
+
+                    /* NOTE(review): no (k - kk) GEMM update is applied in
+                       the 1x2 path, unlike the 1x4 sibling above --
+                       confirm this is safe (e.g. kk == k here). */
+                    dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc);
+
+                    kk -= 1;
+                }
+
+                if (m & 2)
+                {
+                    aa = a + ((m & -2) - 2) * k;
+                    cc = c + ((m & -2) - 2);
+
+                    dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk);
+
+                    kk -= 2;
+                }
+
+                if (m & 4)
+                {
+                    aa = a + ((m & -4) - 4) * k;
+                    cc = c + ((m & -4) - 4);
+
+                    dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk);
+
+                    kk -= 4;
+                }
+            }
+
+            i = (m >> 3);
+            if (i > 0)
+            {
+                aa = a + ((m & -8) - 8) * k;
+                cc = c + ((m & -8) - 8);
+
+                do
+                {
+                    dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk);
+
+                    aa -= 8 * k;
+                    cc -= 8;
+                    kk -= 8;
+                    i --;
+                } while (i > 0);
+            }
+
+            b += 2 * k;
+            c += 2 * ldc;
+        }
+
+        if (n & 1)
+        {
+            kk = m;
+
+            if (m & 7)
+            {
+                if (m & 1)
+                {
+                    /* Inline 1x1 solve: scale by the (pre-inverted)
+                       diagonal entry and store into the packed panel.
+                       NOTE(review): like the 1x2 path, no trailing GEMM
+                       update is applied -- confirm. */
+                    kk -= 1;
+                    aa = a + ((m & -1) - 1) * k + kk;
+                    cc = c + ((m & -1) - 1);
+
+                    *cc *= *aa;
+                    *(b + kk) = *cc;
+                }
+
+                if (m & 2)
+                {
+                    aa = a + ((m & -2) - 2) * k + kk * 2;
+                    cc = c + ((m & -2) - 2);
+
+                    dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk);
+
+                    kk -= 2;
+                }
+
+                if (m & 4)
+                {
+                    aa = a + ((m & -4) - 4) * k;
+                    cc = c + ((m & -4) - 4);
+
+                    dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk);
+
+                    kk -= 4;
+                }
+            }
+
+            i = (m >> 3);
+            if (i > 0)
+            {
+                aa = a + ((m & -8) - 8) * k;
+                cc = c + ((m & -8) - 8);
+
+                do
+                {
+                    dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk);
+
+                    aa -= 8 * k;
+                    cc -= 8;
+                    kk -= 8;
+                    i --;
+                } while (i > 0);
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
new file mode 100644
index 000000000..da35aa8f9
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
@@ -0,0 +1,1397 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* LT solve of an 8-row x 4-column block of C using MSA vectors (v2f64
+   holds two doubles).  After subtracting the bk already-solved rows
+   (forward GEMM update), the 8x4 block is transposed into row-pair
+   vectors and solved by forward substitution against the 8x8 triangular
+   tile stored at a[0..63] (row r uses offsets r*8+j, j > r).  Diagonal
+   entries (offsets 0, 9, 18, ..., 63) are multiplied, so they are
+   presumably stored pre-inverted by the trsm packing -- confirm.
+   Results are stored to the packed B panel and back to C.  */
+static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+    v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
+    v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
+    v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
+    v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    /* Load the four C columns: 4 vectors x 2 doubles = 8 rows each. */
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
+
+        /* Software-pipelined GEMM update: preload one iteration, then
+           overlap the next load with the current multiply-subtract. */
+        LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2(b, 2, src_b0, src_b1);
+
+        for (i = (bk - 1); i--;)
+        {
+            a += 8;
+            b += 4;
+
+            LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
+            LD_DP2(b, 2, src_b2, src_b3);
+
+            /* ilvr_d(x, x) / ilvl_d(x, x) splat the low / high double. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            src_a0 = src_a4;
+            src_a1 = src_a5;
+            src_a2 = src_a6;
+            src_a3 = src_a7;
+            src_b0 = src_b2;
+            src_b1 = src_b3;
+        }
+
+        /* Pipeline epilogue: last preloaded iteration. */
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c8 -= src_a0 * src_b;
+        src_c9 -= src_a1 * src_b;
+        src_c10 -= src_a2 * src_b;
+        src_c11 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c12 -= src_a0 * src_b;
+        src_c13 -= src_a1 * src_b;
+        src_c14 -= src_a2 * src_b;
+        src_c15 -= src_a3 * src_b;
+
+        a += 8;
+        b += 4;
+    }
+
+    /* Transpose column vectors into row-pair vectors: res_cN holds one C
+       row for columns {0,1}, res_c(N+8) the same row for columns {2,3}. */
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8);
+    res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
+    res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
+    res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
+    res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
+    res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
+    res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
+    res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
+
+    /* Row 0 of the triangular tile: splat a[0] (diag) and a[1..7]. */
+    src_a0 = LD_DP(a + 0);
+    src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+    src_a2 = LD_DP(a + 2);
+    src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+    src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+    src_a4 = LD_DP(a + 4);
+    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+    src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+    src_a6 = LD_DP(a + 6);
+    src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+    src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+    /* Forward substitution, row 0 (columns {0,1} and {2,3} in parallel). */
+    res_c0 *= src_a0;
+    res_c1 -= res_c0 * src_a1;
+    res_c2 -= res_c0 * src_a2;
+    res_c3 -= res_c0 * src_a3;
+    res_c4 -= res_c0 * src_a4;
+    res_c5 -= res_c0 * src_a5;
+    res_c6 -= res_c0 * src_a6;
+    res_c7 -= res_c0 * src_a7;
+
+    res_c8 *= src_a0;
+    res_c9 -= res_c8 * src_a1;
+    res_c10 -= res_c8 * src_a2;
+    res_c11 -= res_c8 * src_a3;
+    res_c12 -= res_c8 * src_a4;
+    res_c13 -= res_c8 * src_a5;
+    res_c14 -= res_c8 * src_a6;
+    res_c15 -= res_c8 * src_a7;
+
+    /* Row 1: diagonal a[9], off-diagonals a[10..15]. */
+    src_a9 = __msa_cast_to_vector_double(*(a + 9));
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+    src_a10 = LD_DP(a + 10);
+    src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+    src_a12 = LD_DP(a + 12);
+    src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+    src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+    src_a14 = LD_DP(a + 14);
+    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+    src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+    res_c1 *= src_a9;
+    res_c2 -= res_c1 * src_a10;
+    res_c3 -= res_c1 * src_a11;
+    res_c4 -= res_c1 * src_a12;
+    res_c5 -= res_c1 * src_a13;
+    res_c6 -= res_c1 * src_a14;
+    res_c7 -= res_c1 * src_a15;
+
+    res_c9 *= src_a9;
+    res_c10 -= res_c9 * src_a10;
+    res_c11 -= res_c9 * src_a11;
+    res_c12 -= res_c9 * src_a12;
+    res_c13 -= res_c9 * src_a13;
+    res_c14 -= res_c9 * src_a14;
+    res_c15 -= res_c9 * src_a15;
+
+    /* Rows 0-1 are final: store to packed B and (re-transposed) to C. */
+    ST_DP(res_c0, b + 0);
+    ST_DP(res_c8, b + 2);
+    ST_DP(res_c1, b + 4);
+    ST_DP(res_c9, b + 6);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
+    src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
+
+    ST_DP(src_c0, c);
+    ST_DP(src_c4, c_nxt1line);
+    ST_DP(src_c8, c_nxt2line);
+    ST_DP(src_c12, c_nxt3line);
+
+    /* Row 2: diagonal a[18], off-diagonals a[19..23]. */
+    src_a18 = LD_DP(a + 18);
+    src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1);
+    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+    src_a20 = LD_DP(a + 20);
+    src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1);
+    src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0);
+    src_a22 = LD_DP(a + 22);
+    src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1);
+    src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0);
+
+    res_c2 *= src_a18;
+    res_c3 -= res_c2 * src_a19;
+    res_c4 -= res_c2 * src_a20;
+    res_c5 -= res_c2 * src_a21;
+    res_c6 -= res_c2 * src_a22;
+    res_c7 -= res_c2 * src_a23;
+
+    res_c10 *= src_a18;
+    res_c11 -= res_c10 * src_a19;
+    res_c12 -= res_c10 * src_a20;
+    res_c13 -= res_c10 * src_a21;
+    res_c14 -= res_c10 * src_a22;
+    res_c15 -= res_c10 * src_a23;
+
+    /* Row 3: diagonal a[27], off-diagonals a[28..31]. */
+    src_a27 = __msa_cast_to_vector_double(*(a + 27));
+    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
+    src_a28 = LD_DP(a + 28);
+    src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
+    src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
+    src_a30 = LD_DP(a + 30);
+    src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1);
+    src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0);
+
+    res_c3 *= src_a27;
+    res_c4 -= res_c3 * src_a28;
+    res_c5 -= res_c3 * src_a29;
+    res_c6 -= res_c3 * src_a30;
+    res_c7 -= res_c3 * src_a31;
+
+    res_c11 *= src_a27;
+    res_c12 -= res_c11 * src_a28;
+    res_c13 -= res_c11 * src_a29;
+    res_c14 -= res_c11 * src_a30;
+    res_c15 -= res_c11 * src_a31;
+
+    ST_DP(res_c2, b + 8);
+    ST_DP(res_c10, b + 10);
+    ST_DP(res_c3, b + 12);
+    ST_DP(res_c11, b + 14);
+
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
+    src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
+
+    /* Row 4: diagonal a[36], off-diagonals a[37..39]. */
+    src_a36 = LD_DP(a + 36);
+    src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
+    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+    src_a38 = LD_DP(a + 38);
+    src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1);
+    src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0);
+
+    res_c4 *= src_a36;
+    res_c5 -= res_c4 * src_a37;
+    res_c6 -= res_c4 * src_a38;
+    res_c7 -= res_c4 * src_a39;
+
+    res_c12 *= src_a36;
+    res_c13 -= res_c12 * src_a37;
+    res_c14 -= res_c12 * src_a38;
+    res_c15 -= res_c12 * src_a39;
+
+    /* Row 5: diagonal a[45], off-diagonals a[46..47]. */
+    src_a45 = __msa_cast_to_vector_double(*(a + 45));
+    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
+    src_a46 = LD_DP(a + 46);
+    src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
+    src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
+
+    res_c5 *= src_a45;
+    res_c6 -= res_c5 * src_a46;
+    res_c7 -= res_c5 * src_a47;
+
+    res_c13 *= src_a45;
+    res_c14 -= res_c13 * src_a46;
+    res_c15 -= res_c13 * src_a47;
+
+    ST_DP(src_c1, c + 2);
+    ST_DP(src_c5, c_nxt1line + 2);
+    ST_DP(src_c9, c_nxt2line + 2);
+    ST_DP(src_c13, c_nxt3line + 2);
+
+    ST_DP(res_c4, b + 16);
+    ST_DP(res_c12, b + 18);
+    ST_DP(res_c5, b + 20);
+    ST_DP(res_c13, b + 22);
+
+    src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
+    src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
+
+    /* Rows 6 and 7: diagonals a[54] and a[63], off-diagonal a[55]. */
+    src_a63 = __msa_cast_to_vector_double(*(a + 63));
+    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
+    src_a54 = LD_DP(a + 54);
+    src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
+    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+
+    res_c6 *= src_a54;
+    res_c7 -= res_c6 * src_a55;
+
+    res_c14 *= src_a54;
+    res_c15 -= res_c14 * src_a55;
+
+    res_c7 *= src_a63;
+    res_c15 *= src_a63;
+
+    ST_DP(src_c2, c + 4);
+    ST_DP(src_c6, c_nxt1line + 4);
+    ST_DP(src_c10, c_nxt2line + 4);
+    ST_DP(src_c14, c_nxt3line + 4);
+
+    ST_DP(res_c6, b + 24);
+    ST_DP(res_c14, b + 26);
+    ST_DP(res_c7, b + 28);
+    ST_DP(res_c15, b + 30);
+
+    src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
+    src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
+
+    ST_DP(src_c3, c + 6);
+    ST_DP(src_c7, c_nxt1line + 6);
+    ST_DP(src_c11, c_nxt2line + 6);
+    ST_DP(src_c15, c_nxt3line + 6);
+}
+
+/* LT solve of an 8-row x 2-column block of C.  Same structure as
+   dsolve_8x4_lt_msa but with two C columns, so each res_cN vector holds
+   one row across both columns.  The 8x8 triangular tile at a[0..63] has
+   (presumably pre-inverted -- confirm) diagonal entries at offsets
+   0, 9, 18, 27, 36, 45, 54, 63.  */
+static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+    v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
+    v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
+    v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
+    v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63;
+
+    /* Load the two C columns (8 rows each). */
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
+
+        /* GEMM update: subtract the bk already-solved rows. */
+        for (i = bk; i--;)
+        {
+            LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+            src_b0 = LD_DP(b);
+
+            /* ilvr_d(x, x) / ilvl_d(x, x) splat the low / high double. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            a += 8;
+            b += 2;
+        }
+    }
+
+    /* Transpose: res_cN = row N of the block across both columns. */
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+
+    /* Row 0 of the triangular tile. */
+    src_a0 = LD_DP(a + 0);
+    src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+    src_a2 = LD_DP(a + 2);
+    src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+    src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+    src_a4 = LD_DP(a + 4);
+    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
+    src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);
+    src_a6 = LD_DP(a + 6);
+    src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+    src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+    res_c0 *= src_a0;
+    res_c1 -= res_c0 * src_a1;
+    res_c2 -= res_c0 * src_a2;
+    res_c3 -= res_c0 * src_a3;
+    res_c4 -= res_c0 * src_a4;
+    res_c5 -= res_c0 * src_a5;
+    res_c6 -= res_c0 * src_a6;
+    res_c7 -= res_c0 * src_a7;
+
+    /* Row 1. */
+    src_a9 = __msa_cast_to_vector_double(*(a + 9));
+    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
+    src_a10 = LD_DP(a + 10);
+    src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+    src_a12 = LD_DP(a + 12);
+    src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
+    src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
+    src_a14 = LD_DP(a + 14);
+    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
+    src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
+
+    res_c1 *= src_a9;
+    res_c2 -= res_c1 * src_a10;
+    res_c3 -= res_c1 * src_a11;
+    res_c4 -= res_c1 * src_a12;
+    res_c5 -= res_c1 * src_a13;
+    res_c6 -= res_c1 * src_a14;
+    res_c7 -= res_c1 * src_a15;
+
+    /* Row 2. */
+    src_a18 = LD_DP(a + 18);
+    src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1);
+    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
+    src_a20 = LD_DP(a + 20);
+    src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1);
+    src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0);
+    src_a22 = LD_DP(a + 22);
+    src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1);
+    src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0);
+
+    res_c2 *= src_a18;
+    res_c3 -= res_c2 * src_a19;
+    res_c4 -= res_c2 * src_a20;
+    res_c5 -= res_c2 * src_a21;
+    res_c6 -= res_c2 * src_a22;
+    res_c7 -= res_c2 * src_a23;
+
+    /* Row 3. */
+    src_a27 = __msa_cast_to_vector_double(*(a + 27));
+    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
+    src_a28 = LD_DP(a + 28);
+    src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
+    src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
+    src_a30 = LD_DP(a + 30);
+    src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1);
+    src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0);
+
+    res_c3 *= src_a27;
+    res_c4 -= res_c3 * src_a28;
+    res_c5 -= res_c3 * src_a29;
+    res_c6 -= res_c3 * src_a30;
+    res_c7 -= res_c3 * src_a31;
+
+    /* Rows 0-3 are final: store to packed B and (re-transposed) to C. */
+    ST_DP(res_c0, b + 0);
+    ST_DP(res_c1, b + 2);
+    ST_DP(res_c2, b + 4);
+    ST_DP(res_c3, b + 6);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c4, src_c5, c + ldc, 2);
+
+    /* Row 4. */
+    src_a36 = LD_DP(a + 36);
+    src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
+    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
+    src_a38 = LD_DP(a + 38);
+    src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1);
+    src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0);
+
+    res_c4 *= src_a36;
+    res_c5 -= res_c4 * src_a37;
+    res_c6 -= res_c4 * src_a38;
+    res_c7 -= res_c4 * src_a39;
+
+    /* Row 5. */
+    src_a45 = __msa_cast_to_vector_double(*(a + 45));
+    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
+    src_a46 = LD_DP(a + 46);
+    src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
+    src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
+
+    res_c5 *= src_a45;
+    res_c6 -= res_c5 * src_a46;
+    res_c7 -= res_c5 * src_a47;
+
+    /* Rows 6 and 7. */
+    src_a63 = __msa_cast_to_vector_double(*(a + 63));
+    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
+    src_a54 = LD_DP(a + 54);
+    src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
+    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
+
+    res_c6 *= src_a54;
+    res_c7 -= res_c6 * src_a55;
+
+    res_c7 *= src_a63;
+
+    ST_DP(res_c4, b + 8);
+    ST_DP(res_c5, b + 10);
+    ST_DP(res_c6, b + 12);
+    ST_DP(res_c7, b + 14);
+
+    src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+
+    ST_DP2(src_c2, src_c3, c + 4, 2);
+    ST_DP2(src_c6, src_c7, c + 4 + ldc, 2);
+}
+
+/* LT solve of an 8x1 block (single column), scalar version.  The 8x8
+   triangular tile is stored at a[0..63]: row r contributes a[p*8 + r]
+   for each earlier row p, and the diagonal a[r*8 + r] is multiplied --
+   presumably stored pre-inverted by the trsm packing; confirm.  */
+static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG r, p;
+    FLOAT x[8];
+
+    for (r = 0; r < 8; r++)
+    {
+        x[r] = c[r];
+    }
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+
+        /* Subtract the contribution of the bk already-solved rows. */
+        for (i = 0; i < bk; i++)
+        {
+            FLOAT b0 = b[0];
+
+            for (r = 0; r < 8; r++)
+            {
+                x[r] -= a[r] * b0;
+            }
+
+            a += 8;
+            b += 1;
+        }
+    }
+
+    /* Forward substitution: each row subtracts all previously solved
+       rows (same operation order as the fully unrolled form). */
+    for (r = 0; r < 8; r++)
+    {
+        for (p = 0; p < r; p++)
+        {
+            x[r] -= x[p] * a[p * 8 + r];
+        }
+        x[r] *= a[r * 8 + r];   /* pre-inverted diagonal */
+    }
+
+    for (r = 0; r < 8; r++)
+    {
+        c[r] = x[r];
+    }
+
+    for (r = 0; r < 8; r++)
+    {
+        b[r] = x[r];
+    }
+}
+
+/* LT solve of a 4-row x 4-column block of C using MSA vectors.  The 4x4
+   triangular tile at a[0..15] stores row r at offsets r*4+j (j >= r);
+   diagonal entries at 0, 5, 10, 15 are multiplied, so presumably
+   pre-inverted by the packing -- confirm.  */
+static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
+    v2f64 src_a10, src_a11, src_a15;
+
+    /* Load the four C columns (4 rows each, two vectors per column). */
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+    LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+    LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
+
+        /* GEMM update: subtract the bk already-solved rows. */
+        for (i = bk; i--;)
+        {
+            LD_DP2(a, 2, src_a0, src_a1);
+            LD_DP2(b, 2, src_b0, src_b1);
+
+            /* ilvr_d(x, x) / ilvl_d(x, x) splat the low / high double. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c6 -= src_a0 * src_b;
+            src_c7 -= src_a1 * src_b;
+
+            a += 4;
+            b += 4;
+        }
+    }
+
+    /* Transpose into row vectors: res_c0..3 = rows 0..3 for columns
+       {0,1}; res_c4..7 = the same rows for columns {2,3}. */
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
+    res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
+    res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5);
+    res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5);
+
+    /* Row 0 of the triangular tile: diagonal a[0], off-diag a[1..3]. */
+    src_a0 = LD_DP(a + 0);
+    src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+    src_a2 = LD_DP(a + 2);
+    src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+    src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+
+    res_c0 *= src_a0;
+    res_c1 -= res_c0 * src_a1;
+    res_c2 -= res_c0 * src_a2;
+    res_c3 -= res_c0 * src_a3;
+
+    res_c4 *= src_a0;
+    res_c5 -= res_c4 * src_a1;
+    res_c6 -= res_c4 * src_a2;
+    res_c7 -= res_c4 * src_a3;
+
+    /* Row 1: diagonal a[5], off-diagonals a[6..7]. */
+    src_a5 = __msa_cast_to_vector_double(*(a + 5));
+    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
+    src_a6 = LD_DP(a + 6);
+    src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+    src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+    res_c1 *= src_a5;
+    res_c2 -= res_c1 * src_a6;
+    res_c3 -= res_c1 * src_a7;
+
+    res_c5 *= src_a5;
+    res_c6 -= res_c5 * src_a6;
+    res_c7 -= res_c5 * src_a7;
+
+    /* Rows 2 and 3: diagonals a[10] and a[15], off-diagonal a[11]. */
+    src_a10 = LD_DP(a + 10);
+    src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+    src_a15 = __msa_cast_to_vector_double(*(a + 15));
+    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
+
+    res_c2 *= src_a10;
+    res_c3 -= res_c2 * src_a11;
+    res_c3 *= src_a15;
+
+    res_c6 *= src_a10;
+    res_c7 -= res_c6 * src_a11;
+    res_c7 *= src_a15;
+
+    /* Store the solved rows to the packed B panel... */
+    ST_DP(res_c0, b + 0);
+    ST_DP(res_c4, b + 2);
+    ST_DP(res_c1, b + 4);
+    ST_DP(res_c5, b + 6);
+    ST_DP(res_c2, b + 8);
+    ST_DP(res_c6, b + 10);
+    ST_DP(res_c3, b + 12);
+    ST_DP(res_c7, b + 14);
+
+    /* ...and transpose back to column order for C. */
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
+    src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
+
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
+    src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+    ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+    ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+}
+
+/* LT solve of a 4-row x 2-column block of C.  Same 4x4 triangular tile
+   layout as dsolve_4x4_lt_msa (diagonals at a[0], a[5], a[10], a[15],
+   presumably pre-inverted -- confirm); each res_cN vector holds one row
+   across both columns.  */
+static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
+    v2f64 src_a10, src_a11, src_a15;
+
+    /* Load the two C columns (4 rows each). */
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_b, src_b0;
+
+        /* GEMM update: subtract the bk already-solved rows. */
+        for (i = bk; i--;)
+        {
+            LD_DP2(a, 2, src_a0, src_a1);
+            src_b0 = LD_DP(b);
+
+            /* ilvr_d(x, x) / ilvl_d(x, x) splat the low / high double. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            a += 4;
+            b += 2;
+        }
+    }
+
+    /* Transpose: res_cN = row N of the block across both columns. */
+    res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
+    res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
+    res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+
+    /* Row 0 of the triangular tile. */
+    src_a0 = LD_DP(a + 0);
+    src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
+    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
+    src_a2 = LD_DP(a + 2);
+    src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1);
+    src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0);
+
+    res_c0 *= src_a0;
+    res_c1 -= res_c0 * src_a1;
+    res_c2 -= res_c0 * src_a2;
+    res_c3 -= res_c0 * src_a3;
+
+    /* Row 1. */
+    src_a5 = __msa_cast_to_vector_double(*(a + 5));
+    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
+    src_a6 = LD_DP(a + 6);
+    src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
+    src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
+
+    res_c1 *= src_a5;
+    res_c2 -= res_c1 * src_a6;
+    res_c3 -= res_c1 * src_a7;
+
+    /* Rows 2 and 3. */
+    src_a10 = LD_DP(a + 10);
+    src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
+    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
+    src_a15 = __msa_cast_to_vector_double(*(a + 15));
+    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
+
+    res_c2 *= src_a10;
+    res_c3 -= res_c2 * src_a11;
+    res_c3 *= src_a15;
+
+    /* Store solved rows to packed B, then transpose back for C. */
+    ST_DP(res_c0, b + 0);
+    ST_DP(res_c1, b + 2);
+    ST_DP(res_c2, b + 4);
+    ST_DP(res_c3, b + 6);
+
+    src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
+    src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
+    src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+}
+
+/* LT solver for one 4x1 column of C (scalar fallback, no vectorization).
+ *
+ * Subtracts the bk-deep GEMM update (C -= A * B), then performs forward
+ * substitution with the dense-stored 4x4 triangular block that follows
+ * the GEMM columns in the packed A buffer. Diagonal entries (a[0], a[5],
+ * a[10], a[15]) are multiplied — they hold pre-inverted values.
+ * The solved column is written to both the packed B buffer and C.
+ */
+static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t0, t1, t2, t3;
+
+    /* Load the 4x1 column of C to be solved. */
+    t0 = c[0];
+    t1 = c[1];
+    t2 = c[2];
+    t3 = c[3];
+
+    /* GEMM part: subtract contributions of the bk solved rows. */
+    for (i = 0; i < bk; i++)
+    {
+        FLOAT bval = b[0];
+
+        t0 -= a[0] * bval;
+        t1 -= a[1] * bval;
+        t2 -= a[2] * bval;
+        t3 -= a[3] * bval;
+
+        a += 4;
+        b += 1;
+    }
+
+    /* Forward substitution with the 4x4 triangular block now at a. */
+    t0 *= a[0];
+
+    t1 -= t0 * a[1];
+    t1 *= a[5];
+
+    t2 -= t0 * a[2];
+    t2 -= t1 * a[6];
+    t2 *= a[10];
+
+    t3 -= t0 * a[3];
+    t3 -= t1 * a[7];
+    t3 -= t2 * a[11];
+    t3 *= a[15];
+
+    /* Write the solution into packed B and back into C. */
+    b[0] = t0;
+    b[1] = t1;
+    b[2] = t2;
+    b[3] = t3;
+
+    c[0] = t0;
+    c[1] = t1;
+    c[2] = t2;
+    c[3] = t3;
+}
+
+/* LT solver for one 2x4 tile of C (scalar fallback).
+ *
+ * Four right-hand-side columns (stride ldc) share the same 2x2
+ * triangular block, so the solve is a per-column loop. Diagonals a[0]
+ * and a[3] are pre-inverted, hence the multiplications. The solution is
+ * written to packed B (all row-0 entries first, then row-1) and to C.
+ */
+static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT diag0, sub10, diag1;
+    FLOAT top[4], bot[4];
+
+    /* Gather row 0 (top) and row 1 (bot) of each of the 4 columns. */
+    for (j = 0; j < 4; j++)
+    {
+        top[j] = *(c + 0 + j * ldc);
+        bot[j] = *(c + 1 + j * ldc);
+    }
+
+    /* GEMM update: C -= A * B over the bk already-solved rows. */
+    for (i = 0; i < bk; i++)
+    {
+        FLOAT a0 = a[0];
+        FLOAT a1 = a[1];
+
+        for (j = 0; j < 4; j++)
+        {
+            top[j] -= a0 * b[j];
+            bot[j] -= a1 * b[j];
+        }
+
+        a += 2;
+        b += 4;
+    }
+
+    /* 2x2 forward substitution, applied column by column. */
+    diag0 = a[0];
+    sub10 = a[1];
+    diag1 = a[3];
+
+    for (j = 0; j < 4; j++)
+    {
+        top[j] *= diag0;
+        bot[j] -= top[j] * sub10;
+        bot[j] *= diag1;
+    }
+
+    /* Packed B layout: the four row-0 values, then the four row-1 values. */
+    for (j = 0; j < 4; j++)
+    {
+        b[j]     = top[j];
+        b[4 + j] = bot[j];
+    }
+
+    /* Write the solved tile back into C. */
+    for (j = 0; j < 4; j++)
+    {
+        *(c + 0 + j * ldc) = top[j];
+        *(c + 1 + j * ldc) = bot[j];
+    }
+}
+
+/* LT solver for one 2x2 tile of C (scalar fallback).
+ *
+ * Both right-hand-side columns (stride ldc) use the same 2x2 triangular
+ * block; diagonals a[0] and a[3] are pre-inverted. The solution goes to
+ * packed B (row 0 first, then row 1) and back to C.
+ */
+static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT diag0, sub10, diag1;
+    FLOAT top[2], bot[2];
+
+    /* top = row 0, bot = row 1, one entry per column. */
+    top[0] = c[0];
+    bot[0] = c[1];
+    top[1] = c[ldc];
+    bot[1] = c[1 + ldc];
+
+    /* GEMM update over the bk already-solved rows. */
+    for (i = 0; i < bk; i++)
+    {
+        FLOAT a0 = a[0];
+        FLOAT a1 = a[1];
+
+        for (j = 0; j < 2; j++)
+        {
+            top[j] -= a0 * b[j];
+            bot[j] -= a1 * b[j];
+        }
+
+        a += 2;
+        b += 2;
+    }
+
+    /* 2x2 forward substitution per column. */
+    diag0 = a[0];
+    sub10 = a[1];
+    diag1 = a[3];
+
+    for (j = 0; j < 2; j++)
+    {
+        top[j] *= diag0;
+        bot[j] -= top[j] * sub10;
+        bot[j] *= diag1;
+    }
+
+    /* Packed B: both row-0 values, then both row-1 values. */
+    b[0] = top[0];
+    b[1] = top[1];
+    b[2] = bot[0];
+    b[3] = bot[1];
+
+    c[0]       = top[0];
+    c[1]       = bot[0];
+    c[ldc]     = top[1];
+    c[1 + ldc] = bot[1];
+}
+
+/* LT solver for one 2x1 column of C (scalar fallback).
+ *
+ * GEMM update over bk solved rows, then a 2x2 forward substitution with
+ * pre-inverted diagonals a[0] and a[3]. Result goes to packed B and C.
+ */
+static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t0, t1;
+
+    t0 = c[0];
+    t1 = c[1];
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        t0 -= a[0] * b[0];
+        t1 -= a[1] * b[0];
+
+        a += 2;
+        b += 1;
+    }
+
+    /* Forward substitution with the trailing 2x2 triangular block. */
+    t0 *= a[0];
+    t1 -= t0 * a[1];
+    t1 *= a[3];
+
+    b[0] = t0;
+    b[1] = t1;
+
+    c[0] = t0;
+    c[1] = t1;
+}
+
+/* LT solver for one 1x4 row of C (scalar fallback).
+ *
+ * All four columns (stride ldc) share the single 1x1 "triangle": after
+ * the GEMM update each value is just scaled by the pre-inverted a[0].
+ * The solution is written back to C and to packed B.
+ */
+static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT t[4];
+
+    /* One C element per column. */
+    for (j = 0; j < 4; j++)
+    {
+        t[j] = *(c + j * ldc);
+    }
+
+    /* GEMM update over the bk already-solved rows. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            t[j] -= a[0] * b[j];
+        }
+
+        a += 1;
+        b += 4;
+    }
+
+    /* Scale by the pre-inverted diagonal and store to C and packed B. */
+    for (j = 0; j < 4; j++)
+    {
+        t[j] *= a[0];
+
+        *(c + j * ldc) = t[j];
+        b[j] = t[j];
+    }
+}
+
+/* LT solver for one 1x2 row of C (scalar fallback).
+ *
+ * Two columns share the single pre-inverted diagonal *a; GEMM update
+ * followed by a scale, result stored to packed B and C.
+ */
+static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t0, t1;
+
+    t0 = c[0];
+    t1 = c[ldc];
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        t0 -= a[0] * b[0];
+        t1 -= a[0] * b[1];
+
+        a += 1;
+        b += 2;
+    }
+
+    /* Scale by the pre-inverted diagonal. */
+    t0 *= a[0];
+    t1 *= a[0];
+
+    b[0] = t0;
+    b[1] = t1;
+
+    c[0]   = t0;
+    c[ldc] = t1;
+}
+
+/* 1x1 solve: c -= dot(a, b) over bk elements, then scale by the
+ * pre-inverted diagonal and mirror the result into the packed buffer.
+ * (*c is updated in place during the dot product, as in the vector
+ * kernels.)
+ */
+static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+
+    for (i = 0; i < bk; i++)
+    {
+        *c -= *a++ * *b++;
+    }
+
+    /* a and b now point at the diagonal / solution slot. */
+    *c *= *a;
+    *b = *c;
+}
+
+/* Driver for this file's DTRSM kernel: walks C in column panels of
+ * width 4, 2, then 1; inside each panel, row blocks of height 8, 4, 2,
+ * then 1, dispatching to the block solvers above.
+ *
+ * m, n, k : problem dimensions (C is m x n; packed panels have depth k)
+ * dummy1  : unused scalar, kept for the common trsm kernel signature
+ * a, b    : packed A and packed B buffers
+ * c, ldc  : output matrix and its leading dimension
+ * offset  : not referenced in this body
+ *
+ * kk counts the rows already solved within the current column panel and
+ * is handed to every block solver as the depth of its GEMM update; it
+ * restarts at 0 for each panel.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG i, j, kk;
+    FLOAT *aa, *cc;
+
+    /* Width-4 column panels. */
+    for (j = (n >> 2); j--;)
+    {
+        kk = 0;      /* no rows solved yet in this panel */
+        aa = a;
+        cc = c;
+
+        /* Full 8-row blocks. */
+        for (i = (m >> 3); i--;)
+        {
+            dsolve_8x4_lt_msa(aa, b, cc, ldc, kk);
+
+            aa += 8 * k;   /* advance to the next packed A block */
+            cc += 8;       /* next 8 rows of C */
+            kk += 8;       /* 8 more rows are now solved */
+        }
+
+        /* Remainder rows: 4, then 2, then 1. */
+        if (m & 7)
+        {
+            if (m & 4)
+            {
+                dsolve_4x4_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 4 * k;
+                cc += 4;
+                kk += 4;
+            }
+
+            if (m & 2)
+            {
+                dsolve_2x4_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 2 * k;
+                cc += 2;
+                kk += 2;
+            }
+
+            if (m & 1)
+            {
+                dsolve_1x4_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += k;
+                cc += 1;
+                kk += 1;
+            }
+        }
+
+        b += 4 * k;      /* next packed B panel */
+        c += 4 * ldc;    /* next 4 columns of C */
+    }
+
+    /* Remainder columns: a width-2 panel, then a width-1 panel. */
+    if (n & 3)
+    {
+        if (n & 2)
+        {
+            kk = 0;
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x2_lt_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    dsolve_1x2_lt_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += 2 * k;
+            c += 2 * ldc;
+        }
+
+        if (n & 1)
+        {
+            kk = 0;
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x1_lt_msa(aa, b, cc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+                kk += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x1_lt_msa(aa, b, cc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                    kk += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x1_lt_msa(aa, b, cc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                    kk += 2;
+                }
+
+                if (m & 1)
+                {
+                    dgmm_dsolve_1x1_msa(aa, b, cc, kk);
+
+                    aa += k;
+                    cc += 1;
+                    kk += 1;
+                }
+            }
+
+            b += k;
+            c += ldc;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
new file mode 100644
index 000000000..659f77266
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
@@ -0,0 +1,963 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* RN solver for one 8x4 tile of C using MIPS MSA vectors.
+ *
+ * a   : packed A panel; bk GEMM columns of 8 doubles each; the solved
+ *       tile is written back here for reuse by later panels.
+ * b   : packed B panel; bk GEMM rows of 4 doubles each, followed by the
+ *       4x4 triangular B block stored densely (indices 0..15 addressed).
+ * c   : 8x4 tile of C, four columns with stride ldc.
+ * bk  : depth of the GEMM update to subtract before solving.
+ *
+ * NOTE(review): diagonal entries b[0], b[5], b[10], b[15] are
+ * multiplied, not divided by — they appear to be pre-inverted by the
+ * packing routine; confirm against the trsm packing code.
+ */
+static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
+    v2f64 src_b10, src_b11, src_b15;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    /* Load the 8x4 C tile: four 2-double vectors per column. */
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+        v2f64 src_b;
+
+        /* Software-pipelined GEMM update: the loads for iteration i+1
+           are issued before the arithmetic of iteration i, and the last
+           iteration's arithmetic is peeled off after the loop. */
+        LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2(b, 2, src_b0, src_b1);
+
+        for (i = (bk - 1); i--;)
+        {
+            a += 8;
+            b += 4;
+
+            LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
+            LD_DP2(b, 2, src_b2, src_b3);
+
+            /* ilvr/ilvl with identical operands broadcasts element 0 / 1. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            src_a0 = src_a4;
+            src_a1 = src_a5;
+            src_a2 = src_a6;
+            src_a3 = src_a7;
+            src_b0 = src_b2;
+            src_b1 = src_b3;
+        }
+
+        /* Peeled final iteration (no further loads needed). */
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c8 -= src_a0 * src_b;
+        src_c9 -= src_a1 * src_b;
+        src_c10 -= src_a2 * src_b;
+        src_c11 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c12 -= src_a0 * src_b;
+        src_c13 -= src_a1 * src_b;
+        src_c14 -= src_a2 * src_b;
+        src_c15 -= src_a3 * src_b;
+
+        a += 8;
+        b += 4;
+    }
+
+    /* Splat the referenced entries of the 4x4 triangular B block. */
+    src_b0 = LD_DP(b + 0);
+    src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b2 = LD_DP(b + 2);
+    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+    src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+    src_b5 = __msa_cast_to_vector_double(*(b + 5));
+    src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
+    src_b6 = LD_DP(b + 6);
+    src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
+    src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
+    src_b10 = LD_DP(b + 10);
+    src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
+    src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+    src_b15 = __msa_cast_to_vector_double(*(b + 15));
+    src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
+
+    /* Column 0: scale by the (pre-inverted) diagonal. */
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 *= src_b0;
+    src_c3 *= src_b0;
+
+    /* Column 1: eliminate column 0, then scale. */
+    src_c4 -= src_c0 * src_b1;
+    src_c5 -= src_c1 * src_b1;
+    src_c6 -= src_c2 * src_b1;
+    src_c7 -= src_c3 * src_b1;
+
+    src_c4 *= src_b5;
+    src_c5 *= src_b5;
+    src_c6 *= src_b5;
+    src_c7 *= src_b5;
+
+    /* Column 2: eliminate columns 0 and 1, then scale. */
+    src_c8 -= src_c0 * src_b2;
+    src_c9 -= src_c1 * src_b2;
+    src_c10 -= src_c2 * src_b2;
+    src_c11 -= src_c3 * src_b2;
+
+    src_c8 -= src_c4 * src_b6;
+    src_c9 -= src_c5 * src_b6;
+    src_c10 -= src_c6 * src_b6;
+    src_c11 -= src_c7 * src_b6;
+
+    src_c8 *= src_b10;
+    src_c9 *= src_b10;
+    src_c10 *= src_b10;
+    src_c11 *= src_b10;
+
+    /* Column 3: eliminate columns 0..2, then scale. */
+    src_c12 -= src_c0 * src_b3;
+    src_c13 -= src_c1 * src_b3;
+    src_c14 -= src_c2 * src_b3;
+    src_c15 -= src_c3 * src_b3;
+
+    src_c12 -= src_c4 * src_b7;
+    src_c13 -= src_c5 * src_b7;
+    src_c14 -= src_c6 * src_b7;
+    src_c15 -= src_c7 * src_b7;
+
+    src_c12 -= src_c8 * src_b11;
+    src_c13 -= src_c9 * src_b11;
+    src_c14 -= src_c10 * src_b11;
+    src_c15 -= src_c11 * src_b11;
+
+    src_c12 *= src_b15;
+    src_c13 *= src_b15;
+    src_c14 *= src_b15;
+    src_c15 *= src_b15;
+
+    /* Store the solved tile to C and into the packed A buffer. */
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+    ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2);
+    ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2);
+    ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2);
+    ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2);
+}
+
+/* RN solver for one 8x2 tile of C: GEMM update of depth bk, then a
+ * 2-column substitution with the 2x2 triangular B block (indices 0, 1,
+ * 3; diagonals pre-inverted). The result is written to C and back into
+ * the packed A buffer.
+ */
+static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_b0, src_b1, src_b3;
+
+    /* Load the 8x2 C tile: four 2-double vectors per column. */
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
+
+        /* GEMM update: C -= A * B over the bk solved columns. */
+        for (i = bk; i--;)
+        {
+            LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+            src_b0 = LD_DP(b);
+
+            /* ilvr/ilvl with identical operands broadcasts element 0 / 1. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            a += 8;
+            b += 2;
+        }
+    }
+
+    /* Splat b[0] (diagonal), b[1] and b[3] (diagonal). */
+    src_b0 = LD_DP(b + 0);
+    src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b3 = __msa_cast_to_vector_double(*(b + 3));
+    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
+
+    /* Column 0: scale by pre-inverted diagonal. */
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 *= src_b0;
+    src_c3 *= src_b0;
+
+    /* Column 1: eliminate column 0, then scale. */
+    src_c4 -= src_c0 * src_b1;
+    src_c5 -= src_c1 * src_b1;
+    src_c6 -= src_c2 * src_b1;
+    src_c7 -= src_c3 * src_b1;
+
+    src_c4 *= src_b3;
+    src_c5 *= src_b3;
+    src_c6 *= src_b3;
+    src_c7 *= src_b3;
+
+    /* Store to C and to the packed A buffer. */
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+}
+
+/* RN solver for one 8x1 column of C: GEMM update of depth bk, then a
+ * scale by the (pre-inverted) 1x1 diagonal *b. The result is written to
+ * C and back into the packed A buffer.
+ */
+static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3;
+    v2f64 src_b0;
+
+    /* Load the 8x1 column of C (four 2-double vectors). */
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_a2, src_a3, src_b;
+
+        /* GEMM update: C -= A * B over the bk solved columns. */
+        for (i = bk; i--;)
+        {
+            LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+            src_b = LD_DP(b);
+
+            /* broadcast b[0] into both lanes */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b, (v2i64) src_b);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            a += 8;
+            b += 1;
+        }
+    }
+
+    /* Splat the (pre-inverted) diagonal and scale. */
+    src_b0 = __msa_cast_to_vector_double(*b);
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 *= src_b0;
+    src_c3 *= src_b0;
+
+    /* Store to C and to the packed A buffer. */
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* RN solver for one 4x4 tile of C: GEMM update of depth bk, then a
+ * 4-column substitution with the dense-stored 4x4 triangular B block
+ * (indices 0..15 addressed; diagonals b[0], b[5], b[10], b[15]
+ * pre-inverted). The result is written to C and back into packed A.
+ */
+static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
+    v2f64 src_b10, src_b11, src_b15;
+
+    /* Load the 4x4 C tile: two 2-double vectors per column. */
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+    LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+    LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
+
+        /* GEMM update: C -= A * B over the bk solved columns. */
+        for (i = bk; i--;)
+        {
+            LD_DP2(a, 2, src_a0, src_a1);
+            LD_DP2(b, 2, src_b0, src_b1);
+
+            /* ilvr/ilvl with identical operands broadcasts element 0 / 1. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c6 -= src_a0 * src_b;
+            src_c7 -= src_a1 * src_b;
+
+            a += 4;
+            b += 4;
+        }
+    }
+
+    /* Splat the referenced entries of the triangular block. */
+    src_b0 = LD_DP(b + 0);
+    src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b2 = LD_DP(b + 2);
+    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+    src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+    src_b5 = __msa_cast_to_vector_double(*(b + 5));
+    src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
+    src_b6 = LD_DP(b + 6);
+    src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
+    src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
+    src_b10 = LD_DP(b + 10);
+    src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
+    src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+    src_b15 = __msa_cast_to_vector_double(*(b + 15));
+    src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
+
+    /* Column 0: scale by pre-inverted diagonal. */
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    /* Column 1: eliminate column 0, then scale. */
+    src_c2 -= src_c0 * src_b1;
+    src_c3 -= src_c1 * src_b1;
+
+    src_c2 *= src_b5;
+    src_c3 *= src_b5;
+
+    /* Column 2: eliminate columns 0 and 1, then scale. */
+    src_c4 -= src_c0 * src_b2;
+    src_c5 -= src_c1 * src_b2;
+
+    src_c4 -= src_c2 * src_b6;
+    src_c5 -= src_c3 * src_b6;
+
+    src_c4 *= src_b10;
+    src_c5 *= src_b10;
+
+    /* Column 3: eliminate columns 0..2, then scale. */
+    src_c6 -= src_c0 * src_b3;
+    src_c7 -= src_c1 * src_b3;
+
+    src_c6 -= src_c2 * src_b7;
+    src_c7 -= src_c3 * src_b7;
+
+    src_c6 -= src_c4 * src_b11;
+    src_c7 -= src_c5 * src_b11;
+
+    src_c6 *= src_b15;
+    src_c7 *= src_b15;
+
+    /* Store to C and to the packed A buffer. */
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+    ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+    ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+}
+
+/* RN solver for one 4x2 tile of C: GEMM update of depth bk, then a
+ * 2-column substitution with the 2x2 triangular B block (indices 0, 1,
+ * 3; diagonals pre-inverted). The result is written to C and back into
+ * the packed A buffer.
+ */
+static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3;
+
+    /* Load the 4x2 C tile: two 2-double vectors per column. */
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+    if (bk > 0)
+    {
+        BLASLONG i;
+        v2f64 src_a0, src_a1, src_b, src_b0;
+
+        /* GEMM update: C -= A * B over the bk solved columns. */
+        for (i = bk; i--;)
+        {
+            LD_DP2(a, 2, src_a0, src_a1);
+            src_b0 = LD_DP(b);
+
+            /* ilvr/ilvl with identical operands broadcasts element 0 / 1. */
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            a += 4;
+            b += 2;
+        }
+    }
+
+    /* Splat b[0] (diagonal), b[1] and b[3] (diagonal). */
+    src_b0 = LD_DP(b + 0);
+    src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b3 = __msa_cast_to_vector_double(*(b + 3));
+    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
+
+    /* Column 0: scale; column 1: eliminate column 0, then scale. */
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    src_c2 -= src_c0 * src_b1;
+    src_c3 -= src_c1 * src_b1;
+
+    src_c2 *= src_b3;
+    src_c3 *= src_b3;
+
+    /* Store to C and to the packed A buffer. */
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* RN solver for one 4x1 column of C (scalar fallback).
+ *
+ * GEMM update of depth bk, then a scale by the pre-inverted 1x1
+ * diagonal *b. The solution is written to packed A and to C.
+ */
+static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t0, t1, t2, t3;
+
+    t0 = c[0];
+    t1 = c[1];
+    t2 = c[2];
+    t3 = c[3];
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        FLOAT bval = b[0];
+
+        t0 -= a[0] * bval;
+        t1 -= a[1] * bval;
+        t2 -= a[2] * bval;
+        t3 -= a[3] * bval;
+
+        a += 4;
+        b += 1;
+    }
+
+    /* Scale by the pre-inverted diagonal. */
+    t0 *= b[0];
+    t1 *= b[0];
+    t2 *= b[0];
+    t3 *= b[0];
+
+    /* Write the solution into packed A and into C. */
+    a[0] = t0;
+    a[1] = t1;
+    a[2] = t2;
+    a[3] = t3;
+
+    c[0] = t0;
+    c[1] = t1;
+    c[2] = t2;
+    c[3] = t3;
+}
+
+/* RN solver for one 2x4 tile of C (scalar fallback).
+ *
+ * GEMM update of depth bk, then substitution across the four columns
+ * with the dense-stored 4x4 triangular B block (indices 0..15;
+ * diagonals b[0], b[5], b[10], b[15] pre-inverted). Column j consumes
+ * all earlier columns before being scaled. The solved tile goes to
+ * packed A (interleaved by column) and back to C.
+ */
+static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT u[4], v[4];   /* u = row 0, v = row 1, one entry per column */
+
+    for (j = 0; j < 4; j++)
+    {
+        u[j] = *(c + 0 + j * ldc);
+        v[j] = *(c + 1 + j * ldc);
+    }
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        FLOAT a0 = a[0];
+        FLOAT a1 = a[1];
+
+        for (j = 0; j < 4; j++)
+        {
+            u[j] -= a0 * b[j];
+            v[j] -= a1 * b[j];
+        }
+
+        a += 2;
+        b += 4;
+    }
+
+    /* Column 0: scale by the pre-inverted diagonal. */
+    u[0] *= b[0];
+    v[0] *= b[0];
+
+    /* Column 1: eliminate column 0, then scale. */
+    u[1] -= u[0] * b[1];
+    v[1] -= v[0] * b[1];
+    u[1] *= b[5];
+    v[1] *= b[5];
+
+    /* Column 2: eliminate columns 0 and 1, then scale. */
+    u[2] -= u[0] * b[2];
+    v[2] -= v[0] * b[2];
+    u[2] -= u[1] * b[6];
+    v[2] -= v[1] * b[6];
+    u[2] *= b[10];
+    v[2] *= b[10];
+
+    /* Column 3: eliminate columns 0..2, then scale. */
+    u[3] -= u[0] * b[3];
+    v[3] -= v[0] * b[3];
+    u[3] -= u[1] * b[7];
+    v[3] -= v[1] * b[7];
+    u[3] -= u[2] * b[11];
+    v[3] -= v[2] * b[11];
+    u[3] *= b[15];
+    v[3] *= b[15];
+
+    /* Packed A gets the columns interleaved; C gets them in place. */
+    for (j = 0; j < 4; j++)
+    {
+        a[2 * j]     = u[j];
+        a[2 * j + 1] = v[j];
+
+        *(c + 0 + j * ldc) = u[j];
+        *(c + 1 + j * ldc) = v[j];
+    }
+}
+
+/* RN solver for one 2x2 tile of C (scalar fallback).
+ *
+ * GEMM update of depth bk, then a 2-column substitution with the 2x2
+ * triangular B block (b[0], b[1], b[3]; diagonals pre-inverted).
+ * Solution goes to packed A and back to C.
+ */
+static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t00, t10, t01, t11;   /* t<row><col> */
+
+    t00 = c[0];
+    t10 = c[1];
+    t01 = c[ldc];
+    t11 = c[1 + ldc];
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        FLOAT a0 = a[0];
+        FLOAT a1 = a[1];
+
+        t00 -= a0 * b[0];
+        t10 -= a1 * b[0];
+        t01 -= a0 * b[1];
+        t11 -= a1 * b[1];
+
+        a += 2;
+        b += 2;
+    }
+
+    /* Column 0: scale by the pre-inverted diagonal. */
+    t00 *= b[0];
+    t10 *= b[0];
+
+    /* Column 1: eliminate column 0, then scale. */
+    t01 -= t00 * b[1];
+    t11 -= t10 * b[1];
+    t01 *= b[3];
+    t11 *= b[3];
+
+    a[0] = t00;
+    a[1] = t10;
+    a[2] = t01;
+    a[3] = t11;
+
+    c[0]       = t00;
+    c[1]       = t10;
+    c[ldc]     = t01;
+    c[1 + ldc] = t11;
+}
+
+/* RN solver for one 2x1 column of C (scalar fallback): GEMM update of
+ * depth bk, then a scale by the pre-inverted 1x1 diagonal *b. Solution
+ * goes to packed A and back to C.
+ */
+static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t0, t1;
+
+    t0 = c[0];
+    t1 = c[1];
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        t0 -= a[0] * b[0];
+        t1 -= a[1] * b[0];
+
+        a += 2;
+        b += 1;
+    }
+
+    /* Scale by the pre-inverted diagonal. */
+    t0 *= b[0];
+    t1 *= b[0];
+
+    a[0] = t0;
+    a[1] = t1;
+
+    c[0] = t0;
+    c[1] = t1;
+}
+
+/* RN solver for one 1x4 row of C (scalar fallback).
+ *
+ * GEMM update of depth bk, then substitution across the four columns
+ * with the 4x4 triangular B block (indices 0..15; diagonals b[0], b[5],
+ * b[10], b[15] pre-inverted). Solution goes to packed A and back to C.
+ */
+static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i, j;
+    FLOAT t[4];   /* one C element per column */
+
+    for (j = 0; j < 4; j++)
+    {
+        t[j] = *(c + j * ldc);
+    }
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            t[j] -= a[0] * b[j];
+        }
+
+        a += 1;
+        b += 4;
+    }
+
+    /* Column j consumes all earlier columns, then is scaled by its
+       pre-inverted diagonal. */
+    t[0] *= b[0];
+
+    t[1] -= t[0] * b[1];
+    t[1] *= b[5];
+
+    t[2] -= t[0] * b[2];
+    t[2] -= t[1] * b[6];
+    t[2] *= b[10];
+
+    t[3] -= t[0] * b[3];
+    t[3] -= t[1] * b[7];
+    t[3] -= t[2] * b[11];
+    t[3] *= b[15];
+
+    /* Write the solution into packed A and into C. */
+    for (j = 0; j < 4; j++)
+    {
+        a[j] = t[j];
+        *(c + j * ldc) = t[j];
+    }
+}
+
+/* RN solver for one 1x2 row of C (scalar fallback): GEMM update of
+ * depth bk, then a 2-column substitution with b[0], b[1], b[3]
+ * (diagonals pre-inverted). Solution goes to packed A and back to C.
+ */
+static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    BLASLONG i;
+    FLOAT t0, t1;
+
+    t0 = c[0];
+    t1 = c[ldc];
+
+    /* C -= A * B over the already-solved depth. */
+    for (i = 0; i < bk; i++)
+    {
+        t0 -= a[0] * b[0];
+        t1 -= a[0] * b[1];
+
+        a += 1;
+        b += 2;
+    }
+
+    /* Column 0 scales; column 1 eliminates column 0, then scales. */
+    t0 *= b[0];
+
+    t1 -= t0 * b[1];
+    t1 *= b[3];
+
+    a[0] = t0;
+    a[1] = t1;
+
+    c[0]   = t0;
+    c[ldc] = t1;
+}
+
+/* 1x1 solve: c -= dot(a, b) over bk elements, then scale by the
+ * pre-inverted diagonal and mirror the result into the packed buffer.
+ * (*c is updated in place during the dot product, matching the vector
+ * kernels; the caller chooses which packed buffer plays each role.)
+ */
+static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+    BLASLONG i;
+
+    for (i = 0; i < bk; i++)
+    {
+        *c -= *a++ * *b++;
+    }
+
+    /* a and b now point at the diagonal / solution slot. */
+    *c *= *a;
+    *b = *c;
+}
+
+/* Driver for this file's DTRSM kernel: walks C in column panels of
+ * width 4, 2, then 1; inside each panel, row blocks of height 8, 4, 2,
+ * then 1, dispatching to the block solvers above.
+ *
+ * m, n, k : problem dimensions (C is m x n; packed panels have depth k)
+ * dummy1  : unused scalar, kept for the common trsm kernel signature
+ * a, b    : packed A and packed B buffers
+ * c, ldc  : output matrix and its leading dimension
+ * offset  : not referenced in this body
+ *
+ * kk counts the columns of B already solved; unlike the LT variant it
+ * is initialized once and grows across column panels (by 4, 2, or 1),
+ * and every block solver uses it as the depth of its GEMM update.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG i, j, kk;
+    FLOAT *aa, *cc;
+
+    kk = 0;
+
+    /* Width-4 column panels. */
+    for (j = (n >> 2); j--;)
+    {
+        aa = a;
+        cc = c;
+
+        /* Full 8-row blocks. */
+        for (i = (m >> 3); i--;)
+        {
+            dsolve_8x4_rn_msa(aa, b, cc, ldc, kk);
+
+            aa += 8 * k;   /* next packed A block */
+            cc += 8;       /* next 8 rows of C */
+        }
+
+        /* Remainder rows: 4, then 2, then 1. */
+        if (m & 7)
+        {
+            if (m & 4)
+            {
+                dsolve_4x4_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 4 * k;
+                cc += 4;
+            }
+
+            if (m & 2)
+            {
+                dsolve_2x4_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 2 * k;
+                cc += 2;
+            }
+
+            if (m & 1)
+            {
+                dsolve_1x4_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += k;
+                cc += 1;
+            }
+        }
+
+        kk += 4;         /* 4 more B columns solved */
+        b += 4 * k;
+        c += 4 * ldc;
+    }
+
+    /* Remainder columns: a width-2 panel, then a width-1 panel. */
+    if (n & 3)
+    {
+        if (n & 2)
+        {
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x2_rn_msa(aa, b, cc, ldc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x2_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x2_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    dsolve_1x2_rn_msa(aa, b, cc, ldc, kk);
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            b += 2 * k;
+            c += 2 * ldc;
+            kk += 2;
+        }
+
+        if (n & 1)
+        {
+            aa = a;
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x1_rn_msa(aa, b, cc, kk);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x1_rn_msa(aa, b, cc, kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x1_rn_msa(aa, b, cc, kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    /* Argument order is deliberate: in the RN case the
+                       diagonal comes from B and the solution is written
+                       back into packed A (see dgmm_dsolve_1x1_msa). */
+                    dgmm_dsolve_1x1_msa(b, aa, cc, kk);
+
+                    aa += k;
+                    cc += 1;
+                }
+            }
+
+            b += k;
+            c += ldc;
+            kk += 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
new file mode 100644
index 000000000..a90d5fec3
--- /dev/null
+++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
@@ -0,0 +1,866 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Solve one 8x4 tile for the RT (right side, transposed) case.
+   On entry a and b point just past the tile; they are rewound below by
+   32 (the 8x4 A tile) and 16 (the 4x4 B triangle) elements.  The 4x4
+   triangle is packed column-major with column j holding rows 0..j:
+   b[0] | b[4],b[5] | b[8]..b[10] | b[12]..b[15].  Diagonal entries are
+   multiplied, never divided - presumably stored pre-inverted by the
+   packing (TODO confirm against the packing routines).  bk already
+   solved columns are first eliminated from the 8x4 C tile (GEMM
+   update); the solved tile is stored to C (rows ldc apart) and mirrored
+   back into the packed A block. */
+static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
+    v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
+    v2f64 src_b14, src_b15;
+    FLOAT *c_nxt1line = c + ldc;
+    FLOAT *c_nxt2line = c + 2 * ldc;
+    FLOAT *c_nxt3line = c + 3 * ldc;
+
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
+    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
+    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
+
+    /* GEMM update: C(8x4) -= A(8 x bk) * B(bk x 4), software-pipelined:
+       bk - 1 iterations that prefetch the next a/b data, plus one peeled
+       final iteration after the loop. */
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *pba = a, *pbb = b;
+        /* these deliberately shadow the outer src_b0/src_b1 */
+        v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
+        v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+
+        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2(pbb, 2, src_b0, src_b1);
+
+        for (i = (bk - 1); i--;)
+        {
+            pba += 8;
+            pbb += 4;
+
+            /* loads for the next iteration */
+            LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
+            LD_DP2(pbb, 2, src_b2, src_b3);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            src_a0 = src_a4;
+            src_a1 = src_a5;
+            src_a2 = src_a6;
+            src_a3 = src_a7;
+            src_b0 = src_b2;
+            src_b1 = src_b3;
+        }
+
+        /* peeled last update iteration */
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c8 -= src_a0 * src_b;
+        src_c9 -= src_a1 * src_b;
+        src_c10 -= src_a2 * src_b;
+        src_c11 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+        src_c12 -= src_a0 * src_b;
+        src_c13 -= src_a1 * src_b;
+        src_c14 -= src_a2 * src_b;
+        src_c15 -= src_a3 * src_b;
+    }
+
+    /* rewind to the start of the 8x4 A tile and the 4x4 B triangle */
+    a -= 32;
+    b -= 16;
+
+    /* broadcast the ten entries of the packed upper triangle */
+    src_b12 = LD_DP(b + 12);
+    src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
+    src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
+    src_b14 = LD_DP(b + 14);
+    src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
+    src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);
+
+    src_b8 = LD_DP(b + 8);
+    src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
+    src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
+    src_b10 = __msa_cast_to_vector_double(*(b + 10));
+    src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+
+    src_b0 = __msa_cast_to_vector_double(*(b + 0));
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b4 = LD_DP(b + 4);
+    src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
+    src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
+
+    /* back-substitution: solve column 3, eliminate it from columns 2..0,
+       then column 2, and so on down to column 0 */
+    src_c12 *= src_b15;
+    src_c13 *= src_b15;
+    src_c14 *= src_b15;
+    src_c15 *= src_b15;
+
+    src_c8 -= src_c12 * src_b14;
+    src_c9 -= src_c13 * src_b14;
+    src_c10 -= src_c14 * src_b14;
+    src_c11 -= src_c15 * src_b14;
+
+    src_c8 *= src_b10;
+    src_c9 *= src_b10;
+    src_c10 *= src_b10;
+    src_c11 *= src_b10;
+
+    src_c4 -= src_c12 * src_b13;
+    src_c5 -= src_c13 * src_b13;
+    src_c6 -= src_c14 * src_b13;
+    src_c7 -= src_c15 * src_b13;
+
+    src_c4 -= src_c8 * src_b9;
+    src_c5 -= src_c9 * src_b9;
+    src_c6 -= src_c10 * src_b9;
+    src_c7 -= src_c11 * src_b9;
+
+    src_c4 *= src_b5;
+    src_c5 *= src_b5;
+    src_c6 *= src_b5;
+    src_c7 *= src_b5;
+
+    src_c0 -= src_c12 * src_b12;
+    src_c1 -= src_c13 * src_b12;
+    src_c2 -= src_c14 * src_b12;
+    src_c3 -= src_c15 * src_b12;
+
+    src_c0 -= src_c8 * src_b8;
+    src_c1 -= src_c9 * src_b8;
+    src_c2 -= src_c10 * src_b8;
+    src_c3 -= src_c11 * src_b8;
+
+    src_c0 -= src_c4 * src_b4;
+    src_c1 -= src_c5 * src_b4;
+    src_c2 -= src_c6 * src_b4;
+    src_c3 -= src_c7 * src_b4;
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 *= src_b0;
+    src_c3 *= src_b0;
+
+    /* store each solved column to C and mirror it into packed A */
+    ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2);
+    ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2);
+    ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2);
+    ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* Solve an 8x2 tile for the RT case.  b[0] and b[3] are the diagonal
+   factors of the packed 2x2 triangle (multiplied, not divided -
+   presumably stored pre-inverted); b[2] is the off-diagonal.  Only a
+   single pending update step is applied (data at a + 16 / b + 4);
+   NOTE(review): callers must therefore ensure bk <= 1, which the RT
+   driver below appears to guarantee.  Results go to the two C columns
+   (ldc apart) and are mirrored into the packed a block.
+   bk is BLASLONG (was int) for consistency with the sibling kernels and
+   to avoid narrowing the driver's BLASLONG argument on LP64. */
+static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_b0, src_b2, src_b3;
+
+    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
+
+    if (bk > 0)
+    {
+        /* src_b0 here deliberately shadows the outer one */
+        v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
+
+        LD_DP4(a + 16, 2, src_a0, src_a1, src_a2, src_a3);
+        src_b0 = LD_DP(b + 4);
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+        src_c2 -= src_a2 * src_b;
+        src_c3 -= src_a3 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c4 -= src_a0 * src_b;
+        src_c5 -= src_a1 * src_b;
+        src_c6 -= src_a2 * src_b;
+        src_c7 -= src_a3 * src_b;
+    }
+
+    src_b0 = __msa_cast_to_vector_double(*(b + 0));
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b2 = LD_DP(b + 2);
+    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+    src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+
+    /* back-substitute: second column first, then eliminate it from the
+       first column */
+    src_c4 *= src_b3;
+    src_c5 *= src_b3;
+    src_c6 *= src_b3;
+    src_c7 *= src_b3;
+
+    src_c0 -= src_c4 * src_b2;
+    src_c1 -= src_c5 * src_b2;
+    src_c2 -= src_c6 * src_b2;
+    src_c3 -= src_c7 * src_b2;
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+    src_c2 *= src_b0;
+    src_c3 *= src_b0;
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+}
+
+/* Single-column case: scale the eight C entries by the factor *b
+   (multiplied, so presumably stored pre-inverted) and mirror the result
+   into the packed A block. */
+static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
+{
+    v2f64 diag, res0, res1, res2, res3;
+
+    diag = __msa_cast_to_vector_double(*b);
+    diag = (v2f64) __msa_splati_d((v2i64) diag, 0);
+
+    LD_DP4(c, 2, res0, res1, res2, res3);
+
+    res0 *= diag;
+    res1 *= diag;
+    res2 *= diag;
+    res3 *= diag;
+
+    ST_DP4(res0, res1, res2, res3, c, 2);
+    ST_DP4(res0, res1, res2, res3, a, 2);
+}
+
+/* Solve a 4x4 tile for the RT case.  The caller passes a and b already
+   rewound by 16 elements, so the pending-update data live at a + 16 /
+   b + 16 while the packed 4x4 triangle occupies b[0..15] (column-major,
+   column j holding rows 0..j; diagonal factors are multiplied, so
+   presumably stored pre-inverted).  bk already-solved columns are
+   eliminated first, then the tile is back-substituted and stored to both
+   C and the packed A block. */
+static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
+    v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
+    v2f64 src_b14, src_b15;
+
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+    LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
+    LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
+
+    /* GEMM update: C(4x4) -= A(4 x bk) * B(bk x 4) */
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 16, *bb = b + 16;
+        /* src_b0/src_b1 deliberately shadow the outer names */
+        v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
+
+        for (i = bk; i--;)
+        {
+            LD_DP2(aa, 2, src_a0, src_a1);
+            LD_DP2(bb, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c2 -= src_a0 * src_b;
+            src_c3 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c6 -= src_a0 * src_b;
+            src_c7 -= src_a1 * src_b;
+
+            aa += 4;
+            bb += 4;
+        }
+    }
+
+    /* broadcast the ten entries of the packed upper triangle */
+    src_b12 = LD_DP(b + 12);
+    src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
+    src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
+    src_b14 = LD_DP(b + 14);
+    src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
+    src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);
+
+    src_b8 = LD_DP(b + 8);
+    src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
+    src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
+    src_b10 = __msa_cast_to_vector_double(*(b + 10));
+    src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
+
+    src_b0 = __msa_cast_to_vector_double(*(b + 0));
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b4 = LD_DP(b + 4);
+    src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
+    src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
+
+    /* back-substitution from column 3 down to column 0 */
+    src_c6 *= src_b15;
+    src_c7 *= src_b15;
+
+    src_c4 -= src_c6 * src_b14;
+    src_c5 -= src_c7 * src_b14;
+
+    src_c4 *= src_b10;
+    src_c5 *= src_b10;
+
+    src_c2 -= src_c6 * src_b13;
+    src_c3 -= src_c7 * src_b13;
+
+    src_c2 -= src_c4 * src_b9;
+    src_c3 -= src_c5 * src_b9;
+
+    src_c2 *= src_b5;
+    src_c3 *= src_b5;
+
+    src_c0 -= src_c6 * src_b12;
+    src_c1 -= src_c7 * src_b12;
+
+    src_c0 -= src_c4 * src_b8;
+    src_c1 -= src_c5 * src_b8;
+
+    src_c0 -= src_c2 * src_b4;
+    src_c1 -= src_c3 * src_b4;
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    /* store to C and mirror into packed A */
+    ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
+    ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+    ST_DP2(src_c0, src_c1, c, 2);
+
+    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* Solve a 4x2 tile for the RT case.  b[0] and b[3] are the diagonal
+   factors of the packed 2x2 triangle (multiplied, not divided -
+   presumably stored pre-inverted); b[2] is the off-diagonal.  Only one
+   pending update step is applied (data at a + 8 / b + 4);
+   NOTE(review): callers must ensure bk <= 1, which the RT driver below
+   appears to guarantee.  Results go to the two C columns and are
+   mirrored into the packed a block.
+   bk is BLASLONG (was int) for consistency with the sibling kernels and
+   to avoid narrowing the driver's BLASLONG argument on LP64. */
+static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
+
+    LD_DP2(c, 2, src_c0, src_c1);
+    LD_DP2(c + ldc, 2, src_c2, src_c3);
+
+    if (bk > 0)
+    {
+        /* src_b0 here deliberately shadows the outer one */
+        v2f64 src_a0, src_a1, src_b, src_b0;
+
+        LD_DP2(a + 8, 2, src_a0, src_a1);
+        src_b0 = LD_DP(b + 4);
+
+        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c0 -= src_a0 * src_b;
+        src_c1 -= src_a1 * src_b;
+
+        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+        src_c2 -= src_a0 * src_b;
+        src_c3 -= src_a1 * src_b;
+    }
+
+    src_b0 = __msa_cast_to_vector_double(*(b + 0));
+    src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
+    src_b2 = LD_DP(b + 2);
+    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
+    src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
+
+    /* back-substitute: second column first, then the first column */
+    src_c2 *= src_b3;
+    src_c3 *= src_b3;
+
+    src_c0 -= src_c2 * src_b2;
+    src_c1 -= src_c3 * src_b2;
+
+    src_c0 *= src_b0;
+    src_c1 *= src_b0;
+
+    ST_DP2(src_c0, src_c1, c, 2);
+    ST_DP2(src_c2, src_c3, c + ldc, 2);
+
+    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
+}
+
+/* Single-column case: scale the four C entries by the factor *b
+   (multiplied, so presumably stored pre-inverted) and mirror them into
+   the packed A block. */
+static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
+{
+    BLASLONG i;
+    FLOAT scale = *b;
+
+    for (i = 0; i < 4; i++)
+    {
+        FLOAT val = c[i] * scale;
+
+        a[i] = val;
+        c[i] = val;
+    }
+}
+
+/* Solve a 2x4 strip (scalar code) for the RT case.  As in the 4x4
+   kernel, a and b arrive pre-rewound: pending-update data sit at a + 8 /
+   b + 16, and the packed 4x4 triangle occupies b[0..15] (column-major,
+   column j holding rows 0..j; diagonal factors are multiplied, so
+   presumably stored pre-inverted).  Results go to the four C columns
+   (ldc apart) and are mirrored into the packed A block. */
+static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
+    FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+    c0_nxt1 = *(c + 0 + 1 * ldc);
+    c1_nxt1 = *(c + 1 + 1 * ldc);
+    c0_nxt2 = *(c + 0 + 2 * ldc);
+    c1_nxt2 = *(c + 1 + 2 * ldc);
+    c0_nxt3 = *(c + 0 + 3 * ldc);
+    c1_nxt3 = *(c + 1 + 3 * ldc);
+
+    /* GEMM update: C(2x4) -= A(2 x bk) * B(bk x 4) */
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 8, *bb = b + 16;
+        /* b0..b3 deliberately shadow the outer names inside this scope */
+        FLOAT a0, a1, b0, b1, b2, b3;
+
+        for (i = bk; i--;)
+        {
+            a0 = aa[0];
+            a1 = aa[1];
+
+            b0 = bb[0];
+            c0 -= a0 * b0;
+            c1 -= a1 * b0;
+
+            b1 = bb[1];
+            c0_nxt1 -= a0 * b1;
+            c1_nxt1 -= a1 * b1;
+
+            b2 = bb[2];
+            c0_nxt2 -= a0 * b2;
+            c1_nxt2 -= a1 * b2;
+
+            b3 = bb[3];
+            c0_nxt3 -= a0 * b3;
+            c1_nxt3 -= a1 * b3;
+
+            aa += 2;
+            bb += 4;
+        }
+    }
+
+    /* the ten entries of the packed 4x4 upper triangle */
+    b0 = *b;
+    b4 = *(b + 4);
+    b5 = *(b + 5);
+    b8 = *(b + 8);
+    b9 = *(b + 9);
+    b10 = *(b + 10);
+    b12 = *(b + 12);
+    b13 = *(b + 13);
+    b14 = *(b + 14);
+    b15 = *(b + 15);
+
+    /* back-substitution from column 3 down to column 0 */
+    c0_nxt3 *= b15;
+    c1_nxt3 *= b15;
+
+    c0_nxt2 -= c0_nxt3 * b14;
+    c1_nxt2 -= c1_nxt3 * b14;
+    c0_nxt2 *= b10;
+    c1_nxt2 *= b10;
+
+    c0_nxt1 -= c0_nxt3 * b13;
+    c1_nxt1 -= c1_nxt3 * b13;
+    c0_nxt1 -= c0_nxt2 * b9;
+    c1_nxt1 -= c1_nxt2 * b9;
+    c0_nxt1 *= b5;
+    c1_nxt1 *= b5;
+
+    c0 -= c0_nxt3 * b12;
+    c1 -= c1_nxt3 * b12;
+    c0 -= c0_nxt2 * b8;
+    c1 -= c1_nxt2 * b8;
+    c0 -= c0_nxt1 * b4;
+    c1 -= c1_nxt1 * b4;
+    c0 *= b0;
+    c1 *= b0;
+
+    /* mirror the solved strip into packed A ... */
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c0_nxt1;
+    *(a + 3) = c1_nxt1;
+    *(a + 4) = c0_nxt2;
+    *(a + 5) = c1_nxt2;
+    *(a + 6) = c0_nxt3;
+    *(a + 7) = c1_nxt3;
+
+    /* ... and store it back to C */
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+
+    *(c + 0 + 1 * ldc) = c0_nxt1;
+    *(c + 1 + 1 * ldc) = c1_nxt1;
+
+    *(c + 0 + 2 * ldc) = c0_nxt2;
+    *(c + 1 + 2 * ldc) = c1_nxt2;
+
+    *(c + 0 + 3 * ldc) = c0_nxt3;
+    *(c + 1 + 3 * ldc) = c1_nxt3;
+}
+
+/* Solve a 2x2 tile (scalar code) for the RT case.  b[0] and b[3] are
+   the diagonal factors of the packed 2x2 triangle (multiplied, not
+   divided - presumably stored pre-inverted); b[2] is the off-diagonal.
+   Only one pending update step is applied (data at a[4..5] / b[4..5]);
+   NOTE(review): callers must ensure bk <= 1, which the RT driver below
+   appears to guarantee.  Results go to the two C columns and are
+   mirrored into the packed a block.
+   bk is BLASLONG (was int) for consistency with the sibling kernels and
+   to avoid narrowing the driver's BLASLONG argument on LP64. */
+static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT b0, b2, b3;
+    FLOAT c0, c1, c0_nxt, c1_nxt;
+
+    c0 = *(c + 0);
+    c1 = *(c + 1);
+
+    c0_nxt = *(c + 0 + ldc);
+    c1_nxt = *(c + 1 + ldc);
+
+    if (bk > 0)
+    {
+        /* b0 here deliberately shadows the outer one */
+        FLOAT a0, a1, b0, b1;
+
+        a0 = a[4];
+        a1 = a[5];
+
+        b0 = b[4];
+        c0 -= a0 * b0;
+        c1 -= a1 * b0;
+
+        b1 = b[5];
+        c0_nxt -= a0 * b1;
+        c1_nxt -= a1 * b1;
+    }
+
+    b3 = *(b + 3);
+    b2 = *(b + 2);
+    b0 = *b;
+
+    /* back-substitute: second column first, then the first column */
+    c0_nxt *= b3;
+    c1_nxt *= b3;
+
+    c0 -= c0_nxt * b2;
+    c0 *= b0;
+
+    c1 -= c1_nxt * b2;
+    c1 *= b0;
+
+    *(a + 0) = c0;
+    *(a + 1) = c1;
+    *(a + 2) = c0_nxt;
+    *(a + 3) = c1_nxt;
+
+    *(c + 0) = c0;
+    *(c + 1) = c1;
+    *(c + 0 + ldc) = c0_nxt;
+    *(c + 1 + ldc) = c1_nxt;
+}
+
+/* Single-column case: scale the two C entries by the factor *b
+   (multiplied, so presumably stored pre-inverted) and mirror them into
+   the packed A block. */
+static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
+{
+    FLOAT res0 = c[0] * b[0];
+    FLOAT res1 = c[1] * b[0];
+
+    a[0] = res0;
+    a[1] = res1;
+
+    c[0] = res0;
+    c[1] = res1;
+}
+
+/* Solve a 1x4 strip (scalar code) for the RT case.  Pending-update data
+   sit at a + 4 / b + 16; the packed 4x4 triangle occupies b[0..15]
+   (column-major, column j holding rows 0..j; diagonal factors are
+   multiplied, so presumably stored pre-inverted).  Results go to the
+   four C columns (ldc apart) and are mirrored into the packed A block. */
+static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
+    FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
+
+    c0 = *(c + 0);
+    c0_nxt1 = *(c + 1 * ldc);
+    c0_nxt2 = *(c + 2 * ldc);
+    c0_nxt3 = *(c + 3 * ldc);
+
+    /* GEMM update: C(1x4) -= A(1 x bk) * B(bk x 4) */
+    if (bk > 0)
+    {
+        BLASLONG i;
+        FLOAT *aa = a + 4, *bb = b + 16;
+
+        for (i = bk; i--;)
+        {
+            c0 -= aa[0] * bb[0];
+            c0_nxt1 -= aa[0] * bb[1];
+            c0_nxt2 -= aa[0] * bb[2];
+            c0_nxt3 -= aa[0] * bb[3];
+
+            aa += 1;
+            bb += 4;
+        }
+    }
+
+    /* the ten entries of the packed 4x4 upper triangle */
+    b0 = *b;
+    b4 = *(b + 4);
+    b5 = *(b + 5);
+    b8 = *(b + 8);
+    b9 = *(b + 9);
+    b10 = *(b + 10);
+    b12 = *(b + 12);
+    b13 = *(b + 13);
+    b14 = *(b + 14);
+    b15 = *(b + 15);
+
+    /* back-substitution from column 3 down to column 0 */
+    c0_nxt3 *= b15;
+
+    c0_nxt2 -= c0_nxt3 * b14;
+    c0_nxt2 *= b10;
+
+    c0_nxt1 -= c0_nxt3 * b13;
+    c0_nxt1 -= c0_nxt2 * b9;
+    c0_nxt1 *= b5;
+
+    c0 -= c0_nxt3 * b12;
+    c0 -= c0_nxt2 * b8;
+    c0 -= c0_nxt1 * b4;
+    c0 *= b0;
+
+    /* mirror into packed A and store back to C */
+    *(a + 0) = c0;
+    *(a + 1) = c0_nxt1;
+    *(a + 2) = c0_nxt2;
+    *(a + 3) = c0_nxt3;
+
+    *(c) = c0;
+    *(c + 1 * ldc) = c0_nxt1;
+    *(c + 2 * ldc) = c0_nxt2;
+    *(c + 3 * ldc) = c0_nxt3;
+}
+
+/* Solve a 1x2 strip for the RT case.  b[0] and b[3] are the diagonal
+   factors of the packed 2x2 triangle (multiplied, so presumably stored
+   pre-inverted); b[2] is the off-diagonal.  At most one pending update
+   step is applied (data at a[2] / b[4..5]). */
+static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+{
+    FLOAT res0, res1;
+
+    res0 = c[0];
+    res1 = c[ldc];
+
+    if (bk > 0)
+    {
+        res0 -= a[2] * b[4];
+        res1 -= a[2] * b[5];
+    }
+
+    /* back-substitute: second column first, then the first column */
+    res1 *= b[3];
+    res0 = (res0 - res1 * b[2]) * b[0];
+
+    a[0] = res0;
+    a[1] = res1;
+
+    c[0] = res0;
+    c[ldc] = res1;
+}
+
+/* DTRSM kernel driver, RT variant: the columns of B/C are processed from
+   the rightmost panel backwards.  kk starts at n (columns still
+   unsolved); b and c are first advanced past their ends and then stepped
+   back panel by panel.  The 1- and 2-column remainders are handled
+   before the full 4-column panels; within each panel the rows are walked
+   in blocks of 8/4/2/1.  Each block solver receives k - kk as the
+   pending GEMM-update length.  dummy1 and offset are unused here. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG i, j, kk;
+    FLOAT *aa, *cc, *bb;
+
+    kk = n;
+    c += n * ldc;
+    b += n * k;
+
+    if (n & 3)
+    {
+        /* rightmost single column */
+        if (n & 1)
+        {
+            aa = a;
+            c -= ldc;
+            b -= k;
+            /* bb points at the (single) diagonal element of this column */
+            bb = b + (kk - 1);
+            cc = c;
+
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    /* 1x1: scale in place and mirror into packed A */
+                    *cc *= *bb;
+                    *(aa + kk - 1) = *cc;
+
+                    aa += k;
+                    cc += 1;
+                }
+
+            }
+
+            kk -= 1;
+        }
+
+        /* 2-column panel remainder */
+        if (n & 2)
+        {
+            aa = a;
+            c -= 2 * ldc;
+            b -= 2 * k;
+            bb = b + 2 * kk;
+            cc = c;
+
+            /* bb - 4 points at the packed 2x2 triangle; the kernels read
+               their pending-update data past it */
+            for (i = (m >> 3); i--;)
+            {
+                dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk);
+
+                aa += 8 * k;
+                cc += 8;
+            }
+
+            if (m & 7)
+            {
+                if (m & 4)
+                {
+                    dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk);
+
+                    aa += 4 * k;
+                    cc += 4;
+                }
+
+                if (m & 2)
+                {
+                    dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk);
+
+                    aa += 2 * k;
+                    cc += 2;
+                }
+
+                if (m & 1)
+                {
+                    /* last row block of the panel: no pointer advance
+                       needed afterwards */
+                    dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk);
+                }
+            }
+
+            kk -= 2;
+        }
+    }
+
+    /* full 4-column panels, right to left */
+    for (j = (n >> 2); j--;)
+    {
+        aa = a;
+        b -= 4 * k;
+        bb = b + 4 * kk;
+        c -= 4 * ldc;
+        cc = c;
+
+        /* the 8x4 kernel rewinds its a/b pointers internally; the
+           smaller kernels are handed pre-rewound pointers (- 16) */
+        for (i = (m >> 3); i--;)
+        {
+            dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk);
+
+            aa += 8 * k;
+            cc += 8;
+        }
+
+        if (m & 7)
+        {
+            if (m & 4)
+            {
+                dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk);
+
+                aa += 4 * k;
+                cc += 4;
+            }
+
+            if (m & 2)
+            {
+                dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk);
+
+                aa += 2 * k;
+                cc += 2;
+            }
+
+            if (m & 1)
+            {
+                dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk);
+
+                aa += k;
+                cc += 1;
+            }
+        }
+
+        kk -= 4;
+    }
+
+    return 0;
+}